Reduce chattiness of Patroni logs (#1955)

1. When everything goes normal, only one line will be written for every run of HA loop (see examples):
```
INFO: no action. I am (postgresql0) the leader with the lock
INFO: no action. I am a secondary (postgresql1) and following a leader (postgresql0)
```

2. The `does not have lock` became a debug message.
3. The `Lock owner: postgresql0; I am postgresql1` will be shown only when stream doesn't look normal.
This commit is contained in:
Alexander Kukushkin
2021-06-22 09:13:30 +02:00
committed by GitHub
parent 03e71b6717
commit f403719bb4
4 changed files with 66 additions and 39 deletions

View File

@@ -404,7 +404,7 @@ class Ha(object):
self.state_handler.set_role('replica')
if not node_to_follow:
return 'no action'
return 'no action. I am ({0})'.format(self.state_handler.name)
elif is_leader:
self.demote('immediate-nolock')
return demote_reason
@@ -992,13 +992,13 @@ class Ha(object):
# in case of standby cluster we don't really need to
# enforce anything, since the leader is not a master.
# So just remind the role.
msg = 'no action. i am the standby leader with the lock' \
msg = 'no action. I am ({0}) the standby leader with the lock'.format(self.state_handler.name) \
if self.state_handler.role == 'standby_leader' else \
'promoted self to a standby leader because i had the session lock'
return self.enforce_follow_remote_master(msg)
else:
return self.enforce_master_role(
'no action. i am the leader with the lock',
'no action. I am ({0}) the leader with the lock'.format(self.state_handler.name),
'promoted self to leader because i had the session lock'
)
else:
@@ -1012,12 +1012,15 @@ class Ha(object):
else:
return 'not promoting because failed to update leader lock in DCS'
else:
logger.info('does not have lock')
logger.debug('does not have lock')
lock_owner = self.cluster.leader and self.cluster.leader.name
if self.is_standby_cluster():
return self.follow('cannot be a real master in standby cluster',
'no action. i am a secondary and i am following a standby leader', refresh=False)
return self.follow('demoting self because i do not have the lock and i was a leader',
'no action. i am a secondary and i am following a leader', refresh=False)
return self.follow('cannot be a real primary in a standby cluster',
'no action. I am a secondary ({0}) and following a standby leader ({1})'.format(
self.state_handler.name, lock_owner), refresh=False)
return self.follow('demoting self because I do not have the lock and I was a leader',
'no action. I am a secondary ({0}) and following a leader ({1})'.format(
self.state_handler.name, lock_owner), refresh=False)
def evaluate_scheduled_restart(self):
if self._async_executor.busy: # Restart already in progress

View File

@@ -166,6 +166,8 @@ class PatroniLogger(Thread):
self._root_logger.addHandler(self._queue_handler)
self._root_logger.removeHandler(self._proxy_handler)
prev_record = None
while True:
self._close_old_handlers()
@@ -173,7 +175,18 @@ class PatroniLogger(Thread):
if record is None:
break
self.log_handler.handle(record)
if self._root_logger.level == logging.INFO:
if record.msg.startswith('Lock owner: '):
prev_record, record = record, None
else:
if prev_record and prev_record.thread == record.thread:
if not (record.msg.startswith('no action. ') or record.msg.startswith('PAUSE: no action')):
self.log_handler.handle(prev_record)
prev_record = None
if record:
self.log_handler.handle(record)
self._queue_handler.queue.task_done()
def shutdown(self):

View File

@@ -329,7 +329,7 @@ class TestHa(PostgresInit):
self.ha.has_lock = true
self.p.is_leader = false
self.p.set_role('master')
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
def test_demote_after_failing_to_obtain_lock(self):
self.ha.acquire_lock = false
@@ -369,12 +369,12 @@ class TestHa(PostgresInit):
self.ha.cluster = get_cluster_initialized_with_leader()
self.ha.cluster.is_unlocked = false
self.ha.has_lock = true
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
def test_demote_because_not_having_lock(self):
self.ha.cluster.is_unlocked = false
with patch.object(Watchdog, 'is_running', PropertyMock(return_value=True)):
self.assertEqual(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader')
self.assertEqual(self.ha.run_cycle(), 'demoting self because I do not have the lock and I was a leader')
def test_demote_because_update_lock_failed(self):
self.ha.cluster.is_unlocked = false
@@ -388,16 +388,16 @@ class TestHa(PostgresInit):
def test_follow(self):
self.ha.cluster.is_unlocked = false
self.p.is_leader = false
self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader')
self.assertEqual(self.ha.run_cycle(), 'no action. I am a secondary (postgresql0) and following a leader ()')
self.ha.patroni.replicatefrom = "foo"
self.p.config.check_recovery_conf = Mock(return_value=(True, False))
self.ha.cluster.config.data.update({'slots': {'l': {'database': 'a', 'plugin': 'b'}}})
self.ha.cluster.members[1].data['tags']['replicatefrom'] = 'postgresql0'
self.ha.patroni.nofailover = True
self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader')
self.assertEqual(self.ha.run_cycle(), 'no action. I am a secondary (postgresql0) and following a leader ()')
del self.ha.cluster.config.data['slots']
self.ha.cluster.config.data.update({'postgresql': {'use_slots': False}})
self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader')
self.assertEqual(self.ha.run_cycle(), 'no action. I am a secondary (postgresql0) and following a leader ()')
del self.ha.cluster.config.data['postgresql']['use_slots']
def test_follow_in_pause(self):
@@ -405,7 +405,7 @@ class TestHa(PostgresInit):
self.ha.is_paused = true
self.assertEqual(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock')
self.p.is_leader = false
self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action')
self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action. I am (postgresql0)')
@patch.object(Rewind, 'rewind_or_reinitialize_needed_and_possible', Mock(return_value=True))
@patch.object(Rewind, 'can_rewind', PropertyMock(return_value=True))
@@ -516,27 +516,27 @@ class TestHa(PostgresInit):
self.ha.fetch_node_status = get_node_status()
self.ha.has_lock = true
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', '', None))
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', self.p.name, None))
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', 'blabla', None))
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
f = Failover(0, self.p.name, '', None)
self.ha.cluster = get_cluster_initialized_with_leader(f)
self.assertEqual(self.ha.run_cycle(), 'manual failover: demoting myself')
self.ha._rewind.rewind_or_reinitialize_needed_and_possible = true
self.assertEqual(self.ha.run_cycle(), 'manual failover: demoting myself')
self.ha.fetch_node_status = get_node_status(nofailover=True)
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
self.ha.fetch_node_status = get_node_status(watchdog_failed=True)
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
self.ha.fetch_node_status = get_node_status(timeline=1)
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
self.ha.fetch_node_status = get_node_status(wal_position=1)
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
# manual failover from the previous leader to us won't happen if we hold the nofailover flag
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, None))
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
# Failover scheduled time must include timezone
scheduled = datetime.datetime.now()
@@ -545,28 +545,28 @@ class TestHa(PostgresInit):
scheduled = datetime.datetime.utcnow().replace(tzinfo=tzutc)
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled))
self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle())
self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle())
scheduled = scheduled + datetime.timedelta(seconds=30)
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled))
self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle())
self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle())
scheduled = scheduled + datetime.timedelta(seconds=-600)
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled))
self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle())
self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle())
scheduled = None
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled))
self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle())
self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle())
def test_manual_failover_from_leader_in_pause(self):
self.ha.has_lock = true
self.ha.is_paused = true
scheduled = datetime.datetime.now()
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled))
self.assertEqual('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle())
self.assertEqual('PAUSE: no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle())
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, '', None))
self.assertEqual('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle())
self.assertEqual('PAUSE: no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle())
def test_manual_failover_from_leader_in_synchronous_mode(self):
self.p.is_leader = true
@@ -575,7 +575,7 @@ class TestHa(PostgresInit):
self.ha.is_failover_possible = false
self.ha.process_sync_replication = Mock()
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, None))
self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle())
self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle())
self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, 'a'))
self.ha.is_failover_possible = true
self.assertEqual('manual failover: demoting myself', self.ha.run_cycle())
@@ -736,7 +736,7 @@ class TestHa(PostgresInit):
self.p.config.check_recovery_conf = Mock(return_value=(False, False))
self.ha._leader_timeline = 1
self.assertEqual(self.ha.run_cycle(), 'promoted self to a standby leader because i had the session lock')
self.assertEqual(self.ha.run_cycle(), 'no action. i am the standby leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (leader) the standby leader with the lock')
self.p.set_role('replica')
self.p.config.check_recovery_conf = Mock(return_value=(True, False))
self.assertEqual(self.ha.run_cycle(), 'promoted self to a standby leader because i had the session lock')
@@ -745,7 +745,8 @@ class TestHa(PostgresInit):
self.p.is_leader = false
self.p.name = 'replica'
self.ha.cluster = get_standby_cluster_initialized_with_only_leader()
self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a standby leader')
self.assertEqual(self.ha.run_cycle(),
'no action. I am a secondary (replica) and following a standby leader (leader)')
with patch.object(Leader, 'conn_url', PropertyMock(return_value='')):
self.assertEqual(self.ha.run_cycle(), 'continue following the old known standby leader')
@@ -838,7 +839,8 @@ class TestHa(PostgresInit):
self.ha.has_lock = false
self.p.is_leader = false
self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader')
self.assertEqual(self.ha.run_cycle(),
'no action. I am a secondary (postgresql0) and following a leader (leader)')
check_calls([(update_lock, False), (demote, False)])
def test_manual_failover_while_starting(self):
@@ -1063,7 +1065,7 @@ class TestHa(PostgresInit):
self.ha.cluster.config.data.clear()
self.ha.has_lock = true
self.ha.cluster.is_unlocked = false
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
def test_watch(self):
self.ha.cluster = get_cluster_initialized_with_leader()
@@ -1098,7 +1100,7 @@ class TestHa(PostgresInit):
self.ha.cluster.is_unlocked = false
for tl in (1, 3):
self.p.get_master_timeline = Mock(return_value=tl)
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
@patch('sys.exit', return_value=1)
def test_abort_join(self, exit_mock):
@@ -1111,9 +1113,9 @@ class TestHa(PostgresInit):
self.ha.has_lock = true
self.ha.cluster.is_unlocked = false
self.ha.is_paused = true
self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action. I am (postgresql0) the leader with the lock')
self.ha.is_paused = false
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock')
@patch('psycopg2.connect', psycopg2_connect)
def test_permanent_logical_slots_after_promote(self):
@@ -1123,7 +1125,7 @@ class TestHa(PostgresInit):
self.assertEqual(self.ha.run_cycle(), 'acquired session lock as a leader')
self.ha.cluster = get_cluster_initialized_without_leader(leader=True, cluster_config=config)
self.ha.has_lock = true
self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock')
self.assertEqual(self.ha.run_cycle(), 'no action. I am (other) the leader with the lock')
@patch.object(Cluster, 'has_member', true)
def test_run_cycle(self):

View File

@@ -63,3 +63,12 @@ class TestPatroniLogger(unittest.TestCase):
self.assertRaises(Exception, logger.shutdown)
self.assertLessEqual(logger.queue_size, 2) # "Failed to close the old log handler" could be still in the queue
self.assertEqual(logger.records_lost, 0)
def test_interceptor(self):
logger = PatroniLogger()
logger.reload_config({'level': 'INFO'})
logger.start()
_LOG.info('Lock owner: ')
_LOG.info('blabla')
logger.shutdown()
self.assertEqual(logger.records_lost, 0)