From f403719bb409f8763ab9489a7c6d01efc86aedd9 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Tue, 22 Jun 2021 09:13:30 +0200 Subject: [PATCH] Reduce chattiness of Patroni logs (#1955) 1. When everything goes normal, only one line will be written for every run of HA loop (see examples): ``` INFO: no action. I am (postgresql0) the leader with the lock INFO: no action. I am a secondary (postgresql1) and following a leader (postgresql0) ``` 2. The `does not have lock` became a debug message. 3. The `Lock owner: postgresql0; I am postgresql1` will be shown only when stream doesn't look normal. --- patroni/ha.py | 19 +++++++++------ patroni/log.py | 15 +++++++++++- tests/test_ha.py | 62 ++++++++++++++++++++++++----------------------- tests/test_log.py | 9 +++++++ 4 files changed, 66 insertions(+), 39 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index 8d94a94d..507d586b 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -404,7 +404,7 @@ class Ha(object): self.state_handler.set_role('replica') if not node_to_follow: - return 'no action' + return 'no action. I am ({0})'.format(self.state_handler.name) elif is_leader: self.demote('immediate-nolock') return demote_reason @@ -992,13 +992,13 @@ class Ha(object): # in case of standby cluster we don't really need to # enforce anything, since the leader is not a master. # So just remind the role. - msg = 'no action. i am the standby leader with the lock' \ + msg = 'no action. I am ({0}) the standby leader with the lock'.format(self.state_handler.name) \ if self.state_handler.role == 'standby_leader' else \ 'promoted self to a standby leader because i had the session lock' return self.enforce_follow_remote_master(msg) else: return self.enforce_master_role( - 'no action. i am the leader with the lock', + 'no action. I am ({0}) the leader with the lock'.format(self.state_handler.name), 'promoted self to leader because i had the session lock' ) else: @@ -1012,12 +1012,15 @@ class Ha(object): else: return 'not promoting because failed to update leader lock in DCS' else: - logger.info('does not have lock') + logger.debug('does not have lock') + lock_owner = self.cluster.leader and self.cluster.leader.name if self.is_standby_cluster(): - return self.follow('cannot be a real master in standby cluster', - 'no action. i am a secondary and i am following a standby leader', refresh=False) - return self.follow('demoting self because i do not have the lock and i was a leader', - 'no action. i am a secondary and i am following a leader', refresh=False) + return self.follow('cannot be a real primary in a standby cluster', + 'no action. I am a secondary ({0}) and following a standby leader ({1})'.format( + self.state_handler.name, lock_owner), refresh=False) + return self.follow('demoting self because I do not have the lock and I was a leader', + 'no action. I am a secondary ({0}) and following a leader ({1})'.format( + self.state_handler.name, lock_owner), refresh=False) def evaluate_scheduled_restart(self): if self._async_executor.busy: # Restart already in progress diff --git a/patroni/log.py b/patroni/log.py index ce808bb1..1231f1dd 100644 --- a/patroni/log.py +++ b/patroni/log.py @@ -166,6 +166,8 @@ class PatroniLogger(Thread): self._root_logger.addHandler(self._queue_handler) self._root_logger.removeHandler(self._proxy_handler) + prev_record = None + while True: self._close_old_handlers() @@ -173,7 +175,18 @@ class PatroniLogger(Thread): if record is None: break - self.log_handler.handle(record) + if self._root_logger.level == logging.INFO: + if record.msg.startswith('Lock owner: '): + prev_record, record = record, None + else: + if prev_record and prev_record.thread == record.thread: + if not (record.msg.startswith('no action. ') or record.msg.startswith('PAUSE: no action')): + self.log_handler.handle(prev_record) + prev_record = None + + if record: + self.log_handler.handle(record) + self._queue_handler.queue.task_done() def shutdown(self): diff --git a/tests/test_ha.py b/tests/test_ha.py index ed534b27..85f1cfc8 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -329,7 +329,7 @@ class TestHa(PostgresInit): self.ha.has_lock = true self.p.is_leader = false self.p.set_role('master') - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') def test_demote_after_failing_to_obtain_lock(self): self.ha.acquire_lock = false @@ -369,12 +369,12 @@ class TestHa(PostgresInit): self.ha.cluster = get_cluster_initialized_with_leader() self.ha.cluster.is_unlocked = false self.ha.has_lock = true - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false with patch.object(Watchdog, 'is_running', PropertyMock(return_value=True)): - self.assertEqual(self.ha.run_cycle(), 'demoting self because i do not have the lock and i was a leader') + self.assertEqual(self.ha.run_cycle(), 'demoting self because I do not have the lock and I was a leader') def test_demote_because_update_lock_failed(self): self.ha.cluster.is_unlocked = false @@ -388,16 +388,16 @@ class TestHa(PostgresInit): def test_follow(self): self.ha.cluster.is_unlocked = false self.p.is_leader = false - self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') + self.assertEqual(self.ha.run_cycle(), 'no action. I am a secondary (postgresql0) and following a leader ()') self.ha.patroni.replicatefrom = "foo" self.p.config.check_recovery_conf = Mock(return_value=(True, False)) self.ha.cluster.config.data.update({'slots': {'l': {'database': 'a', 'plugin': 'b'}}}) self.ha.cluster.members[1].data['tags']['replicatefrom'] = 'postgresql0' self.ha.patroni.nofailover = True - self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') + self.assertEqual(self.ha.run_cycle(), 'no action. I am a secondary (postgresql0) and following a leader ()') del self.ha.cluster.config.data['slots'] self.ha.cluster.config.data.update({'postgresql': {'use_slots': False}}) - self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') + self.assertEqual(self.ha.run_cycle(), 'no action. I am a secondary (postgresql0) and following a leader ()') del self.ha.cluster.config.data['postgresql']['use_slots'] def test_follow_in_pause(self): @@ -405,7 +405,7 @@ class TestHa(PostgresInit): self.ha.is_paused = true self.assertEqual(self.ha.run_cycle(), 'PAUSE: continue to run as master without lock') self.p.is_leader = false - self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action') + self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action. I am (postgresql0)') @patch.object(Rewind, 'rewind_or_reinitialize_needed_and_possible', Mock(return_value=True)) @patch.object(Rewind, 'can_rewind', PropertyMock(return_value=True)) @@ -516,27 +516,27 @@ class TestHa(PostgresInit): self.ha.fetch_node_status = get_node_status() self.ha.has_lock = true self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', '', None)) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', self.p.name, None)) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', 'blabla', None)) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') f = Failover(0, self.p.name, '', None) self.ha.cluster = get_cluster_initialized_with_leader(f) self.assertEqual(self.ha.run_cycle(), 'manual failover: demoting myself') self.ha._rewind.rewind_or_reinitialize_needed_and_possible = true self.assertEqual(self.ha.run_cycle(), 'manual failover: demoting myself') self.ha.fetch_node_status = get_node_status(nofailover=True) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') self.ha.fetch_node_status = get_node_status(watchdog_failed=True) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') self.ha.fetch_node_status = get_node_status(timeline=1) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') self.ha.fetch_node_status = get_node_status(wal_position=1) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') # manual failover from the previous leader to us won't happen if we hold the nofailover flag self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, None)) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') # Failover scheduled time must include timezone scheduled = datetime.datetime.now() @@ -545,28 +545,28 @@ class TestHa(PostgresInit): scheduled = datetime.datetime.utcnow().replace(tzinfo=tzutc) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) - self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) + self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=30) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) - self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) + self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle()) scheduled = scheduled + datetime.timedelta(seconds=-600) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) - self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) + self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle()) scheduled = None self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) - self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) + self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle()) def test_manual_failover_from_leader_in_pause(self): self.ha.has_lock = true self.ha.is_paused = true scheduled = datetime.datetime.now() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, 'blabla', self.p.name, scheduled)) - self.assertEqual('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) + self.assertEqual('PAUSE: no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, '', None)) - self.assertEqual('PAUSE: no action. i am the leader with the lock', self.ha.run_cycle()) + self.assertEqual('PAUSE: no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle()) def test_manual_failover_from_leader_in_synchronous_mode(self): self.p.is_leader = true @@ -575,7 +575,7 @@ class TestHa(PostgresInit): self.ha.is_failover_possible = false self.ha.process_sync_replication = Mock() self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, None)) - self.assertEqual('no action. i am the leader with the lock', self.ha.run_cycle()) + self.assertEqual('no action. I am (postgresql0) the leader with the lock', self.ha.run_cycle()) self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, self.p.name, 'a', None), (self.p.name, 'a')) self.ha.is_failover_possible = true self.assertEqual('manual failover: demoting myself', self.ha.run_cycle()) @@ -736,7 +736,7 @@ class TestHa(PostgresInit): self.p.config.check_recovery_conf = Mock(return_value=(False, False)) self.ha._leader_timeline = 1 self.assertEqual(self.ha.run_cycle(), 'promoted self to a standby leader because i had the session lock') - self.assertEqual(self.ha.run_cycle(), 'no action. i am the standby leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (leader) the standby leader with the lock') self.p.set_role('replica') self.p.config.check_recovery_conf = Mock(return_value=(True, False)) self.assertEqual(self.ha.run_cycle(), 'promoted self to a standby leader because i had the session lock') @@ -745,7 +745,8 @@ class TestHa(PostgresInit): self.p.is_leader = false self.p.name = 'replica' self.ha.cluster = get_standby_cluster_initialized_with_only_leader() - self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a standby leader') + self.assertEqual(self.ha.run_cycle(), + 'no action. I am a secondary (replica) and following a standby leader (leader)') with patch.object(Leader, 'conn_url', PropertyMock(return_value='')): self.assertEqual(self.ha.run_cycle(), 'continue following the old known standby leader') @@ -838,7 +839,8 @@ class TestHa(PostgresInit): self.ha.has_lock = false self.p.is_leader = false - self.assertEqual(self.ha.run_cycle(), 'no action. i am a secondary and i am following a leader') + self.assertEqual(self.ha.run_cycle(), + 'no action. I am a secondary (postgresql0) and following a leader (leader)') check_calls([(update_lock, False), (demote, False)]) def test_manual_failover_while_starting(self): @@ -1063,7 +1065,7 @@ class TestHa(PostgresInit): self.ha.cluster.config.data.clear() self.ha.has_lock = true self.ha.cluster.is_unlocked = false - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') def test_watch(self): self.ha.cluster = get_cluster_initialized_with_leader() @@ -1098,7 +1100,7 @@ class TestHa(PostgresInit): self.ha.cluster.is_unlocked = false for tl in (1, 3): self.p.get_master_timeline = Mock(return_value=tl) - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') @patch('sys.exit', return_value=1) def test_abort_join(self, exit_mock): @@ -1111,9 +1113,9 @@ class TestHa(PostgresInit): self.ha.has_lock = true self.ha.cluster.is_unlocked = false self.ha.is_paused = true - self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'PAUSE: no action. I am (postgresql0) the leader with the lock') self.ha.is_paused = false - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0) the leader with the lock') @patch('psycopg2.connect', psycopg2_connect) def test_permanent_logical_slots_after_promote(self): @@ -1123,7 +1125,7 @@ class TestHa(PostgresInit): self.assertEqual(self.ha.run_cycle(), 'acquired session lock as a leader') self.ha.cluster = get_cluster_initialized_without_leader(leader=True, cluster_config=config) self.ha.has_lock = true - self.assertEqual(self.ha.run_cycle(), 'no action. i am the leader with the lock') + self.assertEqual(self.ha.run_cycle(), 'no action. I am (other) the leader with the lock') @patch.object(Cluster, 'has_member', true) def test_run_cycle(self): diff --git a/tests/test_log.py b/tests/test_log.py index 96dc0a3f..e5482c67 100644 --- a/tests/test_log.py +++ b/tests/test_log.py @@ -63,3 +63,12 @@ class TestPatroniLogger(unittest.TestCase): self.assertRaises(Exception, logger.shutdown) self.assertLessEqual(logger.queue_size, 2) # "Failed to close the old log handler" could be still in the queue self.assertEqual(logger.records_lost, 0) + + def test_interceptor(self): + logger = PatroniLogger() + logger.reload_config({'level': 'INFO'}) + logger.start() + _LOG.info('Lock owner: ') + _LOG.info('blabla') + logger.shutdown() + self.assertEqual(logger.records_lost, 0)