From 91e2be092c24e161ce07a121adfcdbd5ead493ae Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Tue, 26 Sep 2023 11:14:20 +0200 Subject: [PATCH] Detect and solve inconsistency between /sync and actual sync nodes (#2877) Patroni is changing `synchronous_standby_names` and the `/sync` key in a very specific order, first we add nodes to `synchronous_standby_names` and only after, when they are recognized as synchronous they are added to the `/sync` key. When removing nodes the order is different: they are first removed from the `/sync` key and only after that from the `synchronous_standby_names`. As a result Patroni expects that either actual synchronous nodes will match with the nodes listed in the `/sync` key or that new candidates to synchronous nodes will not match with nodes listed in the `/sync` key. In case if `synchronous_standby_names` was removed from the `postgresql.conf`, manually, or due the the bug (#2876), the state becomes inconsistent because of the wrong order of updates. To solve inconsistent state we introduce additional checks and will update the `/sync` key with actual names of synchronous nodes (usually empty set). --- patroni/ha.py | 8 ++++++++ tests/test_ha.py | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/patroni/ha.py b/patroni/ha.py index 3673e775..6ebbdd97 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -654,6 +654,14 @@ class Ha(object): current = CaseInsensitiveSet(sync.members) picked, allow_promote = self.state_handler.sync_handler.current_state(self.cluster) + if picked == current and current != allow_promote: + logger.warning('Inconsistent state between synchronous_standby_names = %s and /sync = %s key ' + 'detected, updating synchronous replication key...', list(allow_promote), list(current)) + sync = self.dcs.write_sync_state(self.state_handler.name, allow_promote, version=sync.version) + if not sync: + return logger.warning("Updating sync state failed") + current = CaseInsensitiveSet(sync.members) + if picked != current: # update synchronous standby list in dcs temporarily to point to common nodes in current and picked sync_common = current & allow_promote diff --git a/tests/test_ha.py b/tests/test_ha.py index 29c4ca11..7a7ca069 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -1313,6 +1313,24 @@ class TestHa(PostgresInit): self.ha.run_cycle() self.assertEqual(mock_logger.call_args[0][0], 'Updating sync state failed') + @patch.object(Cluster, 'is_unlocked', Mock(return_value=False)) + def test_inconsistent_synchronous_state(self): + self.ha.is_synchronous_mode = true + self.ha.has_lock = true + self.p.name = 'leader' + self.ha.cluster = get_cluster_initialized_without_leader(sync=('leader', 'a')) + self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet('a'), CaseInsensitiveSet())) + self.ha.dcs.write_sync_state = Mock(return_value=SyncState.empty()) + mock_set_sync = self.p.sync_handler.set_synchronous_standby_names = Mock() + with patch('patroni.ha.logger.warning') as mock_logger: + self.ha.run_cycle() + mock_set_sync.assert_called_once() + self.assertTrue(mock_logger.call_args_list[0][0][0].startswith('Inconsistent state between ')) + self.ha.dcs.write_sync_state = Mock(return_value=None) + with patch('patroni.ha.logger.warning') as mock_logger: + self.ha.run_cycle() + self.assertEqual(mock_logger.call_args[0][0], 'Updating sync state failed') + def test_effective_tags(self): self.ha._disable_sync = True self.assertEqual(self.ha.get_effective_tags(), {'foo': 'bar', 'nosync': True})