Detect and solve inconsistency between /sync and actual sync nodes (#2877)

Patroni is changing `synchronous_standby_names` and the `/sync` key in a very specific order, first we add nodes to `synchronous_standby_names` and only after, when they are recognized as synchronous they are added to the `/sync` key. When removing nodes the order is different: they are first removed from the `/sync` key and only after that from the `synchronous_standby_names`.

As a result Patroni expects that either actual synchronous nodes will match with the nodes listed in the `/sync` key or that new candidates to synchronous nodes will not match with nodes listed in the `/sync` key. In case if `synchronous_standby_names` was removed from the `postgresql.conf`, manually, or due the the bug (#2876), the state becomes inconsistent because of the wrong order of updates.

To solve inconsistent state we introduce additional checks and will update the `/sync` key with actual names of synchronous nodes (usually empty set).
This commit is contained in:
Alexander Kukushkin
2023-09-26 11:14:20 +02:00
parent 4148e0b5b2
commit 91e2be092c
2 changed files with 26 additions and 0 deletions

View File

@@ -654,6 +654,14 @@ class Ha(object):
current = CaseInsensitiveSet(sync.members)
picked, allow_promote = self.state_handler.sync_handler.current_state(self.cluster)
if picked == current and current != allow_promote:
logger.warning('Inconsistent state between synchronous_standby_names = %s and /sync = %s key '
'detected, updating synchronous replication key...', list(allow_promote), list(current))
sync = self.dcs.write_sync_state(self.state_handler.name, allow_promote, version=sync.version)
if not sync:
return logger.warning("Updating sync state failed")
current = CaseInsensitiveSet(sync.members)
if picked != current:
# update synchronous standby list in dcs temporarily to point to common nodes in current and picked
sync_common = current & allow_promote

View File

@@ -1313,6 +1313,24 @@ class TestHa(PostgresInit):
self.ha.run_cycle()
self.assertEqual(mock_logger.call_args[0][0], 'Updating sync state failed')
@patch.object(Cluster, 'is_unlocked', Mock(return_value=False))
def test_inconsistent_synchronous_state(self):
self.ha.is_synchronous_mode = true
self.ha.has_lock = true
self.p.name = 'leader'
self.ha.cluster = get_cluster_initialized_without_leader(sync=('leader', 'a'))
self.p.sync_handler.current_state = Mock(return_value=(CaseInsensitiveSet('a'), CaseInsensitiveSet()))
self.ha.dcs.write_sync_state = Mock(return_value=SyncState.empty())
mock_set_sync = self.p.sync_handler.set_synchronous_standby_names = Mock()
with patch('patroni.ha.logger.warning') as mock_logger:
self.ha.run_cycle()
mock_set_sync.assert_called_once()
self.assertTrue(mock_logger.call_args_list[0][0][0].startswith('Inconsistent state between '))
self.ha.dcs.write_sync_state = Mock(return_value=None)
with patch('patroni.ha.logger.warning') as mock_logger:
self.ha.run_cycle()
self.assertEqual(mock_logger.call_args[0][0], 'Updating sync state failed')
def test_effective_tags(self):
self.ha._disable_sync = True
self.assertEqual(self.ha.get_effective_tags(), {'foo': 'bar', 'nosync': True})