Fix a couple of bugs in quorum state machine (#3278)

1. when evaluating whether there are healthy nodes for a leader race before demoting we need to take into account quorum requirements. Without it the former leader may end up in recovery surrounded by asynchronous nodes.
2. QuorumStateResolver wasn't correctly handling the case when the replica node quickly joined and disconnected, what was resulting in the following errors:
```
  File "/home/akukushkin/git/patroni/patroni/quorum.py", line 427, in _generate_transitions
    yield from self.__remove_gone_nodes()
  File "/home/akukushkin/git/patroni/patroni/quorum.py", line 327, in __remove_gone_nodes
    yield from self.sync_update(numsync, sync)
  File "/home/akukushkin/git/patroni/patroni/quorum.py", line 227, in sync_update
    raise QuorumError(f'Sync {numsync} > N of ({sync})')
patroni.quorum.QuorumError: Sync 2 > N of ({'postgresql2'})
2025-02-14 10:18:07,058 INFO: Unexpected exception raised, please report it as a BUG

  File "/home/akukushkin/git/patroni/patroni/quorum.py", line 246, in __iter__
    transitions = list(self._generate_transitions())
  File "/home/akukushkin/git/patroni/patroni/quorum.py", line 423, in _generate_transitions
    yield from self.__handle_non_steady_cases()
  File "/home/akukushkin/git/patroni/patroni/quorum.py", line 281, in __handle_non_steady_cases
    yield from self.quorum_update(len(voters) - self.numsync, voters)
  File "/home/akukushkin/git/patroni/patroni/quorum.py", line 184, in quorum_update
    raise QuorumError(f'Quorum {quorum} < 0 of ({voters})')
patroni.quorum.QuorumError: Quorum -1 < 0 of ({'postgresql1'})
2025-02-18 15:50:48,243 INFO: Unexpected exception raised, please report it as a BUG
```
This commit is contained in:
Alexander Kukushkin
2025-02-20 11:00:22 +01:00
committed by GitHub
parent cf427e8b0b
commit e9ba775959
3 changed files with 80 additions and 10 deletions

View File

@@ -1349,7 +1349,7 @@ class Ha(object):
elif not candidates:
logger.warning('%s: candidates list is empty', action)
ret = False
quorum_votes = -1
cluster_timeline = self.cluster.timeline
for st in self.fetch_nodes_statuses(candidates):
not_allowed_reason = st.failover_limitation()
@@ -1362,8 +1362,10 @@ class Ha(object):
logger.info('Timeline %s of member %s is behind the cluster timeline %s',
st.timeline, st.member.name, cluster_timeline)
else:
ret = True
return ret
quorum_votes += 1
# In case of quorum replication we need to make sure that there is enough healthy synchronous replicas!
return quorum_votes >= (self.cluster.sync.quorum if self.quorum_commit_mode_is_active() else 0)
def manual_failover_process_no_leader(self) -> Optional[bool]:
"""Handles manual failover/switchover when the old leader already stepped down.
@@ -2397,11 +2399,8 @@ class Ha(object):
exclude = [self.state_handler.name] + ([failover.candidate] if failover and exclude_failover_candidate else [])
def is_eligible(node: Member) -> bool:
# If quorum commit is requested we want to check all nodes (even not voters),
# because they could get enough votes and reach necessary quorum + 1.
# in synchronous mode we allow failover (not switchover!) to async node
if self.sync_mode_is_active()\
and not (self.is_quorum_commit_mode() or self.cluster.sync.matches(node.name))\
if self.sync_mode_is_active() and not self.cluster.sync.matches(node.name)\
and not (failover and not failover.leader):
return False
# Don't spend time on "nofailover" nodes checking.

View File

@@ -188,7 +188,7 @@ class QuorumStateResolver:
old_leader = self.leader
if leader is not None: # Change of leader was requested
self.leader = leader
elif self.numsync_confirmed == 0:
elif self.numsync_confirmed == 0 and not self.voters:
# If there are no nodes that known to caught up with the primary we want to reset quorum/voters in /sync key
quorum = 0
voters = CaseInsensitiveSet()
@@ -275,6 +275,12 @@ class QuorumStateResolver:
logger.debug("Case 2: synchronous_standby_names %s is a superset of DCS state %s", self.sync, self.voters)
# Case 2: sync is superset of voters nodes. In the middle of changing replication factor (sync).
# Add to voters nodes that are already synced and active
remove_from_sync = self.sync - self.active
sync = CaseInsensitiveSet(self.sync - remove_from_sync)
# If sync will not become empty after removing dead nodes - remove them.
# However, do it carefully, between sync and voters should remain common nodes!
if remove_from_sync and sync and (not self.voters or sync & self.voters):
yield from self.sync_update(min(self.numsync, len(self.sync) - len(remove_from_sync)), sync)
add_to_voters = (self.sync - self.voters) & self.active
if add_to_voters:
voters = CaseInsensitiveSet(self.voters | add_to_voters)
@@ -323,7 +329,7 @@ class QuorumStateResolver:
remove = CaseInsensitiveSet(sorted(to_remove, reverse=True)[:can_reduce_quorum_by])
sync = CaseInsensitiveSet(self.sync - remove)
# when removing nodes from sync we can safely increase numsync if requested
numsync = min(self.sync_wanted, len(sync)) if self.sync_wanted > self.numsync else self.numsync
numsync = min(self.sync_wanted if self.sync_wanted > self.numsync else self.numsync, len(sync))
yield from self.sync_update(numsync, sync)
voters = CaseInsensitiveSet(self.voters - remove)
to_remove &= self.sync

View File

@@ -1,3 +1,4 @@
import itertools
import unittest
from typing import List, Set, Tuple
@@ -7,14 +8,23 @@ from patroni.quorum import QuorumError, QuorumStateResolver
class QuorumTest(unittest.TestCase):
def setUp(self):
self.nesting = 0
self.failures = []
def tearDown(self):
self.assertEqual(self.failures, [])
def check_state_transitions(self, leader: str, quorum: int, voters: Set[str], numsync: int, sync: Set[str],
numsync_confirmed: int, active: Set[str], sync_wanted: int, leader_wanted: str,
expected: List[Tuple[str, str, int, Set[str]]]) -> None:
self.nesting += 1
kwargs = {
'leader': leader, 'quorum': quorum, 'voters': voters,
'numsync': numsync, 'sync': sync, 'numsync_confirmed': numsync_confirmed,
'active': active, 'sync_wanted': sync_wanted, 'leader_wanted': leader_wanted
}
state = {k: v for k, v in kwargs.items()}
result = list(QuorumStateResolver(**kwargs))
self.assertEqual(result, expected)
@@ -23,9 +33,64 @@ class QuorumTest(unittest.TestCase):
if result[0][0] == 'sync':
kwargs.update(numsync=result[0][2], sync=result[0][3])
else:
kwargs.update(leader=result[0][1], quorum=result[0][2], voters=result[0][3])
kwargs.update(quorum=result[0][2], voters=result[0][3])
kwargs['expected'] = expected[1:]
self.check_state_transitions(**kwargs)
self.nesting -= 1
# Now we want to automatically check interruped states, emulating situation when new nodes joined and gone away.
# We are not going to compare exact state transitions, but rather check that they will not produce exceptions.
if self.nesting > 0 or not active > sync:
return
for e in expected:
if e[0] == 'restart' or e[1] != state['leader']:
return
if e[0] == 'sync':
state.update(numsync=e[2], sync=e[3])
else:
state.update(quorum=e[2], voters=e[3])
safety_margin = state['quorum'] + state['numsync'] - len(state['voters'] | state['sync'])
# we are only interested in non-steady cases, when quorum is higher than required by numsync
if safety_margin == 0:
return
# prepare initial state
state = {k: v for k, v in kwargs.items() if k != 'expected'}
def combinations(a):
for r in range(0, len(a) + 1):
for c in itertools.combinations(a, r):
yield set(c)
for e in expected:
if e[0] == 'sync':
state.update(numsync=e[2], sync=e[3])
else:
state.update(quorum=e[2], voters=e[3])
for a in combinations(sync):
# we will check cases with reverting back to active being subsets of sync nodes
state['active'] = a
for c in range(0, len(state['active']) + 1):
# in addition to that we want to consider cases with numsync_confirmed having different values
state['numsync_confirmed'] = c
try:
result = list(QuorumStateResolver(**state))
except Exception as e:
self.failures.append(e)
# besides, we want to make a difference between voters being empty and non-empty
if state['voters']:
voters = state['voters']
quorum = state['quorum']
state.update(voters=set(), quorum=0)
try:
result = list(QuorumStateResolver(**state))
except Exception as e:
self.failures.append(e)
state.update(voters=voters, quorum=quorum)
def test_1111(self):
leader = 'a'