mirror of
https://github.com/outbackdingo/patroni.git
synced 2026-01-27 10:20:10 +00:00
Fix a couple of bugs in quorum state machine (#3278)
1. when evaluating whether there are healthy nodes for a leader race before demoting we need to take into account quorum requirements. Without it the former leader may end up in recovery surrounded by asynchronous nodes.
2. QuorumStateResolver wasn't correctly handling the case when the replica node quickly joined and disconnected, what was resulting in the following errors:
```
File "/home/akukushkin/git/patroni/patroni/quorum.py", line 427, in _generate_transitions
yield from self.__remove_gone_nodes()
File "/home/akukushkin/git/patroni/patroni/quorum.py", line 327, in __remove_gone_nodes
yield from self.sync_update(numsync, sync)
File "/home/akukushkin/git/patroni/patroni/quorum.py", line 227, in sync_update
raise QuorumError(f'Sync {numsync} > N of ({sync})')
patroni.quorum.QuorumError: Sync 2 > N of ({'postgresql2'})
2025-02-14 10:18:07,058 INFO: Unexpected exception raised, please report it as a BUG
File "/home/akukushkin/git/patroni/patroni/quorum.py", line 246, in __iter__
transitions = list(self._generate_transitions())
File "/home/akukushkin/git/patroni/patroni/quorum.py", line 423, in _generate_transitions
yield from self.__handle_non_steady_cases()
File "/home/akukushkin/git/patroni/patroni/quorum.py", line 281, in __handle_non_steady_cases
yield from self.quorum_update(len(voters) - self.numsync, voters)
File "/home/akukushkin/git/patroni/patroni/quorum.py", line 184, in quorum_update
raise QuorumError(f'Quorum {quorum} < 0 of ({voters})')
patroni.quorum.QuorumError: Quorum -1 < 0 of ({'postgresql1'})
2025-02-18 15:50:48,243 INFO: Unexpected exception raised, please report it as a BUG
```
This commit is contained in:
committed by
GitHub
parent
cf427e8b0b
commit
e9ba775959
@@ -1349,7 +1349,7 @@ class Ha(object):
|
||||
elif not candidates:
|
||||
logger.warning('%s: candidates list is empty', action)
|
||||
|
||||
ret = False
|
||||
quorum_votes = -1
|
||||
cluster_timeline = self.cluster.timeline
|
||||
for st in self.fetch_nodes_statuses(candidates):
|
||||
not_allowed_reason = st.failover_limitation()
|
||||
@@ -1362,8 +1362,10 @@ class Ha(object):
|
||||
logger.info('Timeline %s of member %s is behind the cluster timeline %s',
|
||||
st.timeline, st.member.name, cluster_timeline)
|
||||
else:
|
||||
ret = True
|
||||
return ret
|
||||
quorum_votes += 1
|
||||
|
||||
# In case of quorum replication we need to make sure that there is enough healthy synchronous replicas!
|
||||
return quorum_votes >= (self.cluster.sync.quorum if self.quorum_commit_mode_is_active() else 0)
|
||||
|
||||
def manual_failover_process_no_leader(self) -> Optional[bool]:
|
||||
"""Handles manual failover/switchover when the old leader already stepped down.
|
||||
@@ -2397,11 +2399,8 @@ class Ha(object):
|
||||
exclude = [self.state_handler.name] + ([failover.candidate] if failover and exclude_failover_candidate else [])
|
||||
|
||||
def is_eligible(node: Member) -> bool:
|
||||
# If quorum commit is requested we want to check all nodes (even not voters),
|
||||
# because they could get enough votes and reach necessary quorum + 1.
|
||||
# in synchronous mode we allow failover (not switchover!) to async node
|
||||
if self.sync_mode_is_active()\
|
||||
and not (self.is_quorum_commit_mode() or self.cluster.sync.matches(node.name))\
|
||||
if self.sync_mode_is_active() and not self.cluster.sync.matches(node.name)\
|
||||
and not (failover and not failover.leader):
|
||||
return False
|
||||
# Don't spend time on "nofailover" nodes checking.
|
||||
|
||||
@@ -188,7 +188,7 @@ class QuorumStateResolver:
|
||||
old_leader = self.leader
|
||||
if leader is not None: # Change of leader was requested
|
||||
self.leader = leader
|
||||
elif self.numsync_confirmed == 0:
|
||||
elif self.numsync_confirmed == 0 and not self.voters:
|
||||
# If there are no nodes that known to caught up with the primary we want to reset quorum/voters in /sync key
|
||||
quorum = 0
|
||||
voters = CaseInsensitiveSet()
|
||||
@@ -275,6 +275,12 @@ class QuorumStateResolver:
|
||||
logger.debug("Case 2: synchronous_standby_names %s is a superset of DCS state %s", self.sync, self.voters)
|
||||
# Case 2: sync is superset of voters nodes. In the middle of changing replication factor (sync).
|
||||
# Add to voters nodes that are already synced and active
|
||||
remove_from_sync = self.sync - self.active
|
||||
sync = CaseInsensitiveSet(self.sync - remove_from_sync)
|
||||
# If sync will not become empty after removing dead nodes - remove them.
|
||||
# However, do it carefully, between sync and voters should remain common nodes!
|
||||
if remove_from_sync and sync and (not self.voters or sync & self.voters):
|
||||
yield from self.sync_update(min(self.numsync, len(self.sync) - len(remove_from_sync)), sync)
|
||||
add_to_voters = (self.sync - self.voters) & self.active
|
||||
if add_to_voters:
|
||||
voters = CaseInsensitiveSet(self.voters | add_to_voters)
|
||||
@@ -323,7 +329,7 @@ class QuorumStateResolver:
|
||||
remove = CaseInsensitiveSet(sorted(to_remove, reverse=True)[:can_reduce_quorum_by])
|
||||
sync = CaseInsensitiveSet(self.sync - remove)
|
||||
# when removing nodes from sync we can safely increase numsync if requested
|
||||
numsync = min(self.sync_wanted, len(sync)) if self.sync_wanted > self.numsync else self.numsync
|
||||
numsync = min(self.sync_wanted if self.sync_wanted > self.numsync else self.numsync, len(sync))
|
||||
yield from self.sync_update(numsync, sync)
|
||||
voters = CaseInsensitiveSet(self.voters - remove)
|
||||
to_remove &= self.sync
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import itertools
|
||||
import unittest
|
||||
|
||||
from typing import List, Set, Tuple
|
||||
@@ -7,14 +8,23 @@ from patroni.quorum import QuorumError, QuorumStateResolver
|
||||
|
||||
class QuorumTest(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.nesting = 0
|
||||
self.failures = []
|
||||
|
||||
def tearDown(self):
|
||||
self.assertEqual(self.failures, [])
|
||||
|
||||
def check_state_transitions(self, leader: str, quorum: int, voters: Set[str], numsync: int, sync: Set[str],
|
||||
numsync_confirmed: int, active: Set[str], sync_wanted: int, leader_wanted: str,
|
||||
expected: List[Tuple[str, str, int, Set[str]]]) -> None:
|
||||
self.nesting += 1
|
||||
kwargs = {
|
||||
'leader': leader, 'quorum': quorum, 'voters': voters,
|
||||
'numsync': numsync, 'sync': sync, 'numsync_confirmed': numsync_confirmed,
|
||||
'active': active, 'sync_wanted': sync_wanted, 'leader_wanted': leader_wanted
|
||||
}
|
||||
state = {k: v for k, v in kwargs.items()}
|
||||
result = list(QuorumStateResolver(**kwargs))
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
@@ -23,9 +33,64 @@ class QuorumTest(unittest.TestCase):
|
||||
if result[0][0] == 'sync':
|
||||
kwargs.update(numsync=result[0][2], sync=result[0][3])
|
||||
else:
|
||||
kwargs.update(leader=result[0][1], quorum=result[0][2], voters=result[0][3])
|
||||
kwargs.update(quorum=result[0][2], voters=result[0][3])
|
||||
kwargs['expected'] = expected[1:]
|
||||
self.check_state_transitions(**kwargs)
|
||||
self.nesting -= 1
|
||||
|
||||
# Now we want to automatically check interruped states, emulating situation when new nodes joined and gone away.
|
||||
# We are not going to compare exact state transitions, but rather check that they will not produce exceptions.
|
||||
if self.nesting > 0 or not active > sync:
|
||||
return
|
||||
|
||||
for e in expected:
|
||||
if e[0] == 'restart' or e[1] != state['leader']:
|
||||
return
|
||||
if e[0] == 'sync':
|
||||
state.update(numsync=e[2], sync=e[3])
|
||||
else:
|
||||
state.update(quorum=e[2], voters=e[3])
|
||||
safety_margin = state['quorum'] + state['numsync'] - len(state['voters'] | state['sync'])
|
||||
|
||||
# we are only interested in non-steady cases, when quorum is higher than required by numsync
|
||||
if safety_margin == 0:
|
||||
return
|
||||
|
||||
# prepare initial state
|
||||
state = {k: v for k, v in kwargs.items() if k != 'expected'}
|
||||
|
||||
def combinations(a):
|
||||
for r in range(0, len(a) + 1):
|
||||
for c in itertools.combinations(a, r):
|
||||
yield set(c)
|
||||
|
||||
for e in expected:
|
||||
if e[0] == 'sync':
|
||||
state.update(numsync=e[2], sync=e[3])
|
||||
else:
|
||||
state.update(quorum=e[2], voters=e[3])
|
||||
|
||||
for a in combinations(sync):
|
||||
# we will check cases with reverting back to active being subsets of sync nodes
|
||||
state['active'] = a
|
||||
for c in range(0, len(state['active']) + 1):
|
||||
# in addition to that we want to consider cases with numsync_confirmed having different values
|
||||
state['numsync_confirmed'] = c
|
||||
try:
|
||||
result = list(QuorumStateResolver(**state))
|
||||
except Exception as e:
|
||||
self.failures.append(e)
|
||||
|
||||
# besides, we want to make a difference between voters being empty and non-empty
|
||||
if state['voters']:
|
||||
voters = state['voters']
|
||||
quorum = state['quorum']
|
||||
state.update(voters=set(), quorum=0)
|
||||
try:
|
||||
result = list(QuorumStateResolver(**state))
|
||||
except Exception as e:
|
||||
self.failures.append(e)
|
||||
state.update(voters=voters, quorum=quorum)
|
||||
|
||||
def test_1111(self):
|
||||
leader = 'a'
|
||||
|
||||
Reference in New Issue
Block a user