diff --git a/patroni/ha.py b/patroni/ha.py index 31ae3f76..a2a206a7 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -1349,7 +1349,7 @@ class Ha(object): elif not candidates: logger.warning('%s: candidates list is empty', action) - ret = False + quorum_votes = -1 cluster_timeline = self.cluster.timeline for st in self.fetch_nodes_statuses(candidates): not_allowed_reason = st.failover_limitation() @@ -1362,8 +1362,10 @@ class Ha(object): logger.info('Timeline %s of member %s is behind the cluster timeline %s', st.timeline, st.member.name, cluster_timeline) else: - ret = True - return ret + quorum_votes += 1 + + # In case of quorum replication we need to make sure that there is enough healthy synchronous replicas! + return quorum_votes >= (self.cluster.sync.quorum if self.quorum_commit_mode_is_active() else 0) def manual_failover_process_no_leader(self) -> Optional[bool]: """Handles manual failover/switchover when the old leader already stepped down. @@ -2397,11 +2399,8 @@ class Ha(object): exclude = [self.state_handler.name] + ([failover.candidate] if failover and exclude_failover_candidate else []) def is_eligible(node: Member) -> bool: - # If quorum commit is requested we want to check all nodes (even not voters), - # because they could get enough votes and reach necessary quorum + 1. # in synchronous mode we allow failover (not switchover!) to async node - if self.sync_mode_is_active()\ - and not (self.is_quorum_commit_mode() or self.cluster.sync.matches(node.name))\ + if self.sync_mode_is_active() and not self.cluster.sync.matches(node.name)\ and not (failover and not failover.leader): return False # Don't spend time on "nofailover" nodes checking. diff --git a/patroni/quorum.py b/patroni/quorum.py index 059c8acc..050940ae 100644 --- a/patroni/quorum.py +++ b/patroni/quorum.py @@ -188,7 +188,7 @@ class QuorumStateResolver: old_leader = self.leader if leader is not None: # Change of leader was requested self.leader = leader - elif self.numsync_confirmed == 0: + elif self.numsync_confirmed == 0 and not self.voters: # If there are no nodes that known to caught up with the primary we want to reset quorum/voters in /sync key quorum = 0 voters = CaseInsensitiveSet() @@ -275,6 +275,12 @@ class QuorumStateResolver: logger.debug("Case 2: synchronous_standby_names %s is a superset of DCS state %s", self.sync, self.voters) # Case 2: sync is superset of voters nodes. In the middle of changing replication factor (sync). # Add to voters nodes that are already synced and active + remove_from_sync = self.sync - self.active + sync = CaseInsensitiveSet(self.sync - remove_from_sync) + # If sync will not become empty after removing dead nodes - remove them. + # However, do it carefully, between sync and voters should remain common nodes! + if remove_from_sync and sync and (not self.voters or sync & self.voters): + yield from self.sync_update(min(self.numsync, len(self.sync) - len(remove_from_sync)), sync) add_to_voters = (self.sync - self.voters) & self.active if add_to_voters: voters = CaseInsensitiveSet(self.voters | add_to_voters) @@ -323,7 +329,7 @@ class QuorumStateResolver: remove = CaseInsensitiveSet(sorted(to_remove, reverse=True)[:can_reduce_quorum_by]) sync = CaseInsensitiveSet(self.sync - remove) # when removing nodes from sync we can safely increase numsync if requested - numsync = min(self.sync_wanted, len(sync)) if self.sync_wanted > self.numsync else self.numsync + numsync = min(self.sync_wanted if self.sync_wanted > self.numsync else self.numsync, len(sync)) yield from self.sync_update(numsync, sync) voters = CaseInsensitiveSet(self.voters - remove) to_remove &= self.sync diff --git a/tests/test_quorum.py b/tests/test_quorum.py index 3e2d20d2..8eed8dc1 100644 --- a/tests/test_quorum.py +++ b/tests/test_quorum.py @@ -1,3 +1,4 @@ +import itertools import unittest from typing import List, Set, Tuple @@ -7,14 +8,23 @@ from patroni.quorum import QuorumError, QuorumStateResolver class QuorumTest(unittest.TestCase): + def setUp(self): + self.nesting = 0 + self.failures = [] + + def tearDown(self): + self.assertEqual(self.failures, []) + def check_state_transitions(self, leader: str, quorum: int, voters: Set[str], numsync: int, sync: Set[str], numsync_confirmed: int, active: Set[str], sync_wanted: int, leader_wanted: str, expected: List[Tuple[str, str, int, Set[str]]]) -> None: + self.nesting += 1 kwargs = { 'leader': leader, 'quorum': quorum, 'voters': voters, 'numsync': numsync, 'sync': sync, 'numsync_confirmed': numsync_confirmed, 'active': active, 'sync_wanted': sync_wanted, 'leader_wanted': leader_wanted } + state = {k: v for k, v in kwargs.items()} result = list(QuorumStateResolver(**kwargs)) self.assertEqual(result, expected) @@ -23,9 +33,64 @@ class QuorumTest(unittest.TestCase): if result[0][0] == 'sync': kwargs.update(numsync=result[0][2], sync=result[0][3]) else: - kwargs.update(leader=result[0][1], quorum=result[0][2], voters=result[0][3]) + kwargs.update(quorum=result[0][2], voters=result[0][3]) kwargs['expected'] = expected[1:] self.check_state_transitions(**kwargs) + self.nesting -= 1 + + # Now we want to automatically check interruped states, emulating situation when new nodes joined and gone away. + # We are not going to compare exact state transitions, but rather check that they will not produce exceptions. + if self.nesting > 0 or not active > sync: + return + + for e in expected: + if e[0] == 'restart' or e[1] != state['leader']: + return + if e[0] == 'sync': + state.update(numsync=e[2], sync=e[3]) + else: + state.update(quorum=e[2], voters=e[3]) + safety_margin = state['quorum'] + state['numsync'] - len(state['voters'] | state['sync']) + + # we are only interested in non-steady cases, when quorum is higher than required by numsync + if safety_margin == 0: + return + + # prepare initial state + state = {k: v for k, v in kwargs.items() if k != 'expected'} + + def combinations(a): + for r in range(0, len(a) + 1): + for c in itertools.combinations(a, r): + yield set(c) + + for e in expected: + if e[0] == 'sync': + state.update(numsync=e[2], sync=e[3]) + else: + state.update(quorum=e[2], voters=e[3]) + + for a in combinations(sync): + # we will check cases with reverting back to active being subsets of sync nodes + state['active'] = a + for c in range(0, len(state['active']) + 1): + # in addition to that we want to consider cases with numsync_confirmed having different values + state['numsync_confirmed'] = c + try: + result = list(QuorumStateResolver(**state)) + except Exception as e: + self.failures.append(e) + + # besides, we want to make a difference between voters being empty and non-empty + if state['voters']: + voters = state['voters'] + quorum = state['quorum'] + state.update(voters=set(), quorum=0) + try: + result = list(QuorumStateResolver(**state)) + except Exception as e: + self.failures.append(e) + state.update(voters=voters, quorum=quorum) def test_1111(self): leader = 'a'