From 783112385fcd2d6bd490b0e28c80cdebcd8f54df Mon Sep 17 00:00:00 2001 From: ChenChangAo <86454826+ChenChangAo@users.noreply.github.com> Date: Fri, 4 Aug 2023 19:55:56 +0800 Subject: [PATCH] reset failsafe state when promote (#2803) Consider the following scenario (failsafe_mode enabled): 0. node1 (primary) - node2 (replica) 1. stop all etcd nodes; wait ttl seconds; start all etcd nodes; (node2's failsafe will contain the info about node1) 2. switchover to node2; (node2's failsafe still contains the info about node1) 3. stop all etcd nodes; wait ttl seconds; start all etcd nodes; 4. node2 will demote because it considers node1 the primary Resetting failsafe state when running as a primary fixes the issue. --- patroni/ha.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index 5603ba44..b50884f1 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -83,11 +83,7 @@ class Failsafe(object): def __init__(self, dcs: AbstractDCS) -> None: self._lock = RLock() self._dcs = dcs - self._last_update = 0 - self._name = None - self._conn_url = None - self._api_url = None - self._slots = None + self._reset_state() def update(self, data: Dict[str, Any]) -> None: with self._lock: @@ -97,6 +93,13 @@ class Failsafe(object): self._api_url = data['api_url'] self._slots = data.get('slots') + def _reset_state(self) -> None: + self._last_update = 0 + self._name = None + self._conn_url = None + self._api_url = None + self._slots = None + @property def leader(self) -> Optional[Leader]: with self._lock: @@ -130,6 +133,8 @@ class Failsafe(object): def set_is_active(self, value: float) -> None: with self._lock: self._last_update = value + if not value: + self._reset_state() class Ha(object): @@ -777,6 +782,9 @@ class Ha(object): self.state_handler.sync_handler.set_synchronous_standby_names( CaseInsensitiveSet('*') if self.global_config.is_synchronous_mode_strict else CaseInsensitiveSet()) if self.state_handler.role not in ('master', 'promoted', 
'primary'): + # reset failsafe state when promote + self._failsafe.set_is_active(0) + def before_promote(): self.notify_citus_coordinator('before_promote')