Reset failsafe state on promote (#2803)

Consider the following scenario (with failsafe_mode enabled): 0. node1 (primary) - node2 (replica). 1. Stop all etcd nodes; wait ttl seconds; start all etcd nodes (node2's failsafe state will contain the info about node1). 2. Switchover to node2 (node2's failsafe state still contains the info about node1). 3. Stop all etcd nodes; wait ttl seconds; start all etcd nodes. 4. node2 will demote because it considers node1 to be the primary. Resetting the failsafe state when running as a primary fixes the issue.
2026-01-27 10:20:10 +00:00 · 2023-08-04 19:55:56 +08:00
parent 84aac437c1
commit 783112385f
1 changed files with 13 additions and 5 deletions
--- a/patroni/ha.py
+++ b/patroni/ha.py
@@ -83,11 +83,7 @@ class Failsafe(object):
    def __init__(self, dcs: AbstractDCS) -> None:
        self._lock = RLock()
        self._dcs = dcs
-        self._last_update = 0
-        self._name = None
-        self._conn_url = None
-        self._api_url = None
-        self._slots = None
+        self._reset_state()

    def update(self, data: Dict[str, Any]) -> None:
        with self._lock:
@@ -97,6 +93,13 @@ class Failsafe(object):
            self._api_url = data['api_url']
            self._slots = data.get('slots')

+    def _reset_state(self) -> None:
+        self._last_update = 0
+        self._name = None
+        self._conn_url = None
+        self._api_url = None
+        self._slots = None
+
    @property
    def leader(self) -> Optional[Leader]:
        with self._lock:
@@ -130,6 +133,8 @@ class Failsafe(object):
    def set_is_active(self, value: float) -> None:
        with self._lock:
            self._last_update = value
+            if not value:
+                self._reset_state()


 class Ha(object):
@@ -777,6 +782,9 @@ class Ha(object):
                self.state_handler.sync_handler.set_synchronous_standby_names(
                    CaseInsensitiveSet('*') if self.global_config.is_synchronous_mode_strict else CaseInsensitiveSet())
            if self.state_handler.role not in ('master', 'promoted', 'primary'):
+                # reset failsafe state when promoting
+                self._failsafe.set_is_active(0)
+
                def before_promote():
                    self.notify_citus_coordinator('before_promote')