Reset failsafe state on promote (#2803)

Consider the following scenario (with failsafe_mode enabled):

0. node1(primary) - node2(replica)
1. stop all etcd nodes; wait ttl seconds; start all etcd nodes (node2's failsafe state will now contain the info about node1)
2. switch over to node2 (node2's failsafe state still contains the info about node1)
3. stop all etcd nodes; wait ttl seconds; start all etcd nodes;
4. node2 will demote because it still considers node1 to be the primary

Resetting failsafe state when running as a primary fixes the issue.
This commit is contained in:
ChenChangAo
2023-08-04 19:55:56 +08:00
committed by Alexander Kukushkin
parent 84aac437c1
commit 783112385f

View File

@@ -83,11 +83,7 @@ class Failsafe(object):
def __init__(self, dcs: AbstractDCS) -> None:
    """Create a failsafe tracker with an empty state.

    :param dcs: DCS object, stored on the instance as ``_dcs``.
    """
    self._lock = RLock()
    self._dcs = dcs
    # ``_reset_state()`` assigns the initial values of ``_last_update``,
    # ``_name``, ``_conn_url``, ``_api_url`` and ``_slots``; keeping those
    # assignments in one place avoids the two copies drifting apart.
    self._reset_state()
def update(self, data: Dict[str, Any]) -> None:
with self._lock:
@@ -97,6 +93,13 @@ class Failsafe(object):
self._api_url = data['api_url']
self._slots = data.get('slots')
def _reset_state(self) -> None:
self._last_update = 0
self._name = None
self._conn_url = None
self._api_url = None
self._slots = None
@property
def leader(self) -> Optional[Leader]:
with self._lock:
@@ -130,6 +133,8 @@ class Failsafe(object):
def set_is_active(self, value: float) -> None:
with self._lock:
self._last_update = value
if not value:
self._reset_state()
class Ha(object):
@@ -777,6 +782,9 @@ class Ha(object):
self.state_handler.sync_handler.set_synchronous_standby_names(
CaseInsensitiveSet('*') if self.global_config.is_synchronous_mode_strict else CaseInsensitiveSet())
if self.state_handler.role not in ('master', 'promoted', 'primary'):
# reset failsafe state when promoting
self._failsafe.set_is_active(0)
def before_promote():
self.notify_citus_coordinator('before_promote')