mirror of
https://github.com/outbackdingo/patroni.git
synced 2026-01-27 10:20:10 +00:00
Do crash recovery only when we sure that postgres was running as master (#707)
pg_controldata reports in this case: * 'in production' * 'shutting down' * 'in crash recovery'
This commit is contained in:
committed by
GitHub
parent
2537147810
commit
aadd39b0a4
@@ -201,11 +201,6 @@ class Ha(object):
|
||||
self._async_executor.run_async(self.state_handler.rewind, (self.cluster.leader,))
|
||||
return True
|
||||
|
||||
def _start_crash_recovery(self, msg):
|
||||
self._async_executor.schedule(msg)
|
||||
self._async_executor.run_async(self.state_handler.fix_cluster_state)
|
||||
return msg
|
||||
|
||||
def recover(self):
|
||||
# Postgres is not running and we will restart in standby mode. Watchdog is not needed until we promote.
|
||||
self.watchdog.disable()
|
||||
@@ -226,10 +221,12 @@ class Ha(object):
|
||||
timeout = None
|
||||
|
||||
data = self.state_handler.controldata()
|
||||
if data.get('Database cluster state') == 'in production' and not self._crash_recovery_executed and \
|
||||
(self.cluster.is_unlocked() or self.state_handler.can_rewind):
|
||||
if data.get('Database cluster state') in ('in production', 'shutting down', 'in crash recovery') and \
|
||||
not self._crash_recovery_executed and (self.cluster.is_unlocked() or self.state_handler.can_rewind):
|
||||
self._crash_recovery_executed = True
|
||||
return self._start_crash_recovery('doing crash recovery in a single user mode')
|
||||
self._async_executor.schedule('doing crash recovery in a single user mode')
|
||||
self._async_executor.run_async(self.state_handler.fix_cluster_state)
|
||||
return self._async_executor.scheduled_action
|
||||
|
||||
self.load_cluster_from_dcs()
|
||||
|
||||
@@ -244,13 +241,6 @@ class Ha(object):
|
||||
msg = "starting as a secondary"
|
||||
node_to_follow = self._get_node_to_follow(self.cluster)
|
||||
|
||||
# once we already tried to start postgres but failed, single user mode is a rescue in this case
|
||||
if self.recovering and not self.state_handler.rewind_executed \
|
||||
and not self._crash_recovery_executed and self.state_handler.can_rewind \
|
||||
and data.get('Database cluster state') not in ('shut down', 'shut down in recovery'):
|
||||
self.recovering = False
|
||||
return self._start_crash_recovery('fixing cluster state in a single user mode')
|
||||
|
||||
self.recovering = True
|
||||
|
||||
self._async_executor.schedule('restarting after failure')
|
||||
|
||||
@@ -15,9 +15,9 @@ from patroni.utils import tzutc
|
||||
from test_etcd import socket_getaddrinfo, etcd_read, etcd_write, requests_get
|
||||
from test_postgresql import psycopg2_connect, MockPostmaster
|
||||
|
||||
|
||||
SYSID = '12345678901'
|
||||
|
||||
|
||||
def true(*args, **kwargs):
|
||||
return True
|
||||
|
||||
@@ -207,15 +207,6 @@ class TestHa(unittest.TestCase):
|
||||
self.ha.cluster = get_cluster_initialized_with_leader()
|
||||
self.assertEquals(self.ha.run_cycle(), 'running pg_rewind from leader')
|
||||
|
||||
@patch.object(Postgresql, 'can_rewind', PropertyMock(return_value=True))
|
||||
@patch.object(Postgresql, 'fix_cluster_state', Mock())
|
||||
def test_single_user_after_recover_failed(self):
|
||||
self.p.controldata = lambda: {'Database cluster state': 'in recovery', 'Database system identifier': SYSID}
|
||||
self.p.is_running = false
|
||||
self.p.follow = false
|
||||
self.assertEquals(self.ha.run_cycle(), 'starting as a secondary')
|
||||
self.assertEquals(self.ha.run_cycle(), 'fixing cluster state in a single user mode')
|
||||
|
||||
@patch('sys.exit', return_value=1)
|
||||
@patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True))
|
||||
def test_sysid_no_match(self, exit_mock):
|
||||
|
||||
Reference in New Issue
Block a user