Do crash recovery only when we sure that postgres was running as master (#707)

pg_controldata reports in this case:
* 'in production'
* 'shutting down'
* 'in crash recovery'
This commit is contained in:
Alexander Kukushkin
2018-06-12 14:09:09 +02:00
committed by GitHub
parent 2537147810
commit aadd39b0a4
2 changed files with 6 additions and 25 deletions

View File

@@ -201,11 +201,6 @@ class Ha(object):
self._async_executor.run_async(self.state_handler.rewind, (self.cluster.leader,))
return True
def _start_crash_recovery(self, msg):
self._async_executor.schedule(msg)
self._async_executor.run_async(self.state_handler.fix_cluster_state)
return msg
def recover(self):
# Postgres is not running and we will restart in standby mode. Watchdog is not needed until we promote.
self.watchdog.disable()
@@ -226,10 +221,12 @@ class Ha(object):
timeout = None
data = self.state_handler.controldata()
if data.get('Database cluster state') == 'in production' and not self._crash_recovery_executed and \
(self.cluster.is_unlocked() or self.state_handler.can_rewind):
if data.get('Database cluster state') in ('in production', 'shutting down', 'in crash recovery') and \
not self._crash_recovery_executed and (self.cluster.is_unlocked() or self.state_handler.can_rewind):
self._crash_recovery_executed = True
return self._start_crash_recovery('doing crash recovery in a single user mode')
self._async_executor.schedule('doing crash recovery in a single user mode')
self._async_executor.run_async(self.state_handler.fix_cluster_state)
return self._async_executor.scheduled_action
self.load_cluster_from_dcs()
@@ -244,13 +241,6 @@ class Ha(object):
msg = "starting as a secondary"
node_to_follow = self._get_node_to_follow(self.cluster)
# once we already tried to start postgres but failed, single user mode is a rescue in this case
if self.recovering and not self.state_handler.rewind_executed \
and not self._crash_recovery_executed and self.state_handler.can_rewind \
and data.get('Database cluster state') not in ('shut down', 'shut down in recovery'):
self.recovering = False
return self._start_crash_recovery('fixing cluster state in a single user mode')
self.recovering = True
self._async_executor.schedule('restarting after failure')

View File

@@ -15,9 +15,9 @@ from patroni.utils import tzutc
from test_etcd import socket_getaddrinfo, etcd_read, etcd_write, requests_get
from test_postgresql import psycopg2_connect, MockPostmaster
SYSID = '12345678901'
def true(*args, **kwargs):
return True
@@ -207,15 +207,6 @@ class TestHa(unittest.TestCase):
self.ha.cluster = get_cluster_initialized_with_leader()
self.assertEquals(self.ha.run_cycle(), 'running pg_rewind from leader')
@patch.object(Postgresql, 'can_rewind', PropertyMock(return_value=True))
@patch.object(Postgresql, 'fix_cluster_state', Mock())
def test_single_user_after_recover_failed(self):
self.p.controldata = lambda: {'Database cluster state': 'in recovery', 'Database system identifier': SYSID}
self.p.is_running = false
self.p.follow = false
self.assertEquals(self.ha.run_cycle(), 'starting as a secondary')
self.assertEquals(self.ha.run_cycle(), 'fixing cluster state in a single user mode')
@patch('sys.exit', return_value=1)
@patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True))
def test_sysid_no_match(self, exit_mock):