From aadd39b0a45daa9cef7119cffbed4370b90693a3 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Tue, 12 Jun 2018 14:09:09 +0200 Subject: [PATCH] Do crash recovery only when we sure that postgres was running as master (#707) pg_controldata reports in this case: * 'in production' * 'shutting down' * 'in crash recovery' --- patroni/ha.py | 20 +++++--------------- tests/test_ha.py | 11 +---------- 2 files changed, 6 insertions(+), 25 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index d8eca50a..f2bf6651 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -201,11 +201,6 @@ class Ha(object): self._async_executor.run_async(self.state_handler.rewind, (self.cluster.leader,)) return True - def _start_crash_recovery(self, msg): - self._async_executor.schedule(msg) - self._async_executor.run_async(self.state_handler.fix_cluster_state) - return msg - def recover(self): # Postgres is not running and we will restart in standby mode. Watchdog is not needed until we promote. self.watchdog.disable() @@ -226,10 +221,12 @@ class Ha(object): timeout = None data = self.state_handler.controldata() - if data.get('Database cluster state') == 'in production' and not self._crash_recovery_executed and \ - (self.cluster.is_unlocked() or self.state_handler.can_rewind): + if data.get('Database cluster state') in ('in production', 'shutting down', 'in crash recovery') and \ + not self._crash_recovery_executed and (self.cluster.is_unlocked() or self.state_handler.can_rewind): self._crash_recovery_executed = True - return self._start_crash_recovery('doing crash recovery in a single user mode') + self._async_executor.schedule('doing crash recovery in a single user mode') + self._async_executor.run_async(self.state_handler.fix_cluster_state) + return self._async_executor.scheduled_action self.load_cluster_from_dcs() @@ -244,13 +241,6 @@ class Ha(object): msg = "starting as a secondary" node_to_follow = self._get_node_to_follow(self.cluster) - # once we already tried to start postgres but failed, single user mode is a rescue in this case - if self.recovering and not self.state_handler.rewind_executed \ - and not self._crash_recovery_executed and self.state_handler.can_rewind \ - and data.get('Database cluster state') not in ('shut down', 'shut down in recovery'): - self.recovering = False - return self._start_crash_recovery('fixing cluster state in a single user mode') - self.recovering = True self._async_executor.schedule('restarting after failure') diff --git a/tests/test_ha.py b/tests/test_ha.py index 240a85f8..7bc170da 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -15,9 +15,9 @@ from patroni.utils import tzutc from test_etcd import socket_getaddrinfo, etcd_read, etcd_write, requests_get from test_postgresql import psycopg2_connect, MockPostmaster - SYSID = '12345678901' + def true(*args, **kwargs): return True @@ -207,15 +207,6 @@ class TestHa(unittest.TestCase): self.ha.cluster = get_cluster_initialized_with_leader() self.assertEquals(self.ha.run_cycle(), 'running pg_rewind from leader') - @patch.object(Postgresql, 'can_rewind', PropertyMock(return_value=True)) - @patch.object(Postgresql, 'fix_cluster_state', Mock()) - def test_single_user_after_recover_failed(self): - self.p.controldata = lambda: {'Database cluster state': 'in recovery', 'Database system identifier': SYSID} - self.p.is_running = false - self.p.follow = false - self.assertEquals(self.ha.run_cycle(), 'starting as a secondary') - self.assertEquals(self.ha.run_cycle(), 'fixing cluster state in a single user mode') - @patch('sys.exit', return_value=1) @patch('patroni.ha.Ha.sysid_valid', MagicMock(return_value=True)) def test_sysid_no_match(self, exit_mock):