diff --git a/features/steps/watchdog.py b/features/steps/watchdog.py index 9f2b9cf0..05eba73c 100644 --- a/features/steps/watchdog.py +++ b/features/steps/watchdog.py @@ -15,7 +15,7 @@ def polling_loop(timeout, interval=1): @step('I start {name:w} with watchdog') def start_patroni_with_watchdog(context, name): - return context.pctl.start(name, custom_config={'watchdog': True}) + return context.pctl.start(name, custom_config={'watchdog': True, 'bootstrap': {'dcs': {'ttl': 20}}}) @step('{name:w} watchdog has been pinged after {timeout:d} seconds') @@ -31,6 +31,11 @@ def watchdog_was_closed(context, name): assert context.pctl.get_watchdog(name).was_closed +@step('{name:w} watchdog has a {timeout:d} second timeout') +def watchdog_has_timeout(context, name, timeout): + assert context.pctl.get_watchdog(name).timeout == timeout + + @step('I reset {name:w} watchdog state') def watchdog_reset_pinged(context, name): context.pctl.get_watchdog(name).reset() diff --git a/features/watchdog.feature b/features/watchdog.feature index 216b0c4e..cea38f02 100644 --- a/features/watchdog.feature +++ b/features/watchdog.feature @@ -6,6 +6,14 @@ Feature: watchdog Then postgres0 is a leader after 10 seconds And postgres0 role is the primary after 10 seconds And postgres0 watchdog has been pinged after 10 seconds + And postgres0 watchdog has a 15 second timeout + + Scenario: watchdog is reconfigured after global ttl changed + Given I run patronictl.py edit-config batman -s ttl=30 --force + Then I receive a response returncode 0 + And I receive a response output "+ttl: 30" + When I sleep for 4 seconds + Then postgres0 watchdog has a 25 second timeout Scenario: watchdog is disabled during pause Given I run patronictl.py pause batman diff --git a/patroni/__main__.py b/patroni/__main__.py index bf800eff..1849690e 100644 --- a/patroni/__main__.py +++ b/patroni/__main__.py @@ -47,6 +47,7 @@ class Patroni(AbstractPatroniDaemon): elif not self.config.dynamic_configuration and 'bootstrap' in self.config: if self.config.set_dynamic_configuration(self.config['bootstrap']['dcs']): self.dcs.reload_config(self.config) + self.watchdog.reload_config(self.config) break except DCSError: logger.warning('Can not get cluster from dcs') diff --git a/patroni/watchdog/base.py b/patroni/watchdog/base.py index 5c155101..51c14604 100644 --- a/patroni/watchdog/base.py +++ b/patroni/watchdog/base.py @@ -215,6 +215,10 @@ class Watchdog(object): self._activate() if self.config.timeout != self.active_config.timeout: self.impl.set_timeout(self.config.timeout) + if self.is_running: + logger.info("{0} updated with {1} second timeout, timing slack {2} seconds" + .format(self.impl.describe(), self.impl.get_timeout(), self.config.timing_slack)) + self.active_config = self.config except WatchdogError as e: logger.error("Error while sending keepalive: %s", e) diff --git a/tests/test_watchdog.py b/tests/test_watchdog.py index c326c1f3..6d68e1ea 100644 --- a/tests/test_watchdog.py +++ b/tests/test_watchdog.py @@ -164,6 +164,13 @@ class TestWatchdog(unittest.TestCase): watchdog.reload_config({'ttl': 60, 'loop_wait': 15, 'watchdog': {'mode': 'required'}}) watchdog.keepalive() + self.assertTrue(watchdog.is_running) + self.assertEqual(watchdog.config.timeout, 60 - 5) + + watchdog.reload_config({'ttl': 60, 'loop_wait': 15, 'watchdog': {'mode': 'required', 'safety_margin': -1}}) + watchdog.keepalive() + self.assertTrue(watchdog.is_running) + self.assertEqual(watchdog.config.timeout, 60 // 2) class TestNullWatchdog(unittest.TestCase):