Ensure watchdog configuration matches bootstrap.dcs config and log changes (#2480)

Fix an issue where Patroni configured the watchdog with default values when bootstrapping a new cluster, instead of using the configuration that was used to bootstrap the DCS.
Also log changes to the watchdog configuration, based on the calculated timeout value.

Close #2470
This commit is contained in:
Matt Baker
2022-12-13 15:59:23 +00:00
committed by GitHub
parent 92d3e1c167
commit e5027c7a13
5 changed files with 26 additions and 1 deletions

View File

@@ -15,7 +15,7 @@ def polling_loop(timeout, interval=1):
@step('I start {name:w} with watchdog')
def start_patroni_with_watchdog(context, name):
return context.pctl.start(name, custom_config={'watchdog': True})
return context.pctl.start(name, custom_config={'watchdog': True, 'bootstrap': {'dcs': {'ttl': 20}}})
@step('{name:w} watchdog has been pinged after {timeout:d} seconds')
@@ -31,6 +31,11 @@ def watchdog_was_closed(context, name):
assert context.pctl.get_watchdog(name).was_closed
@step('{name:w} watchdog has a {timeout:d} second timeout')
def watchdog_has_timeout(context, name, timeout):
    """Assert that the watchdog registered for ``name`` reports the expected timeout.

    :param context: step context object; its ``pctl`` attribute is used to look up the watchdog.
    :param name: member name captured from the step text.
    :param timeout: expected timeout, captured from the step text as an integer.
    :raises AssertionError: if the watchdog's ``timeout`` differs from ``timeout``.
    """
    watchdog = context.pctl.get_watchdog(name)
    assert watchdog.timeout == timeout
@step('I reset {name:w} watchdog state')
def watchdog_reset_pinged(context, name):
    """Reset the state of the watchdog registered for ``name``.

    :param context: step context object; its ``pctl`` attribute is used to look up the watchdog.
    :param name: member name captured from the step text.
    """
    watchdog = context.pctl.get_watchdog(name)
    watchdog.reset()

View File

@@ -6,6 +6,14 @@ Feature: watchdog
Then postgres0 is a leader after 10 seconds
And postgres0 role is the primary after 10 seconds
And postgres0 watchdog has been pinged after 10 seconds
And postgres0 watchdog has a 15 second timeout
Scenario: watchdog is reconfigured after global ttl changed
Given I run patronictl.py edit-config batman -s ttl=30 --force
Then I receive a response returncode 0
And I receive a response output "+ttl: 30"
When I sleep for 4 seconds
Then postgres0 watchdog has a 25 second timeout
Scenario: watchdog is disabled during pause
Given I run patronictl.py pause batman

View File

@@ -47,6 +47,7 @@ class Patroni(AbstractPatroniDaemon):
elif not self.config.dynamic_configuration and 'bootstrap' in self.config:
if self.config.set_dynamic_configuration(self.config['bootstrap']['dcs']):
self.dcs.reload_config(self.config)
self.watchdog.reload_config(self.config)
break
except DCSError:
logger.warning('Can not get cluster from dcs')

View File

@@ -215,6 +215,10 @@ class Watchdog(object):
self._activate()
if self.config.timeout != self.active_config.timeout:
self.impl.set_timeout(self.config.timeout)
if self.is_running:
logger.info("{0} updated with {1} second timeout, timing slack {2} seconds"
.format(self.impl.describe(), self.impl.get_timeout(), self.config.timing_slack))
self.active_config = self.config
except WatchdogError as e:
logger.error("Error while sending keepalive: %s", e)

View File

@@ -164,6 +164,13 @@ class TestWatchdog(unittest.TestCase):
watchdog.reload_config({'ttl': 60, 'loop_wait': 15, 'watchdog': {'mode': 'required'}})
watchdog.keepalive()
self.assertTrue(watchdog.is_running)
self.assertEqual(watchdog.config.timeout, 60 - 5)
watchdog.reload_config({'ttl': 60, 'loop_wait': 15, 'watchdog': {'mode': 'required', 'safety_margin': -1}})
watchdog.keepalive()
self.assertTrue(watchdog.is_running)
self.assertEqual(watchdog.config.timeout, 60 // 2)
class TestNullWatchdog(unittest.TestCase):