Improve behave tests (#1313)

Hopefully, make them less flaky
2026-01-27 10:20:10 +00:00 · 2019-12-02 10:33:44 +01:00
parent a3be2958a7
commit a5ff38a034
6 changed files with 36 additions and 17 deletions
--- a/features/basic_replication.feature
+++ b/features/basic_replication.feature
@@ -4,6 +4,7 @@ Feature: basic replication
  Scenario: check replication of a single table
    Given I start postgres0
    Then postgres0 is a leader after 10 seconds
+    And there is a non empty initialize key in DCS after 15 seconds
    When I issue a PATCH request to http://127.0.0.1:8008/config with {"ttl": 20, "loop_wait": 2, "synchronous_mode": true}
    Then I receive a response code 200
    When I start postgres1
@@ -35,10 +36,12 @@ Feature: basic replication
    And I run patronictl.py resume batman
    Then I receive a response returncode 0
    And postgres2 role is the primary after 24 seconds
+    And Response on GET http://127.0.0.1:8010/history contains recovery after 10 seconds
    When I issue a PATCH request to http://127.0.0.1:8010/config with {"synchronous_mode": null, "master_start_timeout": 0}
    Then I receive a response code 200
    When I add the table bar to postgres2
    Then table bar is present on postgres1 after 20 seconds
+    And Response on GET http://127.0.0.1:8010/config contains master_start_timeout after 10 seconds

  Scenario: check immediate failover when master_start_timeout=0
    Given I kill postmaster on postgres2
--- a/features/custom_bootstrap.feature
+++ b/features/custom_bootstrap.feature
@@ -13,5 +13,5 @@ Scenario: make a backup and do a restore into a new cluster
    Given I add the table bar to postgres1
    And I do a backup of postgres1
    When I start postgres2 in a cluster batman2 from backup
-    Then postgres2 is a leader of batman2 after 10 seconds
+    Then postgres2 is a leader of batman2 after 30 seconds
    And table bar is present on postgres2 after 10 seconds
--- a/features/environment.py
+++ b/features/environment.py
@@ -454,8 +454,9 @@ class KubernetesController(AbstractDcsController):
            return (pod.metadata.annotations or {}).get('status', '')
        else:
            try:
-                e = self._api.read_namespaced_endpoints(scope + ('' if key == 'leader' else '-' + key), self._namespace)
-                if key == 'leader':
+                ep = scope + {'leader': '', 'history': '-config', 'initialize': '-config'}.get(key, '-' + key)
+                e = self._api.read_namespaced_endpoints(ep, self._namespace)
+                if key != 'sync':
                    return e.metadata.annotations[key]
                else:
                    return json.dumps(e.metadata.annotations)
--- a/features/patroni_api.feature
+++ b/features/patroni_api.feature
@@ -43,6 +43,20 @@ Scenario: check dynamic configuration change via DCS
 	When I issue a GET request to http://127.0.0.1:8008/patroni
 	Then I receive a response code 200
 	And I receive a response tags {'new_tag': 'new_value'}
+	And I sleep for 4 seconds
+
+Scenario: check the scheduled restart
+	Given I issue a PATCH request to http://127.0.0.1:8008/config with {"postgresql": {"parameters": {"superuser_reserved_connections": "6"}}}
+	Then I receive a response code 200
+		And Response on GET http://127.0.0.1:8008/patroni contains pending_restart after 5 seconds
+	Given I issue a scheduled restart at http://127.0.0.1:8008 in 3 seconds with {"role": "replica"}
+	Then I receive a response code 202
+		And I sleep for 4 seconds
+		And Response on GET http://127.0.0.1:8008/patroni contains pending_restart after 10 seconds
+	Given I issue a scheduled restart at http://127.0.0.1:8008 in 3 seconds with {"restart_pending": "True"}
+	Then I receive a response code 202
+		And Response on GET http://127.0.0.1:8008/patroni does not contain pending_restart after 10 seconds
+		And postgres0 role is the primary after 10 seconds

 Scenario: check API requests for the primary-replica pair in the pause mode
 	Given I run patronictl.py pause batman
@@ -101,16 +115,3 @@ Scenario: check the scheduled switchover
 	Then I receive a response code 503
 	When I issue a GET request to http://127.0.0.1:8009/replica
 	Then I receive a response code 200
-
-Scenario: check the scheduled restart
-	Given I issue a PATCH request to http://127.0.0.1:8008/config with {"postgresql": {"parameters": {"superuser_reserved_connections": "6"}}}
-	Then I receive a response code 200
-		And Response on GET http://127.0.0.1:8008/patroni contains pending_restart after 5 seconds
-	Given I issue a scheduled restart at http://127.0.0.1:8008 in 3 seconds with {"role": "replica"}
-	Then I receive a response code 202
-		And I sleep for 4 seconds
-		And Response on GET http://127.0.0.1:8008/patroni contains pending_restart after 10 seconds
-	Given I issue a scheduled restart at http://127.0.0.1:8008 in 3 seconds with {"restart_pending": "True"}
-	Then I receive a response code 202
-		And Response on GET http://127.0.0.1:8008/patroni does not contain pending_restart after 10 seconds
-
--- a/features/standby_cluster.feature
+++ b/features/standby_cluster.feature
@@ -2,7 +2,7 @@ Feature: standby cluster
  Scenario: check permanent logical slots are preserved on failover/switchover
    Given I start postgres1
    Then postgres1 is a leader after 10 seconds
-    And I sleep for 3 seconds
+    And there is a non empty initialize key in DCS after 15 seconds
    When I issue a PATCH request to http://127.0.0.1:8009/config with {"loop_wait": 2, "slots": {"pm_1": {"type": "physical"}}, "postgresql": {"parameters": {"wal_level": "logical"}}}
    Then I receive a response code 200
    And Response on GET http://127.0.0.1:8009/config contains slots after 10 seconds
--- a/features/steps/cascading_replication.py
+++ b/features/steps/cascading_replication.py
@@ -34,3 +34,17 @@ def check_member(context, name, key, value, time_limit):
            pass
        time.sleep(1)
    assert False, "{0} does not have {1}={2} in dcs after {3} seconds".format(name, key, value, time_limit)
+
+
+@step('there is a non empty {key:w} key in DCS after {time_limit:d} seconds')
+def check_initialize(context, key, time_limit):  # step: poll the DCS until `key` is truthy or time runs out
+    time_limit *= context.timeout_multiplier  # scale the limit by the context's multiplier
+    max_time = time.time() + int(time_limit)  # deadline; the scaled limit is truncated to whole seconds
+    while time.time() < max_time:
+        try:
+            if context.dcs_ctl.query(key):  # any truthy result counts as "non empty"
+                return
+        except Exception:  # best-effort: a failing query just means "try again"
+            pass
+        time.sleep(1)  # pause one second between attempts
+    assert False, "There is no {0} in dcs after {1} seconds".format(key, time_limit)