From fc962121d775ac05989724c2fdbfdf396f65313e Mon Sep 17 00:00:00 2001 From: ksarabu1 <62157128+ksarabu1@users.noreply.github.com> Date: Fri, 17 Apr 2020 01:49:55 -0400 Subject: [PATCH 01/40] Fix reinitialize with user-defined tablespaces (#1494) During reinit, Patroni was removing PGDATA but leaving user-defined tablespace directories behind, which caused reinit to loop. --- patroni/postgresql/__init__.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 8e2d017f..1c64e3cd 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -17,7 +17,7 @@ from patroni.postgresql.misc import parse_history, postgres_major_version_to_int from patroni.postgresql.postmaster import PostmasterProcess from patroni.postgresql.slots import SlotsHandler from patroni.exceptions import PostgresConnectionException -from patroni.utils import Retry, RetryFailedError, polling_loop, data_directory_is_empty +from patroni.utils import Retry, RetryFailedError, polling_loop, data_directory_is_empty, parse_int from threading import current_thread, Lock from psutil import TimeoutExpired @@ -821,6 +821,15 @@ class Postgresql(object): pg_wal_realpath = os.path.realpath(pg_wal_path) logger.info('Removing WAL directory: %s', pg_wal_realpath) shutil.rmtree(pg_wal_realpath) + # Remove user-defined tablespace directories + pg_tblsp_dir = os.path.join(self._data_dir, 'pg_tblspc') + if os.path.exists(pg_tblsp_dir): + for tsdn in os.listdir(pg_tblsp_dir): + pg_tsp_path = os.path.join(pg_tblsp_dir, tsdn) + if parse_int(tsdn) and os.path.islink(pg_tsp_path): + pg_tsp_rpath = os.path.realpath(pg_tsp_path) + logger.info('Removing user defined tablespace directory: %s', pg_tsp_rpath) + shutil.rmtree(pg_tsp_rpath, ignore_errors=True) shutil.rmtree(self._data_dir) except (IOError, OSError): From 5fa912f8fafeccb124ea0b99834b0a8bfbd02da3 Mon Sep 17 00:00:00 2001 From: ksarabu1 <62157128+ksarabu1@users.noreply.github.com> Date: Fri, 17 Apr 2020 10:27:38 -0400 Subject: [PATCH 02/40] Make max history timelines in DCS configurable (#1491) Close https://github.com/zalando/patroni/issues/1487 --- docs/SETTINGS.rst | 1 + docs/dynamic_configuration.rst | 1 + patroni/dcs/__init__.py | 4 ++++ patroni/ha.py | 1 + 4 files changed, 7 insertions(+) diff --git a/docs/SETTINGS.rst b/docs/SETTINGS.rst index 7a2ed244..74a5e2b7 100644 --- a/docs/SETTINGS.rst +++ b/docs/SETTINGS.rst @@ -15,6 +15,7 @@ Dynamic configuration is stored in the DCS (Distributed Configuration Store) and - **ttl**: the TTL to acquire the leader lock (in seconds). Think of it as the length of time before initiation of the automatic failover process. Default value: 30 - **retry\_timeout**: timeout for DCS and PostgreSQL operation retries (in seconds). DCS or network issues shorter than this will not cause Patroni to demote the leader. Default value: 10 - **maximum\_lag\_on\_failover**: the maximum bytes a follower may lag to be able to participate in leader election. +- **max\_timelines\_history**: maximum number of timeline history items kept in DCS. Default value: 0. When set to 0, it keeps the full history in DCS. - **master\_start\_timeout**: the amount of time a master is allowed to recover from failures before failover is triggered (in seconds). Default is 300 seconds. When set to 0 failover is done immediately after a crash is detected if possible. When using asynchronous replication a failover can cause lost transactions.
Worst case failover time for master failure is: loop\_wait + master\_start\_timeout + loop\_wait, unless master\_start\_timeout is zero, in which case it's just loop\_wait. Set the value according to your durability/availability tradeoff. - **master\_stop\_timeout**: The number of seconds Patroni is allowed to wait when stopping Postgres; effective only when synchronous_mode is enabled. When set to > 0 and synchronous_mode is enabled, Patroni sends SIGKILL to the postmaster if the stop operation runs for longer than the value set by master_stop_timeout. Set the value according to your durability/availability tradeoff. If the parameter is not set or set <= 0, master_stop_timeout does not apply. - **synchronous\_mode**: turns on synchronous replication mode. In this mode a replica will be chosen as synchronous and only the latest leader and synchronous replica are able to participate in leader election. Synchronous mode makes sure that successfully committed transactions will not be lost at failover, at the cost of losing availability for writes when Patroni cannot ensure transaction durability. See :ref:`replication modes documentation ` for details. diff --git a/docs/dynamic_configuration.rst b/docs/dynamic_configuration.rst index 85c7cb1d..6f350b18 100644 --- a/docs/dynamic_configuration.rst +++ b/docs/dynamic_configuration.rst @@ -76,6 +76,7 @@ Also, the following Patroni configuration options can be changed only dynamicall - loop_wait: 10 - retry_timeouts: 10 - maximum_lag_on_failover: 1048576 +- max_timelines_history: 0 - check_timeline: false - postgresql.use_slots: true diff --git a/patroni/dcs/__init__.py b/patroni/dcs/__init__.py index 946a65e5..193b7fe1 100644 --- a/patroni/dcs/__init__.py +++ b/patroni/dcs/__init__.py @@ -341,6 +341,10 @@ class ClusterConfig(namedtuple('ClusterConfig', 'index,data,modify_index')): self.data.get('permanent_slots') or self.data.get('slots') ) or {} + @property + def max_timelines_history(self): + return self.data.get('max_timelines_history', 0) + class SyncState(namedtuple('SyncState', 'index,leader,sync_standby')): """Immutable object (namedtuple) which represents last observed synchronous replication state diff --git a/patroni/ha.py b/patroni/ha.py index bdc943e4..4ef19fe4 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -526,6 +526,7 @@ class Ha(object): cluster_history = {l[0]: l for l in cluster_history or []} history = self.state_handler.get_history(master_timeline) if history: + history = history[-self.cluster.config.max_timelines_history:] for line in history: # enrich current history with promotion timestamps stored in DCS if len(line) == 3 and line[0] in cluster_history \ From 80fbe900562bbe73d17e48d6a7f2eec53a4ba2a4 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Mon, 20 Apr 2020 11:55:05 +0200 Subject: [PATCH 03/40] Issue CHECKPOINT explicitly after promote happened (#1498) It is safe to call pg_rewind on the replica only when pg_control on the primary contains information about the latest timeline. Postgres usually performs an immediate checkpoint right after promote, and in most cases this works just fine. Unfortunately, we regularly receive complaints that it takes too long (minutes) until the checkpoint is done and replicas can't perform a rewind, while issuing the checkpoint manually helps immediately. So Patroni now does the same: once the promotion has happened and postgres is no longer running in recovery, we explicitly issue a CHECKPOINT.
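The mechanics can be illustrated with a minimal sketch (the helper names below are hypothetical, not Patroni's actual API):

```python
# Hypothetical sketch: run CHECKPOINT from a one-off background thread so the
# HA loop is never blocked; the outcome is inspected on a later loop iteration.
from threading import Thread

def start_checkpoint(execute_sql, task):
    def worker():
        try:
            execute_sql('CHECKPOINT')  # persists the new timeline into pg_control
            task['ok'] = True
        except Exception:
            task['ok'] = False
        task['done'] = True
    Thread(target=worker, daemon=True).start()
```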
We are intentionally not using the AsyncExecutor here, because we want the HA loop to continue its normal flow. --- patroni/ha.py | 2 +- patroni/postgresql/rewind.py | 42 ++++++++++++++++++++++++++++++------ tests/test_ha.py | 1 + tests/test_rewind.py | 31 +++++++++++++++++++++++++--- 4 files changed, 65 insertions(+), 11 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index 4ef19fe4..5deb2d80 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -945,7 +945,7 @@ class Ha(object): return msg # check if the node is ready to be used by pg_rewind - self._rewind.check_for_checkpoint_after_promote() + self._rewind.ensure_checkpoint_after_promote() if self.is_standby_cluster(): # in case of standby cluster we don't really need to diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index 091d1f93..9dd30c7a 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -2,9 +2,12 @@ import logging import os import subprocess -from patroni.dcs import Leader -from patroni.postgresql.connection import get_connection_cursor -from patroni.postgresql.misc import parse_history, parse_lsn +from threading import Lock, Thread + +from .connection import get_connection_cursor +from .misc import parse_history, parse_lsn +from ..async_executor import CriticalTask +from ..dcs import Leader logger = logging.getLogger(__name__) @@ -16,6 +19,7 @@ class Rewind(object): def __init__(self, postgresql): self._postgresql = postgresql + self._checkpoint_task_lock = Lock() self.reset_state() @staticmethod @@ -131,10 +135,32 @@ class Rewind(object): self._check_timeline_and_lsn(leader) return leader and leader.conn_url and self._state == REWIND_STATUS.NEED - def check_for_checkpoint_after_promote(self): - if self._state == REWIND_STATUS.INITIAL and self._postgresql.is_leader() and \ - self._postgresql.get_master_timeline() == self._postgresql.pg_control_timeline(): - self._state = REWIND_STATUS.CHECKPOINT + def __checkpoint(self, task): + try: + result = self._postgresql.checkpoint() + except Exception as e: + result = 'Exception: ' + str(e) + with task: + task.complete(not bool(result)) + + def ensure_checkpoint_after_promote(self): + """After promote, issue a CHECKPOINT from a new thread and asynchronously check the result.
+ If the CHECKPOINT failed, just check whether the timeline in pg_control was updated.""" + if self._state == REWIND_STATUS.INITIAL and self._postgresql.is_leader(): + with self._checkpoint_task_lock: + if self._checkpoint_task: + with self._checkpoint_task: + if self._checkpoint_task.result: + self._state = REWIND_STATUS.CHECKPOINT + if self._checkpoint_task.result is not False: + return + else: + self._checkpoint_task = CriticalTask() + return Thread(target=self.__checkpoint, args=(self._checkpoint_task,)).start() + + if self._postgresql.get_master_timeline() == self._postgresql.pg_control_timeline(): + self._state = REWIND_STATUS.CHECKPOINT def checkpoint_after_promote(self): return self._state == REWIND_STATUS.CHECKPOINT @@ -191,6 +217,8 @@ class Rewind(object): def reset_state(self): self._state = REWIND_STATUS.INITIAL + with self._checkpoint_task_lock: + self._checkpoint_task = None @property def is_needed(self): diff --git a/tests/test_ha.py b/tests/test_ha.py index d7d16abe..50a1bc75 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -172,6 +172,7 @@ def run_async(self, func, args=()): @patch('patroni.postgresql.polling_loop', Mock(return_value=range(1))) @patch('patroni.async_executor.AsyncExecutor.busy', PropertyMock(return_value=False)) @patch('patroni.async_executor.AsyncExecutor.run_async', run_async) +@patch('patroni.postgresql.rewind.Thread', Mock()) @patch('subprocess.call', Mock(return_value=0)) @patch('time.sleep', Mock()) class TestHa(PostgresInit): diff --git a/tests/test_rewind.py b/tests/test_rewind.py index 4da12d08..88715beb 100644 --- a/tests/test_rewind.py +++ b/tests/test_rewind.py @@ -7,6 +7,16 @@ from patroni.postgresql.rewind import Rewind from . import BaseTestPostgresql, MockCursor, psycopg2_connect +class MockThread(object): + + def __init__(self, target, args): + self._target = target + self._args = args + + def start(self): + self._target(*self._args) + + @patch('subprocess.call', Mock(return_value=0)) @patch('psycopg2.connect', psycopg2_connect) class TestRewind(BaseTestPostgresql): @@ -102,6 +112,21 @@ class TestRewind(BaseTestPostgresql): self.r.check_leader_is_not_in_recovery() self.r.check_leader_is_not_in_recovery() - @patch.object(Postgresql, 'controldata', Mock(return_value={"Latest checkpoint's TimeLineID": 1})) - def test_check_for_checkpoint_after_promote(self): - self.r.check_for_checkpoint_after_promote() + @patch('patroni.postgresql.rewind.Thread', MockThread) + @patch.object(Postgresql, 'controldata') + @patch.object(Postgresql, 'checkpoint') + def test_ensure_checkpoint_after_promote(self, mock_checkpoint, mock_controldata): + mock_checkpoint.return_value = None + self.r.ensure_checkpoint_after_promote() + self.r.ensure_checkpoint_after_promote() + + self.r.reset_state() + mock_controldata.return_value = {"Latest checkpoint's TimeLineID": 1} + mock_checkpoint.side_effect = Exception + self.r.ensure_checkpoint_after_promote() + self.r.ensure_checkpoint_after_promote() + + self.r.reset_state() + mock_controldata.side_effect = TypeError + self.r.ensure_checkpoint_after_promote() + self.r.ensure_checkpoint_after_promote() From be4c078d9502648eb0053076fccde350bf3854bf Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Thu, 23 Apr 2020 12:51:05 +0200 Subject: [PATCH 04/40] Etcd smart refresh members (#1499) In dynamic environments it is common for etcd nodes to change their IP addresses during a rolling upgrade; discovery then has to fall back to DNS, as in the sketch below.
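A minimal illustration of such DNS-based discovery (the function name and record layout are ours, purely for illustration):

```python
# Illustrative sketch: rebuild the candidate member list by re-resolving an
# A record; SRV-based discovery works analogously via dns.resolver.
import socket

def discover_etcd_hosts(host, port=2379):
    addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
    return sorted({'http://{0}:{1}'.format(a[4][0], port) for a in addrs})
```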
If the etcd node to which Patroni is currently connected is upgraded last, it can happen that the cached topology no longer contains any live node, so the request can't be retried and fails completely, usually resulting in demotion of the primary. To partially overcome the problem, Patroni already performs a periodic (every 5 minutes) rediscovery of the etcd cluster topology, but with very fast node rotation there was still a possibility to hit the issue. This PR is an attempt to address the problem. If the list of nodes is exhausted, Patroni will try to perform initial discovery via an external mechanism, like resolving A or SRV DNS records, and if the new list is different from the original one, Patroni will use it as the new etcd cluster topology. In order to deal with TCP issues the connect_timeout is set to max(read_timeout/2, 1). This makes the list of members exhaust faster, but leaves time to perform topology rediscovery and another attempt. The third issue addressed by this PR: it can happen that the DNS names of etcd nodes didn't change but the IP addresses are new, therefore we clean up the internal DNS cache when doing topology rediscovery. Besides that, this commit makes the `_machines_cache` property pretty much static: it will be updated only when the topology has changed, which helps to avoid concurrency issues. --- patroni/dcs/etcd.py | 206 +++++++++++++++++++++++++------------------- tests/test_etcd.py | 35 +++++--- 2 files changed, 139 insertions(+), 102 deletions(-) diff --git a/patroni/dcs/etcd.py b/patroni/dcs/etcd.py index 02e88407..844948ea 100644 --- a/patroni/dcs/etcd.py +++ b/patroni/dcs/etcd.py @@ -11,6 +11,7 @@ import time from dns.exception import DNSException from dns import resolver +from urllib3 import Timeout from urllib3.exceptions import HTTPError, ReadTimeoutError, ProtocolError from six.moves.queue import Queue from six.moves.http_client import HTTPException @@ -69,6 +70,9 @@ class DnsCachingResolver(Thread): def resolve_async(self, host, port, attempt=0): self._resolve_queue.put(((host, port), attempt)) + def remove(self, host, port): + self._cache.pop((host, port), None) + @staticmethod def _do_resolve(host, port): try: @@ -91,6 +95,7 @@ class Client(etcd.Client): # Workaround for the case when https://github.com/jplana/python-etcd/pull/196 is not applied self.http.connection_pool_kw.pop('ssl_version', None) self._config = config + self._initial_machines_cache = [] self._load_machines_cache() self._allow_reconnect = True # allow passing retry argument to api_execute in params @@ -98,14 +103,13 @@ class Client(etcd.Client): self._read_options.add('retry') self._del_conditions.add('retry') - def _calculate_timeouts(self, etcd_nodes=None, timeout=None): + def _calculate_timeouts(self, etcd_nodes, timeout=None): """Calculate a request timeout and number of retries per single etcd node. In case if the timeout per node is too small (less than one second) we will reduce the number of nodes. For the cluster with only one node we will try to do 2 retries. For clusters with 2 nodes we will try to do 1 retry for every node. No retries for clusters with 3 or more nodes.
We better rely on switching to a different node.""" - etcd_nodes = etcd_nodes or len(self._machines_cache) + 1 per_node_timeout = timeout = float(timeout or self.read_timeout) max_retries = 4 - min(etcd_nodes, 3) @@ -130,19 +134,25 @@ class Client(etcd.Client): basic_auth = ':'.join((self.username, self.password)) if self.username and self.password else None return urllib3.make_headers(basic_auth=basic_auth, user_agent=USER_AGENT) - def _build_request_parameters(self, timeout=None): + def _build_request_parameters(self, etcd_nodes, timeout=None): kwargs = {'headers': self._get_headers(), 'redirect': self.allow_redirect} if timeout is not None: kwargs.update(retries=0, timeout=timeout) else: - _, per_node_timeout, per_node_retries = self._calculate_timeouts() - kwargs.update(timeout=per_node_timeout, retries=per_node_retries) + _, per_node_timeout, per_node_retries = self._calculate_timeouts(etcd_nodes) + connect_timeout = max(1, per_node_timeout/2) + kwargs.update(timeout=Timeout(connect=connect_timeout, total=per_node_timeout), retries=per_node_retries) return kwargs def set_machines_cache_ttl(self, cache_ttl): self._machines_cache_ttl = cache_ttl + @property + def machines_cache(self): + base_uri, cache = self._base_uri, self._machines_cache + return ([base_uri] if base_uri in cache else []) + [machine for machine in cache if machine != base_uri] + @property def machines(self): """Original `machines` method(property) of `etcd.Client` class raise exception @@ -155,58 +165,58 @@ class Client(etcd.Client): Also this method implements the same timeout-retry logic as `api_execute`, because the original method was retrying 2 times with the `read_timeout` on each node.""" - kwargs = self._build_request_parameters() + machines_cache = self.machines_cache + kwargs = self._build_request_parameters(len(machines_cache)) - while True: + for base_uri in machines_cache: try: - response = self.http.request(self._MGET, self._base_uri + self.version_prefix + '/machines', **kwargs) + response = self.http.request(self._MGET, base_uri + self.version_prefix + '/machines', **kwargs) data = self._handle_server_response(response).data.decode('utf-8') machines = [m.strip() for m in data.split(',') if m.strip()] logger.debug("Retrieved list of machines: %s", machines) - if not machines: - raise etcd.EtcdException - random.shuffle(machines) - for url in machines: - r = urlparse(url) - port = r.port or (443 if r.scheme == 'https' else 80) - self._dns_resolver.resolve_async(r.hostname, port) - return machines + if machines: + random.shuffle(machines) + self._update_dns_cache(self._dns_resolver.resolve_async, machines) + return machines except Exception as e: - # We can't get the list of machines, if one server is in the - # machines cache, try on it - logger.error("Failed to get list of machines from %s%s: %r", self._base_uri, self.version_prefix, e) - if self._machines_cache: - self._base_uri = self._machines_cache.pop(0) - logger.info("Retrying on %s", self._base_uri) - elif self._update_machines_cache: - raise etcd.EtcdException("Could not get the list of servers, " - "maybe you provided the wrong " - "host(s) to connect to?") - else: - return [] + self.http.clear() + logger.error("Failed to get list of machines from %s%s: %r", base_uri, self.version_prefix, e) + + raise etcd.EtcdConnectionFailed('No more machines in the cluster') def set_read_timeout(self, timeout): self._read_timeout = timeout - def _do_http_request(self, request_executor, method, url, fields=None, **kwargs): - try: - response = 
request_executor(method, url, fields=fields, **kwargs) - response.data.decode('utf-8') - self._check_cluster_id(response) - except (HTTPError, HTTPException, socket.error, socket.timeout) as e: - if (isinstance(fields, dict) and fields.get("wait") == "true" and - isinstance(e, (ReadTimeoutError, ProtocolError))): - logger.debug("Watch timed out.") + def _do_http_request(self, retry, machines_cache, request_executor, method, path, fields=None, **kwargs): + some_request_failed = False + for i, base_uri in enumerate(machines_cache): + if i > 0: + logger.info("Retrying on %s", base_uri) + try: + response = request_executor(method, base_uri + path, fields=fields, **kwargs) + response.data.decode('utf-8') + self._check_cluster_id(response) + if some_request_failed: + self.set_base_uri(base_uri) + self._refresh_machines_cache() + return response + except (HTTPError, HTTPException, socket.error, socket.timeout) as e: + self.http.clear() # switch to the next etcd node because we don't know exactly what happened, # whether the key didn't received an update or there is a network problem. - self._machines_cache.insert(0, self._base_uri) - self._base_uri = self._next_server() - raise etcd.EtcdWatchTimedOut("Watch timed out: {0}".format(e), cause=e) - logger.error("Request to server %s failed: %r", self._base_uri, e) - logger.info("Reconnection allowed, looking for another server.") - self._base_uri = self._next_server(cause=e) - response = False - return response + if not retry and i + 1 < len(machines_cache): + self.set_base_uri(machines_cache[i + 1]) + if (isinstance(fields, dict) and fields.get("wait") == "true" and + isinstance(e, (ReadTimeoutError, ProtocolError))): + logger.debug("Watch timed out.") + raise etcd.EtcdWatchTimedOut("Watch timed out: {0}".format(e), cause=e) + logger.error("Request to server %s failed: %r", base_uri, e) + logger.info("Reconnection allowed, looking for another server.") + if not retry: + raise etcd.EtcdException('{0} {1} request failed'.format(method, path)) + some_request_failed = True + + raise etcd.EtcdConnectionFailed('No more machines in the cluster') def api_execute(self, path, method, params=None, timeout=None): if not path.startswith('/'): @@ -229,44 +239,34 @@ class Client(etcd.Client): elif not self._use_proxies and time.time() - self._machines_cache_updated > self._machines_cache_ttl: self._refresh_machines_cache() - kwargs.update(self._build_request_parameters(timeout)) - - if retry: - machines_cache = [self._base_uri] + self._machines_cache - - response = False + machines_cache = self.machines_cache + etcd_nodes = len(machines_cache) + kwargs.update(self._build_request_parameters(etcd_nodes, timeout)) while True: try: - some_request_failed = False - while not response: - response = self._do_http_request(request_executor, method, self._base_uri + path, **kwargs) - - if response is False: - if not retry: - raise etcd.EtcdException('{0} {1} request failed'.format(method, path)) - some_request_failed = True - if some_request_failed: - self._refresh_machines_cache() - if response: - break - except etcd.EtcdConnectionFailed: - if not retry: - raise + response = self._do_http_request(retry, machines_cache, request_executor, method, path, **kwargs) + return self._handle_server_response(response) + except etcd.EtcdWatchTimedOut: + raise + except etcd.EtcdConnectionFailed as ex: + try: + if self._load_machines_cache(): + machines_cache = self.machines_cache + etcd_nodes = len(machines_cache) + except Exception as e: + logger.debug('Failed to update list of etcd 
nodes: %r', e) sleeptime = retry.sleeptime remaining_time = retry.stoptime - sleeptime - time.time() nodes, timeout, retries = self._calculate_timeouts(etcd_nodes, remaining_time) if nodes == 0: self._update_machines_cache = True raise ex retry.sleep_func(sleeptime) retry.update_delay() # We still have some time left. Partially reduce `machines_cache` and retry request kwargs.update(timeout=Timeout(connect=max(1, timeout/2), total=timeout), retries=retries) machines_cache = machines_cache[:nodes] @staticmethod def get_srv_record(host): @@ -327,6 +327,13 @@ class Client(etcd.Client): machines_cache = self._get_machines_cache_from_dns(self._config['host'], self._config['port']) return machines_cache + @staticmethod + def _update_dns_cache(func, machines): + for url in machines: + r = urlparse(url) + port = r.port or (443 if r.scheme == 'https' else 80) + func(r.hostname, port) + def _load_machines_cache(self): """This method should fill up `_machines_cache` from scratch. It could happen only in two cases: @@ -338,25 +345,49 @@ class Client(etcd.Client): if 'srv' not in self._config and 'host' not in self._config and 'hosts' not in self._config: raise Exception('Neither srv, hosts, host nor url are defined in etcd section of config') - self._machines_cache = self._get_machines_cache_from_config() - + machines_cache = self._get_machines_cache_from_config() # Can not bootstrap list of etcd-cluster members, giving up - if not self._machines_cache: + if not machines_cache: raise etcd.EtcdException - # After filling up initial list of machines_cache we should ask etcd-cluster about actual list - self._base_uri = self._next_server() - self._refresh_machines_cache() - self._update_machines_cache = False + # enforce re-resolving of DNS names, they might have got new IPs + self._update_dns_cache(self._dns_resolver.remove, machines_cache) - def _refresh_machines_cache(self): - self._machines_cache = self._get_machines_cache_from_config() if self._use_proxies else self.machines - if self._base_uri in self._machines_cache: - self._machines_cache.remove(self._base_uri) - elif self._machines_cache: - self._base_uri = self._next_server() + # The etcd cluster could change its topology over time and depending on how we resolve the initial + # topology (list of hosts in the Patroni config or DNS records, A or SRV) we might get into the situation + # that the real topology doesn't match anymore with the topology resolved from the configuration file. + # If the "initial" topology is the same as before, we will not override the `_machines_cache`.
+ ret = set(machines_cache) != set(self._initial_machines_cache) + if ret: + self._initial_machines_cache = self._machines_cache = machines_cache + + # After filling up the initial list of machines_cache we should ask etcd-cluster about actual list + self._refresh_machines_cache(True) + + self._update_machines_cache = False + return ret + + def _refresh_machines_cache(self, updating_cache=False): + if self._use_proxies: + self._machines_cache = self._get_machines_cache_from_config() + else: + try: + self._machines_cache = self.machines + except etcd.EtcdConnectionFailed: + if updating_cache: + raise etcd.EtcdException("Could not get the list of servers, " + "maybe you provided the wrong " + "host(s) to connect to?") + return + + if self._base_uri not in self._machines_cache: + self.set_base_uri(self._machines_cache[0]) self._machines_cache_updated = time.time() + def set_base_uri(self, value): + logger.info('Selected new etcd server %s', value) + self._base_uri = value + class Etcd(AbstractDCS): @@ -633,7 +664,6 @@ class Etcd(AbstractDCS): # than reestablishing http connection every time from every replica. return True except etcd.EtcdWatchTimedOut: - self._client.http.clear() self._has_failed = False return False except (etcd.EtcdEventIndexCleared, etcd.EtcdWatcherCleared): # Watch failed diff --git a/tests/test_etcd.py b/tests/test_etcd.py index 8432fad9..ceef0d2d 100644 --- a/tests/test_etcd.py +++ b/tests/test_etcd.py @@ -130,12 +130,11 @@ class TestClient(unittest.TestCase): self.client.http.request_encode_body = http_request def test_machines(self): - self.client._base_uri = 'http://localhost:4001' - self.client._machines_cache = ['http://localhost:2379'] + self.client._base_uri = 'http://localhost:4002' + self.client._machines_cache = ['http://localhost:4002', 'http://localhost:2379'] self.assertIsNotNone(self.client.machines) self.client._base_uri = 'http://localhost:4001' - self.client._machines_cache = [] - self.assertIsNotNone(self.client.machines) + self.client._machines_cache = ['http://localhost:4001'] self.client._update_machines_cache = True machines = None try: @@ -146,16 +145,13 @@ class TestClient(unittest.TestCase): @patch.object(Client, 'machines') def test_api_execute(self, mock_machines): - mock_machines.__get__ = Mock(return_value=['http://localhost:2379']) + mock_machines.__get__ = Mock(return_value=['http://localhost:4001', 'http://localhost:2379']) self.assertRaises(ValueError, self.client.api_execute, '', '') self.client._base_uri = 'http://localhost:4001' - self.client._machines_cache = ['http://localhost:2379'] self.assertRaises(etcd.EtcdException, self.client.api_execute, '/', 'POST', timeout=0) self.client._base_uri = 'http://localhost:4001' - self.client._machines_cache = ['http://localhost:2379'] rtry = Retry(deadline=10, max_delay=1, max_tries=-1, retry_exceptions=(etcd.EtcdLeaderElectionInProgress,)) rtry(self.client.api_execute, '/', 'POST', timeout=0, params={'retry': rtry}) - mock_machines.__get__ = Mock(return_value=['http://localhost:2379']) self.client._machines_cache_updated = 0 self.client.api_execute('/', 'POST', timeout=0) self.client._machines_cache = [self.client._base_uri] @@ -163,10 +159,17 @@ class TestClient(unittest.TestCase): self.assertRaises(etcd.EtcdWatchTimedOut, self.client.api_execute, '/timeout', 'POST', params={'wait': 'true'}) self.assertRaises(etcd.EtcdException, self.client.api_execute, '/', '') - with patch.object(Client, '_do_http_request', Mock(side_effect=etcd.EtcdConnectionFailed)): - with patch.object(Client, 
'_calculate_timeouts', Mock(side_effect=[(1, 1, 0), (1, 1, 0), (0, 1, 0)])): - self.assertRaises(etcd.EtcdException, rtry, self.client.api_execute, '/', 'GET', params={'retry': rtry}) - self.client._read_timeout = 0 + with patch.object(Client, '_calculate_timeouts', Mock(side_effect=[(1, 1, 0), (1, 1, 0), (0, 1, 0)])),\ + patch.object(Client, '_load_machines_cache', Mock(side_effect=Exception)): + self.client.http.request = Mock(side_effect=socket.error) + self.assertRaises(etcd.EtcdException, rtry, self.client.api_execute, '/', 'GET', params={'retry': rtry}) + + with patch.object(Client, '_calculate_timeouts', Mock(side_effect=[(1, 1, 0), (1, 1, 0), (0, 1, 0)])),\ + patch.object(Client, '_load_machines_cache', Mock(return_value=True)): + self.assertRaises(etcd.EtcdException, rtry, self.client.api_execute, '/', 'GET', params={'retry': rtry}) + + with patch.object(Client, '_do_http_request', Mock(side_effect=etcd.EtcdException)): + self.client._read_timeout = 0.01 self.assertRaises(etcd.EtcdException, self.client.api_execute, '/', 'GET') def test_get_srv_record(self): @@ -182,8 +185,12 @@ class TestClient(unittest.TestCase): self.client._get_machines_cache_from_dns('error', 2379) @patch.object(Client, 'machines') - def test__load_machines_cache(self, mock_machines): - mock_machines.__get__ = Mock(return_value=['http://localhost:2379']) + def test__refresh_machines_cache(self, mock_machines): + mock_machines.__get__ = Mock(side_effect=etcd.EtcdConnectionFailed) + self.assertIsNone(self.client._refresh_machines_cache()) + self.assertRaises(etcd.EtcdException, self.client._refresh_machines_cache, True) + + def test__load_machines_cache(self): self.client._config = {} self.assertRaises(Exception, self.client._load_machines_cache) self.client._config = {'srv': 'blabla'} From 8b860d7528b7200bafdba0f583912017b9022f94 Mon Sep 17 00:00:00 2001 From: Feike Steenbergen Date: Thu, 23 Apr 2020 12:51:38 +0200 Subject: [PATCH 05/40] Skip missing values from pg_controldata (#1501) When calling controldata(), it may return an empty dictionary, which in turn caused ``cvalue = parse_int(data[cname])`` in ``effective_configuration`` to fail with ``KeyError: 'max_wal_senders setting'``. Instead of letting this part crash, we now log the error and continue.
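Conceptually the guard is just a membership check before indexing; a minimal sketch of the defensive pattern (the helper name is ours):

```python
import logging

logger = logging.getLogger(__name__)

def controldata_int(data, key):
    """Return the integer value for `key`, or None (with a warning) if the
    key is missing from the parsed pg_controldata output."""
    if key not in data:
        logger.warning('%s is missing from pg_controldata output', key)
        return None
    return int(data[key])
```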
This is the full output of the error: ``` 2020-04-17 14:31:54,791 ERROR: Exception during execution of long running task restarting after failure Traceback (most recent call last): File "/usr/lib/python3/dist-packages/patroni/async_executor.py", line 97, in run wakeup = func(*args) if args else func() File "/usr/lib/python3/dist-packages/patroni/postgresql/__init__.py", line 707, in follow self.start(timeout=timeout, block_callbacks=change_role, role=role) File "/usr/lib/python3/dist-packages/patroni/postgresql/__init__.py", line 409, in start configuration = self.config.effective_configuration File "/usr/lib/python3/dist-packages/patroni/postgresql/config.py", line 983, in effective_configuration cvalue = parse_int(data[cname]) KeyError: 'max_wal_senders setting' ``` --- patroni/postgresql/__init__.py | 2 +- patroni/postgresql/config.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 1c64e3cd..b0565968 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -664,7 +664,7 @@ class Postgresql(object): data = subprocess.check_output([self.pgcommand('pg_controldata'), self._data_dir], env=env) if data: data = data.decode('utf-8').splitlines() - # pg_controldata output depends on major verion. Some of parameters are prefixed by 'Current ' + # pg_controldata output depends on major version. Some of parameters are prefixed by 'Current ' result = {l.split(':')[0].replace('Current ', '', 1): l.split(':', 1)[1].strip() for l in data if l and ':' in l} except subprocess.CalledProcessError: diff --git a/patroni/postgresql/config.py b/patroni/postgresql/config.py index a358c373..e6b33d63 100644 --- a/patroni/postgresql/config.py +++ b/patroni/postgresql/config.py @@ -1025,6 +1025,10 @@ class ConfigHandler(object): for name, cname in options_mapping.items(): value = parse_int(effective_configuration[name]) + if cname not in data: + logger.warning('%s is missing from pg_controldata output', cname) + continue + cvalue = parse_int(data[cname]) if cvalue > value: effective_configuration[name] = cvalue From fe23d1f2d06271a374972e6f5e99de5d947f1c66 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Thu, 23 Apr 2020 16:02:01 +0200 Subject: [PATCH 06/40] Release 1.6.5 (#1503) * bump version * update release notes * implement missing unit-tests and format code. --- docs/releases.rst | 116 +++++++++++++++++++++++++++++++++++++++ patroni/version.py | 2 +- tests/test_ctl.py | 8 +-- tests/test_kubernetes.py | 1 + tests/test_postgresql.py | 5 +- 5 files changed, 125 insertions(+), 7 deletions(-) diff --git a/docs/releases.rst b/docs/releases.rst index 4d8842bd..0bc37a21 100644 --- a/docs/releases.rst +++ b/docs/releases.rst @@ -3,6 +3,122 @@ Release notes ============= +Version 1.6.5 +------------- + +**New features** + +- Master stop timeout (Krishna Sarabu) + + The number of seconds Patroni is allowed to wait when stopping Postgres. Effective only when ``synchronous_mode`` is enabled. When set to value greater than 0 and the ``synchronous_mode`` is enabled, Patroni sends ``SIGKILL`` to the postmaster if the stop operation is running for more than the value set by ``master_stop_timeout``. Set the value according to your durability/availability tradeoff. If the parameter is not set or set to non-positive value, ``master_stop_timeout`` does not have an effect. 
+ +- Don't create permanent physical slot with name of the primary (Alexander Kukushkin) + + It is a common problem that the primary recycles WAL segments while the replica is down. Now we have a good solution for static clusters, with a fixed number of nodes and names that never change. You just need to list the names of all nodes in the ``slots`` so the primary will not remove the slot when the node is down (not registered in DCS). + +- First draft of Config Validator (Igor Yanchenko) + + Use ``patroni --validate-config patroni.yaml`` in order to validate Patroni configuration. + +- Possibility to configure max length of timelines history (Krishna) + + Patroni writes the history of failovers/switchovers into the ``/history`` key in DCS. Over time the size of this key becomes big, but in most cases only the last few lines are interesting. The ``max_timelines_history`` parameter allows specifying the maximum number of timeline history items to be kept in DCS. + +- Kazoo 2.7.0 compatibility (Danyal Prout) + + Some non-public methods in Kazoo changed their signatures, but Patroni was relying on them. + + **Improvements in patronictl** + +- Show member tags (Kostiantyn Nemchenko, Alexander) + + Tags are configured individually for every node and there was no easy way to get an overview of them. + +- Improve members output (Alexander) + + The redundant cluster name won't be shown anymore on every line, only in the table header. + .. code-block:: bash + + $ patronictl list + + Cluster: batman (6813309862653668387) +---------+----+-----------+---------------------+ | Member | Host | Role | State | TL | Lag in MB | Tags | +-------------+----------------+--------+---------+----+-----------+---------------------+ | postgresql0 | 127.0.0.1:5432 | Leader | running | 3 | | clonefrom: true | | | | | | | | noloadbalance: true | | | | | | | | nosync: true | +-------------+----------------+--------+---------+----+-----------+---------------------+ | postgresql1 | 127.0.0.1:5433 | | running | 3 | 0.0 | | +-------------+----------------+--------+---------+----+-----------+---------------------+ +- Fail if a config file is specified explicitly but not found (Kaarel Moppel) + + Previously ``patronictl`` was only reporting a ``DEBUG`` message. + +- Solved the problem of a not-yet-initialized K8s pod breaking patronictl (Alexander) + + Patroni is relying on certain pod annotations on K8s. When one of the Patroni pods is stopping or starting there is no valid annotation yet and ``patronictl`` was failing with an exception. + + **Stability improvements** + +- Apply 1 second backoff if LIST call to K8s API server failed (Alexander) + + It is mostly necessary to avoid flooding logs, but also helps to prevent starvation of the main thread. + +- Retry if the ``retry-after`` HTTP header is returned by K8s API (Alexander) + + If the K8s API server is overwhelmed with requests it might ask to retry. + +- Scrub ``KUBERNETES_`` environment from the postmaster (Feike Steenbergen) + + The ``KUBERNETES_`` environment variables are not required for PostgreSQL, yet having them exposed to the postmaster will also expose them to backends and to regular database users (using pl/perl for example). + +- Clean up tablespaces on reinitialize (Krishna) + + During reinit, Patroni was removing only ``PGDATA`` and leaving user-defined tablespace directories. This was causing Patroni to loop in reinit. The previous workaround for the problem was implementing the :ref:`custom bootstrap ` script.
+ +- Explicitly execute ``CHECKPOINT`` after promote happened (Alexander) + + It helps to reduce the time before the new primary is usable for ``pg_rewind``. + +- Smart refresh of Etcd members (Alexander) + + In case Patroni failed to execute a request on all members of the Etcd cluster, Patroni will re-check ``A`` or ``SRV`` records for changes of IPs/hosts before retrying the next time. + +- Skip missing values from ``pg_controldata`` (Feike) + + Values are missing when trying to use binaries of a version that doesn't match PGDATA. Patroni will try to start Postgres anyway, and Postgres will complain that the major version doesn't match and abort with an error. + + +**Bugfixes** + +- Disable SSL verification for Consul when required (Julien Riou) + + Starting from a certain version of ``urllib3``, the ``cert_reqs`` must be explicitly set to ``ssl.CERT_NONE`` in order to effectively disable SSL verification. + +- Avoid opening replication connection on every cycle of HA loop (Alexander) + + Regression was introduced in 1.6.4. + +- Call ``on_role_change`` callback on failed primary (Alexander) + + In certain cases it could lead to the virtual IP remaining attached to the old primary. Regression was introduced in 1.4.5. + +- Reset rewind state if postgres started after successful pg_rewind (Alexander) + + As a result of this bug Patroni was starting up manually shut down postgres in the pause mode. + +- Convert ``recovery_min_apply_delay`` to ``ms`` when checking ``recovery.conf`` + + Patroni was indefinitely restarting replica if ``recovery_min_apply_delay`` was configured on PostgreSQL older than 12. + +- PyInstaller compatibility (Alexander) + + PyInstaller freezes (packages) Python applications into stand-alone executables. The compatibility was broken when we switched to the ``spawn`` method instead of ``fork`` for ``multiprocessing``. 
+ + Version 1.6.4 ------------- diff --git a/patroni/version.py b/patroni/version.py index d07785c5..f3df7f04 100644 --- a/patroni/version.py +++ b/patroni/version.py @@ -1 +1 @@ -__version__ = '1.6.4' +__version__ = '1.6.5' diff --git a/tests/test_ctl.py b/tests/test_ctl.py index b72785b6..cc97c85b 100644 --- a/tests/test_ctl.py +++ b/tests/test_ctl.py @@ -19,9 +19,7 @@ from .test_ha import get_cluster_initialized_without_leader, get_cluster_initial get_cluster_initialized_with_only_leader, get_cluster_not_initialized_without_leader, get_cluster, Member - def test_rw_config(): - global CONFIG_FILE_PATH runner = CliRunner() with runner.isolated_filesystem(): load_config(CONFIG_FILE_PATH, None) @@ -32,9 +30,9 @@ def test_rw_config(): os.rmdir(CONFIG_PATH) -@patch('patroni.ctl.load_config', - Mock(return_value={'scope': 'alpha', 'postgresql': {'data_dir': '.', 'pgpass': './pgpass', 'parameters': {}, 'retry_timeout': 5}, - 'restapi': {'listen': '::', 'certfile': 'a'}, 'etcd': {'host': 'localhost:2379'}})) +@patch('patroni.ctl.load_config', Mock(return_value={ + 'scope': 'alpha', 'restapi': {'listen': '::', 'certfile': 'a'}, 'etcd': {'host': 'localhost:2379'}, + 'postgresql': {'data_dir': '.', 'pgpass': './pgpass', 'parameters': {}, 'retry_timeout': 5}})) class TestCtl(unittest.TestCase): @patch('socket.getaddrinfo', socket_getaddrinfo) diff --git a/tests/test_kubernetes.py b/tests/test_kubernetes.py index ed9b2141..5fb89c23 100644 --- a/tests/test_kubernetes.py +++ b/tests/test_kubernetes.py @@ -33,6 +33,7 @@ def mock_config_map(*args, **kwargs): mock.metadata.resource_version = '2' return mock + @patch('socket.TCP_KEEPIDLE', 4, create=True) @patch('socket.TCP_KEEPINTVL', 5, create=True) @patch('socket.TCP_KEEPCNT', 6, create=True) diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py index 721c6169..fabab68a 100644 --- a/tests/test_postgresql.py +++ b/tests/test_postgresql.py @@ -412,6 +412,10 @@ class TestPostgresql(BaseTestPostgresql): pass os.makedirs(os.path.join(self.p.data_dir, 'foo')) _symlink('foo', os.path.join(self.p.data_dir, 'pg_wal')) + os.makedirs(os.path.join(self.p.data_dir, 'foo_tsp')) + pg_tblspc = os.path.join(self.p.data_dir, 'pg_tblspc') + os.makedirs(pg_tblspc) + _symlink('../foo_tsp', os.path.join(pg_tblspc, '12345')) self.p.remove_data_directory() open(self.p.data_dir, 'w').close() self.p.remove_data_directory() @@ -712,7 +716,6 @@ class TestPostgresql(BaseTestPostgresql): with patch.object(Postgresql, 'controldata', Mock(return_value={'max_connections setting': '200', 'max_worker_processes setting': '20', - 'max_prepared_xacts setting': '100', 'max_locks_per_xact setting': '100', 'max_wal_senders setting': 10})): self.p.cancellable.cancel() From 703a129646df2388166daaa828a4ee4465728d54 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Wed, 13 May 2020 12:22:47 +0200 Subject: [PATCH 07/40] Don't try calling a non existing leader in patronictl pause (#1542) While pausing a cluster without a leader on K8s patronictl was showing warnings that member "None" could not be accessed. 
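The essence of the fix, restated as a standalone sketch (the function name is ours; the condition matches the diff below):

```python
# Only contact members that actually expose an API URL; on K8s a leader
# record may exist while the pod has not published its api_url yet.
def pause_targets(cluster):
    members = []
    if cluster.leader and cluster.leader.member.api_url:
        members.append(cluster.leader.member)
    members.extend([m for m in cluster.members if m.api_url and (not members or members[0].name != m.name)])
    return members
```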
--- patroni/ctl.py | 4 ++-- patroni/ha.py | 2 +- patroni/postgresql/__init__.py | 8 +++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/patroni/ctl.py b/patroni/ctl.py index 4f2fcd0b..d473da01 100644 --- a/patroni/ctl.py +++ b/patroni/ctl.py @@ -942,7 +942,7 @@ def toggle_pause(config, cluster_name, paused, wait): raise PatroniCtlException('Cluster is {0} paused'.format(paused and 'already' or 'not')) members = [] - if cluster.leader: + if cluster.leader and cluster.leader.member.api_url: members.append(cluster.leader.member) members.extend([m for m in cluster.members if m.api_url and (not members or members[0].name != m.name)]) @@ -1008,7 +1008,7 @@ def show_diff(before_editing, after_editing): If the output is to a tty the diff will be colored. Inputs are expected to be unicode strings. """ def listify(string): - return [l+'\n' for l in string.rstrip('\n').split('\n')] + return [line + '\n' for line in string.rstrip('\n').split('\n')] unified_diff = difflib.unified_diff(listify(before_editing), listify(after_editing)) diff --git a/patroni/ha.py b/patroni/ha.py index 5deb2d80..c0655ea0 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -523,7 +523,7 @@ class Ha(object): if cluster_history: self.dcs.set_history_value('[]') elif not cluster_history or cluster_history[-1][0] != master_timeline - 1 or len(cluster_history[-1]) != 4: - cluster_history = {l[0]: l for l in cluster_history or []} + cluster_history = {line[0]: line for line in cluster_history or []} history = self.state_handler.get_history(master_timeline) if history: history = history[-self.cluster.config.max_timelines_history:] diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index b0565968..566494db 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -655,7 +655,6 @@ class Postgresql(object): def controldata(self): """ return the contents of pg_controldata, or non-True value if pg_controldata call failed """ - result = {} # Don't try to call pg_controldata during backup restore if self._version_file_exists() and self.state != 'creating replica': try: @@ -663,13 +662,12 @@ class Postgresql(object): env.update(LANG='C', LC_ALL='C') data = subprocess.check_output([self.pgcommand('pg_controldata'), self._data_dir], env=env) if data: - data = data.decode('utf-8').splitlines() + data = filter(lambda e: ':' in e, data.decode('utf-8').splitlines()) # pg_controldata output depends on major version. 
Some of parameters are prefixed by 'Current ' - result = {l.split(':')[0].replace('Current ', '', 1): l.split(':', 1)[1].strip() for l in data - if l and ':' in l} + return {k.replace('Current ', '', 1): v.strip() for k, v in map(lambda e: e.split(':', 1), data)} except subprocess.CalledProcessError: logger.exception("Error when calling pg_controldata") - return result + return {} @contextmanager def get_replication_connection_cursor(self, host='localhost', port=5432, **kwargs): From a6fbc2dd7b87478faa5ea0fdece67be1c469639a Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Wed, 13 May 2020 12:26:39 +0200 Subject: [PATCH 08/40] Handle the case when member conn_url is missing (#1510) Close https://github.com/zalando/patroni/issues/1508 --- patroni/ctl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/patroni/ctl.py b/patroni/ctl.py index d473da01..81ced93c 100644 --- a/patroni/ctl.py +++ b/patroni/ctl.py @@ -760,13 +760,13 @@ def output_members(cluster, name, extended=False, fmt='pretty'): # Show Host as 'host:port' if somebody is running on non-standard port or two nodes are running on the same host members = [m for m in cluster['members'] if 'host' in m] append_port = any('port' in m and m['port'] != 5432 for m in members) or\ - len(set(m['host'] for m in cluster['members'])) < len(members) + len(set(m['host'] for m in members)) < len(members) for m in cluster['members']: logging.debug(m) lag = m.get('lag', '') - m.update(cluster=name, member=m['name'], host=m.get('host'), tl=m.get('timeline', ''), + m.update(cluster=name, member=m['name'], host=m.get('host', ''), tl=m.get('timeline', ''), role='' if m['role'] == 'replica' else m['role'].replace('_', ' ').title(), lag_in_mb=round(lag/1024/1024) if isinstance(lag, six.integer_types) else lag, pending_restart='*' if m.get('pending_restart') else '') From 25516840073b174df3c02f083ce608c4d77580eb Mon Sep 17 00:00:00 2001 From: ksarabu1 <62157128+ksarabu1@users.noreply.github.com> Date: Wed, 13 May 2020 06:29:06 -0400 Subject: [PATCH 09/40] Bugfix for AttributeError during bootstrap (#1538) Initial bootstrap by attaching Patroni to an already running postgres was causing the following error: ``` File "/xx/lib/python3.8/site-packages/patroni/ha.py", line 529, in update_cluster_history history = history[-self.cluster.config.max_timelines_history:] AttributeError: 'NoneType' object has no attribute 'max_timelines_history' ``` --- patroni/ha.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patroni/ha.py b/patroni/ha.py index c0655ea0..7a0e0adb 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -525,7 +525,7 @@ class Ha(object): elif not cluster_history or cluster_history[-1][0] != master_timeline - 1 or len(cluster_history[-1]) != 4: cluster_history = {line[0]: line for line in cluster_history or []} history = self.state_handler.get_history(master_timeline) - if history: + if history and self.cluster.config: history = history[-self.cluster.config.max_timelines_history:] for line in history: # enrich current history with promotion timestamps stored in DCS From ffb879c6e61f672bbea8fb93d547cc64cc85a20c Mon Sep 17 00:00:00 2001 From: Takashi Idobe Date: Wed, 13 May 2020 05:35:42 -0500 Subject: [PATCH 10/40] Adding --enable-v2=true flag to README.md (#1537) Without supplying the --enable-v2=true flag to etcd on startup, Patroni cannot communicate with etcd:
after running `etcd --data-dir=data/etcd` in one terminal and `patroni postgres0.yaml` in another terminal, etcd starts fine, but the postgres instance cannot find etcd. ``` patroni postgres0.yaml 2020-05-09 15:58:48,560 ERROR: Failed to get list of machines from http://127.0.0.1:2379/v2: EtcdException('Bad response : 404 page not found\n') 2020-05-09 15:58:48,560 INFO: waiting on etcd ``` If etcd is passed the flag `--enable-v2=true` on startup, everything works fine. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 0cc2baaf..d305f8ba 100644 --- a/README.rst +++ b/README.rst @@ -124,7 +124,7 @@ Running and Configuring To get started, do the following from different terminals: :: - > etcd --data-dir=data/etcd + > etcd --data-dir=data/etcd --enable-v2=true > ./patroni.py postgres0.yml > ./patroni.py postgres1.yml From 0d957076ca192218b21ea71f3bb90d16a0074d88 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Wed, 13 May 2020 13:13:25 +0200 Subject: [PATCH 11/40] Improve compatibility with PostgreSQL 12 and 13 (#1523) There were two new connection parameters introduced: 1. `gssencmode` in 12 2. `channel_binding` in 13 --- patroni/config.py | 4 +++- patroni/postgresql/config.py | 21 ++++++++++++++------- tests/test_postgresql.py | 2 +- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/patroni/config.py b/patroni/config.py index 1afce2f6..f6fa4854 100644 --- a/patroni/config.py +++ b/patroni/config.py @@ -22,7 +22,9 @@ _AUTH_ALLOWED_PARAMETERS = ( 'sslcert', 'sslkey', 'sslrootcert', - 'sslcrl' + 'sslcrl', + 'gssencmode', + 'channel_binding' ) diff --git a/patroni/postgresql/config.py b/patroni/postgresql/config.py index e6b33d63..fcab640e 100644 --- a/patroni/postgresql/config.py +++ b/patroni/postgresql/config.py @@ -113,17 +113,17 @@ def parse_dsn(value): Very simple equivalent of `psycopg2.extensions.parse_dsn` introduced in 2.7.0. We are not using psycopg2 function in order to remain compatible with 2.5.4+. There is one minor difference though, this function removes `dbname` from the result - and sets the sslmode` to `prefer` if it is not present in the connection string. - This is necessary to simplify comparison of the old and the new values. + and sets the `sslmode`, 'gssencmode', and `channel_binding` to `prefer` if it is not present in + the connection string. This is necessary to simplify comparison of the old and the new values. 
>>> r = parse_dsn('postgresql://u%2Fse:pass@:%2f123,[%2Fhost2]/db%2Fsdf?application_name=mya%2Fpp&ssl=true') >>> r == {'application_name': 'mya/pp', 'host': ',/host2', 'sslmode': 'require',\ - 'password': 'pass', 'port': '/123', 'user': 'u/se'} + 'password': 'pass', 'port': '/123', 'user': 'u/se', 'gssencmode': 'prefer', 'channel_binding': 'prefer'} True >>> r = parse_dsn(" host = 'host' dbname = db\\\\ name requiressl=1 ") - >>> r == {'host': 'host', 'sslmode': 'require'} + >>> r == {'host': 'host', 'sslmode': 'require', 'gssencmode': 'prefer', 'channel_binding': 'prefer'} True - >>> parse_dsn('requiressl = 0\\\\') == {'sslmode': 'prefer'} + >>> parse_dsn('requiressl = 0\\\\') == {'sslmode': 'prefer', 'gssencmode': 'prefer', 'channel_binding': 'prefer'} True >>> parse_dsn("host=a foo = '") is None True @@ -147,6 +147,8 @@ def parse_dsn(value): ret.setdefault('sslmode', 'prefer') if 'dbname' in ret: del ret['dbname'] + ret.setdefault('gssencmode', 'prefer') + ret.setdefault('channel_binding', 'prefer') return ret @@ -492,6 +494,10 @@ class ConfigHandler(object): ret = member.conn_kwargs(self.replication) ret['application_name'] = self._postgresql.name ret.setdefault('sslmode', 'prefer') + if self._postgresql.major_version >= 120000: + ret.setdefault('gssencmode', 'prefer') + if self._postgresql.major_version >= 130000: + ret.setdefault('channel_binding', 'prefer') if self._krbsrvname: ret['krbsrvname'] = self._krbsrvname if 'database' in ret: @@ -500,8 +506,9 @@ class ConfigHandler(object): def format_dsn(self, params, include_dbname=False): # A list of keywords that can be found in a conninfo string. Follows what is acceptable by libpq - keywords = ('dbname', 'user', 'passfile' if params.get('passfile') else 'password', 'host', 'port', 'sslmode', - 'sslcompression', 'sslcert', 'sslkey', 'sslrootcert', 'sslcrl', 'application_name', 'krbsrvname') + keywords = ('dbname', 'user', 'passfile' if params.get('passfile') else 'password', 'host', 'port', + 'sslmode', 'sslcompression', 'sslcert', 'sslkey', 'sslrootcert', 'sslcrl', + 'application_name', 'krbsrvname', 'gssencmode', 'channel_binding') if include_dbname: params = params.copy() params['dbname'] = params.get('database') or self._postgresql.database diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py index fabab68a..78a9eeb5 100644 --- a/tests/test_postgresql.py +++ b/tests/test_postgresql.py @@ -93,7 +93,7 @@ class TestPostgresql(BaseTestPostgresql): @patch('subprocess.call', Mock(return_value=0)) @patch('os.rename', Mock()) @patch('patroni.postgresql.CallbackExecutor', Mock()) - @patch.object(Postgresql, 'get_major_version', Mock(return_value=120000)) + @patch.object(Postgresql, 'get_major_version', Mock(return_value=130000)) @patch.object(Postgresql, 'is_running', Mock(return_value=True)) def setUp(self): super(TestPostgresql, self).setUp() From e6ef3c340a7c8e87d6f78af449bbc5ae10868608 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 15 May 2020 16:02:17 +0200 Subject: [PATCH 12/40] Wake up the main thread after checkpoint is done (#1524) Replicas are waiting for checkpoint indication via member key of the leader in DCS. The key is normally updated only one time per HA loop. Without waking the main thread up replicas will have to wait up to `loop_wait` seconds longer than necessary. 
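Conceptually the wakeup is just an event the HA loop sleeps on instead of a fixed delay; a minimal sketch (not the actual Patroni classes):

```python
# Sketch: the HA loop waits on an Event rather than a plain time.sleep(),
# so a background task can trigger the next iteration immediately.
from threading import Event

wakeup = Event()

def sleep_until_next_cycle(loop_wait):
    wakeup.wait(timeout=loop_wait)  # returns early once wakeup.set() is called
    wakeup.clear()
```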
--- patroni/ha.py | 2 +- patroni/postgresql/rewind.py | 8 +++++--- tests/test_rewind.py | 12 ++++++------ 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index 7a0e0adb..07a8122e 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -945,7 +945,7 @@ class Ha(object): return msg # check if the node is ready to be used by pg_rewind - self._rewind.ensure_checkpoint_after_promote() + self._rewind.ensure_checkpoint_after_promote(self.wakeup) if self.is_standby_cluster(): # in case of standby cluster we don't really need to diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index 9dd30c7a..87a3626b 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -135,15 +135,17 @@ class Rewind(object): self._check_timeline_and_lsn(leader) return leader and leader.conn_url and self._state == REWIND_STATUS.NEED - def __checkpoint(self, task): + def __checkpoint(self, task, wakeup): try: result = self._postgresql.checkpoint() except Exception as e: result = 'Exception: ' + str(e) with task: task.complete(not bool(result)) + if task.result: + wakeup() - def ensure_checkpoint_after_promote(self): + def ensure_checkpoint_after_promote(self, wakeup): """After promote issue a CHECKPOINT from a new thread and asynchronously check the result. In case if CHECKPOINT failed, just check that timeline in pg_control was updated.""" @@ -157,7 +159,7 @@ class Rewind(object): return else: self._checkpoint_task = CriticalTask() - return Thread(target=self.__checkpoint, args=(self._checkpoint_task,)).start() + return Thread(target=self.__checkpoint, args=(self._checkpoint_task, wakeup)).start() if self._postgresql.get_master_timeline() == self._postgresql.pg_control_timeline(): self._state = REWIND_STATUS.CHECKPOINT diff --git a/tests/test_rewind.py b/tests/test_rewind.py index 88715beb..20739903 100644 --- a/tests/test_rewind.py +++ b/tests/test_rewind.py @@ -117,16 +117,16 @@ class TestRewind(BaseTestPostgresql): @patch.object(Postgresql, 'checkpoint') def test_ensure_checkpoint_after_promote(self, mock_checkpoint, mock_controldata): mock_checkpoint.return_value = None - self.r.ensure_checkpoint_after_promote() - self.r.ensure_checkpoint_after_promote() + self.r.ensure_checkpoint_after_promote(Mock()) + self.r.ensure_checkpoint_after_promote(Mock()) self.r.reset_state() mock_controldata.return_value = {"Latest checkpoint's TimeLineID": 1} mock_checkpoint.side_effect = Exception - self.r.ensure_checkpoint_after_promote() - self.r.ensure_checkpoint_after_promote() + self.r.ensure_checkpoint_after_promote(Mock()) + self.r.ensure_checkpoint_after_promote(Mock()) self.r.reset_state() mock_controldata.side_effect = TypeError - self.r.ensure_checkpoint_after_promote() - self.r.ensure_checkpoint_after_promote() + self.r.ensure_checkpoint_after_promote(Mock()) + self.r.ensure_checkpoint_after_promote(Mock()) From 285bffc68dd6c3fdf1a265c68760a9751d5e422d Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 15 May 2020 16:05:07 +0200 Subject: [PATCH 13/40] Use pg_rewind with --restore-target-wal on 13 if possible (#1525) On PostgreSQL 13 check if restore_command is configured and tell pg_rewind to use it --- patroni/postgresql/__init__.py | 9 +++++++++ patroni/postgresql/rewind.py | 8 ++++++-- tests/test_rewind.py | 7 +++++-- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 566494db..248c3a59 100644 --- a/patroni/postgresql/__init__.py +++ 
b/patroni/postgresql/__init__.py @@ -653,6 +653,15 @@ class Postgresql(object): return False return True + def get_guc_value(self, name): + cmd = [self.pgcommand('postgres'), self._data_dir, '-C', name] + try: + data = subprocess.check_output(cmd) + if data: + return data.decode('utf-8').strip() + except Exception as e: + logger.error('Failed to execute %s: %r', cmd, e) + def controldata(self): """ return the contents of pg_controldata, or non-True value if pg_controldata call failed """ # Don't try to call pg_controldata during backup restore diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index 87a3626b..268e6699 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -173,9 +173,13 @@ class Rewind(object): env['PGOPTIONS'] = '-c statement_timeout=0' dsn = self._postgresql.config.format_dsn(r, True) logger.info('running pg_rewind from %s', dsn) + + cmd = [self._postgresql.pgcommand('pg_rewind')] + if self._postgresql.major_version >= 130000 and self._postgresql.get_guc_value('restore_command'): + cmd.append('--restore-target-wal') + cmd.extend(['-D', self._postgresql.data_dir, '--source-server', dsn]) try: - return self._postgresql.cancellable.call([self._postgresql.pgcommand('pg_rewind'), '-D', - self._postgresql.data_dir, '--source-server', dsn], env=env) == 0 + return self._postgresql.cancellable.call(cmd, env=env) == 0 except OSError: return False diff --git a/tests/test_rewind.py b/tests/test_rewind.py index 20739903..272eb79a 100644 --- a/tests/test_rewind.py +++ b/tests/test_rewind.py @@ -35,13 +35,16 @@ class TestRewind(BaseTestPostgresql): self.p.config._config['use_pg_rewind'] = False self.assertFalse(self.r.can_rewind) + @patch.object(Postgresql, 'major_version', PropertyMock(return_value=130000)) @patch.object(CancellableSubprocess, 'call') def test_pg_rewind(self, mock_cancellable_subprocess_call): r = {'user': '', 'host': '', 'port': '', 'database': '', 'password': ''} mock_cancellable_subprocess_call.return_value = 0 - self.assertTrue(self.r.pg_rewind(r)) + with patch('subprocess.check_output', Mock(return_value=b'foo')): + self.assertTrue(self.r.pg_rewind(r)) mock_cancellable_subprocess_call.side_effect = OSError - self.assertFalse(self.r.pg_rewind(r)) + with patch('subprocess.check_output', Mock(side_effect=Exception)): + self.assertFalse(self.r.pg_rewind(r)) @patch.object(Rewind, 'can_rewind', PropertyMock(return_value=True)) def test__get_local_timeline_lsn(self): From 7cf0b753ab922e0f64254da9202957250e156b47 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 15 May 2020 16:13:16 +0200 Subject: [PATCH 14/40] Update optime/leader with checkpoint location after clean shut down (#1527) Potentially this information could be used in order to make sure that there is no data loss on switchover. 
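A rough illustration of what ends up in optime/leader: the new `latest_checkpoint_location()` only reports a value for a cleanly shut down cluster and converts the LSN from `pg_controldata` output into an absolute byte position. The sketch below mirrors `parse_lsn` from `patroni.postgresql.misc`; the `controldata` dict is assumed to be the already-parsed `pg_controldata` output.

    def parse_lsn(lsn):
        # 'X/Y' -> absolute byte position, as patroni.postgresql.misc.parse_lsn does
        t = lsn.split('/')
        return (int(t[0], 16) << 32) + int(t[1], 16)

    def latest_checkpoint_location(controldata):
        # only a cleanly shut down cluster has a trustworthy checkpoint location
        lsn = controldata.get('Latest checkpoint location')
        if controldata.get('Database cluster state') == 'shut down' and lsn:
            try:
                return str(parse_lsn(lsn))
            except (IndexError, ValueError):
                return None

    print(latest_checkpoint_location({'Database cluster state': 'shut down',
                                      'Latest checkpoint location': '0/12345678'}))  # '305419896'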
--- patroni/dcs/__init__.py | 13 +++++++++++-- patroni/dcs/consul.py | 2 +- patroni/dcs/etcd.py | 2 +- patroni/dcs/kubernetes.py | 10 ++++++++-- patroni/dcs/zookeeper.py | 2 +- patroni/ha.py | 14 ++++++++------ patroni/postgresql/__init__.py | 13 ++++++++++++- tests/test_ha.py | 5 ++++- tests/test_kubernetes.py | 2 +- tests/test_postgresql.py | 5 +++++ 10 files changed, 52 insertions(+), 16 deletions(-) diff --git a/patroni/dcs/__init__.py b/patroni/dcs/__init__.py index 193b7fe1..473cbe4a 100644 --- a/patroni/dcs/__init__.py +++ b/patroni/dcs/__init__.py @@ -758,10 +758,19 @@ class AbstractDCS(object): otherwise it should return `!False`""" @abc.abstractmethod - def delete_leader(self): - """Voluntarily remove leader key from DCS + def _delete_leader(self): + """Remove leader key from DCS. This method should remove leader key if current instance is the leader""" + def delete_leader(self, last_operation=None): + """Update optime/leader and voluntarily remove leader key from DCS. + This method should remove leader key if current instance is the leader. + :param last_operation: latest checkpoint location in bytes""" + + if last_operation: + self.write_leader_optime(last_operation) + return self._delete_leader() + @abc.abstractmethod def cancel_initialization(self): """ Removes the initialize key for a cluster """ diff --git a/patroni/dcs/consul.py b/patroni/dcs/consul.py index efa81ea1..c22d0f8e 100644 --- a/patroni/dcs/consul.py +++ b/patroni/dcs/consul.py @@ -503,7 +503,7 @@ class Consul(AbstractDCS): return self._client.kv.put(self.history_path, value) @catch_consul_errors - def delete_leader(self): + def _delete_leader(self): cluster = self.cluster if cluster and isinstance(cluster.leader, Leader) and cluster.leader.name == self._name: return self._client.kv.delete(self.leader_path, cas=cluster.leader.index) diff --git a/patroni/dcs/etcd.py b/patroni/dcs/etcd.py index 844948ea..cb13f4e7 100644 --- a/patroni/dcs/etcd.py +++ b/patroni/dcs/etcd.py @@ -625,7 +625,7 @@ class Etcd(AbstractDCS): return self.retry(self._client.write, self.initialize_path, sysid, prevExist=(not create_new)) @catch_etcd_errors - def delete_leader(self): + def _delete_leader(self): return self._client.delete(self.leader_path, prevValue=self._name) @catch_etcd_errors diff --git a/patroni/dcs/kubernetes.py b/patroni/dcs/kubernetes.py index 9244ca04..15f4463d 100644 --- a/patroni/dcs/kubernetes.py +++ b/patroni/dcs/kubernetes.py @@ -546,9 +546,15 @@ class Kubernetes(AbstractDCS): resource_version = cluster.config.index if cluster and cluster.config and cluster.config.index else None return self.patch_or_create_config({self._INITIALIZE: sysid}, resource_version) - def delete_leader(self): + def _delete_leader(self): + """Unused""" + + def delete_leader(self, last_operation=None): if self.cluster and isinstance(self.cluster.leader, Leader) and self.cluster.leader.name == self._name: - self.patch_or_create(self.leader_path, {self._LEADER: None}, self._leader_resource_version, True, False, []) + annotations = {self._LEADER: None} + if last_operation: + annotations[self._OPTIME] = last_operation + self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, True, False, []) self.reset_cluster() def cancel_initialization(self): diff --git a/patroni/dcs/zookeeper.py b/patroni/dcs/zookeeper.py index 8e9f5637..336fca70 100644 --- a/patroni/dcs/zookeeper.py +++ b/patroni/dcs/zookeeper.py @@ -315,7 +315,7 @@ class ZooKeeper(AbstractDCS): def _update_leader(self): return True - def delete_leader(self): + def 
_delete_leader(self): self._client.restart() return True diff --git a/patroni/ha.py b/patroni/ha.py index 07a8122e..4b62159f 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -748,13 +748,13 @@ class Ha(object): return self._is_healthiest_node(members.values()) - def _delete_leader(self): + def _delete_leader(self, last_operation=None): self.set_is_leader(False) - self.dcs.delete_leader() + self.dcs.delete_leader(last_operation) self.dcs.reset_cluster() - def release_leader_key_voluntarily(self): - self._delete_leader() + def release_leader_key_voluntarily(self, last_operation=None): + self._delete_leader(last_operation) self.touch_member() logger.info("Leader key released") @@ -784,8 +784,9 @@ class Ha(object): self.set_is_leader(False) if mode_control['release']: + checkpoint_location = self.state_handler.latest_checkpoint_location() if mode == 'graceful' else None with self._async_executor: - self.release_leader_key_voluntarily() + self.release_leader_key_voluntarily(checkpoint_location) time.sleep(2) # Give a time to somebody to take the leader lock if mode_control['offline']: node_to_follow, leader = None, None @@ -1384,7 +1385,8 @@ class Ha(object): stop_timeout=self.master_stop_timeout())) if not self.state_handler.is_running(): if self.has_lock(): - self.dcs.delete_leader() + checkpoint_location = self.state_handler.latest_checkpoint_location() + self.dcs.delete_leader(checkpoint_location) self.touch_member() else: # XXX: what about when Patroni is started as the wrong user that has access to the watchdog device diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 248c3a59..6bed69e9 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -13,7 +13,7 @@ from patroni.postgresql.bootstrap import Bootstrap from patroni.postgresql.cancellable import CancellableSubprocess from patroni.postgresql.config import ConfigHandler from patroni.postgresql.connection import Connection, get_connection_cursor -from patroni.postgresql.misc import parse_history, postgres_major_version_to_int +from patroni.postgresql.misc import parse_history, parse_lsn, postgres_major_version_to_int from patroni.postgresql.postmaster import PostmasterProcess from patroni.postgresql.slots import SlotsHandler from patroni.exceptions import PostgresConnectionException @@ -310,6 +310,17 @@ class Postgresql(object): except (TypeError, ValueError): logger.exception('Failed to parse timeline from pg_controldata output') + def latest_checkpoint_location(self): + """Returns checkpoint location for the cleanly shut down primary""" + + data = self.controldata() + lsn = data.get('Latest checkpoint location') + if data.get('Database cluster state') == 'shut down' and lsn: + try: + return str(parse_lsn(lsn)) + except (IndexError, ValueError) as e: + logger.error('Exception when parsing lsn %s: %r', lsn, e) + def is_running(self): """Returns PostmasterProcess if one is running on the data directory or None. 
If most recently seen process is running updates the cached process based on pid file."""
diff --git a/tests/test_ha.py b/tests/test_ha.py
index 50a1bc75..80550ec8 100644
--- a/tests/test_ha.py
+++ b/tests/test_ha.py
@@ -154,7 +154,10 @@ def run_async(self, func, args=()):
@patch.object(Postgresql, '_cluster_info_state_get', Mock(return_value=3))
@patch.object(Postgresql, 'call_nowait', Mock(return_value=True))
@patch.object(Postgresql, 'data_directory_empty', Mock(return_value=False))
-@patch.object(Postgresql, 'controldata', Mock(return_value={'Database system identifier': SYSID}))
+@patch.object(Postgresql, 'controldata', Mock(return_value={
+ 'Database system identifier': SYSID,
+ 'Database cluster state': 'shut down',
+ 'Latest checkpoint location': '0/12345678'}))
@patch.object(SlotsHandler, 'sync_replication_slots', Mock())
@patch.object(ConfigHandler, 'append_pg_hba', Mock())
@patch.object(ConfigHandler, 'write_pgpass', Mock(return_value={}))
diff --git a/tests/test_kubernetes.py b/tests/test_kubernetes.py
index 5fb89c23..e82870f4 100644
--- a/tests/test_kubernetes.py
+++ b/tests/test_kubernetes.py
@@ -113,7 +113,7 @@ class TestKubernetes(unittest.TestCase):
self.k.initialize()
def test_delete_leader(self):
- self.k.delete_leader()
+ self.k.delete_leader(1)
def test_cancel_initialization(self):
self.k.cancel_initialization()
diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py
index 78a9eeb5..858b6f2a 100644
--- a/tests/test_postgresql.py
+++ b/tests/test_postgresql.py
@@ -338,6 +338,11 @@ class TestPostgresql(BaseTestPostgresql):
with patch.object(Postgresql, '_query', Mock(side_effect=RetryFailedError(''))):
self.assertRaises(PostgresConnectionException, self.p.is_leader)
+ @patch.object(Postgresql, 'controldata',
+ Mock(return_value={'Database cluster state': 'shut down', 'Latest checkpoint location': 'X/678'}))
+ def test_latest_checkpoint_location(self):
+ self.assertIsNone(self.p.latest_checkpoint_location())
+
def test_reload(self):
self.assertTrue(self.p.reload())

From 30aa355eb51116c97a2a3dbaf68bc512d9af5aaa Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin
Date: Fri, 15 May 2020 16:14:25 +0200
Subject: [PATCH 15/40] Shorten and beautify history log output (#1526)

When Patroni is trying to figure out the necessity of pg_rewind, it could write the content of the history file from the primary into the log. The history file grows with every failover/switchover and eventually starts taking up too many lines in the log, most of which are not very useful.

Instead of showing the raw data, we will show only 3 lines before the current replica timeline and 2 lines after. 
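The windowing itself boils down to the following sketch (a simplified rendering of the `_log_master_history()` helper added below; `history` is the parsed list of `(timeline, switchpoint, reason)` lines and `i` is the index at which the local timeline matched):

    def format_lsn(lsn):
        return '{0:X}/{1:X}'.format(lsn >> 32, lsn & 0xFFFFFFFF)

    def summarize_history(history, i):
        start = max(0, i - 3)
        end = None if i + 4 >= len(history) else i + 2
        lines = ['{0}\t{1}\t{2}'.format(line[0], format_lsn(line[1]), line[2])
                 for line in history[start:end]]
        if end is not None:  # the window stopped early: elide up to the last line
            last = history[-1]
            lines += ['...', '{0}\t{1}\t{2}'.format(last[0], format_lsn(last[1]), last[2])]
        return '\n'.join(lines)

    # nine timelines, local timeline matched at index 1: show 1..3, then the tail
    print(summarize_history([[n, n, ''] for n in range(1, 10)], 1))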
--- patroni/postgresql/rewind.py | 33 +++++++++++++++++++++++++++++---- tests/__init__.py | 2 +- tests/test_postgresql.py | 2 +- tests/test_rewind.py | 9 ++++++++- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index 268e6699..b30458b8 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -15,6 +15,11 @@ REWIND_STATUS = type('Enum', (), {'INITIAL': 0, 'CHECKPOINT': 1, 'CHECK': 2, 'NE 'NOT_NEED': 4, 'SUCCESS': 5, 'FAILED': 6}) +def format_lsn(lsn, full=False): + template = '{0:X}/{1:08X}' if full else '{0:X}/{1:X}' + return template.format(lsn >> 32, lsn & 0xFFFFFFFF) + + class Rewind(object): def __init__(self, postgresql): @@ -88,6 +93,24 @@ class Rewind(object): logger.info('Local timeline=%s lsn=%s', timeline, lsn) return timeline, lsn + @staticmethod + def _log_master_history(history, i): + start = max(0, i - 3) + end = None if i + 4 >= len(history) else i + 2 + history_show = [] + + def format_history_line(l): + return '{0}\t{1}\t{2}'.format(l[0], format_lsn(l[1]), l[2]) + + for line in history[start:end]: + history_show.append(format_history_line(line)) + + if line != history[-1]: + history_show.append('...') + history_show.append(format_history_line(history[-1])) + + logger.info('master: history=%s', '\n'.join(history_show)) + def _check_timeline_and_lsn(self, leader): local_timeline, local_lsn = self._get_local_timeline_lsn() if local_timeline is None or local_lsn is None: @@ -108,17 +131,18 @@ class Rewind(object): logger.info('master_timeline=%s', master_timeline) if local_timeline > master_timeline: # Not always supported by pg_rewind need_rewind = True + elif local_timeline == master_timeline: + need_rewind = False elif master_timeline > 1: cur.execute('TIMELINE_HISTORY %s', (master_timeline,)) history = bytes(cur.fetchone()[1]).decode('utf-8') - logger.info('master: history=%s', history) - else: # local_timeline == master_timeline == 1 - need_rewind = False + logger.debug('master: history=%s', history) except Exception: return logger.exception('Exception when working with master via replication connection') if history is not None: - for parent_timeline, switchpoint, _ in parse_history(history): + history = list(parse_history(history)) + for i, (parent_timeline, switchpoint, _) in enumerate(history): if parent_timeline == local_timeline: try: need_rewind = parse_lsn(local_lsn) >= switchpoint @@ -127,6 +151,7 @@ class Rewind(object): break elif parent_timeline > local_timeline: break + self._log_master_history(history, i) self._state = need_rewind and REWIND_STATUS.NEED or REWIND_STATUS.NOT_NEED diff --git a/tests/__init__.py b/tests/__init__.py index 8a075d61..2bf33b3d 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -106,7 +106,7 @@ class MockCursor(object): ('autovacuum', 'on', None, 'bool', 'sighup'), ('unix_socket_directories', '/tmp', None, 'string', 'postmaster')] elif sql.startswith('IDENTIFY_SYSTEM'): - self.results = [('1', 2, '0/402EEC0', '')] + self.results = [('1', 3, '0/402EEC0', '')] elif sql.startswith('SELECT isdir, modification'): self.results = [(False, datetime.datetime.now())] elif sql.startswith('SELECT pg_catalog.pg_read_file'): diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py index 858b6f2a..144cd976 100644 --- a/tests/test_postgresql.py +++ b/tests/test_postgresql.py @@ -711,7 +711,7 @@ class TestPostgresql(BaseTestPostgresql): self.assertTrue(self.p.fix_cluster_state()) def test_replica_cached_timeline(self): - 
self.assertEqual(self.p.replica_cached_timeline(1), 2) + self.assertEqual(self.p.replica_cached_timeline(2), 3) def test_get_master_timeline(self): self.assertEqual(self.p.get_master_timeline(), 1) diff --git a/tests/test_rewind.py b/tests/test_rewind.py index 272eb79a..810cfec9 100644 --- a/tests/test_rewind.py +++ b/tests/test_rewind.py @@ -85,6 +85,13 @@ class TestRewind(BaseTestPostgresql): with patch.object(Postgresql, 'is_running', Mock(return_value=True)): self.r.execute(self.leader) + @patch('patroni.postgresql.rewind.logger.info') + def test__log_master_history(self, mock_logger): + history = [[n, n, ''] for n in range(1, 10)] + self.r._log_master_history(history, 1) + expected = '\n'.join(['{0}\t0/{0}\t'.format(n) for n in range(1, 4)] + ['...', '9\t0/9\t']) + self.assertEqual(mock_logger.call_args[0][1], expected) + @patch.object(Postgresql, 'start', Mock()) @patch.object(Rewind, 'can_rewind', PropertyMock(return_value=True)) @patch.object(Rewind, '_get_local_timeline_lsn', Mock(return_value=(2, '40159C1'))) @@ -101,7 +108,7 @@ class TestRewind(BaseTestPostgresql): with patch('psycopg2.connect', Mock(side_effect=Exception)): self.assertFalse(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) self.r.trigger_check_diverged_lsn() - with patch.object(MockCursor, 'fetchone', Mock(side_effect=[('', 2, '0/0'), ('', b'3\t0/40159C0\tn\n')])): + with patch.object(MockCursor, 'fetchone', Mock(side_effect=[('', 3, '0/0'), ('', b'3\t0/40159C0\tn\n')])): self.assertFalse(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) self.r.trigger_check_diverged_lsn() with patch.object(MockCursor, 'fetchone', Mock(return_value=('', 1, '0/0'))): From 4cc6034165eba747b90c96851bdd9ec2c53f0c20 Mon Sep 17 00:00:00 2001 From: Pavlo Golub Date: Fri, 15 May 2020 16:22:15 +0200 Subject: [PATCH 16/40] Fix features/steps/standby_cluster.py under Windows (#1535) Resolves #1534 --- features/steps/standby_cluster.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/features/steps/standby_cluster.py b/features/steps/standby_cluster.py index 063d9f82..c04fd6f3 100644 --- a/features/steps/standby_cluster.py +++ b/features/steps/standby_cluster.py @@ -10,7 +10,8 @@ SELECT * FROM pg_catalog.pg_stat_replication WHERE application_name = '{0}' """ -callback = sys.executable + " features/callback2.py " +executable = sys.executable if os.name != 'nt' else sys.executable.replace('\\', '/') +callback = executable + " features/callback2.py " @step('I start {name:w} with callback configured') @@ -18,7 +19,7 @@ def start_patroni_with_callbacks(context, name): return context.pctl.start(name, custom_config={ "postgresql": { "callbacks": { - "on_role_change": sys.executable + " features/callback.py" + "on_role_change": executable + " features/callback.py" } } }) @@ -31,7 +32,7 @@ def start_patroni(context, name, cluster_name): "postgresql": { "callbacks": {c: callback + name for c in ('on_start', 'on_stop', 'on_restart', 'on_role_change')}, "backup_restore": { - "command": (sys.executable + " features/backup_restore.py --sourcedir=" + + "command": (executable + " features/backup_restore.py --sourcedir=" + os.path.join(context.pctl.patroni_path, 'data', 'basebackup'))} } }) From 08b3d5d20dcdd5bd0f4db957b0d4dfcd66fa7d41 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 15 May 2020 16:22:57 +0200 Subject: [PATCH 17/40] Move ensure_clean_shutdown into rewind module (#1528) Logically fits there better --- patroni/ha.py | 2 +- patroni/postgresql/__init__.py | 47 
---------------------------------- patroni/postgresql/rewind.py | 47 ++++++++++++++++++++++++++++++++++ tests/test_ha.py | 2 +- tests/test_postgresql.py | 36 -------------------------- tests/test_rewind.py | 39 +++++++++++++++++++++++++++- 6 files changed, 87 insertions(+), 86 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index 4b62159f..3219a18e 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -323,7 +323,7 @@ class Ha(object): (self.cluster.is_unlocked() or self._rewind.can_rewind): self._crash_recovery_executed = True msg = 'doing crash recovery in a single user mode' - return self._async_executor.try_run_async(msg, self.state_handler.fix_cluster_state) or msg + return self._async_executor.try_run_async(msg, self._rewind.ensure_clean_shutdown) or msg self.load_cluster_from_dcs() diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 6bed69e9..8fd158b9 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -887,53 +887,6 @@ class Postgresql(object): return candidates[0], False return None, False - def read_postmaster_opts(self): - """returns the list of option names/values from postgres.opts, Empty dict if read failed or no file""" - result = {} - try: - with open(os.path.join(self._data_dir, 'postmaster.opts')) as f: - data = f.read() - for opt in data.split('" "'): - if '=' in opt and opt.startswith('--'): - name, val = opt.split('=', 1) - result[name.strip('-')] = val.rstrip('"\n') - except IOError: - logger.exception('Error when reading postmaster.opts') - return result - - def single_user_mode(self, command=None, options=None): - """run a given command in a single-user mode. If the command is empty - then just start and stop""" - cmd = [self.pgcommand('postgres'), '--single', '-D', self._data_dir] - for opt, val in sorted((options or {}).items()): - cmd.extend(['-c', '{0}={1}'.format(opt, val)]) - # need a database name to connect - cmd.append(self._database) - return self.cancellable.call(cmd, communicate_input=command) - - def cleanup_archive_status(self): - status_dir = os.path.join(self._data_dir, 'pg_' + self.wal_name, 'archive_status') - try: - for f in os.listdir(status_dir): - path = os.path.join(status_dir, f) - try: - if os.path.islink(path): - os.unlink(path) - elif os.path.isfile(path): - os.remove(path) - except OSError: - logger.exception('Unable to remove %s', path) - except OSError: - logger.exception('Unable to list %s', status_dir) - - def fix_cluster_state(self): - self.cleanup_archive_status() - - # Start in a single user mode and stop to produce a clean shutdown - opts = self.read_postmaster_opts() - opts.update({'archive_mode': 'on', 'archive_command': 'false'}) - self.config.remove_recovery_conf() - return self.single_user_mode(options=opts) == 0 or None - def schedule_sanity_checks_after_pause(self): """ After coming out of pause we have to: diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index b30458b8..1e6cd621 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -262,3 +262,50 @@ class Rewind(object): @property def failed(self): return self._state == REWIND_STATUS.FAILED + + def read_postmaster_opts(self): + """returns the list of option names/values from postgres.opts, Empty dict if read failed or no file""" + result = {} + try: + with open(os.path.join(self._postgresql.data_dir, 'postmaster.opts')) as f: + data = f.read() + for opt in data.split('" "'): + if '=' in opt and opt.startswith('--'): + name, val = opt.split('=', 1) + 
result[name.strip('-')] = val.rstrip('"\n') + except IOError: + logger.exception('Error when reading postmaster.opts') + return result + + def single_user_mode(self, command=None, options=None): + """run a given command in a single-user mode. If the command is empty - then just start and stop""" + cmd = [self._postgresql.pgcommand('postgres'), '--single', '-D', self._postgresql.data_dir] + for opt, val in sorted((options or {}).items()): + cmd.extend(['-c', '{0}={1}'.format(opt, val)]) + # need a database name to connect + cmd.append('template1') + return self._postgresql.cancellable.call(cmd, communicate_input=command) + + def cleanup_archive_status(self): + status_dir = os.path.join(self._postgresql.data_dir, 'pg_' + self._postgresql.wal_name, 'archive_status') + try: + for f in os.listdir(status_dir): + path = os.path.join(status_dir, f) + try: + if os.path.islink(path): + os.unlink(path) + elif os.path.isfile(path): + os.remove(path) + except OSError: + logger.exception('Unable to remove %s', path) + except OSError: + logger.exception('Unable to list %s', status_dir) + + def ensure_clean_shutdown(self): + self.cleanup_archive_status() + + # Start in a single user mode and stop to produce a clean shutdown + opts = self.read_postmaster_opts() + opts.update({'archive_mode': 'on', 'archive_command': 'false'}) + self._postgresql.config.remove_recovery_conf() + return self.single_user_mode(options=opts) == 0 or None diff --git a/tests/test_ha.py b/tests/test_ha.py index 80550ec8..2597d416 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -257,7 +257,7 @@ class TestHa(PostgresInit): self.ha.cluster = get_cluster_initialized_with_leader() self.assertEqual(self.ha.run_cycle(), 'starting as readonly because i had the session lock') - @patch.object(Postgresql, 'fix_cluster_state', Mock()) + @patch.object(Rewind, 'ensure_clean_shutdown', Mock()) def test_crash_recovery(self): self.p.is_running = false self.p.controldata = lambda: {'Database cluster state': 'in production', 'Database system identifier': SYSID} diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py index 144cd976..9f52c0b9 100644 --- a/tests/test_postgresql.py +++ b/tests/test_postgresql.py @@ -674,42 +674,6 @@ class TestPostgresql(BaseTestPostgresql): mock_postmaster.signal_stop.assert_called() mock_postmaster.wait.assert_called() - def test_read_postmaster_opts(self): - m = mock_open(read_data='/usr/lib/postgres/9.6/bin/postgres "-D" "data/postgresql0" \ -"--listen_addresses=127.0.0.1" "--port=5432" "--hot_standby=on" "--wal_level=hot_standby" \ -"--wal_log_hints=on" "--max_wal_senders=5" "--max_replication_slots=5"\n') - with patch.object(builtins, 'open', m): - data = self.p.read_postmaster_opts() - self.assertEqual(data['wal_level'], 'hot_standby') - self.assertEqual(int(data['max_replication_slots']), 5) - self.assertEqual(data.get('D'), None) - - m.side_effect = IOError - data = self.p.read_postmaster_opts() - self.assertEqual(data, dict()) - - @patch('psutil.Popen') - def test_single_user_mode(self, subprocess_popen_mock): - subprocess_popen_mock.return_value.wait.return_value = 0 - self.assertEqual(self.p.single_user_mode('CHECKPOINT', {'archive_mode': 'on'}), 0) - - @patch('os.listdir', Mock(side_effect=[OSError, ['a', 'b']])) - @patch('os.unlink', Mock(side_effect=OSError)) - @patch('os.remove', Mock()) - @patch('os.path.islink', Mock(side_effect=[True, False])) - @patch('os.path.isfile', Mock(return_value=True)) - def test_cleanup_archive_status(self): - self.p.cleanup_archive_status() - 
self.p.cleanup_archive_status()
-
- @patch('os.unlink', Mock())
- @patch('os.listdir', Mock(return_value=[]))
- @patch('os.path.isfile', Mock(return_value=True))
- @patch.object(Postgresql, 'read_postmaster_opts', Mock(return_value={}))
- @patch.object(Postgresql, 'single_user_mode', Mock(return_value=0))
- def test_fix_cluster_state(self):
- self.assertTrue(self.p.fix_cluster_state())
-
def test_replica_cached_timeline(self):
self.assertEqual(self.p.replica_cached_timeline(2), 3)

diff --git a/tests/test_rewind.py b/tests/test_rewind.py
index 810cfec9..343de8b1 100644
--- a/tests/test_rewind.py
+++ b/tests/test_rewind.py
@@ -1,8 +1,9 @@
-from mock import Mock, PropertyMock, patch
+from mock import Mock, PropertyMock, patch, mock_open
from patroni.postgresql import Postgresql
from patroni.postgresql.cancellable import CancellableSubprocess
from patroni.postgresql.rewind import Rewind
+from six.moves import builtins
from . import BaseTestPostgresql, MockCursor, psycopg2_connect
@@ -122,6 +123,42 @@ class TestRewind(BaseTestPostgresql):
self.r.check_leader_is_not_in_recovery()
self.r.check_leader_is_not_in_recovery()
+ def test_read_postmaster_opts(self):
+ m = mock_open(read_data='/usr/lib/postgres/9.6/bin/postgres "-D" "data/postgresql0" \
+"--listen_addresses=127.0.0.1" "--port=5432" "--hot_standby=on" "--wal_level=hot_standby" \
+"--wal_log_hints=on" "--max_wal_senders=5" "--max_replication_slots=5"\n')
+ with patch.object(builtins, 'open', m):
+ data = self.r.read_postmaster_opts()
+ self.assertEqual(data['wal_level'], 'hot_standby')
+ self.assertEqual(int(data['max_replication_slots']), 5)
+ self.assertEqual(data.get('D'), None)
+
+ m.side_effect = IOError
+ data = self.r.read_postmaster_opts()
+ self.assertEqual(data, dict())
+
+ @patch('psutil.Popen')
+ def test_single_user_mode(self, subprocess_popen_mock):
+ subprocess_popen_mock.return_value.wait.return_value = 0
+ self.assertEqual(self.r.single_user_mode('CHECKPOINT', {'archive_mode': 'on'}), 0)
+
+ @patch('os.listdir', Mock(side_effect=[OSError, ['a', 'b']]))
+ @patch('os.unlink', Mock(side_effect=OSError))
+ @patch('os.remove', Mock())
+ @patch('os.path.islink', Mock(side_effect=[True, False]))
+ @patch('os.path.isfile', Mock(return_value=True))
+ def test_cleanup_archive_status(self):
+ self.r.cleanup_archive_status()
+ self.r.cleanup_archive_status()
+
+ @patch('os.unlink', Mock())
+ @patch('os.listdir', Mock(return_value=[]))
+ @patch('os.path.isfile', Mock(return_value=True))
+ @patch.object(Rewind, 'read_postmaster_opts', Mock(return_value={}))
+ @patch.object(Rewind, 'single_user_mode', Mock(return_value=0))
+ def test_ensure_clean_shutdown(self):
+ self.assertTrue(self.r.ensure_clean_shutdown())
+
@patch('patroni.postgresql.rewind.Thread', MockThread)
@patch.object(Postgresql, 'controldata')
@patch.object(Postgresql, 'checkpoint')

From ad5c686c11df6631bea3d5b88ee2b55460cbdb76 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin
Date: Fri, 15 May 2020 18:04:24 +0200
Subject: [PATCH 18/40] Take advantage of pg_stat_wal_receiver (#1513)

So far Patroni was parsing `recovery.conf` or querying `pg_settings` in order to get the current values of recovery parameters. On PostgreSQL earlier than 12 it could easily happen that the value of `primary_conninfo` in the `recovery.conf` has nothing to do with reality.

Luckily for us, on PostgreSQL 9.6+ there is a `pg_stat_wal_receiver` view, which contains current values of `primary_conninfo` and `primary_slot_name`. 
The password field is masked, but this is fine, because authentication happens only when the connection is being opened. All other parameters are compared as usual.

Another advantage of `pg_stat_wal_receiver` is that it contains the current timeline, therefore on 9.6+ we don't need to use the replication connection trick if the walreceiver process is alive. If there is no walreceiver process available, or it is not streaming, we stick to the old methods.
---
patroni/ha.py | 2 ++
patroni/postgresql/__init__.py | 28 +++++++++++++++++++++++-----
patroni/postgresql/config.py | 18 ++++++++++++++++++
patroni/postgresql/rewind.py | 4 ++--
tests/__init__.py | 2 +-
tests/test_ha.py | 1 +
tests/test_postgresql.py | 11 +++++++++++
7 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/patroni/ha.py b/patroni/ha.py
index 3219a18e..953c9636 100644
--- a/patroni/ha.py
+++ b/patroni/ha.py
@@ -198,6 +198,8 @@ class Ha(object):
try:
timeline, wal_position, pg_control_timeline = self.state_handler.timeline_wal_position()
data['xlog_location'] = wal_position
+ if not timeline: # try pg_stat_wal_receiver to get the timeline
+ timeline = self.state_handler.received_timeline()
if not timeline:
# So far the only way to get the current timeline on the standby is from
# the replication connection. In order to avoid opening the replication
diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py
index 8fd158b9..ff3efa24 100644
--- a/patroni/postgresql/__init__.py
+++ b/patroni/postgresql/__init__.py
@@ -135,17 +135,25 @@ class Postgresql(object):
@property
def cluster_info_query(self):
- pg_control_timeline = 'timeline_id FROM pg_catalog.pg_control_checkpoint()' \
- if self._major_version >= 90600 and self.role == 'standby_leader' else '0'
+ if self._major_version >= 90600:
+ extra = (", CASE WHEN latest_end_lsn IS NULL THEN NULL ELSE received_tli END,"
+ " slot_name, conninfo FROM pg_catalog.pg_stat_get_wal_receiver()")
+ if self.role == 'standby_leader':
+ extra = "timeline_id" + extra + ", pg_catalog.pg_control_checkpoint()"
+ else:
+ extra = "0" + extra
+ else:
+ extra = "0, NULL, NULL, NULL"
+
return ("SELECT CASE WHEN pg_catalog.pg_is_in_recovery() THEN 0 "
"ELSE ('x' || pg_catalog.substr(pg_catalog.pg_{0}file_name("
"pg_catalog.pg_current_{0}_{1}()), 1, 8))::bit(32)::int END, "
"CASE WHEN pg_catalog.pg_is_in_recovery() THEN GREATEST("
" pg_catalog.pg_{0}_{1}_diff(COALESCE("
"pg_catalog.pg_last_{0}_receive_{1}(), '0/0'), '0/0')::bigint,"
" pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_last_{0}_replay_{1}(), '0/0')::bigint) "
"ELSE pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_current_{0}_{1}(), '0/0')::bigint "
"END, {2}").format(self.wal_name, self.lsn_name, extra)
def _version_file_exists(self):
return not self.data_directory_empty() and os.path.isfile(self._version_file)
@@ -290,7 +298,8 @@ class Postgresql(object):
if not self._cluster_info_state:
try:
result = self._is_leader_retry(self._query, self.cluster_info_query).fetchone()
- self._cluster_info_state = dict(zip(['timeline', 'wal_position', 'pg_control_timeline'], result))
+ self._cluster_info_state = dict(zip(['timeline', 'wal_position', 'pg_control_timeline',
+ 'received_tli', 'slot_name', 'conninfo'], result))
except RetryFailedError as e: # SELECT failed two times
self._cluster_info_state = {'error': str(e)}
if not self.is_starting() and self.pg_isready() == STATE_REJECT:
@@ 
-301,6 +310,15 @@ class Postgresql(object): return self._cluster_info_state.get(name) + def primary_slot_name(self): + return self._cluster_info_state_get('slot_name') + + def primary_conninfo(self): + return self._cluster_info_state_get('conninfo') + + def received_timeline(self): + return self._cluster_info_state_get('received_tli') + def is_leader(self): return bool(self._cluster_info_state_get('timeline')) diff --git a/patroni/postgresql/config.py b/patroni/postgresql/config.py index fcab640e..2e16e1e4 100644 --- a/patroni/postgresql/config.py +++ b/patroni/postgresql/config.py @@ -652,6 +652,17 @@ class ConfigHandler(object): elif not primary_conninfo: return False + wal_receiver_primary_conninfo = self._postgresql.primary_conninfo() + if wal_receiver_primary_conninfo: + wal_receiver_primary_conninfo = parse_dsn(wal_receiver_primary_conninfo) + # when wal receiver is alive use primary_conninfo from pg_stat_wal_receiver for comparison + if wal_receiver_primary_conninfo: + primary_conninfo = wal_receiver_primary_conninfo + # There could be no password in the primary_conninfo or it is masked. + # Just copy the "desired" value in order to make comparison succeed. + if 'password' in wanted_primary_conninfo: + primary_conninfo['password'] = wanted_primary_conninfo['password'] + if 'passfile' in primary_conninfo and 'password' not in primary_conninfo \ and 'password' in wanted_primary_conninfo: if self._check_passfile(primary_conninfo['passfile'], wanted_primary_conninfo): @@ -695,6 +706,13 @@ class ConfigHandler(object): else: # empty string, primary_conninfo is not in the config primary_conninfo[0] = {} + # when wal receiver is alive take primary_slot_name from pg_stat_wal_receiver + wal_receiver_primary_slot_name = self._postgresql.primary_slot_name() + if not wal_receiver_primary_slot_name and self._postgresql.primary_conninfo(): + wal_receiver_primary_slot_name = '' + if wal_receiver_primary_slot_name is not None: + self._current_recovery_params['primary_slot_name'][0] = wal_receiver_primary_slot_name + required = {'restart': 0, 'reload': 0} def record_missmatch(mtype): diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index 1e6cd621..06a0e49c 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -99,8 +99,8 @@ class Rewind(object): end = None if i + 4 >= len(history) else i + 2 history_show = [] - def format_history_line(l): - return '{0}\t{1}\t{2}'.format(l[0], format_lsn(l[1]), l[2]) + def format_history_line(line): + return '{0}\t{1}\t{2}'.format(line[0], format_lsn(line[1]), line[2]) for line in history[start:end]: history_show.append(format_history_line(line)) diff --git a/tests/__init__.py b/tests/__init__.py index 2bf33b3d..ab67116b 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -88,7 +88,7 @@ class MockCursor(object): elif sql.startswith('SELECT slot_name'): self.results = [('blabla', 'physical'), ('foobar', 'physical'), ('ls', 'logical', 'a', 'b')] elif sql.startswith('SELECT CASE WHEN pg_catalog.pg_is_in_recovery()'): - self.results = [(1, 2, 1)] + self.results = [(1, 2, 1, 1, None, None)] elif sql.startswith('SELECT pg_catalog.pg_is_in_recovery()'): self.results = [(False, 2)] elif sql.startswith('SELECT pg_catalog.to_char'): diff --git a/tests/test_ha.py b/tests/test_ha.py index 2597d416..a35641f3 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -202,6 +202,7 @@ class TestHa(PostgresInit): self.p.last_operation = Mock(side_effect=PostgresConnectionException('')) self.assertTrue(self.ha.update_lock(True)) + 
@patch.object(Postgresql, 'received_timeline', Mock(return_value=None))
def test_touch_member(self):
self.p.timeline_wal_position = Mock(return_value=(0, 1, 0))
self.p.replica_cached_timeline = Mock(side_effect=Exception)
diff --git a/tests/test_postgresql.py b/tests/test_postgresql.py
index 9f52c0b9..adb92a32 100644
--- a/tests/test_postgresql.py
+++ b/tests/test_postgresql.py
@@ -250,6 +250,10 @@ class TestPostgresql(BaseTestPostgresql):
self.p.config.write_recovery_conf({'standby_mode': 'on', 'primary_conninfo': conninfo.copy()})
self.p.config.write_postgresql_conf()
self.assertEqual(self.p.config.check_recovery_conf(None), (False, False))
+ with patch.object(Postgresql, 'primary_conninfo', Mock(return_value='host=1')):
+ mock_get_pg_settings.return_value['primary_slot_name'] = [
+ 'primary_slot_name', '', '', 'string', 'postmaster', self.p.config._postgresql_conf]
+ self.assertEqual(self.p.config.check_recovery_conf(None), (True, True))
@patch.object(Postgresql, 'major_version', PropertyMock(return_value=120000))
@patch.object(Postgresql, 'is_running', MockPostmaster)
@@ -267,6 +271,7 @@ class TestPostgresql(BaseTestPostgresql):
self.assertEqual(self.p.config.check_recovery_conf(None), (True, True))
@patch.object(Postgresql, 'major_version', PropertyMock(return_value=100000))
+ @patch.object(Postgresql, 'primary_conninfo', Mock(return_value='host=1'))
def test__read_recovery_params_pre_v12(self):
self.p.config.write_recovery_conf({'standby_mode': 'on', 'primary_conninfo': {'password': 'foo'}})
self.assertEqual(self.p.config.check_recovery_conf(None), (True, True))
@@ -695,3 +700,9 @@ class TestPostgresql(BaseTestPostgresql):
@patch('os.path.isfile', Mock(return_value=False))
def test_pgpass_is_dir(self):
self.assertRaises(PatroniException, self.setUp)
+
+ @patch.object(Postgresql, '_query', Mock(side_effect=RetryFailedError('')))
+ def test_received_timeline(self):
+ self.p.set_role('standby_leader')
+ self.p.reset_cluster_info_state()
+ self.assertRaises(PostgresConnectionException, self.p.received_timeline)

From 881bba9e1cd081f7cd4c340f9a6ad977daa92c58 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin
Date: Fri, 15 May 2020 18:04:59 +0200
Subject: [PATCH 19/40] Sync HA loops of all pods in one cluster (#1515)

There is no expire mechanism available on K8s, therefore we implement a soft leader lock: every pod is "watching" for changes of the leader object, and when there are no changes during the TTL it starts the leader race.

Before we switched to the LIST+WATCH approach in #1189 and #1276, we only watched the leader object, and every time it was updated the main thread of the HA loop was waking up. As a result, all replica pods were synchronized and started the leader race at more or less the same time. The new approach made all pods "unsynchronized", and its biggest downside is that in the worst case it takes `ttl + loop_wait` to detect a leader failure.

This commit makes all pods in one cluster sync their HA loops again, based on updates of the leader object. 
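The decision rule amounts to roughly the following (a condensed sketch of `ObjectCache._process_event()`; a plain `threading.Event` stands in for the DCS event object, and the path names are hypothetical):

    import threading

    def process_event(name, old_value, new_value, my_name,
                      leader_path, config_path, event):
        # any real change of a watched object wakes the HA loop; the config
        # object is special-cased so that plain create/delete does not
        value_changed = old_value != new_value and \
            (name != config_path or old_value is not None and new_value is not None)
        # in addition, every leader-object update seen by a non-leader pod
        # wakes the loop, keeping the HA loops of all replicas in lockstep
        if value_changed or name == leader_path and my_name != new_value:
            event.set()

    event = threading.Event()
    process_event('/leader', 'pod-0', 'pod-0', 'pod-1', '/leader', '/config', event)
    assert event.is_set()  # an unchanged leader key still syncs a replica's loop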
--- patroni/dcs/kubernetes.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/patroni/dcs/kubernetes.py b/patroni/dcs/kubernetes.py index 15f4463d..3325bed0 100644 --- a/patroni/dcs/kubernetes.py +++ b/patroni/dcs/kubernetes.py @@ -98,7 +98,7 @@ def catch_kubernetes_errors(func): class ObjectCache(Thread): - def __init__(self, dcs, func, retry, condition): + def __init__(self, dcs, func, retry, condition, name=None): Thread.__init__(self) self.daemon = True self._api_client = k8s_client.ApiClient() @@ -106,6 +106,7 @@ class ObjectCache(Thread): self._func = func self._retry = retry self._condition = condition + self._name = name # name of this pod self._is_ready = False self._object_cache = {} self._object_cache_lock = Lock() @@ -180,9 +181,14 @@ class ObjectCache(Thread): if old_value: old_value = (old_value.metadata.annotations or {}).get(self._annotations_map.get(name)) - if old_value != new_value and \ - (name != self._dcs.config_path or old_value is not None and new_value is not None): + value_changed = old_value != new_value and \ + (name != self._dcs.config_path or old_value is not None and new_value is not None) + + if value_changed: logger.debug('%s changed from %s to %s', name, old_value, new_value) + + # Do not wake up HA loop if we run as leader and received leader object update event + if value_changed or name == self._dcs.leader_path and self._name != new_value: self._dcs.event.set() finally: with self._condition: @@ -251,7 +257,7 @@ class Kubernetes(AbstractDCS): kinds_func = functools.partial(self._api.list_namespaced_kind, self._namespace, label_selector=self._label_selector) - self._kinds = ObjectCache(self, kinds_func, self._retry, self._condition) + self._kinds = ObjectCache(self, kinds_func, self._retry, self._condition, self._name) def retry(self, *args, **kwargs): return self._retry.copy()(*args, **kwargs) @@ -582,6 +588,6 @@ class Kubernetes(AbstractDCS): return True try: - return super(Kubernetes, self).watch(None, timeout) + return super(Kubernetes, self).watch(None, timeout + 0.5) finally: self.event.clear() From 6a0d2924a00b45de840347706084a98176f789d3 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Wed, 27 May 2020 13:33:37 +0200 Subject: [PATCH 20/40] Separate received and replayed location (#1514) When making a decision whether the running replica is able to stream from the new primary or must be rewound we should use replayed location, therefore we extract received and replayed independently. Reuse the part of the query that extracts the timeline and locations in the REST API. 
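The reporting rule this introduces is simple (mirroring the `_wal_position()` helper in the diff below; the numbers in the usage lines are made up):

    def wal_position(is_leader, write_lsn, received_lsn, replayed_lsn):
        # the primary reports its current write position; a replica reports
        # the furthest of received and replayed, since WAL may be received
        # but not yet applied, or replayed from archive with no walreceiver
        return write_lsn if is_leader else max(received_lsn or 0, replayed_lsn or 0)

    assert wal_position(True, 200, None, None) == 200
    assert wal_position(False, None, 150, 120) == 150  # streaming, apply lagging
    assert wal_position(False, None, None, 120) == 120  # e.g. archive recovery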
--- patroni/api.py | 37 +++++++++------------- patroni/postgresql/__init__.py | 56 +++++++++++++++++++++++----------- tests/__init__.py | 2 +- tests/test_api.py | 2 ++ 4 files changed, 55 insertions(+), 42 deletions(-) diff --git a/patroni/api.py b/patroni/api.py index bd331d2e..67dc011e 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -418,64 +418,55 @@ class RestApiHandler(BaseHTTPRequestHandler): return retry(self.server.query, sql, *params) def get_postgresql_status(self, retry=False): + postgresql = self.server.patroni.postgresql try: cluster = self.server.patroni.dcs.cluster - if self.server.patroni.postgresql.state not in ('running', 'restarting', 'starting'): + if postgresql.state not in ('running', 'restarting', 'starting'): raise RetryFailedError('') - stmt = ("SELECT pg_catalog.to_char(pg_catalog.pg_postmaster_start_time(), 'YYYY-MM-DD HH24:MI:SS.MS TZ')," - " CASE WHEN pg_catalog.pg_is_in_recovery() THEN 0" - " ELSE ('x' || pg_catalog.substr(pg_catalog.pg_{0}file_name(" - "pg_catalog.pg_current_{0}_{1}()), 1, 8))::bit(32)::int END," - " CASE WHEN pg_catalog.pg_is_in_recovery() THEN 0" - " ELSE pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_current_{0}_{1}(), '0/0')::bigint END," - " pg_catalog.pg_{0}_{1}_diff(COALESCE(pg_catalog.pg_last_{0}_receive_{1}()," - " pg_catalog.pg_last_{0}_replay_{1}()), '0/0')::bigint," - " pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_last_{0}_replay_{1}(), '0/0')::bigint," + stmt = ("SELECT " + postgresql.POSTMASTER_START_TIME + ", " + postgresql.TL_LSN + "," " pg_catalog.to_char(pg_catalog.pg_last_xact_replay_timestamp(), 'YYYY-MM-DD HH24:MI:SS.MS TZ')," - " pg_catalog.pg_is_in_recovery() AND pg_catalog.pg_is_{0}_replay_paused(), " " pg_catalog.array_to_json(pg_catalog.array_agg(pg_catalog.row_to_json(ri))) " "FROM (SELECT (SELECT rolname FROM pg_authid WHERE oid = usesysid) AS usename," " application_name, client_addr, w.state, sync_state, sync_priority" " FROM pg_catalog.pg_stat_get_wal_senders() w, pg_catalog.pg_stat_get_activity(pid)) AS ri") - row = self.query(stmt.format(self.server.patroni.postgresql.wal_name, - self.server.patroni.postgresql.lsn_name), retry=retry)[0] + row = self.query(stmt.format(postgresql.wal_name, postgresql.lsn_name), retry=retry)[0] result = { - 'state': self.server.patroni.postgresql.state, + 'state': postgresql.state, 'postmaster_start_time': row[0], 'role': 'replica' if row[1] == 0 else 'master', - 'server_version': self.server.patroni.postgresql.server_version, + 'server_version': postgresql.server_version, 'cluster_unlocked': bool(not cluster or cluster.is_unlocked()), 'xlog': ({ - 'received_location': row[3], - 'replayed_location': row[4], - 'replayed_timestamp': row[5], - 'paused': row[6]} if row[1] == 0 else { + 'received_location': row[4] or row[3], + 'replayed_location': row[3], + 'replayed_timestamp': row[6], + 'paused': row[5]} if row[1] == 0 else { 'location': row[2] }) } if result['role'] == 'replica' and self.server.patroni.ha.is_standby_cluster(): - result['role'] = self.server.patroni.postgresql.role + result['role'] = postgresql.role if row[1] > 0: result['timeline'] = row[1] else: leader_timeline = None if not cluster or cluster.is_unlocked() else cluster.leader.timeline - result['timeline'] = self.server.patroni.postgresql.replica_cached_timeline(leader_timeline) + result['timeline'] = postgresql.replica_cached_timeline(leader_timeline) if row[7]: result['replication'] = row[7] return result except (psycopg2.Error, RetryFailedError, PostgresConnectionException): - state = 
self.server.patroni.postgresql.state + state = postgresql.state if state == 'running': logger.exception('get_postgresql_status') state = 'unknown' - return {'state': state, 'role': self.server.patroni.postgresql.role} + return {'state': state, 'role': postgresql.role} def log_message(self, fmt, *args): logger.debug("API thread: %s - - [%s] %s", self.client_address[0], self.log_date_time_string(), fmt % args) diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index ff3efa24..760d7364 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -46,6 +46,16 @@ def null_context(): class Postgresql(object): + POSTMASTER_START_TIME = "pg_catalog.to_char(pg_catalog.pg_postmaster_start_time(), 'YYYY-MM-DD HH24:MI:SS.MS TZ')" + TL_LSN = ("CASE WHEN pg_catalog.pg_is_in_recovery() THEN 0 " + "ELSE ('x' || pg_catalog.substr(pg_catalog.pg_{0}file_name(" + "pg_catalog.pg_current_{0}_{1}()), 1, 8))::bit(32)::int END, " # master timeline + "CASE WHEN pg_catalog.pg_is_in_recovery() THEN 0 " + "ELSE pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_current_{0}_{1}(), '0/0')::bigint END, " # write_lsn + "pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_last_{0}_replay_{1}(), '0/0')::bigint, " + "pg_catalog.pg_{0}_{1}_diff(COALESCE(pg_catalog.pg_last_{0}_receive_{1}(), '0/0'), '0/0')::bigint, " + "pg_catalog.pg_is_in_recovery() AND pg_catalog.pg_is_{0}_replay_paused()") + def __init__(self, config): self.name = config['name'] self.scope = config['scope'] @@ -145,15 +155,7 @@ class Postgresql(object): else: extra = "0, NULL, NULL, NULL" - return ("SELECT CASE WHEN pg_catalog.pg_is_in_recovery() THEN 0 " - "ELSE ('x' || pg_catalog.substr(pg_catalog.pg_{0}file_name(" - "pg_catalog.pg_current_{0}_{1}()), 1, 8))::bit(32)::int END, " - "CASE WHEN pg_catalog.pg_is_in_recovery() THEN GREATEST(" - " pg_catalog.pg_{0}_{1}_diff(COALESCE(" - "pg_catalog.pg_last_{0}_receive_{1}(), '0/0'), '0/0')::bigint," - " pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_last_{0}_replay_{1}(), '0/0')::bigint) " - "ELSE pg_catalog.pg_{0}_{1}_diff(pg_catalog.pg_current_{0}_{1}(), '0/0')::bigint " - "END, {2}").format(self.wal_name, self.lsn_name, extra) + return ("SELECT " + self.TL_LSN + ", {2}").format(self.wal_name, self.lsn_name, extra) def _version_file_exists(self): return not self.data_directory_empty() and os.path.isfile(self._version_file) @@ -298,7 +300,8 @@ class Postgresql(object): if not self._cluster_info_state: try: result = self._is_leader_retry(self._query, self.cluster_info_query).fetchone() - self._cluster_info_state = dict(zip(['timeline', 'wal_position', 'pg_control_timeline', + self._cluster_info_state = dict(zip(['timeline', 'wal_position', 'replayed_location', + 'received_location', 'replay_paused', 'pg_control_timeline', 'received_tli', 'slot_name', 'conninfo'], result)) except RetryFailedError as e: # SELECT failed two times self._cluster_info_state = {'error': str(e)} @@ -310,6 +313,12 @@ class Postgresql(object): return self._cluster_info_state.get(name) + def replayed_location(self): + return self._cluster_info_state_get('replayed_location') + + def received_location(self): + return self._cluster_info_state_get('received_location') + def primary_slot_name(self): return self._cluster_info_state_get('slot_name') @@ -797,21 +806,31 @@ class Postgresql(object): ret = self._wait_promote(wait_seconds) return ret + @staticmethod + def _wal_position(is_leader, wal_position, received_location, replayed_location): + return wal_position if is_leader else max(received_location or 0, 
replayed_location or 0) + def timeline_wal_position(self): # This method could be called from different threads (simultaneously with some other `_query` calls). # If it is called not from main thread we will create a new cursor to execute statement. if current_thread().ident == self.__thread_ident: - return (self._cluster_info_state_get('timeline'), - self._cluster_info_state_get('wal_position'), - self._cluster_info_state_get('pg_control_timeline')) + timeline = self._cluster_info_state_get('timeline') + wal_position = self._cluster_info_state_get('wal_position') + replayed_location = self.replayed_location() + received_location = self.received_location() + pg_control_timeline = self._cluster_info_state_get('pg_control_timeline') + else: + with self.connection().cursor() as cursor: + cursor.execute(self.cluster_info_query) + (timeline, wal_position, replayed_location, + received_location, _, pg_control_timeline) = cursor.fetchone()[:6] - with self.connection().cursor() as cursor: - cursor.execute(self.cluster_info_query) - return cursor.fetchone()[:3] + wal_position = self._wal_position(timeline, wal_position, received_location, replayed_location) + return (timeline, wal_position, pg_control_timeline) def postmaster_start_time(self): try: - query = "SELECT pg_catalog.to_char(pg_catalog.pg_postmaster_start_time(), 'YYYY-MM-DD HH24:MI:SS.MS TZ')" + query = "SELECT " + self.POSTMASTER_START_TIME if current_thread().ident == self.__thread_ident: return self.query(query).fetchone()[0] with self.connection().cursor() as cursor: @@ -821,7 +840,8 @@ class Postgresql(object): return None def last_operation(self): - return str(self._cluster_info_state_get('wal_position')) + return str(self._wal_position(self.is_leader(), self._cluster_info_state_get('wal_position'), + self.received_location(), self.replayed_location())) def configure_server_parameters(self): self._major_version = self.get_major_version() diff --git a/tests/__init__.py b/tests/__init__.py index ab67116b..3a86c5b0 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -88,7 +88,7 @@ class MockCursor(object): elif sql.startswith('SELECT slot_name'): self.results = [('blabla', 'physical'), ('foobar', 'physical'), ('ls', 'logical', 'a', 'b')] elif sql.startswith('SELECT CASE WHEN pg_catalog.pg_is_in_recovery()'): - self.results = [(1, 2, 1, 1, None, None)] + self.results = [(1, 2, 1, 0, False, 1, 1, None, None)] elif sql.startswith('SELECT pg_catalog.pg_is_in_recovery()'): self.results = [(False, 2)] elif sql.startswith('SELECT pg_catalog.to_char'): diff --git a/tests/test_api.py b/tests/test_api.py index 803b15ed..748d3372 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -30,6 +30,8 @@ class MockPostgresql(object): pending_restart = True wal_name = 'wal' lsn_name = 'lsn' + POSTMASTER_START_TIME = 'pg_catalog.to_char(pg_catalog.pg_postmaster_start_time' + TL_LSN = 'CASE WHEN pg_catalog.pg_is_in_recovery()' @staticmethod def connection(): From 798c46bc0376f9b401bc25e45ef3be9745cc58f8 Mon Sep 17 00:00:00 2001 From: Mateusz Kowalski Date: Fri, 29 May 2020 14:13:33 +0200 Subject: [PATCH 21/40] Handle IPv6 addresses in split_host_port (#1533) This PR makes split_host_port return IPv6 address without enclosing brackets. This is due to the fact that e.g. socket.* functions expect host not to contain them when being called with IPv6. 
Close: #1532
---
patroni/utils.py | 2 ++
1 file changed, 2 insertions(+)

diff --git a/patroni/utils.py b/patroni/utils.py
index 7886bd6c..94b98524 100644
--- a/patroni/utils.py
+++ b/patroni/utils.py
@@ -357,6 +357,8 @@ def polling_loop(timeout, interval=1):
def split_host_port(value, default_port):
t = value.rsplit(':', 1)
+ if ':' in t[0]:
+ t[0] = t[0].strip('[]')
t.append(default_port)
return t[0], int(t[1])

From c6207933d1ef88a7bf3e66bf8a3b2da165c7060e Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin
Date: Fri, 29 May 2020 14:14:11 +0200
Subject: [PATCH 22/40] Properly handle the exception raised from refresh_session (#1531)

`touch_member()` could be called from the finally block of `_run_cycle()`. If it raised an exception, the whole Patroni process crashed. In order to avoid future crashes we wrap `_run_cycle()` into a try..except block and ask the user to report it as a BUG.

Close https://github.com/zalando/patroni/issues/1529
---
patroni/dcs/consul.py | 6 +++++-
patroni/ha.py | 8 ++++++--
tests/test_consul.py | 2 ++
tests/test_ha.py | 5 +++++
4 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/patroni/dcs/consul.py b/patroni/dcs/consul.py
index c22d0f8e..ed4e86ac 100644
--- a/patroni/dcs/consul.py
+++ b/patroni/dcs/consul.py
@@ -367,7 +367,11 @@ class Consul(AbstractDCS):
def touch_member(self, data, permanent=False):
cluster = self.cluster
member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
- create_member = not permanent and self.refresh_session()
+
+ try:
+ create_member = not permanent and self.refresh_session()
+ except DCSError:
+ return False
if member and (create_member or member.session != self._session):
self._client.kv.delete(self.member_path)
diff --git a/patroni/ha.py b/patroni/ha.py
index 953c9636..7c0490b4 100644
--- a/patroni/ha.py
+++ b/patroni/ha.py
@@ -1370,8 +1370,12 @@ class Ha(object):
def run_cycle(self):
with self._async_executor:
- info = self._run_cycle()
- return (self.is_paused() and 'PAUSE: ' or '') + info
+ try:
+ info = self._run_cycle()
+ return (self.is_paused() and 'PAUSE: ' or '') + info
+ except Exception:
+ logger.exception('Unexpected exception')
+ return 'Unexpected exception raised, please report it as a BUG'
def shutdown(self):
if self.is_paused():
diff --git a/tests/test_consul.py b/tests/test_consul.py
index 785e80f3..6d0c4732 100644
--- a/tests/test_consul.py
+++ b/tests/test_consul.py
@@ -122,6 +122,8 @@ class TestConsul(unittest.TestCase):
self.c.refresh_session = Mock(return_value=True)
for _ in range(0, 4):
self.c.touch_member({'balbla': 'blabla'})
+ self.c.refresh_session = Mock(side_effect=ConsulError('foo'))
+ self.assertFalse(self.c.touch_member({'balbla': 'blabla'}))
@patch.object(consul.Consul.KV, 'put', Mock(side_effect=InvalidSession))
def test_take_leader(self):
diff --git a/tests/test_ha.py b/tests/test_ha.py
index a35641f3..178ca1ef 100644
--- a/tests/test_ha.py
+++ b/tests/test_ha.py
@@ -1073,3 +1073,8 @@ class TestHa(PostgresInit):
self.ha.cluster = get_cluster_initialized_without_leader(leader=True, cluster_config=config)
self.ha.has_lock = true
self.assertEqual(self.ha.run_cycle(), 'no action. 
i am the leader with the lock') + + @patch.object(Cluster, 'has_member', true) + def test_run_cycle(self): + self.ha.dcs.touch_member = Mock(side_effect=DCSError('foo')) + self.assertEqual(self.ha.run_cycle(), 'Unexpected exception raised, please report it as a BUG') From 98c2081c676c535a4c1be72b2cbf5666425d7433 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 29 May 2020 14:14:47 +0200 Subject: [PATCH 23/40] Detect a new timeline in the standby cluster (#1522) The standby cluster doesn't know about leader elections in the main cluster and therefore the usual mechanisms of detecting divergences don't work. For example, it could happen that the standby cluster is ahead of the new primary of the main cluster and must be rewound. There is a way to know that the new timeline has been created by checking the presence of a history file in pg_wal. If the new file is there, we will start usual procedures of making sure that we can continue streaming or will run the pg_rewind. --- patroni/ha.py | 4 ++++ patroni/postgresql/__init__.py | 26 ++++++++++++++------------ tests/__init__.py | 6 ------ tests/test_ha.py | 12 ++++++++++-- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index 7c0490b4..dec4bdf4 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -400,6 +400,10 @@ class Ha(object): self.demote('immediate-nolock') return demote_reason + if self.is_standby_cluster() and self._leader_timeline and \ + self.state_handler.get_history(self._leader_timeline + 1): + self._rewind.trigger_check_diverged_lsn() + msg = self._handle_rewind_or_reinitialize() if msg: return msg diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 760d7364..4d196911 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -8,10 +8,12 @@ import time from contextlib import contextmanager from copy import deepcopy +from dateutil import tz +from datetime import datetime from patroni.postgresql.callback_executor import CallbackExecutor from patroni.postgresql.bootstrap import Bootstrap from patroni.postgresql.cancellable import CancellableSubprocess -from patroni.postgresql.config import ConfigHandler +from patroni.postgresql.config import ConfigHandler, mtime from patroni.postgresql.connection import Connection, get_connection_cursor from patroni.postgresql.misc import parse_history, parse_lsn, postgres_major_version_to_int from patroni.postgresql.postmaster import PostmasterProcess @@ -746,19 +748,19 @@ class Postgresql(object): return self._cluster_info_state_get('timeline') def get_history(self, timeline): - history_path = 'pg_{0}/{1:08X}.history'.format(self.wal_name, timeline) - try: - cursor = self._connection.cursor() - cursor.execute('SELECT isdir, modification FROM pg_catalog.pg_stat_file(%s)', (history_path,)) - isdir, modification = cursor.fetchone() - if not isdir: - cursor.execute('SELECT pg_catalog.pg_read_file(%s)', (history_path,)) - history = list(parse_history(cursor.fetchone()[0])) + history_path = os.path.join(self._data_dir, 'pg_' + self.wal_name, '{0:08X}.history'.format(timeline)) + history_mtime = mtime(history_path) + if history_mtime: + try: + with open(history_path, 'r') as f: + history = f.read() + history = list(parse_history(history)) if history[-1][0] == timeline - 1: - history[-1].append(modification.isoformat()) + history_mtime = datetime.fromtimestamp(history_mtime).replace(tzinfo=tz.tzlocal()) + history[-1].append(history_mtime.isoformat()) return history - except Exception: - 
logger.exception('Failed to read and parse %s', (history_path,)) + except Exception: + logger.exception('Failed to read and parse %s', (history_path,)) def follow(self, member, role='replica', timeout=None, do_reload=False): recovery_params = self.config.build_recovery_params(member) diff --git a/tests/__init__.py b/tests/__init__.py index 3a86c5b0..95d72e1e 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,4 +1,3 @@ -import datetime import os import shutil import unittest @@ -107,11 +106,6 @@ class MockCursor(object): ('unix_socket_directories', '/tmp', None, 'string', 'postmaster')] elif sql.startswith('IDENTIFY_SYSTEM'): self.results = [('1', 3, '0/402EEC0', '')] - elif sql.startswith('SELECT isdir, modification'): - self.results = [(False, datetime.datetime.now())] - elif sql.startswith('SELECT pg_catalog.pg_read_file'): - self.results = [('1\t0/40159C0\tno recovery target specified\n\n' - '2\t1/40159C0\tno recovery target specified\n',)] elif sql.startswith('TIMELINE_HISTORY '): self.results = [('', b'x\t0/40159C0\tno recovery target specified\n\n' b'1\t0/40159C0\tno recovery target specified\n\n' diff --git a/tests/test_ha.py b/tests/test_ha.py index 178ca1ef..9dfa438d 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -3,7 +3,7 @@ import etcd import os import sys -from mock import Mock, MagicMock, PropertyMock, patch +from mock import Mock, MagicMock, PropertyMock, patch, mock_open from patroni.config import Config from patroni.dcs import Cluster, ClusterConfig, Failover, Leader, Member, get_dcs, SyncState, TimelineHistory from patroni.dcs.etcd import Client @@ -17,6 +17,7 @@ from patroni.postgresql.rewind import Rewind from patroni.postgresql.slots import SlotsHandler from patroni.utils import tzutc from patroni.watchdog import Watchdog +from six.moves import builtins from . import PostgresInit, MockPostmaster, psycopg2_connect, requests_get from .test_etcd import socket_getaddrinfo, etcd_read, etcd_write @@ -694,11 +695,14 @@ class TestHa(PostgresInit): self.ha.cluster = get_cluster_initialized_with_leader(Failover(0, '', self.p.name, None)) self.assertEqual(self.ha.run_cycle(), 'PAUSE: waiting to become master after promote...') + @patch('patroni.postgresql.mtime', Mock(return_value=1588316884)) + @patch.object(builtins, 'open', mock_open(read_data='1\t0/40159C0\tno recovery target specified\n')) def test_process_healthy_standby_cluster_as_standby_leader(self): self.p.is_leader = false self.p.name = 'leader' self.ha.cluster = get_standby_cluster_initialized_with_only_leader() self.p.config.check_recovery_conf = Mock(return_value=(False, False)) + self.ha._leader_timeline = 1 self.assertEqual(self.ha.run_cycle(), 'promoted self to a standby leader because i had the session lock') self.assertEqual(self.ha.run_cycle(), 'no action. 
i am the standby leader with the lock') @@ -1011,6 +1015,8 @@ class TestHa(PostgresInit): self.ha._disable_sync = False self.assertEqual(self.ha.get_effective_tags(), {'foo': 'bar'}) + @patch('patroni.postgresql.mtime', Mock(return_value=1588316884)) + @patch.object(builtins, 'open', Mock(side_effect=Exception)) def test_restore_cluster_config(self): self.ha.cluster.config.data.clear() self.ha.has_lock = true @@ -1042,7 +1048,9 @@ class TestHa(PostgresInit): # will not say bootstrap from leader as replica can't self elect self.assertEqual(self.ha.run_cycle(), "trying to bootstrap from replica 'other'") - @patch('psycopg2.connect', psycopg2_connect) + @patch('patroni.postgresql.mtime', Mock(return_value=1588316884)) + @patch.object(builtins, 'open', mock_open(read_data=('1\t0/40159C0\tno recovery target specified\n\n' + '2\t1/40159C0\tno recovery target specified\n'))) def test_update_cluster_history(self): self.ha.has_lock = true self.ha.cluster.is_unlocked = false From cd1b2741fab92469312420f92aaca9a50c3353af Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 29 May 2020 14:15:10 +0200 Subject: [PATCH 24/40] Improve timeline divergence check (#1563) We don't need to rewind when: 1. the replayed location of the former replica is not ahead of the switchpoint; 2. the end of the checkpoint record of the former primary is the same as the switchpoint. In order to get the end of the checkpoint record we use `pg_waldump` and parse its output. Close https://github.com/zalando/patroni/issues/1493 --- patroni/ha.py | 1 + patroni/postgresql/__init__.py | 9 +--- patroni/postgresql/rewind.py | 82 +++++++++++++++++++++++++++++----- tests/test_ha.py | 2 +- tests/test_rewind.py | 31 ++++++++++--- 5 files changed, 100 insertions(+), 25 deletions(-) diff --git a/patroni/ha.py b/patroni/ha.py index dec4bdf4..8afe7aa2 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -421,6 +421,7 @@ class Ha(object): self.state_handler.follow, args=(node_to_follow, role)) else: self.state_handler.follow(node_to_follow, role, do_reload=True) + self._rewind.trigger_check_diverged_lsn() elif role == 'standby_leader' and self.state_handler.role != role: self.state_handler.set_role(role) self.state_handler.call_nowait(ACTION_ON_ROLE_CHANGE) diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index 4d196911..ce69c7fa 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -726,18 +726,13 @@ class Postgresql(object): with get_connection_cursor(**conn_kwargs) as cur: yield cur - def get_local_timeline_lsn_from_replication_connection(self): - timeline = lsn = None + def get_replica_timeline(self): try: with self.get_replication_connection_cursor(**self.config.local_replication_address) as cur: cur.execute('IDENTIFY_SYSTEM') - timeline, lsn = cur.fetchone()[1:3] + return cur.fetchone()[1] except Exception: logger.exception('Can not fetch local timeline and lsn from replication connection') - return timeline, lsn - - def get_replica_timeline(self): - return self.get_local_timeline_lsn_from_replication_connection()[0] def replica_cached_timeline(self, master_timeline): if not self._cached_replica_timeline or not master_timeline or self._cached_replica_timeline != master_timeline: diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index 06a0e49c..b90cf038 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -1,5 +1,6 @@ import logging import os +import six import subprocess from threading import Lock, Thread @@ -69,29 +70,81 @@ class 
Rewind(object): except Exception: return logger.exception('Exception when working with leader') + def _get_checkpoint_end(self, timeline, lsn): + """The checkpoint record size in WAL depends on postgres major version and platform (memory alignment). + Hence, the only reliable way to figure out where it ends is to read the record from the file with the help of pg_waldump + and parse the output. We are trying to read two records, and expect that it will fail to read the second one: + `pg_waldump: fatal: error in WAL record at 0/182E220: invalid record length at 0/182E298: wanted 24, got 0` + The error message contains information about the LSN of the next record, which is exactly where the checkpoint ends.""" + + cmd = self._postgresql.pgcommand('pg_{0}dump'.format(self._postgresql.wal_name)) + lsn8 = format_lsn(lsn, True) + lsn = format_lsn(lsn) + env = os.environ.copy() + env.update(LANG='C', LC_ALL='C', PGDATA=self._postgresql.data_dir) + try: + waldump = subprocess.Popen([cmd, '-t', str(timeline), '-s', lsn, '-n', '2'], + stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) + out, err = waldump.communicate() + waldump.wait() + except Exception as e: + logger.error('Failed to execute `%s -t %s -s %s -n 2`: %r', cmd, timeline, lsn, e) + else: + out = out.decode('utf-8').rstrip().split('\n') + err = err.decode('utf-8').rstrip().split('\n') + pattern = 'error in WAL record at {0}: invalid record length at '.format(lsn) + + if len(out) == 1 and len(err) == 1 and ', lsn: {0}, prev '.format(lsn8) in out[0] and pattern in err[0]: + i = err[0].find(pattern) + len(pattern) + j = err[0].find(": wanted ", i) + if j > -1: + try: + return parse_lsn(err[0][i:j]) + except Exception as e: + logger.error('Failed to parse lsn %s: %r', err[0][i:j], e) + logger.error('Failed to parse `%s -t %s -s %s -n 2` output', cmd, timeline, lsn) + logger.error(' stdout=%s', '\n'.join(out)) + logger.error(' stderr=%s', '\n'.join(err)) + + return 0 + def _get_local_timeline_lsn_from_controldata(self): - timeline = lsn = None + in_recovery = timeline = lsn = None data = self._postgresql.controldata() try: if data.get('Database cluster state') == 'shut down in recovery': + in_recovery = True lsn = data.get('Minimum recovery ending location') timeline = int(data.get("Min recovery ending loc's timeline")) if lsn == '0/0' or timeline == 0: # it was a master when it crashed data['Database cluster state'] = 'shut down' if data.get('Database cluster state') == 'shut down': + in_recovery = False lsn = data.get('Latest checkpoint location') timeline = int(data.get("Latest checkpoint's TimeLineID")) except (TypeError, ValueError): logger.exception('Failed to get local timeline and lsn from pg_controldata output') - return timeline, lsn + + if lsn is not None: + try: + lsn = parse_lsn(lsn) + except (IndexError, ValueError) as e: + logger.error('Exception when parsing lsn %s: %r', lsn, e) + lsn = None + + return in_recovery, timeline, lsn def _get_local_timeline_lsn(self): - if self._postgresql.is_running(): # if postgres is running - get timeline and lsn from replication connection - timeline, lsn = self._postgresql.get_local_timeline_lsn_from_replication_connection() + if self._postgresql.is_running(): # if postgres is running - get timeline from replication connection + in_recovery = True + timeline = self._postgresql.received_timeline() or self._postgresql.get_replica_timeline() + lsn = self._postgresql.replayed_location() else: # otherwise analyze pg_controldata output - timeline, lsn = self._get_local_timeline_lsn_from_controldata() - 
logger.info('Local timeline=%s lsn=%s', timeline, lsn) - return timeline, lsn + in_recovery, timeline, lsn = self._get_local_timeline_lsn_from_controldata() + + log_lsn = format_lsn(lsn) if isinstance(lsn, six.integer_types) else lsn + logger.info('Local timeline=%s lsn=%s', timeline, log_lsn) + return in_recovery, timeline, lsn @staticmethod def _log_master_history(history, i): @@ -112,7 +165,7 @@ class Rewind(object): logger.info('master: history=%s', '\n'.join(history_show)) def _check_timeline_and_lsn(self, leader): - local_timeline, local_lsn = self._get_local_timeline_lsn() + in_recovery, local_timeline, local_lsn = self._get_local_timeline_lsn() if local_timeline is None or local_lsn is None: return @@ -144,10 +197,15 @@ class Rewind(object): history = list(parse_history(history)) for i, (parent_timeline, switchpoint, _) in enumerate(history): if parent_timeline == local_timeline: - try: - need_rewind = parse_lsn(local_lsn) >= switchpoint - except (IndexError, ValueError): - logger.exception('Exception when parsing lsn') + # We don't need to rewind when: + # 1. for replica: replayed location is not ahead of switchpoint + # 2. for the former primary: end of checkpoint record is the same as switchpoint + if in_recovery: + need_rewind = local_lsn > switchpoint + elif local_lsn >= switchpoint: + need_rewind = True + else: + need_rewind = switchpoint != self._get_checkpoint_end(local_timeline, local_lsn) break elif parent_timeline > local_timeline: break diff --git a/tests/test_ha.py b/tests/test_ha.py index 9dfa438d..c99152eb 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -167,7 +167,7 @@ def run_async(self, func, args=()): @patch.object(Postgresql, 'query', Mock()) @patch.object(Postgresql, 'checkpoint', Mock()) @patch.object(CancellableSubprocess, 'call', Mock(return_value=0)) -@patch.object(Postgresql, 'get_local_timeline_lsn_from_replication_connection', Mock(return_value=[2, 10])) +@patch.object(Postgresql, 'get_replica_timeline', Mock(return_value=2)) @patch.object(Postgresql, 'get_master_timeline', Mock(return_value=2)) @patch.object(ConfigHandler, 'restore_configuration_files', Mock()) @patch.object(etcd.Client, 'write', etcd_write) diff --git a/tests/test_rewind.py b/tests/test_rewind.py index 343de8b1..0f5c2805 100644 --- a/tests/test_rewind.py +++ b/tests/test_rewind.py @@ -53,11 +53,12 @@ class TestRewind(BaseTestPostgresql): with patch.object(Postgresql, 'controldata', Mock(return_value={'Database cluster state': 'shut down in recovery', 'Minimum recovery ending location': '0/0', - "Min recovery ending loc's timeline": '0'})): + "Min recovery ending loc's timeline": '0', + 'Latest checkpoint location': '0/'})): self.r.rewind_or_reinitialize_needed_and_possible(self.leader) with patch.object(Postgresql, 'is_running', Mock(return_value=True)): - with patch.object(MockCursor, 'fetchone', Mock(side_effect=[(False, ), Exception])): + with patch.object(MockCursor, 'fetchone', Mock(side_effect=[(0, 0, 1, 1,), Exception])): self.r.rewind_or_reinitialize_needed_and_possible(self.leader) @patch.object(CancellableSubprocess, 'call', Mock(return_value=0)) @@ -95,9 +96,10 @@ class TestRewind(BaseTestPostgresql): @patch.object(Postgresql, 'start', Mock()) @patch.object(Rewind, 'can_rewind', PropertyMock(return_value=True)) - @patch.object(Rewind, '_get_local_timeline_lsn', Mock(return_value=(2, '40159C1'))) + @patch.object(Rewind, '_get_local_timeline_lsn') @patch.object(Rewind, 'check_leader_is_not_in_recovery') - def test__check_timeline_and_lsn(self, 
mock_check_leader_is_not_in_recovery): + def test__check_timeline_and_lsn(self, mock_check_leader_is_not_in_recovery, mock_get_local_timeline_lsn): + mock_get_local_timeline_lsn.return_value = (True, 2, 67197377) mock_check_leader_is_not_in_recovery.return_value = False self.r.trigger_check_diverged_lsn() self.assertFalse(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) @@ -113,11 +115,30 @@ class TestRewind(BaseTestPostgresql): self.assertFalse(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) self.r.trigger_check_diverged_lsn() with patch.object(MockCursor, 'fetchone', Mock(return_value=('', 1, '0/0'))): - with patch.object(Rewind, '_get_local_timeline_lsn', Mock(return_value=(1, '0/0'))): + with patch.object(Rewind, '_get_local_timeline_lsn', Mock(return_value=(True, 1, '0/0'))): self.assertFalse(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) self.r.trigger_check_diverged_lsn() self.assertTrue(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) + self.r.reset_state() + self.r.trigger_check_diverged_lsn() + mock_get_local_timeline_lsn.return_value = (False, 2, 67296664) + self.assertTrue(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) + + with patch('subprocess.Popen') as mock_popen: + mock_popen.return_value.communicate.return_value = ( + b'0, lsn: 0/040159C1, prev 0/\n', + b'pg_waldump: fatal: error in WAL record at 0/40159C1: invalid record length at /: wanted 24, got 0\n' + ) + self.r.reset_state() + self.r.trigger_check_diverged_lsn() + mock_get_local_timeline_lsn.return_value = (False, 2, 67197377) + self.assertTrue(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) + self.r.reset_state() + self.r.trigger_check_diverged_lsn() + mock_popen.side_effect = Exception + self.assertTrue(self.r.rewind_or_reinitialize_needed_and_possible(self.leader)) + @patch.object(MockCursor, 'fetchone', Mock(side_effect=[(True,), Exception])) def test_check_leader_is_not_in_recovery(self): self.r.check_leader_is_not_in_recovery() From 6e4ca1717c42a9bd1f135d8e5b7e582106bfd4b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=A1=D0=B5=D1=80=D0=B3=D0=B5=D0=B9=20=D0=91=D1=83=D1=80?= =?UTF-8?q?=D0=BB=D0=B0=D0=B4=D1=8F=D0=BD?= Date: Tue, 2 Jun 2020 09:55:49 +0300 Subject: [PATCH 25/40] Correct CRLF after HTTP headers in OPTIONS request (#1570) Close #1569 --- patroni/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patroni/api.py b/patroni/api.py index 67dc011e..d3e60e2f 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -118,7 +118,7 @@ class RestApiHandler(BaseHTTPRequestHandler): if write_status_code_only: # when haproxy sends OPTIONS request it reads only status code and nothing more message = self.responses[status_code][0] - self.wfile.write('{0} {1} {2}\r\n'.format(self.protocol_version, status_code, message).encode('utf-8')) + self.wfile.write('{0} {1} {2}\r\n\r\n'.format(self.protocol_version, status_code, message).encode('utf-8')) else: self._write_status_response(status_code, response) From 76cfcf3ae8aad0726f60819d35dfa767b6d5bb9a Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Wed, 3 Jun 2020 09:54:04 +0200 Subject: [PATCH 26/40] Don't rely on deprecated flake8 setuptools entrypoint (#1557) Define and use own command class for that --- setup.py | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac280920..45a4b034 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,55 @@ CONSOLE_SCRIPTS = 
['patroni = patroni:main', "patroni_aws = patroni.scripts.aws:main"] +class Flake8(Command): + + user_options = [] + + def initialize_options(self): + from flake8.main import application + + self.flake8 = application.Application() + self.flake8.initialize([]) + + def finalize_options(self): + pass + + def package_files(self): + seen_package_directories = () + directories = self.distribution.package_dir or {} + empty_directory_exists = "" in directories + packages = self.distribution.packages or [] + for package in packages: + if package in directories: + package_directory = directories[package] + elif empty_directory_exists: + package_directory = os.path.join(directories[""], package) + else: + package_directory = package + + if not package_directory.startswith(seen_package_directories): + seen_package_directories += (package_directory + ".",) + yield package_directory + + def targets(self): + return [package for package in self.package_files()] + ['tests', 'setup.py'] + + def run(self): + self.flake8.run_checks(self.targets()) + self.flake8.formatter.start() + self.flake8.report_errors() + self.flake8.report_statistics() + self.flake8.report_benchmarks() + self.flake8.formatter.stop() + try: + self.flake8.exit() + except SystemExit as e: + # Cause system exit only if exit code is not zero (terminates + # other possibly remaining/pending setuptools commands). + if e.code: + raise + + class PyTest(Command): user_options = [('cov=', None, 'Run coverage'), ('cov-xml=', None, 'Generate junit xml report'), @@ -106,7 +155,7 @@ def read(fname): def setup_package(version): # Assemble additional setup commands - cmdclass = {'test': PyTest} + cmdclass = {'test': PyTest, 'flake8': Flake8} install_requires = [] From 6406b39b77269252fc0b0149ca4472d0bab0f06c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Posp=C3=AD=C5=A1ek?= Date: Wed, 3 Jun 2020 09:55:21 +0200 Subject: [PATCH 27/40] add config section keys, improve verify_client documentation (#1549) --- docs/ENVIRONMENT.rst | 2 +- docs/SETTINGS.rst | 171 ++++++++++++++++++++++--------------------- 2 files changed, 87 insertions(+), 86 deletions(-) diff --git a/docs/ENVIRONMENT.rst b/docs/ENVIRONMENT.rst index 987b8b34..857a5780 100644 --- a/docs/ENVIRONMENT.rst +++ b/docs/ENVIRONMENT.rst @@ -126,7 +126,7 @@ REST API - **PATRONI\_RESTAPI\_CERTFILE**: Specifies the file with the certificate in the PEM format. If the certfile is not specified or is left empty, the API server will work without SSL. - **PATRONI\_RESTAPI\_KEYFILE**: Specifies the file with the secret key in the PEM format. - **PATRONI\_RESTAPI\_CAFILE**: Specifies the file with the CA_BUNDLE with certificates of trusted CAs to use while verifying client certs. -- **PATRONI\_RESTAPI\_VERIFY\_CLIENT**: ``none``, ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. When ``optional`` client certificates are required for all unsafe REST API endpoints. If ``verify_client`` is set to ``optional`` or ``required`` basic-auth is not checked. +- **PATRONI\_RESTAPI\_VERIFY\_CLIENT**: ``none`` (default), ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. When ``optional`` client certificates are required for all unsafe REST API endpoints. If ``verify_client`` is set to ``optional`` or ``required`` basic-auth is not checked. 
When ``required`` is used, then client authentication succeeds, if the certificate signature verification succeeds. For ``optional`` the client cert will only be checked for ``PUT``, ``POST``, ``PATCH``, and ``DELETE`` requests. CTL --- diff --git a/docs/SETTINGS.rst b/docs/SETTINGS.rst index 74a5e2b7..27fe3f79 100644 --- a/docs/SETTINGS.rst +++ b/docs/SETTINGS.rst @@ -63,25 +63,26 @@ Log Bootstrap configuration ----------------------- -- **dcs**: This section will be written into `///config` of the given configuration store after initializing of new cluster. The global dynamic configuration for the cluster. Under the ``bootstrap.dcs`` you can put any of the parameters described in the :ref:`Dynamic Configuration settings ` and after Patroni initialized (bootstrapped) the new cluster, it will write this section into `///config` of the configuration store. All later changes of ``bootstrap.dcs`` will not take any effect! If you want to change them please use either ``patronictl edit-config`` or Patroni :ref:`REST API `. -- **method**: custom script to use for bootstrapping this cluster. - See :ref:`custom bootstrap methods documentation ` for details. - When ``initdb`` is specified revert to the default ``initdb`` command. ``initdb`` is also triggered when no ``method`` - parameter is present in the configuration file. -- **initdb**: List options to be passed on to initdb. - - **- data-checksums**: Must be enabled when pg_rewind is needed on 9.3. - - **- encoding: UTF8**: default encoding for new databases. - - **- locale: UTF8**: default locale for new databases. -- **pg\_hba**: list of lines that you should add to pg\_hba.conf. - - **- host all all 0.0.0.0/0 md5**. - - **- host replication replicator 127.0.0.1/32 md5**: A line like this is required for replication. -- **users**: Some additional users which need to be created after initializing new cluster - - **admin**: the name of user - - **password: zalando**: - - **options**: list of options for CREATE USER statement - - **- createrole** - - **- createdb** -- **post\_bootstrap** or **post\_init**: An additional script that will be executed after initializing the cluster. The script receives a connection string URL (with the cluster superuser as a user name). The PGPASSFILE variable is set to the location of pgpass file. +- **bootstrap**: + - **dcs**: This section will be written into `///config` of the given configuration store after initializing of new cluster. The global dynamic configuration for the cluster. Under the ``bootstrap.dcs`` you can put any of the parameters described in the :ref:`Dynamic Configuration settings ` and after Patroni initialized (bootstrapped) the new cluster, it will write this section into `///config` of the configuration store. All later changes of ``bootstrap.dcs`` will not take any effect! If you want to change them please use either ``patronictl edit-config`` or Patroni :ref:`REST API `. + - **method**: custom script to use for bootstrapping this cluster. + See :ref:`custom bootstrap methods documentation ` for details. + When ``initdb`` is specified revert to the default ``initdb`` command. ``initdb`` is also triggered when no ``method`` + parameter is present in the configuration file. + - **initdb**: List options to be passed on to initdb. + - **- data-checksums**: Must be enabled when pg_rewind is needed on 9.3. + - **- encoding: UTF8**: default encoding for new databases. + - **- locale: UTF8**: default locale for new databases. + - **pg\_hba**: list of lines that you should add to pg\_hba.conf. 
+ - **- host all all 0.0.0.0/0 md5**. + - **- host replication replicator 127.0.0.1/32 md5**: A line like this is required for replication. + - **users**: Some additional users which need to be created after initializing new cluster + - **admin**: the name of user + - **password: zalando**: + - **options**: list of options for CREATE USER statement + - **- createrole** + - **- createdb** + - **post\_bootstrap** or **post\_init**: An additional script that will be executed after initializing the cluster. The script receives a connection string URL (with the cluster superuser as a user name). The PGPASSFILE variable is set to the location of pgpass file. .. _consul_settings: @@ -147,83 +148,83 @@ Kubernetes PostgreSQL ---------- -- **authentication**: - - **superuser**: - - **username**: name for the superuser, set during initialization (initdb) and later used by Patroni to connect to the postgres. - - **password**: password for the superuser, set during initialization (initdb). - - **sslmode**: (optional) maps to the `sslmode `__ connection parameter, which allows a client to specify the type of TLS negotiation mode with the server. For more information on how each mode works, please visit the `PostgreSQL documentation `__. The default mode is ``prefer``. - - **sslkey**: (optional) maps to the `sslkey `__ connection parameter, which specifies the location of the secret key used with the client's certificate. - - **sslcert**: (optional) maps to the `sslcert `__ connection parameter, which specifies the location of the client certificate. - - **sslrootcert**: (optional) maps to the `sslrootcert `__ connection parameter, which specifies the location of a file containing one ore more certificate authorities (CA) certificates that the client will use to verify a server's certificate. - - **sslcrl**: (optional) maps to the `sslcrl `__ connection parameter, which specifies the location of a file containing a certificate revocation list. A client will reject connecting to any server that has a certificate present in this list. - - **replication**: - - **username**: replication username; the user will be created during initialization. Replicas will use this user to access master via streaming replication - - **password**: replication password; the user will be created during initialization. - - **sslmode**: (optional) maps to the `sslmode `__ connection parameter, which allows a client to specify the type of TLS negotiation mode with the server. For more information on how each mode works, please visit the `PostgreSQL documentation `__. The default mode is ``prefer``. - - **sslkey**: (optional) maps to the `sslkey `__ connection parameter, which specifies the location of the secret key used with the client's certificate. - - **sslcert**: (optional) maps to the `sslcert `__ connection parameter, which specifies the location of the client certificate. - - **sslrootcert**: (optional) maps to the `sslrootcert `__ connection parameter, which specifies the location of a file containing one ore more certificate authorities (CA) certificates that the client will use to verify a server's certificate. - - **sslcrl**: (optional) maps to the `sslcrl `__ connection parameter, which specifies the location of a file containing a certificate revocation list. A client will reject connecting to any server that has a certificate present in this list. 
- - **rewind**: - - **username**: name for the user for ``pg_rewind``; the user will be created during initialization of postgres 11+ and all necessary `permissions `__ will be granted. - - **password**: password for the user for ``pg_rewind``; the user will be created during initialization. - - **sslmode**: (optional) maps to the `sslmode `__ connection parameter, which allows a client to specify the type of TLS negotiation mode with the server. For more information on how each mode works, please visit the `PostgreSQL documentation `__. The default mode is ``prefer``. - - **sslkey**: (optional) maps to the `sslkey `__ connection parameter, which specifies the location of the secret key used with the client's certificate. - - **sslcert**: (optional) maps to the `sslcert `__ connection parameter, which specifies the location of the client certificate. - - **sslrootcert**: (optional) maps to the `sslrootcert `__ connection parameter, which specifies the location of a file containing one ore more certificate authorities (CA) certificates that the client will use to verify a server's certificate. - - **sslcrl**: (optional) maps to the `sslcrl `__ connection parameter, which specifies the location of a file containing a certificate revocation list. A client will reject connecting to any server that has a certificate present in this list. -- **callbacks**: callback scripts to run on certain actions. Patroni will pass the action, role and cluster name. (See scripts/aws.py as an example of how to write them.) - - **on\_reload**: run this script when configuration reload is triggered. - - **on\_restart**: run this script when the postgres restarts (without changing role). - - **on\_role\_change**: run this script when the postgres is being promoted or demoted. - - **on\_start**: run this script when the postgres starts. - - **on\_stop**: run this script when the postgres stops. -- **connect\_address**: IP address + port through which Postgres is accessible from other nodes and applications. -- **create\_replica\_methods**: an ordered list of the create methods for turning a Patroni node into a new replica. - "basebackup" is the default method; other methods are assumed to refer to scripts, each of which is configured as its - own config item. See :ref:`custom replica creation methods documentation ` for further explanation. -- **data\_dir**: The location of the Postgres data directory, either :ref:`existing ` or to be initialized by Patroni. -- **config\_dir**: The location of the Postgres configuration directory, defaults to the data directory. Must be writable by Patroni. -- **bin\_dir**: Path to PostgreSQL binaries (pg_ctl, pg_rewind, pg_basebackup, postgres). The default value is an empty string meaning that PATH environment variable will be used to find the executables. -- **listen**: IP address + port that Postgres listens to; must be accessible from other nodes in the cluster, if you're using streaming replication. Multiple comma-separated addresses are permitted, as long as the port component is appended after to the last one with a colon, i.e. ``listen: 127.0.0.1,127.0.0.2:5432``. Patroni will use the first address from this list to establish local connections to the PostgreSQL node. -- **use\_unix\_socket**: specifies that Patroni should prefer to use unix sockets to connect to the cluster. Default value is ``false``. If ``unix_socket_directories`` is defined, Patroni will use the first suitable value from it to connect to the cluster and fallback to tcp if nothing is suitable. 
If ``unix_socket_directories`` is not specified in ``postgresql.parameters``, Patroni will assume that the default value should be used and omit ``host`` from the connection parameters. -- **pgpass**: path to the `.pgpass `__ password file. Patroni creates this file before executing pg\_basebackup, the post_init script and under some other circumstances. The location must be writable by Patroni. -- **recovery\_conf**: additional configuration settings written to recovery.conf when configuring follower. -- **custom\_conf** : path to an optional custom ``postgresql.conf`` file, that will be used in place of ``postgresql.base.conf``. The file must exist on all cluster nodes, be readable by PostgreSQL and will be included from its location on the real ``postgresql.conf``. Note that Patroni will not monitor this file for changes, nor backup it. However, its settings can still be overridden by Patroni's own configuration facilities - see :ref:`dynamic configuration ` for details. -- **parameters**: list of configuration settings for Postgres. Many of these are required for replication to work. -- **pg\_hba**: list of lines that Patroni will use to generate ``pg_hba.conf``. This parameter has higher priority than ``bootstrap.pg_hba``. Together with :ref:`dynamic configuration ` it simplifies management of ``pg_hba.conf``. - - **- host all all 0.0.0.0/0 md5**. - - **- host replication replicator 127.0.0.1/32 md5**: A line like this is required for replication. -- **pg\_ident**: list of lines that Patroni will use to generate ``pg_ident.conf``. Together with :ref:`dynamic configuration ` it simplifies management of ``pg_ident.conf``. - - **- mapname1 systemname1 pguser1**. - - **- mapname1 systemname2 pguser2**. -- **pg\_ctl\_timeout**: How long should pg_ctl wait when doing ``start``, ``stop`` or ``restart``. Default value is 60 seconds. -- **use\_pg\_rewind**: try to use pg\_rewind on the former leader when it joins cluster as a replica. -- **remove\_data\_directory\_on\_rewind\_failure**: If this option is enabled, Patroni will remove the PostgreSQL data directory and recreate the replica. Otherwise it will try to follow the new leader. Default value is **false**. -- **remove\_data\_directory\_on\_diverged\_timelines**: Patroni will remove the PostgreSQL data directory and recreate the replica if it notices that timelines are diverging and the former master can not start streaming from the new master. This option is useful when ``pg_rewind`` can not be used. Default value is **false**. -- **replica\_method**: for each create_replica_methods other than basebackup, you would add a configuration section of the same name. At a minimum, this should include "command" with a full path to the actual script to be executed. Other configuration parameters will be passed along to the script in the form "parameter=value". +- **postgresql**: + - **authentication**: + - **superuser**: + - **username**: name for the superuser, set during initialization (initdb) and later used by Patroni to connect to the postgres. + - **password**: password for the superuser, set during initialization (initdb). + - **sslmode**: (optional) maps to the `sslmode `__ connection parameter, which allows a client to specify the type of TLS negotiation mode with the server. For more information on how each mode works, please visit the `PostgreSQL documentation `__. The default mode is ``prefer``. 
+ - **sslkey**: (optional) maps to the `sslkey `__ connection parameter, which specifies the location of the secret key used with the client's certificate. + - **sslcert**: (optional) maps to the `sslcert `__ connection parameter, which specifies the location of the client certificate. + - **sslrootcert**: (optional) maps to the `sslrootcert `__ connection parameter, which specifies the location of a file containing one ore more certificate authorities (CA) certificates that the client will use to verify a server's certificate. + - **sslcrl**: (optional) maps to the `sslcrl `__ connection parameter, which specifies the location of a file containing a certificate revocation list. A client will reject connecting to any server that has a certificate present in this list. + - **replication**: + - **username**: replication username; the user will be created during initialization. Replicas will use this user to access master via streaming replication + - **password**: replication password; the user will be created during initialization. + - **sslmode**: (optional) maps to the `sslmode `__ connection parameter, which allows a client to specify the type of TLS negotiation mode with the server. For more information on how each mode works, please visit the `PostgreSQL documentation `__. The default mode is ``prefer``. + - **sslkey**: (optional) maps to the `sslkey `__ connection parameter, which specifies the location of the secret key used with the client's certificate. + - **sslcert**: (optional) maps to the `sslcert `__ connection parameter, which specifies the location of the client certificate. + - **sslrootcert**: (optional) maps to the `sslrootcert `__ connection parameter, which specifies the location of a file containing one ore more certificate authorities (CA) certificates that the client will use to verify a server's certificate. + - **sslcrl**: (optional) maps to the `sslcrl `__ connection parameter, which specifies the location of a file containing a certificate revocation list. A client will reject connecting to any server that has a certificate present in this list. + - **rewind**: + - **username**: name for the user for ``pg_rewind``; the user will be created during initialization of postgres 11+ and all necessary `permissions `__ will be granted. + - **password**: password for the user for ``pg_rewind``; the user will be created during initialization. + - **sslmode**: (optional) maps to the `sslmode `__ connection parameter, which allows a client to specify the type of TLS negotiation mode with the server. For more information on how each mode works, please visit the `PostgreSQL documentation `__. The default mode is ``prefer``. + - **sslkey**: (optional) maps to the `sslkey `__ connection parameter, which specifies the location of the secret key used with the client's certificate. + - **sslcert**: (optional) maps to the `sslcert `__ connection parameter, which specifies the location of the client certificate. + - **sslrootcert**: (optional) maps to the `sslrootcert `__ connection parameter, which specifies the location of a file containing one ore more certificate authorities (CA) certificates that the client will use to verify a server's certificate. + - **sslcrl**: (optional) maps to the `sslcrl `__ connection parameter, which specifies the location of a file containing a certificate revocation list. A client will reject connecting to any server that has a certificate present in this list. + - **callbacks**: callback scripts to run on certain actions. 
Patroni will pass the action, role and cluster name. (See scripts/aws.py as an example of how to write them.) + - **on\_reload**: run this script when configuration reload is triggered. + - **on\_restart**: run this script when the postgres restarts (without changing role). + - **on\_role\_change**: run this script when the postgres is being promoted or demoted. + - **on\_start**: run this script when the postgres starts. + - **on\_stop**: run this script when the postgres stops. + - **connect\_address**: IP address + port through which Postgres is accessible from other nodes and applications. + - **create\_replica\_methods**: an ordered list of the create methods for turning a Patroni node into a new replica. + "basebackup" is the default method; other methods are assumed to refer to scripts, each of which is configured as its + own config item. See :ref:`custom replica creation methods documentation ` for further explanation. + - **data\_dir**: The location of the Postgres data directory, either :ref:`existing ` or to be initialized by Patroni. + - **config\_dir**: The location of the Postgres configuration directory, defaults to the data directory. Must be writable by Patroni. + - **bin\_dir**: Path to PostgreSQL binaries (pg_ctl, pg_rewind, pg_basebackup, postgres). The default value is an empty string meaning that PATH environment variable will be used to find the executables. + - **listen**: IP address + port that Postgres listens to; must be accessible from other nodes in the cluster, if you're using streaming replication. Multiple comma-separated addresses are permitted, as long as the port component is appended after to the last one with a colon, i.e. ``listen: 127.0.0.1,127.0.0.2:5432``. Patroni will use the first address from this list to establish local connections to the PostgreSQL node. + - **use\_unix\_socket**: specifies that Patroni should prefer to use unix sockets to connect to the cluster. Default value is ``false``. If ``unix_socket_directories`` is defined, Patroni will use the first suitable value from it to connect to the cluster and fallback to tcp if nothing is suitable. If ``unix_socket_directories`` is not specified in ``postgresql.parameters``, Patroni will assume that the default value should be used and omit ``host`` from the connection parameters. + - **pgpass**: path to the `.pgpass `__ password file. Patroni creates this file before executing pg\_basebackup, the post_init script and under some other circumstances. The location must be writable by Patroni. + - **recovery\_conf**: additional configuration settings written to recovery.conf when configuring follower. + - **custom\_conf** : path to an optional custom ``postgresql.conf`` file, that will be used in place of ``postgresql.base.conf``. The file must exist on all cluster nodes, be readable by PostgreSQL and will be included from its location on the real ``postgresql.conf``. Note that Patroni will not monitor this file for changes, nor backup it. However, its settings can still be overridden by Patroni's own configuration facilities - see :ref:`dynamic configuration ` for details. + - **parameters**: list of configuration settings for Postgres. Many of these are required for replication to work. + - **pg\_hba**: list of lines that Patroni will use to generate ``pg_hba.conf``. This parameter has higher priority than ``bootstrap.pg_hba``. Together with :ref:`dynamic configuration ` it simplifies management of ``pg_hba.conf``. + - **- host all all 0.0.0.0/0 md5**. 
+ - **- host replication replicator 127.0.0.1/32 md5**: A line like this is required for replication. + - **pg\_ident**: list of lines that Patroni will use to generate ``pg_ident.conf``. Together with :ref:`dynamic configuration ` it simplifies management of ``pg_ident.conf``. + - **- mapname1 systemname1 pguser1**. + - **- mapname1 systemname2 pguser2**. + - **pg\_ctl\_timeout**: How long should pg_ctl wait when doing ``start``, ``stop`` or ``restart``. Default value is 60 seconds. + - **use\_pg\_rewind**: try to use pg\_rewind on the former leader when it joins cluster as a replica. + - **remove\_data\_directory\_on\_rewind\_failure**: If this option is enabled, Patroni will remove the PostgreSQL data directory and recreate the replica. Otherwise it will try to follow the new leader. Default value is **false**. + - **remove\_data\_directory\_on\_diverged\_timelines**: Patroni will remove the PostgreSQL data directory and recreate the replica if it notices that timelines are diverging and the former master can not start streaming from the new master. This option is useful when ``pg_rewind`` can not be used. Default value is **false**. + - **replica\_method**: for each create_replica_methods other than basebackup, you would add a configuration section of the same name. At a minimum, this should include "command" with a full path to the actual script to be executed. Other configuration parameters will be passed along to the script in the form "parameter=value". REST API -------- -- **connect\_address**: IP address (or hostname) and port, to access the Patroni's :ref:`REST API `. All the members of the cluster must be able to connect to this address, so unless the Patroni setup is intended for a demo inside the localhost, this address must be a non "localhost" or loopback address (ie: "localhost" or "127.0.0.1"). It can serve as an endpoint for HTTP health checks (read below about the "listen" REST API parameter), and also for user queries (either directly or via the REST API), as well as for the health checks done by the cluster members during leader elections (for example, to determine whether the master is still running, or if there is a node which has a WAL position that is ahead of the one doing the query; etc.) The connect_address is put in the member key in DCS, making it possible to translate the member name into the address to connect to its REST API. +- **restapi**: + - **connect\_address**: IP address (or hostname) and port, to access the Patroni's :ref:`REST API `. All the members of the cluster must be able to connect to this address, so unless the Patroni setup is intended for a demo inside the localhost, this address must be a non "localhost" or loopback address (ie: "localhost" or "127.0.0.1"). It can serve as an endpoint for HTTP health checks (read below about the "listen" REST API parameter), and also for user queries (either directly or via the REST API), as well as for the health checks done by the cluster members during leader elections (for example, to determine whether the master is still running, or if there is a node which has a WAL position that is ahead of the one doing the query; etc.) The connect_address is put in the member key in DCS, making it possible to translate the member name into the address to connect to its REST API. + + - **listen**: IP address (or hostname) and port that Patroni will listen to for the REST API - to provide also the same health checks and cluster messaging between the participating nodes, as described above. 
to provide health-check information for HAProxy (or any other load balancer capable of doing a HTTP "OPTION" or "GET" checks). - -- **Optional**: - - **authentication**: + - **authentication**: (optional) - **username**: Basic-auth username to protect unsafe REST API endpoints. - **password**: Basic-auth password to protect unsafe REST API endpoints. - - - **certfile**: Specifies the file with the certificate in the PEM format. If the certfile is not specified or is left empty, the API server will work without SSL. - - **keyfile**: Specifies the file with the secret key in the PEM format. - - **cafile**: Specifies the file with the CA_BUNDLE with certificates of trusted CAs to use while verifying client certs. - - **verify\_client**: ``none``, ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. When ``optional`` client certificates are required for all unsafe REST API endpoints. If ``verify_client`` is set to ``optional`` or ``required`` basic-auth is not checked. + - **certfile**: (optional): Specifies the file with the certificate in the PEM format. If the certfile is not specified or is left empty, the API server will work without SSL. + - **keyfile**: (optional): Specifies the file with the secret key in the PEM format. + - **cafile**: (optional): Specifies the file with the CA_BUNDLE with certificates of trusted CAs to use while verifying client certs. + - **verify\_client**: (optional): ``none`` (default), ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. When ``optional`` client certificates are required for all unsafe REST API endpoints. If ``verify_client`` is set to ``optional`` or ``required`` basic-auth is not checked. When ``required`` is used, then client authentication succeeds, if the certificate signature verification succeeds. For ``optional`` the client cert will only be checked for ``PUT``, ``POST``, ``PATCH``, and ``DELETE`` requests. .. _patronictl_settings: CTL --- -- **Optional**: +- **ctl**: (optional) - **insecure**: Allow connections to REST API without verifying SSL certs. - **cacert**: Specifies the file with the CA_BUNDLE file or directory with certificates of trusted CAs to use while verifying REST API SSL certs. If not provided patronictl will use the value provided for REST API "cafile" parameter. - **certfile**: Specifies the file with the client certificate in the PEM format. If not provided patronictl will use the value provided for REST API "certfile" parameter. From c2a78ee65250a34252aaf34612973e4c152aeb50 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 5 Jun 2020 09:23:54 +0200 Subject: [PATCH 28/40] Bugfix: GET /cluster was showing stale member info in zookeeper (#1573) The ZooKeeper implementation heavily relies on a cached version of the cluster view in order to minimize the number of requests. Having stale member information is fine for the Patroni workflow because it basically relies only on member names and tags. The `GET /cluster` endpoint is a different case. 
Being exposed to the outside, it might be used for monitoring purposes, and therefore we should show up-to-date member information. --- patroni/api.py | 2 +- patroni/dcs/__init__.py | 7 ++++++- patroni/dcs/zookeeper.py | 3 +++ tests/test_api.py | 4 ++-- tests/test_zookeeper.py | 2 +- 5 files changed, 13 insertions(+), 5 deletions(-) diff --git a/patroni/api.py b/patroni/api.py index d3e60e2f..324a3752 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -134,7 +134,7 @@ class RestApiHandler(BaseHTTPRequestHandler): self._write_status_response(200, response) def do_GET_cluster(self): - cluster = self.server.patroni.dcs.cluster or self.server.patroni.dcs.get_cluster() + cluster = self.server.patroni.dcs.get_cluster(True) self._write_json_response(200, cluster_as_json(cluster)) def do_GET_history(self): diff --git a/patroni/dcs/__init__.py b/patroni/dcs/__init__.py index 473cbe4a..cda9d43d 100644 --- a/patroni/dcs/__init__.py +++ b/patroni/dcs/__init__.py @@ -642,7 +642,12 @@ class AbstractDCS(object): If the current node was running as a master and exception raised, instance would be demoted.""" - def get_cluster(self): + def _bypass_caches(self): + """Used only in zookeeper""" + + def get_cluster(self, force=False): + if force: + self._bypass_caches() try: cluster = self._load_cluster() except Exception: diff --git a/patroni/dcs/zookeeper.py b/patroni/dcs/zookeeper.py index 336fca70..5a852547 100644 --- a/patroni/dcs/zookeeper.py +++ b/patroni/dcs/zookeeper.py @@ -223,6 +223,9 @@ class ZooKeeper(AbstractDCS): raise ZooKeeperError('ZooKeeper in not responding properly') return cluster + def _bypass_caches(self): + self._fetch_cluster = True + def _create(self, path, value, retry=False, ephemeral=False): try: if retry: diff --git a/tests/test_api.py b/tests/test_api.py index 748d3372..0819c6c9 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -203,8 +203,8 @@ class TestRestApiHandler(unittest.TestCase): @patch.object(MockPatroni, 'dcs') def test_do_GET_cluster(self, mock_dcs): - mock_dcs.cluster = get_cluster_initialized_without_leader() - mock_dcs.cluster.members[1].data['xlog_location'] = 11 + mock_dcs.get_cluster.return_value = get_cluster_initialized_without_leader() + mock_dcs.get_cluster.return_value.members[1].data['xlog_location'] = 11 self.assertIsNotNone(MockRestApiServer(RestApiHandler, 'GET /cluster')) @patch.object(MockPatroni, 'dcs') diff --git a/tests/test_zookeeper.py b/tests/test_zookeeper.py index 931aabfb..e121de93 100644 --- a/tests/test_zookeeper.py +++ b/tests/test_zookeeper.py @@ -150,7 +150,7 @@ class TestZooKeeper(unittest.TestCase): def test_get_cluster(self): self.assertRaises(ZooKeeperError, self.zk.get_cluster) - cluster = self.zk.get_cluster() + cluster = self.zk.get_cluster(True) self.assertIsInstance(cluster.leader, Leader) self.zk.touch_member({'foo': 'foo'}) From 80ed08a2bb95251ab7b18b62a8831bd161cde08f Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 5 Jun 2020 09:24:47 +0200 Subject: [PATCH 29/40] Enforce synchronous_commit=local for post_init script (#1559) Patroni has already been doing this for a long time before creating users, but post_init was an oversight. It will help all utilities relying on libpq and reduce end-user confusion. 
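For illustration, a minimal sketch of the mechanism (not part of the patch; the psql invocation and connection URL below are assumptions): libpq-based clients read extra per-session parameters from the PGOPTIONS environment variable, so any script launched with the environment below runs its whole session with synchronous_commit=local.

    import os
    import subprocess

    env = os.environ.copy()
    env['PGOPTIONS'] = '-c synchronous_commit=local'
    # Every libpq client started with this environment applies the GUC
    # to its session, e.g.:
    subprocess.call(['psql', '-tAc', 'SHOW synchronous_commit',
                     'postgresql://localhost:5432/postgres'], env=env)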
--- patroni/postgresql/bootstrap.py | 1 + 1 file changed, 1 insertion(+) diff --git a/patroni/postgresql/bootstrap.py b/patroni/postgresql/bootstrap.py index a1e7dd52..5af95bdb 100644 --- a/patroni/postgresql/bootstrap.py +++ b/patroni/postgresql/bootstrap.py @@ -130,6 +130,7 @@ class Bootstrap(object): r['host'] = 'localhost' # set it to localhost to write into pgpass env = self._postgresql.config.write_pgpass(r) if 'password' in r else None + env['PGOPTIONS'] = '-c synchronous_commit=local' try: ret = self._postgresql.cancellable.call(shlex.split(cmd) + [connstring], env=env) From 1b2491cedf442337b0e3dc672d6f086932a56995 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 5 Jun 2020 09:25:33 +0200 Subject: [PATCH 30/40] Check basic-auth independently from client certificate (#1556) This is an absolutely valid use-case. --- docs/ENVIRONMENT.rst | 2 +- docs/SETTINGS.rst | 2 +- patroni/api.py | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/ENVIRONMENT.rst b/docs/ENVIRONMENT.rst index 857a5780..4ab39d49 100644 --- a/docs/ENVIRONMENT.rst +++ b/docs/ENVIRONMENT.rst @@ -126,7 +126,7 @@ REST API - **PATRONI\_RESTAPI\_CERTFILE**: Specifies the file with the certificate in the PEM format. If the certfile is not specified or is left empty, the API server will work without SSL. - **PATRONI\_RESTAPI\_KEYFILE**: Specifies the file with the secret key in the PEM format. - **PATRONI\_RESTAPI\_CAFILE**: Specifies the file with the CA_BUNDLE with certificates of trusted CAs to use while verifying client certs. -- **PATRONI\_RESTAPI\_VERIFY\_CLIENT**: ``none`` (default), ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. When ``optional`` client certificates are required for all unsafe REST API endpoints. If ``verify_client`` is set to ``optional`` or ``required`` basic-auth is not checked. When ``required`` is used, then client authentication succeeds, if the certificate signature verification succeeds. For ``optional`` the client cert will only be checked for ``PUT``, ``POST``, ``PATCH``, and ``DELETE`` requests. +- **PATRONI\_RESTAPI\_VERIFY\_CLIENT**: ``none`` (default), ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. When ``optional`` client certificates are required for all unsafe REST API endpoints. When ``required`` is used, then client authentication succeeds, if the certificate signature verification succeeds. For ``optional`` the client cert will only be checked for ``PUT``, ``POST``, ``PATCH``, and ``DELETE`` requests. CTL --- diff --git a/docs/SETTINGS.rst b/docs/SETTINGS.rst index 27fe3f79..d0a3ab3c 100644 --- a/docs/SETTINGS.rst +++ b/docs/SETTINGS.rst @@ -218,7 +218,7 @@ REST API - **certfile**: (optional): Specifies the file with the certificate in the PEM format. If the certfile is not specified or is left empty, the API server will work without SSL. - **keyfile**: (optional): Specifies the file with the secret key in the PEM format. - **cafile**: (optional): Specifies the file with the CA_BUNDLE with certificates of trusted CAs to use while verifying client certs. - - **verify\_client**: (optional): ``none`` (default), ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. 
When ``optional`` client certificates are required for all unsafe REST API endpoints. If ``verify_client`` is set to ``optional`` or ``required`` basic-auth is not checked. When ``required`` is used, then client authentication succeeds, if the certificate signature verification succeeds. For ``optional`` the client cert will only be checked for ``PUT``, ``POST``, ``PATCH``, and ``DELETE`` requests. + - **verify\_client**: (optional): ``none`` (default), ``optional`` or ``required``. When ``none`` REST API will not check client certificates. When ``required`` client certificates are required for all REST API calls. When ``optional`` client certificates are required for all unsafe REST API endpoints. When ``required`` is used, then client authentication succeeds, if the certificate signature verification succeeds. For ``optional`` the client cert will only be checked for ``PUT``, ``POST``, ``PATCH``, and ``DELETE`` requests. .. _patronictl_settings: diff --git a/patroni/api.py b/patroni/api.py index 324a3752..2c10ac22 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -516,10 +516,10 @@ class RestApiServer(ThreadingMixIn, HTTPServer, Thread): if self.__protocol == 'https' and self.__ssl_options.get('verify_client') in ('required', 'optional'): return rh._write_response(403, 'client certificate required') - reason = self.check_auth_header(rh.headers.get('Authorization')) - if reason: - headers = {'WWW-Authenticate': 'Basic realm="' + self.patroni.__class__.__name__ + '"'} - return rh._write_response(401, reason, headers=headers) + reason = self.check_auth_header(rh.headers.get('Authorization')) + if reason: + headers = {'WWW-Authenticate': 'Basic realm="' + self.patroni.__class__.__name__ + '"'} + return rh._write_response(401, reason, headers=headers) return True @staticmethod From 1229cf2c16e4c46a410dad985b7262a326414e15 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 5 Jun 2020 09:33:50 +0200 Subject: [PATCH 31/40] Ignore hba_file and ident_file when they match with defaults (#1555) It is possible to specify a custom hba_file and ident_file in the postgresql configuration parameters, and Patroni considers these files to be managed externally. It could happen that the locations of these files match the default locations of pg_hba.conf and pg_ident.conf. In this case we ignore the custom values and fall back to the default workflow, i.e. Patroni will overwrite them. 
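A minimal sketch of the resulting normalization (function and argument names are illustrative, not the actual Patroni API): a user-supplied hba_file is treated as custom only when it differs from the pg_hba.conf that Patroni itself manages.

    import os

    def effective_hba_file(server_parameters, config_dir):
        # Patroni manages <config_dir>/pg_hba.conf itself; an hba_file
        # pointing at that very path is not really custom.
        managed = os.path.join(config_dir, 'pg_hba.conf')
        hba_file = server_parameters.get('hba_file')
        return None if hba_file == managed else hba_file

    # effective_hba_file({'hba_file': '/pgdata/pg_hba.conf'}, '/pgdata') -> None
    # effective_hba_file({'hba_file': '/etc/pg_hba.conf'}, '/pgdata') -> '/etc/pg_hba.conf'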
Close: https://github.com/zalando/patroni/issues/1544 --- patroni/postgresql/bootstrap.py | 2 +- patroni/postgresql/config.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/patroni/postgresql/bootstrap.py b/patroni/postgresql/bootstrap.py index 5af95bdb..01608adc 100644 --- a/patroni/postgresql/bootstrap.py +++ b/patroni/postgresql/bootstrap.py @@ -363,7 +363,7 @@ END;$$""".format(f, rewind['username']) # at this point there should be no recovery.conf postgresql.config.remove_recovery_conf() - if postgresql.config.hba_file and postgresql.config.hba_file != postgresql.config.pg_hba_conf: + if postgresql.config.hba_file: postgresql.restart() else: postgresql.config.replace_pg_hba() diff --git a/patroni/postgresql/config.py b/patroni/postgresql/config.py index 2e16e1e4..0225a9bd 100644 --- a/patroni/postgresql/config.py +++ b/patroni/postgresql/config.py @@ -376,7 +376,7 @@ class ConfigHandler(object): configuration.append(os.path.basename(self._postgresql_base_conf_name)) if not self.hba_file: configuration.append('pg_hba.conf') - if not self._server_parameters.get('ident_file'): + if not self.ident_file: configuration.append('pg_ident.conf') return configuration @@ -483,7 +483,7 @@ class ConfigHandler(object): :returns: True if pg_ident.conf was rewritten. """ - if not self._server_parameters.get('ident_file') and self._config.get('pg_ident'): + if not self.ident_file and self._config.get('pg_ident'): with ConfigWriter(self._pg_ident_conf) as f: f.writelines(self._config['pg_ident']) return True @@ -963,10 +963,12 @@ class ConfigHandler(object): logger.warning('Removing invalid parameter `%s` from postgresql.parameters', param) server_parameters.pop(param) - if not server_parameters.get('hba_file') and config.get('pg_hba'): + if (not server_parameters.get('hba_file') or server_parameters['hba_file'] == self._pg_hba_conf) \ + and config.get('pg_hba'): hba_changed = self._config.get('pg_hba', []) != config['pg_hba'] - if not server_parameters.get('ident_file') and config.get('pg_ident'): + if (not server_parameters.get('ident_file') or server_parameters['ident_file'] == self._pg_ident_conf) \ + and config.get('pg_ident'): ident_changed = self._config.get('pg_ident', []) != config['pg_ident'] self._config = config @@ -1073,9 +1075,15 @@ class ConfigHandler(object): return self._config['authentication'].get('rewind', self._superuser) \ if self._postgresql.major_version >= 110000 else self._superuser + @property + def ident_file(self): + ident_file = self._server_parameters.get('ident_file') + return None if ident_file == self._pg_ident_conf else ident_file + @property def hba_file(self): - return self._server_parameters.get('hba_file') + hba_file = self._server_parameters.get('hba_file') + return None if hba_file == self._pg_hba_conf else hba_file @property def pg_hba_conf(self): From 4f1a3e53cd493ebad3fb72441c82fa519b33853a Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 5 Jun 2020 09:36:13 +0200 Subject: [PATCH 32/40] Defer TLS handshake until thread has started (#1547) The `SSLSocket` performs the TLS handshake immediately on accept. Effectively this blocks the whole API thread if the client side doesn't send any data. In order to solve the issue we defer the handshake until the thread serving the request has started. The solution is a bit hacky, but thread-safe.
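The trick can be illustrated standalone; the class below is a hypothetical, simplified Python 3 sketch, not Patroni's actual `RestApiServer`. The listening socket stays a plain TCP socket, so `accept()` returns immediately, and `wrap_socket()`, which performs the blocking handshake, only runs once the per-request worker thread has started:

```python
import socket
import ssl
from http.server import HTTPServer
from socketserver import ThreadingMixIn

class DeferredTLSServer(ThreadingMixIn, HTTPServer):
    """Sketch: accept plain TCP connections, do the TLS handshake per thread."""

    def __init__(self, address, handler, context):
        self.context = context  # an ssl.SSLContext with the certificate loaded
        HTTPServer.__init__(self, address, handler)

    def process_request_thread(self, request, client_address):
        try:
            # The handshake happens here, inside the worker thread, so a client
            # that connects and then stays silent can no longer stall accept().
            request = self.context.wrap_socket(request, server_side=True)
        except (socket.error, ssl.SSLError):
            return  # a misbehaving client costs us only one short-lived thread
        ThreadingMixIn.process_request_thread(self, request, client_address)
```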
Close https://github.com/zalando/patroni/issues/1545 --- patroni/api.py | 17 +++++++++++++++++ tests/test_api.py | 36 +++++++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/patroni/api.py b/patroni/api.py index 2c10ac22..3cd005d1 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -590,6 +590,23 @@ class RestApiServer(ThreadingMixIn, HTTPServer, Thread): if reloading_config: self.start() + def process_request_thread(self, request, client_address): + if isinstance(request, tuple): + sock, newsock = request + try: + request = sock.context.wrap_socket(newsock, do_handshake_on_connect=sock.do_handshake_on_connect, + suppress_ragged_eofs=sock.suppress_ragged_eofs, server_side=True) + except socket.error: + return + super(RestApiServer, self).process_request_thread(request, client_address) + + def get_request(self): + sock = self.socket + newsock, addr = socket.socket.accept(sock) + if hasattr(sock, 'context'): # SSLSocket, we want to do the deferred handshake from a thread + newsock = (sock, newsock) + return newsock, addr + def reload_config(self, config): if 'listen' not in config: # changing config in runtime raise ValueError('Can not find "restapi.listen" config') diff --git a/tests/test_api.py b/tests/test_api.py index 0819c6c9..928def0c 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -415,25 +415,27 @@ class TestRestApiHandler(unittest.TestCase): MockRestApiServer(RestApiHandler, post + '37\n\n{"candidate":"2","scheduled_at": "1"}') -@patch('ssl.SSLContext.load_cert_chain', Mock()) -@patch('ssl.SSLContext.wrap_socket', Mock(return_value=0)) -@patch.object(BaseHTTPServer.HTTPServer, '__init__', Mock()) class TestRestApiServer(unittest.TestCase): + @patch('ssl.SSLContext.load_cert_chain', Mock()) + @patch('ssl.SSLContext.wrap_socket', Mock(return_value=0)) + @patch.object(BaseHTTPServer.HTTPServer, '__init__', Mock()) + def setUp(self): + self.srv = MockRestApiServer(Mock(), '', {'listen': '*:8008', 'certfile': 'a', 'verify_client': 'required'}) + + @patch.object(BaseHTTPServer.HTTPServer, '__init__', Mock()) def test_reload_config(self): bad_config = {'listen': 'foo'} self.assertRaises(ValueError, MockRestApiServer, None, '', bad_config) - srv = MockRestApiServer(Mock(), '', {'listen': '*:8008', 'certfile': 'a', 'verify_client': 'required'}) - self.assertRaises(ValueError, srv.reload_config, bad_config) - self.assertRaises(ValueError, srv.reload_config, {}) + self.assertRaises(ValueError, self.srv.reload_config, bad_config) + self.assertRaises(ValueError, self.srv.reload_config, {}) with patch.object(socket.socket, 'setsockopt', Mock(side_effect=socket.error)): - srv.reload_config({'listen': ':8008'}) + self.srv.reload_config({'listen': ':8008'}) def test_check_auth(self): - srv = MockRestApiServer(Mock(), '', {'listen': '*:8008', 'certfile': 'a', 'verify_client': 'required'}) mock_rh = Mock() mock_rh.request.getpeercert.return_value = None - self.assertIsNot(srv.check_auth(mock_rh), True) + self.assertIsNot(self.srv.check_auth(mock_rh), True) def test_handle_error(self): try: @@ -441,6 +443,18 @@ class TestRestApiServer(unittest.TestCase): except Exception: self.assertIsNone(MockRestApiServer.handle_error(None, ('127.0.0.1', 55555))) + @patch.object(BaseHTTPServer.HTTPServer, '__init__', Mock(side_effect=socket.error)) def test_socket_error(self): - with patch.object(BaseHTTPServer.HTTPServer, '__init__', Mock(side_effect=socket.error)): - self.assertRaises(socket.error, MockRestApiServer, Mock(), '', {'listen': '*:8008'}) + 
self.assertRaises(socket.error, MockRestApiServer, Mock(), '', {'listen': '*:8008'}) + + @patch.object(MockRestApiServer, 'finish_request', Mock()) + def test_process_request_thread(self): + mock_socket = Mock() + self.srv.process_request_thread((mock_socket, 1), '2') + mock_socket.context.wrap_socket.side_effect = socket.error + self.srv.process_request_thread((mock_socket, 1), '2') + + @patch.object(socket.socket, 'accept', Mock(return_value=(1, '2'))) + def test_get_request(self): + self.srv.socket = Mock() + self.assertEqual(self.srv.get_request(), ((self.srv.socket, 1), '2')) From e95e54b94e08a9bdf624ba6cd126c26c6c95ff94 Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Fri, 5 Jun 2020 10:37:02 +0200 Subject: [PATCH 33/40] Correctly handle health checks for a standby cluster (#1553) Close https://github.com/zalando/patroni/issues/1388 --- features/patroni_api.feature | 2 ++ features/standby_cluster.feature | 4 ++-- patroni/api.py | 25 +++++++++++++++++-------- patroni/ha.py | 7 +++++-- tests/test_api.py | 2 ++ 5 files changed, 28 insertions(+), 12 deletions(-) diff --git a/features/patroni_api.feature b/features/patroni_api.feature index d6d53a17..edda1220 100644 --- a/features/patroni_api.feature +++ b/features/patroni_api.feature @@ -8,6 +8,8 @@ Scenario: check API requests on a stand-alone server Then I receive a response code 200 And I receive a response state running And I receive a response role master + When I issue a GET request to http://127.0.0.1:8008/standby_leader + Then I receive a response code 503 When I issue a GET request to http://127.0.0.1:8008/health Then I receive a response code 200 When I issue a GET request to http://127.0.0.1:8008/replica diff --git a/features/standby_cluster.feature b/features/standby_cluster.feature index 44310290..78e488b6 100644 --- a/features/standby_cluster.feature +++ b/features/standby_cluster.feature @@ -26,7 +26,7 @@ Feature: standby cluster When I add the table foo to postgres0 Then table foo is present on postgres1 after 20 seconds When I issue a GET request to http://127.0.0.1:8009/master - Then I receive a response code 200 + Then I receive a response code 503 When I issue a GET request to http://127.0.0.1:8009/standby_leader Then I receive a response code 200 And I receive a response role standby_leader @@ -40,7 +40,7 @@ Feature: standby cluster And I kill postmaster on postgres1 Then postgres2 is replicating from postgres0 after 32 seconds When I issue a GET request to http://127.0.0.1:8010/master - Then I receive a response code 200 + Then I receive a response code 503 When I issue a GET request to http://127.0.0.1:8010/standby_leader Then I receive a response code 200 And I receive a response role standby_leader diff --git a/patroni/api.py b/patroni/api.py index 3cd005d1..4616595e 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -89,23 +89,32 @@ class RestApiHandler(BaseHTTPRequestHandler): patroni = self.server.patroni cluster = patroni.dcs.cluster - if not cluster and patroni.ha.is_paused(): - primary_status_code = 200 if response['role'] == 'master' else 503 - else: - primary_status_code = 200 if patroni.ha.is_leader() else 503 replica_status_code = 200 if not patroni.noloadbalance and \ response.get('role') == 'replica' and response.get('state') == 'running' else 503 + + if not cluster and patroni.ha.is_paused(): + primary_status_code = 200 if response.get('role') == 'master' else 503 + standby_leader_status_code = 200 if response.get('role') == 'standby_leader' else 503 + elif patroni.ha.is_leader(): + if
patroni.ha.is_standby_cluster(): + primary_status_code = replica_status_code = 503 + standby_leader_status_code = 200 if response.get('role') in ('replica', 'standby_leader') else 503 + else: + primary_status_code = 200 + standby_leader_status_code = 503 + else: + primary_status_code = standby_leader_status_code = 503 + status_code = 503 - if patroni.ha.is_standby_cluster() and ('standby_leader' in path or 'standby-leader' in path): - status_code = 200 if patroni.ha.is_leader() else 503 + if 'standby_leader' in path or 'standby-leader' in path: + status_code = standby_leader_status_code elif 'master' in path or 'leader' in path or 'primary' in path or 'read-write' in path: status_code = primary_status_code elif 'replica' in path: status_code = replica_status_code elif 'read-only' in path: - status_code = 200 if primary_status_code == 200 else replica_status_code + status_code = 200 if 200 in (primary_status_code, standby_leader_status_code) else replica_status_code elif 'health' in path: status_code = 200 if response.get('state') == 'running' else 503 elif cluster: # dcs is available diff --git a/patroni/ha.py b/patroni/ha.py index 8afe7aa2..d144dd02 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -391,10 +391,13 @@ class Ha(object): if self.is_paused(): if not (self._rewind.is_needed and self._rewind.can_rewind_or_reinitialize_allowed)\ or self.cluster.is_unlocked(): - self.state_handler.set_role('master' if is_leader else 'replica') if is_leader: + self.state_handler.set_role('master') return 'continue to run as master without lock' - elif not node_to_follow: + elif self.state_handler.role != 'standby_leader': + self.state_handler.set_role('replica') + + if not node_to_follow: return 'no action' elif is_leader: self.demote('immediate-nolock') diff --git a/tests/test_api.py b/tests/test_api.py index 928def0c..92ea89c2 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -179,6 +179,8 @@ class TestRestApiHandler(unittest.TestCase): MockRestApiServer(RestApiHandler, 'GET /asynchronous') MockPatroni.ha.is_leader = Mock(return_value=True) MockRestApiServer(RestApiHandler, 'GET /replica') + with patch.object(MockHa, 'is_standby_cluster', Mock(return_value=True)): + MockRestApiServer(RestApiHandler, 'GET /standby_leader') MockPatroni.dcs.cluster = None with patch.object(RestApiHandler, 'get_postgresql_status', Mock(return_value={'role': 'master'})): MockRestApiServer(RestApiHandler, 'GET /master') From 2d5c8e006769416356634bd2928c93a9e83a20c7 Mon Sep 17 00:00:00 2001 From: ponvenkates <62520272+ponvenkates@users.noreply.github.com> Date: Thu, 11 Jun 2020 20:03:00 +0530 Subject: [PATCH 34/40] Increasing maxsize in pool manager (#1575) Close #1474 --- patroni/dcs/consul.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/patroni/dcs/consul.py b/patroni/dcs/consul.py index ed4e86ac..14ed29e8 100644 --- a/patroni/dcs/consul.py +++ b/patroni/dcs/consul.py @@ -54,7 +54,7 @@ class HTTPClient(object): if ca_cert: kwargs['ca_certs'] = ca_cert kwargs['cert_reqs'] = ssl.CERT_REQUIRED if verify or ca_cert else ssl.CERT_NONE - self.http = urllib3.PoolManager(num_pools=10, **kwargs) + self.http = urllib3.PoolManager(num_pools=10, maxsize=10, **kwargs) self._ttl = None def set_read_timeout(self, timeout): From 623b5945399ae48ce17455ae13612c86dc42847d Mon Sep 17 00:00:00 2001 From: Maxim Fedotov Date: Fri, 12 Jun 2020 16:23:42 +0300 Subject: [PATCH 35/40] patronictl add ability to print ASCII topology (#1576) Example: ```bash $ patronictl topology + Cluster: batman (6834835313225022118) 
-----+---------+----+-----------+------------------------------------------------+
|     Member      |      Host      |   Role  |  State  | TL | Lag in MB |                      Tags                      |
+-----------------+----------------+---------+---------+----+-----------+------------------------------------------------+
| postgresql0     | localhost:5432 |  Leader | running |  2 |           |                                                |
| + postgresql1   | localhost:5433 | Replica | running |  2 |       0.0 |                                                |
| + postgresql2   | localhost:5434 | Replica | running |  2 |       0.0 | {nofailover: true, replicatefrom: postgresql1} |
+-----------------+----------------+---------+---------+----+-----------+------------------------------------------------+
```
--- patroni/ctl.py | 53 ++++++++++++++++++++++++++++++++++++++++------- tests/test_ctl.py | 32 +++++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 8 deletions(-) diff --git a/patroni/ctl.py b/patroni/ctl.py index 81ced93c..efdda0fe 100644 --- a/patroni/ctl.py +++ b/patroni/ctl.py @@ -23,6 +23,7 @@ import time import yaml from click import ClickException +from collections import defaultdict from contextlib import contextmanager from patroni.dcs import get_dcs as _get_dcs from patroni.exceptions import PatroniException @@ -170,14 +171,14 @@ def print_output(columns, rows, alignment=None, fmt='pretty', header=None, delim elements = [{k: v for k, v in zip(columns, r) if not header or str(v)} for r in rows] func = json.dumps if fmt == 'json' else format_config_for_editing click.echo(func(elements)) - elif fmt in {'pretty', 'tsv'}: + elif fmt in {'pretty', 'tsv', 'topology'}: list_cluster = bool(header and columns and columns[0] == 'Cluster') if list_cluster and 'Tags' in columns: # we want to format member tags as YAML i = columns.index('Tags') for row in rows: if row[i]: - row[i] = format_config_for_editing(row[i], fmt == 'tsv').strip() - if list_cluster and fmt == 'pretty': # skip cluster name if pretty-printing + row[i] = format_config_for_editing(row[i], fmt != 'pretty').strip() + if list_cluster and fmt != 'tsv': # skip cluster name if pretty-printing columns = columns[1:] if columns else [] rows = [row[1:] for row in rows] @@ -746,6 +747,33 @@ def switchover(obj, cluster_name, master, candidate, force, scheduled): _do_failover_or_switchover(obj, 'switchover', cluster_name, master, candidate, force, scheduled) +def generate_topology(level, member, topology): + members = topology.get(member['name'], []) + + if level > 0: + member['name'] = '{0}+ {1}'.format((' ' * (level - 1) * 2), member['name']) + + if member['name']: + yield member + + for member in members: + for member in generate_topology(level + 1, member, topology): + yield member + + +def topology_sort(members): + topology = defaultdict(list) + leader = next((m for m in members if m['role'].endswith('leader')), {'name': None}) + replicas = set(member['name'] for member in members if not member['role'].endswith('leader')) + for member in members: + if not member['role'].endswith('leader'): + parent = member.get('tags', {}).get('replicatefrom') + parent = parent if parent and parent != member['name'] and parent in replicas else leader['name'] + topology[parent].append(member) + for member in generate_topology(0, leader, topology): + yield member + + def output_members(cluster, name, extended=False, fmt='pretty'): rows = [] logging.debug(cluster) @@ -762,12 +790,13 @@ def output_members(cluster, name, extended=False, fmt='pretty'): append_port = any('port' in m and m['port'] != 5432 for m in members) or\ len(set(m['host'] for m in members)) < len(members) - for m in cluster['members']: + sort =
topology_sort if fmt == 'topology' else iter + for m in sort(cluster['members']): logging.debug(m) lag = m.get('lag', '') m.update(cluster=name, member=m['name'], host=m.get('host', ''), tl=m.get('timeline', ''), - role='' if m['role'] == 'replica' else m['role'].replace('_', ' ').title(), + role=m['role'].replace('_', ' ').title(), lag_in_mb=round(lag/1024/1024) if isinstance(lag, six.integer_types) else lag, pending_restart='*' if m.get('pending_restart') else '') @@ -782,10 +811,10 @@ def output_members(cluster, name, extended=False, fmt='pretty'): rows.append([m.get(n.lower().replace(' ', '_'), '') for n in columns]) - print_output(columns, rows, {'Lag in MB': 'r', 'TL': 'r', 'Tags': 'l'}, + print_output(columns, rows, {'Member': 'l', 'Lag in MB': 'r', 'TL': 'r', 'Tags': 'l'}, fmt, ' Cluster: {0} ({1}) '.format(name, initialize)) - if fmt != 'pretty': # Omit service info when using machine-readable formats + if fmt not in ('pretty', 'topology'): # Omit service info when using machine-readable formats return service_info = [] @@ -829,6 +858,16 @@ def members(obj, cluster_names, fmt, watch, w, extended, ts): output_members(cluster, cluster_name, extended, fmt) +@ctl.command('topology', help='Prints ASCII topology for given cluster') +@click.argument('cluster_names', nargs=-1) +@option_watch +@option_watchrefresh +@click.pass_obj +@click.pass_context +def topology(ctx, obj, cluster_names, watch, w): + ctx.forward(members, fmt='topology') + + def timestamp(precision=6): return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:precision - 7] diff --git a/tests/test_ctl.py b/tests/test_ctl.py index cc97c85b..e2cb11de 100644 --- a/tests/test_ctl.py +++ b/tests/test_ctl.py @@ -73,7 +73,7 @@ class TestCtl(unittest.TestCase): scheduled_at = datetime.now(tzutc) + timedelta(seconds=600) cluster = get_cluster_initialized_with_leader(Failover(1, 'foo', 'bar', scheduled_at)) del cluster.members[1].data['conn_url'] - for fmt in ('pretty', 'json', 'yaml', 'tsv'): + for fmt in ('pretty', 'json', 'yaml', 'tsv', 'topology'): self.assertIsNone(output_members(cluster, name='abc', fmt=fmt)) @patch('patroni.ctl.get_dcs') @@ -417,6 +417,36 @@ class TestCtl(unittest.TestCase): assert '2100' in result.output assert 'Scheduled restart' in result.output + @patch('patroni.ctl.get_dcs') + def test_topology(self, mock_get_dcs): + mock_get_dcs.return_value = self.e + cluster = get_cluster_initialized_with_leader() + cascade_member = Member(0, 'cascade', 28, {'conn_url': 'postgres://replicator:rep-pass@127.0.0.1:5437/postgres', + 'api_url': 'http://127.0.0.1:8012/patroni', + 'state': 'running', + 'tags': {'replicatefrom': 'other'}, + }) + cascade_member_wrong_tags = Member(0, 'wrong_cascade', 28, + {'conn_url': 'postgres://replicator:rep-pass@127.0.0.1:5438/postgres', + 'api_url': 'http://127.0.0.1:8013/patroni', + 'state': 'running', + 'tags': {'replicatefrom': 'nonexistinghost'}, + }) + cluster.members.append(cascade_member) + cluster.members.append(cascade_member_wrong_tags) + mock_get_dcs.return_value.get_cluster = Mock(return_value=cluster) + result = self.runner.invoke(ctl, ['topology', 'dummy']) + assert '+\n| leader | 127.0.0.1:5435 | Leader |' in result.output + assert '|\n| + other | 127.0.0.1:5436 | Replica |' in result.output + assert '|\n| + cascade | 127.0.0.1:5437 | Replica |' in result.output + assert '|\n| + wrong_cascade | 127.0.0.1:5438 | Replica |' in result.output + + cluster = get_cluster_initialized_without_leader() + mock_get_dcs.return_value.get_cluster = Mock(return_value=cluster) + 
result = self.runner.invoke(ctl, ['topology', 'dummy']) + assert '+\n| + leader | 127.0.0.1:5435 | Replica |' in result.output + assert '|\n| + other | 127.0.0.1:5436 | Replica |' in result.output @patch('patroni.ctl.get_dcs') @patch.object(PoolManager, 'request', Mock(return_value=MockResponse())) def test_flush(self, mock_get_dcs): From ee4bf79c1138fe0f2b3145386c7428bff80c6d8f Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Tue, 16 Jun 2020 12:56:20 +0200 Subject: [PATCH 36/40] Populate references and nodename in subsets addresses (#1591) It makes the subsets look exactly as if they were populated by the service with a label selector, and it would help with https://github.com/zalando/postgres-operator/issues/340#issuecomment-587001109 Unit tests are refactored to minimize the amount of mocking. --- patroni/dcs/kubernetes.py | 122 ++++++++++++++++++++--------------- tests/test_kubernetes.py | 129 +++++++++++++++++++------------------- 2 files changed, 138 insertions(+), 113 deletions(-) diff --git a/patroni/dcs/kubernetes.py b/patroni/dcs/kubernetes.py index 3325bed0..85fbe212 100644 --- a/patroni/dcs/kubernetes.py +++ b/patroni/dcs/kubernetes.py @@ -46,7 +46,7 @@ class CoreV1ApiProxy(object): self._api.api_client.user_agent = USER_AGENT self._api.api_client.rest_client.pool_manager.connection_pool_kw['maxsize'] = 10 self._request_timeout = None - self._use_endpoints = use_endpoints + self._use_endpoints = bool(use_endpoints) def configure_timeouts(self, loop_wait, retry_timeout, ttl): # Normally every loop_wait seconds we should have received something from the socket. @@ -80,6 +80,10 @@ class CoreV1ApiProxy(object): raise return wrapper + @property + def use_endpoints(self): + return self._use_endpoints + def catch_kubernetes_errors(func): def wrapper(*args, **kwargs): @@ -229,18 +233,16 @@ class Kubernetes(AbstractDCS): except k8s_config.ConfigException: k8s_config.load_kube_config(context=config.get('context', 'local')) - self.__subsets = None - use_endpoints = config.get('use_endpoints') and (config.get('patronictl') or 'pod_ip' in config) - if use_endpoints: - addresses = [k8s_client.V1EndpointAddress(ip='127.0.0.1' if config.get('patronictl') else config['pod_ip'])] - ports = [] - for p in config.get('ports', [{}]): - port = {'port': int(p.get('port', '5432'))} - port.update({n: p[n] for n in ('name', 'protocol') if p.get(n)}) - ports.append(k8s_client.V1EndpointPort(**port)) - self.__subsets = [k8s_client.V1EndpointSubset(addresses=addresses, ports=ports)] - self._should_create_config_service = True - self._api = CoreV1ApiProxy(use_endpoints) + self.__my_pod = None + self.__ips = [] if config.get('patronictl') else [config.get('pod_ip')] + self.__ports = [] + for p in config.get('ports', [{}]): + port = {'port': int(p.get('port', '5432'))} + port.update({n: p[n] for n in ('name', 'protocol') if p.get(n)}) + self.__ports.append(k8s_client.V1EndpointPort(**port)) + + self._api = CoreV1ApiProxy(config.get('use_endpoints')) + self._should_create_config_service = self._api.use_endpoints self.reload_config(config) self._leader_observed_record = {} self._leader_observed_time = None @@ -267,7 +269,7 @@ class Kubernetes(AbstractDCS): @property def leader_path(self): - return self._base_path[1:] if self.__subsets else super(Kubernetes, self).leader_path + return self._base_path[1:] if self._api.use_endpoints else super(Kubernetes, self).leader_path def set_ttl(self, ttl): ttl = int(ttl) @@ -305,7 +307,9 @@ class Kubernetes(AbstractDCS): with self._condition: self._wait_caches() - members =
[self.member(pod) for pod in self._pods.copy().values()] + pods = self._pods.copy() + self.__my_pod = pods.get(self._name) + members = [self.member(pod) for pod in pods.values()] nodes = self._kinds.copy() config = nodes.get(self.config_path) @@ -328,7 +332,8 @@ class Kubernetes(AbstractDCS): leader = nodes.get(self.leader_path) metadata = leader and leader.metadata self._leader_resource_version = metadata.resource_version if metadata else None - self._leader_observed_subsets = leader.subsets if self.__subsets and leader and leader.subsets else [] + self._leader_observed_subsets = leader.subsets \ + if self._api.use_endpoints and leader and leader.subsets else [] annotations = metadata and metadata.annotations or {} # get last leader operation @@ -377,50 +382,70 @@ class Kubernetes(AbstractDCS): return p1.name == p2.name and p1.port == p2.port and (p1.protocol or 'TCP') == (p2.protocol or 'TCP') @staticmethod - def subsets_changed(last_observed_subsets, subsets): + def subsets_changed(last_observed_subsets, ip, ports): """ - >>> Kubernetes.subsets_changed([], []) - False - >>> Kubernetes.subsets_changed([], [k8s_client.V1EndpointSubset()]) + >>> Kubernetes.subsets_changed([], None, []) True - >>> s1 = [k8s_client.V1EndpointSubset(addresses=[k8s_client.V1EndpointAddress(ip='1.2.3.4')])] - >>> s2 = [k8s_client.V1EndpointSubset(addresses=[k8s_client.V1EndpointAddress(ip='1.2.3.5')])] - >>> Kubernetes.subsets_changed(s1, s2) + >>> ip = '1.2.3.4' + >>> a = [k8s_client.V1EndpointAddress(ip=ip)] + >>> s = [k8s_client.V1EndpointSubset(addresses=a)] + >>> Kubernetes.subsets_changed(s, '1.2.3.5', []) True - >>> a = [k8s_client.V1EndpointAddress(ip='1.2.3.4')] - >>> s1 = [k8s_client.V1EndpointSubset(addresses=a, ports=[k8s_client.V1EndpointPort(protocol='TCP', port=1)])] - >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[k8s_client.V1EndpointPort(port=5432)])] - >>> Kubernetes.subsets_changed(s1, s2) + >>> s = [k8s_client.V1EndpointSubset(addresses=a, ports=[k8s_client.V1EndpointPort(protocol='TCP', port=1)])] + >>> Kubernetes.subsets_changed(s, '1.2.3.4', [k8s_client.V1EndpointPort(port=5432)]) True >>> p1 = k8s_client.V1EndpointPort(name='port1', port=1) >>> p2 = k8s_client.V1EndpointPort(name='port2', port=2) >>> p3 = k8s_client.V1EndpointPort(name='port3', port=3) - >>> s1 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p1, p2])] - >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p2, p3])] - >>> Kubernetes.subsets_changed(s1, s2) + >>> s = [k8s_client.V1EndpointSubset(addresses=a, ports=[p1, p2])] + >>> Kubernetes.subsets_changed(s, ip, [p2, p3]) True >>> s2 = [k8s_client.V1EndpointSubset(addresses=a, ports=[p2, p1])] - >>> Kubernetes.subsets_changed(s1, s2) + >>> Kubernetes.subsets_changed(s, ip, [p2, p1]) False """ - if len(last_observed_subsets) != len(subsets): + + if len(last_observed_subsets) != 1: return True - if subsets == []: - return False if len(last_observed_subsets[0].addresses or []) != 1 or \ - last_observed_subsets[0].addresses[0].ip != subsets[0].addresses[0].ip or \ - len(last_observed_subsets[0].ports) != len(subsets[0].ports): + last_observed_subsets[0].addresses[0].ip != ip or \ + len(last_observed_subsets[0].ports) != len(ports): return True - if len(subsets[0].ports) == 1: - return not Kubernetes.compare_ports(last_observed_subsets[0].ports[0], subsets[0].ports[0]) + if len(ports) == 1: + return not Kubernetes.compare_ports(last_observed_subsets[0].ports[0], ports[0]) observed_ports = {p.name: p for p in last_observed_subsets[0].ports} - for p in 
subsets[0].ports: + for p in ports: if p.name not in observed_ports or not Kubernetes.compare_ports(p, observed_ports.pop(p.name)): return True return False + def __target_ref(self, leader_ip, pod): + # we want to re-use existing target_ref if possible + for subset in self._leader_observed_subsets: + for address in subset.addresses or []: + if address.ip == leader_ip and address.target_ref and address.target_ref.name == self._name: + return address.target_ref + return k8s_client.V1ObjectReference(kind='Pod', uid=pod.metadata.uid, namespace=self._namespace, + name=self._name, resource_version=pod.metadata.resource_version) + + def _map_subsets(self, endpoints, ips): + if not ips: + # We want to have subsets empty + if self._leader_observed_subsets: + endpoints['subsets'] = [] + return + + pod = self.__my_pod + leader_ip = ips[0] or pod and pod.status.pod_ip + # don't touch subsets if our (leader) ip is unknown or subsets is valid + if leader_ip and self.subsets_changed(self._leader_observed_subsets, leader_ip, self.__ports): + kwargs = {'hostname': pod.spec.hostname, 'node_name': pod.spec.node_name, + 'target_ref': self.__target_ref(leader_ip, pod)} if pod else {} + address = k8s_client.V1EndpointAddress(ip=leader_ip, **kwargs) + endpoints['subsets'] = [k8s_client.V1EndpointSubset(addresses=[address], ports=self.__ports)] + @catch_kubernetes_errors - def patch_or_create(self, name, annotations, resource_version=None, patch=False, retry=True, subsets=None): + def patch_or_create(self, name, annotations, resource_version=None, patch=False, retry=True, ips=None): metadata = {'namespace': self._namespace, 'name': name, 'labels': self._labels, 'annotations': annotations} if patch or resource_version: if resource_version is not None: @@ -432,10 +457,9 @@ class Kubernetes(AbstractDCS): metadata['annotations'] = {k: v for k, v in metadata['annotations'].items() if v is not None} metadata = k8s_client.V1ObjectMeta(**metadata) - if subsets is not None and self.__subsets: + if ips is not None and self._api.use_endpoints: endpoints = {'metadata': metadata} - if self.subsets_changed(self._leader_observed_subsets, subsets): - endpoints['subsets'] = subsets + self._map_subsets(endpoints, ips) body = k8s_client.V1Endpoints(**endpoints) else: body = k8s_client.V1ConfigMap(metadata=metadata) @@ -446,7 +470,7 @@ class Kubernetes(AbstractDCS): def patch_or_create_config(self, annotations, resource_version=None, patch=False, retry=True): # SCOPE-config endpoint requires corresponding service otherwise it might be "cleaned" by k8s master - if self.__subsets and not patch and not resource_version: + if self._api.use_endpoints and not patch and not resource_version: self._should_create_config_service = True self._create_config_service() ret = self.patch_or_create(self.config_path, annotations, resource_version, patch, retry) @@ -479,9 +503,9 @@ class Kubernetes(AbstractDCS): if last_operation: annotations[self._OPTIME] = last_operation - subsets = [] if access_is_restricted else self.__subsets + ips = [] if access_is_restricted else self.__ips - ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, subsets=subsets) + ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, ips=ips) if ret: self._leader_resource_version = ret.metadata.resource_version return ret @@ -501,8 +525,8 @@ class Kubernetes(AbstractDCS): else: annotations['acquireTime'] = self._leader_observed_record.get('acquireTime') or now annotations['transitions'] = str(transitions) - 
subsets = [] if self.__subsets else None - ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, subsets=subsets) + ips = [] if self._api.use_endpoints else None + ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, ips=ips) if ret: self._leader_resource_version = ret.metadata.resource_version else: @@ -543,7 +567,7 @@ class Kubernetes(AbstractDCS): 'annotations': {'status': json.dumps(data, separators=(',', ':'))}} body = k8s_client.V1Pod(metadata=k8s_client.V1ObjectMeta(**metadata)) ret = self._api.patch_namespaced_pod(self._name, self._namespace, body) - if self.__subsets and self._should_create_config_service: + if self._should_create_config_service: self._create_config_service() return ret diff --git a/tests/test_kubernetes.py b/tests/test_kubernetes.py index e82870f4..1a04ce9a 100644 --- a/tests/test_kubernetes.py +++ b/tests/test_kubernetes.py @@ -8,7 +8,7 @@ from threading import Thread from . import SleepException -def mock_list_namespaced_config_map(self, *args, **kwargs): +def mock_list_namespaced_config_map(*args, **kwargs): metadata = {'resource_version': '1', 'labels': {'f': 'b'}, 'name': 'test-config', 'annotations': {'initialize': '123', 'config': '{}'}} items = [k8s_client.V1ConfigMap(metadata=k8s_client.V1ObjectMeta(**metadata))] @@ -22,73 +22,72 @@ def mock_list_namespaced_config_map(self, *args, **kwargs): return k8s_client.V1ConfigMapList(metadata=metadata, items=items, kind='ConfigMapList') -def mock_list_namespaced_pod(self, *args, **kwargs): - metadata = k8s_client.V1ObjectMeta(resource_version='1', name='p-0', annotations={'status': '{}'}) - items = [k8s_client.V1Pod(metadata=metadata)] +def mock_list_namespaced_endpoints(*args, **kwargs): + target_ref = k8s_client.V1ObjectReference(kind='Pod', resource_version='10', name='p-0', + namespace='default', uid='964dfeae-e79b-4476-8a5a-1920b5c2a69d') + address0 = k8s_client.V1EndpointAddress(ip='10.0.0.0', target_ref=target_ref) + address1 = k8s_client.V1EndpointAddress(ip='10.0.0.1') + port = k8s_client.V1EndpointPort(port=5432, name='postgresql', protocol='TCP') + subset = k8s_client.V1EndpointSubset(addresses=[address1, address0], ports=[port]) + metadata = k8s_client.V1ObjectMeta(resource_version='1', labels={'f': 'b'}, name='test', + annotations={'optime': '1234', 'leader': 'p-0', 'ttl': '30s'}) + endpoint = k8s_client.V1Endpoints(subsets=[subset], metadata=metadata) + metadata = k8s_client.V1ObjectMeta(resource_version='1') + return k8s_client.V1EndpointsList(metadata=metadata, items=[endpoint], kind='V1EndpointsList') + + +def mock_list_namespaced_pod(*args, **kwargs): + metadata = k8s_client.V1ObjectMeta(resource_version='1', name='p-0', annotations={'status': '{}'}, + uid='964dfeae-e79b-4476-8a5a-1920b5c2a69d') + status = k8s_client.V1PodStatus(pod_ip='10.0.0.0') + spec = k8s_client.V1PodSpec(hostname='p-0', node_name='kind-control-plane', containers=[]) + items = [k8s_client.V1Pod(metadata=metadata, status=status, spec=spec)] return k8s_client.V1PodList(items=items, kind='PodList') -def mock_config_map(*args, **kwargs): +def mock_namespaced_kind(*args, **kwargs): mock = Mock() mock.metadata.resource_version = '2' return mock -@patch('socket.TCP_KEEPIDLE', 4, create=True) -@patch('socket.TCP_KEEPINTVL', 5, create=True) -@patch('socket.TCP_KEEPCNT', 6, create=True) -@patch.object(k8s_client.CoreV1Api, 'patch_namespaced_config_map', mock_config_map) -@patch.object(k8s_client.CoreV1Api, 'create_namespaced_config_map', mock_config_map) 
-@patch('kubernetes.client.api_client.ThreadPool', Mock(), create=True) -@patch.object(Thread, 'start', Mock()) -class TestKubernetes(unittest.TestCase): +class BaseTestKubernetes(unittest.TestCase): @patch('socket.TCP_KEEPIDLE', 4, create=True) @patch('socket.TCP_KEEPINTVL', 5, create=True) @patch('socket.TCP_KEEPCNT', 6, create=True) @patch('kubernetes.config.load_kube_config', Mock()) - @patch.object(k8s_client.CoreV1Api, 'list_namespaced_config_map', mock_list_namespaced_config_map) - @patch.object(k8s_client.CoreV1Api, 'list_namespaced_pod', mock_list_namespaced_pod) @patch('kubernetes.client.api_client.ThreadPool', Mock(), create=True) @patch.object(Thread, 'start', Mock()) - def setUp(self): - self.k = Kubernetes({'ttl': 30, 'scope': 'test', 'name': 'p-0', - 'loop_wait': 10, 'retry_timeout': 10, 'labels': {'f': 'b'}}) + @patch.object(k8s_client.CoreV1Api, 'list_namespaced_pod', mock_list_namespaced_pod) + @patch.object(k8s_client.CoreV1Api, 'list_namespaced_config_map', mock_list_namespaced_config_map) + def setUp(self, config=None): + config = config or {} + config.update(ttl=30, scope='test', name='p-0', loop_wait=10, retry_timeout=10, labels={'f': 'b'}) + self.k = Kubernetes(config) self.assertRaises(AttributeError, self.k._pods._build_cache) self.k._pods._is_ready = True self.assertRaises(AttributeError, self.k._kinds._build_cache) self.k._kinds._is_ready = True self.k.get_cluster() + +@patch.object(k8s_client.CoreV1Api, 'patch_namespaced_config_map', mock_namespaced_kind) +class TestKubernetesConfigMaps(BaseTestKubernetes): + @patch('time.time', Mock(side_effect=[1, 10.9, 100])) def test__wait_caches(self): self.k._pods._is_ready = False with self.k._condition: self.assertRaises(RetryFailedError, self.k._wait_caches) + @patch('time.time', Mock(return_value=time.time() + 100)) def test_get_cluster(self): - with patch.object(k8s_client.CoreV1Api, 'list_namespaced_config_map', mock_list_namespaced_config_map), \ - patch.object(k8s_client.CoreV1Api, 'list_namespaced_pod', mock_list_namespaced_pod), \ - patch('time.time', Mock(return_value=time.time() + 31)): - self.k.get_cluster() + self.k.get_cluster() with patch.object(Kubernetes, '_wait_caches', Mock(side_effect=Exception)): self.assertRaises(KubernetesError, self.k.get_cluster) - @patch('kubernetes.config.load_kube_config', Mock()) - @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', Mock()) - def test_update_leader(self): - k = Kubernetes({'ttl': 30, 'scope': 'test', 'name': 'p-0', 'loop_wait': 10, 'retry_timeout': 10, - 'labels': {'f': 'b'}, 'use_endpoints': True, 'pod_ip': '10.0.0.0'}) - self.assertIsNotNone(k.update_leader('123')) - - @patch('kubernetes.config.load_kube_config', Mock()) - @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', Mock()) - def test_update_leader_with_restricted_access(self): - k = Kubernetes({'ttl': 30, 'scope': 'test', 'name': 'p-0', 'loop_wait': 10, 'retry_timeout': 10, - 'labels': {'f': 'b'}, 'use_endpoints': True, 'pod_ip': '10.0.0.0'}) - self.assertIsNotNone(k.update_leader('123', True)) - def test_take_leader(self): self.k.take_leader() self.k._leader_observed_record['leader'] = 'test' @@ -123,14 +122,6 @@ class TestKubernetes(unittest.TestCase): def test_delete_cluster(self): self.k.delete_cluster() - @patch('kubernetes.config.load_kube_config', Mock()) - @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', - Mock(side_effect=[k8s_client.rest.ApiException(502, ''), k8s_client.rest.ApiException(500, '')])) - def test_delete_sync_state(self): - k = 
Kubernetes({'ttl': 30, 'scope': 'test', 'name': 'p-0', 'loop_wait': 10, 'retry_timeout': 10, - 'labels': {'f': 'b'}, 'use_endpoints': True, 'pod_ip': '10.0.0.0'}) - self.assertFalse(k.delete_sync_state()) - def test_watch(self): self.k.set_ttl(10) self.k.watch(None, 0) @@ -139,31 +130,41 @@ class TestKubernetes(unittest.TestCase): def test_set_history_value(self): self.k.set_history_value('{}') - @patch('kubernetes.config.load_kube_config', Mock()) - @patch('patroni.dcs.kubernetes.ObjectCache', Mock()) - @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_pod', Mock(return_value=True)) - @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', Mock()) + +class TestKubernetesEndpoints(BaseTestKubernetes): + + @patch.object(k8s_client.CoreV1Api, 'list_namespaced_endpoints', mock_list_namespaced_endpoints) + def setUp(self, config=None): + super(TestKubernetesEndpoints, self).setUp({'use_endpoints': True, 'pod_ip': '10.0.0.0'}) + + @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_endpoints') + def test_update_leader(self, mock_patch_namespaced_endpoints): + self.assertIsNotNone(self.k.update_leader('123')) + args = mock_patch_namespaced_endpoints.call_args[0] + self.assertEqual(args[2].subsets[0].addresses[0].target_ref.resource_version, '10') + self.k._leader_observed_subsets = [] + self.assertIsNotNone(self.k.update_leader('123')) + + @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_endpoints', mock_namespaced_kind) + def test_update_leader_with_restricted_access(self): + self.assertIsNotNone(self.k.update_leader('123', True)) + + @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', + Mock(side_effect=[k8s_client.rest.ApiException(502, ''), k8s_client.rest.ApiException(500, '')])) + def test_delete_sync_state(self): + self.assertFalse(self.k.delete_sync_state()) + + @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_pod', mock_namespaced_kind) + @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', mock_namespaced_kind) @patch.object(k8s_client.CoreV1Api, 'create_namespaced_service', Mock(side_effect=[True, False, k8s_client.rest.ApiException(500, '')])) def test__create_config_service(self): - k = Kubernetes({'ttl': 30, 'scope': 'test', 'name': 'p-0', 'loop_wait': 10, 'retry_timeout': 10, - 'labels': {'f': 'b'}, 'use_endpoints': True, 'pod_ip': '10.0.0.0'}) - self.assertIsNotNone(k.patch_or_create_config({'foo': 'bar'})) - self.assertIsNotNone(k.patch_or_create_config({'foo': 'bar'})) - k.touch_member({'state': 'running', 'role': 'replica'}) + self.assertIsNotNone(self.k.patch_or_create_config({'foo': 'bar'})) + self.assertIsNotNone(self.k.patch_or_create_config({'foo': 'bar'})) + self.k.touch_member({'state': 'running', 'role': 'replica'}) -class TestCacheBuilder(unittest.TestCase): - - @patch('socket.TCP_KEEPIDLE', 4, create=True) - @patch('socket.TCP_KEEPINTVL', 5, create=True) - @patch('socket.TCP_KEEPCNT', 6, create=True) - @patch('kubernetes.config.load_kube_config', Mock()) - @patch('kubernetes.client.api_client.ThreadPool', Mock(), create=True) - @patch.object(Thread, 'start', Mock()) - def setUp(self): - self.k = Kubernetes({'ttl': 30, 'scope': 'test', 'name': 'p-0', - 'loop_wait': 10, 'retry_timeout': 10, 'labels': {'f': 'b'}}) +class TestCacheBuilder(BaseTestKubernetes): @patch.object(k8s_client.CoreV1Api, 'list_namespaced_config_map', mock_list_namespaced_config_map) @patch('patroni.dcs.kubernetes.ObjectCache._watch') From e00acdf6df0ddc2f783294c5a98ce52e8ad5a0fc Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Mon, 22 
Jun 2020 16:07:52 +0200 Subject: [PATCH 37/40] Fix possible race conditions in update_leader (#1596) 1. Between the get_cluster() and update_leader() calls the K8s leader object might be updated from outside, and therefore the resource version will not match (error code 409). Since we are watching for all changes, the ObjectCache will likely have the most up-to-date version, and we take advantage of that. There is still a chance to hit a race condition, but it is smaller than before. Other DCS are actually free of this issue: Etcd updates are based on value comparison, while ZooKeeper and Consul rely on a session mechanism. 2. If the update still fails, recheck the resource version of the leader object and that the current node is still the leader there, and repeat the call. P.S. The leader race still relies on the version of the leader object as it was during the get_cluster() call. In addition to that, the handling of K8s API errors was fixed: we should retry on 500, not on 502. Close https://github.com/zalando/patroni/issues/1589 --- patroni/dcs/kubernetes.py | 114 ++++++++++++++++++++++++++------------ tests/test_kubernetes.py | 26 +++++++-- 2 files changed, 102 insertions(+), 38 deletions(-) diff --git a/patroni/dcs/kubernetes.py b/patroni/dcs/kubernetes.py index 85fbe212..44276da8 100644 --- a/patroni/dcs/kubernetes.py +++ b/patroni/dcs/kubernetes.py @@ -75,7 +75,7 @@ class CoreV1ApiProxy(object): try: return getattr(self._api, func)(*args, **kwargs) except k8s_client.rest.ApiException as e: - if e.status in (502, 503, 504) or e.headers and 'retry-after' in e.headers: # XXX + if e.status in (500, 503, 504) or e.headers and 'retry-after' in e.headers: # XXX raise KubernetesRetriableException(e) raise return wrapper @@ -148,6 +148,10 @@ class ObjectCache(Thread): with self._object_cache_lock: return self._object_cache.copy() + def get(self, name): + with self._object_cache_lock: + return self._object_cache.get(name) + def _build_cache(self): objects = self._list() return_type = 'V1' + objects.kind[:-4] @@ -244,11 +248,10 @@ class Kubernetes(AbstractDCS): self._api = CoreV1ApiProxy(config.get('use_endpoints')) self._should_create_config_service = self._api.use_endpoints self.reload_config(config) + # leader_observed_record, leader_resource_version, and leader_observed_time are used only for leader race!
self._leader_observed_record = {} self._leader_observed_time = None self._leader_resource_version = None - self._leader_observed_subsets = [] - self._config_resource_version = None self.__do_not_watch = False self._condition = Condition() @@ -307,14 +310,11 @@ class Kubernetes(AbstractDCS): with self._condition: self._wait_caches() - pods = self._pods.copy() - self.__my_pod = pods.get(self._name) - members = [self.member(pod) for pod in pods.values()] + members = [self.member(pod) for pod in self._pods.copy().values()] nodes = self._kinds.copy() config = nodes.get(self.config_path) metadata = config and config.metadata - self._config_resource_version = metadata.resource_version if metadata else None annotations = metadata and metadata.annotations or {} # get initialize flag @@ -332,8 +332,6 @@ class Kubernetes(AbstractDCS): leader = nodes.get(self.leader_path) metadata = leader and leader.metadata self._leader_resource_version = metadata.resource_version if metadata else None - self._leader_observed_subsets = leader.subsets \ - if self._api.use_endpoints and leader and leader.subsets else [] annotations = metadata and metadata.annotations or {} # get last leader operation @@ -419,9 +417,9 @@ class Kubernetes(AbstractDCS): return True return False - def __target_ref(self, leader_ip, pod): + def __target_ref(self, leader_ip, latest_subsets, pod): # we want to re-use existing target_ref if possible - for subset in self._leader_observed_subsets: + for subset in latest_subsets: for address in subset.addresses or []: if address.ip == leader_ip and address.target_ref and address.target_ref.name == self._name: return address.target_ref @@ -429,23 +427,24 @@ class Kubernetes(AbstractDCS): name=self._name, resource_version=pod.metadata.resource_version) def _map_subsets(self, endpoints, ips): + leader = self._kinds.get(self.leader_path) + latest_subsets = leader and leader.subsets or [] if not ips: # We want to have subsets empty - if self._leader_observed_subsets: + if latest_subsets: endpoints['subsets'] = [] return - pod = self.__my_pod + pod = self._pods.get(self._name) leader_ip = ips[0] or pod and pod.status.pod_ip # don't touch subsets if our (leader) ip is unknown or subsets is valid - if leader_ip and self.subsets_changed(self._leader_observed_subsets, leader_ip, self.__ports): + if leader_ip and self.subsets_changed(latest_subsets, leader_ip, self.__ports): kwargs = {'hostname': pod.spec.hostname, 'node_name': pod.spec.node_name, - 'target_ref': self.__target_ref(leader_ip, pod)} if pod else {} + 'target_ref': self.__target_ref(leader_ip, latest_subsets, pod)} if pod else {} address = k8s_client.V1EndpointAddress(ip=leader_ip, **kwargs) endpoints['subsets'] = [k8s_client.V1EndpointSubset(addresses=[address], ports=self.__ports)] - @catch_kubernetes_errors - def patch_or_create(self, name, annotations, resource_version=None, patch=False, retry=True, ips=None): + def _patch_or_create(self, name, annotations, resource_version=None, patch=False, retry=None, ips=None): metadata = {'namespace': self._namespace, 'name': name, 'labels': self._labels, 'annotations': annotations} if patch or resource_version: if resource_version is not None: @@ -463,20 +462,23 @@ class Kubernetes(AbstractDCS): body = k8s_client.V1Endpoints(**endpoints) else: body = k8s_client.V1ConfigMap(metadata=metadata) - ret = self.retry(func, self._namespace, body) if retry else func(self._namespace, body) + ret = retry(func, self._namespace, body) if retry else func(self._namespace, body) if ret: self._kinds.set(name, ret) 
return ret + @catch_kubernetes_errors + def patch_or_create(self, name, annotations, resource_version=None, patch=False, retry=True, ips=None): + if retry is True: + retry = self.retry + return self._patch_or_create(name, annotations, resource_version, patch, retry, ips) + def patch_or_create_config(self, annotations, resource_version=None, patch=False, retry=True): # SCOPE-config endpoint requires corresponding service otherwise it might be "cleaned" by k8s master if self._api.use_endpoints and not patch and not resource_version: self._should_create_config_service = True self._create_config_service() - ret = self.patch_or_create(self.config_path, annotations, resource_version, patch, retry) - if ret: - self._config_resource_version = ret.metadata.resource_version - return ret + return self.patch_or_create(self.config_path, annotations, resource_version, patch, retry) def _create_config_service(self): metadata = k8s_client.V1ObjectMeta(namespace=self._namespace, name=self.config_path, labels=self._labels) @@ -495,20 +497,58 @@ class Kubernetes(AbstractDCS): def _update_leader(self): """Unused""" + def _update_leader_with_retry(self, annotations, resource_version, ips): + retry = self._retry.copy() + + def _retry(*args, **kwargs): + return retry(*args, **kwargs) + + try: + return self._patch_or_create(self.leader_path, annotations, resource_version, ips=ips, retry=_retry) + except k8s_client.rest.ApiException as e: + if e.status == 409: + logger.warning('Concurrent update of %s', self.leader_path) + else: + logger.exception('Permission denied' if e.status == 403 else 'Unexpected error from Kubernetes API') + return False + except RetryFailedError: + return False + + deadline = retry.stoptime - time.time() + if deadline < 2: + return False + + retry.sleep_func(1) # Give a chance for ObjectCache to receive the latest version + + kind = self._kinds.get(self.leader_path) + kind_annotations = kind and kind.metadata.annotations or {} + kind_resource_version = kind and kind.metadata.resource_version + + # There is different leader or resource_version in cache didn't change + if kind and (kind_annotations.get(self._LEADER) != self._name or kind_resource_version == resource_version): + return False + + retry.deadline = deadline - 1 # Update deadline and retry + return self.patch_or_create(self.leader_path, annotations, kind_resource_version, ips=ips, retry=_retry) + def update_leader(self, last_operation, access_is_restricted=False): + kind = self._kinds.get(self.leader_path) + kind_annotations = kind and kind.metadata.annotations or {} + + if kind and kind_annotations.get(self._LEADER) != self._name: + return False + now = datetime.datetime.now(tzutc).isoformat() + leader_observed_record = kind_annotations or self._leader_observed_record annotations = {self._LEADER: self._name, 'ttl': str(self._ttl), 'renewTime': now, - 'acquireTime': self._leader_observed_record.get('acquireTime') or now, - 'transitions': self._leader_observed_record.get('transitions') or '0'} + 'acquireTime': leader_observed_record.get('acquireTime') or now, + 'transitions': leader_observed_record.get('transitions') or '0'} if last_operation: annotations[self._OPTIME] = last_operation + resource_version = kind and kind.metadata.resource_version ips = [] if access_is_restricted else self.__ips - - ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, ips=ips) - if ret: - self._leader_resource_version = ret.metadata.resource_version - return ret + return self._update_leader_with_retry(annotations, 
resource_version, ips) def attempt_to_acquire_leader(self, permanent=False): now = datetime.datetime.now(tzutc).isoformat() @@ -527,9 +567,7 @@ class Kubernetes(AbstractDCS): annotations['transitions'] = str(transitions) ips = [] if self._api.use_endpoints else None ret = self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, ips=ips) - if ret: - self._leader_resource_version = ret.metadata.resource_version - else: + if not ret: logger.info('Could not take out TTL lock') return ret @@ -545,6 +583,11 @@ class Kubernetes(AbstractDCS): patch = bool(self.cluster and isinstance(self.cluster.failover, Failover) and self.cluster.failover.index) return self.patch_or_create(self.failover_path, annotations, index, bool(index or patch), False) + @property + def _config_resource_version(self): + config = self._kinds.get(self.config_path) + return config and config.metadata.resource_version + def set_config_value(self, value, index=None): return self.patch_or_create_config({self._CONFIG: value}, index, bool(self._config_resource_version), False) @@ -567,6 +610,8 @@ class Kubernetes(AbstractDCS): 'annotations': {'status': json.dumps(data, separators=(',', ':'))}} body = k8s_client.V1Pod(metadata=k8s_client.V1ObjectMeta(**metadata)) ret = self._api.patch_namespaced_pod(self._name, self._namespace, body) + if ret: + self._pods.set(self._name, ret) if self._should_create_config_service: self._create_config_service() return ret @@ -580,11 +625,12 @@ class Kubernetes(AbstractDCS): """Unused""" def delete_leader(self, last_operation=None): - if self.cluster and isinstance(self.cluster.leader, Leader) and self.cluster.leader.name == self._name: + kind = self._kinds.get(self.leader_path) + if kind and (kind.metadata.annotations or {}).get(self._LEADER) == self._name: annotations = {self._LEADER: None} if last_operation: annotations[self._OPTIME] = last_operation - self.patch_or_create(self.leader_path, annotations, self._leader_resource_version, True, False, []) + self.patch_or_create(self.leader_path, annotations, kind.metadata.resource_version, True, False, []) self.reset_cluster() def cancel_initialization(self): diff --git a/tests/test_kubernetes.py b/tests/test_kubernetes.py index 1a04ce9a..03185348 100644 --- a/tests/test_kubernetes.py +++ b/tests/test_kubernetes.py @@ -101,8 +101,9 @@ class TestKubernetesConfigMaps(BaseTestKubernetes): def test_set_config_value(self): self.k.set_config_value('{}') - @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_pod', Mock(return_value=True)) - def test_touch_member(self): + @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_pod') + def test_touch_member(self, mock_patch_namespaced_pod): + mock_patch_namespaced_pod.return_value.metadata.resource_version = '10' self.k.touch_member({'role': 'replica'}) self.k._name = 'p-1' self.k.touch_member({'state': 'running', 'role': 'replica'}) @@ -142,15 +143,32 @@ class TestKubernetesEndpoints(BaseTestKubernetes): self.assertIsNotNone(self.k.update_leader('123')) args = mock_patch_namespaced_endpoints.call_args[0] self.assertEqual(args[2].subsets[0].addresses[0].target_ref.resource_version, '10') - self.k._leader_observed_subsets = [] + self.k._kinds._object_cache['test'].subsets[:] = [] self.assertIsNotNone(self.k.update_leader('123')) + self.k._kinds._object_cache['test'].metadata.annotations['leader'] = 'p-1' + self.assertFalse(self.k.update_leader('123')) @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_endpoints', mock_namespaced_kind) def 
test_update_leader_with_restricted_access(self): self.assertIsNotNone(self.k.update_leader('123', True)) + @patch.object(k8s_client.CoreV1Api, 'patch_namespaced_endpoints') + def test__update_leader_with_retry(self, mock_patch): + mock_patch.side_effect = k8s_client.rest.ApiException(502, '') + self.assertFalse(self.k.update_leader('123')) + mock_patch.side_effect = RetryFailedError('') + self.assertFalse(self.k.update_leader('123')) + mock_patch.side_effect = k8s_client.rest.ApiException(409, '') + with patch('time.time', Mock(side_effect=[0, 100, 200])): + self.assertFalse(self.k.update_leader('123')) + with patch('time.sleep', Mock()): + self.assertFalse(self.k.update_leader('123')) + mock_patch.side_effect = [k8s_client.rest.ApiException(409, ''), mock_namespaced_kind()] + self.k._kinds._object_cache['test'].metadata.resource_version = '2' + self.assertIsNotNone(self.k._update_leader_with_retry({}, '1', [])) + @patch.object(k8s_client.CoreV1Api, 'create_namespaced_endpoints', - Mock(side_effect=[k8s_client.rest.ApiException(502, ''), k8s_client.rest.ApiException(500, '')])) + Mock(side_effect=[k8s_client.rest.ApiException(500, ''), k8s_client.rest.ApiException(502, '')])) def test_delete_sync_state(self): self.assertFalse(self.k.delete_sync_state()) From 7f343c2c574f511615b04f97663107bc354853bb Mon Sep 17 00:00:00 2001 From: Alexander Kukushkin Date: Thu, 25 Jun 2020 16:24:21 +0200 Subject: [PATCH 38/40] Try to fetch missing WAL if pg_rewind complains about it (#1561) It could happen that the WAL segment required by `pg_rewind` doesn't exist in `pg_wal` anymore, and therefore `pg_rewind` can't find the checkpoint location before the diverging point. Starting from PostgreSQL 13, `pg_rewind` can use the `restore_command` for fetching missing WALs, but we can do better than that: on older PostgreSQL versions Patroni will parse the stdout and stderr of a failed rewind attempt, try to fetch the missing WAL by calling the `restore_command`, and repeat the attempt.
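The placeholder expansion implemented in `_fetch_missing_wal()` below follows the rules PostgreSQL documents for `restore_command`: `%f` expands to the bare WAL file name, `%p` to the destination path inside the WAL directory, `%r` to the restart-point file name (a dummy value is good enough for a one-off fetch), and `%%` to a literal percent sign. A self-contained sketch of that expansion (the function name is ours, not Patroni's):

```python
import os

def expand_restore_command(template, wal_dir, wal_filename):
    """Expand %f/%p/%r/%% placeholders the way PostgreSQL documents them."""
    out, i = '', 0
    while i < len(template):
        if template[i] == '%' and i + 1 < len(template):
            spec = template[i + 1]
            if spec == 'f':    # bare WAL file name
                out += wal_filename
            elif spec == 'p':  # full path where the file should be written
                out += os.path.join(wal_dir, wal_filename)
            elif spec == 'r':  # restart-point file; a dummy is fine for one fetch
                out += '000000010000000000000001'
            elif spec == '%':  # literal percent sign
                out += '%'
            else:              # unknown escape is kept verbatim
                out += '%' + spec
            i += 2
        else:
            out += template[i]
            i += 1
    return out

print(expand_restore_command('cp /wal_archive/%f %p', '/pgdata/pg_wal',
                             '0000000A00006AA100000068'))
# cp /wal_archive/0000000A00006AA100000068 /pgdata/pg_wal/0000000A00006AA100000068
```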
--- patroni/postgresql/__init__.py | 6 ++- patroni/postgresql/cancellable.py | 26 +++++----- patroni/postgresql/rewind.py | 79 +++++++++++++++++++++++++++---- tests/test_cancellable.py | 2 +- tests/test_rewind.py | 52 +++++++++++++++----- 5 files changed, 128 insertions(+), 37 deletions(-) diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index ce69c7fa..ad3541a4 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -137,6 +137,10 @@ class Postgresql(object): def callback(self): return self.config.get('callbacks') or {} + @property + def wal_dir(self): + return os.path.join(self._data_dir, 'pg_' + self.wal_name) + @property def wal_name(self): return 'wal' if self._major_version >= 100000 else 'xlog' @@ -743,7 +747,7 @@ class Postgresql(object): return self._cluster_info_state_get('timeline') def get_history(self, timeline): - history_path = os.path.join(self._data_dir, 'pg_' + self.wal_name, '{0:08X}.history'.format(timeline)) + history_path = os.path.join(self.wal_dir, '{0:08X}.history'.format(timeline)) history_mtime = mtime(history_path) if history_mtime: try: diff --git a/patroni/postgresql/cancellable.py b/patroni/postgresql/cancellable.py index ffb7040e..fc734d03 100644 --- a/patroni/postgresql/cancellable.py +++ b/patroni/postgresql/cancellable.py @@ -1,11 +1,9 @@ import logging -import os import psutil import subprocess from patroni.exceptions import PostgresException from patroni.utils import polling_loop -from six import string_types from threading import Lock logger = logging.getLogger(__name__) @@ -75,16 +73,16 @@ class CancellableSubprocess(CancellableExecutor): for s in ('stdin', 'stdout', 'stderr'): kwargs.pop(s, None) - communicate_input = 'communicate_input' in kwargs - if communicate_input: - input_data = kwargs.pop('communicate_input', None) - if not isinstance(input_data, string_types): - input_data = '' - if input_data and input_data[-1] != '\n': - input_data += '\n' + communicate = kwargs.pop('communicate', None) + if isinstance(communicate, dict): + input_data = communicate.get('input') + if input_data: + if input_data[-1] != '\n': + input_data += '\n' + input_data = input_data.encode('utf-8') kwargs['stdin'] = subprocess.PIPE - kwargs['stdout'] = open(os.devnull, 'w') - kwargs['stderr'] = subprocess.STDOUT + kwargs['stdout'] = subprocess.PIPE + kwargs['stderr'] = subprocess.PIPE try: with self._lock: @@ -95,10 +93,8 @@ class CancellableSubprocess(CancellableExecutor): started = self._start_process(*args, **kwargs) if started: - if communicate_input: - if input_data: - self._process.communicate(input_data) - self._process.stdin.close() + if isinstance(communicate, dict): + communicate['stdout'], communicate['stderr'] = self._process.communicate(input_data) return self._process.wait() finally: with self._lock: diff --git a/patroni/postgresql/rewind.py b/patroni/postgresql/rewind.py index b90cf038..937cdae2 100644 --- a/patroni/postgresql/rewind.py +++ b/patroni/postgresql/rewind.py @@ -250,21 +250,82 @@ class Rewind(object): def checkpoint_after_promote(self): return self._state == REWIND_STATUS.CHECKPOINT + def _fetch_missing_wal(self, restore_command, wal_filename): + cmd = '' + length = len(restore_command) + i = 0 + while i < length: + if restore_command[i] == '%' and i + 1 < length: + i += 1 + if restore_command[i] == 'p': + cmd += os.path.join(self._postgresql.wal_dir, wal_filename) + elif restore_command[i] == 'f': + cmd += wal_filename + elif restore_command[i] == 'r': + cmd += 
'000000010000000000000001' + elif restore_command[i] == '%': + cmd += '%' + else: + cmd += '%' + i -= 1 + else: + cmd += restore_command[i] + i += 1 + + logger.info('Trying to fetch the missing wal: %s', cmd) + return self._postgresql.cancellable.call(cmd, shell=True) == 0 + + def _find_missing_wal(self, data): + # could not open file "$PGDATA/pg_wal/0000000A00006AA100000068": No such file or directory + pattern = 'could not open file "' + for line in data.decode('utf-8').split('\n'): + b = line.find(pattern) + if b > -1: + b += len(pattern) + e = line.find('": ', b) + if e > -1: + waldir, wal_filename = os.path.split(line[b:e]) + if waldir.endswith(os.path.sep + 'pg_' + self._postgresql.wal_name) and len(wal_filename) == 24: + return wal_filename + def pg_rewind(self, r): # prepare pg_rewind connection env = self._postgresql.config.write_pgpass(r) - env['PGOPTIONS'] = '-c statement_timeout=0' + env.update(LANG='C', LC_ALL='C', PGOPTIONS='-c statement_timeout=0') dsn = self._postgresql.config.format_dsn(r, True) logger.info('running pg_rewind from %s', dsn) + restore_command = self._postgresql.config.get('recovery_conf', {}).get('restore_command') \ + if self._postgresql.major_version < 120000 else self._postgresql.get_guc_value('restore_command') + cmd = [self._postgresql.pgcommand('pg_rewind')] - if self._postgresql.major_version >= 130000 and self._postgresql.get_guc_value('restore_command'): + if self._postgresql.major_version >= 130000 and restore_command: cmd.append('--restore-target-wal') cmd.extend(['-D', self._postgresql.data_dir, '--source-server', dsn]) - try: - return self._postgresql.cancellable.call(cmd, env=env) == 0 - except OSError: - return False + + while True: + results = {} + ret = self._postgresql.cancellable.call(cmd, env=env, communicate=results) + + logger.info('pg_rewind exit code=%s', ret) + if ret is None: + return False + + logger.info(' stdout=%s', results['stdout'].decode('utf-8')) + logger.info(' stderr=%s', results['stderr'].decode('utf-8')) + if ret == 0: + return True + + if not restore_command or self._postgresql.major_version >= 130000: + return False + + missing_wal = self._find_missing_wal(results['stderr']) or self._find_missing_wal(results['stdout']) + if not missing_wal: + return False + + if not self._fetch_missing_wal(restore_command, missing_wal): + logger.info('Failed to fetch WAL segment %s required for pg_rewind', missing_wal) + return False def execute(self, leader): if self._postgresql.is_running() and not self._postgresql.stop(checkpoint=False): @@ -335,17 +396,17 @@ class Rewind(object): logger.exception('Error when reading postmaster.opts') return result - def single_user_mode(self, command=None, options=None): + def single_user_mode(self, communicate=None, options=None): """run a given command in a single-user mode. 
If the command is empty - then just start and stop""" cmd = [self._postgresql.pgcommand('postgres'), '--single', '-D', self._postgresql.data_dir] for opt, val in sorted((options or {}).items()): cmd.extend(['-c', '{0}={1}'.format(opt, val)]) # need a database name to connect cmd.append('template1') - return self._postgresql.cancellable.call(cmd, communicate_input=command) + return self._postgresql.cancellable.call(cmd, communicate=communicate) def cleanup_archive_status(self): - status_dir = os.path.join(self._postgresql.data_dir, 'pg_' + self._postgresql.wal_name, 'archive_status') + status_dir = os.path.join(self._postgresql.wal_dir, 'archive_status') try: for f in os.listdir(status_dir): path = os.path.join(status_dir, f) diff --git a/tests/test_cancellable.py b/tests/test_cancellable.py index 7a0e24b7..b555472b 100644 --- a/tests/test_cancellable.py +++ b/tests/test_cancellable.py @@ -13,7 +13,7 @@ class TestCancellableSubprocess(unittest.TestCase): def test_call(self): self.c.cancel() - self.assertRaises(PostgresException, self.c.call, communicate_input=None) + self.assertRaises(PostgresException, self.c.call) def test__kill_children(self): self.c._process_children = [Mock()] diff --git a/tests/test_rewind.py b/tests/test_rewind.py index 0f5c2805..db4bce39 100644 --- a/tests/test_rewind.py +++ b/tests/test_rewind.py @@ -18,6 +18,28 @@ class MockThread(object): self._target(*self._args) +def mock_cancellable_call(*args, **kwargs): + communicate = kwargs.pop('communicate', None) + if isinstance(communicate, dict): + communicate.update(stdout=b'', stderr=b'pg_rewind: error: could not open file ' + + b'"data/postgresql0/pg_xlog/000000010000000000000003": No such file') + return 1 + + +def mock_cancellable_call0(*args, **kwargs): + communicate = kwargs.pop('communicate', None) + if isinstance(communicate, dict): + communicate.update(stdout=b'', stderr=b'') + return 0 + + +def mock_cancellable_call1(*args, **kwargs): + communicate = kwargs.pop('communicate', None) + if isinstance(communicate, dict): + communicate.update(stdout=b'', stderr=b'') + return 1 + + @patch('subprocess.call', Mock(return_value=0)) @patch('psycopg2.connect', psycopg2_connect) class TestRewind(BaseTestPostgresql): @@ -36,16 +58,23 @@ class TestRewind(BaseTestPostgresql): self.p.config._config['use_pg_rewind'] = False self.assertFalse(self.r.can_rewind) - @patch.object(Postgresql, 'major_version', PropertyMock(return_value=130000)) - @patch.object(CancellableSubprocess, 'call') - def test_pg_rewind(self, mock_cancellable_subprocess_call): + def test_pg_rewind(self): r = {'user': '', 'host': '', 'port': '', 'database': '', 'password': ''} - mock_cancellable_subprocess_call.return_value = 0 - with patch('subprocess.check_output', Mock(return_value=b'foo')): - self.assertTrue(self.r.pg_rewind(r)) - mock_cancellable_subprocess_call.side_effect = OSError - with patch('subprocess.check_output', Mock(side_effect=Exception)): - self.assertFalse(self.r.pg_rewind(r)) + with patch.object(Postgresql, 'major_version', PropertyMock(return_value=130000)),\ + patch.object(CancellableSubprocess, 'call', Mock(return_value=None)): + with patch('subprocess.check_output', Mock(return_value=b'boo')): + self.assertFalse(self.r.pg_rewind(r)) + with patch('subprocess.check_output', Mock(side_effect=Exception)): + self.assertFalse(self.r.pg_rewind(r)) + + with patch.object(Postgresql, 'major_version', PropertyMock(return_value=120000)),\ + patch('subprocess.check_output', Mock(return_value=b'foo %f %p %r %% % %')): + with 
patch.object(CancellableSubprocess, 'call', mock_cancellable_call):
+                self.assertFalse(self.r.pg_rewind(r))
+            with patch.object(CancellableSubprocess, 'call', mock_cancellable_call0):
+                self.assertTrue(self.r.pg_rewind(r))
+            with patch.object(CancellableSubprocess, 'call', mock_cancellable_call1):
+                self.assertFalse(self.r.pg_rewind(r))
 
     @patch.object(Rewind, 'can_rewind', PropertyMock(return_value=True))
     def test__get_local_timeline_lsn(self):
@@ -61,7 +90,7 @@ class TestRewind(BaseTestPostgresql):
             with patch.object(MockCursor, 'fetchone', Mock(side_effect=[(0, 0, 1, 1,), Exception])):
                 self.r.rewind_or_reinitialize_needed_and_possible(self.leader)
 
-    @patch.object(CancellableSubprocess, 'call', Mock(return_value=0))
+    @patch.object(CancellableSubprocess, 'call', mock_cancellable_call)
     @patch.object(Postgresql, 'checkpoint', side_effect=['', '1'],)
     @patch.object(Postgresql, 'stop', Mock(return_value=False))
     @patch.object(Postgresql, 'start', Mock())
@@ -161,7 +190,8 @@
     @patch('psutil.Popen')
     def test_single_user_mode(self, subprocess_popen_mock):
         subprocess_popen_mock.return_value.wait.return_value = 0
-        self.assertEqual(self.r.single_user_mode('CHECKPOINT', {'archive_mode': 'on'}), 0)
+        subprocess_popen_mock.return_value.communicate.return_value = ('', '')
+        self.assertEqual(self.r.single_user_mode({'input': 'CHECKPOINT'}, {'archive_mode': 'on'}), 0)
 
     @patch('os.listdir', Mock(side_effect=[OSError, ['a', 'b']]))
     @patch('os.unlink', Mock(side_effect=OSError))

From cbff544b9c4877f3372d89288fbc4f225e1024d3 Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin
Date: Thu, 25 Jun 2020 16:27:57 +0200
Subject: [PATCH 39/40] Implement patronictl flush switchover (#1554)

This includes implementing the `DELETE /switchover` REST API endpoint.

Close https://github.com/zalando/patroni/issues/1376
---
 docs/rest_api.rst |  7 +++++--
 patroni/api.py    | 14 ++++++++++++++
 patroni/ctl.py    | 48 +++++++++++++++++++++++++++++++++++------------
 tests/test_api.py |  9 +++++++++
 tests/test_ctl.py | 22 +++++++++++++++++++++-
 5 files changed, 85 insertions(+), 15 deletions(-)

diff --git a/docs/rest_api.rst b/docs/rest_api.rst
index 93d5b52b..f0f9b6ab 100644
--- a/docs/rest_api.rst
+++ b/docs/rest_api.rst
@@ -299,7 +299,10 @@ Example: schedule a switchover from the leader to any other healthy replica in t
 Depending on the situation the request might finish with a different HTTP status code and body. The status code **200** is returned when the switchover or failover successfully completed. If the switchover was successfully scheduled, Patroni will return HTTP status code **202**. In case something went wrong, the error status code (one of **400**, **412** or **503**) will be returned with some details in the response body. For more information please check the source code of ``patroni/api.py:do_POST_failover()`` method.
 
-The switchover and failover endpoints are used by ``patronictl switchover`` and ``patronictl failover``, respectively.
+- ``DELETE /switchover``: delete the scheduled switchover
+
+The ``POST /switchover`` and ``POST /failover`` endpoints are used by ``patronictl switchover`` and ``patronictl failover``, respectively.
+The ``DELETE /switchover`` endpoint is used by ``patronictl flush switchover``.
 
 
 Restart endpoint
@@ -315,7 +318,7 @@
 
 - ``DELETE /restart``: delete the scheduled restart
 
-``POST /restart`` and ``DELETE /restart`` endpoints are used by ``patronictl restart`` and ``patronictl flush`` respectively.
+``POST /restart`` and ``DELETE /restart`` endpoints are used by ``patronictl restart`` and ``patronictl flush restart`` respectively. Reload endpoint diff --git a/patroni/api.py b/patroni/api.py index 4616595e..23c4e6bf 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -292,6 +292,20 @@ class RestApiHandler(BaseHTTPRequestHandler): code = 404 self._write_response(code, data) + @check_auth + def do_DELETE_switchover(self): + failover = self.server.patroni.dcs.get_cluster().failover + if failover and failover.scheduled_at: + if not self.server.patroni.dcs.manual_failover('', '', index=failover.index): + return self.send_error(409) + else: + data = "scheduled switchover deleted" + code = 200 + else: + data = "no switchover is scheduled" + code = 404 + self._write_response(code, data) + @check_auth def do_POST_reinitialize(self): request = self._read_json_content(body_is_optional=True) diff --git a/patroni/ctl.py b/patroni/ctl.py index efdda0fe..94df4a5b 100644 --- a/patroni/ctl.py +++ b/patroni/ctl.py @@ -242,6 +242,15 @@ def get_any_member(cluster, role='master', member=None): return m +def get_all_members_leader_first(cluster): + leader_name = cluster.leader.member.name if cluster.leader and cluster.leader.member.api_url else None + if leader_name: + yield cluster.leader.member + for member in cluster.members: + if member.api_url and member.name != leader_name: + yield member + + def get_cursor(cluster, connect_parameters, role='master', member=None): member = get_any_member(cluster, role=role, member=member) if member is None: @@ -935,25 +944,45 @@ def scaffold(obj, cluster_name, sysid): click.echo("Cluster {0} has been created successfully".format(cluster_name)) -@ctl.command('flush', help='Discard scheduled events (restarts only currently)') +@ctl.command('flush', help='Discard scheduled events') @click.argument('cluster_name') @click.argument('member_names', nargs=-1) -@click.argument('target', type=click.Choice(['restart'])) +@click.argument('target', type=click.Choice(['restart', 'switchover'])) @click.option('--role', '-r', help='Flush only members with this role', default='any', type=click.Choice(['master', 'replica', 'any'])) @option_force @click.pass_obj def flush(obj, cluster_name, member_names, force, role, target): - cluster = get_dcs(obj, cluster_name).get_cluster() + dcs = get_dcs(obj, cluster_name) + cluster = dcs.get_cluster() - members = get_members(cluster, cluster_name, member_names, role, force, 'flush') - for member in members: - if target == 'restart': + if target == 'restart': + for member in get_members(cluster, cluster_name, member_names, role, force, 'flush'): if member.data.get('scheduled_restart'): r = request_patroni(member, 'delete', 'restart') check_response(r, member.name, 'flush scheduled restart') else: click.echo('No scheduled restart for member {0}'.format(member.name)) + elif target == 'switchover': + failover = cluster.failover + if not failover or not failover.scheduled_at: + return click.echo('No pending scheduled switchover') + for member in get_all_members_leader_first(cluster): + try: + r = request_patroni(member, 'delete', 'switchover') + if r.status in (200, 404): + prefix = 'Success' if r.status == 200 else 'Failed' + return click.echo('{0}: {1}'.format(prefix, r.data.decode('utf-8'))) + except Exception as err: + logging.warning(str(err)) + logging.warning('Member %s is not accessible', member.name) + + click.echo('Failed: member={0}, status_code={1}, ({2})'.format( + member.name, r.status, r.data.decode('utf-8'))) + + 
logging.warning('Failing over to DCS') + click.echo('{0} Could not find any accessible member of cluster {1}'.format(timestamp(), cluster_name)) + dcs.manual_failover('', '', index=failover.index) def wait_until_pause_is_applied(dcs, paused, old_cluster): @@ -980,12 +1009,7 @@ def toggle_pause(config, cluster_name, paused, wait): if cluster.is_paused() == paused: raise PatroniCtlException('Cluster is {0} paused'.format(paused and 'already' or 'not')) - members = [] - if cluster.leader and cluster.leader.member.api_url: - members.append(cluster.leader.member) - members.extend([m for m in cluster.members if m.api_url and (not members or members[0].name != m.name)]) - - for member in members: + for member in get_all_members_leader_first(cluster): try: r = request_patroni(member, 'patch', 'config', {'pause': paused or None}) except Exception as err: diff --git a/tests/test_api.py b/tests/test_api.py index 92ea89c2..32de0869 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -311,6 +311,15 @@ class TestRestApiHandler(unittest.TestCase): request = 'DELETE /restart HTTP/1.0' + self._authorization self.assertIsNotNone(MockRestApiServer(RestApiHandler, request)) + @patch.object(MockPatroni, 'dcs') + def test_do_DELETE_switchover(self, mock_dcs): + request = 'DELETE /switchover HTTP/1.0' + self._authorization + self.assertIsNotNone(MockRestApiServer(RestApiHandler, request)) + mock_dcs.manual_failover.return_value = False + self.assertIsNotNone(MockRestApiServer(RestApiHandler, request)) + mock_dcs.get_cluster.return_value.failover = None + self.assertIsNotNone(MockRestApiServer(RestApiHandler, request)) + @patch.object(MockPatroni, 'dcs') def test_do_POST_reinitialize(self, mock_dcs): cluster = mock_dcs.get_cluster.return_value diff --git a/tests/test_ctl.py b/tests/test_ctl.py index e2cb11de..02883af3 100644 --- a/tests/test_ctl.py +++ b/tests/test_ctl.py @@ -449,7 +449,7 @@ class TestCtl(unittest.TestCase): @patch('patroni.ctl.get_dcs') @patch.object(PoolManager, 'request', Mock(return_value=MockResponse())) - def test_flush(self, mock_get_dcs): + def test_flush_restart(self, mock_get_dcs): mock_get_dcs.return_value = self.e mock_get_dcs.return_value.get_cluster = get_cluster_initialized_with_leader @@ -462,6 +462,26 @@ class TestCtl(unittest.TestCase): result = self.runner.invoke(ctl, ['flush', 'dummy', 'restart', '--force']) assert 'Failed: flush scheduled restart' in result.output + @patch('patroni.ctl.get_dcs') + @patch.object(PoolManager, 'request', Mock(return_value=MockResponse())) + def test_flush_switchover(self, mock_get_dcs): + mock_get_dcs.return_value = self.e + + mock_get_dcs.return_value.get_cluster = get_cluster_initialized_with_leader + result = self.runner.invoke(ctl, ['flush', 'dummy', 'switchover']) + assert 'No pending scheduled switchover' in result.output + + scheduled_at = datetime.now(tzutc) + timedelta(seconds=600) + mock_get_dcs.return_value.get_cluster = Mock( + return_value=get_cluster_initialized_with_leader(Failover(1, 'a', 'b', scheduled_at))) + result = self.runner.invoke(ctl, ['flush', 'dummy', 'switchover']) + assert result.output.startswith('Success: ') + + mock_get_dcs.return_value.manual_failover = Mock() + with patch.object(PoolManager, 'request', side_effect=[MockResponse(409), Exception]): + result = self.runner.invoke(ctl, ['flush', 'dummy', 'switchover']) + assert 'Could not find any accessible member of cluster' in result.output + @patch.object(PoolManager, 'request') @patch('patroni.ctl.get_dcs') @patch('patroni.ctl.polling_loop', 
Mock(return_value=[1]))

From 8eb01c77b67021d55260c53b6a7713f88743504f Mon Sep 17 00:00:00 2001
From: Alexander Kukushkin
Date: Mon, 29 Jun 2020 14:49:25 +0200
Subject: [PATCH 40/40] Don't fire on_reload when promoting to standby_leader on 13+ (#1552)

PostgreSQL 13 finally introduced the possibility of changing
`primary_conninfo` without a restart. A reload is enough, but when the
role changes from `replica` to `standby_leader` we want to call only
the `on_role_change` callback and skip `on_reload`, because the two
callbacks would duplicate each other.
---
 patroni/postgresql/__init__.py | 7 ++++---
 tests/test_ha.py               | 4 +++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py
index ad3541a4..ac48f953 100644
--- a/patroni/postgresql/__init__.py
+++ b/patroni/postgresql/__init__.py
@@ -615,9 +615,9 @@ class Postgresql(object):
         except psycopg2.Error:
             pass
 
-    def reload(self):
+    def reload(self, block_callbacks=False):
         ret = self.pg_ctl('reload')
-        if ret:
+        if ret and not block_callbacks:
             self.call_nowait(ACTION_ON_RELOAD)
         return ret
 
@@ -777,7 +777,8 @@ class Postgresql(object):
         if self.is_running():
             if do_reload:
                 self.config.write_postgresql_conf()
-                self.reload()
+                if self.reload(block_callbacks=change_role) and change_role:
+                    self.set_role(role)
             else:
                 self.restart(block_callbacks=change_role, role=role)
         else:
diff --git a/tests/test_ha.py b/tests/test_ha.py
index c99152eb..86a3c6b6 100644
--- a/tests/test_ha.py
+++ b/tests/test_ha.py
@@ -153,7 +153,6 @@ def run_async(self, func, args=()):
 @patch.object(Postgresql, 'is_leader', Mock(return_value=True))
 @patch.object(Postgresql, 'timeline_wal_position', Mock(return_value=(1, 10, 1)))
 @patch.object(Postgresql, '_cluster_info_state_get', Mock(return_value=3))
-@patch.object(Postgresql, 'call_nowait', Mock(return_value=True))
 @patch.object(Postgresql, 'data_directory_empty', Mock(return_value=False))
 @patch.object(Postgresql, 'controldata', Mock(return_value={
     'Database system identifier': SYSID,
@@ -705,6 +704,9 @@ class TestHa(PostgresInit):
         self.ha._leader_timeline = 1
         self.assertEqual(self.ha.run_cycle(), 'promoted self to a standby leader because i had the session lock')
         self.assertEqual(self.ha.run_cycle(), 'no action. i am the standby leader with the lock')
+        self.p.set_role('replica')
+        self.p.config.check_recovery_conf = Mock(return_value=(True, False))
+        self.assertEqual(self.ha.run_cycle(), 'promoted self to a standby leader because i had the session lock')
 
     def test_process_healthy_standby_cluster_as_cascade_replica(self):
         self.p.is_leader = false
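Taken together, the hunks in patroni/postgresql/__init__.py above boil
down to the following decision (a condensed sketch under assumed names;
pg stands for the Postgresql object and the enclosing method is
simplified, not the actual Patroni code):

    def apply_recovery_conf_change(pg, do_reload, change_role, role):
        # sketch only: reload when possible, otherwise restart
        if do_reload:
            pg.config.write_postgresql_conf()
            # block_callbacks=change_role suppresses on_reload while the
            # role is changing, so only on_role_change fires afterwards
            if pg.reload(block_callbacks=change_role) and change_role:
                pg.set_role(role)
        else:
            pg.restart(block_callbacks=change_role, role=role)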