diff --git a/Dockerfile.citus b/Dockerfile.citus
new file mode 100644
index 00000000..a4c9ed04
--- /dev/null
+++ b/Dockerfile.citus
@@ -0,0 +1,173 @@
+## This Dockerfile is meant to aid in building and debugging Patroni while developing on your local machine
+## It has all the necessary components to play with/debug a single-node appliance running etcd
+ARG PG_MAJOR=15
+ARG COMPRESS=false
+ARG PGHOME=/home/postgres
+ARG PGDATA=$PGHOME/data
+ARG LC_ALL=C.UTF-8
+ARG LANG=C.UTF-8
+
+FROM postgres:$PG_MAJOR as builder
+
+ARG PGHOME
+ARG PGDATA
+ARG LC_ALL
+ARG LANG
+
+ENV ETCDVERSION=3.3.13 CONFDVERSION=0.16.0
+
+RUN set -ex \
+    && export DEBIAN_FRONTEND=noninteractive \
+    && echo 'APT::Install-Recommends "0";\nAPT::Install-Suggests "0";' > /etc/apt/apt.conf.d/01norecommend \
+    && apt-get update -y \
+    # the base image is Debian, which has the patroni package. We will install all required dependencies
+    && apt-cache depends patroni | sed -n -e 's/.*Depends: \(python3-.\+\)$/\1/p' \
+        | grep -Ev '^python3-(sphinx|etcd|consul|kazoo|kubernetes)' \
+        | xargs apt-get install -y vim curl less jq locales haproxy sudo \
+            python3-etcd python3-kazoo python3-pip busybox \
+            net-tools iputils-ping --fix-missing \
+    && curl https://install.citusdata.com/community/deb.sh | bash \
+    && apt-get -y install postgresql-$PG_MAJOR-citus-11.1 \
+    && pip3 install dumb-init \
+\
+    # Cleanup all locales but en_US.UTF-8
+    && find /usr/share/i18n/charmaps/ -type f ! -name UTF-8.gz -delete \
+    && find /usr/share/i18n/locales/ -type f ! -name en_US ! -name en_GB ! -name i18n* ! -name iso14651_t1 ! -name iso14651_t1_common ! -name 'translit_*' -delete \
+    && echo 'en_US.UTF-8 UTF-8' > /usr/share/i18n/SUPPORTED \
+\
+    # Make sure we have an en_US.UTF-8 locale available
+    && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 \
+\
+    # haproxy dummy config
+    && echo 'global\n stats socket /run/haproxy/admin.sock mode 660 level admin' > /etc/haproxy/haproxy.cfg \
+\
+    # vim config
+    && echo 'syntax on\nfiletype plugin indent on\nset mouse-=a\nautocmd FileType yaml setlocal ts=2 sts=2 sw=2 expandtab' > /etc/vim/vimrc.local \
+\
+    # Prepare postgres/patroni/haproxy environment
+    && mkdir -p $PGHOME/.config/patroni /patroni /run/haproxy \
+    && ln -s ../../postgres0.yml $PGHOME/.config/patroni/patronictl.yaml \
+    && ln -s /patronictl.py /usr/local/bin/patronictl \
+    && sed -i "s|/var/lib/postgresql.*|$PGHOME:/bin/bash|" /etc/passwd \
+    && chown -R postgres:postgres /var/log \
+\
+    # Download etcd
+    && curl -sL https://github.com/coreos/etcd/releases/download/v${ETCDVERSION}/etcd-v${ETCDVERSION}-linux-$(dpkg --print-architecture).tar.gz \
+        | tar xz -C /usr/local/bin --strip=1 --wildcards --no-anchored etcd etcdctl \
+\
+    # Download confd
+    && curl -sL https://github.com/kelseyhightower/confd/releases/download/v${CONFDVERSION}/confd-${CONFDVERSION}-linux-$(dpkg --print-architecture) \
+        > /usr/local/bin/confd && chmod +x /usr/local/bin/confd \
+    # Prepare client cert for HAProxy
+    && cat /etc/ssl/private/ssl-cert-snakeoil.key /etc/ssl/certs/ssl-cert-snakeoil.pem > /etc/ssl/private/ssl-cert-snakeoil.crt \
+\
+    # Clean up all useless packages and some files
+    && apt-get purge -y --allow-remove-essential python3-pip gzip bzip2 util-linux e2fsprogs \
+        libmagic1 bsdmainutils login ncurses-bin libmagic-mgc e2fslibs bsdutils \
+        exim4-config gnupg-agent dirmngr libpython2.7-stdlib libpython2.7-minimal \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/* \
+        /root/.cache \
+        /var/cache/debconf/* \
+        /etc/rc?.d \
+        /etc/systemd \
+        /docker-entrypoint* \
+        /sbin/pam* \
+        /sbin/swap* \
+        /sbin/unix* \
+        /usr/local/bin/gosu \
+        /usr/sbin/[acgipr]* \
+        /usr/sbin/*user* \
+        /usr/share/doc* \
+        /usr/share/man \
+        /usr/share/info \
+        /usr/share/i18n/locales/translit_hangul \
+        /usr/share/locale/?? \
+        /usr/share/locale/??_?? \
+        /usr/share/postgresql/*/man \
+        /usr/share/postgresql-common/pg_wrapper \
+        /usr/share/vim/vim80/doc \
+        /usr/share/vim/vim80/lang \
+        /usr/share/vim/vim80/tutor \
+#        /var/lib/dpkg/info/* \
+    && find /usr/bin -xtype l -delete \
+    && find /var/log -type f -exec truncate --size 0 {} \; \
+    && find /usr/lib/python3/dist-packages -name '*test*' | xargs rm -fr \
+    && find /lib/$(uname -m)-linux-gnu/security -type f ! -name pam_env.so ! -name pam_permit.so ! -name pam_unix.so -delete
+
+# perform compression if necessary
+ARG COMPRESS
+RUN if [ "$COMPRESS" = "true" ]; then \
+        set -ex \
+        # Allow certain sudo commands from postgres
+        && echo 'postgres ALL=(ALL) NOPASSWD: /bin/tar xpJf /a.tar.xz -C /, /bin/rm /a.tar.xz, /bin/ln -snf dash /bin/sh' >> /etc/sudoers \
+        && ln -snf busybox /bin/sh \
+        && arch=$(uname -m) \
+        && darch=$(uname -m | sed 's/_/-/') \
+        && files="/bin/sh /usr/bin/sudo /usr/lib/sudo/sudoers.so /lib/$arch-linux-gnu/security/pam_*.so" \
+        && libs="$(ldd $files | awk '{print $3;}' | grep '^/' | sort -u) /lib/ld-linux-$darch.so.* /lib/$arch-linux-gnu/ld-linux-$darch.so.* /lib/$arch-linux-gnu/libnsl.so.* /lib/$arch-linux-gnu/libnss_compat.so.* /lib/$arch-linux-gnu/libnss_files.so.*" \
+        && (echo /var/run $files $libs | tr ' ' '\n' && realpath $files $libs) | sort -u | sed 's/^\///' > /exclude \
+        && find /etc/alternatives -xtype l -delete \
+        && save_dirs="usr lib var bin sbin etc/ssl etc/init.d etc/alternatives etc/apt" \
+        && XZ_OPT=-e9v tar -X /exclude -cpJf a.tar.xz $save_dirs \
+        # we call "cat /exclude" to avoid including files from the $save_dirs that are also among
+        # the exceptions listed in the /exclude, as "uniq -u" eliminates all non-unique lines.
+        # By calling "cat /exclude" a second time we guarantee that there will be at least two lines
+        # for each exception and therefore they will be excluded from the output passed to 'rm'.
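+        # For example, if /exclude contains the entry "usr/bin/sudo", the stream fed to
+        # "sort | uniq -u" carries that path three times (once from find, twice from cat),
+        # so "uniq -u" drops it and the file survives; a path produced by find alone
+        # appears exactly once and is therefore passed on to rm.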
+        && /bin/busybox sh -c "(find $save_dirs -not -type d && cat /exclude /exclude && echo exclude) | sort | uniq -u | xargs /bin/busybox rm" \
+        && /bin/busybox --install -s \
+        && /bin/busybox sh -c "find $save_dirs -type d -depth -exec rmdir -p {} \; 2> /dev/null"; \
+    else \
+        /bin/busybox --install -s; \
+    fi
+
+FROM scratch
+COPY --from=builder / /
+
+LABEL maintainer="Alexander Kukushkin "
+
+ARG PG_MAJOR
+ARG COMPRESS
+ARG PGHOME
+ARG PGDATA
+ARG LC_ALL
+ARG LANG
+
+ARG PGBIN=/usr/lib/postgresql/$PG_MAJOR/bin
+
+ENV LC_ALL=$LC_ALL LANG=$LANG EDITOR=/usr/bin/editor
+ENV PGDATA=$PGDATA PATH=$PATH:$PGBIN
+
+COPY patroni /patroni/
+COPY extras/confd/conf.d/haproxy.toml /etc/confd/conf.d/
+COPY extras/confd/templates/haproxy-citus.tmpl /etc/confd/templates/haproxy.tmpl
+COPY patroni*.py docker/entrypoint.sh /
+COPY postgres?.yml $PGHOME/
+
+WORKDIR $PGHOME
+
+RUN sed -i 's/env python/&3/' /patroni*.py \
+    # "fix" patroni configs
+    && sed -i 's/^\( connect_address:\| - host\)/#&/' postgres?.yml \
+    && sed -i 's/^ listen: 127.0.0.1/ listen: 0.0.0.0/' postgres?.yml \
+    && sed -i "s|^\( data_dir: \).*|\1$PGDATA|" postgres?.yml \
+    && sed -i "s|^#\( bin_dir: \).*|\1$PGBIN|" postgres?.yml \
+    && sed -i 's/^ - encoding: UTF8/ - locale: en_US.UTF-8\n&/' postgres?.yml \
+    && sed -i 's/^scope:/log:\n loggers:\n patroni.postgresql.citus: DEBUG\n#&/' postgres?.yml \
+    && sed -i 's/^\(name\|etcd\| host\| authentication\| pg_hba\| parameters\):/#&/' postgres?.yml \
+    && sed -i 's/^ \(replication\|superuser\|rewind\|unix_socket_directories\|\(\( \)\{0,1\}\(username\|password\)\)\):/#&/' postgres?.yml \
+    && sed -i 's/^postgresql:/&\n basebackup:\n checkpoint: fast/' postgres?.yml \
+    && sed -i 's|^ parameters:| pg_hba:\n - local all all trust\n - hostssl replication all all md5 clientcert=verify-ca\n - hostssl all all all md5 clientcert=verify-ca\n&\n max_connections: 100\n shared_buffers: 16MB\n ssl: "on"\n ssl_ca_file: /etc/ssl/certs/ssl-cert-snakeoil.pem\n ssl_cert_file: /etc/ssl/certs/ssl-cert-snakeoil.pem\n ssl_key_file: /etc/ssl/private/ssl-cert-snakeoil.key\n citus.node_conninfo: "sslrootcert=/etc/ssl/certs/ssl-cert-snakeoil.pem sslkey=/etc/ssl/private/ssl-cert-snakeoil.key sslcert=/etc/ssl/certs/ssl-cert-snakeoil.pem sslmode=verify-ca"|' postgres?.yml \
+    && sed -i 's/^#\(ctl\| certfile\| keyfile\)/\1/' postgres?.yml \
+    && sed -i 's|^# cafile: .*$| verify_client: required\n cafile: /etc/ssl/certs/ssl-cert-snakeoil.pem|' postgres?.yml \
+    && sed -i 's|^# cacert: .*$| cacert: /etc/ssl/certs/ssl-cert-snakeoil.pem|' postgres?.yml \
+    && sed -i 's/^# insecure: .*/ insecure: on/' postgres?.yml \
+    # client cert for HAProxy to access Patroni REST API
+    && if [ "$COMPRESS" = "true" ]; then chmod u+s /usr/bin/sudo; fi \
+    && chmod +s /bin/ping \
+    && chown -R postgres:postgres $PGHOME /run /etc/haproxy
+
+USER postgres
+
+ENTRYPOINT ["/bin/sh", "/entrypoint.sh"]
diff --git a/README.rst b/README.rst
index 6da1a9a4..b1474592 100644
--- a/README.rst
+++ b/README.rst
@@ -14,6 +14,8 @@ We call Patroni a "template" because it is far from being a one-size-fits-all or
 
 Currently supported PostgreSQL versions: 9.3 to 15.
 
+**Note to Citus users**: Starting from 3.0 Patroni nicely integrates with `Citus <https://www.citusdata.com>`__. Please check the `Citus support <https://patroni.readthedocs.io/en/latest/citus.html>`__ page for more information.
+
 **Note to Kubernetes users**: Patroni can run natively on top of Kubernetes. Take a look at the `Kubernetes <https://patroni.readthedocs.io/en/latest/kubernetes.html>`__ chapter of the Patroni documentation.
 
 .. contents::
diff --git a/docker-compose-citus.yml b/docker-compose-citus.yml
new file mode 100644
index 00000000..f141bc3b
--- /dev/null
+++ b/docker-compose-citus.yml
@@ -0,0 +1,139 @@
+# docker compose file for running a Citus cluster
+# with a 3-node etcd v3 cluster as the DCS and one haproxy node.
+# The Citus cluster has a coordinator (3 nodes)
+# and two worker clusters (2 nodes each).
+#
+# Before starting it up you need to build the docker image:
+# $ docker build -f Dockerfile.citus -t patroni-citus .
+# The cluster can be started with:
+# $ docker-compose -f docker-compose-citus.yml up -d
+# You can read more about it at:
+# https://github.com/zalando/patroni/blob/master/docker/README.md#citus-cluster
+version: "2"
+
+networks:
+    demo:
+
+services:
+    etcd1: &etcd
+        image: patroni-citus
+        networks: [ demo ]
+        environment:
+            ETCDCTL_API: 3
+            ETCD_LISTEN_PEER_URLS: http://0.0.0.0:2380
+            ETCD_LISTEN_CLIENT_URLS: http://0.0.0.0:2379
+            ETCD_INITIAL_CLUSTER: etcd1=http://etcd1:2380,etcd2=http://etcd2:2380,etcd3=http://etcd3:2380
+            ETCD_INITIAL_CLUSTER_STATE: new
+            ETCD_INITIAL_CLUSTER_TOKEN: tutorial
+        container_name: demo-etcd1
+        hostname: etcd1
+        command: etcd -name etcd1 -initial-advertise-peer-urls http://etcd1:2380
+
+    etcd2:
+        <<: *etcd
+        container_name: demo-etcd2
+        hostname: etcd2
+        command: etcd -name etcd2 -initial-advertise-peer-urls http://etcd2:2380
+
+    etcd3:
+        <<: *etcd
+        container_name: demo-etcd3
+        hostname: etcd3
+        command: etcd -name etcd3 -initial-advertise-peer-urls http://etcd3:2380
+
+    haproxy:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: haproxy
+        container_name: demo-haproxy
+        ports:
+            - "5000:5000"  # Access to the coordinator primary
+            - "5001:5001"  # Load-balancing across worker primaries
+        command: haproxy
+        environment: &haproxy_env
+            ETCDCTL_API: 3
+            ETCDCTL_ENDPOINTS: http://etcd1:2379,http://etcd2:2379,http://etcd3:2379
+            PATRONI_ETCD3_HOSTS: "'etcd1:2379','etcd2:2379','etcd3:2379'"
+            PATRONI_SCOPE: demo
+            PATRONI_CITUS_GROUP: 0
+            PATRONI_CITUS_DATABASE: citus
+            PGSSLMODE: verify-ca
+            PGSSLKEY: /etc/ssl/private/ssl-cert-snakeoil.key
+            PGSSLCERT: /etc/ssl/certs/ssl-cert-snakeoil.pem
+            PGSSLROOTCERT: /etc/ssl/certs/ssl-cert-snakeoil.pem
+
+    coord1:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: coord1
+        container_name: demo-coord1
+        environment: &coord_env
+            <<: *haproxy_env
+            PATRONI_NAME: coord1
+            PATRONI_CITUS_GROUP: 0
+
+    coord2:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: coord2
+        container_name: demo-coord2
+        environment:
+            <<: *coord_env
+            PATRONI_NAME: coord2
+
+    coord3:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: coord3
+        container_name: demo-coord3
+        environment:
+            <<: *coord_env
+            PATRONI_NAME: coord3
+
+    work1-1:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: work1-1
+        container_name: demo-work1-1
+        environment: &work1_env
+            <<: *haproxy_env
+            PATRONI_NAME: work1-1
+            PATRONI_CITUS_GROUP: 1
+
+    work1-2:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: work1-2
+        container_name: demo-work1-2
+        environment:
+            <<: *work1_env
+            PATRONI_NAME: work1-2
+
+    work2-1:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: work2-1
+        container_name: demo-work2-1
+        environment: &work2_env
+            <<: *haproxy_env
+            PATRONI_NAME: work2-1
+            PATRONI_CITUS_GROUP: 2
+
+    work2-2:
+        image: patroni-citus
+        networks: [ demo ]
+        env_file: docker/patroni.env
+        hostname: work2-2
+        container_name: demo-work2-2
+        environment:
+            <<: *work2_env
+            PATRONI_NAME: work2-2
diff --git a/docker-compose.yml b/docker-compose.yml
index ab3d294b..626391de 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,12 @@
 # docker compose file for running a 3-node PostgreSQL cluster
 # with 3-node etcd cluster as the DCS and one haproxy node
+#
+# requires a patroni image built from the Dockerfile:
+# $ docker build -t patroni .
+# The cluster can be started with:
+# $ docker-compose up -d
+# You can read more about it at:
+# https://github.com/zalando/patroni/blob/master/docker/README.md
 version: "2"
 
 networks:
diff --git a/docker/README.md b/docker/README.md
index db8c2f60..f2b30ab6 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -1,10 +1,10 @@
-# Patroni Dockerfile
-You can run Patroni in a docker container using this Dockerfile
+# Dockerfile and Dockerfile.citus
+You can run Patroni in a docker container using these Dockerfiles.
 
-This Dockerfile is meant in aiding development of Patroni and quick testing of features. It is not a production-worthy
-Dockerfile
+They are meant to aid development of Patroni and quick testing of features; they are not production-worthy!
 
     docker build -t patroni .
+    docker build -f Dockerfile.citus -t patroni-citus .
 
 # Examples
@@ -12,7 +12,10 @@ Dockerfile
 
     docker run -d patroni
 
-## Three-node Patroni cluster with three-node etcd cluster and one haproxy container using docker-compose
+## Three-node Patroni cluster
+
+In addition to the three Patroni containers, the stack starts three containers with etcd (forming a three-node cluster) and one container with haproxy.
+The haproxy listens on ports 5000 (connects to the primary) and 5001 (does load-balancing between healthy standbys).
 
 Example session:
@@ -92,7 +95,8 @@ Example session:
     b2e169fcb8a34028: name=etcd1 peerURLs=http://etcd1:2380 clientURLs=http://etcd1:2379 isLeader=false
 
     postgres@patroni1:~$ exit
-    $ psql -h localhost -p 5000 -U postgres -W
+    $ docker exec -ti demo-haproxy bash
+    postgres@haproxy:~$ psql -h localhost -p 5000 -U postgres -W
     Password: postgres
     psql (11.2 (Ubuntu 11.2-1.pgdg18.04+1), server 10.7 (Debian 10.7-1.pgdg90+1))
     Type "help" for help.
@@ -105,7 +109,7 @@
 
     localhost/postgres=# \q
 
-    $ psql -h localhost -p 5001 -U postgres -W
+    postgres@haproxy:~$ psql -h localhost -p 5001 -U postgres -W
     Password: postgres
     psql (11.2 (Ubuntu 11.2-1.pgdg18.04+1), server 10.7 (Debian 10.7-1.pgdg90+1))
     Type "help" for help.
@@ -115,3 +119,188 @@
     ───────────────────
      t
     (1 row)
+
+## Citus cluster
+
+The stack starts three containers with etcd (forming a three-node etcd cluster), seven containers with Patroni+PostgreSQL+Citus (three coordinator nodes, and two worker clusters with two nodes each), and one container with haproxy.
+The haproxy listens on ports 5000 (connects to the coordinator primary) and 5001 (does load-balancing between worker primary nodes).
+
+Example session:
+
+    $ docker-compose -f docker-compose-citus.yml up -d
+    Creating demo-work2-1 ... done
+    Creating demo-work1-1 ... done
+    Creating demo-etcd2   ... done
+    Creating demo-etcd1   ... done
+    Creating demo-coord3  ... done
+    Creating demo-etcd3   ... done
+    Creating demo-coord1  ... done
+    Creating demo-haproxy ... done
+    Creating demo-work2-2 ... done
+    Creating demo-coord2  ... done
+    Creating demo-work1-2 ... done
+
+    $ docker ps
+    CONTAINER ID   IMAGE           COMMAND                  CREATED         STATUS         PORTS                              NAMES
+    852d8885a612   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 3 seconds                                      demo-coord3
+    cdd692f947ab   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 3 seconds                                      demo-work1-2
+    9f4e340b36da   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 3 seconds                                      demo-etcd3
+    d69c129a960a   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 4 seconds                                      demo-etcd1
+    c5849689b8cd   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 4 seconds                                      demo-coord1
+    c9d72bd6217d   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 3 seconds                                      demo-work2-1
+    24b1b43efa05   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 4 seconds                                      demo-coord2
+    cb0cc2b4ca0a   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 3 seconds                                      demo-work2-2
+    9796c6b8aad5   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 5 seconds                                      demo-work1-1
+    8baccd74dcae   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 4 seconds                                      demo-etcd2
+    353ec62a0187   patroni-citus   "/bin/sh /entrypoint…"   6 seconds ago   Up 4 seconds   0.0.0.0:5000-5001->5000-5001/tcp   demo-haproxy
+
+    $ docker logs demo-coord1
+    2023-01-05 15:09:31,295 INFO: Selected new etcd server http://172.27.0.4:2379
+    2023-01-05 15:09:31,388 INFO: Lock owner: None; I am coord1
+    2023-01-05 15:09:31,501 INFO: trying to bootstrap a new cluster
+    ...
+    2023-01-05 15:09:45,096 INFO: postmaster pid=39
+    localhost:5432 - no response
+    2023-01-05 15:09:45.137 UTC [39] LOG:  starting PostgreSQL 15.1 (Debian 15.1-1.pgdg110+1) on x86_64-pc-linux-gnu, compiled by gcc (Debian 10.2.1-6) 10.2.1 20210110, 64-bit
+    2023-01-05 15:09:45.137 UTC [39] LOG:  listening on IPv4 address "0.0.0.0", port 5432
+    2023-01-05 15:09:45.152 UTC [39] LOG:  listening on Unix socket "/var/run/postgresql/.s.PGSQL.5432"
+    2023-01-05 15:09:45.177 UTC [43] LOG:  database system was shut down at 2023-01-05 15:09:32 UTC
+    2023-01-05 15:09:45.193 UTC [39] LOG:  database system is ready to accept connections
+    localhost:5432 - accepting connections
+    localhost:5432 - accepting connections
+    2023-01-05 15:09:46,139 INFO: establishing a new patroni connection to the postgres cluster
+    2023-01-05 15:09:46,208 INFO: running post_bootstrap
+    2023-01-05 15:09:47.209 UTC [55] LOG:  starting maintenance daemon on database 16386 user 10
+    2023-01-05 15:09:47.209 UTC [55] CONTEXT:  Citus maintenance daemon for database 16386 user 10
+    2023-01-05 15:09:47,215 WARNING: Could not activate Linux watchdog device: "Can't open watchdog device: [Errno 2] No such file or directory: '/dev/watchdog'"
+    2023-01-05 15:09:47.446 UTC [41] LOG:  checkpoint starting: immediate force wait
+    2023-01-05 15:09:47,466 INFO: initialized a new cluster
+    2023-01-05 15:09:47,594 DEBUG: query(SELECT nodeid, groupid, nodename, nodeport, noderole FROM pg_catalog.pg_dist_node WHERE noderole = 'primary', ())
+    2023-01-05 15:09:47,594 INFO: establishing a new patroni connection to the postgres cluster
+    2023-01-05 15:09:47,467 INFO: Lock owner: coord1; I am coord1
+    2023-01-05 15:09:47,613 DEBUG: query(SELECT pg_catalog.citus_set_coordinator_host(%s, %s, 'primary', 'default'), ('172.27.0.6', 5432))
+    2023-01-05 15:09:47,924 INFO: no action. I am (coord1), the leader with the lock
+    2023-01-05 15:09:51.282 UTC [41] LOG:  checkpoint complete: wrote 1086 buffers (53.0%); 0 WAL file(s) added, 0 removed, 0 recycled; write=0.029 s, sync=3.746 s, total=3.837 s; sync files=280, longest=0.028 s, average=0.014 s; distance=8965 kB, estimate=8965 kB
+    2023-01-05 15:09:51.283 UTC [41] LOG:  checkpoint starting: immediate force wait
+    2023-01-05 15:09:51.495 UTC [41] LOG:  checkpoint complete: wrote 18 buffers (0.9%); 0 WAL file(s) added, 0 removed, 0 recycled; write=0.044 s, sync=0.091 s, total=0.212 s; sync files=15, longest=0.015 s, average=0.007 s; distance=67 kB, estimate=8076 kB
+    2023-01-05 15:09:57,467 INFO: Lock owner: coord1; I am coord1
+    2023-01-05 15:09:57,569 INFO: Assigning synchronous standby status to ['coord3']
+    server signaled
+    2023-01-05 15:09:57.574 UTC [39] LOG:  received SIGHUP, reloading configuration files
+    2023-01-05 15:09:57.580 UTC [39] LOG:  parameter "synchronous_standby_names" changed to "coord3"
+    2023-01-05 15:09:59,637 INFO: Synchronous standby status assigned to ['coord3']
+    2023-01-05 15:09:59,638 DEBUG: query(SELECT pg_catalog.citus_add_node(%s, %s, %s, 'primary', 'default'), ('172.27.0.2', 5432, 1))
+    2023-01-05 15:09:59.690 UTC [67] LOG:  standby "coord3" is now a synchronous standby with priority 1
+    2023-01-05 15:09:59.690 UTC [67] STATEMENT:  START_REPLICATION SLOT "coord3" 0/3000000 TIMELINE 1
+    2023-01-05 15:09:59,694 INFO: no action. I am (coord1), the leader with the lock
+    2023-01-05 15:09:59,704 DEBUG: query(SELECT pg_catalog.citus_add_node(%s, %s, %s, 'primary', 'default'), ('172.27.0.8', 5432, 2))
+    2023-01-05 15:10:07,625 INFO: no action. I am (coord1), the leader with the lock
+    2023-01-05 15:10:17,579 INFO: no action. I am (coord1), the leader with the lock
+
+    $ docker exec -ti demo-haproxy bash
+    postgres@haproxy:~$ etcdctl member list
+    1bab629f01fa9065, started, etcd3, http://etcd3:2380, http://172.27.0.10:2379
+    8ecb6af518d241cc, started, etcd2, http://etcd2:2380, http://172.27.0.4:2379
+    b2e169fcb8a34028, started, etcd1, http://etcd1:2380, http://172.27.0.7:2379
+
+    postgres@haproxy:~$ etcdctl get --keys-only --prefix /service/demo
+    /service/demo/0/config
+    /service/demo/0/initialize
+    /service/demo/0/leader
+    /service/demo/0/members/coord1
+    /service/demo/0/members/coord2
+    /service/demo/0/members/coord3
+    /service/demo/0/status
+    /service/demo/0/sync
+    /service/demo/1/config
+    /service/demo/1/initialize
+    /service/demo/1/leader
+    /service/demo/1/members/work1-1
+    /service/demo/1/members/work1-2
+    /service/demo/1/status
+    /service/demo/1/sync
+    /service/demo/2/config
+    /service/demo/2/initialize
+    /service/demo/2/leader
+    /service/demo/2/members/work2-1
+    /service/demo/2/members/work2-2
+    /service/demo/2/status
+    /service/demo/2/sync
+
+    postgres@haproxy:~$ psql -h localhost -p 5000 -U postgres -d citus
+    Password for user postgres: postgres
+    psql (15.1 (Debian 15.1-1.pgdg110+1))
+    SSL connection (protocol: TLSv1.3, cipher: TLS_AES_256_GCM_SHA384, compression: off)
+    Type "help" for help.
+
+    citus=# select pg_is_in_recovery();
+     pg_is_in_recovery
+    -------------------
+     f
+    (1 row)
+
+    citus=# table pg_dist_node;
+     nodeid | groupid |  nodename  | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+    --------+---------+------------+----------+----------+-------------+----------+----------+-------------+----------------+------------------
+          1 |       0 | 172.27.0.6 |     5432 | default  | t           | t        | primary  | default     | t              | f
+          2 |       1 | 172.27.0.2 |     5432 | default  | t           | t        | primary  | default     | t              | t
+          3 |       2 | 172.27.0.8 |     5432 | default  | t           | t        | primary  | default     | t              | t
+    (3 rows)
+
+    citus=# \q
+
+    postgres@haproxy:~$ patronictl list
+
+    + Citus cluster: demo ----------+--------------+---------+----+-----------+
+    | Group | Member  | Host        | Role         | State   | TL | Lag in MB |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+    |     0 | coord1  | 172.27.0.6  | Leader       | running |  1 |           |
+    |     0 | coord2  | 172.27.0.5  | Replica      | running |  1 |         0 |
+    |     0 | coord3  | 172.27.0.9  | Sync Standby | running |  1 |         0 |
+    |     1 | work1-1 | 172.27.0.2  | Leader       | running |  1 |           |
+    |     1 | work1-2 | 172.27.0.12 | Sync Standby | running |  1 |         0 |
+    |     2 | work2-1 | 172.27.0.11 | Sync Standby | running |  1 |         0 |
+    |     2 | work2-2 | 172.27.0.8  | Leader       | running |  1 |           |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+
+    postgres@haproxy:~$ patronictl switchover --group 2 --force
+    Current cluster topology
+
+    + Citus cluster: demo (group: 2, 7185185529556963355) +-----------+
+    | Member  | Host        | Role         | State   | TL | Lag in MB |
+    +---------+-------------+--------------+---------+----+-----------+
+    | work2-1 | 172.27.0.11 | Sync Standby | running |  1 |         0 |
+    | work2-2 | 172.27.0.8  | Leader       | running |  1 |           |
+    +---------+-------------+--------------+---------+----+-----------+
+    2023-01-05 15:29:29.54204 Successfully switched over to "work2-1"
+
+    + Citus cluster: demo (group: 2, 7185185529556963355) -------+
+    | Member  | Host        | Role    | State   | TL | Lag in MB |
+    +---------+-------------+---------+---------+----+-----------+
+    | work2-1 | 172.27.0.11 | Leader  | running |  1 |           |
+    | work2-2 | 172.27.0.8  | Replica | stopped |    |   unknown |
+    +---------+-------------+---------+---------+----+-----------+
+
+    postgres@haproxy:~$ patronictl list
+
+    + Citus cluster: demo ----------+--------------+---------+----+-----------+
+    | Group | Member  | Host        | Role         | State   | TL | Lag in MB |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+    |     0 | coord1  | 172.27.0.6  | Leader       | running |  1 |           |
+    |     0 | coord2  | 172.27.0.5  | Replica      | running |  1 |         0 |
+    |     0 | coord3  | 172.27.0.9  | Sync Standby | running |  1 |         0 |
+    |     1 | work1-1 | 172.27.0.2  | Leader       | running |  1 |           |
+    |     1 | work1-2 | 172.27.0.12 | Sync Standby | running |  1 |         0 |
+    |     2 | work2-1 | 172.27.0.11 | Leader       | running |  2 |           |
+    |     2 | work2-2 | 172.27.0.8  | Sync Standby | running |  2 |         0 |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+
+    postgres@haproxy:~$ psql -h localhost -p 5000 -U postgres -d citus
+    Password for user postgres: postgres
+    psql (15.1 (Debian 15.1-1.pgdg110+1))
+    SSL connection (protocol: TLSv1.3, cipher: TLS_AES_256_GCM_SHA384, compression: off)
+    Type "help" for help.
+
+    citus=# table pg_dist_node;
+     nodeid | groupid |  nodename   | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards
+    --------+---------+-------------+----------+----------+-------------+----------+----------+-------------+----------------+------------------
+          1 |       0 | 172.27.0.6  |     5432 | default  | t           | t        | primary  | default     | t              | f
+          3 |       2 | 172.27.0.11 |     5432 | default  | t           | t        | primary  | default     | t              | t
+          2 |       1 | 172.27.0.2  |     5432 | default  | t           | t        | primary  | default     | t              | t
+    (3 rows)
diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh
index 13d473d5..fb30bee7 100755
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -23,7 +23,7 @@ case "$1" in
             done
             set -- "$@" zookeeper -node "$PATRONI_ZOOKEEPER_HOSTS"
         else
-            while ! etcdctl cluster-health 2> /dev/null; do
+            while ! etcdctl member list 2> /dev/null; do
                 sleep 1
             done
             set -- "$@" etcdv3
@@ -63,5 +63,13 @@ export PATRONI_REPLICATION_USERNAME="${PATRONI_REPLICATION_USERNAME:-replicator}
 export PATRONI_REPLICATION_PASSWORD="${PATRONI_REPLICATION_PASSWORD:-replicate}"
 export PATRONI_SUPERUSER_USERNAME="${PATRONI_SUPERUSER_USERNAME:-postgres}"
 export PATRONI_SUPERUSER_PASSWORD="${PATRONI_SUPERUSER_PASSWORD:-postgres}"
+export PATRONI_REPLICATION_SSLMODE="${PATRONI_REPLICATION_SSLMODE:-$PGSSLMODE}"
+export PATRONI_REPLICATION_SSLKEY="${PATRONI_REPLICATION_SSLKEY:-$PGSSLKEY}"
+export PATRONI_REPLICATION_SSLCERT="${PATRONI_REPLICATION_SSLCERT:-$PGSSLCERT}"
+export PATRONI_REPLICATION_SSLROOTCERT="${PATRONI_REPLICATION_SSLROOTCERT:-$PGSSLROOTCERT}"
+export PATRONI_SUPERUSER_SSLMODE="${PATRONI_SUPERUSER_SSLMODE:-$PGSSLMODE}"
+export PATRONI_SUPERUSER_SSLKEY="${PATRONI_SUPERUSER_SSLKEY:-$PGSSLKEY}"
+export PATRONI_SUPERUSER_SSLCERT="${PATRONI_SUPERUSER_SSLCERT:-$PGSSLCERT}"
+export PATRONI_SUPERUSER_SSLROOTCERT="${PATRONI_SUPERUSER_SSLROOTCERT:-$PGSSLROOTCERT}"
 
 exec python3 /patroni.py postgres0.yml
diff --git a/docs/ENVIRONMENT.rst b/docs/ENVIRONMENT.rst
index d189b63f..c2579f0e 100644
--- a/docs/ENVIRONMENT.rst
+++ b/docs/ENVIRONMENT.rst
@@ -33,6 +33,13 @@ It is possible to create new database users right after the successful initializ
 
 Example: defining ``PATRONI_admin_PASSWORD=strongpasswd`` and ``PATRONI_admin_OPTIONS='createrole,createdb'`` will cause creation of the user **admin** with the password **strongpasswd** that is allowed to create other users and databases.
 
+Citus
+-----
+Enables integration of Patroni with :ref:`Citus <citus>`. If configured, Patroni will take care of registering Citus worker nodes on the coordinator. You can find more information about Citus support :ref:`here <citus>`.
+
+- **PATRONI\_CITUS\_GROUP**: the Citus group id, integer. Use ``0`` for the coordinator and ``1``, ``2``, etc. for workers
+- **PATRONI\_CITUS\_DATABASE**: the database where the ``citus`` extension should be created. Must be the same on the coordinator and all workers. Currently only one database is supported.
+
 Consul
 ------
 - **PATRONI\_CONSUL\_HOST**: the host:port for the Consul local agent.
diff --git a/docs/SETTINGS.rst b/docs/SETTINGS.rst
index a0e6e8af..e5d26937 100644
--- a/docs/SETTINGS.rst
+++ b/docs/SETTINGS.rst
@@ -112,6 +112,15 @@ Bootstrap configuration
 - **- createdb**
 - **post\_bootstrap** or **post\_init**: An additional script that will be executed after initializing the cluster. The script receives a connection string URL (with the cluster superuser as a user name). The PGPASSFILE variable is set to the location of pgpass file.
 
+.. _citus_settings:
+
+Citus
+-----
+Enables integration of Patroni with :ref:`Citus <citus>`. If configured, Patroni will take care of registering Citus worker nodes on the coordinator. You can find more information about Citus support :ref:`here <citus>`.
+
+- **group**: the Citus group id, integer. Use ``0`` for the coordinator and ``1``, ``2``, etc. for workers
+- **database**: the database where the ``citus`` extension should be created. Must be the same on the coordinator and all workers. Currently only one database is supported.
+
 .. _consul_settings:
 
 Consul
@@ -320,6 +329,8 @@ PostgreSQL
 - **replica\_method**: for each create_replica_methods other than basebackup, you would add a configuration section of the same name. At a minimum, this should include "command" with a full path to the actual script to be executed. Other configuration parameters will be passed along to the script in the form "parameter=value".
 - **pre\_promote**: a fencing script that executes during a failover after acquiring the leader lock but before promoting the replica. If the script exits with a non-zero code, Patroni does not promote the replica and removes the leader key from DCS.
 
+.. _restapi_settings:
+
 REST API
 --------
 - **restapi**:
diff --git a/docs/citus.rst b/docs/citus.rst
new file mode 100644
index 00000000..a48df30c
--- /dev/null
+++ b/docs/citus.rst
@@ -0,0 +1,352 @@
+.. _citus:
+
+Citus support
+=============
+
+Patroni makes it extremely simple to deploy `Multi-Node Citus`__ clusters.
+
+__ https://docs.citusdata.com/en/stable/installation/multi_node.html
+
+TL;DR
+-----
+
+There are only a few simple rules you need to follow:
+
+1. The Citus extension must be available on all nodes. The absolute minimum
+   supported Citus version is 10.0, but, to get all the benefits of transparent
+   switchovers and restarts of workers, we recommend using at least Citus 11.2.
+2. The cluster name (``scope``) must be the same for all Citus nodes!
+3. Superuser credentials must be the same on the coordinator and all worker
+   nodes, and ``pg_hba.conf`` should allow superuser access between all nodes.
+4. :ref:`REST API <restapi_settings>` access should be allowed from worker
+   nodes to the coordinator. E.g., credentials should be the same, and if
+   configured, client certificates from worker nodes must be accepted by the
+   coordinator.
+5. Add the following section to the ``patroni.yaml``:
+
+.. code:: YAML
+
+    citus:
+      group: X  # 0 for coordinator and 1, 2, 3, etc for workers
+      database: citus  # must be the same on all nodes
+
+After that you just need to start Patroni and it will handle the rest (a short
+verification sketch follows the list):
+
+1. The ``citus`` extension will be automatically added to ``shared_preload_libraries``.
+2. If ``max_prepared_transactions`` isn't explicitly set in the global
+   :ref:`dynamic configuration <dynamic_configuration>`, Patroni will
+   automatically set it to ``2*max_connections``.
+3. The ``citus.database`` will be automatically created, followed by ``CREATE EXTENSION citus``.
+4. The current superuser :ref:`credentials <postgresql_settings>` will be added to the ``pg_dist_authinfo``
+   table to allow cross-node communication. Don't forget to update them if
+   you later decide to change the superuser username/password/sslcert/sslkey!
+5. The coordinator primary node will automatically discover worker primary
+   nodes and add them to the ``pg_dist_node`` table using the
+   ``citus_add_node()`` function.
+6. Patroni will also maintain ``pg_dist_node`` in case a failover/switchover
+   on the coordinator or worker clusters occurs.
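+
+A quick way to convince yourself that these automatic steps happened is to
+connect to the coordinator and inspect the result. The following is a minimal
+verification sketch, not part of Patroni itself: it assumes ``psycopg2`` is
+installed and reuses the demo credentials and the haproxy port from the
+docker-compose setup (host, port, and password are illustrative only):
+
+.. code:: python
+
+    import psycopg2
+
+    # Connect to the coordinator primary (port 5000 is the haproxy entry in the demo).
+    conn = psycopg2.connect(host='localhost', port=5000, user='postgres',
+                            password='postgres', dbname='citus')
+    with conn.cursor() as cur:
+        # 1. "citus" was automatically added to shared_preload_libraries.
+        cur.execute('SHOW shared_preload_libraries')
+        assert 'citus' in cur.fetchone()[0]
+
+        # 2. unless overridden, max_prepared_transactions == 2 * max_connections.
+        cur.execute('SHOW max_prepared_transactions')
+        mpt = int(cur.fetchone()[0])
+        cur.execute('SHOW max_connections')
+        assert mpt == 2 * int(cur.fetchone()[0])
+
+        # 5. worker primaries were discovered and registered by the coordinator.
+        cur.execute("SELECT groupid, nodename, nodeport FROM pg_catalog.pg_dist_node"
+                    " WHERE noderole = 'primary' ORDER BY groupid")
+        for groupid, nodename, nodeport in cur.fetchall():
+            print(groupid, nodename, nodeport)
+    conn.close()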
+
+patronictl
+----------
+
+Coordinator and worker clusters are physically different PostgreSQL/Patroni
+clusters that are only logically grouped together using Citus. Therefore, in
+most cases it is not possible to manage them as a single entity.
+
+This results in two major differences in ``patronictl`` behaviour when
+``patroni.yaml`` has the ``citus`` section, compared with the usual behaviour:
+
+1. The ``list`` and ``topology`` commands by default output all members of the
+   Citus formation (coordinators and workers). The new ``Group`` column
+   indicates which Citus group they belong to.
+2. A new option named ``--group`` is introduced for all ``patronictl``
+   commands. For some commands the default value for the group might be
+   taken from ``patroni.yaml``. For example, ``patronictl pause`` will
+   enable the maintenance mode by default for the ``group`` that is set in the
+   ``citus`` section, while for example for ``patronictl switchover`` or
+   ``patronictl remove`` the group must be explicitly specified.
+
+An example of ``patronictl list`` output for the Citus cluster::
+
+    postgres@coord1:~$ patronictl list demo
+
+    + Citus cluster: demo ----------+--------------+---------+----+-----------+
+    | Group | Member  | Host        | Role         | State   | TL | Lag in MB |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+    |     0 | coord1  | 172.27.0.10 | Replica      | running |  1 |         0 |
+    |     0 | coord2  | 172.27.0.6  | Sync Standby | running |  1 |         0 |
+    |     0 | coord3  | 172.27.0.4  | Leader       | running |  1 |           |
+    |     1 | work1-1 | 172.27.0.8  | Sync Standby | running |  1 |         0 |
+    |     1 | work1-2 | 172.27.0.2  | Leader       | running |  1 |           |
+    |     2 | work2-1 | 172.27.0.5  | Sync Standby | running |  1 |         0 |
+    |     2 | work2-2 | 172.27.0.7  | Leader       | running |  1 |           |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+
+If we add the ``--group`` option, the output will change to::
+
+    postgres@coord1:~$ patronictl list demo --group 0
+
+    + Citus cluster: demo (group: 0, 7179854923829112860) -----------+
+    | Member | Host        | Role         | State   | TL | Lag in MB |
+    +--------+-------------+--------------+---------+----+-----------+
+    | coord1 | 172.27.0.10 | Replica      | running |  1 |         0 |
+    | coord2 | 172.27.0.6  | Sync Standby | running |  1 |         0 |
+    | coord3 | 172.27.0.4  | Leader       | running |  1 |           |
+    +--------+-------------+--------------+---------+----+-----------+
+
+    postgres@coord1:~$ patronictl list demo --group 1
+
+    + Citus cluster: demo (group: 1, 7179854923881963547) -----------+
+    | Member  | Host       | Role         | State   | TL | Lag in MB |
+    +---------+------------+--------------+---------+----+-----------+
+    | work1-1 | 172.27.0.8 | Sync Standby | running |  1 |         0 |
+    | work1-2 | 172.27.0.2 | Leader       | running |  1 |           |
+    +---------+------------+--------------+---------+----+-----------+
+
+Citus worker switchover
+-----------------------
+
+When a switchover is orchestrated for a Citus worker node, Citus offers the
+opportunity to make the switchover close to transparent for an application.
+Because the application connects to the coordinator, which in turn connects to
+the worker nodes, it is possible with Citus to `pause` the SQL traffic on
+the coordinator for the shards hosted on a worker node. The switchover then
+happens while the traffic is kept on the coordinator, and resumes as soon as a
+new primary worker node is ready to accept read-write queries.
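+
+To observe this transparency from the application side, one can keep writing
+through the coordinator while a worker switchover is running and check that no
+statement fails (this mirrors what the acceptance test in
+``features/steps/citus.py`` does with a background thread). A minimal sketch,
+assuming ``psycopg2``, the demo coordinator reachable on ``localhost:5000``,
+and an already created distributed table ``public.d(id int)``:
+
+.. code:: python
+
+    import time
+    import psycopg2
+
+    conn = psycopg2.connect(host='localhost', port=5000, user='postgres',
+                            password='postgres', dbname='citus')
+    conn.autocommit = True
+
+    errors = 0
+    with conn.cursor() as cur:
+        # Run "patronictl switchover demo --group 2 --force" while this loop is active.
+        for i in range(1000):
+            try:
+                cur.execute('INSERT INTO public.d VALUES (%s)', (i,))
+            except psycopg2.Error:
+                errors += 1  # while traffic is paused the INSERT should block, not fail
+            time.sleep(0.01)
+    print('failed statements:', errors)  # expected: 0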
+
+An example of ``patronictl switchover`` on the worker cluster::
+
+    postgres@coord1:~$ patronictl switchover demo
+
+    + Citus cluster: demo ----------+--------------+---------+----+-----------+
+    | Group | Member  | Host        | Role         | State   | TL | Lag in MB |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+    |     0 | coord1  | 172.27.0.10 | Replica      | running |  1 |         0 |
+    |     0 | coord2  | 172.27.0.6  | Sync Standby | running |  1 |         0 |
+    |     0 | coord3  | 172.27.0.4  | Leader       | running |  1 |           |
+    |     1 | work1-1 | 172.27.0.8  | Leader       | running |  1 |           |
+    |     1 | work1-2 | 172.27.0.2  | Sync Standby | running |  1 |         0 |
+    |     2 | work2-1 | 172.27.0.5  | Sync Standby | running |  1 |         0 |
+    |     2 | work2-2 | 172.27.0.7  | Leader       | running |  1 |           |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+    Citus group: 2
+    Master [work2-2]:
+    Candidate ['work2-1'] []:
+    When should the switchover take place (e.g. 2022-12-22T08:02 ) [now]:
+    Current cluster topology
+
+    + Citus cluster: demo (group: 2, 7179854924063375386) -----------+
+    | Member  | Host       | Role         | State   | TL | Lag in MB |
+    +---------+------------+--------------+---------+----+-----------+
+    | work2-1 | 172.27.0.5 | Sync Standby | running |  1 |         0 |
+    | work2-2 | 172.27.0.7 | Leader       | running |  1 |           |
+    +---------+------------+--------------+---------+----+-----------+
+    Are you sure you want to switchover cluster demo, demoting current master work2-2? [y/N]: y
+    2022-12-22 07:02:40.33003 Successfully switched over to "work2-1"
+
+    + Citus cluster: demo (group: 2, 7179854924063375386) ------+
+    | Member  | Host       | Role    | State   | TL | Lag in MB |
+    +---------+------------+---------+---------+----+-----------+
+    | work2-1 | 172.27.0.5 | Leader  | running |  1 |           |
+    | work2-2 | 172.27.0.7 | Replica | stopped |    |   unknown |
+    +---------+------------+---------+---------+----+-----------+
+
+    postgres@coord1:~$ patronictl list demo
+
+    + Citus cluster: demo ----------+--------------+---------+----+-----------+
+    | Group | Member  | Host        | Role         | State   | TL | Lag in MB |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+    |     0 | coord1  | 172.27.0.10 | Replica      | running |  1 |         0 |
+    |     0 | coord2  | 172.27.0.6  | Sync Standby | running |  1 |         0 |
+    |     0 | coord3  | 172.27.0.4  | Leader       | running |  1 |           |
+    |     1 | work1-1 | 172.27.0.8  | Leader       | running |  1 |           |
+    |     1 | work1-2 | 172.27.0.2  | Sync Standby | running |  1 |         0 |
+    |     2 | work2-1 | 172.27.0.5  | Leader       | running |  2 |           |
+    |     2 | work2-2 | 172.27.0.7  | Sync Standby | running |  2 |         0 |
+    +-------+---------+-------------+--------------+---------+----+-----------+
+
+And this is how it looks on the coordinator side::
+
+    # The worker primary notifies the coordinator that it is going to execute "pg_ctl stop".
+    2022-12-22 07:02:38,636 DEBUG: query("BEGIN")
+    2022-12-22 07:02:38,636 DEBUG: query("SELECT pg_catalog.citus_update_node(3, '172.27.0.7-demoted', 5432, true, 10000)")
+    # From this moment all application traffic on the coordinator to the worker group 2 is paused.
+
+    # The future worker primary notifies the coordinator that it has acquired the leader lock in DCS and is about to run "pg_ctl promote".
+    2022-12-22 07:02:40,085 DEBUG: query("SELECT pg_catalog.citus_update_node(3, '172.27.0.5', 5432)")
+
+    # The new worker primary has just finished promoting and notifies the coordinator that it is ready to accept read-write traffic.
+    2022-12-22 07:02:41,485 DEBUG: query("COMMIT")
+    # From this moment the application traffic on the coordinator to the worker group 2 is unblocked.
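+
+The three queries above run in a single transaction on the coordinator, so the
+pause/resume protocol can be sketched in a few lines. The snippet below is an
+illustration of the same sequence, not Patroni's actual implementation; the
+connection parameters, the ``nodeid`` (3), and the hosts are taken from the
+example session above:
+
+.. code:: python
+
+    import psycopg2
+
+    # Must be a superuser connection to the coordinator primary.
+    conn = psycopg2.connect(host='172.27.0.4', port=5432, user='postgres',
+                            password='postgres', dbname='citus')
+    with conn.cursor() as cur:  # first execute() opens the transaction (implicit BEGIN)
+        # Point nodeid 3 at an unresolvable name; force=true with a 10s lock cooldown
+        # makes the coordinator block (rather than fail) writes to this group's shards.
+        cur.execute("SELECT pg_catalog.citus_update_node(3, %s, 5432, true, 10000)",
+                    ('172.27.0.7-demoted',))
+        # ... the old primary stops and the new one promotes here ...
+        cur.execute("SELECT pg_catalog.citus_update_node(3, %s, 5432)", ('172.27.0.5',))
+    conn.commit()  # traffic to worker group 2 resumes against the new primary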
+
+Peek into DCS
+-------------
+
+The Citus cluster (coordinator and workers) is stored in DCS as a fleet of
+Patroni clusters logically grouped together::
+
+    /service/batman/              # scope=batman
+    /service/batman/0/            # citus.group=0, coordinator
+    /service/batman/0/initialize
+    /service/batman/0/leader
+    /service/batman/0/members/
+    /service/batman/0/members/m1
+    /service/batman/0/members/m2
+    /service/batman/1/            # citus.group=1, worker
+    /service/batman/1/initialize
+    /service/batman/1/leader
+    /service/batman/1/members/
+    /service/batman/1/members/m3
+    /service/batman/1/members/m4
+    ...
+
+Such an approach was chosen because, for most DCS, it makes it possible to fetch
+the entire Citus cluster with a single recursive read request. Only Citus
+coordinator nodes read the whole tree, because they have to discover worker
+nodes. Worker nodes read only the subtree of their own group, and in some cases
+they could read the subtree of the coordinator group.
+
+Citus on Kubernetes
+-------------------
+
+Since Kubernetes doesn't support hierarchical structures, we had to include the
+Citus group in the names of all K8s objects Patroni creates::
+
+    batman-0-leader  # the leader config map for the coordinator
+    batman-0-config  # the config map holding initialize, config, and history "keys"
+    ...
+    batman-1-leader  # the leader config map for worker group 1
+    batman-1-config
+    ...
+
+I.e., the naming pattern is: ``${scope}-${citus.group}-${type}``.
+
+All Kubernetes objects are discovered by Patroni using the `label selector`__,
+therefore all Pods with Patroni&Citus and Endpoints/ConfigMaps must have
+similar labels, and Patroni must be configured to use them using Kubernetes
+:ref:`settings <kubernetes_settings>` or :ref:`environment variables
+<kubernetes_environment>`.
+
+__ https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#label-selectors
+
+A couple of examples of Patroni configuration using Pod environment variables:
+
+1. for the coordinator cluster:
+
+.. code:: YAML
+
+    apiVersion: v1
+    kind: Pod
+    metadata:
+      labels:
+        application: patroni
+        citus-group: "0"
+        citus-type: coordinator
+        cluster-name: citusdemo
+      name: citusdemo-0-0
+      namespace: default
+    spec:
+      containers:
+      - env:
+        - name: PATRONI_SCOPE
+          value: citusdemo
+        - name: PATRONI_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.name
+        - name: PATRONI_KUBERNETES_POD_IP
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: status.podIP
+        - name: PATRONI_KUBERNETES_NAMESPACE
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.namespace
+        - name: PATRONI_KUBERNETES_LABELS
+          value: '{application: patroni}'
+        - name: PATRONI_CITUS_DATABASE
+          value: citus
+        - name: PATRONI_CITUS_GROUP
+          value: "0"
+
+2. for the worker cluster from group 2:
+
+.. code:: YAML
+
+    apiVersion: v1
+    kind: Pod
+    metadata:
+      labels:
+        application: patroni
+        citus-group: "2"
+        citus-type: worker
+        cluster-name: citusdemo
+      name: citusdemo-2-0
+      namespace: default
+    spec:
+      containers:
+      - env:
+        - name: PATRONI_SCOPE
+          value: citusdemo
+        - name: PATRONI_NAME
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.name
+        - name: PATRONI_KUBERNETES_POD_IP
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: status.podIP
+        - name: PATRONI_KUBERNETES_NAMESPACE
+          valueFrom:
+            fieldRef:
+              apiVersion: v1
+              fieldPath: metadata.namespace
+        - name: PATRONI_KUBERNETES_LABELS
+          value: '{application: patroni}'
+        - name: PATRONI_CITUS_DATABASE
+          value: citus
+        - name: PATRONI_CITUS_GROUP
+          value: "2"
+
+As you may have noticed, both examples have the ``citus-group`` label set. This
+label allows Patroni to identify objects as belonging to a certain Citus group.
+In addition to that, there is also the ``PATRONI_CITUS_GROUP`` environment
+variable, which has the same value as the ``citus-group`` label. When Patroni
+creates new Kubernetes objects (ConfigMaps or Endpoints), it automatically puts
+the ``citus-group: ${env.PATRONI_CITUS_GROUP}`` label on them:
+
+.. code:: YAML
+
+    apiVersion: v1
+    kind: ConfigMap
+    metadata:
+      name: citusdemo-0-leader  # Is generated as ${env.PATRONI_SCOPE}-${env.PATRONI_CITUS_GROUP}-leader
+      labels:
+        application: patroni    # Is set from the ${env.PATRONI_KUBERNETES_LABELS}
+        cluster-name: citusdemo # Is automatically set from the ${env.PATRONI_SCOPE}
+        citus-group: '0'        # Is automatically set from the ${env.PATRONI_CITUS_GROUP}
+
+You can find a complete example of Patroni deployment on Kubernetes with Citus
+support in the `kubernetes`__ folder of the Patroni repository.
+
+__ https://github.com/zalando/patroni/tree/master/kubernetes
+
+There are two important files for you:
+
+1. Dockerfile.citus
+2. citus_k8s.yaml
+
+Citus upgrades and PostgreSQL major upgrades
+--------------------------------------------
+
+First, please read about upgrading the Citus version in the `documentation`__.
+There is one minor change in the process: when executing the upgrade, you have
+to use ``patronictl restart`` instead of ``systemctl restart`` to restart
+PostgreSQL.
+
+__ https://docs.citusdata.com/en/latest/admin_guide/upgrading_citus.html
+
+A PostgreSQL major upgrade with Citus is a bit more complex. You will have to
+combine the techniques used in the Citus documentation about major upgrades and
+the Patroni documentation about the :ref:`PostgreSQL major upgrade <major_upgrade>`.
+Please keep in mind that a Citus cluster consists of many Patroni clusters
+(coordinator and workers) and they all have to be upgraded independently.
diff --git a/docs/existing_data.rst b/docs/existing_data.rst
index d19854a5..7c78f6e3 100644
--- a/docs/existing_data.rst
+++ b/docs/existing_data.rst
@@ -23,6 +23,8 @@ A Patroni cluster can be started with a data directory from a single-node Postgr
 3. Start Patroni (e.g. ``patroni /etc/patroni/patroni.yml``). It automatically detects that PostgreSQL daemon is already running but its configuration might be out-of-date.
 4. Ask Patroni to restart the node with ``patronictl restart cluster-name node-name``. This step is only required if PostgreSQL configuration is out-of-date.
 
+.. _major_upgrade:
+
 Major Upgrade of PostgreSQL Version
 ===================================
 
diff --git a/docs/index.rst b/docs/index.rst
index 008ea54a..9612b576 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -12,6 +12,8 @@ We call Patroni a "template" because it is far from being a one-size-fits-all or
 
 Currently supported PostgreSQL versions: 9.3 to 15.
 
+**Note to Citus users**: Starting from 3.0 Patroni nicely integrates with `Citus <https://www.citusdata.com>`__. Please check the :ref:`Citus support <citus>` page for more information.
+
 **Note to Kubernetes users**: Patroni can run natively on top of Kubernetes. Take a look at the :ref:`Kubernetes <kubernetes>` chapter of the Patroni documentation.
 
@@ -20,6 +22,7 @@ Currently supported PostgreSQL versions: 9.3 to 15.
    :caption: Contents:
 
    README
+   citus
    dynamic_configuration
    dcs_failsafe_mode
    rest_api
diff --git a/extras/confd/conf.d/haproxy.toml b/extras/confd/conf.d/haproxy.toml
index f9f04fbc..b0d3fff0 100644
--- a/extras/confd/conf.d/haproxy.toml
+++ b/extras/confd/conf.d/haproxy.toml
@@ -9,5 +9,5 @@ check_cmd = "/usr/sbin/haproxy -c -f {{ .src }}"
 reload_cmd = "haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D -sf $(cat /var/run/haproxy.pid)"
 
 keys = [
-  "/members/",
+  "/",
 ]
diff --git a/extras/confd/templates/haproxy-citus.tmpl b/extras/confd/templates/haproxy-citus.tmpl
new file mode 100644
index 00000000..92b21cc0
--- /dev/null
+++ b/extras/confd/templates/haproxy-citus.tmpl
@@ -0,0 +1,32 @@
+global
+    maxconn 100
+
+defaults
+    log global
+    mode tcp
+    retries 2
+    timeout client 30m
+    timeout connect 4s
+    timeout server 30m
+    timeout check 5s
+
+listen stats
+    mode http
+    bind *:7000
+    stats enable
+    stats uri /
+
+listen coordinator
+    bind *:5000
+    option httpchk HEAD /primary
+    http-check expect status 200
+    default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
+{{range gets "/0/members/*"}}    server {{base .Key}} {{$data := json .Value}}{{base (replace (index (split $data.conn_url "/") 2) "@" "/" -1)}} maxconn 100 check check-ssl port {{index (split (index (split $data.api_url "/") 2) ":") 1}} verify required ca-file /etc/ssl/certs/ssl-cert-snakeoil.pem crt /etc/ssl/private/ssl-cert-snakeoil.crt
+{{end}}
+listen workers
+    bind *:5001
+    option httpchk HEAD /primary
+    http-check expect status 200
+    default-server inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
+{{range gets "/*/members/*"}}{{$group := index (split .Key "/") 1}}{{if ne $group "0"}}    server {{base .Key}} {{$data := json .Value}}{{base (replace (index (split $data.conn_url "/") 2) "@" "/" -1)}} maxconn 100 check check-ssl port {{index (split (index (split $data.api_url "/") 2) ":") 1}} verify required ca-file /etc/ssl/certs/ssl-cert-snakeoil.pem crt /etc/ssl/private/ssl-cert-snakeoil.crt
+{{end}}{{end}}
diff --git a/features/citus.feature b/features/citus.feature
new file mode 100644
index 00000000..a7f53ef0
--- /dev/null
+++ b/features/citus.feature
@@ -0,0 +1,72 @@
+Feature: citus
+  We should check that the coordinator discovers and registers workers and that clients don't get errors when the worker cluster switches over
+
+  Scenario: check that worker cluster is registered in the coordinator
+    Given I start postgres0 in citus group 0
+    And I start postgres2 in citus group 1
+    Then postgres0 is a leader in a group 0 after 10 seconds
+    And postgres2 is a leader in a group 1 after 10 seconds
+    When I start postgres1 in citus group 0
+    And I start postgres3 in citus group 1
+    Then replication works from postgres0 to postgres1 after 15 seconds
+    Then replication works from postgres2 to postgres3 after 15 seconds
+    And postgres0 is registered in the coordinator postgres0 as the worker in group 0
+    And postgres2 is registered in the coordinator postgres0 as the worker in group 1
+
+  Scenario: coordinator failover updates pg_dist_node
+    Given I run patronictl.py failover batman --group 0 --candidate postgres1 --force
+    Then postgres1 role is the primary after 10 seconds
+    And replication works from postgres1 to postgres0 after 15 seconds
+    And "sync" key in a group 0 in DCS has sync_standby=postgres0 after 15 seconds
+    And postgres1 is registered in the coordinator postgres1 as the worker in group 0
+    When I run patronictl.py failover batman --group 0 --candidate postgres0 --force
+    Then postgres0 role is the primary after 10 seconds
+    And replication works from postgres0 to postgres1 after 15 seconds
+    And "sync" key in a group 0 in DCS has sync_standby=postgres1 after 15 seconds
+    And postgres0 is registered in the coordinator postgres0 as the worker in group 0
+
+  Scenario: worker switchover doesn't break client queries on the coordinator
+    Given I create a distributed table on postgres0
+    And I start a thread inserting data on postgres0
+    When I run patronictl.py switchover batman --group 1 --force
+    Then I receive a response returncode 0
+    And postgres3 role is the primary after 10 seconds
+    And replication works from postgres3 to postgres2 after 15 seconds
+    And "sync" key in a group 1 in DCS has sync_standby=postgres2 after 15 seconds
+    And postgres3 is registered in the coordinator postgres0 as the worker in group 1
+    And a thread is still alive
+    When I run patronictl.py switchover batman --group 1 --force
+    Then I receive a response returncode 0
+    And postgres2 role is the primary after 10 seconds
+    And replication works from postgres2 to postgres3 after 15 seconds
+    And "sync" key in a group 1 in DCS has sync_standby=postgres3 after 15 seconds
+    And postgres2 is registered in the coordinator postgres0 as the worker in group 1
+    And a thread is still alive
+    When I stop a thread
+    Then a distributed table on postgres0 has expected rows
+
+  Scenario: worker primary restart doesn't break client queries on the coordinator
+    Given I cleanup a distributed table on postgres0
+    And I start a thread inserting data on postgres0
+    When I run patronictl.py restart batman postgres2 --group 1 --force
+    Then I receive a response returncode 0
+    And postgres2 role is the primary after 10 seconds
+    And replication works from postgres2 to postgres3 after 15 seconds
+    And postgres2 is registered in the coordinator postgres0 as the worker in group 1
+    And a thread is still alive
+    When I stop a thread
+    Then a distributed table on postgres0 has expected rows
+
+  Scenario: check that in-flight transaction is rolled back after timeout when other workers need to change pg_dist_node
+    Given I start postgres4 in citus group 2
+    Then postgres4 is a leader in a group 2 after 10 seconds
+    And "members/postgres4" key in a group 2 in DCS has role=master after 3 seconds
+    When I run patronictl.py edit-config batman --group 2 -s ttl=20 --force
+    Then I receive a response returncode 0
+    And I receive a response output "+ttl: 20"
+    When I sleep for 2 seconds
+    Then postgres4 is registered in the coordinator postgres0 as the worker in group 2
+    When I shut down postgres4
+    Then There is a transaction in progress on postgres0 changing pg_dist_node
+    When I run patronictl.py restart batman postgres2 --group 1 --force
+    Then a transaction finishes in 20 seconds
diff --git a/features/dcs_failsafe_mode.feature b/features/dcs_failsafe_mode.feature
index 1c250762..10f103d6 100644
--- a/features/dcs_failsafe_mode.feature
+++ b/features/dcs_failsafe_mode.feature
@@ -74,8 +74,8 @@ Feature: dcs failsafe mode
   Scenario: check three-node cluster is functioning while DCS is down
     Given I start postgres0
     And I start postgres2
-    Then "members/postgres0" key in DCS has state=running after 10 seconds
-    And "members/postgres2" key in DCS has state=running after 10 seconds
+    Then "members/postgres2" key in DCS has state=running after 10 seconds
+    And "members/postgres0" key in DCS has state=running after 20 seconds
     And Response on GET http://127.0.0.1:8008/failsafe contains postgres2 after 10 seconds
     And replication works from postgres1 to postgres0 after 10 seconds
     Given DCS is down
diff --git a/features/environment.py b/features/environment.py
index 318c9801..2a2f0017 100644
--- a/features/environment.py
+++ b/features/environment.py
@@ -102,6 +102,7 @@ class PatroniController(AbstractController):
         self.watchdog = None
 
         self._scope = (custom_config or {}).get('scope', 'batman')
+        self._citus_group = (custom_config or {}).get('citus', {}).get('group')
         self._config = self._make_patroni_test_config(name, custom_config)
         self._closables = []
 
@@ -143,7 +144,7 @@ class PatroniController(AbstractController):
             self.watchdog.start()
         env = os.environ.copy()
         if isinstance(self._context.dcs_ctl, KubernetesController):
-            self._context.dcs_ctl.create_pod(self._name[8:], self._scope)
+            self._context.dcs_ctl.create_pod(self._name[8:], self._scope, self._citus_group)
             env['PATRONI_KUBERNETES_POD_IP'] = '10.0.0.' + self._name[-1]
         if os.name == 'nt':
             env['BEHAVE_DEBUG'] = 'true'
@@ -385,6 +386,10 @@ class AbstractDcsController(AbstractController):
         if self._work_directory:
             shutil.rmtree(self._work_directory)
 
+    def path(self, key=None, scope='batman', group=None):
+        citus_group = '/{0}'.format(group) if group is not None else ''
+        return self._CLUSTER_NODE.format(scope) + citus_group + (key and '/' + key or '')
+
     def start_outage(self):
         if not self._paused and self._handle:
             self._handle.suspend()
@@ -395,11 +400,8 @@ class AbstractDcsController(AbstractController):
             self._handle.resume()
             self._paused = False
 
-    def path(self, key=None, scope='batman'):
-        return self._CLUSTER_NODE.format(scope) + (key and '/' + key or '')
-
     @abc.abstractmethod
-    def query(self, key, scope='batman'):
+    def query(self, key, scope='batman', group=None):
         """ query for a value of a given key """
 
     @abc.abstractmethod
@@ -447,11 +449,11 @@ class ConsulController(AbstractDcsController):
         except Exception:
             return False
 
-    def path(self, key=None, scope='batman'):
-        return super(ConsulController, self).path(key, scope)[1:]
+    def path(self, key=None, scope='batman', group=None):
+        return super(ConsulController, self).path(key, scope, group)[1:]
 
-    def query(self, key, scope='batman'):
-        _, value = self._client.kv.get(self.path(key, scope))
+    def query(self, key, scope='batman', group=None):
+        _, value = self._client.kv.get(self.path(key, scope, group))
         return value and value['Value'].decode('utf-8')
 
     def cleanup_service_tree(self):
@@ -491,10 +493,10 @@ class EtcdController(AbstractEtcdController):
         super(EtcdController, self).__init__(context, EtcdClient)
         os.environ['PATRONI_ETCD_HOST'] = 'localhost:2379'
 
-    def query(self, key, scope='batman'):
+    def query(self, key, scope='batman', group=None):
         import etcd
         try:
-            return self._client.get(self.path(key, scope)).value
+            return self._client.get(self.path(key, scope, group)).value
         except etcd.EtcdKeyNotFound:
             return None
 
@@ -515,9 +517,9 @@ class Etcd3Controller(AbstractEtcdController):
         super(Etcd3Controller, self).__init__(context, Etcd3Client)
         os.environ['PATRONI_ETCD3_HOST'] = 'localhost:2379'
 
-    def query(self, key, scope='batman'):
+    def query(self, key, scope='batman', group=None):
         import base64
-        response = self._client.range(self.path(key, scope))
+        response = self._client.range(self.path(key, scope, group))
         for k in response.get('kvs', []):
             return base64.b64decode(k['value']).decode('utf-8') if 'value' in k else None
 
@@ -609,10 +611,12 @@ class KubernetesController(AbstractExternalDcsController):
                 return False
         return True
 
-    def create_pod(self, name, scope):
+    def create_pod(self, name, scope, group=None):
         self.delete_pod(name)
         labels = self._labels.copy()
         labels['cluster-name'] = scope
+        if group is not None:
+            labels['citus-group'] = str(group)
         metadata = self._client.V1ObjectMeta(namespace=self._namespace, name=name, labels=labels)
         spec = self._client.V1PodSpec(containers=[self._client.V1Container(name=name, image='empty')])
         body = self._client.V1Pod(metadata=metadata, spec=spec)
@@ -629,12 +633,14 @@ class KubernetesController(AbstractExternalDcsController):
             except Exception:
                 break
 
-    def query(self, key, scope='batman'):
+    def query(self, key, scope='batman', group=None):
         if key.startswith('members/'):
             pod = self._api.read_namespaced_pod(key[8:], self._namespace)
             return (pod.metadata.annotations or {}).get('status', '')
         else:
             try:
+                if group is not None:
+                    scope = '{0}-{1}'.format(scope, group)
                 ep = scope + {'leader': '', 'history': '-config', 'initialize': '-config'}.get(key, '-' + key)
                 e = self._api.read_namespaced_endpoints(ep, self._namespace)
                 if key != 'sync':
@@ -675,10 +681,10 @@ class ZooKeeperController(AbstractExternalDcsController):
     def process_name(self):
         return "zookeeper"
 
-    def query(self, key, scope='batman'):
+    def query(self, key, scope='batman', group=None):
         import kazoo.exceptions
         try:
-            return self._client.get(self.path(key, scope))[0].decode('utf-8')
+            return self._client.get(self.path(key, scope, group))[0].decode('utf-8')
         except kazoo.exceptions.NoNodeError:
             return None
 
@@ -748,8 +754,8 @@ class RaftController(AbstractDcsController):
                                          '--source=patroni', '-p', 'patroni_raft_controller.py'],
                                         stdout=self._log, stderr=subprocess.STDOUT, env=env)
 
-    def query(self, key, scope='batman'):
-        ret = self._raft.get(self.path(key, scope))
+    def query(self, key, scope='batman', group=None):
+        ret = self._raft.get(self.path(key, scope, group))
         return ret and ret['value']
 
     def set(self, key, value):
@@ -1087,9 +1093,12 @@ def after_all(context):
 def before_feature(context, feature):
     """ create per-feature output directory to collect Patroni and PostgreSQL logs """
     if feature.name == 'watchdog' and os.name == 'nt':
-        feature.skip("Watchdog isn't supported on Windows")
-    else:
-        context.pctl.create_and_set_output_directory(feature.name)
+        return feature.skip("Watchdog isn't supported on Windows")
+    elif feature.name == 'citus':
+        lib = subprocess.check_output(['pg_config', '--pkglibdir']).decode('utf-8').strip()
+        if not os.path.exists(os.path.join(lib, 'citus.so')):
+            return feature.skip("Citus extension isn't available")
+    context.pctl.create_and_set_output_directory(feature.name)
 
 
 def after_feature(context, feature):
diff --git a/features/patroni_api.feature b/features/patroni_api.feature
index 2d663643..58112959 100644
--- a/features/patroni_api.feature
+++ b/features/patroni_api.feature
@@ -14,9 +14,9 @@ Scenario: check API requests on a stand-alone server
     Then I receive a response code 200
200 When I issue a GET request to http://127.0.0.1:8008/replica Then I receive a response code 503 - When I run patronictl.py reinit batman postgres0 --force - Then I receive a response returncode 0 - And I receive a response output "Failed: reinitialize for member postgres0, status code=503, (I am the leader, can not reinitialize)" + When I issue a POST request to http://127.0.0.1:8008/reinitialize with {"force": true} + Then I receive a response code 503 + And I receive a response text I am the leader, can not reinitialize When I run patronictl.py switchover batman --master postgres0 --force Then I receive a response returncode 1 And I receive a response output "Error: No candidates found to switchover to" diff --git a/features/steps/citus.py b/features/steps/citus.py new file mode 100644 index 00000000..7f5a5b26 --- /dev/null +++ b/features/steps/citus.py @@ -0,0 +1,117 @@ +import json +import time + +from behave import step, then +from dateutil import tz +from datetime import datetime +from functools import partial +from threading import Thread, Event + +tzutc = tz.tzutc() + + +@step('{name:w} is a leader in a group {group:d} after {time_limit:d} seconds') +@then('{name:w} is a leader in a group {group:d} after {time_limit:d} seconds') +def is_a_group_leader(context, name, group, time_limit): + time_limit *= context.timeout_multiplier + max_time = time.time() + int(time_limit) + while (context.dcs_ctl.query("leader", group=group) != name): + time.sleep(1) + assert time.time() < max_time, "{0} is not a leader in dcs after {1} seconds".format(name, time_limit) + + +@step('"{name}" key in a group {group:d} in DCS has {key:w}={value} after {time_limit:d} seconds') +def check_group_member(context, name, group, key, value, time_limit): + time_limit *= context.timeout_multiplier + max_time = time.time() + int(time_limit) + dcs_value = None + response = None + while time.time() < max_time: + try: + response = json.loads(context.dcs_ctl.query(name, group=group)) + dcs_value = response.get(key) + if dcs_value == value: + return + except Exception: + pass + time.sleep(1) + assert False, ("{0} in a group {1} does not have {2}={3} (found {4}) in dcs" + + " after {5} seconds").format(name, group, key, value, response, time_limit) + + +@step('I start {name:w} in citus group {group:d}') +def start_citus(context, name, group): + return context.pctl.start(name, custom_config={"citus": {"database": "postgres", "group": int(group)}}) + + +@step('{name1:w} is registered in the coordinator {name2:w} as the worker in group {group:d}') +def check_registration(context, name1, name2, group): + worker_port = int(context.pctl.query(name1, "SHOW port").fetchone()[0]) + r = context.pctl.query(name2, "SELECT nodeport FROM pg_catalog.pg_dist_node WHERE groupid = {0}".format(group)) + assert worker_port == r.fetchone()[0],\ + "Worker {0} is not registered in pg_dist_node on the coordinator {1}".format(name1, name2) + + +@step('I create a distributed table on {name:w}') +def create_distributed_table(context, name): + context.pctl.query(name, 'CREATE TABLE public.d(id int not null)') + context.pctl.query(name, "SELECT create_distributed_table('public.d', 'id')") + + +@step('I cleanup a distributed table on {name:w}') +def cleanup_distributed_table(context, name): + context.pctl.query(name, 'TRUNCATE public.d') + + +def insert_thread(query_func, context): + while True: + if context.thread_stop_event.is_set(): + break + + context.insert_counter += 1 + query_func('INSERT INTO public.d 
VALUES({0})'.format(context.insert_counter))
+
+ context.thread_stop_event.wait(0.01)
+
+
+@step('I start a thread inserting data on {name:w}')
+def start_insert_thread(context, name):
+ context.thread_stop_event = Event()
+ context.insert_counter = 0
+ query_func = partial(context.pctl.query, name)
+ thread_func = partial(insert_thread, query_func, context)
+ context.thread = Thread(target=thread_func)
+ context.thread.daemon = True
+ context.thread.start()
+
+
+@then('a thread is still alive')
+def thread_is_alive(context):
+ assert context.thread.is_alive(), "Thread is not alive"
+
+
+@step("I stop a thread")
+def stop_insert_thread(context):
+ context.thread_stop_event.set()
+ context.thread.join(1*context.timeout_multiplier)
+ assert not context.thread.is_alive(), "Thread is still alive"
+
+
+@step("a distributed table on {name:w} has expected rows")
+def count_rows(context, name):
+ rows = context.pctl.query(name, "SELECT COUNT(*) FROM public.d").fetchone()[0]
+ assert rows == context.insert_counter, "Distributed table doesn't have expected number of rows"
+
+
+@step("There is a transaction in progress on {name:w} changing pg_dist_node")
+def check_transaction(context, name):
+ cur = context.pctl.query(name, "SELECT xact_start FROM pg_stat_activity WHERE pid <> pg_backend_pid() AND state"
+ " = 'idle in transaction' AND query ~ 'citus_update_node' AND query ~ 'demoted'")
+ assert cur.rowcount == 1, "There is no 'idle in transaction' session updating pg_dist_node"
+ context.xact_start = cur.fetchone()[0]
+
+
+@step("a transaction finishes in {timeout:d} seconds")
+def check_transaction_timeout(context, timeout):
+ assert (datetime.now(tzutc) - context.xact_start).seconds > timeout,\
+ "a transaction finished earlier than in {0} seconds".format(timeout) diff --git a/kubernetes/Dockerfile.citus b/kubernetes/Dockerfile.citus new file mode 100644 index 00000000..44370dcc --- /dev/null +++ b/kubernetes/Dockerfile.citus @@ -0,0 +1,42 @@
+FROM postgres:15
+LABEL maintainer="Alexander Kukushkin "
+
+RUN export DEBIAN_FRONTEND=noninteractive \
+ && echo 'APT::Install-Recommends "0";\nAPT::Install-Suggests "0";' > /etc/apt/apt.conf.d/01norecommend \
+ && apt-get update -y \
+ && apt-get upgrade -y \
+ && apt-cache depends patroni | sed -n -e 's/.* Depends: \(python3-.\+\)$/\1/p' \
+ | grep -Ev '^python3-(sphinx|etcd|consul|kazoo|kubernetes)' \
+ | xargs apt-get install -y busybox vim-tiny curl jq less locales git python3-pip python3-wheel \
+ ## Make sure we have a en_US.UTF-8 locale available
+ && localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8 \
+ && curl https://install.citusdata.com/community/deb.sh | bash \
+ && apt-get -y install postgresql-15-citus-11.1 \
+ && pip3 install setuptools \
+ && pip3 install 'git+https://github.com/zalando/patroni.git@feature/citus#egg=patroni[kubernetes]' \
+ && PGHOME=/home/postgres \
+ && mkdir -p $PGHOME \
+ && chown postgres $PGHOME \
+ && sed -i "s|/var/lib/postgresql.*|$PGHOME:/bin/bash|" /etc/passwd \
+ && /bin/busybox --install -s \
+ # Set permissions for OpenShift
+ && chmod 775 $PGHOME \
+ && chmod 664 /etc/passwd \
+ # Clean up
+ && apt-get remove -y git python3-pip python3-wheel \
+ && apt-get autoremove -y \
+ && apt-get clean -y \
+ && rm -rf /var/lib/apt/lists/* /root/.cache
+
+ADD entrypoint.sh /
+ENV PGSSLMODE=verify-ca PGSSLKEY=/etc/ssl/private/ssl-cert-snakeoil.key PGSSLCERT=/etc/ssl/certs/ssl-cert-snakeoil.pem PGSSLROOTCERT=/etc/ssl/certs/ssl-cert-snakeoil.pem
+
+RUN sed -i 's/^postgresql:/&\n basebackup:\n checkpoint: 
fast/' /entrypoint.sh \
+ && sed -i "s|^ postgresql:|&\n pg_hba:\n - local all all trust\n - hostssl replication all all md5 clientcert=$PGSSLMODE\n - hostssl all all all md5 clientcert=$PGSSLMODE\n parameters:\n max_connections: 100\n shared_buffers: 16MB\n ssl: 'on'\n ssl_ca_file: $PGSSLROOTCERT\n ssl_cert_file: $PGSSLCERT\n ssl_key_file: $PGSSLKEY\n citus.node_conninfo: 'sslrootcert=$PGSSLROOTCERT sslkey=$PGSSLKEY sslcert=$PGSSLCERT sslmode=$PGSSLMODE'|" /entrypoint.sh \
+ && sed -i "s#^ \(superuser\|replication\):#&\n sslmode: $PGSSLMODE\n sslkey: $PGSSLKEY\n sslcert: $PGSSLCERT\n sslrootcert: $PGSSLROOTCERT#" /entrypoint.sh
+
+EXPOSE 5432 8008
+ENV LC_ALL=en_US.UTF-8 LANG=en_US.UTF-8 EDITOR=/usr/bin/editor
+USER postgres
+WORKDIR /home/postgres
+CMD ["/bin/bash", "/entrypoint.sh"] diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 00000000..acedc092 --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,154 @@
+# Kubernetes deployment examples
+Below you will find examples of Patroni deployments using [kind](https://kind.sigs.k8s.io/).
+
+# Patroni on K8s
+A Patroni cluster deployed as a single StatefulSet consisting of three Pods.
+
+Example session:
+
+ $ kind create cluster
+ Creating cluster "kind" ...
+ ✓ Ensuring node image (kindest/node:v1.25.3) 🖼
+ ✓ Preparing nodes 📦
+ ✓ Writing configuration 📜
+ ✓ Starting control-plane 🕹️
+ ✓ Installing CNI 🔌
+ ✓ Installing StorageClass 💾
+ Set kubectl context to "kind-kind"
+ You can now use your cluster with:
+
+ kubectl cluster-info --context kind-kind
+
+ Thanks for using kind! 😊
+
+ $ docker build -t patroni .
+ Sending build context to Docker daemon 138.8kB
+ Step 1/9 : FROM postgres:15
+ ...
+ Successfully built e9bfe69c5d2b
+ Successfully tagged patroni:latest
+
+ $ kind load docker-image patroni
+ Image: "" with ID "sha256:e9bfe69c5d2b319dec0cf564fb895484537664775e18f37f9b707914cc5537e6" not yet present on node "kind-control-plane", loading...
+
+ $ kubectl apply -f patroni_k8s.yaml
+ service/patronidemo-config created
+ statefulset.apps/patronidemo created
+ endpoints/patronidemo created
+ service/patronidemo created
+ service/patronidemo-repl created
+ secret/patronidemo created
+ serviceaccount/patronidemo created
+ role.rbac.authorization.k8s.io/patronidemo created
+ rolebinding.rbac.authorization.k8s.io/patronidemo created
+ clusterrole.rbac.authorization.k8s.io/patroni-k8s-ep-access created
+ clusterrolebinding.rbac.authorization.k8s.io/patroni-k8s-ep-access created
+
+ $ kubectl get pods -L role
+ NAME READY STATUS RESTARTS AGE ROLE
+ patronidemo-0 1/1 Running 0 34s master
+ patronidemo-1 1/1 Running 0 30s replica
+ patronidemo-2 1/1 Running 0 26s replica
+
+ $ kubectl exec -ti patronidemo-0 -- bash
+ postgres@patronidemo-0:~$ patronictl list
+
+ Cluster: patronidemo (7186662553319358497) ----+----+-----------+
+ | Member | Host | Role | State | TL | Lag in MB |
+ +---------------+------------+---------+---------+----+-----------+
+ | patronidemo-0 | 10.244.0.5 | Leader | running | 1 | |
+ | patronidemo-1 | 10.244.0.6 | Replica | running | 1 | 0 |
+ | patronidemo-2 | 10.244.0.7 | Replica | running | 1 | 0 |
+ +---------------+------------+---------+---------+----+-----------+
+
+# Citus on K8s
+A Citus cluster deployed as StatefulSets: one coordinator with three Pods and two workers with two Pods each.
+
+Example session:
+
+ $ kind create cluster
+ Creating cluster "kind" ...
+ ✓ Ensuring node image (kindest/node:v1.25.3) 🖼
+ ✓ Preparing nodes 📦
+ ✓ Writing configuration 📜
+ ✓ Starting control-plane 🕹️
+ ✓ Installing CNI 🔌
+ ✓ Installing StorageClass 💾
+ Set kubectl context to "kind-kind"
+ You can now use your cluster with:
+
+ kubectl cluster-info --context kind-kind
+
+ Thanks for using kind! 😊
+
+ demo@localhost:~/git/patroni/kubernetes$ docker build -f Dockerfile.citus -t patroni-citus-k8s .
+ Sending build context to Docker daemon 138.8kB
+ Step 1/11 : FROM postgres:15
+ ...
+ Successfully built 8cd73e325028
+ Successfully tagged patroni-citus-k8s:latest
+
+ $ kind load docker-image patroni-citus-k8s
+ Image: "" with ID "sha256:8cd73e325028d7147672494965e53453f5540400928caac0305015eb2c7027c7" not yet present on node "kind-control-plane", loading...
+
+ $ kubectl apply -f citus_k8s.yaml
+ service/citusdemo-0-config created
+ service/citusdemo-1-config created
+ service/citusdemo-2-config created
+ statefulset.apps/citusdemo-0 created
+ statefulset.apps/citusdemo-1 created
+ statefulset.apps/citusdemo-2 created
+ endpoints/citusdemo-0 created
+ service/citusdemo-0 created
+ endpoints/citusdemo-1 created
+ service/citusdemo-1 created
+ endpoints/citusdemo-2 created
+ service/citusdemo-2 created
+ service/citusdemo-workers created
+ secret/citusdemo created
+ serviceaccount/citusdemo created
+ role.rbac.authorization.k8s.io/citusdemo created
+ rolebinding.rbac.authorization.k8s.io/citusdemo created
+ clusterrole.rbac.authorization.k8s.io/patroni-k8s-ep-access created
+ clusterrolebinding.rbac.authorization.k8s.io/patroni-k8s-ep-access created
+
+ $ kubectl get sts
+ NAME READY AGE
+ citusdemo-0 1/3 6s # coordinator (group=0)
+ citusdemo-1 1/2 6s # worker (group=1)
+ citusdemo-2 1/2 6s # worker (group=2)
+
+ $ kubectl get pods -l cluster-name=citusdemo -L role
+ NAME READY STATUS RESTARTS AGE ROLE
+ citusdemo-0-0 1/1 Running 0 105s master
+ citusdemo-0-1 1/1 Running 0 101s replica
+ citusdemo-0-2 1/1 Running 0 96s replica
+ citusdemo-1-0 1/1 Running 0 105s master
+ citusdemo-1-1 1/1 Running 0 101s replica
+ citusdemo-2-0 1/1 Running 0 105s master
+ citusdemo-2-1 1/1 Running 0 101s replica
+
+ $ kubectl exec -ti citusdemo-0-0 -- bash
+ postgres@citusdemo-0-0:~$ patronictl list
+
+ Citus cluster: citusdemo -----------+--------------+---------+----+-----------+
+ | Group | Member | Host | Role | State | TL | Lag in MB |
+ +-------+---------------+-------------+--------------+---------+----+-----------+
+ | 0 | citusdemo-0-0 | 10.244.0.10 | Leader | running | 1 | |
+ | 0 | citusdemo-0-1 | 10.244.0.12 | Replica | running | 1 | 0 |
+ | 0 | citusdemo-0-2 | 10.244.0.14 | Sync Standby | running | 1 | 0 |
+ | 1 | citusdemo-1-0 | 10.244.0.8 | Leader | running | 1 | |
+ | 1 | citusdemo-1-1 | 10.244.0.11 | Sync Standby | running | 1 | 0 |
+ | 2 | citusdemo-2-0 | 10.244.0.9 | Leader | running | 1 | |
+ | 2 | citusdemo-2-1 | 10.244.0.13 | Sync Standby | running | 1 | 0 |
+ +-------+---------------+-------------+--------------+---------+----+-----------+
+
+ postgres@citusdemo-0-0:~$ psql citus
+ psql (15.1 (Debian 15.1-1.pgdg110+1))
+ Type "help" for help.
+ + citus=# table pg_dist_node; + nodeid | groupid | nodename | nodeport | noderack | hasmetadata | isactive | noderole | nodecluster | metadatasynced | shouldhaveshards + --------+---------+-------------+----------+----------+-------------+----------+----------+-------------+----------------+------------------ + 1 | 0 | 10.244.0.10 | 5432 | default | t | t | primary | default | t | f + 2 | 1 | 10.244.0.8 | 5432 | default | t | t | primary | default | t | t + 3 | 2 | 10.244.0.9 | 5432 | default | t | t | primary | default | t | t + (3 rows) diff --git a/kubernetes/citus_k8s.yaml b/kubernetes/citus_k8s.yaml new file mode 100644 index 00000000..a53fe15f --- /dev/null +++ b/kubernetes/citus_k8s.yaml @@ -0,0 +1,590 @@ +# headless services to avoid deletion of citusdemo-*-config endpoints +apiVersion: v1 +kind: Service +metadata: + name: citusdemo-0-config + labels: + application: patroni + cluster-name: citusdemo + citus-group: '0' +spec: + clusterIP: None + +--- +apiVersion: v1 +kind: Service +metadata: + name: citusdemo-1-config + labels: + application: patroni + cluster-name: citusdemo + citus-group: '1' +spec: + clusterIP: None + +--- + +apiVersion: v1 +kind: Service +metadata: + name: citusdemo-2-config + labels: + application: patroni + cluster-name: citusdemo + citus-group: '2' +spec: + clusterIP: None + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: &cluster_name citusdemo-0 + labels: &labels + application: patroni + cluster-name: citusdemo + citus-group: '0' + citus-type: coordinator +spec: + replicas: 3 + serviceName: *cluster_name + selector: + matchLabels: + <<: *labels + template: + metadata: + labels: + <<: *labels + spec: + serviceAccountName: citusdemo + containers: + - name: *cluster_name + image: patroni-citus-k8s # docker build -f Dockerfile.citus -t patroni-citus-k8s . 
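+          # The readiness probe below targets Patroni's REST API rather than PostgreSQL
+          # itself; /readiness should answer 200 once this pod runs as the group leader
+          # or as a running replica, so the citusdemo-* services only route to pods
+          # whose PostgreSQL is actually up.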
+ imagePullPolicy: IfNotPresent + readinessProbe: + httpGet: + scheme: HTTP + path: /readiness + port: 8008 + initialDelaySeconds: 3 + periodSeconds: 10 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + ports: + - containerPort: 8008 + protocol: TCP + - containerPort: 5432 + protocol: TCP + volumeMounts: + - mountPath: /home/postgres/pgdata + name: pgdata + env: + - name: PATRONI_KUBERNETES_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: PATRONI_KUBERNETES_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: PATRONI_KUBERNETES_BYPASS_API_SERVICE + value: 'true' + - name: PATRONI_KUBERNETES_USE_ENDPOINTS + value: 'true' + - name: PATRONI_KUBERNETES_LABELS + value: '{application: patroni, cluster-name: citusdemo}' + - name: PATRONI_CITUS_DATABASE + value: citus + - name: PATRONI_CITUS_GROUP + value: '0' + - name: PATRONI_SUPERUSER_USERNAME + value: postgres + - name: PATRONI_SUPERUSER_PASSWORD + valueFrom: + secretKeyRef: + name: citusdemo + key: superuser-password + - name: PATRONI_REPLICATION_USERNAME + value: standby + - name: PATRONI_REPLICATION_PASSWORD + valueFrom: + secretKeyRef: + name: citusdemo + key: replication-password + - name: PATRONI_SCOPE + value: citusdemo + - name: PATRONI_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: PATRONI_POSTGRESQL_DATA_DIR + value: /home/postgres/pgdata/pgroot/data + - name: PATRONI_POSTGRESQL_PGPASS + value: /tmp/pgpass + - name: PATRONI_POSTGRESQL_LISTEN + value: '0.0.0.0:5432' + - name: PATRONI_RESTAPI_LISTEN + value: '0.0.0.0:8008' + terminationGracePeriodSeconds: 0 + volumes: + - name: pgdata + emptyDir: {} +# volumeClaimTemplates: +# - metadata: +# labels: +# application: spilo +# spilo-cluster: *cluster_name +# annotations: +# volume.alpha.kubernetes.io/storage-class: anything +# name: pgdata +# spec: +# accessModes: +# - ReadWriteOnce +# resources: +# requests: +# storage: 5Gi + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: &cluster_name citusdemo-1 + labels: &labels + application: patroni + cluster-name: citusdemo + citus-group: '1' + citus-type: worker +spec: + replicas: 2 + serviceName: *cluster_name + selector: + matchLabels: + <<: *labels + template: + metadata: + labels: + <<: *labels + spec: + serviceAccountName: citusdemo + containers: + - name: *cluster_name + image: patroni-citus-k8s # docker build -f Dockerfile.citus -t patroni-citus-k8s . 
+ imagePullPolicy: IfNotPresent + readinessProbe: + httpGet: + scheme: HTTP + path: /readiness + port: 8008 + initialDelaySeconds: 3 + periodSeconds: 10 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + ports: + - containerPort: 8008 + protocol: TCP + - containerPort: 5432 + protocol: TCP + volumeMounts: + - mountPath: /home/postgres/pgdata + name: pgdata + env: + - name: PATRONI_KUBERNETES_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: PATRONI_KUBERNETES_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: PATRONI_KUBERNETES_BYPASS_API_SERVICE + value: 'true' + - name: PATRONI_KUBERNETES_USE_ENDPOINTS + value: 'true' + - name: PATRONI_KUBERNETES_LABELS + value: '{application: patroni, cluster-name: citusdemo}' + - name: PATRONI_CITUS_DATABASE + value: citus + - name: PATRONI_CITUS_GROUP + value: '1' + - name: PATRONI_SUPERUSER_USERNAME + value: postgres + - name: PATRONI_SUPERUSER_PASSWORD + valueFrom: + secretKeyRef: + name: citusdemo + key: superuser-password + - name: PATRONI_REPLICATION_USERNAME + value: standby + - name: PATRONI_REPLICATION_PASSWORD + valueFrom: + secretKeyRef: + name: citusdemo + key: replication-password + - name: PATRONI_SCOPE + value: citusdemo + - name: PATRONI_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: PATRONI_POSTGRESQL_DATA_DIR + value: /home/postgres/pgdata/pgroot/data + - name: PATRONI_POSTGRESQL_PGPASS + value: /tmp/pgpass + - name: PATRONI_POSTGRESQL_LISTEN + value: '0.0.0.0:5432' + - name: PATRONI_RESTAPI_LISTEN + value: '0.0.0.0:8008' + terminationGracePeriodSeconds: 0 + volumes: + - name: pgdata + emptyDir: {} +# volumeClaimTemplates: +# - metadata: +# labels: +# application: spilo +# spilo-cluster: *cluster_name +# annotations: +# volume.alpha.kubernetes.io/storage-class: anything +# name: pgdata +# spec: +# accessModes: +# - ReadWriteOnce +# resources: +# requests: +# storage: 5Gi + +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: &cluster_name citusdemo-2 + labels: &labels + application: patroni + cluster-name: citusdemo + citus-group: '2' + citus-type: worker +spec: + replicas: 2 + serviceName: *cluster_name + selector: + matchLabels: + <<: *labels + template: + metadata: + labels: + <<: *labels + spec: + serviceAccountName: citusdemo + containers: + - name: *cluster_name + image: patroni-citus-k8s # docker build -f Dockerfile.citus -t patroni-citus-k8s . 
+ imagePullPolicy: IfNotPresent + readinessProbe: + httpGet: + scheme: HTTP + path: /readiness + port: 8008 + initialDelaySeconds: 3 + periodSeconds: 10 + timeoutSeconds: 5 + successThreshold: 1 + failureThreshold: 3 + ports: + - containerPort: 8008 + protocol: TCP + - containerPort: 5432 + protocol: TCP + volumeMounts: + - mountPath: /home/postgres/pgdata + name: pgdata + env: + - name: PATRONI_KUBERNETES_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: PATRONI_KUBERNETES_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: PATRONI_KUBERNETES_BYPASS_API_SERVICE + value: 'true' + - name: PATRONI_KUBERNETES_USE_ENDPOINTS + value: 'true' + - name: PATRONI_KUBERNETES_LABELS + value: '{application: patroni, cluster-name: citusdemo}' + - name: PATRONI_CITUS_DATABASE + value: citus + - name: PATRONI_CITUS_GROUP + value: '2' + - name: PATRONI_SUPERUSER_USERNAME + value: postgres + - name: PATRONI_SUPERUSER_PASSWORD + valueFrom: + secretKeyRef: + name: citusdemo + key: superuser-password + - name: PATRONI_REPLICATION_USERNAME + value: standby + - name: PATRONI_REPLICATION_PASSWORD + valueFrom: + secretKeyRef: + name: citusdemo + key: replication-password + - name: PATRONI_SCOPE + value: citusdemo + - name: PATRONI_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: PATRONI_POSTGRESQL_DATA_DIR + value: /home/postgres/pgdata/pgroot/data + - name: PATRONI_POSTGRESQL_PGPASS + value: /tmp/pgpass + - name: PATRONI_POSTGRESQL_LISTEN + value: '0.0.0.0:5432' + - name: PATRONI_RESTAPI_LISTEN + value: '0.0.0.0:8008' + terminationGracePeriodSeconds: 0 + volumes: + - name: pgdata + emptyDir: {} +# volumeClaimTemplates: +# - metadata: +# labels: +# application: spilo +# spilo-cluster: *cluster_name +# annotations: +# volume.alpha.kubernetes.io/storage-class: anything +# name: pgdata +# spec: +# accessModes: +# - ReadWriteOnce +# resources: +# requests: +# storage: 5Gi + +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: citusdemo-0 + labels: + application: patroni + cluster-name: citusdemo + citus-group: '0' + citus-type: coordinator +subsets: [] + +--- +apiVersion: v1 +kind: Service +metadata: + name: citusdemo-0 + labels: + application: patroni + cluster-name: citusdemo + citus-group: '0' + citus-type: coordinator +spec: + type: ClusterIP + ports: + - port: 5432 + targetPort: 5432 + +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: citusdemo-1 + labels: + application: patroni + cluster-name: citusdemo + citus-group: '1' + citus-type: worker +subsets: [] + +--- +apiVersion: v1 +kind: Service +metadata: + name: citusdemo-1 + labels: + application: patroni + cluster-name: citusdemo + citus-group: '1' + citus-type: worker +spec: + type: ClusterIP + ports: + - port: 5432 + targetPort: 5432 + +--- +apiVersion: v1 +kind: Endpoints +metadata: + name: citusdemo-2 + labels: + application: patroni + cluster-name: citusdemo + citus-group: '2' + citus-type: worker +subsets: [] + +--- +apiVersion: v1 +kind: Service +metadata: + name: citusdemo-2 + labels: + application: patroni + cluster-name: citusdemo + citus-group: '2' + citus-type: worker +spec: + type: ClusterIP + ports: + - port: 5432 + targetPort: 5432 + +--- +apiVersion: v1 +kind: Service +metadata: + name: citusdemo-workers + labels: &labels + application: patroni + cluster-name: citusdemo + citus-type: worker + role: master +spec: + type: ClusterIP + selector: + <<: *labels + ports: + - port: 5432 + targetPort: 5432 + +--- +apiVersion: v1 +kind: Secret +metadata: + name: &cluster_name 
citusdemo
+ labels:
+ application: patroni
+ cluster-name: *cluster_name
+type: Opaque
+data:
+ superuser-password: emFsYW5kbw==
+ replication-password: cmVwLXBhc3M=
+
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: citusdemo
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+ name: citusdemo
+rules:
+- apiGroups:
+ - ""
+ resources:
+ - configmaps
+ verbs:
+ - create
+ - get
+ - list
+ - patch
+ - update
+ - watch
+ # delete and deletecollection are required only for 'patronictl remove'
+ - delete
+ - deletecollection
+- apiGroups:
+ - ""
+ resources:
+ - endpoints
+ verbs:
+ - get
+ - patch
+ - update
+ # the following three privileges are necessary only when using endpoints
+ - create
+ - list
+ - watch
+ # delete and deletecollection are required only for 'patronictl remove'
+ - delete
+ - deletecollection
+- apiGroups:
+ - ""
+ resources:
+ - pods
+ verbs:
+ - get
+ - list
+ - patch
+ - update
+ - watch
+# The following privilege is only necessary for the creation of the headless services
+# for the citusdemo-*-config endpoints, in order to prevent the k8s master from
+# cleaning them up. You can avoid giving this privilege by explicitly creating the
+# services like it is done in this manifest (lines 2..10)
+- apiGroups:
+ - ""
+ resources:
+ - services
+ verbs:
+ - create
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+ name: citusdemo
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: Role
+ name: citusdemo
+subjects:
+- kind: ServiceAccount
+ name: citusdemo
+
+# The following privileges are only required if you deploy outside the "default"
+# namespace and want Patroni to bypass the kubernetes service
+# (PATRONI_KUBERNETES_BYPASS_API_SERVICE=true)
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+ name: patroni-k8s-ep-access
+rules:
+- apiGroups:
+ - ""
+ resources:
+ - endpoints
+ resourceNames:
+ - kubernetes
+ verbs:
+ - get
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: patroni-k8s-ep-access
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: patroni-k8s-ep-access
+subjects:
+- kind: ServiceAccount
+ name: citusdemo
+# The namespace must be specified explicitly.
+# If deploying to a different namespace you have to change it.
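+# The Pods pick their namespace up at runtime through the downward-API variable
+# PATRONI_KUBERNETES_NAMESPACE defined in the StatefulSets above, so this subject
+# entry is the only namespace that needs to be edited by hand.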
+ namespace: default diff --git a/patroni/api.py b/patroni/api.py index 1988b1ba..038b55d7 100644 --- a/patroni/api.py +++ b/patroni/api.py @@ -615,6 +615,18 @@ class RestApiHandler(BaseHTTPRequestHandler): def do_POST_switchover(self): self.do_POST_failover(action='switchover') + @check_access + def do_POST_citus(self): + request = self._read_json_content() + if not request: + return + + patroni = self.server.patroni + if patroni.postgresql.citus_handler.is_coordinator() and patroni.ha.is_leader(): + cluster = patroni.dcs.get_cluster(True) + patroni.postgresql.citus_handler.handle_event(cluster, request) + self._write_response(200, 'OK') + def parse_request(self): """Override parse_request method to enrich basic functionality of `BaseHTTPRequestHandler` class @@ -759,16 +771,17 @@ class RestApiServer(ThreadingMixIn, HTTPServer, Thread): def __members_ips(self): cluster = self.patroni.dcs.cluster if self.__allowlist_include_members and cluster: - for member in cluster.members: - if member.api_url: - try: - r = urlparse(member.api_url) - host = r.hostname - port = r.port or (443 if r.scheme == 'https' else 80) - for ip in self.__resolve_ips(host, port): - yield ip - except Exception as e: - logger.debug('Failed to parse url %s: %r', member.api_url, e) + for cluster in [cluster] + list(cluster.workers.values()): + for member in cluster.members: + if member.api_url: + try: + r = urlparse(member.api_url) + host = r.hostname + port = r.port or (443 if r.scheme == 'https' else 80) + for ip in self.__resolve_ips(host, port): + yield ip + except Exception as e: + logger.debug('Failed to parse url %s: %r', member.api_url, e) def check_access(self, rh): if self.__allowlist or self.__allowlist_include_members: diff --git a/patroni/config.py b/patroni/config.py index d0f01eb4..2d25f918 100644 --- a/patroni/config.py +++ b/patroni/config.py @@ -2,6 +2,7 @@ import json import logging import os import shutil +import six import tempfile import yaml @@ -355,9 +356,15 @@ class Config(object): 'CACERT', 'CERT', 'KEY', 'VERIFY', 'TOKEN', 'CHECKS', 'DC', 'CONSISTENCY', 'REGISTER_SERVICE', 'SERVICE_CHECK_INTERVAL', 'SERVICE_CHECK_TLS_SERVER_NAME', 'NAMESPACE', 'CONTEXT', 'USE_ENDPOINTS', 'SCOPE_LABEL', 'ROLE_LABEL', 'POD_IP', - 'PORTS', 'LABELS', 'BYPASS_API_SERVICE', 'KEY_PASSWORD', 'USE_SSL', 'SET_ACLS') and name: + 'PORTS', 'LABELS', 'BYPASS_API_SERVICE', 'KEY_PASSWORD', 'USE_SSL', 'SET_ACLS', + 'GROUP', 'DATABASE') and name: value = os.environ.pop(param) - if suffix == 'PORT': + if name == 'CITUS': + if suffix == 'GROUP': + value = parse_int(value) + elif suffix != 'DATABASE': + continue + elif suffix == 'PORT': value = value and parse_int(value) elif suffix in ('HOSTS', 'PORTS', 'CHECKS'): value = value and _parse_list(value) @@ -365,7 +372,7 @@ class Config(object): value = _parse_dict(value) elif suffix in ('USE_PROXIES', 'REGISTER_SERVICE', 'USE_ENDPOINTS', 'BYPASS_API_SERVICE', 'VERIFY'): value = parse_bool(value) - if value: + if value is not None: ret[name.lower()][suffix.lower()] = value for dcs in ('etcd', 'etcd3'): if dcs in ret: @@ -393,7 +400,11 @@ class Config(object): def _build_effective_configuration(self, dynamic_configuration, local_configuration): config = self._safe_copy_dynamic_configuration(dynamic_configuration) for name, value in local_configuration.items(): - if name == 'postgresql': + if name == 'citus': # remove invalid citus configuration + if isinstance(value, dict) and isinstance(value.get('group'), six.integer_types)\ + and isinstance(value.get('database'), six.string_types): 
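+            # e.g. 'citus: {group: 0, database: citus}' is accepted as-is, while a
+            # citus section with a missing database or a non-integer group is dropped
+            # entirely rather than failing later in a hard-to-debug way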
+ config[name] = value + elif name == 'postgresql': for name, value in (value or {}).items(): if name == 'parameters': config['postgresql'][name].update(self._process_postgresql_parameters(value, True)) @@ -431,6 +442,12 @@ class Config(object): if 'name' not in config and 'name' in pg_config: config['name'] = pg_config['name'] + # when bootstrapping the new Citus cluster (coordinator/worker) enable sync replication in global configuration + if 'citus' in config: + bootstrap = config.setdefault('bootstrap', {}) + dcs = bootstrap.setdefault('dcs', {}) + dcs.setdefault('synchronous_mode', True) + updated_fields = ( 'name', 'scope', @@ -438,7 +455,8 @@ class Config(object): 'synchronous_mode', 'synchronous_mode_strict', 'synchronous_node_count', - 'maximum_lag_on_syncnode' + 'maximum_lag_on_syncnode', + 'citus' ) pg_config.update({p: config[p] for p in updated_fields if p in config}) diff --git a/patroni/ctl.py b/patroni/ctl.py index ec780731..fcdf2f3f 100644 --- a/patroni/ctl.py +++ b/patroni/ctl.py @@ -137,6 +137,9 @@ option_watch = click.option('-W', is_flag=True, help='Auto update the screen eve option_force = click.option('--force', is_flag=True, help='Do not ask for confirmation at any point') arg_cluster_name = click.argument('cluster_name', required=False, default=lambda: click.get_current_context().obj.get('scope')) +option_default_citus_group = click.option('--group', required=False, type=int, help='Citus group', + default=lambda: click.get_current_context().obj.get('citus', {}).get('group')) +option_citus_group = click.option('--group', required=False, type=int, help='Citus group') option_insecure = click.option('-k', '--insecure', is_flag=True, help='Allow connections to SSL sites without certs') @@ -157,11 +160,16 @@ def ctl(ctx, config_file, dcs_url, insecure): ctx.obj.setdefault('ctl', {})['insecure'] = ctx.obj.get('ctl', {}).get('insecure') or insecure -def get_dcs(config, scope): +def get_dcs(config, scope, group): config.update({'scope': scope, 'patronictl': True}) + if group is not None: + config['citus'] = {'group': group} config.setdefault('name', scope) try: - return _get_dcs(config) + dcs = _get_dcs(config) + if config.get('citus') and group is None: + dcs.get_cluster = dcs._get_citus_cluster + return dcs except PatroniException as e: raise PatroniCtlException(str(e)) @@ -186,9 +194,10 @@ def print_output(columns, rows, alignment=None, fmt='pretty', header=None, delim for row in rows: if row[i]: row[i] = format_config_for_editing(row[i], fmt != 'pretty').strip() - if list_cluster and fmt != 'tsv': # skip cluster name if pretty-printing - columns = columns[1:] if columns else [] - rows = [row[1:] for row in rows] + if list_cluster and fmt != 'tsv': # skip cluster name and maybe Citus group if pretty-printing + skip_cols = 2 if ' (group: ' in header else 1 + columns = columns[skip_cols:] if columns else [] + rows = [row[skip_cols:] for row in rows] if fmt == 'tsv': for r in ([columns] if columns else []) + rows: @@ -232,21 +241,25 @@ def watching(w, watch, max_count=None, clear=True): yield 0 -def get_all_members(cluster, role='master'): +def get_all_members(obj, cluster, group, role='master'): + clusters = {0: cluster} + if obj.get('citus') and group is None: + clusters.update(cluster.workers) if role == 'master': - if cluster.leader is not None and cluster.leader.name: - yield cluster.leader + for cluster in clusters.values(): + if cluster.leader is not None and cluster.leader.name: + yield cluster.leader.member return - leader_name = (cluster.leader.member.name if 
cluster.leader else None) - for m in cluster.members: - if role == 'any' or role == 'replica' and m.name != leader_name: - yield m + for cluster in clusters.values(): + leader_name = (cluster.leader.member.name if cluster.leader else None) + for m in cluster.members: + if role == 'any' or role == 'replica' and m.name != leader_name: + yield m -def get_any_member(cluster, role='master', member=None): - members = get_all_members(cluster, role) - for m in members: +def get_any_member(obj, cluster, group, role='master', member=None): + for m in get_all_members(obj, cluster, group, role): if member is None or m.name == member: return m @@ -260,8 +273,8 @@ def get_all_members_leader_first(cluster): yield member -def get_cursor(cluster, connect_parameters, role='master', member=None): - member = get_any_member(cluster, role=role, member=member) +def get_cursor(obj, cluster, group, connect_parameters, role='master', member=None): + member = get_any_member(obj, cluster, group, role=role, member=member) if member is None: return None @@ -289,32 +302,31 @@ def get_cursor(cluster, connect_parameters, role='master', member=None): return None -def get_members(cluster, cluster_name, member_names, role, force, action, ask_confirmation=True): - candidates = {m.name: m for m in cluster.members} +def get_members(obj, cluster, cluster_name, member_names, role, force, action, ask_confirmation=True, group=None): + members = list(get_all_members(obj, cluster, group, role)) + candidates = {m.name for m in members} if not force or role: if not member_names and not candidates: raise PatroniCtlException('{0} cluster doesn\'t have any members'.format(cluster_name)) - output_members(cluster, cluster_name) + output_members(obj, cluster, cluster_name, group=group) - if role: - role_names = [m.name for m in get_all_members(cluster, role)] - if member_names: - member_names = list(set(member_names) & set(role_names)) - if not member_names: - raise PatroniCtlException('No {0} among provided members'.format(role)) - else: - member_names = role_names + if member_names: + member_names = list(set(member_names) & candidates) + if not member_names: + raise PatroniCtlException('No {0} among provided members'.format(role)) + elif action != 'reinitialize': + member_names = list(candidates) if not member_names and not force: member_names = [click.prompt('Which member do you want to {0} [{1}]?'.format(action, - ', '.join(candidates.keys())), type=str, default='')] + ', '.join(candidates)), type=str, default='')] for member_name in member_names: if member_name not in candidates: raise PatroniCtlException('{0} is not a member of cluster'.format(member_name)) - members = [candidates[n] for n in member_names] + members = [m for m in members if m.name in member_names] if ask_confirmation: confirm_members_action(members, force, action) return members @@ -340,15 +352,18 @@ def confirm_members_action(members, force, action, scheduled_at=None): 'any']), default=None) @click.option('--member', '-m', help='Generate a dsn for this member', type=str) @arg_cluster_name +@option_citus_group @click.pass_obj -def dsn(obj, cluster_name, role, member): - if role is not None and member is not None: - raise PatroniCtlException('--role and --member are mutually exclusive options') +def dsn(obj, cluster_name, group, role, member): + if member is not None: + if role is not None: + raise PatroniCtlException('--role and --member are mutually exclusive options') + role = 'any' if member is None and role is None: role = 'master' - cluster = get_dcs(obj, 
cluster_name).get_cluster() - m = get_any_member(cluster, role=role, member=member) + cluster = get_dcs(obj, cluster_name, group).get_cluster() + m = get_any_member(obj, cluster, group, role=role, member=member) if m is None: raise PatroniCtlException('Can not find a suitable member') @@ -358,6 +373,7 @@ def dsn(obj, cluster_name, role, member): @ctl.command('query', help='Query a Patroni PostgreSQL member') @arg_cluster_name +@option_citus_group @click.option('--format', 'fmt', help='Output format (pretty, tsv, json, yaml)', default='tsv') @click.option('--file', '-f', 'p_file', help='Execute the SQL commands from this file', type=click.File('rb')) @click.option('--password', help='force password prompt', is_flag=True) @@ -374,6 +390,7 @@ def dsn(obj, cluster_name, role, member): def query( obj, cluster_name, + group, role, member, w, @@ -386,8 +403,10 @@ def query( dbname, fmt='tsv', ): - if role is not None and member is not None: - raise PatroniCtlException('--role and --member are mutually exclusive options') + if member is not None: + if role is not None: + raise PatroniCtlException('--role and --member are mutually exclusive options') + role = 'any' if member is None and role is None: role = 'master' @@ -408,25 +427,25 @@ def query( if p_file is not None: command = p_file.read() - dcs = get_dcs(obj, cluster_name) + dcs = get_dcs(obj, cluster_name, group) cursor = None for _ in watching(w, watch, clear=False): if cursor is None: cluster = dcs.get_cluster() - output, header = query_member(cluster, cursor, member, role, command, connect_parameters) + output, header = query_member(obj, cluster, group, cursor, member, role, command, connect_parameters) print_output(header, output, fmt=fmt, delimiter=delimiter) -def query_member(cluster, cursor, member, role, command, connect_parameters): +def query_member(obj, cluster, group, cursor, member, role, command, connect_parameters): from . 
import psycopg try: if cursor is None:
- cursor = get_cursor(cluster, connect_parameters, role=role, member=member)
+ cursor = get_cursor(obj, cluster, group, connect_parameters, role=role, member=member) if cursor is None:
- if role is None:
+ if member is not None: message = 'No connection to member {0} is available'.format(member) else: message = 'No connection to role={0} is available'.format(role) @@ -446,13 +465,16 @@ def query_member(cluster, cursor, member, role, command, connect_parameters): @ctl.command('remove', help='Remove cluster from DCS') @click.argument('cluster_name')
+@option_citus_group @option_format @click.pass_obj
-def remove(obj, cluster_name, fmt):
- dcs = get_dcs(obj, cluster_name)
+def remove(obj, cluster_name, group, fmt):
+ dcs = get_dcs(obj, cluster_name, group) cluster = dcs.get_cluster()
- output_members(cluster, cluster_name, fmt=fmt)
+ if obj.get('citus') and group is None:
+ raise PatroniCtlException('For Citus clusters the --group must be specified')
+ output_members(obj, cluster, cluster_name, fmt=fmt) confirm = click.prompt('Please confirm the cluster name to remove', type=str) if confirm != cluster_name: @@ -501,14 +523,15 @@ def parse_scheduled(scheduled): @ctl.command('reload', help='Reload cluster member configuration') @click.argument('cluster_name') @click.argument('member_names', nargs=-1)
+@option_citus_group @click.option('--role', '-r', help='Reload only members with this role', default='any', type=click.Choice(['master', 'replica', 'any'])) @option_force @click.pass_obj
-def reload(obj, cluster_name, member_names, force, role):
- cluster = get_dcs(obj, cluster_name).get_cluster()
+def reload(obj, cluster_name, member_names, group, force, role):
+ cluster = get_dcs(obj, cluster_name, group).get_cluster()
- members = get_members(cluster, cluster_name, member_names, role, force, 'reload')
+ members = get_members(obj, cluster, cluster_name, member_names, role, force, 'reload', group=group) for member in members: r = request_patroni(member, 'post', 'reload') @@ -527,6 +550,7 @@ def reload(obj, cluster_name, member_names, force, role): @ctl.command('restart', help='Restart cluster member') @click.argument('cluster_name') @click.argument('member_names', nargs=-1)
+@option_citus_group @click.option('--role', '-r', help='Restart only members with this role', default='any', type=click.Choice(['master', 'replica', 'any'])) @click.option('--any', 'p_any', help='Restart a single member only', is_flag=True) @@ -539,10 +563,10 @@ def reload(obj, cluster_name, member_names, force, role): help='Return error and fail over if necessary when restarting takes longer than this.') @option_force @click.pass_obj
-def restart(obj, cluster_name, member_names, force, role, p_any, scheduled, version, pending, timeout):
- cluster = get_dcs(obj, cluster_name).get_cluster()
+def restart(obj, cluster_name, group, member_names, force, role, p_any, scheduled, version, pending, timeout):
+ cluster = get_dcs(obj, cluster_name, group).get_cluster()
- members = get_members(cluster, cluster_name, member_names, role, force, 'restart', False)
+ members = get_members(obj, cluster, cluster_name, member_names, role, force, 'restart', False, group=group) if scheduled is None and not force: next_hour = (datetime.datetime.now() + datetime.timedelta(hours=1)).strftime('%Y-%m-%dT%H:%M') scheduled = click.prompt('When should the restart take place (e.g.
' + next_hour + ') ', @@ -600,13 +624,14 @@ def restart(obj, cluster_name, member_names, force, role, p_any, scheduled, vers @ctl.command('reinit', help='Reinitialize cluster member') @click.argument('cluster_name')
+@option_citus_group @click.argument('member_names', nargs=-1) @option_force @click.option('--wait', help='Wait until reinitialization completes', is_flag=True) @click.pass_obj
-def reinit(obj, cluster_name, member_names, force, wait):
- cluster = get_dcs(obj, cluster_name).get_cluster()
- members = get_members(cluster, cluster_name, member_names, None, force, 'reinitialize')
+def reinit(obj, cluster_name, group, member_names, force, wait):
+ cluster = get_dcs(obj, cluster_name, group).get_cluster()
+ members = get_members(obj, cluster, cluster_name, member_names, 'replica', force, 'reinitialize', group=group) wait_on_members = [] for member in members: @@ -637,7 +662,7 @@ def reinit(obj, cluster_name, member_names, force, wait): wait_on_members.remove(member)
-def _do_failover_or_switchover(obj, action, cluster_name, master, candidate, force, scheduled=None):
+def _do_failover_or_switchover(obj, action, cluster_name, group, master, candidate, force, scheduled=None): """ We want to trigger a failover or switchover for the specified cluster name. @@ -645,8 +670,18 @@ def _do_failover_or_switchover(obj, action, cluster_name, master, candidate, for If so, we trigger an action and keep the client up to date. """
- dcs = get_dcs(obj, cluster_name)
+ dcs = get_dcs(obj, cluster_name, group) cluster = dcs.get_cluster()
+ click.echo('Current cluster topology')
+ output_members(obj, cluster, cluster_name, group=group)
+
+ if obj.get('citus') and group is None:
+ if force:
+ raise PatroniCtlException('For Citus clusters the --group must be specified')
+ else:
+ group = click.prompt('Citus group', type=int)
+ dcs = get_dcs(obj, cluster_name, group)
+ cluster = dcs.get_cluster() if action == 'switchover' and (cluster.leader is None or not cluster.leader.name): raise PatroniCtlException('This cluster has no master') @@ -700,9 +735,6 @@ def _do_failover_or_switchover(obj, action, cluster_name, master, candidate, for logging.debug(failover_value) # By now we have established that the leader exists and the candidate exists
- click.echo('Current cluster topology')
- output_members(dcs.get_cluster(), cluster_name)
- if not force: demote_msg = ', demoting current master ' + master if master else '' if scheduled_at_str: @@ -738,30 +770,32 @@ def _do_failover_or_switchover(obj, action, cluster_name, master, candidate, for click.echo('{0} Could not {1} using Patroni api, falling back to DCS'.format(timestamp(), action)) dcs.manual_failover(master, candidate, scheduled_at=scheduled_at)
- output_members(cluster, cluster_name)
+ output_members(obj, cluster, cluster_name, group=group) @ctl.command('failover', help='Failover to a replica') @arg_cluster_name
+@option_citus_group @click.option('--master', help='The name of the current master', default=None) @click.option('--candidate', help='The name of the candidate', default=None) @option_force @click.pass_obj
-def failover(obj, cluster_name, master, candidate, force):
+def failover(obj, cluster_name, group, master, candidate, force): action = 'switchover' if master else 'failover'
- _do_failover_or_switchover(obj, action, cluster_name, master, candidate, force)
+ _do_failover_or_switchover(obj, action, cluster_name, group, master, candidate, force) @ctl.command('switchover', help='Switchover to a replica') @arg_cluster_name
+@option_citus_group
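+# Note: switchover (like failover above) deliberately takes the plain --group option
+# with no default from the patronictl configuration; when a Citus cluster is involved
+# and no --group was given, _do_failover_or_switchover() prompts for the group
+# interactively (or refuses outright when --force is set).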
@click.option('--master', help='The name of the current master', default=None) @click.option('--candidate', help='The name of the candidate', default=None) @click.option('--scheduled', help='Timestamp of a scheduled switchover in unambiguous format (e.g. ISO 8601)', default=None) @option_force @click.pass_obj -def switchover(obj, cluster_name, master, candidate, force, scheduled): - _do_failover_or_switchover(obj, 'switchover', cluster_name, master, candidate, force, scheduled) +def switchover(obj, cluster_name, group, master, candidate, force, scheduled): + _do_failover_or_switchover(obj, 'switchover', cluster_name, group, master, candidate, force, scheduled) def generate_topology(level, member, topology): @@ -791,48 +825,7 @@ def topology_sort(members): yield member -def output_members(cluster, name, extended=False, fmt='pretty'): - rows = [] - logging.debug(cluster) - initialize = {None: 'uninitialized', '': 'initializing'}.get(cluster.initialize, cluster.initialize) - cluster = cluster_as_json(cluster) - - columns = ['Cluster', 'Member', 'Host', 'Role', 'State', 'TL', 'Lag in MB'] - for c in ('Pending restart', 'Scheduled restart', 'Tags'): - if extended or any(m.get(c.lower().replace(' ', '_')) for m in cluster['members']): - columns.append(c) - - # Show Host as 'host:port' if somebody is running on non-standard port or two nodes are running on the same host - members = [m for m in cluster['members'] if 'host' in m] - append_port = any('port' in m and m['port'] != 5432 for m in members) or\ - len(set(m['host'] for m in members)) < len(members) - - sort = topology_sort if fmt == 'topology' else iter - for m in sort(cluster['members']): - logging.debug(m) - - lag = m.get('lag', '') - m.update(cluster=name, member=m['name'], host=m.get('host', ''), tl=m.get('timeline', ''), - role=m['role'].replace('_', ' ').title(), - lag_in_mb=round(lag/1024/1024) if isinstance(lag, six.integer_types) else lag, - pending_restart='*' if m.get('pending_restart') else '') - - if append_port and m['host'] and m.get('port'): - m['host'] = ':'.join([m['host'], str(m['port'])]) - - if 'scheduled_restart' in m: - value = m['scheduled_restart']['schedule'] - if 'postgres_version' in m['scheduled_restart']: - value += ' if version < {0}'.format(m['scheduled_restart']['postgres_version']) - m['scheduled_restart'] = value - - rows.append([m.get(n.lower().replace(' ', '_'), '') for n in columns]) - - print_output(columns, rows, {'Lag in MB': 'r', 'TL': 'r'}, fmt, ' Cluster: {0} ({1}) '.format(name, initialize)) - - if fmt not in ('pretty', 'topology'): # Omit service info when using machine-readable formats - return - +def get_cluster_service_info(cluster): service_info = [] if cluster.get('pause'): service_info.append('Maintenance mode: on') @@ -843,44 +836,109 @@ def output_members(cluster, name, extended=False, fmt='pretty'): if name in cluster['scheduled_switchover']: info += '\n{0:>24}: {1}'.format(name, cluster['scheduled_switchover'][name]) service_info.append(info) + return service_info - if service_info: - click.echo(' ' + '\n '.join(service_info)) + +def output_members(obj, cluster, name, extended=False, fmt='pretty', group=None): + rows = [] + logging.debug(cluster) + + initialize = {None: 'uninitialized', '': 'initializing'}.get(cluster.initialize, cluster.initialize) + columns = ['Cluster', 'Member', 'Host', 'Role', 'State', 'TL', 'Lag in MB'] + + clusters = {group or 0: cluster_as_json(cluster)} + + is_citus_cluster = obj.get('citus') + if is_citus_cluster: + columns.insert(1, 'Group') + if group is 
None: + clusters.update({g: cluster_as_json(c) for g, c in cluster.workers.items()}) + + all_members = [m for c in clusters.values() for m in c['members'] if 'host' in m] + + for c in ('Pending restart', 'Scheduled restart', 'Tags'): + if extended or any(m.get(c.lower().replace(' ', '_')) for m in all_members): + columns.append(c) + + # Show Host as 'host:port' if somebody is running on non-standard port or two nodes are running on the same host + append_port = any('port' in m and m['port'] != 5432 for m in all_members) or\ + len(set(m['host'] for m in all_members)) < len(all_members) + + sort = topology_sort if fmt == 'topology' else iter + for g, cluster in sorted(clusters.items()): + for member in sort(cluster['members']): + logging.debug(member) + + lag = member.get('lag', '') + member.update(cluster=name, member=member['name'], group=g, + host=member.get('host', ''), tl=member.get('timeline', ''), + role=member['role'].replace('_', ' ').title(), + lag_in_mb=round(lag/1024/1024) if isinstance(lag, six.integer_types) else lag, + pending_restart='*' if member.get('pending_restart') else '') + + if append_port and member['host'] and member.get('port'): + member['host'] = ':'.join([member['host'], str(member['port'])]) + + if 'scheduled_restart' in member: + value = member['scheduled_restart']['schedule'] + if 'postgres_version' in member['scheduled_restart']: + value += ' if version < {0}'.format(member['scheduled_restart']['postgres_version']) + member['scheduled_restart'] = value + + rows.append([member.get(n.lower().replace(' ', '_'), '') for n in columns]) + + title = 'Citus cluster' if is_citus_cluster else 'Cluster' + group_title = '' if group is None else 'group: {0}, '.format(group) + title_details = group_title and ' ({0}{1})'.format(group_title, initialize) + title = ' {0}: {1}{2} '.format(title, name, title_details) + print_output(columns, rows, {'Group': 'r', 'Lag in MB': 'r', 'TL': 'r'}, fmt, title) + + if fmt not in ('pretty', 'topology'): # Omit service info when using machine-readable formats + return + + for g, cluster in sorted(clusters.items()): + service_info = get_cluster_service_info(cluster) + if service_info: + if is_citus_cluster and group is None: + click.echo('Citus group: {0}'.format(g)) + click.echo(' ' + '\n '.join(service_info)) @ctl.command('list', help='List the Patroni members for a given Patroni') @click.argument('cluster_names', nargs=-1) +@option_citus_group @click.option('--extended', '-e', help='Show some extra information', is_flag=True) @click.option('--timestamp', '-t', 'ts', help='Print timestamp', is_flag=True) @option_format @option_watch @option_watchrefresh @click.pass_obj -def members(obj, cluster_names, fmt, watch, w, extended, ts): +def members(obj, cluster_names, group, fmt, watch, w, extended, ts): if not cluster_names: if 'scope' in obj: cluster_names = [obj['scope']] if not cluster_names: return logging.warning('Listing members: No cluster names were provided') - for cluster_name in cluster_names: - dcs = get_dcs(obj, cluster_name) + for _ in watching(w, watch): + if ts: + click.echo(timestamp(0)) - for _ in watching(w, watch): - if ts: - click.echo(timestamp(0)) + for cluster_name in cluster_names: + dcs = get_dcs(obj, cluster_name, group) cluster = dcs.get_cluster() - output_members(cluster, cluster_name, extended, fmt) + output_members(obj, cluster, cluster_name, extended, fmt, group) @ctl.command('topology', help='Prints ASCII topology for given cluster') @click.argument('cluster_names', nargs=-1) +@option_citus_group @option_watch 
@option_watchrefresh @click.pass_obj @click.pass_context -def topology(ctx, obj, cluster_names, watch, w): +def topology(ctx, obj, cluster_names, group, watch, w): ctx.forward(members, fmt='topology') @@ -921,10 +979,11 @@ def set_defaults(config, cluster_name): @ctl.command('scaffold', help='Create a structure for the cluster in DCS') @click.argument('cluster_name') +@option_citus_group @click.option('--sysid', '-s', help='System ID of the cluster to put into the initialize key', default="") @click.pass_obj -def scaffold(obj, cluster_name, sysid): - dcs = get_dcs(obj, cluster_name) +def scaffold(obj, cluster_name, group, sysid): + dcs = get_dcs(obj, cluster_name, group) cluster = dcs.get_cluster() if cluster and cluster.initialize is not None: raise PatroniCtlException("This cluster is already initialized") @@ -945,18 +1004,19 @@ def scaffold(obj, cluster_name, sysid): @ctl.command('flush', help='Discard scheduled events') @click.argument('cluster_name') +@option_citus_group @click.argument('member_names', nargs=-1) @click.argument('target', type=click.Choice(['restart', 'switchover'])) @click.option('--role', '-r', help='Flush only members with this role', default='any', type=click.Choice(['master', 'replica', 'any'])) @option_force @click.pass_obj -def flush(obj, cluster_name, member_names, force, role, target): - dcs = get_dcs(obj, cluster_name) +def flush(obj, cluster_name, group, member_names, force, role, target): + dcs = get_dcs(obj, cluster_name, group) cluster = dcs.get_cluster() if target == 'restart': - for member in get_members(cluster, cluster_name, member_names, role, force, 'flush'): + for member in get_members(obj, cluster, cluster_name, member_names, role, force, 'flush', group=group): if member.data.get('scheduled_restart'): r = request_patroni(member, 'delete', 'restart') check_response(r, member.name, 'flush scheduled restart') @@ -1002,8 +1062,8 @@ def wait_until_pause_is_applied(dcs, paused, old_cluster): return click.echo('Success: cluster management is {0}'.format(paused and 'paused' or 'resumed')) -def toggle_pause(config, cluster_name, paused, wait): - dcs = get_dcs(config, cluster_name) +def toggle_pause(config, cluster_name, group, paused, wait): + dcs = get_dcs(config, cluster_name, group) cluster = dcs.get_cluster() if cluster.is_paused() == paused: raise PatroniCtlException('Cluster is {0} paused'.format(paused and 'already' or 'not')) @@ -1031,18 +1091,20 @@ def toggle_pause(config, cluster_name, paused, wait): @ctl.command('pause', help='Disable auto failover') @arg_cluster_name +@option_default_citus_group @click.pass_obj @click.option('--wait', help='Wait until pause is applied on all nodes', is_flag=True) -def pause(obj, cluster_name, wait): - return toggle_pause(obj, cluster_name, True, wait) +def pause(obj, cluster_name, group, wait): + return toggle_pause(obj, cluster_name, group, True, wait) @ctl.command('resume', help='Resume auto failover') @arg_cluster_name +@option_default_citus_group @click.option('--wait', help='Wait until pause is cleared on all nodes', is_flag=True) @click.pass_obj -def resume(obj, cluster_name, wait): - return toggle_pause(obj, cluster_name, False, wait) +def resume(obj, cluster_name, group, wait): + return toggle_pause(obj, cluster_name, group, False, wait) @contextmanager @@ -1199,6 +1261,7 @@ def invoke_editor(before_editing, cluster_name): @ctl.command('edit-config', help="Edit cluster configuration") @arg_cluster_name +@option_default_citus_group @click.option('--quiet', '-q', is_flag=True, help='Do not show changes') 
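+# edit-config, like pause/resume, show-config and history, uses
+# option_default_citus_group, so --group falls back to the citus.group value from
+# the patronictl configuration when it is not given on the command line.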
@click.option('--set', '-s', 'kvpairs', multiple=True, help='Set specific configuration value. Can be specified multiple times') @@ -1210,8 +1273,8 @@ def invoke_editor(before_editing, cluster_name): ' Use - for stdin.') @option_force @click.pass_obj
-def edit_config(obj, cluster_name, force, quiet, kvpairs, pgkvpairs, apply_filename, replace_filename):
- dcs = get_dcs(obj, cluster_name)
+def edit_config(obj, cluster_name, group, force, quiet, kvpairs, pgkvpairs, apply_filename, replace_filename):
+ dcs = get_dcs(obj, cluster_name, group) cluster = dcs.get_cluster() before_editing = format_config_for_editing(cluster.config.data) @@ -1253,9 +1316,10 @@ def edit_config(obj, cluster_name, force, quiet, kvpairs, pgkvpairs, apply_filen @ctl.command('show-config', help="Show cluster configuration") @arg_cluster_name
+@option_default_citus_group @click.pass_obj
-def show_config(obj, cluster_name):
- cluster = get_dcs(obj, cluster_name).get_cluster()
+def show_config(obj, cluster_name, group):
+ cluster = get_dcs(obj, cluster_name, group).get_cluster() click.echo(format_config_for_editing(cluster.config.data)) @@ -1263,16 +1327,17 @@ def show_config(obj, cluster_name): @ctl.command('version', help='Output version of patronictl command or a running Patroni instance') @click.argument('cluster_name', required=False) @click.argument('member_names', nargs=-1)
+@option_citus_group @click.pass_obj
-def version(obj, cluster_name, member_names):
+def version(obj, cluster_name, group, member_names): click.echo("patronictl version {0}".format(__version__)) if not cluster_name: return click.echo("")
- cluster = get_dcs(obj, cluster_name).get_cluster()
- for m in cluster.members:
+ cluster = get_dcs(obj, cluster_name, group).get_cluster()
+ for m in get_all_members(obj, cluster, group, 'any'): if m.api_url: if not member_names or m.name in member_names: try: @@ -1288,10 +1353,11 @@ def version(obj, cluster_name, member_names): @ctl.command('history', help="Show the history of failovers/switchovers") @arg_cluster_name
+@option_default_citus_group @option_format @click.pass_obj
-def history(obj, cluster_name, fmt):
- cluster = get_dcs(obj, cluster_name).get_cluster()
+def history(obj, cluster_name, group, fmt):
+ cluster = get_dcs(obj, cluster_name, group).get_cluster() history = cluster.history and cluster.history.lines or [] table_header_row = ['TL', 'LSN', 'Reason', 'Timestamp', 'New Leader'] for line in history: diff --git a/patroni/dcs/__init__.py b/patroni/dcs/__init__.py index c27ffa50..c1684787 100644 --- a/patroni/dcs/__init__.py +++ b/patroni/dcs/__init__.py @@ -20,6 +20,8 @@ from threading import Event, Lock from ..exceptions import PatroniFatalException from ..utils import deep_compare, parse_bool, uri
+CITUS_COORDINATOR_GROUP_ID = 0
+citus_group_re = re.compile('^(0|[1-9][0-9]*)$') slot_name_re = re.compile('^[a-z0-9_]{1,63}$') logger = logging.getLogger(__name__) @@ -94,6 +96,9 @@ def get_dcs(config): # propagate some parameters config[name].update({p: config[p] for p in ('namespace', 'name', 'scope', 'loop_wait', 'patronictl', 'ttl', 'retry_timeout') if p in config})
+ # From the citus section we only need the "group" parameter, but we will propagate everything just in case.
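+    # e.g. with 'citus: {group: 2, database: citus}' the selected DCS config gains
+    # group=2, which AbstractDCS.client_path() below appends to the key prefix, so
+    # all keys of this member live under /service/<scope>/2/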
+ if isinstance(config.get('citus'), dict): + config[name].update(config['citus']) return item(config[name]) except ImportError: logger.debug('Failed to import %s', module_name) @@ -444,7 +449,8 @@ class TimelineHistory(namedtuple('TimelineHistory', 'index,value,lines')): return TimelineHistory(index, value, lines) -class Cluster(namedtuple('Cluster', 'initialize,config,leader,last_lsn,members,failover,sync,history,slots,failsafe')): +class Cluster(namedtuple('Cluster', 'initialize,config,leader,last_lsn,members,' + 'failover,sync,history,slots,failsafe,workers')): """Immutable object (namedtuple) which represents PostgreSQL cluster. Consists of the following fields: @@ -459,7 +465,13 @@ class Cluster(namedtuple('Cluster', 'initialize,config,leader,last_lsn,members,f :param history: reference to `TimelineHistory` object :param slots: state of permanent logical replication slots on the primary in the format: {"slot_name": int} :param failsafe: failsafe topology. Node is allowed to become the leader only if its name is found in this list. - """ + :param workers: workers of the Citus cluster, optional. Format: {int(group): Cluster()}""" + + def __new__(cls, *args): + # Make the workers argument optional + if len(cls._fields) == len(args) + 1: + args = args + ({},) + return super(Cluster, cls).__new__(cls, *args) @property def leader_name(self): @@ -665,6 +677,7 @@ class AbstractDCS(object): """ self._name = config['name'] self._base_path = re.sub('/+', '/', '/'.join(['', config.get('namespace', 'service'), config['scope']])) + self._citus_group = str(config['group']) if isinstance(config.get('group'), six.integer_types) else None self._set_loop_wait(config.get('loop_wait', 10)) self._ctl = bool(config.get('patronictl', False)) @@ -678,7 +691,11 @@ class AbstractDCS(object): self.event = Event() def client_path(self, path): - return '/'.join([self._base_path, path.lstrip('/')]) + components = [self._base_path] + if self._citus_group: + components.append(self._citus_group) + components.append(path.lstrip('/')) + return '/'.join(components) @property def initialize_path(self): @@ -753,23 +770,62 @@ class AbstractDCS(object): return self._last_seen @abc.abstractmethod - def _load_cluster(self): - """Internally this method should build `Cluster` object which - represents current state and topology of the cluster in DCS. - this method supposed to be called only by `get_cluster` method. + def _cluster_loader(self, path): + """Load and build the `Cluster` object from DCS, which + represents a single Patroni cluster. - raise `~DCSError` in case of communication or other problems with DCS. - If the current node was running as a master and exception raised, - instance would be demoted.""" + :param path: the path in DCS where to load Cluster(s) from. + :returns: `Cluster`""" + + def _citus_cluster_loader(self, path): + """Load and build `Cluster` objects from DCS that represent all + Patroni clusters from a single Citus cluster. + + :param path: the path in DCS where to load Cluster(s) from. + :returns: all Citus groups as `dict`, with group ids as keys""" + + @abc.abstractmethod + def _load_cluster(self, path, loader): + """Internally this method should call the `loader` method that + will build a `Cluster` object which represents the current state and + topology of the cluster in DCS. This method is supposed to be + called only by the `get_cluster` method. + + :param path: the path in DCS where to load Cluster(s) from.
+ :param loader: one of `_cluster_loader` or `_citus_cluster_loader` + :raise: `~DCSError` in case of communication problems with DCS. + If the current node was running as a master and the exception + is raised, the instance will be demoted.""" def _bypass_caches(self): """Used only in zookeeper""" + def is_citus_coordinator(self): + return self._citus_group == str(CITUS_COORDINATOR_GROUP_ID) + + def get_citus_coordinator(self): + try: + path = '{0}/{1}/'.format(self._base_path, CITUS_COORDINATOR_GROUP_ID) + return self._load_cluster(path, self._cluster_loader) + except Exception as e: + logger.error('Failed to load Citus coordinator cluster from %s: %r', self.__class__.__name__, e) + + def _get_citus_cluster(self): + groups = self._load_cluster(self._base_path + '/', self._citus_cluster_loader) + if isinstance(groups, Cluster): # Zookeeper could return a cached version + cluster = groups + else: + cluster = groups.pop(CITUS_COORDINATOR_GROUP_ID, + Cluster(None, None, None, None, [], None, None, None, None, None)) + cluster.workers.update(groups) + return cluster + def get_cluster(self, force=False): if force: self._bypass_caches() try: - cluster = self._load_cluster() + cluster = self._get_citus_cluster() if self.is_citus_coordinator()\ + else self._load_cluster(self.client_path(''), self._cluster_loader) except Exception: self.reset_cluster() raise diff --git a/patroni/dcs/consul.py b/patroni/dcs/consul.py index 105a75ab..86809c10 100644 --- a/patroni/dcs/consul.py +++ b/patroni/dcs/consul.py @@ -8,14 +8,14 @@ import ssl import time import urllib3 -from collections import namedtuple +from collections import defaultdict, namedtuple from consul import ConsulException, NotFound, base from urllib3.exceptions import HTTPError from six.moves.urllib.parse import urlencode, urlparse, quote from six.moves.http_client import HTTPException -from . import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member,\ - SyncState, TimelineHistory, ReturnFalseException, catch_return_false_exception +from . 
import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, SyncState,\ + TimelineHistory, ReturnFalseException, catch_return_false_exception, citus_group_re from ..exceptions import DCSError from ..utils import deep_compare, parse_bool, Retry, RetryFailedError, split_host_port, uri, USER_AGENT @@ -190,6 +190,7 @@ class Consul(AbstractDCS): def __init__(self, config): super(Consul, self).__init__(config) + self._base_path = self._base_path[1:] self._scope = config['scope'] self._session = None self.__do_not_watch = False @@ -318,85 +319,95 @@ class Consul(AbstractDCS): logger.exception('refresh_session') raise ConsulError('Failed to renew/create session') - def client_path(self, path): - return super(Consul, self).client_path(path)[1:] - @staticmethod def member(node): return Member.from_node(node['ModifyIndex'], os.path.basename(node['Key']), node.get('Session'), node['Value']) - def _load_cluster(self): + def _cluster_from_nodes(self, nodes): + # get initialize flag + initialize = nodes.get(self._INITIALIZE) + initialize = initialize and initialize['Value'] + + # get global dynamic configuration + config = nodes.get(self._CONFIG) + config = config and ClusterConfig.from_node(config['ModifyIndex'], config['Value']) + + # get timeline history + history = nodes.get(self._HISTORY) + history = history and TimelineHistory.from_node(history['ModifyIndex'], history['Value']) + + # get last known leader lsn and slots + status = nodes.get(self._STATUS) + if status: + try: + status = json.loads(status['Value']) + last_lsn = status.get(self._OPTIME) + slots = status.get('slots') + except Exception: + slots = last_lsn = None + else: + last_lsn = nodes.get(self._LEADER_OPTIME) + last_lsn = last_lsn and last_lsn['Value'] + slots = None + try: - path = self.client_path('/') - _, results = self.retry(self._client.kv.get, path, recurse=True) + last_lsn = int(last_lsn) + except Exception: + last_lsn = 0 - if results is None: - raise NotFound + # get list of members + members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1] - nodes = {} - for node in results: + # get leader + leader = nodes.get(self._LEADER) + + if leader: + member = Member(-1, leader['Value'], None, {}) + member = ([m for m in members if m.name == leader['Value']] or [member])[0] + leader = Leader(leader['ModifyIndex'], leader.get('Session'), member) + + # failover key + failover = nodes.get(self._FAILOVER) + if failover: + failover = Failover.from_node(failover['ModifyIndex'], failover['Value']) + + # get synchronization state + sync = nodes.get(self._SYNC) + sync = SyncState.from_node(sync and sync['ModifyIndex'], sync and sync['Value']) + + # get failsafe topology + failsafe = nodes.get(self._FAILSAFE) + try: + failsafe = json.loads(failsafe['Value']) if failsafe else None + except Exception: + failsafe = None + + return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + + def _cluster_loader(self, path): + _, results = self.retry(self._client.kv.get, path, recurse=True) + if results is None: + raise NotFound + nodes = {} + for node in results: + node['Value'] = (node['Value'] or b'').decode('utf-8') + nodes[node['Key'][len(path):]] = node + + return self._cluster_from_nodes(nodes) + + def _citus_cluster_loader(self, path): + _, results = self.retry(self._client.kv.get, path, recurse=True) + clusters = defaultdict(dict) + for node in results or []: + key = node['Key'][len(path):].split('/', 1) + if len(key) == 2 and 
citus_group_re.match(key[0]): node['Value'] = (node['Value'] or b'').decode('utf-8') - nodes[node['Key'][len(path):].lstrip('/')] = node + clusters[int(key[0])][key[1]] = node + return {group: self._cluster_from_nodes(nodes) for group, nodes in clusters.items()} - # get initialize flag - initialize = nodes.get(self._INITIALIZE) - initialize = initialize and initialize['Value'] - - # get global dynamic configuration - config = nodes.get(self._CONFIG) - config = config and ClusterConfig.from_node(config['ModifyIndex'], config['Value']) - - # get timeline history - history = nodes.get(self._HISTORY) - history = history and TimelineHistory.from_node(history['ModifyIndex'], history['Value']) - - # get last known leader lsn and slots - status = nodes.get(self._STATUS) - if status: - try: - status = json.loads(status['Value']) - last_lsn = status.get(self._OPTIME) - slots = status.get('slots') - except Exception: - slots = last_lsn = None - else: - last_lsn = nodes.get(self._LEADER_OPTIME) - last_lsn = last_lsn and last_lsn['Value'] - slots = None - - try: - last_lsn = int(last_lsn) - except Exception: - last_lsn = 0 - - # get list of members - members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1] - - # get leader - leader = nodes.get(self._LEADER) - - if leader: - member = Member(-1, leader['Value'], None, {}) - member = ([m for m in members if m.name == leader['Value']] or [member])[0] - leader = Leader(leader['ModifyIndex'], leader.get('Session'), member) - - # failover key - failover = nodes.get(self._FAILOVER) - if failover: - failover = Failover.from_node(failover['ModifyIndex'], failover['Value']) - - # get synchronization state - sync = nodes.get(self._SYNC) - sync = SyncState.from_node(sync and sync['ModifyIndex'], sync and sync['Value']) - - # get failsafe topology - failsafe = nodes.get(self._FAILSAFE) - try: - failsafe = json.loads(failsafe['Value']) if failsafe else None - except Exception: - failsafe = None - - return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + def _load_cluster(self, path, loader): + try: + return loader(path) except NotFound: return Cluster(None, None, None, None, [], None, None, None, None, None) except Exception: diff --git a/patroni/dcs/etcd.py b/patroni/dcs/etcd.py index 3a8cf0c1..9908b863 100644 --- a/patroni/dcs/etcd.py +++ b/patroni/dcs/etcd.py @@ -10,6 +10,8 @@ import six import socket import time +from collections import defaultdict +from copy import deepcopy from dns.exception import DNSException from dns import resolver from urllib3 import Timeout @@ -19,8 +21,8 @@ from six.moves.http_client import HTTPException from six.moves.urllib_parse import urlparse from threading import Thread -from . import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member,\ - SyncState, TimelineHistory, ReturnFalseException, catch_return_false_exception +from . 
import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, SyncState,\ + TimelineHistory, ReturnFalseException, catch_return_false_exception, citus_group_re from ..exceptions import DCSError from ..request import get as requests_get from ..utils import Retry, RetryFailedError, split_host_port, uri, USER_AGENT @@ -480,6 +482,7 @@ class AbstractEtcd(AbstractDCS): sock.setsockopt(*opt) def get_etcd_client(self, config, client_cls): + config = deepcopy(config) if 'proxy' in config: config['use_proxies'] = True config['url'] = config['proxy'] @@ -604,71 +607,85 @@ class Etcd(AbstractEtcd): def member(node): return Member.from_node(node.modifiedIndex, os.path.basename(node.key), node.ttl, node.value) - def _load_cluster(self): + def _cluster_from_nodes(self, etcd_index, nodes): + # get initialize flag + initialize = nodes.get(self._INITIALIZE) + initialize = initialize and initialize.value + + # get global dynamic configuration + config = nodes.get(self._CONFIG) + config = config and ClusterConfig.from_node(config.modifiedIndex, config.value) + + # get timeline history + history = nodes.get(self._HISTORY) + history = history and TimelineHistory.from_node(history.modifiedIndex, history.value) + + # get last known leader lsn and slots + status = nodes.get(self._STATUS) + if status: + try: + status = json.loads(status.value) + last_lsn = status.get(self._OPTIME) + slots = status.get('slots') + except Exception: + slots = last_lsn = None + else: + last_lsn = nodes.get(self._LEADER_OPTIME) + last_lsn = last_lsn and last_lsn.value + slots = None + + try: + last_lsn = int(last_lsn) + except Exception: + last_lsn = 0 + + # get list of members + members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1] + + # get leader + leader = nodes.get(self._LEADER) + if leader: + member = Member(-1, leader.value, None, {}) + member = ([m for m in members if m.name == leader.value] or [member])[0] + index = etcd_index if etcd_index > leader.modifiedIndex else leader.modifiedIndex + 1 + leader = Leader(index, leader.ttl, member) + + # failover key + failover = nodes.get(self._FAILOVER) + if failover: + failover = Failover.from_node(failover.modifiedIndex, failover.value) + + # get synchronization state + sync = nodes.get(self._SYNC) + sync = SyncState.from_node(sync and sync.modifiedIndex, sync and sync.value) + + # get failsafe topology + failsafe = nodes.get(self._FAILSAFE) + try: + failsafe = json.loads(failsafe.value) if failsafe else None + except Exception: + failsafe = None + + return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + + def _cluster_loader(self, path): + result = self.retry(self._client.read, path, recursive=True) + nodes = {node.key[len(result.key):].lstrip('/'): node for node in result.leaves} + return self._cluster_from_nodes(result.etcd_index, nodes) + + def _citus_cluster_loader(self, path): + clusters = defaultdict(dict) + result = self.retry(self._client.read, path, recursive=True) + for node in result.leaves: + key = node.key[len(result.key):].lstrip('/').split('/', 1) + if len(key) == 2 and citus_group_re.match(key[0]): + clusters[int(key[0])][key[1]] = node + return {group: self._cluster_from_nodes(result.etcd_index, nodes) for group, nodes in clusters.items()} + + def _load_cluster(self, path, loader): cluster = None try: - result = self.retry(self._client.read, self.client_path(''), recursive=True) - nodes = {node.key[len(result.key):].lstrip('/'): node for node in 
result.leaves} - - # get initialize flag - initialize = nodes.get(self._INITIALIZE) - initialize = initialize and initialize.value - - # get global dynamic configuration - config = nodes.get(self._CONFIG) - config = config and ClusterConfig.from_node(config.modifiedIndex, config.value) - - # get timeline history - history = nodes.get(self._HISTORY) - history = history and TimelineHistory.from_node(history.modifiedIndex, history.value) - - # get last know leader lsn and slots - status = nodes.get(self._STATUS) - if status: - try: - status = json.loads(status.value) - last_lsn = status.get(self._OPTIME) - slots = status.get('slots') - except Exception: - slots = last_lsn = None - else: - last_lsn = nodes.get(self._LEADER_OPTIME) - last_lsn = last_lsn and last_lsn.value - slots = None - - try: - last_lsn = int(last_lsn) - except Exception: - last_lsn = 0 - - # get list of members - members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1] - - # get leader - leader = nodes.get(self._LEADER) - if leader: - member = Member(-1, leader.value, None, {}) - member = ([m for m in members if m.name == leader.value] or [member])[0] - index = result.etcd_index if result.etcd_index > leader.modifiedIndex else leader.modifiedIndex + 1 - leader = Leader(index, leader.ttl, member) - - # failover key - failover = nodes.get(self._FAILOVER) - if failover: - failover = Failover.from_node(failover.modifiedIndex, failover.value) - - # get synchronization state - sync = nodes.get(self._SYNC) - sync = SyncState.from_node(sync and sync.modifiedIndex, sync and sync.value) - - # get failsafe topology - failsafe = nodes.get(self._FAILSAFE) - try: - failsafe = json.loads(failsafe.value) if failsafe else None - except Exception: - failsafe = None - - cluster = Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + cluster = loader(path) except etcd.EtcdKeyNotFound: cluster = Cluster(None, None, None, None, [], None, None, None, None, None) except Exception as e: diff --git a/patroni/dcs/etcd3.py b/patroni/dcs/etcd3.py index 0c2d144b..21e705eb 100644 --- a/patroni/dcs/etcd3.py +++ b/patroni/dcs/etcd3.py @@ -10,11 +10,12 @@ import sys import time import urllib3 +from collections import defaultdict from threading import Condition, Lock, Thread from urllib3.exceptions import ReadTimeoutError, ProtocolError -from . import ClusterConfig, Cluster, Failover, Leader, Member,\ - SyncState, TimelineHistory, ReturnFalseException, catch_return_false_exception +from . 
import ClusterConfig, Cluster, Failover, Leader, Member, SyncState,\ + TimelineHistory, ReturnFalseException, catch_return_false_exception, citus_group_re from .etcd import AbstractEtcdClientWithFailover, AbstractEtcd, catch_etcd_errors from ..exceptions import DCSError, PatroniException from ..utils import deep_compare, enable_keepalive, iter_response_objects, RetryFailedError, USER_AGENT @@ -558,13 +559,18 @@ class PatroniEtcd3Client(Etcd3Client): raise RetryFailedError('Exceeded retry deadline') self._kv_cache.condition.wait(timeout) - def get_cluster(self): - if self._kv_cache: + def get_cluster(self, path): + if self._kv_cache and path.startswith(self._etcd3.cluster_prefix): with self._kv_cache.condition: self._wait_cache(self._etcd3._retry.deadline) - return self._kv_cache.copy() + ret = self._kv_cache.copy() else: - return self._etcd3.retry(self.prefix, self._etcd3.cluster_prefix).get('kvs', []) + ret = self._etcd3.retry(self.prefix, path).get('kvs', []) + for node in ret: + node.update({'key': base64_decode(node['key']), + 'value': base64_decode(node.get('value', '')), + 'lease': node.get('lease')}) + return ret def call_rpc(self, method, fields, retry=None): ret = super(PatroniEtcd3Client, self).call_rpc(method, fields, retry) @@ -641,85 +647,94 @@ class Etcd3(AbstractEtcd): @property def cluster_prefix(self): - return self.client_path('') + return self._base_path + '/' if self.is_citus_coordinator() else self.client_path('') @staticmethod def member(node): return Member.from_node(node['mod_revision'], os.path.basename(node['key']), node['lease'], node['value']) - def _load_cluster(self): + def _cluster_from_nodes(self, nodes): + # get initialize flag + initialize = nodes.get(self._INITIALIZE) + initialize = initialize and initialize['value'] + + # get global dynamic configuration + config = nodes.get(self._CONFIG) + config = config and ClusterConfig.from_node(config['mod_revision'], config['value']) + + # get timeline history + history = nodes.get(self._HISTORY) + history = history and TimelineHistory.from_node(history['mod_revision'], history['value']) + + # get last known leader lsn and slots + status = nodes.get(self._STATUS) + if status: + try: + status = json.loads(status['value']) + last_lsn = status.get(self._OPTIME) + slots = status.get('slots') + except Exception: + slots = last_lsn = None + else: + last_lsn = nodes.get(self._LEADER_OPTIME) + last_lsn = last_lsn and last_lsn['value'] + slots = None + + try: + last_lsn = int(last_lsn) + except Exception: + last_lsn = 0 + + # get list of members + members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1] + + # get leader + leader = nodes.get(self._LEADER) + if not self._ctl and leader and leader['value'] == self._name and self._lease != leader.get('lease'): + logger.warning('I am the leader but not owner of the lease') + + if leader: + member = Member(-1, leader['value'], None, {}) + member = ([m for m in members if m.name == leader['value']] or [member])[0] + leader = Leader(leader['mod_revision'], leader['lease'], member) + + # failover key + failover = nodes.get(self._FAILOVER) + if failover: + failover = Failover.from_node(failover['mod_revision'], failover['value']) + + # get synchronization state + sync = nodes.get(self._SYNC) + sync = SyncState.from_node(sync and sync['mod_revision'], sync and sync['value']) + + # get failsafe topology + failsafe = nodes.get(self._FAILSAFE) + try: + failsafe = json.loads(failsafe['value']) if failsafe else None + except Exception: 
failsafe = None + + return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + + def _cluster_loader(self, path): + nodes = {node['key'][len(path):]: node + for node in self._client.get_cluster(path) + if node['key'].startswith(path)} + return self._cluster_from_nodes(nodes) + + def _citus_cluster_loader(self, path): + clusters = defaultdict(dict) + path = self._base_path + '/' + for node in self._client.get_cluster(path): + key = node['key'][len(path):].split('/', 1) + if len(key) == 2 and citus_group_re.match(key[0]): + clusters[int(key[0])][key[1]] = node + return {group: self._cluster_from_nodes(nodes) for group, nodes in clusters.items()} + + def _load_cluster(self, path, loader): cluster = None try: - path_len = len(self.cluster_prefix) - - nodes = {} - for node in self._client.get_cluster(): - node['key'] = base64_decode(node['key']) - node['value'] = base64_decode(node.get('value', '')) - node['lease'] = node.get('lease') - nodes[node['key'][path_len:].lstrip('/')] = node - - # get initialize flag - initialize = nodes.get(self._INITIALIZE) - initialize = initialize and initialize['value'] - - # get global dynamic configuration - config = nodes.get(self._CONFIG) - config = config and ClusterConfig.from_node(config['mod_revision'], config['value']) - - # get timeline history - history = nodes.get(self._HISTORY) - history = history and TimelineHistory.from_node(history['mod_revision'], history['value']) - - # get last know leader lsn and slots - status = nodes.get(self._STATUS) - if status: - try: - status = json.loads(status['value']) - last_lsn = status.get(self._OPTIME) - slots = status.get('slots') - except Exception: - slots = last_lsn = None - else: - last_lsn = nodes.get(self._LEADER_OPTIME) - last_lsn = last_lsn and last_lsn['value'] - slots = None - - try: - last_lsn = int(last_lsn) - except Exception: - last_lsn = 0 - - # get list of members - members = [self.member(n) for k, n in nodes.items() if k.startswith(self._MEMBERS) and k.count('/') == 1] - - # get leader - leader = nodes.get(self._LEADER) - if not self._ctl and leader and leader['value'] == self._name and self._lease != leader.get('lease'): - logger.warning('I am the leader but not owner of the lease') - - if leader: - member = Member(-1, leader['value'], None, {}) - member = ([m for m in members if m.name == leader['value']] or [member])[0] - leader = Leader(leader['mod_revision'], leader['lease'], member) - - # failover key - failover = nodes.get(self._FAILOVER) - if failover: - failover = Failover.from_node(failover['mod_revision'], failover['value']) - - # get synchronization state - sync = nodes.get(self._SYNC) - sync = SyncState.from_node(sync and sync['mod_revision'], sync and sync['value']) - - # get failsafe topology - failsafe = nodes.get(self._FAILSAFE) - try: - failsafe = json.loads(failsafe['value']) if failsafe else None - except Exception: - failsafe = None - - cluster = Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + cluster = loader(path) except UnsupportedEtcdVersion: raise except Exception as e: @@ -853,7 +868,7 @@ class Etcd3(AbstractEtcd): @catch_etcd_errors def delete_cluster(self): - return self.retry(self._client.deleteprefix, self.cluster_prefix) + return self.retry(self._client.deleteprefix, self.client_path('')) @catch_etcd_errors def set_history_value(self, value): diff --git a/patroni/dcs/exhibitor.py b/patroni/dcs/exhibitor.py index 70066d65..8b7b5914 100644 --- a/patroni/dcs/exhibitor.py 
+++ b/patroni/dcs/exhibitor.py @@ -68,7 +68,7 @@ class Exhibitor(ZooKeeper): config['hosts'] = self._ensemble_provider.zookeeper_hosts super(Exhibitor, self).__init__(config) - def _load_cluster(self): + def _load_cluster(self, path, loader): if self._ensemble_provider.poll(): self._client.set_hosts(self._ensemble_provider.zookeeper_hosts) - return super(Exhibitor, self)._load_cluster() + return super(Exhibitor, self)._load_cluster(path, loader) diff --git a/patroni/dcs/kubernetes.py b/patroni/dcs/kubernetes.py index c3bbd0bc..5e49a5b8 100644 --- a/patroni/dcs/kubernetes.py +++ b/patroni/dcs/kubernetes.py @@ -14,12 +14,15 @@ import time import urllib3 import yaml +from collections import defaultdict +from copy import deepcopy from urllib3 import Timeout from urllib3.exceptions import HTTPError from six.moves.http_client import HTTPException from threading import Condition, Lock, Thread -from . import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, SyncState, TimelineHistory +from . import AbstractDCS, Cluster, ClusterConfig, Failover, Leader, Member, SyncState,\ + TimelineHistory, CITUS_COORDINATOR_GROUP_ID, citus_group_re from ..exceptions import DCSError from ..utils import deep_compare, iter_response_objects, keepalive_socket_options,\ Retry, RetryFailedError, tzutc, uri, USER_AGENT @@ -687,8 +690,10 @@ class ObjectCache(Thread): class Kubernetes(AbstractDCS): + _CITUS_LABEL = 'citus-group' + def __init__(self, config): - self._labels = config['labels'] + self._labels = deepcopy(config['labels']) self._labels[config.get('scope_label', 'cluster-name')] = config['scope'] self._label_selector = ','.join('{0}={1}'.format(k, v) for k, v in self._labels.items()) self._namespace = config.get('namespace') or 'default' @@ -696,6 +701,9 @@ class Kubernetes(AbstractDCS): self._ca_certs = os.environ.get('PATRONI_KUBERNETES_CACERT', config.get('cacert')) or SERVICE_CERT_FILENAME config['namespace'] = '' super(Kubernetes, self).__init__(config) + if self._citus_group: + self._labels[self._CITUS_LABEL] = self._citus_group + self._retry = Retry(deadline=config['retry_timeout'], max_delay=1, max_tries=-1, retry_exceptions=KubernetesRetriableException) self._ttl = None @@ -755,7 +763,7 @@ class Kubernetes(AbstractDCS): @property def leader_path(self): - return self._base_path[1:] if self._api.use_endpoints else super(Kubernetes, self).leader_path + return super(Kubernetes, self).leader_path[:-7 if self._api.use_endpoints else None] def set_ttl(self, ttl): ttl = int(ttl) @@ -787,96 +795,137 @@ class Kubernetes(AbstractDCS): raise RetryFailedError('Exceeded retry deadline') self._condition.wait(timeout) - def _load_cluster(self): + def _cluster_from_nodes(self, group, nodes, pods): + members = [self.member(pod) for pod in pods] + path = self._base_path[1:] + '-' + if group: + path += group + '-' + + config = nodes.get(path + self._CONFIG) + metadata = config and config.metadata + annotations = metadata and metadata.annotations or {} + + # get initialize flag + initialize = annotations.get(self._INITIALIZE) + + # get global dynamic configuration + config = ClusterConfig.from_node(metadata and metadata.resource_version, + annotations.get(self._CONFIG) or '{}', + metadata.resource_version if self._CONFIG in annotations else 0) + + # get timeline history + history = TimelineHistory.from_node(metadata and metadata.resource_version, + annotations.get(self._HISTORY) or '[]') + + leader_path = path[:-1] if self._api.use_endpoints else path + self._LEADER + leader = nodes.get(leader_path) + metadata 
= leader and leader.metadata + if leader_path == self.leader_path: # We want to memorize leader_resource_version only for our cluster + self._leader_resource_version = metadata.resource_version if metadata else None + annotations = metadata and metadata.annotations or {} + + # get last known leader lsn + last_lsn = annotations.get(self._OPTIME) + try: + last_lsn = 0 if last_lsn is None else int(last_lsn) + except Exception: + last_lsn = 0 + + # get permanent slots state (confirmed_flush_lsn) + slots = annotations.get('slots') + try: + slots = slots and json.loads(slots) + except Exception: + slots = None + + # get failsafe topology + failsafe = annotations.get(self._FAILSAFE) + try: + failsafe = json.loads(failsafe) if failsafe else None + except Exception: + failsafe = None + + # get leader + leader_record = {n: annotations.get(n) for n in (self._LEADER, 'acquireTime', + 'ttl', 'renewTime', 'transitions') if n in annotations} + # We want to memorize leader_observed_record and update leader_observed_time only for our cluster + if leader_path == self.leader_path and (leader_record or self._leader_observed_record)\ + and leader_record != self._leader_observed_record: + self._leader_observed_record = leader_record + self._leader_observed_time = time.time() + + leader = leader_record.get(self._LEADER) + try: + ttl = int(leader_record.get('ttl')) or self._ttl + except (TypeError, ValueError): + ttl = self._ttl + + # We want to check validity of the leader record only for our own cluster + if leader_path == self.leader_path and\ + not (metadata and self._leader_observed_time and self._leader_observed_time + ttl >= time.time()): + leader = None + + if metadata: + member = Member(-1, leader, None, {}) + member = ([m for m in members if m.name == leader] or [member])[0] + leader = Leader(metadata.resource_version, None, member) + + # failover key + failover = nodes.get(path + self._FAILOVER) + metadata = failover and failover.metadata + failover = Failover.from_node(metadata and metadata.resource_version, + metadata and (metadata.annotations or {}).copy()) + + # get synchronization state + sync = nodes.get(path + self._SYNC) + metadata = sync and sync.metadata + sync = SyncState.from_node(metadata and metadata.resource_version, metadata and metadata.annotations) + + return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + + def _cluster_loader(self, path): + return self._cluster_from_nodes(path['group'], path['nodes'], path['pods']) + + def _citus_cluster_loader(self, path): + clusters = defaultdict(lambda: {'pods': [], 'nodes': {}}) + + for pod in path['pods']: + group = pod.metadata.labels.get(self._CITUS_LABEL) + if group and citus_group_re.match(group): + clusters[group]['pods'].append(pod) + + for name, kind in path['nodes'].items(): + group = kind.metadata.labels.get(self._CITUS_LABEL) + if group and citus_group_re.match(group): + clusters[group]['nodes'][name] = kind + return {int(group): self._cluster_from_nodes(group, value['nodes'], value['pods']) + for group, value in clusters.items()} + + def __load_cluster(self, group, loader): stop_time = time.time() + self._retry.deadline self._api.refresh_api_servers_cache() try: with self._condition: self._wait_caches(stop_time) - - members = [self.member(pod) for pod in self._pods.copy().values()] - nodes = self._kinds.copy() - - config = nodes.get(self.config_path) - metadata = config and config.metadata - annotations = metadata and metadata.annotations or {} - - # get initialize flag - initialize 
= annotations.get(self._INITIALIZE) - - # get global dynamic configuration - config = ClusterConfig.from_node(metadata and metadata.resource_version, - annotations.get(self._CONFIG) or '{}', - metadata.resource_version if self._CONFIG in annotations else 0) - - # get timeline history - history = TimelineHistory.from_node(metadata and metadata.resource_version, - annotations.get(self._HISTORY) or '[]') - - leader = nodes.get(self.leader_path) - metadata = leader and leader.metadata - self._leader_resource_version = metadata.resource_version if metadata else None - annotations = metadata and metadata.annotations or {} - - # get last known leader lsn - last_lsn = annotations.get(self._OPTIME) - try: - last_lsn = 0 if last_lsn is None else int(last_lsn) - except Exception: - last_lsn = 0 - - # get permanent slots state (confirmed_flush_lsn) - slots = annotations.get('slots') - try: - slots = slots and json.loads(slots) - except Exception: - slots = None - - # get failsafe topology - failsafe = annotations.get(self._FAILSAFE) - try: - failsafe = json.loads(failsafe) if failsafe else None - except Exception: - failsafe = None - - # get leader - leader_record = {n: annotations.get(n) for n in (self._LEADER, 'acquireTime', - 'ttl', 'renewTime', 'transitions') if n in annotations} - if (leader_record or self._leader_observed_record) and leader_record != self._leader_observed_record: - self._leader_observed_record = leader_record - self._leader_observed_time = time.time() - - leader = leader_record.get(self._LEADER) - try: - ttl = int(leader_record.get('ttl')) or self._ttl - except (TypeError, ValueError): - ttl = self._ttl - - if not metadata or not self._leader_observed_time or self._leader_observed_time + ttl < time.time() \ - and (self._name != leader or not isinstance(failsafe, dict) or leader not in failsafe): - leader = None - - if metadata: - member = Member(-1, leader, None, {}) - member = ([m for m in members if m.name == leader] or [member])[0] - leader = Leader(metadata.resource_version, None, member) - - # failover key - failover = nodes.get(self.failover_path) - metadata = failover and failover.metadata - failover = Failover.from_node(metadata and metadata.resource_version, - metadata and (metadata.annotations or {}).copy()) - - # get synchronization state - sync = nodes.get(self.sync_path) - metadata = sync and sync.metadata - sync = SyncState.from_node(metadata and metadata.resource_version, metadata and metadata.annotations) - - return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + pods = [pod for pod in self._pods.copy().values() + if not group or pod.metadata.labels.get(self._CITUS_LABEL) == group] + nodes = {name: kind for name, kind in self._kinds.copy().items() + if not group or kind.metadata.labels.get(self._CITUS_LABEL) == group} + return loader({'group': group, 'pods': pods, 'nodes': nodes}) except Exception: logger.exception('get_cluster') raise KubernetesError('Kubernetes API is not responding properly') + def _load_cluster(self, path, loader): + group = self._citus_group if path == self.client_path('') else None + return self.__load_cluster(group, loader) + + def get_citus_coordinator(self): + try: + return self.__load_cluster(str(CITUS_COORDINATOR_GROUP_ID), self._cluster_loader) + except Exception as e: + logger.error('Failed to load Citus coordinator cluster from Kubernetes: %r', e) + @staticmethod def compare_ports(p1, p2): return p1.name == p2.name and p1.port == p2.port and (p1.protocol or 'TCP') == (p2.protocol or 
'TCP') diff --git a/patroni/dcs/raft.py b/patroni/dcs/raft.py index 8c563906..cc60fb60 100644 --- a/patroni/dcs/raft.py +++ b/patroni/dcs/raft.py @@ -4,13 +4,14 @@ import os import threading import time +from collections import defaultdict from pysyncobj import SyncObj, SyncObjConf, replicated, FAIL_REASON from pysyncobj.dns_resolver import globalDnsResolver from pysyncobj.node import TCPNode from pysyncobj.transport import TCPTransport, CONNECTION_STATE from pysyncobj.utility import TcpUtility -from . import AbstractDCS, ClusterConfig, Cluster, Failover, Leader, Member, SyncState, TimelineHistory +from . import AbstractDCS, ClusterConfig, Cluster, Failover, Leader, Member, SyncState, TimelineHistory, citus_group_re from ..exceptions import DCSError from ..utils import validate_directory @@ -319,13 +320,7 @@ class Raft(AbstractDCS): def member(key, value): return Member.from_node(value['index'], os.path.basename(key), None, value['value']) - def _load_cluster(self): - prefix = self.client_path('') - response = self._sync_obj.get(prefix, recursive=True) - if not response: - return Cluster(None, None, None, None, [], None, None, None, None, None) - nodes = {os.path.relpath(key, prefix).replace('\\', '/'): value for key, value in response.items()} - + def _cluster_from_nodes(self, nodes): # get initialize flag initialize = nodes.get(self._INITIALIZE) initialize = initialize and initialize['value'] @@ -385,6 +380,25 @@ class Raft(AbstractDCS): return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) + def _cluster_loader(self, path): + response = self._sync_obj.get(path, recursive=True) + if not response: + return Cluster(None, None, None, None, [], None, None, None, None, None) + nodes = {key[len(path):]: value for key, value in response.items()} + return self._cluster_from_nodes(nodes) + + def _citus_cluster_loader(self, path): + clusters = defaultdict(dict) + response = self._sync_obj.get(path, recursive=True) + for key, value in response.items(): + key = key[len(path):].split('/', 1) + if len(key) == 2 and citus_group_re.match(key[0]): + clusters[int(key[0])][key[1]] = value + return {group: self._cluster_from_nodes(nodes) for group, nodes in clusters.items()} + + def _load_cluster(self, path, loader): + return loader(path) + def _write_leader_optime(self, last_lsn): return self._sync_obj.set(self.leader_optime_path, last_lsn, timeout=1) diff --git a/patroni/dcs/zookeeper.py b/patroni/dcs/zookeeper.py index 2125120f..ab4ca5f5 100644 --- a/patroni/dcs/zookeeper.py +++ b/patroni/dcs/zookeeper.py @@ -11,7 +11,7 @@ from kazoo.protocol.states import KeeperState from kazoo.retry import RetryFailedError from kazoo.security import make_acl -from . import AbstractDCS, ClusterConfig, Cluster, Failover, Leader, Member, SyncState, TimelineHistory +from . 
import AbstractDCS, ClusterConfig, Cluster, Failover, Leader, Member, SyncState, TimelineHistory, citus_group_re from ..exceptions import DCSError from ..utils import deep_compare @@ -149,7 +149,11 @@ class ZooKeeper(AbstractDCS): def cluster_watcher(self, event): self._fetch_cluster = True - self.status_watcher(event) + if not event or event.state != KazooState.CONNECTED or event.path.startswith(self.client_path('')): + self.status_watcher(event) + + def members_watcher(self, event): + self._fetch_cluster = True def reload_config(self, config): self.set_retry_timeout(config['retry_timeout']) @@ -194,10 +198,10 @@ class ZooKeeper(AbstractDCS): except NoNodeError: return None - def get_status(self, leader): + def get_status(self, path, leader): watch = self.status_watcher if not leader or leader.name != self._name else None - status = self.get_node(self.status_path, watch) + status = self.get_node(path + self._STATUS, watch) if status: try: status = json.loads(status[0]) @@ -206,7 +210,7 @@ class ZooKeeper(AbstractDCS): except Exception: slots = last_lsn = None else: - last_lsn = self.get_node(self.leader_optime_path, watch) + last_lsn = self.get_node(path + self._LEADER_OPTIME, watch) last_lsn = last_lsn and last_lsn[0] slots = None @@ -228,41 +232,41 @@ class ZooKeeper(AbstractDCS): except NoNodeError: return [] - def load_members(self): + def load_members(self, path): members = [] - for member in self.get_children(self.members_path, self.cluster_watcher): - data = self.get_node(self.members_path + member) + for member in self.get_children(path + self._MEMBERS, self.cluster_watcher): + data = self.get_node(path + self._MEMBERS + member) if data is not None: members.append(self.member(member, *data)) return members - def _inner_load_cluster(self): + def _cluster_loader(self, path): self._fetch_cluster = False self.event.clear() - nodes = set(self.get_children(self.client_path(''), self.cluster_watcher)) + nodes = set(self.get_children(path, self.cluster_watcher)) if not nodes: self._fetch_cluster = True # get initialize flag - initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None + initialize = (self.get_node(path + self._INITIALIZE) or [None])[0] if self._INITIALIZE in nodes else None # get global dynamic configuration - config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None + config = self.get_node(path + self._CONFIG, watch=self.cluster_watcher) if self._CONFIG in nodes else None config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid) # get timeline history - history = self.get_node(self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None + history = self.get_node(path + self._HISTORY, watch=self.cluster_watcher) if self._HISTORY in nodes else None history = history and TimelineHistory.from_node(history[1].mzxid, history[0]) # get synchronization state - sync = self.get_node(self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None + sync = self.get_node(path + self._SYNC, watch=self.cluster_watcher) if self._SYNC in nodes else None sync = SyncState.from_node(sync and sync[1].version, sync and sync[0]) # get list of members - members = self.load_members() if self._MEMBERS[:-1] in nodes else [] + members = self.load_members(path) if self._MEMBERS[:-1] in nodes else [] # get leader - leader = self.get_node(self.leader_path) if self._LEADER in nodes else None + leader = self.get_node(path + self._LEADER) if 
self._LEADER in nodes else None if leader: member = Member(-1, leader[0], None, {}) member = ([m for m in members if m.name == leader[0]] or [member])[0] @@ -270,14 +274,14 @@ class ZooKeeper(AbstractDCS): self._fetch_cluster = member.index == -1 # get last known leader lsn and slots - last_lsn, slots = self.get_status(leader) + last_lsn, slots = self.get_status(path, leader) # failover key - failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None + failover = self.get_node(path + self._FAILOVER, watch=self.cluster_watcher) if self._FAILOVER in nodes else None failover = failover and Failover.from_node(failover[1].version, failover[0]) # get failsafe topology - failsafe = self.get_node(self.failsafe_path, watch=self.cluster_watcher) if self._FAILSAFE in nodes else None + failsafe = self.get_node(path + self._FAILSAFE, watch=self.cluster_watcher) if self._FAILSAFE in nodes else None try: failsafe = json.loads(failsafe[0]) if failsafe else None except Exception: @@ -285,11 +289,21 @@ class ZooKeeper(AbstractDCS): return Cluster(initialize, config, leader, last_lsn, members, failover, sync, history, slots, failsafe) - def _load_cluster(self): - cluster = self.cluster + def _citus_cluster_loader(self, path): + fetch_cluster = False + ret = {} + for node in self.get_children(path, self.cluster_watcher): + if citus_group_re.match(node): + ret[int(node)] = self._cluster_loader(path + node + '/') + fetch_cluster = fetch_cluster or self._fetch_cluster + self._fetch_cluster = fetch_cluster + return ret + + def _load_cluster(self, path, loader): + cluster = self.cluster if path == self._base_path + '/' else None if self._fetch_cluster or cluster is None: try: - cluster = self._client.retry(self._inner_load_cluster) + cluster = self._client.retry(loader, path) except Exception: logger.exception('get_cluster') self.cluster_watcher(None) @@ -302,10 +316,12 @@ class ZooKeeper(AbstractDCS): self.event.clear() else: try: - last_lsn, slots = self.get_status(cluster.leader) + last_lsn, slots = self.get_status(self.client_path(''), cluster.leader) self.event.clear() - cluster = Cluster(cluster.initialize, cluster.config, cluster.leader, last_lsn, cluster.members, - cluster.failover, cluster.sync, cluster.history, slots, cluster.failsafe) + cluster = list(cluster) + cluster[3] = last_lsn + cluster[8] = slots + cluster = Cluster(*cluster) except Exception: pass return cluster diff --git a/patroni/ha.py b/patroni/ha.py index 6b95c6d0..422a3a71 100644 --- a/patroni/ha.py +++ b/patroni/ha.py @@ -156,6 +156,8 @@ class Ha(object): # Count of concurrent sync disabling requests. Value above zero means that we don't want to be synchronous # standby. Changes protected by _member_state_lock. self._disable_sync = 0 + # Remember the last known member role and state written to the DCS in order to notify Citus coordinator + self._last_state = None # We need following property to avoid shutdown of postgres when join of Patroni to the postgres # already running as replica was aborted due to cluster not being initialized in DCS. 
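The `_last_state` field introduced above pairs with the `touch_member` change in the next hunk: the coordinator is notified only on the edge where a member first reports itself as a running master. Isolated from the rest of the method, the trigger condition behaves like this (a simplified sketch, not the actual helper):

def should_notify_after_promote(last_state, data):
    # Fire only when (state, role) just became ('running', 'master'),
    # i.e. on the transition, not on every touch_member() call
    new_state = (data['state'], data['role'])
    return last_state != new_state and new_state == ('running', 'master')

assert should_notify_after_promote(('running', 'replica'), {'state': 'running', 'role': 'master'})
assert not should_notify_after_promote(('running', 'master'), {'state': 'running', 'role': 'master'})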
@@ -269,6 +271,22 @@ class Ha(object): tags['nosync'] = True return tags + def notify_citus_coordinator(self, event): + if self.state_handler.citus_handler.is_worker(): + coordinator = self.dcs.get_citus_coordinator() + if coordinator and coordinator.leader and coordinator.leader.conn_kwargs: + try: + data = {'type': event, + 'group': self.state_handler.citus_handler.group(), + 'leader': self.state_handler.name, + 'timeout': self.dcs.ttl, + 'cooldown': self.patroni.config['retry_timeout']} + timeout = self.dcs.ttl if event == 'before_demote' else 2 + self.patroni.request(coordinator.leader.member, 'post', 'citus', data, timeout=timeout, retries=0) + except Exception as e: + logger.warning('Request to Citus coordinator leader %s %s failed: %r', + coordinator.leader.name, coordinator.leader.member.api_url, e) + def touch_member(self): with self._member_state_lock: data = { @@ -321,7 +339,13 @@ class Ha(object): if self.is_paused(): data['pause'] = True - return self.dcs.touch_member(data) + ret = self.dcs.touch_member(data) + if ret: + if self._last_state != (data['state'], data['role'])\ + and (data['state'], data['role']) == ('running', 'master'): + self.notify_citus_coordinator('after_promote') + self._last_state = (data['state'], data['role']) + return ret def clone(self, clone_member=None, msg='(without leader)'): if self.is_standby_cluster() and not isinstance(clone_member, RemoteMember): @@ -709,8 +733,9 @@ class Ha(object): self.state_handler.set_role('master') self.process_sync_replication() self.update_cluster_history() + self.state_handler.citus_handler.sync_pg_dist_node(self.cluster) return message - elif self.state_handler.role == 'master': + elif self.state_handler.role in ('master', 'promoted'): self.process_sync_replication() return message else: @@ -722,16 +747,20 @@ class Ha(object): # promotion until next cycle. 
TODO: trigger immediate retry of run_cycle return 'Postponing promotion because synchronous replication state was updated by somebody else' self.state_handler.sync_handler.set_synchronous_standby_names( - ['*'] if self.is_synchronous_mode_strict() else []) - if self.state_handler.role != 'master': + ['*'] if self.is_synchronous_mode_strict() else []) + if self.state_handler.role not in ('master', 'promoted'): def on_success(): self._rewind.reset_state() logger.info("cleared rewind state after becoming the leader") + def before_promote(): + self.notify_citus_coordinator('before_promote') + with self._async_response: self._async_response.reset() self._async_executor.try_run_async('promote', self.state_handler.promote, - args=(self.dcs.loop_wait, self._async_response, on_success)) + args=(self.dcs.loop_wait, self._async_response, + before_promote, on_success)) return promote_message def fetch_node_status(self, member): @@ -1022,9 +1051,16 @@ class Ha(object): self.release_leader_key_voluntarily(checkpoint_location) status['released'] = True + def before_shutdown(): + if self.state_handler.citus_handler.is_coordinator(): + self.state_handler.citus_handler.on_demote() + else: + self.notify_citus_coordinator('before_demote') + self.state_handler.stop(mode_control['stop'], checkpoint=mode_control['checkpoint'], on_safepoint=self.watchdog.disable if self.watchdog.is_running else None, on_shutdown=on_shutdown if mode_control['release'] else None, + before_shutdown=before_shutdown if mode == 'graceful' else None, stop_timeout=self.master_stop_timeout()) self.state_handler.set_role('demoted') self.set_is_leader(False) @@ -1323,8 +1359,16 @@ class Ha(object): timeout = restart_data.get('timeout', self.patroni.config['master_start_timeout']) self.set_start_timeout(timeout) + def before_shutdown(): + self.notify_citus_coordinator('before_demote') + + def after_start(): + self.notify_citus_coordinator('after_promote') + # For non async cases we want to wait for restart to complete or timeout before returning. 
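The `functools.partial` wiring below threads both Citus callbacks through `Postgresql.restart`, so the notifications bracket the restart. The resulting order of operations is roughly the following (a simplified sketch of the restart path in this patch, omitting timeouts, tasks, and role handling):

def restart_sketch(pg, before_shutdown=None, after_start=None):
    # stop() runs the optional CHECKPOINT first and calls before_shutdown
    # just before signalling the postmaster (see _do_stop further down)
    ret = pg.stop(block_callbacks=True, before_shutdown=before_shutdown)
    # start() calls after_start once PostgreSQL reports a successful startup
    return ret and pg.start(block_callbacks=True, after_start=after_start)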
- do_restart = functools.partial(self.state_handler.restart, timeout, self._async_executor.critical_task) + do_restart = functools.partial(self.state_handler.restart, timeout, self._async_executor.critical_task, + before_shutdown=before_shutdown if self.has_lock() else None, + after_start=after_start if self.has_lock() else None) if self.is_synchronous_mode() and not self.has_lock(): do_restart = functools.partial(self.while_not_sync_standby, do_restart) @@ -1452,6 +1496,7 @@ class Ha(object): if not self.watchdog.activate(): logger.error('Cancelling bootstrap because watchdog activation failed') self.cancel_initialization() + self._rewind.ensure_checkpoint_after_promote(self.wakeup) self.dcs.initialize(create_new=(self.cluster.initialize is None), sysid=self.state_handler.sysid) self.dcs.set_config_value(json.dumps(self.patroni.config.dynamic_configuration, separators=(',', ':'))) @@ -1756,9 +1801,14 @@ class Ha(object): else: self.dcs.write_leader_optime(checkpoint_location) + def _before_shutdown(): + self.notify_citus_coordinator('before_demote') + on_shutdown = _on_shutdown if self.is_leader() else None + before_shutdown = _before_shutdown if self.is_leader() else None self.while_not_sync_standby(lambda: self.state_handler.stop(checkpoint=False, on_safepoint=disable_wd, on_shutdown=on_shutdown, + before_shutdown=before_shutdown, stop_timeout=self.master_stop_timeout())) if not self.state_handler.is_running(): if self.is_leader() and not status['deleted']: diff --git a/patroni/postgresql/__init__.py b/patroni/postgresql/__init__.py index afd40e11..54ab4005 100644 --- a/patroni/postgresql/__init__.py +++ b/patroni/postgresql/__init__.py @@ -19,6 +19,7 @@ from .callback_executor import CallbackExecutor from .cancellable import CancellableSubprocess from .config import ConfigHandler, mtime from .connection import Connection, get_connection_cursor +from .citus import CitusHandler from .misc import parse_history, parse_lsn, postgres_major_version_to_int from .postmaster import PostmasterProcess from .slots import SlotsHandler @@ -76,6 +77,7 @@ class Postgresql(object): self._pending_restart = False self._connection = Connection() + self.citus_handler = CitusHandler(self, config.get('citus')) self.config = ConfigHandler(self, config) self.config.check_directories() @@ -283,7 +285,8 @@ class Postgresql(object): return self._connection.get() def set_connection_kwargs(self, kwargs): - self._connection.set_conn_kwargs(kwargs) + self._connection.set_conn_kwargs(kwargs.copy()) + self.citus_handler.set_conn_kwargs(kwargs.copy()) def _query(self, sql, *params): """We are always using the same cursor, therefore this method is not thread-safe!!! 
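The `.copy()` calls in `set_connection_kwargs` above matter because `CitusHandler.set_conn_kwargs` (added at the end of this patch) mutates the mapping it receives, overriding `dbname` and `options`. A minimal sketch of the aliasing bug that sharing a single dict would cause (values illustrative):

kwargs = {'host': 'localhost', 'port': 5432, 'dbname': 'postgres'}

citus_kwargs = kwargs.copy()
# this mirrors what CitusHandler.set_conn_kwargs does to its copy
citus_kwargs.update({'dbname': 'citus',
                     'options': '-c statement_timeout=0 -c idle_in_transaction_session_timeout=0'})

# thanks to the copy, the regular Patroni connection still points at its own database
assert kwargs['dbname'] == 'postgres'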
@@ -552,7 +555,7 @@ class Postgresql(object): logger.warning("Timed out waiting for PostgreSQL to start") return False - def start(self, timeout=None, task=None, block_callbacks=False, role=None): + def start(self, timeout=None, task=None, block_callbacks=False, role=None, after_start=None): """Start PostgreSQL Waits for postmaster to open ports or terminate so pg_isready can be used to check startup completion @@ -623,6 +626,8 @@ class Postgresql(object): ret = self.wait_for_startup(start_timeout) if ret is not None: + if ret and after_start: + after_start() return ret elif timeout is not None: return False @@ -649,7 +654,7 @@ class Postgresql(object): return 'not accessible or not healty' def stop(self, mode='fast', block_callbacks=False, checkpoint=None, - on_safepoint=None, on_shutdown=None, stop_timeout=None): + on_safepoint=None, on_shutdown=None, before_shutdown=None, stop_timeout=None): """Stop PostgreSQL Supports a callback when a safepoint is reached. A safepoint is when no user backend can return a successful @@ -658,11 +663,13 @@ class Postgresql(object): :param on_safepoint: This callback is called when no user backends are running. :param on_shutdown: is called when pg_controldata starts reporting `Database cluster state: shut down` + :param before_shutdown: is called after running optional CHECKPOINT and before running pg_ctl stop """ if checkpoint is None: checkpoint = False if mode == 'immediate' else True - success, pg_signaled = self._do_stop(mode, block_callbacks, checkpoint, on_safepoint, on_shutdown, stop_timeout) + success, pg_signaled = self._do_stop(mode, block_callbacks, checkpoint, on_safepoint, + on_shutdown, before_shutdown, stop_timeout) if success: # block_callbacks is used during restart to avoid # running start/stop callbacks in addition to restart ones @@ -675,7 +682,7 @@ class Postgresql(object): self.set_state('stop failed') return success - def _do_stop(self, mode, block_callbacks, checkpoint, on_safepoint, on_shutdown, stop_timeout): + def _do_stop(self, mode, block_callbacks, checkpoint, on_safepoint, on_shutdown, before_shutdown, stop_timeout): postmaster = self.is_running() if not postmaster: if on_safepoint: @@ -688,6 +695,9 @@ class Postgresql(object): if not block_callbacks: self.set_state('stopping') + if before_shutdown: + before_shutdown() + # Send signal to postmaster to stop success = postmaster.signal_stop(mode, self.pgcommand('pg_ctl')) if success is not None: @@ -815,7 +825,8 @@ class Postgresql(object): return self.state == 'running' - def restart(self, timeout=None, task=None, block_callbacks=False, role=None): + def restart(self, timeout=None, task=None, block_callbacks=False, + role=None, before_shutdown=None, after_start=None): """Restarts PostgreSQL. 
When timeout parameter is set the call will block either until PostgreSQL has started, failed to start or @@ -826,7 +837,8 @@ self.set_state('restarting') if not block_callbacks: self.__cb_pending = ACTION_ON_RESTART - ret = self.stop(block_callbacks=True) and self.start(timeout, task, True, role) + ret = self.stop(block_callbacks=True, before_shutdown=before_shutdown)\ + and self.start(timeout, task, True, role, after_start) if not ret and not self.is_starting(): self.set_state('restart failed ({0})'.format(self.state)) return ret @@ -951,6 +963,7 @@ for _ in polling_loop(wait_seconds): data = self.controldata() if data.get('Database cluster state') == 'in production': + self.set_role('master') return True def _pre_promote(self): @@ -968,8 +981,8 @@ logger.info('pre_promote script `%s` exited with %s', cmd, ret) return ret == 0 - def promote(self, wait_seconds, task, on_success=None): - if self.role == 'master': + def promote(self, wait_seconds, task, before_promote=None, on_success=None): + if self.role in ('promoted', 'master'): return True ret = self._pre_promote() @@ -985,11 +998,15 @@ logger.info("PostgreSQL promote cancelled.") return False + if before_promote is not None: + before_promote() + self.slots_handler.on_promote() + self.citus_handler.schedule_cache_rebuild() ret = self.pg_ctl('promote', '-W') if ret: - self.set_role('master') + self.set_role('promoted') if on_success is not None: on_success() self.call_nowait(ACTION_ON_ROLE_CHANGE) @@ -1132,4 +1149,5 @@ if not self._major_version: self.configure_server_parameters() self.slots_handler.schedule() + self.citus_handler.schedule_cache_rebuild() self._sysid = None diff --git a/patroni/postgresql/bootstrap.py b/patroni/postgresql/bootstrap.py index 4ade3d1a..12208c14 100644 --- a/patroni/postgresql/bootstrap.py +++ b/patroni/postgresql/bootstrap.py @@ -345,7 +345,7 @@ END;$$""".format(quote_literal(name), quote_ident(name, self._postgresql.connect BEGIN SET local synchronous_commit = 'local'; GRANT EXECUTE ON function pg_catalog.{0} TO {1}; -END;$$""".format(f, quote_ident(rewind['username'], self._postgresql.connection())) +END;$$""".format(f, quote_ident(rewind['username'], postgresql.connection())) postgresql.query(sql) for name, value in (config.get('users') or {}).items(): @@ -377,6 +377,9 @@ postgresql.reload() time.sleep(1) # give a time to postgres to "reload" configuration files postgresql.connection().close() # close connection to reconnect with a new password + else: # initdb + # We may want to create the database and extension for Citus + self._postgresql.citus_handler.bootstrap() except Exception: logger.exception('post_bootstrap') task.complete(False) diff --git a/patroni/postgresql/citus.py b/patroni/postgresql/citus.py new file mode 100644 index 00000000..5f27ec9e --- /dev/null +++ b/patroni/postgresql/citus.py @@ -0,0 +1,389 @@ +import logging +import re +import time + +from six.moves.urllib_parse import urlparse +from threading import Condition, Event, Thread + +from .connection import Connection +from ..dcs import CITUS_COORDINATOR_GROUP_ID +from ..psycopg import connect, quote_ident + +CITUS_SLOT_NAME_RE = re.compile(r'^citus_shard_(move|split)_slot(_[1-9][0-9]*){2,3}$') +logger = logging.getLogger(__name__) + + +class PgDistNode(object): + """Represents a single row in the `pg_dist_node` table""" + + def 
__init__(self, group, host, port, event, nodeid=None, timeout=None, cooldown=None): + self.group = group + # A weird way of pausing client connections by adding the `-demoted` suffix to the hostname + self.host = host + ('-demoted' if event == 'before_demote' else '') + self.port = port + # The event that is changing or has changed the given row. + # Possible values: before_demote, before_promote, after_promote. + self.event = event + self.nodeid = nodeid + + # If a transaction was started, we need to COMMIT/ROLLBACK before the deadline + self.timeout = timeout + self.cooldown = cooldown or 10000 # 10s by default + self.deadline = 0 + + # All changes in pg_dist_node are serialized on the Patroni + # side by performing them from a single thread. The thread that + # requested a change sometimes needs to wait for the result. + # For example, we want to pause client connections before demoting + # the worker, and to notify the calling thread once it is done. + self._event = Event() + + def wait(self): + self._event.wait() + + def wakeup(self): + self._event.set() + + def __eq__(self, other): + return isinstance(other, PgDistNode) and self.event == other.event\ + and self.host == other.host and self.port == other.port + + def __ne__(self, other): + return not self == other + + def __str__(self): + return ('PgDistNode(nodeid={0},group={1},host={2},port={3},event={4})' + .format(self.nodeid, self.group, self.host, self.port, self.event)) + + def __repr__(self): + return str(self) + 
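For clarity, the task deduplication in CitusHandler below hinges on PgDistNode equality covering only (event, host, port), while group, nodeid and timeout are ignored. A small illustrative sketch of those semantics (all values made up):

```python
from patroni.postgresql.citus import PgDistNode

a = PgDistNode(group=1, host='host1', port=5432, event='after_promote')
b = PgDistNode(group=1, host='host1', port=5432, event='after_promote', nodeid=42, timeout=30)
c = PgDistNode(group=1, host='host1', port=5432, event='before_demote')

assert a == b  # nodeid and timeout do not participate in __eq__
assert a != c  # different event; 'before_demote' also rewrites the host to 'host1-demoted'
```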
+class CitusHandler(Thread): + + def __init__(self, postgresql, config): + super(CitusHandler, self).__init__() + self.daemon = True + self._postgresql = postgresql + self._config = config + self._connection = Connection() + self._pg_dist_node = {} # Cache of pg_dist_node: {groupid: PgDistNode()} + self._tasks = [] # Requests to change pg_dist_node; every task is a `PgDistNode` + self._condition = Condition() # protects _pg_dist_node, _tasks, and _schedule_load_pg_dist_node + self._in_flight = None # Reference to the `PgDistNode` if there is a transaction in progress changing it + self.schedule_cache_rebuild() + + def is_enabled(self): + return isinstance(self._config, dict) + + def group(self): + return self._config['group'] + + def is_coordinator(self): + return self.is_enabled() and self.group() == CITUS_COORDINATOR_GROUP_ID + + def is_worker(self): + return self.is_enabled() and not self.is_coordinator() + + def set_conn_kwargs(self, kwargs): + if self.is_enabled(): + kwargs.update({'dbname': self._config['database'], + 'options': '-c statement_timeout=0 -c idle_in_transaction_session_timeout=0'}) + self._connection.set_conn_kwargs(kwargs) + + def schedule_cache_rebuild(self): + with self._condition: + self._schedule_load_pg_dist_node = True + + def on_demote(self): + with self._condition: + self._pg_dist_node.clear() + self._tasks[:] = [] + self._in_flight = None + + def query(self, sql, *params): + try: + logger.debug('query(%s, %s)', sql, params) + cursor = self._connection.cursor() + cursor.execute(sql, params or None) + return cursor + except Exception as e: + logger.error('Exception when executing query "%s", (%s): %r', sql, params, e) + self._connection.close() + self._in_flight = None + self.schedule_cache_rebuild() + raise e + + def load_pg_dist_node(self): + """Read from the `pg_dist_node` table and put it into the local cache""" + + with self._condition: + if not self._schedule_load_pg_dist_node: + return True + self._schedule_load_pg_dist_node = False + + try: + cursor = self.query("SELECT nodeid, groupid, nodename, nodeport, noderole" + " FROM pg_catalog.pg_dist_node WHERE noderole = 'primary'") + except Exception: + return False + + with self._condition: + self._pg_dist_node = {r[1]: PgDistNode(r[1], r[2], r[3], 'after_promote', r[0]) for r in cursor} + return True + + def sync_pg_dist_node(self, cluster): + """Maintain the `pg_dist_node` table from the coordinator leader on every heartbeat loop. + + We can't always rely on REST API calls from worker nodes in order + to maintain `pg_dist_node`, therefore at least once per heartbeat + loop we make sure that the workers registered in the `self._pg_dist_node` + cache match the cluster view from DCS, by creating tasks + the same way the REST API does.""" + + if not self.is_coordinator(): + return + + with self._condition: + if not self.is_alive(): + self.start() + + self.add_task('after_promote', CITUS_COORDINATOR_GROUP_ID, self._postgresql.connection_string) + + for group, worker in cluster.workers.items(): + leader = worker.leader + if leader and leader.conn_url\ + and leader.data.get('role') == 'master' and leader.data.get('state') == 'running': + self.add_task('after_promote', group, leader.conn_url) + + def find_task_by_group(self, group): + for i, task in enumerate(self._tasks): + if task.group == group: + return i + + def pick_task(self): + """Returns the tuple (i, task), where `i` is the task index in the self._tasks list. + + Tasks are picked by the following priorities: + 1. If there is already a transaction in progress, pick a task + that will change the already affected worker primary. + 2. If the coordinator address should be changed, pick a task + with group=0 (coordinators are always in group 0). + 3. Otherwise pick the oldest task (the first in self._tasks).""" + + with self._condition: + if self._in_flight: + i = self.find_task_by_group(self._in_flight.group) + else: + while True: + i = self.find_task_by_group(CITUS_COORDINATOR_GROUP_ID) # set_coordinator + if i is None and self._tasks: + i = 0 + if i is None: + break + task = self._tasks[i] + if task == self._pg_dist_node.get(task.group): + self._tasks.pop(i) # nothing to do because the cached version of pg_dist_node already matches + else: + break + task = self._tasks[i] if i is not None else None + + # When tasks are added it could happen that self._pg_dist_node + # wasn't ready (self._schedule_load_pg_dist_node is False) + # and hence the nodeid wasn't filled. + if task and task.group in self._pg_dist_node: + task.nodeid = self._pg_dist_node[task.group].nodeid + return i, task + + def update_node(self, task): + if task.group == CITUS_COORDINATOR_GROUP_ID: + return self.query("SELECT pg_catalog.citus_set_coordinator_host(%s, %s, 'primary', 'default')", + task.host, task.port) + + if task.nodeid is None and task.event != 'before_demote': + task.nodeid = self.query("SELECT pg_catalog.citus_add_node(%s, %s, %s, 'primary', 'default')", + task.host, task.port, task.group).fetchone()[0] + elif task.nodeid is not None: + # XXX: statement_timeout? + self.query('SELECT pg_catalog.citus_update_node(%s, %s, %s, true, %s)', + task.nodeid, task.host, task.port, task.cooldown) + + def process_task(self, task): + """Updates a single row in the `pg_dist_node` table, optionally in a transaction. + + The transaction is started when demoting a worker node, or before + promoting another worker if there is no transaction in progress yet. + The transaction is committed when the switchover/failover has completed. 
+ + This method returns `True` if the node was updated (and the + transaction, if one was open, committed) as an indicator that + the `self._pg_dist_node` cache should be updated. + + The maximum lifetime of the transaction in progress + is controlled outside of this method.""" + + if task.event == 'after_promote': + # The after_promote may happen without a previous before_demote and/or + # before_promote. In this case we just call the self.update_node() method. + # If there is a transaction in progress, it may already have made the + # required changes and we can simply COMMIT. + if not self._in_flight or self._in_flight.host != task.host or self._in_flight.port != task.port: + self.update_node(task) + if self._in_flight: + self.query('COMMIT') + self._in_flight = None + return True + else: # before_demote, before_promote + if task.timeout: + task.deadline = time.time() + task.timeout + if not self._in_flight: + self.query('BEGIN') + self.update_node(task) + self._in_flight = task + return False + + def process_tasks(self): + while True: + if not self._in_flight and not self.load_pg_dist_node(): + break + + i, task = self.pick_task() + if not task: + break + try: + update_cache = self.process_task(task) + except Exception as e: + logger.error('Exception when working with pg_dist_node: %r', e) + update_cache = False + with self._condition: + if self._tasks: + if update_cache: + self._pg_dist_node[task.group] = task + if id(self._tasks[i]) == id(task): + self._tasks.pop(i) + task.wakeup() + + def run(self): + while True: + try: + with self._condition: + if self._schedule_load_pg_dist_node: + timeout = -1 + elif self._in_flight: + timeout = self._in_flight.deadline - time.time() if self._tasks else None + else: + timeout = -1 if self._tasks else None + + if timeout is None or timeout > 0: + self._condition.wait(timeout) + elif self._in_flight: + logger.warning('Rolling back transaction. Last known status: %s', self._in_flight) + self.query('ROLLBACK') + self._in_flight = None + self.process_tasks() + except Exception: + logger.exception('run')
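A small runnable sketch of the deadline arithmetic run() applies above: process_task() stamps task.deadline when it opens a before_* transaction, and run() enforces it only while further tasks are queued behind that transaction (values are illustrative):

```python
import time

deadline = time.time() + 30        # a before_* task was accepted with timeout=30
timeout = deadline - time.time()   # how long run() may wait for the matching after_promote
if timeout <= 0:
    print('deadline exceeded: the in-flight transaction gets rolled back')
else:
    print('waiting up to %.1f seconds before ROLLBACK' % timeout)
```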
+ + def _add_task(self, task): + with self._condition: + i = self.find_task_by_group(task.group) + + # task.timeout being None indicates that the task was scheduled + # from sync_pg_dist_node(), and we don't want it to override an + # already existing task created from the REST API. + if task.timeout is None and (i is not None or self._in_flight and self._in_flight.group == task.group): + return False + + # Override an already existing task for the same worker group + if i is not None: + if task != self._tasks[i]: + logger.debug('Overriding existing task: %s != %s', self._tasks[i], task) + self._tasks[i] = task + self._condition.notify() + return True + # Add the task to the list if the worker node state differs from the cached `pg_dist_node` + elif self._schedule_load_pg_dist_node or task != self._pg_dist_node.get(task.group)\ + or self._in_flight and task.group == self._in_flight.group: + logger.debug('Adding the new task: %s', task) + self._tasks.append(task) + self._condition.notify() + return True + return False + + def add_task(self, event, group, conn_url, timeout=None, cooldown=None): + try: + r = urlparse(conn_url) + except Exception as e: + return logger.error('Failed to parse connection url %s: %r', conn_url, e) + host = r.hostname + port = r.port or 5432 + task = PgDistNode(group, host, port, event, timeout=timeout, cooldown=cooldown) + return task if self._add_task(task) else None + + def handle_event(self, cluster, event): + if not self.is_alive(): + return + + cluster = cluster.workers.get(event['group']) + if not (cluster and cluster.leader and cluster.leader.name == event['leader'] and cluster.leader.conn_url): + return + + task = self.add_task(event['type'], event['group'], + cluster.leader.conn_url, + event['timeout'], event['cooldown']*1000) + if task and event['type'] == 'before_demote': + task.wait() + + def bootstrap(self): + if not self.is_enabled(): + return + + conn_kwargs = self._postgresql.config.local_connect_kwargs + conn_kwargs['options'] = '-c synchronous_commit=local -c statement_timeout=0' + if self._config['database'] != self._postgresql.database: + conn = connect(**conn_kwargs) + try: + with conn.cursor() as cur: + cur.execute('CREATE DATABASE {0}'.format(quote_ident(self._config['database'], conn))) + finally: + conn.close() + + conn_kwargs['dbname'] = self._config['database'] + conn = connect(**conn_kwargs) + try: + with conn.cursor() as cur: + cur.execute('CREATE EXTENSION citus') + + superuser = self._postgresql.config.superuser + params = {k: superuser[k] for k in ('password', 'sslcert', 'sslkey') if k in superuser} + if params: + cur.execute("INSERT INTO pg_catalog.pg_dist_authinfo VALUES" + "(0, pg_catalog.current_user(), %s)", + (self._postgresql.config.format_dsn(params),)) + finally: + conn.close() + + def adjust_postgres_gucs(self, parameters): + if not self.is_enabled(): + return + + # the citus extension must be the first entry in shared_preload_libraries + shared_preload_libraries = list(filter( + lambda el: el and el != 'citus', + [p.strip() for p in parameters.get('shared_preload_libraries', '').split(',')])) + parameters['shared_preload_libraries'] = ','.join(['citus'] + shared_preload_libraries) + + # If not explicitly set, Citus overrides max_prepared_transactions to max_connections * 2 + if parameters.get('max_prepared_transactions') == 0: + parameters['max_prepared_transactions'] = parameters['max_connections'] * 2 + + # Resharding in Citus is implemented using logical replication + parameters['wal_level'] = 'logical' + + def ignore_replication_slot(self, slot): + if self.is_enabled() and self._postgresql.is_leader() and\ + slot['type'] == 'logical' and slot['database'] == self._config['database']: + m = CITUS_SLOT_NAME_RE.match(slot['name']) + return m and {'move': 'pgoutput', 'split': 'citus'}.get(m.group(1)) == 
slot['plugin'] + return False diff --git a/patroni/postgresql/config.py b/patroni/postgresql/config.py index 3a438c43..e14f1d5a 100644 --- a/patroni/postgresql/config.py +++ b/patroni/postgresql/config.py @@ -862,6 +862,9 @@ class ConfigHandler(object): elif self._postgresql.major_version: wal_keep_size = parse_int(parameters.pop('wal_keep_size', self.CMDLINE_OPTIONS['wal_keep_size'][0]), 'MB') parameters.setdefault('wal_keep_segments', int((wal_keep_size + 8) / 16)) + + self._postgresql.citus_handler.adjust_postgres_gucs(parameters) + ret = CaseInsensitiveDict({k: v for k, v in parameters.items() if not self._postgresql.major_version or self._postgresql.major_version >= self.CMDLINE_OPTIONS.get(k, (0, 1, 90100))[2]}) ret.update({k: os.path.join(self._config_dir, ret[k]) for k in ('hba_file', 'ident_file') if k in ret}) diff --git a/patroni/postgresql/slots.py b/patroni/postgresql/slots.py index ea04015c..c304b1f3 100644 --- a/patroni/postgresql/slots.py +++ b/patroni/postgresql/slots.py @@ -176,7 +176,7 @@ class SlotsHandler(object): if ((matcher.get("name") is None or matcher["name"] == name) and all(not matcher.get(a) or matcher[a] == slot.get(a) for a in ('database', 'plugin', 'type'))): return True - return False + return self._postgresql.citus_handler.ignore_replication_slot(slot) def drop_replication_slot(self, name): """Returns a tuple(active, dropped)""" diff --git a/patroni/validator.py b/patroni/validator.py index f83bcf0d..0aad3f51 100644 --- a/patroni/validator.py +++ b/patroni/validator.py @@ -367,6 +367,10 @@ schema = Schema({ Optional("ports"): [{"name": str, "port": int}], }, }), + Optional("citus"): { + "database": str, + "group": int + }, "postgresql": { "listen": validate_host_port_listen_multiple_hosts, "connect_address": validate_connect_address, diff --git a/postgres0.yml b/postgres0.yml index ebc67426..5b373c2c 100644 --- a/postgres0.yml +++ b/postgres0.yml @@ -18,6 +18,10 @@ restapi: # keyfile: /etc/ssl/private/ssl-cert-snakeoil.key # cacert: /etc/ssl/certs/ssl-cacert-snakeoil.pem +#citus: +# database: citus +# group: 0 # coordinator + etcd: #Provide host to do the initial discovery of the cluster topology: host: 127.0.0.1:2379 diff --git a/postgres1.yml b/postgres1.yml index 21993a4d..ce7d1ee2 100644 --- a/postgres1.yml +++ b/postgres1.yml @@ -18,6 +18,10 @@ restapi: # keyfile: /etc/ssl/private/ssl-cert-snakeoil.key # cacert: /etc/ssl/certs/ssl-cacert-snakeoil.pem +#citus: +# database: citus +# group: 1 # worker + etcd: #Provide host to do the initial discovery of the cluster topology: host: 127.0.0.1:2379 diff --git a/postgres2.yml b/postgres2.yml index 8612adcb..5ca44cd0 100644 --- a/postgres2.yml +++ b/postgres2.yml @@ -18,6 +18,10 @@ restapi: # keyfile: /etc/ssl/private/ssl-cert-snakeoil.key # cacert: /etc/ssl/certs/ssl-cacert-snakeoil.pem +#citus: +# database: citus +# group: 1 # worker + etcd: #Provide host to do the initial discovery of the cluster topology: host: 127.0.0.1:2379 diff --git a/tests/__init__.py b/tests/__init__.py index 91cc99df..d8b5ece8 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -128,6 +128,10 @@ class MockCursor(object): b'1\t0/40159C0\tno recovery target specified\n\n' b'2\t0/402DD98\tno recovery target specified\n\n' b'3\t0/403DD98\tno recovery target specified\n')] + elif sql.startswith('SELECT pg_catalog.citus_add_node'): + self.results = [(2,)] + elif sql.startswith('SELECT nodeid, groupid'): + self.results = [(1, 0, 'host1', 5432, 'primary'), (2, 1, 'host2', 5432, 'primary')] else: self.results = [(None, None, None, 
None, None, None, None, None, None, None)] @@ -205,7 +209,8 @@ class PostgresInit(unittest.TestCase): 'pg_hba': ['host all all 0.0.0.0/0 md5'], 'pg_ident': ['krb realm postgres'], 'callbacks': {'on_start': 'true', 'on_stop': 'true', 'on_reload': 'true', - 'on_restart': 'true', 'on_role_change': 'true'}}) + 'on_restart': 'true', 'on_role_change': 'true'}, + 'citus': {'group': 0, 'database': 'citus'}}) class BaseTestPostgresql(PostgresInit): diff --git a/tests/test_api.py b/tests/test_api.py index 668d4156..ffe85c7e 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -34,6 +34,7 @@ class MockPostgresql(object): lsn_name = 'lsn' POSTMASTER_START_TIME = 'pg_catalog.pg_postmaster_start_time()' TL_LSN = 'CASE WHEN pg_catalog.pg_is_in_recovery()' + citus_handler = Mock() @staticmethod def connection(): @@ -573,6 +574,13 @@ class TestRestApiHandler(unittest.TestCase): MockRestApiServer(RestApiHandler, post + '14\n\n{"leader":"1"}') MockRestApiServer(RestApiHandler, post + '37\n\n{"candidate":"2","scheduled_at": "1"}') + @patch.object(MockPatroni, 'dcs', Mock()) + @patch.object(MockHa, 'is_leader', Mock(return_value=True)) + def test_do_POST_citus(self): + post = 'POST /citus HTTP/1.0' + self._authorization + '\nContent-Length: ' + MockRestApiServer(RestApiHandler, post + '0\n\n') + MockRestApiServer(RestApiHandler, post + '14\n\n{"leader":"1"}') + class TestRestApiServer(unittest.TestCase): diff --git a/tests/test_citus.py b/tests/test_citus.py new file mode 100644 index 00000000..bccf0189 --- /dev/null +++ b/tests/test_citus.py @@ -0,0 +1,159 @@ +from mock import Mock, patch +from patroni.postgresql.citus import CitusHandler + +from . import BaseTestPostgresql, MockCursor, psycopg_connect, SleepException +from .test_ha import get_cluster_initialized_with_leader + + +@patch('patroni.postgresql.citus.Thread', Mock()) +@patch('patroni.psycopg.connect', psycopg_connect) +class TestCitus(BaseTestPostgresql): + + def setUp(self): + super(TestCitus, self).setUp() + self.c = self.p.citus_handler + self.c.set_conn_kwargs({'host': 'localhost', 'dbname': 'postgres'}) + self.cluster = get_cluster_initialized_with_leader() + self.cluster.workers[1] = self.cluster
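Before the individual tests, a hedged reconstruction of the /citus REST API payload that CitusHandler.handle_event() consumes, pieced together from the fields it reads; the producing side lives in the HA loop and the values here are illustrative:

```python
import json

payload = {
    'type': 'before_demote',   # or 'before_promote' / 'after_promote'
    'group': 1,                # Citus group of the affected worker
    'leader': 'leader',        # must match the worker cluster's current leader name
    'timeout': 30,             # seconds until the coordinator transaction is rolled back
    'cooldown': 10,            # seconds; handle_event() multiplies this by 1000
}
print(json.dumps(payload))
```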
+ + @patch('time.time', Mock(side_effect=[100, 130, 160, 190, 220, 250, 280])) + @patch('patroni.postgresql.citus.logger.exception', Mock(side_effect=SleepException)) + @patch('patroni.postgresql.citus.logger.warning') + @patch('patroni.postgresql.citus.PgDistNode.wait', Mock()) + @patch.object(CitusHandler, 'is_alive', Mock(return_value=True)) + def test_run(self, mock_logger_warning): + # A `before_demote` or `before_promote` REST API call starts a + # transaction. We want to make sure that it finishes within a + # certain timeout; if it does not, we roll it back in order + # not to block other workers that want to update `pg_dist_node`. + self.c._condition.wait = Mock(side_effect=[Mock(), Mock(), Mock(), SleepException]) + + self.c.handle_event(self.cluster, {'type': 'before_demote', 'group': 1, + 'leader': 'leader', 'timeout': 30, 'cooldown': 10}) + self.c.add_task('after_promote', 2, 'postgres://host3:5432/postgres') + self.assertRaises(SleepException, self.c.run) + mock_logger_warning.assert_called_once() + self.assertTrue(mock_logger_warning.call_args[0][0].startswith('Rolling back transaction')) + self.assertTrue(repr(mock_logger_warning.call_args[0][1]).startswith('PgDistNode')) + + @patch.object(CitusHandler, 'is_alive', Mock(return_value=False)) + @patch.object(CitusHandler, 'start', Mock()) + def test_sync_pg_dist_node(self): + with patch.object(CitusHandler, 'is_enabled', Mock(return_value=False)): + self.c.sync_pg_dist_node(self.cluster) + self.c.sync_pg_dist_node(self.cluster) + + def test_handle_event(self): + self.c.handle_event(self.cluster, {}) + with patch.object(CitusHandler, 'is_alive', Mock(return_value=True)): + self.c.handle_event(self.cluster, {'type': 'after_promote', 'group': 2, + 'leader': 'leader', 'timeout': 30, 'cooldown': 10}) + + def test_add_task(self): + with patch('patroni.postgresql.citus.logger.error') as mock_logger,\ + patch('patroni.postgresql.citus.urlparse', Mock(side_effect=Exception)): + self.c.add_task('', 1, None) + mock_logger.assert_called_once() + + with patch('patroni.postgresql.citus.logger.debug') as mock_logger: + self.c.add_task('before_demote', 1, 'postgres://host:5432/postgres', 30) + mock_logger.assert_called_once() + self.assertTrue(mock_logger.call_args[0][0].startswith('Adding the new task:')) + + with patch('patroni.postgresql.citus.logger.debug') as mock_logger: + self.c.add_task('before_promote', 1, 'postgres://host:5432/postgres', 30) + mock_logger.assert_called_once() + self.assertTrue(mock_logger.call_args[0][0].startswith('Overriding existing task:')) + + # add_task called from sync_pg_dist_node should not override an already scheduled or in-flight task + self.assertIsNotNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres', 30)) + self.assertIsNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres')) + self.c._in_flight = self.c._tasks.pop() + self.assertIsNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres')) + + # If there is no transaction in progress and the cached pg_dist_node matches the desired state, + # the task should not be added + self.c._schedule_load_pg_dist_node = False + self.c._pg_dist_node[self.c._in_flight.group] = self.c._in_flight + self.c._in_flight = None + self.assertIsNone(self.c.add_task('after_promote', 1, 'postgres://host:5432/postgres')) + + def test_pick_task(self): + self.c.add_task('after_promote', 1, 'postgres://host2:5432/postgres') + with patch.object(CitusHandler, 'process_task') as mock_process_task: + self.c.process_tasks() + # process_task() shouldn't be called because pick_task double-checks against _pg_dist_node + mock_process_task.assert_not_called() + + def test_process_task(self): + self.c.add_task('after_promote', 0, 'postgres://host2:5432/postgres') + task = self.c.add_task('before_promote', 1, 'postgres://host4:5432/postgres', 30) + self.c.process_tasks() + self.assertTrue(task._event.is_set()) + + # the after_promote should result only in a COMMIT + task = self.c.add_task('after_promote', 1, 'postgres://host4:5432/postgres', 30) + with patch.object(CitusHandler, 'query') as mock_query: + self.c.process_tasks() + mock_query.assert_called_once() + self.assertEqual(mock_query.call_args[0][0], 'COMMIT')
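The COMMIT-only behaviour checked above mirrors the two-phase flow in process_task(). An illustrative reconstruction of the SQL a coordinator would run across a full worker switchover (nodeid=2, 'host4' and the 10000 ms default cooldown are made-up or default values):

```python
statements = [
    "BEGIN",
    # before_demote: repoint the row at the '-demoted' hostname to pause client connections
    "SELECT pg_catalog.citus_update_node(2, 'host4-demoted', 5432, true, 10000)",
    # before_promote: repoint the row at the new worker primary
    "SELECT pg_catalog.citus_update_node(2, 'host4', 5432, true, 10000)",
    # after_promote: the in-flight transaction is simply committed
    "COMMIT",
]
print(';\n'.join(statements) + ';')
```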
+ + def test_process_tasks(self): + self.c.add_task('after_promote', 0, 'postgres://host2:5432/postgres') + self.c.process_tasks() + + self.c.add_task('after_promote', 0, 'postgres://host3:5432/postgres') + with patch('patroni.postgresql.citus.logger.error') as mock_logger,\ + patch.object(CitusHandler, 'query', Mock(side_effect=Exception)): + self.c.process_tasks() + mock_logger.assert_called_once() + self.assertTrue(mock_logger.call_args[0][0].startswith('Exception when working with pg_dist_node: ')) + + def test_on_demote(self): + self.c.on_demote() + + @patch('patroni.postgresql.citus.logger.error') + @patch.object(MockCursor, 'execute', Mock(side_effect=Exception)) + def test_load_pg_dist_node(self, mock_logger): + # load_pg_dist_node() is triggered, the query fails, and the exception is properly handled + self.c.process_tasks() + self.assertTrue(self.c._schedule_load_pg_dist_node) + mock_logger.assert_called_once() + self.assertTrue(mock_logger.call_args[0][0].startswith('Exception when executing query')) + self.assertTrue(mock_logger.call_args[0][1].startswith('SELECT nodeid, groupid, ')) + + def test_wait(self): + task = self.c.add_task('before_demote', 1, 'postgres://host:5432/postgres', 30) + task._event.wait = Mock() + task.wait() + + def test_adjust_postgres_gucs(self): + parameters = {'max_connections': 101, + 'max_prepared_transactions': 0, + 'shared_preload_libraries': 'foo , citus, bar '} + self.c.adjust_postgres_gucs(parameters) + self.assertEqual(parameters['max_prepared_transactions'], 202) + self.assertEqual(parameters['shared_preload_libraries'], 'citus,foo,bar') + self.assertEqual(parameters['wal_level'], 'logical') + + @patch.object(CitusHandler, 'is_enabled', Mock(return_value=False)) + def test_bootstrap(self): + self.c.bootstrap() + + def test_ignore_replication_slot(self): + self.assertFalse(self.c.ignore_replication_slot({'name': 'foo', 'type': 'physical', + 'database': 'bar', 'plugin': 'wal2json'})) + self.assertFalse(self.c.ignore_replication_slot({'name': 'foo', 'type': 'logical', + 'database': 'bar', 'plugin': 'wal2json'})) + self.assertFalse(self.c.ignore_replication_slot({'name': 'foo', 'type': 'logical', + 'database': 'bar', 'plugin': 'pgoutput'})) + self.assertFalse(self.c.ignore_replication_slot({'name': 'foo', 'type': 'logical', + 'database': 'citus', 'plugin': 'pgoutput'})) + self.assertTrue(self.c.ignore_replication_slot({'name': 'citus_shard_move_slot_1_2_3', + 'type': 'logical', 'database': 'citus', 'plugin': 'pgoutput'})) + self.assertFalse(self.c.ignore_replication_slot({'name': 'citus_shard_move_slot_1_2_3', + 'type': 'logical', 'database': 'citus', 'plugin': 'citus'})) + self.assertFalse(self.c.ignore_replication_slot({'name': 'citus_shard_split_slot_1_2_3', + 'type': 'logical', 'database': 'citus', 'plugin': 'pgoutput'})) + self.assertTrue(self.c.ignore_replication_slot({'name': 'citus_shard_split_slot_1_2_3', + 'type': 'logical', 'database': 'citus', 'plugin': 'citus'})) diff --git a/tests/test_config.py b/tests/test_config.py index e74a7be1..9406ea49 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -31,6 +31,9 @@ class TestConfig(unittest.TestCase): 'PATRONI_LOGLEVEL': 'ERROR', 'PATRONI_LOG_LOGGERS': 'patroni.postmaster: WARNING, urllib3: DEBUG', 'PATRONI_LOG_FILE_NUM': '5', + 'PATRONI_CITUS_DATABASE': 'citus', + 'PATRONI_CITUS_GROUP': '0', + 'PATRONI_CITUS_HOST': '0', 'PATRONI_RESTAPI_USERNAME': 'username', 'PATRONI_RESTAPI_PASSWORD': 'password', 'PATRONI_RESTAPI_LISTEN': 
'0.0.0.0:8008', diff --git a/tests/test_consul.py b/tests/test_consul.py index caf99bb0..64d75dda 100644 --- a/tests/test_consul.py +++ b/tests/test_consul.py @@ -18,6 +18,8 @@ def kv_get(self, key, **kwargs): good_cls = ('6429', [{'CreateIndex': 1334, 'Flags': 0, 'Key': key + 'failover', 'LockIndex': 0, 'ModifyIndex': 1334, 'Value': b''}, + {'CreateIndex': 1334, 'Flags': 0, 'Key': key + '1/initialize', 'LockIndex': 0, + 'ModifyIndex': 1334, 'Value': b'postgresql0'}, {'CreateIndex': 1334, 'Flags': 0, 'Key': key + 'initialize', 'LockIndex': 0, 'ModifyIndex': 1334, 'Value': b'postgresql0'}, {'CreateIndex': 2621, 'Flags': 0, 'Key': key + 'leader', 'LockIndex': 1, @@ -94,7 +96,7 @@ class TestConsul(unittest.TestCase): 'verify': 'on', 'cert': 'bar', 'cacert': 'buz', 'register_service': True}) self.c = Consul({'ttl': 30, 'scope': 'test', 'name': 'postgresql1', 'host': 'localhost:1', 'retry_timeout': 10, 'register_service': True, 'service_check_tls_server_name': True}) - self.c._base_path = '/service/good' + self.c._base_path = 'service/good' self.c.get_cluster() @patch('time.sleep', Mock(side_effect=SleepException)) @@ -115,16 +117,22 @@ class TestConsul(unittest.TestCase): @patch.object(consul.Consul.KV, 'delete', Mock()) def test_get_cluster(self): - self.c._base_path = '/service/test' + self.c._base_path = 'service/test' self.assertIsInstance(self.c.get_cluster(), Cluster) self.assertIsInstance(self.c.get_cluster(), Cluster) - self.c._base_path = '/service/fail' + self.c._base_path = 'service/fail' self.assertRaises(ConsulError, self.c.get_cluster) - self.c._base_path = '/service/broken' + self.c._base_path = 'service/broken' self.assertIsInstance(self.c.get_cluster(), Cluster) - self.c._base_path = '/service/legacy' + self.c._base_path = 'service/legacy' self.assertIsInstance(self.c.get_cluster(), Cluster) + def test__get_citus_cluster(self): + self.c._citus_group = '0' + cluster = self.c.get_cluster() + self.assertIsInstance(cluster, Cluster) + self.assertIsInstance(cluster.workers[1], Cluster) + @patch.object(consul.Consul.KV, 'delete', Mock(side_effect=[ConsulException, True, True, True])) @patch.object(consul.Consul.KV, 'put', Mock(side_effect=[True, ConsulException, InvalidSession])) def test_touch_member(self): diff --git a/tests/test_ctl.py b/tests/test_ctl.py index 87b8e6ca..67ae1fe7 100644 --- a/tests/test_ctl.py +++ b/tests/test_ctl.py @@ -21,7 +21,8 @@ from .test_ha import get_cluster_initialized_without_leader, get_cluster_initial @patch('patroni.ctl.load_config', Mock(return_value={ - 'scope': 'alpha', 'restapi': {'listen': '::', 'certfile': 'a'}, 'etcd': {'host': 'localhost:2379'}, + 'scope': 'alpha', 'restapi': {'listen': '::', 'certfile': 'a'}, + 'etcd': {'host': 'localhost:2379'}, 'citus': {'database': 'citus', 'group': 0}, 'postgresql': {'data_dir': '.', 'pgpass': './pgpass', 'parameters': {}, 'retry_timeout': 5}})) class TestCtl(unittest.TestCase): @@ -30,7 +31,8 @@ class TestCtl(unittest.TestCase): with patch.object(AbstractEtcdClientWithFailover, 'machines') as mock_machines: mock_machines.__get__ = Mock(return_value=['http://remotehost:2379']) self.runner = CliRunner() - self.e = get_dcs({'etcd': {'ttl': 30, 'host': 'ok:2379', 'retry_timeout': 10}}, 'foo') + self.e = get_dcs({'etcd': {'ttl': 30, 'host': 'ok:2379', 'retry_timeout': 10}, + 'citus': {'group': 0}}, 'foo', None) @patch('patroni.ctl.logging.debug') def test_load_config(self, mock_logger_debug): @@ -56,14 +58,14 @@ class TestCtl(unittest.TestCase): @patch('patroni.psycopg.connect', psycopg_connect) def 
test_get_cursor(self): - self.assertIsNone(get_cursor(get_cluster_initialized_without_leader(), {}, role='master')) + self.assertIsNone(get_cursor({}, get_cluster_initialized_without_leader(), None, {}, role='master')) - self.assertIsNotNone(get_cursor(get_cluster_initialized_with_leader(), {}, role='master')) + self.assertIsNotNone(get_cursor({}, get_cluster_initialized_with_leader(), None, {}, role='master')) # MockCursor returns pg_is_in_recovery as false - self.assertIsNone(get_cursor(get_cluster_initialized_with_leader(), {}, role='replica')) + self.assertIsNone(get_cursor({}, get_cluster_initialized_with_leader(), None, {}, role='replica')) - self.assertIsNotNone(get_cursor(get_cluster_initialized_with_leader(), {'dbname': 'foo'}, role='any')) + self.assertIsNotNone(get_cursor({}, get_cluster_initialized_with_leader(), None, {'dbname': 'foo'}, role='any')) def test_parse_dcs(self): assert parse_dcs(None) is None @@ -80,7 +82,7 @@ class TestCtl(unittest.TestCase): cluster = get_cluster_initialized_with_leader(Failover(1, 'foo', 'bar', scheduled_at)) del cluster.members[1].data['conn_url'] for fmt in ('pretty', 'json', 'yaml', 'tsv', 'topology'): - self.assertIsNone(output_members(cluster, name='abc', fmt=fmt)) + self.assertIsNone(output_members({}, cluster, name='abc', fmt=fmt)) @patch('patroni.ctl.get_dcs') @patch.object(PoolManager, 'request', Mock(return_value=MockResponse())) @@ -88,74 +90,79 @@ class TestCtl(unittest.TestCase): mock_get_dcs.return_value = self.e mock_get_dcs.return_value.get_cluster = get_cluster_initialized_with_leader mock_get_dcs.return_value.set_failover_value = Mock() - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='leader\nother\n\ny') assert 'leader' in result.output - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n2300-01-01T12:23:00\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], + input='leader\nother\n2300-01-01T12:23:00\ny') assert result.exit_code == 0 with patch('patroni.dcs.Cluster.is_paused', Mock(return_value=True)): - result = self.runner.invoke(ctl, ['switchover', 'dummy', '--force', '--scheduled', '2015-01-01T12:00:00']) + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0', + '--force', '--scheduled', '2015-01-01T12:00:00']) assert result.exit_code == 1 # Aborting switchover, as we answer NO to the confirmation - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n\nN') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='leader\nother\n\nN') assert result.exit_code == 1 # Aborting scheduled switchover, as we answer NO to the confirmation - result = self.runner.invoke(ctl, ['switchover', 'dummy', '--scheduled', '2015-01-01T12:00:00+01:00'], - input='leader\nother\n\nN') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0', + '--scheduled', '2015-01-01T12:00:00+01:00'], input='leader\nother\n\nN') assert result.exit_code == 1 # Target and source are equal - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nleader\n\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='leader\nleader\n\ny') assert result.exit_code == 1 # Reality is not part of this cluster - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nReality\n\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', 
'--group', '0'], input='leader\nReality\n\ny') assert result.exit_code == 1 - result = self.runner.invoke(ctl, ['switchover', 'dummy', '--force']) + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0', '--force']) assert 'Member' in result.output - result = self.runner.invoke(ctl, ['switchover', 'dummy', '--force', '--scheduled', '2015-01-01T12:00:00+01:00']) + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0', + '--force', '--scheduled', '2015-01-01T12:00:00+01:00']) assert result.exit_code == 0 # Invalid timestamp - result = self.runner.invoke(ctl, ['switchover', 'dummy', '--force', '--scheduled', 'invalid']) + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0', '--force', '--scheduled', 'invalid']) assert result.exit_code != 0 # Invalid timestamp - result = self.runner.invoke(ctl, ['switchover', 'dummy', '--force', '--scheduled', '2115-02-30T12:00:00+01:00']) + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0', + '--force', '--scheduled', '2115-02-30T12:00:00+01:00']) assert result.exit_code != 0 # Specifying wrong leader - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='dummy') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='dummy') assert result.exit_code == 1 with patch.object(PoolManager, 'request', Mock(side_effect=Exception)): # Non-responding patroni - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n2300-01-01T12:23:00\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], + input='leader\nother\n2300-01-01T12:23:00\ny') assert 'falling back to DCS' in result.output with patch.object(PoolManager, 'request') as mocked: mocked.return_value.status = 500 - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='leader\nother\n\ny') assert 'Switchover failed' in result.output mocked.return_value.status = 501 mocked.return_value.data = b'Server does not support this operation' - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='leader\nother\n\ny') assert 'Switchover failed' in result.output # No members available mock_get_dcs.return_value.get_cluster = get_cluster_initialized_with_only_leader - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='leader\nother\n\ny') assert result.exit_code == 1 # No master available mock_get_dcs.return_value.get_cluster = get_cluster_initialized_without_leader - result = self.runner.invoke(ctl, ['switchover', 'dummy'], input='leader\nother\n\ny') + result = self.runner.invoke(ctl, ['switchover', 'dummy', '--group', '0'], input='leader\nother\n\ny') assert result.exit_code == 1 @patch('patroni.ctl.get_dcs') @@ -164,12 +171,14 @@ class TestCtl(unittest.TestCase): mock_get_dcs.return_value = self.e mock_get_dcs.return_value.get_cluster = get_cluster_initialized_with_leader mock_get_dcs.return_value.set_failover_value = Mock() - result = self.runner.invoke(ctl, ['failover', 'dummy'], input='\n') + result = self.runner.invoke(ctl, ['failover', 'dummy', '--force'], input='\n') + assert 'For Citus clusters the --group must me specified' in result.output + result = self.runner.invoke(ctl, ['failover', 'dummy'], input='0\n') 
assert 'Failover could be performed only to a specific candidate' in result.output @patch('patroni.dcs.dcs_modules', Mock(return_value=['patroni.dcs.dummy', 'patroni.dcs.etcd'])) def test_get_dcs(self): - self.assertRaises(PatroniCtlException, get_dcs, {'dummy': {}}, 'dummy') + self.assertRaises(PatroniCtlException, get_dcs, {'dummy': {}}, 'dummy', 0) @patch('patroni.psycopg.connect', psycopg_connect) @patch('patroni.ctl.query_member', Mock(return_value=([['mock column']], None))) @@ -189,7 +198,7 @@ class TestCtl(unittest.TestCase): result = self.runner.invoke(ctl, ['query', 'alpha', '--file', 'dummy', '--command', 'dummy']) assert result.exit_code == 1 - result = self.runner.invoke(ctl, ['query', 'alpha', '--file', 'dummy']) + result = self.runner.invoke(ctl, ['query', 'alpha', '--member', 'abc', '--file', 'dummy']) assert result.exit_code == 0 os.remove('dummy') @@ -207,21 +216,21 @@ class TestCtl(unittest.TestCase): def test_query_member(self): with patch('patroni.ctl.get_cursor', Mock(return_value=MockConnect().cursor())): - rows = query_member(None, None, None, 'master', 'SELECT pg_catalog.pg_is_in_recovery()', {}) + rows = query_member({}, None, None, None, None, 'master', 'SELECT pg_catalog.pg_is_in_recovery()', {}) self.assertTrue('False' in str(rows)) with patch.object(MockCursor, 'execute', Mock(side_effect=OperationalError('bla'))): - rows = query_member(None, None, None, 'replica', 'SELECT pg_catalog.pg_is_in_recovery()', {}) + rows = query_member({}, None, None, None, None, 'replica', 'SELECT pg_catalog.pg_is_in_recovery()', {}) with patch('patroni.ctl.get_cursor', Mock(return_value=None)): - rows = query_member(None, None, None, None, 'SELECT pg_catalog.pg_is_in_recovery()', {}) + rows = query_member({}, None, None, None, None, None, 'SELECT pg_catalog.pg_is_in_recovery()', {}) self.assertTrue('No connection to' in str(rows)) - rows = query_member(None, None, None, 'replica', 'SELECT pg_catalog.pg_is_in_recovery()', {}) + rows = query_member({}, None, None, None, 'foo', 'replica', 'SELECT pg_catalog.pg_is_in_recovery()', {}) self.assertTrue('No connection to' in str(rows)) with patch('patroni.ctl.get_cursor', Mock(side_effect=OperationalError('bla'))): - rows = query_member(None, None, None, 'replica', 'SELECT pg_catalog.pg_is_in_recovery()', {}) + rows = query_member({}, None, None, None, None, 'replica', 'SELECT pg_catalog.pg_is_in_recovery()', {}) @patch('patroni.ctl.get_dcs') def test_dsn(self, mock_get_dcs): @@ -334,21 +343,23 @@ class TestCtl(unittest.TestCase): @patch('patroni.ctl.get_dcs') def test_remove(self, mock_get_dcs): mock_get_dcs.return_value.get_cluster = get_cluster_initialized_with_leader - result = self.runner.invoke(ctl, ['-k', 'remove', 'alpha'], input='alpha\nslave') + result = self.runner.invoke(ctl, ['remove', 'dummy'], input='\n') + assert 'For Citus clusters the --group must me specified' in result.output + result = self.runner.invoke(ctl, ['-k', 'remove', 'alpha', '--group', '0'], input='alpha\nslave') assert 'Please confirm' in result.output assert 'You are about to remove all' in result.output # Not typing an exact confirmation assert result.exit_code == 1 # master specified does not match master of cluster - result = self.runner.invoke(ctl, ['remove', 'alpha'], input='alpha\nYes I am aware\nslave') + result = self.runner.invoke(ctl, ['remove', 'alpha', '--group', '0'], input='alpha\nYes I am aware\nslave') assert result.exit_code == 1 # cluster specified on cmdline does not match verification prompt - result = self.runner.invoke(ctl, 
['remove', 'alpha'], input='beta\nleader') + result = self.runner.invoke(ctl, ['remove', 'alpha', '--group', '0'], input='beta\nleader') assert result.exit_code == 1 - result = self.runner.invoke(ctl, ['remove', 'alpha'], input='alpha\nYes I am aware\nleader') + result = self.runner.invoke(ctl, ['remove', 'alpha', '--group', '0'], input='alpha\nYes I am aware\nleader') assert result.exit_code == 0 def test_ctl(self): @@ -358,23 +369,24 @@ class TestCtl(unittest.TestCase): assert 'Usage:' in result.output def test_get_any_member(self): - self.assertIsNone(get_any_member(get_cluster_initialized_without_leader(), role='master')) + self.assertIsNone(get_any_member({}, get_cluster_initialized_without_leader(), None, role='master')) - m = get_any_member(get_cluster_initialized_with_leader(), role='master') + m = get_any_member({}, get_cluster_initialized_with_leader(), None, role='master') self.assertEqual(m.name, 'leader') def test_get_all_members(self): - self.assertEqual(list(get_all_members(get_cluster_initialized_without_leader(), role='master')), []) + self.assertEqual(list(get_all_members({}, get_cluster_initialized_without_leader(), None, role='master')), []) - r = list(get_all_members(get_cluster_initialized_with_leader(), role='master')) + r = list(get_all_members({}, get_cluster_initialized_with_leader(), None, role='master')) self.assertEqual(len(r), 1) self.assertEqual(r[0].name, 'leader') - r = list(get_all_members(get_cluster_initialized_with_leader(), role='replica')) + r = list(get_all_members({}, get_cluster_initialized_with_leader(), None, role='replica')) self.assertEqual(len(r), 1) self.assertEqual(r[0].name, 'other') - self.assertEqual(len(list(get_all_members(get_cluster_initialized_without_leader(), role='replica'))), 2) + self.assertEqual(len(list(get_all_members({}, get_cluster_initialized_without_leader(), + None, role='replica'))), 2) @patch('patroni.ctl.get_dcs') def test_members(self, mock_get_dcs): @@ -438,16 +450,16 @@ class TestCtl(unittest.TestCase): cluster.members.append(cascade_member_wrong_tags) mock_get_dcs.return_value.get_cluster = Mock(return_value=cluster) result = self.runner.invoke(ctl, ['topology', 'dummy']) - assert '+\n| leader | 127.0.0.1:5435 | Leader |' in result.output - assert '|\n| + other | 127.0.0.1:5436 | Replica |' in result.output - assert '|\n| + cascade | 127.0.0.1:5437 | Replica |' in result.output - assert '|\n| + wrong_cascade | 127.0.0.1:5438 | Replica |' in result.output + assert '+\n| 0 | leader | 127.0.0.1:5435 | Leader |' in result.output + assert '|\n| 0 | + other | 127.0.0.1:5436 | Replica |' in result.output + assert '|\n| 0 | + cascade | 127.0.0.1:5437 | Replica |' in result.output + assert '|\n| 0 | + wrong_cascade | 127.0.0.1:5438 | Replica |' in result.output cluster = get_cluster_initialized_without_leader() mock_get_dcs.return_value.get_cluster = Mock(return_value=cluster) result = self.runner.invoke(ctl, ['topology', 'dummy']) - assert '+\n| + leader | 127.0.0.1:5435 | Replica |' in result.output - assert '|\n| + other | 127.0.0.1:5436 | Replica |' in result.output + assert '+\n| 0 | + leader | 127.0.0.1:5435 | Replica |' in result.output + assert '|\n| 0 | + other | 127.0.0.1:5436 | Replica |' in result.output @patch('patroni.ctl.get_dcs') @patch.object(PoolManager, 'request', Mock(return_value=MockResponse())) diff --git a/tests/test_etcd.py b/tests/test_etcd.py index 0e70462a..898bdd96 100644 --- a/tests/test_etcd.py +++ b/tests/test_etcd.py @@ -40,6 +40,10 @@ def etcd_read(self, key, **kwargs): raise 
etcd.EtcdKeyNotFound response = {"action": "get", "node": {"key": "/service/batman5", "dir": True, "nodes": [ + {"key": "/service/batman5/1", "dir": True, "nodes": [ + {"key": "/service/batman5/1/initialize", "value": "2164261704", + "modifiedIndex": 20729, "createdIndex": 20729}], + "modifiedIndex": 20437, "createdIndex": 20437}, {"key": "/service/batman5/config", "value": '{"synchronous_mode": 0, "failsafe_mode": true}', "modifiedIndex": 1582, "createdIndex": 1582}, {"key": "/service/batman5/failover", "value": "", @@ -266,6 +270,12 @@ class TestEtcd(unittest.TestCase): self.etcd._base_path = '/service/noleader' self.assertRaises(EtcdError, self.etcd.get_cluster) + def test__get_citus_cluster(self): + self.etcd._citus_group = '0' + cluster = self.etcd.get_cluster() + self.assertIsInstance(cluster, Cluster) + self.assertIsInstance(cluster.workers[1], Cluster) + def test_touch_member(self): self.assertFalse(self.etcd.touch_member('', '')) diff --git a/tests/test_etcd3.py b/tests/test_etcd3.py index a4ba5841..4fad7765 100644 --- a/tests/test_etcd3.py +++ b/tests/test_etcd3.py @@ -30,6 +30,8 @@ def mock_urlopen(self, method, url, **kwargs): ret.content = json.dumps({ "header": {"revision": "1"}, "kvs": [ + {"key": base64_encode('/patroni/test/1/initialize'), + "value": base64_encode('12345'), "mod_revision": '1'}, {"key": base64_encode('/patroni/test/leader'), "value": base64_encode('foo'), "lease": "bla", "mod_revision": '1'}, {"key": base64_encode('/patroni/test/members/foo'), @@ -207,6 +209,12 @@ class TestEtcd3(BaseTestEtcd3): mock_urlopen.side_effect = SleepException() self.assertRaises(Etcd3Error, self.etcd3.get_cluster) + def test__get_citus_cluster(self): + self.etcd3._citus_group = '0' + cluster = self.etcd3.get_cluster() + self.assertIsInstance(cluster, Cluster) + self.assertIsInstance(cluster.workers[1], Cluster) + def test_touch_member(self): self.etcd3.touch_member({}) self.etcd3._lease = 'bla' diff --git a/tests/test_ha.py b/tests/test_ha.py index c88fc774..38b4ca47 100644 --- a/tests/test_ha.py +++ b/tests/test_ha.py @@ -13,6 +13,7 @@ from patroni.postgresql import Postgresql from patroni.postgresql.bootstrap import Bootstrap from patroni.postgresql.cancellable import CancellableSubprocess from patroni.postgresql.config import ConfigHandler +from patroni.postgresql.postmaster import PostmasterProcess from patroni.postgresql.rewind import Rewind from patroni.postgresql.slots import SlotsHandler from patroni.utils import tzutc @@ -51,7 +52,8 @@ def get_cluster_bootstrapping_without_leader(cluster_config=None): def get_cluster_initialized_without_leader(leader=False, failover=None, sync=None, cluster_config=None, failsafe=False): m1 = Member(0, 'leader', 28, {'conn_url': 'postgres://replicator:rep-pass@127.0.0.1:5435/postgres', - 'api_url': 'http://127.0.0.1:8008/patroni', 'xlog_location': 4}) + 'api_url': 'http://127.0.0.1:8008/patroni', 'xlog_location': 4, + 'role': 'master', 'state': 'running'}) leader = Leader(0, 0, m1 if leader else Member(0, '', 28, {})) m2 = Member(0, 'other', 28, {'conn_url': 'postgres://replicator:rep-pass@127.0.0.1:5436/postgres', 'api_url': 'http://127.0.0.1:8011/patroni', @@ -188,6 +190,7 @@ def run_async(self, func, args=()): @patch('patroni.async_executor.AsyncExecutor.busy', PropertyMock(return_value=False)) @patch('patroni.async_executor.AsyncExecutor.run_async', run_async) @patch('patroni.postgresql.rewind.Thread', Mock()) +@patch('patroni.postgresql.citus.CitusHandler.start', Mock()) @patch('subprocess.call', Mock(return_value=0)) 
@patch('time.sleep', Mock()) class TestHa(PostgresInit): @@ -204,7 +207,8 @@ class TestHa(PostgresInit): self.p.postmaster_start_time = MagicMock(return_value=str(postmaster_start_time)) self.p.can_create_replica_without_replication_connection = MagicMock(return_value=False) self.e = get_dcs({'etcd': {'ttl': 30, 'host': 'ok:2379', 'scope': 'test', - 'name': 'foo', 'retry_timeout': 10}}) + 'name': 'foo', 'retry_timeout': 10}, + 'citus': {'database': 'citus', 'group': None}}) self.ha = Ha(MockPatroni(self.p, self.e)) self.ha.old_cluster = self.e.get_cluster() self.ha.cluster = get_cluster_initialized_without_leader() @@ -225,6 +229,9 @@ class TestHa(PostgresInit): self.p.timeline_wal_position = Mock(return_value=(0, 1, 1)) self.p.set_role('standby_leader') self.ha.touch_member() + self.p.set_role('master') + self.ha.dcs.touch_member = true + self.ha.touch_member() def test_is_leader(self): self.assertFalse(self.ha.is_leader()) @@ -420,6 +427,12 @@ class TestHa(PostgresInit): self.ha.has_lock = true self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0), the leader with the lock') + def test_coordinator_leader_with_lock(self): + self.ha.cluster = get_cluster_initialized_with_leader() + self.ha.cluster.is_unlocked = false + self.ha.has_lock = true + self.assertEqual(self.ha.run_cycle(), 'no action. I am (postgresql0), the leader with the lock') + @patch.object(Postgresql, '_wait_for_connection_close', Mock()) def test_demote_because_not_having_lock(self): self.ha.cluster.is_unlocked = false @@ -553,6 +566,8 @@ class TestHa(PostgresInit): self.assertEqual(self.ha.bootstrap(), 'failed to acquire initialize lock') @patch('patroni.psycopg.connect', psycopg_connect) + @patch('patroni.postgresql.citus.connect', psycopg_connect) + @patch('patroni.postgresql.citus.quote_ident', Mock()) @patch.object(Postgresql, 'connection', Mock(return_value=None)) def test_bootstrap_initialized_new_cluster(self): self.ha.cluster = get_cluster_not_initialized_without_leader() @@ -572,16 +587,20 @@ class TestHa(PostgresInit): self.assertRaises(PatroniFatalException, self.ha.post_bootstrap) @patch('patroni.psycopg.connect', psycopg_connect) + @patch('patroni.postgresql.citus.connect', psycopg_connect) + @patch('patroni.postgresql.citus.quote_ident', Mock()) @patch.object(Postgresql, 'connection', Mock(return_value=None)) def test_bootstrap_release_initialize_key_on_watchdog_failure(self): self.ha.cluster = get_cluster_not_initialized_without_leader() self.e.initialize = true self.ha.bootstrap() - self.p.is_running.return_value = MockPostmaster() self.p.is_leader = true - with patch.object(Watchdog, 'activate', Mock(return_value=False)): + with patch.object(Watchdog, 'activate', Mock(return_value=False)),\ + patch('patroni.ha.logger.error') as mock_logger: self.assertEqual(self.ha.post_bootstrap(), 'running post_bootstrap') self.assertRaises(PatroniFatalException, self.ha.post_bootstrap) + self.assertTrue(mock_logger.call_args[0][0].startswith('Cancelling bootstrap because' + ' watchdog activation failed')) @patch('patroni.psycopg.connect', psycopg_connect) def test_reinitialize(self): @@ -608,6 +627,20 @@ class TestHa(PostgresInit): with patch.object(self.ha, "restart_matches", return_value=False): self.assertEqual(self.ha.restart({'foo': 'bar'}), (False, "restart conditions are not satisfied")) + @patch('time.sleep', Mock()) + @patch.object(ConfigHandler, 'replace_pg_hba', Mock()) + @patch.object(ConfigHandler, 'replace_pg_ident', Mock()) + @patch.object(PostmasterProcess, 'start', 
Mock(return_value=MockPostmaster())) + @patch('patroni.postgresql.citus.CitusHandler.is_coordinator', Mock(return_value=False)) + def test_worker_restart(self): + self.ha.has_lock = true + self.ha.patroni.request = Mock() + self.p.is_running = Mock(side_effect=[Mock(), False]) + self.assertEqual(self.ha.restart({}), (True, 'restarted successfully')) + self.ha.patroni.request.assert_called() + self.assertEqual(self.ha.patroni.request.call_args_list[0][0][3]['type'], 'before_demote') + self.assertEqual(self.ha.patroni.request.call_args_list[1][0][3]['type'], 'after_promote') + @patch('os.kill', Mock()) def test_restart_in_progress(self): with patch('patroni.async_executor.AsyncExecutor.busy', PropertyMock(return_value=True)): @@ -631,6 +664,7 @@ class TestHa(PostgresInit): self.ha.is_paused = true self.assertEqual(self.ha.run_cycle(), 'PAUSE: restart in progress') + @patch('patroni.postgresql.citus.CitusHandler.is_coordinator', Mock(return_value=False)) def test_manual_failover_from_leader(self): self.ha.fetch_node_status = get_node_status() self.ha.has_lock = true @@ -1266,6 +1300,16 @@ class TestHa(PostgresInit): self.ha.is_failover_possible = true self.ha.shutdown() + @patch('patroni.postgresql.citus.CitusHandler.is_coordinator', Mock(return_value=False)) + def test_shutdown_citus_worker(self): + self.ha.is_leader = true + self.p.is_running = Mock(side_effect=[Mock(), False]) + self.ha.patroni.request = Mock() + self.ha.shutdown() + self.ha.patroni.request.assert_called() + self.assertEqual(self.ha.patroni.request.call_args[0][2], 'citus') + self.assertEqual(self.ha.patroni.request.call_args[0][3]['type'], 'before_demote') + @patch('time.sleep', Mock()) def test_leader_with_not_accessible_data_directory(self): self.ha.cluster = get_cluster_initialized_with_leader() @@ -1366,3 +1410,16 @@ class TestHa(PostgresInit): self.ha.dcs.attempt_to_acquire_leader = Mock(side_effect=[DCSError('foo'), Exception]) self.assertRaises(DCSError, self.ha.acquire_lock) self.assertFalse(self.ha.acquire_lock()) + + @patch('patroni.postgresql.citus.CitusHandler.is_coordinator', Mock(return_value=False)) + def test_notify_citus_coordinator(self): + self.ha.patroni.request = Mock() + self.ha.notify_citus_coordinator('before_demote') + self.ha.patroni.request.assert_called_once() + self.assertEqual(self.ha.patroni.request.call_args[1]['timeout'], 30) + self.ha.patroni.request = Mock(side_effect=Exception) + with patch('patroni.ha.logger.warning') as mock_logger: + self.ha.notify_citus_coordinator('before_promote') + self.assertEqual(self.ha.patroni.request.call_args[1]['timeout'], 2) + mock_logger.assert_called() + self.assertTrue(mock_logger.call_args[0][0].startswith('Request to Citus coordinator')) diff --git a/tests/test_kubernetes.py b/tests/test_kubernetes.py index 27238a1c..b0a9dcdd 100644 --- a/tests/test_kubernetes.py +++ b/tests/test_kubernetes.py @@ -7,7 +7,7 @@ import time import unittest from mock import Mock, PropertyMock, mock_open, patch -from patroni.dcs.kubernetes import k8s_client, k8s_config, K8sConfig, K8sConnectionFailed,\ +from patroni.dcs.kubernetes import Cluster, k8s_client, k8s_config, K8sConfig, K8sConnectionFailed,\ K8sException, K8sObject, Kubernetes, KubernetesError, KubernetesRetriableException,\ Retry, RetryFailedError, SERVICE_HOST_ENV_NAME, SERVICE_PORT_ENV_NAME from six.moves import builtins @@ -26,6 +26,15 @@ def mock_list_namespaced_config_map(*args, **kwargs): items.append(k8s_client.V1ConfigMap(metadata=k8s_client.V1ObjectMeta(**metadata))) metadata.update({'name': 
'test-sync', 'annotations': {'leader': 'p-0'}}) items.append(k8s_client.V1ConfigMap(metadata=k8s_client.V1ObjectMeta(**metadata))) + metadata.update({'name': 'test-0-leader', 'labels': {Kubernetes._CITUS_LABEL: '0'}, + 'annotations': {'optime': '1234x', 'leader': 'p-0', 'ttl': '30s', 'slots': '{', 'failsafe': '{'}}) + items.append(k8s_client.V1ConfigMap(metadata=k8s_client.V1ObjectMeta(**metadata))) + metadata.update({'name': 'test-0-config', 'labels': {Kubernetes._CITUS_LABEL: '0'}, + 'annotations': {'initialize': '123', 'config': '{}'}}) + items.append(k8s_client.V1ConfigMap(metadata=k8s_client.V1ObjectMeta(**metadata))) + metadata.update({'name': 'test-1-leader', 'labels': {Kubernetes._CITUS_LABEL: '1'}, + 'annotations': {'leader': 'p-3', 'ttl': '30s'}}) + items.append(k8s_client.V1ConfigMap(metadata=k8s_client.V1ObjectMeta(**metadata))) metadata = k8s_client.V1ObjectMeta(resource_version='1') return k8s_client.V1ConfigMapList(metadata=metadata, items=items, kind='ConfigMapList') @@ -48,7 +57,8 @@ def mock_list_namespaced_endpoints(*args, **kwargs): def mock_list_namespaced_pod(*args, **kwargs): - metadata = k8s_client.V1ObjectMeta(resource_version='1', name='p-0', annotations={'status': '{}'}, + metadata = k8s_client.V1ObjectMeta(resource_version='1', labels={'f': 'b', Kubernetes._CITUS_LABEL: '1'}, + name='p-0', annotations={'status': '{}'}, uid='964dfeae-e79b-4476-8a5a-1920b5c2a69d') status = k8s_client.V1PodStatus(pod_ip='10.0.0.0') spec = k8s_client.V1PodSpec(hostname='p-0', node_name='kind-control-plane', containers=[]) @@ -213,9 +223,10 @@ class BaseTestKubernetes(unittest.TestCase): @patch.object(k8s_client.CoreV1Api, 'list_namespaced_config_map', mock_list_namespaced_config_map, create=True) def setUp(self, config=None): config = config or {} - config.update(ttl=30, scope='test', name='p-0', loop_wait=10, + config.update(ttl=30, scope='test', name='p-0', loop_wait=10, group=0, retry_timeout=10, labels={'f': 'b'}, bypass_api_service=True) self.k = Kubernetes(config) + self.k._citus_group = None self.assertRaises(AttributeError, self.k._pods._build_cache) self.k._pods._is_ready = True self.assertRaises(TypeError, self.k._kinds._build_cache) @@ -239,6 +250,20 @@ class TestKubernetesConfigMaps(BaseTestKubernetes): with patch.object(Kubernetes, '_wait_caches', Mock(side_effect=Exception)): self.assertRaises(KubernetesError, self.k.get_cluster) + def test__get_citus_cluster(self): + self.k._citus_group = '0' + cluster = self.k.get_cluster() + self.assertIsInstance(cluster, Cluster) + self.assertIsInstance(cluster.workers[1], Cluster) + + @patch('patroni.dcs.kubernetes.logger.error') + def test_get_citus_coordinator(self, mock_logger): + self.assertIsInstance(self.k.get_citus_coordinator(), Cluster) + with patch.object(Kubernetes, '_cluster_loader', Mock(side_effect=Exception)): + self.assertIsNone(self.k.get_citus_coordinator()) + mock_logger.assert_called() + self.assertTrue(mock_logger.call_args[0][0].startswith('Failed to load Citus coordinator')) + def test_attempt_to_acquire_leader(self): with patch.object(k8s_client.CoreV1Api, 'patch_namespaced_config_map', create=True) as mock_patch: mock_patch.side_effect = K8sException @@ -373,6 +398,7 @@ class TestCacheBuilder(BaseTestKubernetes): @patch.object(k8s_client.CoreV1Api, 'list_namespaced_config_map', mock_list_namespaced_config_map, create=True) @patch('patroni.dcs.kubernetes.ObjectCache._watch') def test__build_cache(self, mock_response): + self.k._citus_group = '0' mock_response.return_value.read_chunked.return_value = 
diff --git a/tests/test_raft.py b/tests/test_raft.py
index 2c8ce84e..21306e20 100644
--- a/tests/test_raft.py
+++ b/tests/test_raft.py
@@ -4,7 +4,8 @@ import tempfile
 import time

 from mock import Mock, PropertyMock, patch
-from patroni.dcs.raft import DynMemberSyncObj, KVStoreTTL, Raft, RaftError, SyncObjUtility, TCPTransport, _TCPTransport
+from patroni.dcs.raft import Cluster, DynMemberSyncObj, KVStoreTTL,\
+    Raft, RaftError, SyncObjUtility, TCPTransport, _TCPTransport
 from pysyncobj import SyncObjConf, FAIL_REASON
@@ -128,7 +129,8 @@ class TestRaft(unittest.TestCase):
     def test_raft(self):
         raft = Raft({'ttl': 30, 'scope': 'test', 'name': 'pg', 'self_addr': '127.0.0.1:1234',
-                     'retry_timeout': 10, 'data_dir': self._TMP})
+                     'retry_timeout': 10, 'data_dir': self._TMP,
+                     'database': 'citus', 'group': 0})
         raft.reload_config({'retry_timeout': 20, 'ttl': 60, 'loop_wait': 10})
         self.assertTrue(raft._sync_obj.set(raft.members_path + 'legacy', '{"version":"2.0.0"}'))
         self.assertTrue(raft.touch_member(''))
@@ -136,20 +138,28 @@ class TestRaft(unittest.TestCase):
         self.assertTrue(raft.cancel_initialization())
         self.assertTrue(raft.set_config_value('{}'))
         self.assertTrue(raft.write_sync_state('foo', 'bar'))
+        raft._citus_group = '1'
         self.assertTrue(raft.manual_failover('foo', 'bar'))
-        raft.get_cluster()
+        raft._citus_group = '0'
+        cluster = raft.get_cluster()
+        self.assertIsInstance(cluster, Cluster)
+        self.assertIsInstance(cluster.workers[1], Cluster)
         self.assertTrue(raft._sync_obj.set(raft.status_path, '{"optime":1234567,"slots":{"ls":12345}}'))
         raft.get_cluster()
         self.assertTrue(raft.update_leader('1', failsafe={'foo': 'bat'}))
         self.assertTrue(raft._sync_obj.set(raft.failsafe_path, '{"foo"}'))
         self.assertTrue(raft._sync_obj.set(raft.status_path, '{'))
-        raft.get_cluster()
+        raft.get_citus_coordinator()
         self.assertTrue(raft.delete_sync_state())
         self.assertTrue(raft.delete_leader())
         self.assertTrue(raft.set_history_value(''))
         self.assertTrue(raft.delete_cluster())
+        raft._citus_group = '1'
+        self.assertTrue(raft.delete_cluster())
+        raft._citus_group = None
         raft.get_cluster()
         self.assertTrue(raft.take_leader())
+        raft.get_cluster()
         raft.watch(None, 0.001)
         raft._sync_obj.destroy()
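The Raft test exercises the group-scoped key layout: with raft._citus_group set, cluster paths are nested under the group id, so delete_cluster() for group '1' removes only that group's subtree, and get_citus_coordinator() reads the coordinator's group regardless of the worker's own group. A toy illustration of such prefixing follows; the layout is purely an assumption made to keep the test flow readable, not Patroni's actual path construction:

    def client_path(scope, citus_group, path):
        # e.g. ('test', None, 'leader') -> /service/test/leader
        #      ('test', '0',  'leader') -> /service/test/0/leader
        base = '/service/' + scope
        if citus_group is not None:
            base += '/' + citus_group
        return base + '/' + path.lstrip('/')

    assert client_path('test', None, 'leader') == '/service/test/leader'
    assert client_path('test', '0', 'leader') == '/service/test/0/leader'

The 'database': 'citus' and 'group': 0 keys added to the Raft config appear to be what switch this group-aware mode on.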
diff --git a/tests/test_zookeeper.py b/tests/test_zookeeper.py
index e1e2ac66..d98fe77f 100644
--- a/tests/test_zookeeper.py
+++ b/tests/test_zookeeper.py
@@ -62,7 +62,7 @@ class MockKazooClient(Mock):
         if path.startswith('/no_node'):
             raise NoNodeError
         elif path in ['/service/bla/', '/service/test/']:
-            return ['initialize', 'leader', 'members', 'optime', 'failover', 'sync', 'failsafe']
+            return ['initialize', 'leader', 'members', 'optime', 'failover', 'sync', 'failsafe', '0', '1']
         return ['foo', 'bar', 'buzz']

     def create(self, path, value=b"", acl=None, ephemeral=False, sequence=False, makepath=False):
@@ -155,6 +155,11 @@ class TestZooKeeper(unittest.TestCase):
     def test_session_listener(self):
         self.zk.session_listener(KazooState.SUSPENDED)

+    def test_members_watcher(self):
+        self.zk._fetch_cluster = False
+        self.zk.members_watcher(None)
+        self.assertTrue(self.zk._fetch_cluster)
+
     def test_reload_config(self):
         self.zk.reload_config({'ttl': 20, 'retry_timeout': 10, 'loop_wait': 10})
         self.zk.reload_config({'ttl': 20, 'retry_timeout': 10, 'loop_wait': 5})
@@ -165,15 +170,15 @@ def test_get_children(self):
         self.assertListEqual(self.zk.get_children('/no_node'), [])

-    def test__inner_load_cluster(self):
+    def test__cluster_loader(self):
         self.zk._base_path = self.zk._base_path.replace('test', 'bla')
-        self.zk._inner_load_cluster()
+        self.zk._cluster_loader(self.zk.client_path(''))
         self.zk._base_path = self.zk._base_path = '/broken'
-        self.zk._inner_load_cluster()
+        self.zk._cluster_loader(self.zk.client_path(''))
         self.zk._base_path = self.zk._base_path = '/legacy'
-        self.zk._inner_load_cluster()
+        self.zk._cluster_loader(self.zk.client_path(''))
         self.zk._base_path = self.zk._base_path = '/no_node'
-        self.zk._inner_load_cluster()
+        self.zk._cluster_loader(self.zk.client_path(''))

     def test_get_cluster(self):
         cluster = self.zk.get_cluster(True)
@@ -188,6 +193,19 @@ class TestZooKeeper(unittest.TestCase):
         cluster = self.zk.get_cluster()
         self.assertEqual(cluster.last_lsn, 500)

+    def test__get_citus_cluster(self):
+        self.zk._citus_group = '0'
+        for _ in range(0, 2):
+            cluster = self.zk.get_cluster()
+            self.assertIsInstance(cluster, Cluster)
+            self.assertIsInstance(cluster.workers[1], Cluster)
+
+    @patch('patroni.dcs.zookeeper.logger.error')
+    @patch.object(ZooKeeper, '_cluster_loader', Mock(side_effect=Exception))
+    def test_get_citus_coordinator(self, mock_logger):
+        self.assertIsNone(self.zk.get_citus_coordinator())
+        mock_logger.assert_called_once()
+
     def test_delete_leader(self):
         self.assertTrue(self.zk.delete_leader())
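Across all the DCS backends touched here the pattern is the same: the old single-purpose loader (ZooKeeper's _inner_load_cluster) becomes a path-parameterised _cluster_loader, so identical code can load the node's own cluster, every worker group, or just the coordinator. The mocked get_children returning the extra children '0' and '1' is what makes /service/test look like a Citus tree. A condensed sketch of the reuse this rename buys; the names are assumptions, and since Patroni's Cluster is a namedtuple the workers mapping is shown as a separate return value rather than an attribute:

    def load_citus_cluster(loader, base_path, children):
        # Numeric children are Citus group ids; group 0 is the coordinator.
        clusters = {int(g): loader('%s/%s/' % (base_path, g))
                    for g in children if g.isdigit()}
        coordinator = clusters.pop(0)
        return coordinator, clusters  # second value plays the role of cluster.workers

test_get_citus_coordinator in turn fixes the error path: when the loader raises, get_citus_coordinator returns None and logs the failure instead of propagating the exception (the Kubernetes variant additionally checks the logged message).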