Compare commits

3 Commits

Author       SHA1        Message                                       Date
Jeff McCune  d64c3e8c66  (#58) Zitadel Failover RunBook                2024-03-14 15:25:38 -07:00
Jeff McCune  f344f97374  (#58) Restore last zitadel database backup    2024-03-14 11:40:17 -07:00
                         When the cluster is provisioned, restore the
                         most recent backup instead of a fixed point
                         in time.
Jeff McCune  770088b912  (#53) Clean up nested if statements with &&   2024-03-13 10:35:20 -07:00
5 changed files with 374 additions and 70 deletions

View File

@@ -13,7 +13,10 @@ let BucketRepoName = "repo2"
 // Restore options. Set the timestamp to a known good point in time.
 // time="2024-03-11T17:08:58Z" level=info msg="crunchy-pgbackrest ends"
-let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
+// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
+// Restore the most recent backup.
+let RestoreOptions = []
 #KubernetesObjects & {
 	apiObjects: {
@@ -40,7 +43,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
 			replicas: 2
 			dataVolumeClaimSpec: {
 				accessModes: ["ReadWriteOnce"]
-				resources: requests: storage: string | *"1Gi"
+				resources: requests: storage: "10Gi"
 			}
 		}]
 		standby: {
@@ -103,7 +106,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
 				name: "repo1"
 				volume: volumeClaimSpec: {
 					accessModes: ["ReadWriteOnce"]
-					resources: requests: storage: string | *"1Gi"
+					resources: requests: storage: string | *"4Gi"
 				}
 			},
 			{
@@ -137,7 +140,7 @@ let HighlyAvailable = {
 		replicas: 2
 		dataVolumeClaimSpec: {
 			accessModes: ["ReadWriteOnce"]
-			resources: requests: storage: "1Gi"
+			resources: requests: storage: string | *"10Gi"
 		}
 		affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: [{
 			weight: 1

View File

@@ -34,17 +34,15 @@ let LoginCert = #PlatformCerts.login
 }
 for k, svc in #OptionalServices {
-	if svc.enabled {
-		if list.Contains(svc.clusterNames, #ClusterName) {
-			Gateway: "\(svc.name)": #Gateway & {
-				metadata: name: svc.name
-				metadata: namespace: #TargetNamespace
-				spec: selector: istio: "ingressgateway"
-				spec: servers: [for s in svc.servers {s}]
-			}
-			for k, s in svc.servers {
-				ExternalSecret: "\(s.tls.credentialName)": _
-			}
-		}
+	if svc.enabled && list.Contains(svc.clusterNames, #ClusterName) {
+		Gateway: "\(svc.name)": #Gateway & {
+			metadata: name: svc.name
+			metadata: namespace: #TargetNamespace
+			spec: selector: istio: "ingressgateway"
+			spec: servers: [for s in svc.servers {s}]
+		}
+		for k, s in svc.servers {
+			ExternalSecret: "\(s.tls.credentialName)": _
+		}
 	}
 }

View File

@@ -11,67 +11,65 @@ let Name = "vault"
 let Vault = #OptionalServices[Name]
-if Vault.enabled {
-	if list.Contains(Vault.clusterNames, #ClusterName) {
-		#HelmChart & {
-			namespace: #TargetNamespace
-			chart: {
-				name: Name
-				version: "0.25.0"
-				repository: {
-					name: "hashicorp"
-					url: "https://helm.releases.hashicorp.com"
-				}
-			}
-			values: #Values
-			apiObjects: {
-				ExternalSecret: "gcpkms-creds": _
-				ExternalSecret: "vault-server-cert": _
-				VirtualService: "\(Name)": {
-					metadata: name: Name
-					metadata: namespace: #TargetNamespace
-					spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
-					spec: gateways: ["istio-ingress/\(Name)"]
-					spec: http: [
-						{
-							route: [
-								{
-									destination: host: "\(Name)-active"
-									destination: port: number: 8200
-								},
-							]
-						},
-					]
-				}
-			}
-			#Kustomize: {
-				patches: [
-					{
-						target: {
-							group: "apps"
-							version: "v1"
-							kind: "StatefulSet"
-							name: Name
-						}
-						patch: yaml.Marshal(EnvPatch)
-					},
-				]
-			}
-		}
-		let EnvPatch = [
-			{
-				op: "test"
-				path: "/spec/template/spec/containers/0/env/4/name"
-				value: "VAULT_ADDR"
-			},
-			{
-				op: "replace"
-				path: "/spec/template/spec/containers/0/env/4/value"
-				value: "http://$(VAULT_K8S_POD_NAME):8200"
-			},
-		]
-	}
-}
+if Vault.enabled && list.Contains(Vault.clusterNames, #ClusterName) {
+	#HelmChart & {
+		namespace: #TargetNamespace
+		chart: {
+			name: Name
+			version: "0.25.0"
+			repository: {
+				name: "hashicorp"
+				url: "https://helm.releases.hashicorp.com"
+			}
+		}
+		values: #Values
+		apiObjects: {
+			ExternalSecret: "gcpkms-creds": _
+			ExternalSecret: "vault-server-cert": _
+			VirtualService: "\(Name)": {
+				metadata: name: Name
+				metadata: namespace: #TargetNamespace
+				spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
+				spec: gateways: ["istio-ingress/\(Name)"]
+				spec: http: [
+					{
+						route: [
+							{
+								destination: host: "\(Name)-active"
+								destination: port: number: 8200
+							},
+						]
+					},
+				]
+			}
+		}
+		#Kustomize: {
+			patches: [
+				{
+					target: {
+						group: "apps"
+						version: "v1"
+						kind: "StatefulSet"
+						name: Name
+					}
+					patch: yaml.Marshal(EnvPatch)
+				},
+			]
+		}
+	}
+	let EnvPatch = [
+		{
+			op: "test"
+			path: "/spec/template/spec/containers/0/env/4/name"
+			value: "VAULT_ADDR"
+		},
+		{
+			op: "replace"
+			path: "/spec/template/spec/containers/0/env/4/value"
+			value: "http://$(VAULT_K8S_POD_NAME):8200"
+		},
+	]
 }

View File

@@ -0,0 +1,305 @@
## Overview
Use this run book to move ZITADEL's primary PostgreSQL database from one cluster to another. At the end of the process, the https://login.ois.run service is live on the cluster that was the standby at the start.
Downtime: ~10 minutes if executed quickly; ~60 minutes if executed slowly and deliberately.
## Initial State
1. https://login.ois.run is routed to core2.
2. `postgrescluster/zitadel` on core2 is primary.
3. `postgrescluster/zitadel` on core1 is standby.
4. A recent [[#Full Backup]] has been taken and is stored in S3.
5. The standby cluster has been restored from the recent full backup.
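Before starting, you can confirm this initial state by checking which database pod holds the Patroni leader lock on each cluster. A minimal sketch, assuming kubectl contexts named `core1` and `core2` and the `prod-iam-zitadel` namespace used elsewhere in this run book:
```
for ctx in core1 core2; do
  echo "== ${ctx} =="
  kubectl --context "${ctx}" -n prod-iam-zitadel logs \
    "$(kubectl --context "${ctx}" -n prod-iam-zitadel get pods -o name \
      -l postgres-operator.crunchydata.com/role=master)" | tail -2
done
```
The primary logs "the leader with the lock"; the standby logs "the standby leader with the lock".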
## Process
1. [[#Edit Platform site config]]
2. [[#Render core2]]
3. [[#Demote core2]]
4. [[#Verify core2 is standby]]
5. [[#Render core1]]
6. [[#Promote core1]]
7. [[#Verify core1 is primary]]
8. [[#Reconcile Zitadel]]
9. [[#Cut over NLB]]
10. [[#Test Login]]
## Edit Platform site config
Change the primary cluster from core2 to core1 in `platforms/platform.site.cue`.
```cue
#Platform: primaryCluster: name: "core1"
```
## Render core2
Configure core2 as a standby.
```
holos render --cluster-name=core2 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index c1aa8394..45daa650 100644
--- a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,24 +85,6 @@ spec:
     name: zitadel-repl-tls
   customTLSSecret:
     name: zitadel-primary-tls
-  dataSource:
-    pgbackrest:
-      configuration:
-        - secret:
-            name: pgo-s3-creds
-      global:
-        repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
-        repo2-cipher-type: aes-256-cbc
-      options:
-        - --type=time
-        - --target="2024-03-11 17:10:00+00"
-      repo:
-        name: repo2
-        s3:
-          bucket: ois-zitadel-backups
-          endpoint: s3.dualstack.us-east-2.amazonaws.com
-          region: us-east-2
-      stanza: db
   image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
   instances:
     - name: pgha1
@@ -139,7 +121,7 @@ spec:
       image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
       replicas: 2
   standby:
-    enabled: false
+    enabled: true
     repoName: repo2
   users:
     - databases:
```
## Demote core2
```
git add -p
git commit -m 'Make core2 a standby zitadel cluster'
git push origin HEAD
flux reconcile source git flux-system
flux get ks
```
## Verify core2 is standby
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -2
```
Expected output:
```
2024-03-14 21:58:26,205 INFO: no action. I am (zitadel-pgha1-smw7-0), the standby leader with the lock
```
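As a cross-check (a sketch, not part of the original procedure), the live spec should now report standby mode enabled:
```
kubectl -n prod-iam-zitadel get postgrescluster zitadel \
  -o jsonpath='{.spec.standby.enabled}{"\n"}'
```
Expect `true` here; after [[#Promote core1]], the same check on core1 should return `false`.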
## Render core1
```
holos render --cluster-name=core1 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index aa69814e..290c0865 100644
--- a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,6 +85,22 @@ spec:
     name: zitadel-repl-tls
   customTLSSecret:
     name: zitadel-primary-tls
+  dataSource:
+    pgbackrest:
+      configuration:
+        - secret:
+            name: pgo-s3-creds
+      global:
+        repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
+        repo2-cipher-type: aes-256-cbc
+      options: []
+      repo:
+        name: repo2
+        s3:
+          bucket: ois-zitadel-backups
+          endpoint: s3.dualstack.us-east-2.amazonaws.com
+          region: us-east-2
+      stanza: db
   image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
   instances:
     - name: pgha1
@@ -121,7 +137,7 @@ spec:
       image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
       replicas: 2
   standby:
-    enabled: true
+    enabled: false
     repoName: repo2
   users:
     - databases:
```
## Promote core1
```
flux reconcile source git flux-system
flux reconcile ks prod-iam-postgres
```
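Optionally confirm the kustomization reconciled before checking the database role, mirroring the `flux get ks` check used while demoting core2:
```
flux get ks prod-iam-postgres
```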
## Verify core1 is primary
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -1
```
Expected output:
```
2024-03-14 22:05:01,159 INFO: no action. I am (zitadel-pgha1-xl2b-0), the leader with the lock
```
## Reconcile Zitadel
Now that the database is back up, reconcile ZITADEL.
Deleting and re-applying the Flux Kustomization is brutal, but it works:
```
flux delete ks prod-iam-zitadel
Are you sure you want to delete this kustomization: y
► deleting kustomization prod-iam-zitadel in flux-system namespace
✔ kustomization deleted
```
```
k apply --server-side=true -f deploy/clusters/core1/holos/components/prod-iam-zitadel-kustomization.gen.yaml
kustomization.kustomize.toolkit.fluxcd.io/prod-iam-zitadel serverside-applied
```
The ZITADEL pods should come up `READY 2/2`:
```
k get pods
NAME READY STATUS RESTARTS AGE
zitadel-76776f747f-94wft 2/2 Running 2 (30s ago) 34s
zitadel-76776f747f-fl24c 2/2 Running 1 (30s ago) 34s
zitadel-76776f747f-xx2w7 2/2 Running 2 (30s ago) 34s
zitadel-backup-49c7-h25cj 0/1 Completed 0 3m23s
zitadel-init-td2sh 0/1 Completed 0 34s
zitadel-pgbouncer-d9f8cffc-2vwk5 2/2 Running 0 22m
zitadel-pgbouncer-d9f8cffc-rrrhd 2/2 Running 0 22m
zitadel-pgha1-4npq-0 4/4 Running 0 3m21s
zitadel-pgha1-xl2b-0 4/4 Running 0 3m43s
zitadel-repo-host-0 2/2 Running 0 22m
zitadel-setup-b7zkx 0/1 Completed 1 34s
zitadel-test-connection 0/1 Error 0 34s
```
## Cut over NLB
Finally, cut over the NLB config to route login.ois.run to core1.
This step is specific to our metal clusters.
Apply with: `git push puppet +HEAD:production -o run=netlb`
```diff
diff --git a/site-modules/profile/templates/haproxy/haproxy.cfg.epp b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
index b358a70..3ee40db 100644
--- a/site-modules/profile/templates/haproxy/haproxy.cfg.epp
+++ b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
@@ -225,7 +225,7 @@ frontend sni-ingress from tcp-ingress
     use_backend core2-https if { req.ssl_sni -i web.holos.run }
     use_backend core2-https if { req.ssl_sni -i -m end .web.holos.run }
     # Identity provider is routed to one cluster at a time
-    use_backend core2-https if { req.ssl_sni -i login.ois.run }
+    use_backend core1-https if { req.ssl_sni -i login.ois.run }
     # Holos Dev Environments
     use_backend k1-https if { req.ssl_sni -i holos.wtf }
     use_backend k1-https if { req.ssl_sni -i -m end .holos.wtf }
@@ -358,7 +358,7 @@ frontend http-ingress from http-ingress
    bind 65.102.23.41:80
    bind 2602:41:6617:2ec0::80:1d:80 v6only
    # Zitadel is active on one cluster at a time
-   use_backend core2-http if { hdr(host) -i login.ois.run }
+   use_backend core1-http if { hdr(host) -i login.ois.run }
    # Vault is active on core1 or core2
    use_backend core2-http if { hdr(host) -i vault.core.ois.run }
    # Infrastructure Dex OIDC ID issuer is active on core1 or core2
```
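Once the Puppet run finishes, a quick smoke test before the full login test (a sketch; any successful HTTP status line shows the new routing answers):
```
curl -sI https://login.ois.run | head -1
```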
## Test Login
```
kubectl oidc-login get-token \
--oidc-extra-scope=openid \
--oidc-extra-scope=email \
--oidc-extra-scope=profile \
--oidc-extra-scope=groups \
--oidc-extra-scope=offline_access \
--oidc-extra-scope=urn:zitadel:iam:org:domain:primary:openinfrastructure.co \
--oidc-issuer-url=https://login.ois.run \
--oidc-client-id=${CLIENT_ID:-257714027772314751@holos_platform} \
--oidc-use-pkce \
--force-refresh > ~/.kube/token.json
```
```
jq -r .status.token ~/.kube/token.json | cut -d. -f2 | base64 -d | jq
```
```json
{
"amr": [
"mfa",
"user"
],
"at_hash": "5nUq6kXWT8GheCNvDNdyow",
"aud": [
"257714027772314751@holos_platform",
"257713952794870157"
],
"auth_time": 1710172186,
"azp": "257714027772314751@holos_platform",
"client_id": "257714027772314751@holos_platform",
"email": "jeff@openinfrastructure.co",
"email_verified": true,
"exp": 1710497614,
"family_name": "McCune",
"given_name": "Jeff",
"groups": [
"prod-cluster-admin"
],
"iat": 1710454404,
"iss": "https://login.ois.run",
"locale": null,
"name": "Jeff McCune",
"nickname": "Jeff",
"preferred_username": "jeff@openinfrastructure.co",
"sub": "257712562366383231",
"updated_at": 1710105084,
"urn:zitadel:iam:org:domain:primary": "openinfrastructure.co",
"urn:zitadel:iam:org:project:257713952794870157:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
},
"urn:zitadel:iam:org:project:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
}
}
```
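To assert on the claims instead of eyeballing the JSON, the same decode pipeline can feed `jq -e`, which exits non-zero when the expected group is missing (a sketch):
```
jq -r .status.token ~/.kube/token.json | cut -d. -f2 | base64 -d 2>/dev/null \
  | jq -e '.groups | index("prod-cluster-admin") != null'
```
Because `jq -e` sets its exit status from the last output value, this works as a scripted post-cutover check.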
## Tasks
### Identify the leader
The same command identifies the leader on both the primary and standby clusters.
```
k get pods -o name -l postgres-operator.crunchydata.com/role=master
```
### Full Backup
On the primary cluster:
```
kubectl -n prod-iam-zitadel annotate postgrescluster zitadel postgres-operator.crunchydata.com/pgbackrest-backup="$(date)" --overwrite
```
Watch the progress:
```
k -n prod-iam-zitadel logs -l postgres-operator.crunchydata.com/pgbackrest-backup=manual -f
```
Expected output:
```
time="2024-03-14T18:36:16Z" level=info msg="crunchy-pgbackrest starts"
time="2024-03-14T18:36:16Z" level=info msg="debug flag set to false"
time="2024-03-14T18:36:16Z" level=info msg="backrest backup command requested"
time="2024-03-14T18:36:16Z" level=info msg="command to execute is [pgbackrest backup --stanza=db --repo=2 --type=full]"
time="2024-03-14T18:39:11Z" level=info msg="output=[]"
time="2024-03-14T18:39:11Z" level=info msg="stderr=[]"
time="2024-03-14T18:39:11Z" level=info msg="crunchy-pgbackrest ends"
```
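To confirm the new full backup landed in the repository, ask pgbackrest directly; a sketch, assuming the `zitadel-repo-host-0` pod name from the pod listing above:
```
kubectl -n prod-iam-zitadel exec zitadel-repo-host-0 -- pgbackrest info --stanza=db
```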
## Troubleshooting
### Full backup doesn't start
Check the controller logs. If you see `Switchover failed` errors, scale the postgrescluster replicas down to 1 and back up to 2 (a patch for this is sketched below), then try the backup again.
```
Switchover failed, details: 503, Switchover failed\n" version=5.5.1-0-amd64
```
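A sketch of that replica bounce as a JSON patch; the `/spec/instances/0/replicas` path matches the rendered manifests above, but verify it against your PostgresCluster before applying:
```
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 1}]'
# Wait for the extra instance pod to terminate, then scale back up.
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 2}]'
```
Flux will eventually reconcile the cluster back to the rendered spec, which is fine here since the rendered value is 2.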

View File

@@ -1 +1 @@
-57
+58