Compare commits

..

5 Commits

Author SHA1 Message Date
Jeff McCune
d64c3e8c66 (#58) Zitadel Failover RunBook 2024-03-14 15:25:38 -07:00
Jeff McCune
f344f97374 (#58) Restore last zitadel database backup
When the cluster is provisioned, restore the most recent backup instead
of a fixed point in time.
2024-03-14 11:40:17 -07:00
Jeff McCune
770088b912 (#53) Clean up nested if statements with && 2024-03-13 10:35:20 -07:00
Jeff McCune
cb9b39c3ca (#53) Add Vault as an optional service on the core clusters
This patch migrates the vault component from [holos-infra][1] to a cue
based component.  Vault is optional in the reference platform, so this
patch also defines an `#OptionalServices` struct to conditionally manage
a service across multiple clusters in the platform.

The primary use case for optional services is managing a namespace to
provision and provide secrets across clusters.

[1]: https://github.com/holos-run/holos-infra/tree/v0.5.0/components/core/core/vault
2024-03-12 17:18:38 -07:00
Jeff McCune
0f34b20546 (#54) Disable helm hooks when rendering components
Pods are unnecessarily created when deploying helm-based holos
components and often fail.  Prevent these test pods by disabling helm
hooks with the `--no-hooks` flag.

Closes: #54
2024-03-12 14:14:20 -07:00
19 changed files with 1972 additions and 42 deletions

View File

@@ -0,0 +1,39 @@
package holos
// Controls optional feature flags for services distributed across multiple holos components.
// For example, enable issuing certificates in the provisioner cluster when an optional service is
// enabled for a workload cluster.
#OptionalService: {
name: string
enabled: true | *false
clusters: [Name=_]: #Platform.clusters[Name]
clusterNames: [for k, v in clusters {k}]
namespaces: [Name=_]: #ManagedNamespace & {
name: Name
}
// servers represents istio Gateway.spec.servers entries
// Refer to istio/gateway/gateway.cue
servers: [Name=_]: {
hosts: [...string]
port: name: Name
port: number: 443
port: protocol: "HTTPS"
tls: credentialName: string
tls: mode: "SIMPLE"
}
// public tls certs should align to hosts.
certs: [Name=_]: #Certificate & {
metadata: name: Name
}
}
#OptionalServices: {
[Name=_]: #OptionalService & {
name: Name
}
}
for k, v in #OptionalServices {
#ManagedNamespaces: v.namespaces
}
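A minimal, self-contained sketch (simplified field types; hypothetical `example` service and `prod-example` namespace) of how the comprehension above folds an enabled service's namespaces into `#ManagedNamespaces`:

```cue
package holos

// Simplified stand-ins for the platform definitions above.
#ManagedNamespaces: [Name=_]: {
	name: Name
	labels: [string]: string
}

#OptionalServices: [Name=_]: {
	name:    Name
	enabled: true | *false
	namespaces: [NS=_]: {
		name: NS
		labels: [string]: string
	}
}

// Hypothetical optional service declaring one namespace.
#OptionalServices: example: {
	enabled: true
	namespaces: "prod-example": labels: "istio-injection": "enabled"
}

// Same comprehension as above: every service's namespaces unify into the
// platform-wide set, so `cue eval -e '#ManagedNamespaces'` shows prod-example
// carrying the istio-injection label.
for k, v in #OptionalServices {
	#ManagedNamespaces: v.namespaces
}
```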

View File

@@ -0,0 +1,54 @@
package holos
let CoreDomain = "core.\(#Platform.org.domain)"
let TargetNamespace = "prod-core-vault"
#OptionalServices: {
vault: {
enabled: true
clusters: core1: _
clusters: core2: _
namespaces: "prod-core-vault": labels: "istio-injection": "enabled"
certs: "vault-core": #Certificate & {
metadata: name: "vault-core"
metadata: namespace: "istio-ingress"
spec: {
commonName: "vault.\(CoreDomain)"
dnsNames: [commonName]
secretName: metadata.name
issuerRef: kind: "ClusterIssuer"
issuerRef: name: string | *"letsencrypt"
}
}
servers: "https-vault-core": {
hosts: ["\(TargetNamespace)/vault.\(CoreDomain)"]
tls: credentialName: certs."vault-core".spec.secretName
}
for k, v in clusters {
let obj = (Cert & {Name: "vault-core", Cluster: v.name}).APIObject
certs: "\(obj.metadata.name)": obj
servers: "https-\(obj.metadata.name)": {
hosts: [for host in obj.spec.dnsNames {"\(TargetNamespace)/\(host)"}]
tls: credentialName: obj.spec.secretName
}
}
}
}
// Cert provisions a cluster specific certificate.
let Cert = {
Name: string
Cluster: string
APIObject: #Certificate & {
metadata: name: "\(Cluster)-\(Name)"
metadata: namespace: string | *"istio-ingress"
spec: {
commonName: string | *"vault.\(Cluster).\(CoreDomain)"
dnsNames: [commonName]
secretName: metadata.name
issuerRef: kind: "ClusterIssuer"
issuerRef: name: string | *"letsencrypt"
}
}
}
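For illustration only: with `Cluster: "core1"` and assuming `#Platform.org.domain` is `ois.run` (as suggested by the haproxy config later in this compare), the `Cert` helper above evaluates to roughly:

```cue
// Hypothetical result of (Cert & {Name: "vault-core", Cluster: "core1"}).APIObject
// with the defaults above filled in.
APIObject: {
	metadata: name:      "core1-vault-core"
	metadata: namespace: "istio-ingress"
	spec: {
		commonName: "vault.core1.core.ois.run"
		dnsNames: ["vault.core1.core.ois.run"]
		secretName: "core1-vault-core"
		issuerRef: kind: "ClusterIssuer"
		issuerRef: name: "letsencrypt"
	}
}
```

The matching `servers` entry then routes `prod-core-vault/vault.core1.core.ois.run` on port 443 with `tls.credentialName: "core1-vault-core"`.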

View File

@@ -13,7 +13,10 @@ let BucketRepoName = "repo2"
// Restore options. Set the timestamp to a known good point in time.
// time="2024-03-11T17:08:58Z" level=info msg="crunchy-pgbackrest ends"
let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// Restore the most recent backup.
let RestoreOptions = []
#KubernetesObjects & {
apiObjects: {
@@ -40,7 +43,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: "10Gi"
}
}]
standby: {
@@ -103,7 +106,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
name: "repo1"
volume: volumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: string | *"4Gi"
}
},
{
@@ -137,7 +140,7 @@ let HighlyAvailable = {
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: "1Gi"
resources: requests: storage: string | *"10Gi"
}
affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: [{
weight: 1

View File

@@ -10,7 +10,8 @@ let Name = "zitadel"
#Kustomization: spec: targetNamespace: #TargetNamespace
#HelmChart & {
namespace: #TargetNamespace
enableHooks: true
chart: {
name: Name
version: "7.9.0"

View File

@@ -7,26 +7,16 @@ package holos
component: "namespaces"
}
// #PlatformNamespaceObjects defines the api objects necessary for eso SecretStores in external clusters to access secrets in a given namespace in the provisioner cluster.
#PlatformNamespaceObjects: {
_ns: #PlatformNamespace
objects: [
#Namespace & {
metadata: _ns
},
]
}
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.
for ns in #PlatformNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let NS = ns.name
let Name = obj.metadata.name
"\(Kind)": "\(NS)/\(Name)": obj
}
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
}
}

View File

@@ -1,7 +1,8 @@
package holos
import "list"
// The primary istio Gateway, named default
let Name = "gateway"
#InputKeys: component: Name
@@ -31,5 +32,19 @@ let LoginCert = #PlatformCerts.login
},
]
}
for k, svc in #OptionalServices {
if svc.enabled && list.Contains(svc.clusterNames, #ClusterName) {
Gateway: "\(svc.name)": #Gateway & {
metadata: name: svc.name
metadata: namespace: #TargetNamespace
spec: selector: istio: "ingressgateway"
spec: servers: [for s in svc.servers {s}]
}
for k, s in svc.servers {
ExternalSecret: "\(s.tls.credentialName)": _
}
}
}
}
}

View File

@@ -30,5 +30,10 @@ package holos
"\(Kind)": "\(NS)/\(Name)": obj
}
}
for k, ns in #ManagedNamespaces {
let obj = #SecretStore & {_namespace: ns.name}
SecretStore: "\(ns.name)/\(obj.metadata.name)": obj
}
}
}

File diff suppressed because it is too large.

View File

@@ -0,0 +1,146 @@
package holos
#Values: {
// Vault Helm Chart Holos Values
global: {
enabled: true
// Istio handles this
tlsDisable: true
}
injector: enabled: false
server: {
image: {
// repository: "hashicorp/vault"
repository: "quay.io/holos/hashicorp/vault"
tag: "1.14.10"
// Overrides the default Image Pull Policy
pullPolicy: "IfNotPresent"
}
extraLabels: "sidecar.istio.io/inject": "true"
resources: requests: {
memory: "256Mi"
cpu: "2000m"
}
// limits:
// memory: 1024Mi
// cpu: 2000m
// For HA configuration and because we need to manually init the vault,
// we need to define custom readiness/liveness Probe settings
readinessProbe: {
enabled: true
path: "/v1/sys/health?standbyok=true&sealedcode=204&uninitcode=204"
}
livenessProbe: {
enabled: true
path: "/v1/sys/health?standbyok=true"
initialDelaySeconds: 60
}
// extraEnvironmentVars is a list of extra environment variables to set with
// the stateful set. These could be used to include variables required for
// auto-unseal.
// Vault validates an incomplete chain:
// https://github.com/hashicorp/vault/issues/11318
extraEnvironmentVars: {
GOMAXPROCS: "2"
} // Set to cpu limit, see https://github.com/uber-go/automaxprocs
// extraVolumes is a list of extra volumes to mount. These will be exposed
// to Vault in the path `/vault/userconfig/<name>/`.
extraVolumes: [{
type: "secret"
name: "gcpkms-creds"
}]
// This configures the Vault Statefulset to create a PVC for audit logs.
// See https://www.vaultproject.io/docs/audit/index.html to know more
auditStorage: {
enabled: true
mountPath: "/var/log/vault"
} // for compatibility with plain debian vm location.
standalone: {
enabled: false
}
ha: {
enabled: true
replicas: 3
raft: {
enabled: true
setNodeId: true
config: """
ui = true
listener \"tcp\" {
address = \"[::]:8200\"
cluster_address = \"[::]:8201\"
# mTLS is handled by the istio sidecar
tls_disable = \"true\"
# Enable unauthenticated metrics access (necessary for Prometheus Operator)
telemetry {
unauthenticated_metrics_access = true
}
}
telemetry {
prometheus_retention_time = \"30s\"
disable_hostname = true
}
seal \"gcpckms\" {
credentials = \"/vault/userconfig/gcpkms-creds/credentials.json\"
project = \"v6-vault-f15f\"
region = \"us-west1\"
key_ring = \"vault-core\"
crypto_key = \"vault-core-unseal\"
}
# Note: the retry_join leader_api_address values come from the Stable
# Network ID feature of a Statefulset. See:
# https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#stable-network-id
storage \"raft\" {
path = \"/vault/data\"
retry_join {
leader_api_addr = \"http://vault-0.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
retry_join {
leader_api_addr = \"http://vault-1.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
retry_join {
leader_api_addr = \"http://vault-2.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
autopilot {
cleanup_dead_servers = \"true\"
last_contact_threshold = \"200ms\"
last_contact_failure_threshold = \"10m\"
max_trailing_logs = 250000
min_quorum = 3
server_stabilization_time = \"10s\"
}
}
service_registration \"kubernetes\" {}
"""
}
}
}
// Vault UI (Will be exposed via the service mesh)
ui: {
enabled: true
serviceType: "ClusterIP"
serviceNodePort: null
externalPort: 8200
}
}

View File

@@ -0,0 +1,75 @@
package holos
import "encoding/yaml"
import "list"
let Name = "vault"
#InputKeys: component: Name
#InputKeys: project: "core"
#TargetNamespace: "\(#InstancePrefix)-\(Name)"
let Vault = #OptionalServices[Name]
if Vault.enabled && list.Contains(Vault.clusterNames, #ClusterName) {
#HelmChart & {
namespace: #TargetNamespace
chart: {
name: Name
version: "0.25.0"
repository: {
name: "hashicorp"
url: "https://helm.releases.hashicorp.com"
}
}
values: #Values
apiObjects: {
ExternalSecret: "gcpkms-creds": _
ExternalSecret: "vault-server-cert": _
VirtualService: "\(Name)": {
metadata: name: Name
metadata: namespace: #TargetNamespace
spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
spec: gateways: ["istio-ingress/\(Name)"]
spec: http: [
{
route: [
{
destination: host: "\(Name)-active"
destination: port: number: 8200
},
]
},
]
}
}
}
#Kustomize: {
patches: [
{
target: {
group: "apps"
version: "v1"
kind: "StatefulSet"
name: Name
}
patch: yaml.Marshal(EnvPatch)
},
]
}
// JSON 6902 patch: the test op guards the positional index, so the replace
// below only applies when env[4] is still the VAULT_ADDR variable.
let EnvPatch = [
{
op: "test"
path: "/spec/template/spec/containers/0/env/4/name"
value: "VAULT_ADDR"
},
{
op: "replace"
path: "/spec/template/spec/containers/0/env/4/value"
value: "http://$(VAULT_K8S_POD_NAME):8200"
},
]
}

View File

@@ -0,0 +1,13 @@
package holos
let Vault = #OptionalServices.vault
if Vault.enabled {
#KubernetesObjects & {
apiObjects: {
for k, obj in Vault.certs {
"\(obj.kind)": "\(obj.metadata.name)": obj
}
}
}
}

View File

@@ -24,6 +24,14 @@ ksObjects: []
"\(Kind)": "\(ns.name)/\(Name)": obj
}
}
for k, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(ns.name)/\(Name)": obj
}
}
}
}

View File

@@ -7,25 +7,16 @@ package holos
component: "namespaces"
}
// #PlatformNamespaceObjects defines the api objects necessary for eso SecretStores in external clusters to access secrets in a given namespace in the provisioner cluster.
#PlatformNamespaceObjects: {
_ns: #PlatformNamespace
objects: [
#Namespace & {
metadata: name: _ns.name
},
]
}
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.
for ns in #PlatformNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(Name)": obj
}
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
}
}

View File

@@ -12,7 +12,7 @@ let Privileged = {
// #PlatformNamespaces is the union of all namespaces across all cluster types. Namespaces are created in all clusters regardless of whether they're
// used within the cluster or not. This is important for security and consistency with IAM, RBAC, and Secrets sync between clusters.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
// TODO: Deprecate in favor of #ManagedNamespaces because it is better to add fields to an object than to add items to a list.
#PlatformNamespaces: [
{name: "external-secrets"},
{name: "holos-system"},

View File

@@ -285,6 +285,21 @@ _apiVersion: "holos.run/v1alpha1"
}
}
// ManagedNamespace is a namespace to manage across all clusters in the holos platform.
#ManagedNamespace: {
// TODO metadata labels and annotations
name: string
labels: [string]: string
}
// #ManagedNamespaces is the union of all namespaces across all cluster types and optional services.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
#ManagedNamespaces: {
[Name=_]: {
name: Name
}
}
// #Backups defines backup configuration.
// TODO: Consider the best place for this, possibly as part of the site platform config. This represents the primary location for backups.
#Backups: {
@@ -393,6 +408,8 @@ _apiVersion: "holos.run/v1alpha1"
resourcesFile: ResourcesFile
// kustomizeFiles represents the files in a kustomize directory tree.
kustomizeFiles: #KustomizeFiles.Files
// enableHooks removes the --no-hooks flag from helm template
enableHooks: true | *false
}
// #KustomizeBuild is a holos component that uses plain yaml files as the source of api objects for a holos component.

View File

@@ -0,0 +1,305 @@
## Overview
Use this runbook to move the ZITADEL primary postgres database from one cluster to another. At the end of the process, the https://zitadel.example.com service will be live on the cluster that was the standby at the start of the process.
Downtime: ~10 minutes if executed quickly. ~60 minutes if executed slowly and deliberately.
## Initial State
1. https://login.ois.run is routed to core2.
2. `postgrescluster/zitadel` on core2 is primary.
3. `postgrescluster/zitadel` on core1 is standby.
4. A recent [[#Full Backup]] has been taken and is stored in S3.
5. The standby cluster has been restored from the recent full backup.
## Process
1. [[#Edit Platform site config]]
2. [[#Render core2]]
3. [[#Demote core2]]
4. [[#Verify core2 is standby]]
5. [[#Render core1]]
6. [[#Promote core1]]
7. [[#Verify core1 is primary]]
8. [[#Reconcile Zitadel]]
9. [[#Cut over NLB]]
10. [[#Test Login]]
## Edit Platform site config
Change the primary cluster from core2 to core1 in `platforms/platform.site.cue`.
```cue
#Platform: primaryCluster: name: "core1"
```
## Render core2
Configure core2 as a standby.
```
holos render --cluster-name=core2 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index c1aa8394..45daa650 100644
--- a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,24 +85,6 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
- dataSource:
- pgbackrest:
- configuration:
- - secret:
- name: pgo-s3-creds
- global:
- repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
- repo2-cipher-type: aes-256-cbc
- options:
- - --type=time
- - --target="2024-03-11 17:10:00+00"
- repo:
- name: repo2
- s3:
- bucket: ois-zitadel-backups
- endpoint: s3.dualstack.us-east-2.amazonaws.com
- region: us-east-2
- stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -139,7 +121,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: false
+ enabled: true
repoName: repo2
users:
- databases
```
## Demote core2
```
git add -p
git commit -m 'Make core2 a standby zitadel cluster'
git push origin HEAD
flux reconcile source git flux-system
flux get ks
```
## Verify core2 is standby
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -2
```
Expected output:
```
2024-03-14 21:58:26,205 INFO: no action. I am (zitadel-pgha1-smw7-0), the standby leader with the lock
```
## Render core1
```
holos render --cluster-name=core1 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index aa69814e..290c0865 100644
--- a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,6 +85,22 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
+ dataSource:
+ pgbackrest:
+ configuration:
+ - secret:
+ name: pgo-s3-creds
+ global:
+ repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
+ repo2-cipher-type: aes-256-cbc
+ options: []
+ repo:
+ name: repo2
+ s3:
+ bucket: ois-zitadel-backups
+ endpoint: s3.dualstack.us-east-2.amazonaws.com
+ region: us-east-2
+ stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -121,7 +137,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: true
+ enabled: false
repoName: repo2
users:
- databases:
```
## Promote core1
```
flux reconcile source git flux-system
flux reconcile ks prod-iam-postgres
```
## Verify core1 is primary
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -1
```
Expected output:
```
2024-03-14 22:05:01,159 INFO: no action. I am (zitadel-pgha1-xl2b-0), the leader with the lock
```
## Reconcile Zitadel
Now that the database is back up, reconcile zitadel.
This is pretty brutal but it works:
```
flux delete ks prod-iam-zitadel
Are you sure you want to delete this kustomization: y
► deleting kustomization prod-iam-zitadel in flux-system namespace
✔ kustomization deleted
```
```
k apply --server-side=true -f deploy/clusters/core1/holos/components/prod-iam-zitadel-kustomization.gen.yaml
kustomization.kustomize.toolkit.fluxcd.io/prod-iam-zitadel serverside-applied
```
ZITADEL should come up READY 2/2
```
k get pods
NAME READY STATUS RESTARTS AGE
zitadel-76776f747f-94wft 2/2 Running 2 (30s ago) 34s
zitadel-76776f747f-fl24c 2/2 Running 1 (30s ago) 34s
zitadel-76776f747f-xx2w7 2/2 Running 2 (30s ago) 34s
zitadel-backup-49c7-h25cj 0/1 Completed 0 3m23s
zitadel-init-td2sh 0/1 Completed 0 34s
zitadel-pgbouncer-d9f8cffc-2vwk5 2/2 Running 0 22m
zitadel-pgbouncer-d9f8cffc-rrrhd 2/2 Running 0 22m
zitadel-pgha1-4npq-0 4/4 Running 0 3m21s
zitadel-pgha1-xl2b-0 4/4 Running 0 3m43s
zitadel-repo-host-0 2/2 Running 0 22m
zitadel-setup-b7zkx 0/1 Completed 1 34s
zitadel-test-connection 0/1 Error 0 34s
```
## Cut over NLB
Finally, cut over the NLB config to move to core1.
This is specific to our metal clusters.
Apply with: `git push puppet +HEAD:production -o run=netlb`
```diff
diff --git a/site-modules/profile/templates/haproxy/haproxy.cfg.epp b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
index b358a70..3ee40db 100644
--- a/site-modules/profile/templates/haproxy/haproxy.cfg.epp
+++ b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
@@ -225,7 +225,7 @@ frontend sni-ingress from tcp-ingress
use_backend core2-https if { req.ssl_sni -i web.holos.run }
use_backend core2-https if { req.ssl_sni -i -m end .web.holos.run }
# Identity provider is routed to one cluster at a time
- use_backend core2-https if { req.ssl_sni -i login.ois.run }
+ use_backend core1-https if { req.ssl_sni -i login.ois.run }
# Holos Dev Environments
use_backend k1-https if { req.ssl_sni -i holos.wtf }
use_backend k1-https if { req.ssl_sni -i -m end .holos.wtf }
@@ -358,7 +358,7 @@ frontend http-ingress from http-ingress
bind 65.102.23.41:80
bind 2602:41:6617:2ec0::80:1d:80 v6only
# Zitadel is active on one cluster at a time
- use_backend core2-http if { hdr(host) -i login.ois.run }
+ use_backend core1-http if { hdr(host) -i login.ois.run }
# Vault is active on core1 or core2
use_backend core2-http if { hdr(host) -i vault.core.ois.run }
# Infrastructure Dex OIDC ID issuer is active on core1 or core2
```
## Test Login
```
kubectl oidc-login get-token \
--oidc-extra-scope=openid \
--oidc-extra-scope=email \
--oidc-extra-scope=profile \
--oidc-extra-scope=groups \
--oidc-extra-scope=offline_access \
--oidc-extra-scope=urn:zitadel:iam:org:domain:primary:openinfrastructure.co \
--oidc-issuer-url=https://login.ois.run \
--oidc-client-id=${CLIENT_ID:-257714027772314751@holos_platform} \
--oidc-use-pkce \
--force-refresh > ~/.kube/token.json
```
```
jq -r .status.token ~/.kube/token.json | cut -d. -f2 | base64 -d | jq
```
```json
{
"amr": [
"mfa",
"user"
],
"at_hash": "5nUq6kXWT8GheCNvDNdyow",
"aud": [
"257714027772314751@holos_platform",
"257713952794870157"
],
"auth_time": 1710172186,
"azp": "257714027772314751@holos_platform",
"client_id": "257714027772314751@holos_platform",
"email": "jeff@openinfrastructure.co",
"email_verified": true,
"exp": 1710497614,
"family_name": "McCune",
"given_name": "Jeff",
"groups": [
"prod-cluster-admin"
],
"iat": 1710454404,
"iss": "https://login.ois.run",
"locale": null,
"name": "Jeff McCune",
"nickname": "Jeff",
"preferred_username": "jeff@openinfrastructure.co",
"sub": "257712562366383231",
"updated_at": 1710105084,
"urn:zitadel:iam:org:domain:primary": "openinfrastructure.co",
"urn:zitadel:iam:org:project:257713952794870157:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
},
"urn:zitadel:iam:org:project:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
}
}
```
## Tasks
### Identify the leader
Same command for primary and standby clusters.
```
k get pods -o name -l postgres-operator.crunchydata.com/role=master
```
### Full Backup
On the primary cluster:
```
kubectl -n prod-iam-zitadel annotate postgrescluster zitadel postgres-operator.crunchydata.com/pgbackrest-backup="$(date)" --overwrite
```
Watch the progress:
```
k -n prod-iam-zitadel logs -l postgres-operator.crunchydata.com/pgbackrest-backup=manual -f
```
Expected output:
```
time="2024-03-14T18:36:16Z" level=info msg="crunchy-pgbackrest starts"
time="2024-03-14T18:36:16Z" level=info msg="debug flag set to false"
time="2024-03-14T18:36:16Z" level=info msg="backrest backup command requested"
time="2024-03-14T18:36:16Z" level=info msg="command to execute is [pgbackrest backup --stanza=db --repo=2 --type=full]"
time="2024-03-14T18:39:11Z" level=info msg="output=[]"
time="2024-03-14T18:39:11Z" level=info msg="stderr=[]"
time="2024-03-14T18:39:11Z" level=info msg="crunchy-pgbackrest ends"
```
## Troubleshooting
### Full backup doesn't start
Check the controller logs. If you see `Switchover failed` errors, try editing the number of postgrescluster replicas down to 1 then back up to 2. Then try a backup again.
```
Switchover failed, details: 503, Switchover failed\n" version=5.5.1-0-amd64
```
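A hedged sketch of the replica bounce suggested above, assuming the PGO v5 `PostgresCluster` schema with a single `instances` entry (Flux may revert the live object on its next reconcile, which is fine for this bounce):
```
# Hypothetical example: scale the instance set down to 1 replica...
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 1}]'
# ...wait for the extra pod to terminate, then scale back up to 2.
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 2}]'
```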

View File

@@ -121,6 +121,7 @@ type HelmChart struct {
Namespace string `json:"namespace"`
Chart Chart `json:"chart"`
ValuesContent string `json:"valuesContent"`
EnableHooks bool `json:"enableHooks"`
// APIObjectMap holds the marshalled representation of api objects.
APIObjectMap apiObjectMap `json:"APIObjectMap"`
}
@@ -445,7 +446,12 @@ func runHelm(ctx context.Context, hc *HelmChart, r *Result, path holos.PathCompo
// Run charts
chart := hc.Chart
helmOut, err := util.RunCmd(ctx, "helm", "template", "--include-crds", "--values", valuesPath, "--namespace", hc.Namespace, "--kubeconfig", "/dev/null", "--version", chart.Version, chart.Release, cachedChartPath)
args := []string{"template"}
if !hc.EnableHooks {
args = append(args, "--no-hooks")
}
args = append(args, "--include-crds", "--values", valuesPath, "--namespace", hc.Namespace, "--kubeconfig", "/dev/null", "--version", chart.Version, chart.Release, cachedChartPath)
helmOut, err := util.RunCmd(ctx, "helm", args...)
if err != nil {
stderr := helmOut.Stderr.String()
lines := strings.Split(stderr, "\n")

View File

@@ -1 +1 @@
56
58