Compare commits

...

5 Commits

Author SHA1 Message Date
Jeff McCune
fd6fbe5598 (#57) Allow gha-rs scale set to fail on all but one cluster
The effect of this patch is limited to refreshing credentials only for
namespaces that exist in the local cluster.  There is structure in place
in the CUE code to allow for namespaces bound to specific clusters, but
this is used only by the optional Vault component.
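For illustration, a minimal sketch of that structure as used by the Vault service (field names mirror the diffs below; the values are only an example):

```cue
// Sketch only: an optional service bound to specific clusters.
// Its managed namespace is created on the listed clusters plus the provisioner.
#OptionalServices: vault: {
	enabled: true
	clusters: core1: _
	clusters: core2: _
	managedNamespaces: "prod-core-vault": {
		namespace: metadata: labels: "istio-injection": "enabled"
	}
}
```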

This patch was an attempt to work around
https://github.com/actions/actions-runner-controller/issues/3351 by
deploying the runner scale sets into unique namespaces.

This effort was a waste of time: only one listener pod successfully
registered for a given scale set name / group combination.

Because we have only one group named Default, we can have only one
listener pod globally for a given scale set name.

Because we want our workflows to execute regardless of the availability
of a single cluster, we're going to let this fail for now.  The pod
retries every 3 seconds.  When a cluster is destroyed, another cluster
will quickly register.

A follow-up patch will expand this retry behavior.
2024-03-15 12:53:16 -07:00
Jeff McCune
67472e1e1c (#60) Disable flux reconciliation of deployment/zitadel on standby clusters 2024-03-14 21:58:32 -07:00
Jeff McCune
d64c3e8c66 (#58) Zitadel Failover RunBook 2024-03-14 15:25:38 -07:00
Jeff McCune
f344f97374 (#58) Restore last zitadel database backup
When the cluster is provisioned, restore the most recent backup instead
of a fixed point in time.
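In CUE terms the change reduces to dropping the point-in-time flags from the pgbackrest restore options (see the prod-iam-postgres diff below):

```cue
// Before: restore to a fixed point in time.
// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// After: an empty options list restores the most recent backup.
let RestoreOptions = []
```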
2024-03-14 11:40:17 -07:00
Jeff McCune
770088b912 (#53) Clean up nested if statements with && 2024-03-13 10:35:20 -07:00
18 changed files with 499 additions and 118 deletions

View File

@@ -1,16 +1,19 @@
package holos
// Controls optional feature flags for services distributed across multiple holos components.
// For example, enable issuing certificates in the provisioner cluster when an optional service is
// enabled for a workload cluster.
package holos
import "list"
#OptionalService: {
name: string
enabled: true | *false
clusters: [Name=_]: #Platform.clusters[Name]
clusterNames: [for k, v in clusters {k}]
namespaces: [Name=_]: #ManagedNamespace & {
name: Name
clusterNames: [for c in clusters {c.name}]
managedNamespaces: [Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
clusterNames: ["provisioner", for c in clusters {c.name}]
}
// servers represents istio Gateway.spec.servers.hosts entries
// Refer to istio/gateway/gateway.cue
@@ -34,6 +37,10 @@ package holos
}
}
for k, v in #OptionalServices {
#ManagedNamespaces: v.namespaces
for svc in #OptionalServices {
for nsName, ns in svc.managedNamespaces {
if svc.enabled && list.Contains(ns.clusterNames, #ClusterName) {
#ManagedNamespaces: "\(nsName)": ns
}
}
}

View File

@@ -8,7 +8,9 @@ let TargetNamespace = "prod-core-vault"
enabled: true
clusters: core1: _
clusters: core2: _
namespaces: "prod-core-vault": labels: "istio-injection": "enabled"
managedNamespaces: "prod-core-vault": {
namespace: metadata: labels: "istio-injection": "enabled"
}
certs: "vault-core": #Certificate & {
metadata: name: "vault-core"
metadata: namespace: "istio-ingress"

View File

@@ -13,7 +13,10 @@ let BucketRepoName = "repo2"
// Restore options. Set the timestamp to a known good point in time.
// time="2024-03-11T17:08:58Z" level=info msg="crunchy-pgbackrest ends"
let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// Restore the most recent backup.
let RestoreOptions = []
#KubernetesObjects & {
apiObjects: {
@@ -40,7 +43,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: "10Gi"
}
}]
standby: {
@@ -103,7 +106,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
name: "repo1"
volume: volumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: string | *"4Gi"
}
},
{
@@ -137,7 +140,7 @@ let HighlyAvailable = {
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: "1Gi"
resources: requests: storage: string | *"10Gi"
}
affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: [{
weight: 1

View File

@@ -61,43 +61,70 @@ let DatabaseCACertPatch = [
},
]
#Kustomize: {
patches: [
{
let CAPatch = #Patch & {
target: {
group: "apps" | "batch"
version: "v1"
kind: "Job" | "Deployment"
name: string
}
patch: yaml.Marshal(DatabaseCACertPatch)
}
#KustomizePatches: {
mesh: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
}
deploymentCA: CAPatch & {
target: group: "apps"
target: kind: "Deployment"
target: name: Name
}
initJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-init"
}
setupJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-setup"
}
if #IsPrimaryCluster == false {
fluxDisable: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
},
{
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
initDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-init"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
setupDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-setup"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
]
patch: yaml.Marshal(DisableFluxPatch)
}
}
}
let DisableFluxPatch = [{op: "replace", path: "/metadata/annotations/kustomize.toolkit.fluxcd.io~1reconcile", value: "disabled"}]

View File

@@ -4,6 +4,6 @@ package holos
#InputKeys: project: "github"
#DependsOn: Namespaces: name: "prod-secrets-namespaces"
#TargetNamespace: #InputKeys.component
#ARCSystemNamespace: "arc-system"
#HelmChart: namespace: #TargetNamespace
#HelmChart: chart: version: "0.8.3"

View File

@@ -1,5 +1,6 @@
package holos
#TargetNamespace: "arc-runner"
#InputKeys: component: "arc-runner"
#Kustomization: spec: targetNamespace: #TargetNamespace

View File

@@ -1,6 +1,6 @@
package holos
#TargetNamespace: "arc-system"
#TargetNamespace: #ARCSystemNamespace
#InputKeys: component: "arc-system"
#HelmChart & {

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#TargetNamespace: "default"
#InputKeys: {
@@ -11,7 +13,9 @@ package holos
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
if list.Contains(ns.clusterNames, #ClusterName) {
Namespace: "\(k)": #Namespace & ns.namespace
}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.

View File

@@ -34,17 +34,15 @@ let LoginCert = #PlatformCerts.login
}
for k, svc in #OptionalServices {
if svc.enabled {
if list.Contains(svc.clusterNames, #ClusterName) {
Gateway: "\(svc.name)": #Gateway & {
metadata: name: svc.name
metadata: namespace: #TargetNamespace
spec: selector: istio: "ingressgateway"
spec: servers: [for s in svc.servers {s}]
}
for k, s in svc.servers {
ExternalSecret: "\(s.tls.credentialName)": _
}
if svc.enabled && list.Contains(svc.clusterNames, #ClusterName) {
Gateway: "\(svc.name)": #Gateway & {
metadata: name: svc.name
metadata: namespace: #TargetNamespace
spec: selector: istio: "ingressgateway"
spec: servers: [for s in svc.servers {s}]
}
for k, s in svc.servers {
ExternalSecret: "\(s.tls.credentialName)": _
}
}
}

View File

@@ -93,7 +93,14 @@ provisioner get serviceaccount -A --selector=holos.run/job.name=\(NAME) --output
# Create the tokens
mkdir tokens
jq -r '.items[].metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
kubectl get namespaces -o name > namespaces.txt
# Iterate over local namespaces
while IFS= read -r NAMESPACE; do
echo "Getting token for local cluster $NAMESPACE" >&2
jq -r '.items[] | select("namespace/"+.metadata.namespace == "'${NAMESPACE}'") | .metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
done < namespaces.txt
# Create the secrets
mksecret tokens/*.jwt
@@ -124,6 +131,11 @@ kubectl apply --server-side=true -f secrets.yaml
resources: ["secrets"]
verbs: ["*"]
},
{
apiGroups: [""]
resources: ["namespaces"]
verbs: ["list"]
},
]
},
// Bind the Role to the ServiceAccount for the Job.

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#DependsOn: _ESOCreds
#TargetNamespace: "default"
@@ -31,9 +33,11 @@ package holos
}
}
for k, ns in #ManagedNamespaces {
let obj = #SecretStore & {_namespace: ns.name}
SecretStore: "\(ns.name)/\(obj.metadata.name)": obj
for nsName, ns in #ManagedNamespaces {
if list.Contains(ns.clusterNames, #ClusterName) {
let obj = #SecretStore & {_namespace: nsName}
SecretStore: "\(nsName)/\(obj.metadata.name)": obj
}
}
}
}

View File

@@ -11,67 +11,65 @@ let Name = "vault"
let Vault = #OptionalServices[Name]
if Vault.enabled {
if list.Contains(Vault.clusterNames, #ClusterName) {
#HelmChart & {
namespace: #TargetNamespace
chart: {
name: Name
version: "0.25.0"
repository: {
name: "hashicorp"
url: "https://helm.releases.hashicorp.com"
}
}
values: #Values
apiObjects: {
ExternalSecret: "gcpkms-creds": _
ExternalSecret: "vault-server-cert": _
VirtualService: "\(Name)": {
metadata: name: Name
metadata: namespace: #TargetNamespace
spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
spec: gateways: ["istio-ingress/\(Name)"]
spec: http: [
{
route: [
{
destination: host: "\(Name)-active"
destination: port: number: 8200
},
]
},
]
}
if Vault.enabled && list.Contains(Vault.clusterNames, #ClusterName) {
#HelmChart & {
namespace: #TargetNamespace
chart: {
name: Name
version: "0.25.0"
repository: {
name: "hashicorp"
url: "https://helm.releases.hashicorp.com"
}
}
values: #Values
#Kustomize: {
patches: [
{
target: {
group: "apps"
version: "v1"
kind: "StatefulSet"
name: Name
}
patch: yaml.Marshal(EnvPatch)
},
]
apiObjects: {
ExternalSecret: "gcpkms-creds": _
ExternalSecret: "vault-server-cert": _
VirtualService: "\(Name)": {
metadata: name: Name
metadata: namespace: #TargetNamespace
spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
spec: gateways: ["istio-ingress/\(Name)"]
spec: http: [
{
route: [
{
destination: host: "\(Name)-active"
destination: port: number: 8200
},
]
},
]
}
}
}
let EnvPatch = [
#Kustomize: {
patches: [
{
op: "test"
path: "/spec/template/spec/containers/0/env/4/name"
value: "VAULT_ADDR"
},
{
op: "replace"
path: "/spec/template/spec/containers/0/env/4/value"
value: "http://$(VAULT_K8S_POD_NAME):8200"
target: {
group: "apps"
version: "v1"
kind: "StatefulSet"
name: Name
}
patch: yaml.Marshal(EnvPatch)
},
]
}
let EnvPatch = [
{
op: "test"
path: "/spec/template/spec/containers/0/env/4/name"
value: "VAULT_ADDR"
},
{
op: "replace"
path: "/spec/template/spec/containers/0/env/4/value"
value: "http://$(VAULT_K8S_POD_NAME):8200"
},
]
}

View File

@@ -25,11 +25,11 @@ ksObjects: []
}
}
for k, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
for nsName, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns.namespace.metadata}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(ns.name)/\(Name)": obj
"\(Kind)": "\(nsName)/\(Name)": obj
}
}
}

View File

@@ -10,8 +10,8 @@ package holos
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
for nsName, ns in #ManagedNamespaces {
Namespace: "\(nsName)": #Namespace & ns.namespace
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.

View File

@@ -152,7 +152,8 @@ _apiVersion: "holos.run/v1alpha1"
suspend?: bool
targetNamespace?: string
timeout: string | *"3m0s"
wait: bool | *true
// wait performs health checks for all reconciled resources. If set to true, .spec.healthChecks is ignored.
wait: bool | *true
dependsOn: [for k, v in #DependsOn {v}]
}
}
@@ -287,16 +288,21 @@ _apiVersion: "holos.run/v1alpha1"
// ManagedNamespace is a namespace to manage across all clusters in the holos platform.
#ManagedNamespace: {
// TODO metadata labels and annotations
name: string
labels: [string]: string
namespace: {
metadata: {
name: string
labels: [string]: string
}
}
// clusterNames represents the set of clusters the namespace is managed on. Usually all clusters.
clusterNames: [...string]
}
// #ManagedNamepsaces is the union of all namespaces across all cluster types and optional services.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
#ManagedNamespaces: {
[Name=_]: {
name: Name
[Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
}
}
@@ -465,8 +471,18 @@ _apiVersion: "holos.run/v1alpha1"
kind: "Kustomization"
resources: [ResourcesFile]
...
if len(#KustomizePatches) > 0 {
patches: [for v in #KustomizePatches {v}]
}
}
#KustomizePatches: {
[_]: #Patch
}
// #Patch is a kustomize patch
#Patch: kc.#Patch
// #DefaultSecurityContext is the holos default security context to comply with the restricted namespace policy.
// Refer to https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
#DefaultSecurityContext: {
@@ -485,6 +501,10 @@ _apiVersion: "holos.run/v1alpha1"
spec: secretName: metadata.name
}
// #IsPrimaryCluster is true if the cluster being rendered is the primary cluster
// Used by the iam project to determine where https://login.example.com is active.
#IsPrimaryCluster: bool & #ClusterName == #Platform.primaryCluster.name
// By default, render kind: Skipped so holos knows to skip over intermediate cue files.
// This enables the use of holos render ./foo/bar/baz/... when bar contains intermediary constraints which are not complete components.
// Holos skips over these intermediary cue instances.

View File

@@ -0,0 +1,305 @@
## Overview
Use this run book to move the ZITADEL primary postgres database from one cluster to another. At the end of the process the https://zitadel.example.com service will be live on the cluster that was the standby at the start.
Downtime: ~10 minutes if executed quickly. ~60 minutes if executed slowly and deliberately.
## Initial State
1. https://login.ois.run is routed to core2.
2. `postgrescluster/zitadel` on core2 is primary.
3. `postgrescluster/zitadel` on core1 is standby.
4. A recent [[#Full Backup]] has been taken and is stored in S3.
5. The standby cluster has been restored from the recent full backup.
## Process
1. [[#Edit Platform site config]]
2. [[#Render core2]]
3. [[#Demote core2]]
4. [[#Verify core2 is standby]]
5. [[#Render core1]]
6. [[#Promote core1]]
7. [[#Verify core1 is primary]]
8. [[#Reconcile Zitadel]]
9. [[#Cut over NLB]]
10. [[#Test Login]]
## Edit Platform site config
Change the primary cluster from core2 to core1 in `platforms/platform.site.cue`.
```cue
#Platform: primaryCluster: name: "core1"
```
## Render core2
Configure core2 as a standby.
```
holos render --cluster-name=core2 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index c1aa8394..45daa650 100644
--- a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,24 +85,6 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
- dataSource:
- pgbackrest:
- configuration:
- - secret:
- name: pgo-s3-creds
- global:
- repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
- repo2-cipher-type: aes-256-cbc
- options:
- - --type=time
- - --target="2024-03-11 17:10:00+00"
- repo:
- name: repo2
- s3:
- bucket: ois-zitadel-backups
- endpoint: s3.dualstack.us-east-2.amazonaws.com
- region: us-east-2
- stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -139,7 +121,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: false
+ enabled: true
repoName: repo2
users:
- databases
```
## Demote core2
```
git add -p
git commit -m 'Make core2 a standby zitadel cluster'
git push origin HEAD
flux reconcile source git flux-system
flux get ks
```
## Verify core2 is standby
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -2
```
Expected output:
```
2024-03-14 21:58:26,205 INFO: no action. I am (zitadel-pgha1-smw7-0), the standby leader with the lock
```
## Render core1
```
holos render --cluster-name=core1 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index aa69814e..290c0865 100644
--- a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,6 +85,22 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
+ dataSource:
+ pgbackrest:
+ configuration:
+ - secret:
+ name: pgo-s3-creds
+ global:
+ repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
+ repo2-cipher-type: aes-256-cbc
+ options: []
+ repo:
+ name: repo2
+ s3:
+ bucket: ois-zitadel-backups
+ endpoint: s3.dualstack.us-east-2.amazonaws.com
+ region: us-east-2
+ stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -121,7 +137,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: true
+ enabled: false
repoName: repo2
users:
- databases:
```
## Promote core1
```
flux reconcile source git flux-system
flux reconcile ks prod-iam-postgres
```
## Verify core1 is primary
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -1
```
Expected output:
```
2024-03-14 22:05:01,159 INFO: no action. I am (zitadel-pgha1-xl2b-0), the leader with the lock
```
## Reconcile Zitadel
Now that the database is back up, reconcile zitadel.
This is pretty brutal but it works:
```
flux delete ks prod-iam-zitadel
Are you sure you want to delete this kustomization: y
► deleting kustomization prod-iam-zitadel in flux-system namespace
✔ kustomization deleted
```
```
k apply --server-side=true -f deploy/clusters/core1/holos/components/prod-iam-zitadel-kustomization.gen.yaml
kustomization.kustomize.toolkit.fluxcd.io/prod-iam-zitadel serverside-applied
```
ZITADEL should come up READY 2/2
```
k get pods
NAME READY STATUS RESTARTS AGE
zitadel-76776f747f-94wft 2/2 Running 2 (30s ago) 34s
zitadel-76776f747f-fl24c 2/2 Running 1 (30s ago) 34s
zitadel-76776f747f-xx2w7 2/2 Running 2 (30s ago) 34s
zitadel-backup-49c7-h25cj 0/1 Completed 0 3m23s
zitadel-init-td2sh 0/1 Completed 0 34s
zitadel-pgbouncer-d9f8cffc-2vwk5 2/2 Running 0 22m
zitadel-pgbouncer-d9f8cffc-rrrhd 2/2 Running 0 22m
zitadel-pgha1-4npq-0 4/4 Running 0 3m21s
zitadel-pgha1-xl2b-0 4/4 Running 0 3m43s
zitadel-repo-host-0 2/2 Running 0 22m
zitadel-setup-b7zkx 0/1 Completed 1 34s
zitadel-test-connection 0/1 Error 0 34s
```
## Cut over NLB
Finally, cut over the NLB config to move to core1.
This is specific to our metal clusters.
Apply with: `git push puppet +HEAD:production -o run=netlb`
```diff
diff --git a/site-modules/profile/templates/haproxy/haproxy.cfg.epp b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
index b358a70..3ee40db 100644
--- a/site-modules/profile/templates/haproxy/haproxy.cfg.epp
+++ b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
@@ -225,7 +225,7 @@ frontend sni-ingress from tcp-ingress
use_backend core2-https if { req.ssl_sni -i web.holos.run }
use_backend core2-https if { req.ssl_sni -i -m end .web.holos.run }
# Identity provider is routed to one cluster at a time
- use_backend core2-https if { req.ssl_sni -i login.ois.run }
+ use_backend core1-https if { req.ssl_sni -i login.ois.run }
# Holos Dev Environments
use_backend k1-https if { req.ssl_sni -i holos.wtf }
use_backend k1-https if { req.ssl_sni -i -m end .holos.wtf }
@@ -358,7 +358,7 @@ frontend http-ingress from http-ingress
bind 65.102.23.41:80
bind 2602:41:6617:2ec0::80:1d:80 v6only
# Zitadel is active on one cluster at a time
- use_backend core2-http if { hdr(host) -i login.ois.run }
+ use_backend core1-http if { hdr(host) -i login.ois.run }
# Vault is active on core1 or core2
use_backend core2-http if { hdr(host) -i vault.core.ois.run }
# Infrastructure Dex OIDC ID issuer is active on core1 or core2
```
## Test Login
```
kubectl oidc-login get-token \
--oidc-extra-scope=openid \
--oidc-extra-scope=email \
--oidc-extra-scope=profile \
--oidc-extra-scope=groups \
--oidc-extra-scope=offline_access \
--oidc-extra-scope=urn:zitadel:iam:org:domain:primary:openinfrastructure.co \
--oidc-issuer-url=https://login.ois.run \
--oidc-client-id=${CLIENT_ID:-257714027772314751@holos_platform} \
--oidc-use-pkce \
--force-refresh > ~/.kube/token.json
```
```
jq -r .status.token ~/.kube/token.json | cut -d. -f2 | base64 -d | jq
```
```json
{
"amr": [
"mfa",
"user"
],
"at_hash": "5nUq6kXWT8GheCNvDNdyow",
"aud": [
"257714027772314751@holos_platform",
"257713952794870157"
],
"auth_time": 1710172186,
"azp": "257714027772314751@holos_platform",
"client_id": "257714027772314751@holos_platform",
"email": "jeff@openinfrastructure.co",
"email_verified": true,
"exp": 1710497614,
"family_name": "McCune",
"given_name": "Jeff",
"groups": [
"prod-cluster-admin"
],
"iat": 1710454404,
"iss": "https://login.ois.run",
"locale": null,
"name": "Jeff McCune",
"nickname": "Jeff",
"preferred_username": "jeff@openinfrastructure.co",
"sub": "257712562366383231",
"updated_at": 1710105084,
"urn:zitadel:iam:org:domain:primary": "openinfrastructure.co",
"urn:zitadel:iam:org:project:257713952794870157:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
},
"urn:zitadel:iam:org:project:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
}
}
```
## Tasks
### Identify the leader
Same command for primary and standby clusters.
```
k get pods -o name -l postgres-operator.crunchydata.com/role=master
```
### Full Backup
On the primary cluster:
```
kubectl -n prod-iam-zitadel annotate postgrescluster zitadel postgres-operator.crunchydata.com/pgbackrest-backup="$(date)" --overwrite
```
Watch the progress:
```
k -n prod-iam-zitadel logs -l postgres-operator.crunchydata.com/pgbackrest-backup=manual -f
```
Expected output:
```
time="2024-03-14T18:36:16Z" level=info msg="crunchy-pgbackrest starts"
time="2024-03-14T18:36:16Z" level=info msg="debug flag set to false"
time="2024-03-14T18:36:16Z" level=info msg="backrest backup command requested"
time="2024-03-14T18:36:16Z" level=info msg="command to execute is [pgbackrest backup --stanza=db --repo=2 --type=full]"
time="2024-03-14T18:39:11Z" level=info msg="output=[]"
time="2024-03-14T18:39:11Z" level=info msg="stderr=[]"
time="2024-03-14T18:39:11Z" level=info msg="crunchy-pgbackrest ends"
```
## Troubleshooting
### Full backup doesn't start
Check the controller logs. If you see `Switchover failed` errors, try editing the number of postgrescluster replicas down to 1, then back up to 2; a sketch follows the error output below. Then try a backup again.
```
Switchover failed, details: 503, Switchover failed\n" version=5.5.1-0-amd64
```
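A minimal sketch of the replica bounce, assuming the replica count lives at `spec.instances[0].replicas` on this postgrescluster (adjust the index to match your instance set; flux may revert a direct edit on its next reconcile):
```
# Hypothetical example: drop the instance set to 1 replica, then restore it to 2.
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 1}]'
# Wait for the extra pod to terminate, then scale back up.
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 2}]'
```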

View File

@@ -1 +1 @@
57
58

View File

@@ -1 +1 @@
0
1