Compare commits

..

7 Commits

Author SHA1 Message Date
Jeff McCune
fd6fbe5598 (#57) Allow gha-rs scale set to fail on all but one clusters
The effect of this patch is limited to refreshing credentials only for
namespaces that exist in the local cluster.  There is structure in place
in the CUE code to allow for namespaces bound to specific clusters, but
this is used only by the optional Vault component.

This patch was an attempt to work around
https://github.com/actions/actions-runner-controller/issues/3351 by
deploying the runner scale sets into unique namespaces.

This effort was a waste of time, only one listener pod successfully
registered for a given scale set name / group combination.

Because we have only one group named Default we can only have one
listener pod globally for a given scale set name.

Because we want our workflows to execute regardless of the availability
of a single cluster, we're going to let this fail for now.  The pod
retries every 3 seconds.  When a cluster is destroyed, another cluster
will quickly register.

A follow up patch will look to expand this retry behavior.
2024-03-15 12:53:16 -07:00
Jeff McCune
67472e1e1c (#60) Disable flux reconciliation of deployment/zitadel on standby clusters 2024-03-14 21:58:32 -07:00
Jeff McCune
d64c3e8c66 (#58) Zitadel Failover RunBook 2024-03-14 15:25:38 -07:00
Jeff McCune
f344f97374 (#58) Restore last zitadel database backup
When the cluster is provisioned, restore the most recent backup instead
of a fixed point in time.
2024-03-14 11:40:17 -07:00
Jeff McCune
770088b912 (#53) Clean up nested if statements with && 2024-03-13 10:35:20 -07:00
Jeff McCune
cb9b39c3ca (#53) Add Vault as an optional service on the core clusters
This patch migrates the vault component from [holos-infra][1] to a cue
based component.  Vault is optional in the reference platform, so this
patch also defines an `#OptionalServices` struct to conditionally manage
a service across multiple clusters in the platform.

The primary use case for optional services is managing a namespace to
provision and provide secrets across clusters.

[1]: https://github.com/holos-run/holos-infra/tree/v0.5.0/components/core/core/vault
2024-03-12 17:18:38 -07:00
Jeff McCune
0f34b20546 (#54) Disable helm hooks when rendering components
Pods are unnecessarily created when deploying helm based holos
components and often fail.  Prevent these test pods by disabling helm
hooks with the `--no-hooks` flag.

Closes: #54
2024-03-12 14:14:20 -07:00
24 changed files with 2075 additions and 68 deletions

View File

@@ -0,0 +1,46 @@
// Controls optional feature flags for services distributed across multiple holos components.
// For example, enable issuing certificates in the provisioner cluster when an optional service is
// enabled for a workload cluster.
package holos
import "list"
// #OptionalService describes one optional service: whether it is enabled, the
// clusters it is bound to, and the namespaces, gateway servers, and
// certificates that belong to it.
#OptionalService: {
// name is the name of the service.
name: string
// enabled defaults to false, a service must be switched on explicitly.
enabled: true | *false
// clusters selects entries of #Platform.clusters by cluster name.
clusters: [Name=_]: #Platform.clusters[Name]
// clusterNames lists the name of every selected cluster.
clusterNames: [for c in clusters {c.name}]
// managedNamespaces are keyed by namespace name. Each namespace is bound to
// the "provisioner" cluster in addition to the selected clusters.
managedNamespaces: [Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
clusterNames: ["provisioner", for c in clusters {c.name}]
}
// servers represents istio Gateway.spec.servers.hosts entries
// Refer to istio/gateway/gateway.cue
// The key of each entry is used as the port name. Port 443, protocol HTTPS,
// and tls mode SIMPLE are fixed for every entry.
servers: [Name=_]: {
hosts: [...string]
port: name: Name
port: number: 443
port: protocol: "HTTPS"
tls: credentialName: string
tls: mode: "SIMPLE"
}
// public tls certs should align to hosts.
// The key of each entry is used as the certificate's metadata.name.
certs: [Name=_]: #Certificate & {
metadata: name: Name
}
}
// #OptionalServices is the collection of optional services keyed by name.
// The key of each entry is unified into the name field of the service.
#OptionalServices: {
[Name=_]: #OptionalService & {
name: Name
}
}
// Register the namespaces of every enabled optional service in
// #ManagedNamespaces. A namespace is registered only when its clusterNames
// contain #ClusterName.
for svc in #OptionalServices {
for nsName, ns in svc.managedNamespaces {
if svc.enabled && list.Contains(ns.clusterNames, #ClusterName) {
#ManagedNamespaces: "\(nsName)": ns
}
}
}

View File

@@ -0,0 +1,56 @@
package holos
let CoreDomain = "core.\(#Platform.org.domain)"
let TargetNamespace = "prod-core-vault"
// Enable the vault optional service on the core1 and core2 clusters.
#OptionalServices: {
vault: {
enabled: true
clusters: core1: _
clusters: core2: _
// Label the vault namespace with istio-injection: enabled.
managedNamespaces: "prod-core-vault": {
namespace: metadata: labels: "istio-injection": "enabled"
}
// Certificate for vault.<CoreDomain>, which is not tied to one cluster.
certs: "vault-core": #Certificate & {
metadata: name: "vault-core"
metadata: namespace: "istio-ingress"
spec: {
commonName: "vault.\(CoreDomain)"
dnsNames: [commonName]
secretName: metadata.name
issuerRef: kind: "ClusterIssuer"
issuerRef: name: string | *"letsencrypt"
}
}
// Gateway server entry that uses the secret of the vault-core certificate.
servers: "https-vault-core": {
hosts: ["\(TargetNamespace)/vault.\(CoreDomain)"]
tls: credentialName: certs."vault-core".spec.secretName
}
// For each selected cluster add a cluster specific certificate and a
// matching server entry whose hosts are the certificate's dns names.
for k, v in clusters {
let obj = (Cert & {Name: "vault-core", Cluster: v.name}).APIObject
certs: "\(obj.metadata.name)": obj
servers: "https-\(obj.metadata.name)": {
hosts: [for host in obj.spec.dnsNames {"\(TargetNamespace)/\(host)"}]
tls: credentialName: obj.spec.secretName
}
}
}
}
// Cert provisions a cluster specific certificate.
// Name and Cluster are the inputs. APIObject is the resulting #Certificate,
// named "<Cluster>-<Name>", with a common name that defaults to
// vault.<Cluster>.<CoreDomain> and a secret named after the certificate.
let Cert = {
Name: string
Cluster: string
APIObject: #Certificate & {
metadata: name: "\(Cluster)-\(Name)"
metadata: namespace: string | *"istio-ingress"
spec: {
commonName: string | *"vault.\(Cluster).\(CoreDomain)"
dnsNames: [commonName]
secretName: metadata.name
issuerRef: kind: "ClusterIssuer"
issuerRef: name: string | *"letsencrypt"
}
}
}

View File

@@ -13,7 +13,10 @@ let BucketRepoName = "repo2"
// Restore options. Set the timestamp to a known good point in time.
// time="2024-03-11T17:08:58Z" level=info msg="crunchy-pgbackrest ends"
let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// Restore the most recent backup.
let RestoreOptions = []
#KubernetesObjects & {
apiObjects: {
@@ -40,7 +43,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: "10Gi"
}
}]
standby: {
@@ -103,7 +106,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
name: "repo1"
volume: volumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: string | *"4Gi"
}
},
{
@@ -137,7 +140,7 @@ let HighlyAvailable = {
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: "1Gi"
resources: requests: storage: string | *"10Gi"
}
affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: [{
weight: 1

View File

@@ -10,7 +10,8 @@ let Name = "zitadel"
#Kustomization: spec: targetNamespace: #TargetNamespace
#HelmChart & {
namespace: #TargetNamespace
namespace: #TargetNamespace
enableHooks: true
chart: {
name: Name
version: "7.9.0"
@@ -60,43 +61,70 @@ let DatabaseCACertPatch = [
},
]
#Kustomize: {
patches: [
{
let CAPatch = #Patch & {
target: {
group: "apps" | "batch"
version: "v1"
kind: "Job" | "Deployment"
name: string
}
patch: yaml.Marshal(DatabaseCACertPatch)
}
#KustomizePatches: {
mesh: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
}
deploymentCA: CAPatch & {
target: group: "apps"
target: kind: "Deployment"
target: name: Name
}
initJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-init"
}
setupJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-setup"
}
if #IsPrimaryCluster == false {
fluxDisable: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
},
{
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
initDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-init"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
setupDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-setup"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
]
patch: yaml.Marshal(DisableFluxPatch)
}
}
}
let DisableFluxPatch = [{op: "replace", path: "/metadata/annotations/kustomize.toolkit.fluxcd.io~1reconcile", value: "disabled"}]

View File

@@ -4,6 +4,6 @@ package holos
#InputKeys: project: "github"
#DependsOn: Namespaces: name: "prod-secrets-namespaces"
#TargetNamespace: #InputKeys.component
#ARCSystemNamespace: "arc-system"
#HelmChart: namespace: #TargetNamespace
#HelmChart: chart: version: "0.8.3"

View File

@@ -1,5 +1,6 @@
package holos
#TargetNamespace: "arc-runner"
#InputKeys: component: "arc-runner"
#Kustomization: spec: targetNamespace: #TargetNamespace

View File

@@ -1,6 +1,6 @@
package holos
#TargetNamespace: "arc-system"
#TargetNamespace: #ARCSystemNamespace
#InputKeys: component: "arc-system"
#HelmChart & {

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#TargetNamespace: "default"
#InputKeys: {
@@ -7,26 +9,18 @@ package holos
component: "namespaces"
}
// #PlatformNamespaceObjects defines the api objects necessary for eso SecretStores in external clusters to access secrets in a given namespace in the provisioner cluster.
#PlatformNamespaceObjects: {
_ns: #PlatformNamespace
objects: [
#Namespace & {
metadata: _ns
},
]
}
#KubernetesObjects & {
apiObjects: {
for ns in #PlatformNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let NS = ns.name
let Name = obj.metadata.name
"\(Kind)": "\(NS)/\(Name)": obj
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
if list.Contains(ns.clusterNames, #ClusterName) {
Namespace: "\(k)": #Namespace & ns.namespace
}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.
for ns in #PlatformNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
}
}

View File

@@ -1,7 +1,8 @@
package holos
// The primary istio Gateway, named default
import "list"
// The primary istio Gateway, named default
let Name = "gateway"
#InputKeys: component: Name
@@ -31,5 +32,19 @@ let LoginCert = #PlatformCerts.login
},
]
}
for k, svc in #OptionalServices {
if svc.enabled && list.Contains(svc.clusterNames, #ClusterName) {
Gateway: "\(svc.name)": #Gateway & {
metadata: name: svc.name
metadata: namespace: #TargetNamespace
spec: selector: istio: "ingressgateway"
spec: servers: [for s in svc.servers {s}]
}
for k, s in svc.servers {
ExternalSecret: "\(s.tls.credentialName)": _
}
}
}
}
}

View File

@@ -93,7 +93,14 @@ provisioner get serviceaccount -A --selector=holos.run/job.name=\(NAME) --output
# Create the tokens
mkdir tokens
jq -r '.items[].metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
kubectl get namespaces -o name > namespaces.txt
# Iterate over local namespaces
while IFS= read -r NAMESPACE; do
echo "Getting token for local cluster $NAMESPACE" >&2
jq -r '.items[] | select("namespace/"+.metadata.namespace == "'${NAMESPACE}'") | .metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
done < namespaces.txt
# Create the secrets
mksecret tokens/*.jwt
@@ -124,6 +131,11 @@ kubectl apply --server-side=true -f secrets.yaml
resources: ["secrets"]
verbs: ["*"]
},
{
apiGroups: [""]
resources: ["namespaces"]
verbs: ["list"]
},
]
},
// Bind the Role to the ServiceAccount for the Job.

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#DependsOn: _ESOCreds
#TargetNamespace: "default"
@@ -30,5 +32,12 @@ package holos
"\(Kind)": "\(NS)/\(Name)": obj
}
}
for nsName, ns in #ManagedNamespaces {
if list.Contains(ns.clusterNames, #ClusterName) {
let obj = #SecretStore & {_namespace: nsName}
SecretStore: "\(nsName)/\(obj.metadata.name)": obj
}
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,146 @@
package holos
// #Values holds the values for the vault helm chart.
#Values: {
// Vault Helm Chart Holos Values
global: {
enabled: true
// Istio handles this
tlsDisable: true
}
injector: enabled: false
server: {
image: {
// repository: "hashicorp/vault"
repository: "quay.io/holos/hashicorp/vault"
tag: "1.14.10"
// Overrides the default Image Pull Policy
pullPolicy: "IfNotPresent"
}
extraLabels: "sidecar.istio.io/inject": "true"
resources: requests: {
memory: "256Mi"
cpu: "2000m"
}
// limits:
// memory: 1024Mi
// cpu: 2000m
// For HA configuration and because we need to manually init the vault,
// we need to define custom readiness/liveness Probe settings
readinessProbe: {
enabled: true
path: "/v1/sys/health?standbyok=true&sealedcode=204&uninitcode=204"
}
livenessProbe: {
enabled: true
path: "/v1/sys/health?standbyok=true"
initialDelaySeconds: 60
}
// extraEnvironmentVars is a list of extra environment variables to set with
// the stateful set. These could be used to include variables required for
// auto-unseal.
// Vault validates an incomplete chain:
// https://github.com/hashicorp/vault/issues/11318
extraEnvironmentVars: {
// Set to cpu limit, see https://github.com/uber-go/automaxprocs
GOMAXPROCS: "2"
}
// extraVolumes is a list of extra volumes to mount. These will be exposed
// to Vault in the path `/vault/userconfig/<name>/`.
extraVolumes: [{
type: "secret"
name: "gcpkms-creds"
}]
// This configures the Vault Statefulset to create a PVC for audit logs.
// See https://www.vaultproject.io/docs/audit/index.html to know more
auditStorage: {
enabled: true
// for compatibility with plain debian vm location.
mountPath: "/var/log/vault"
}
standalone: {
enabled: false
}
ha: {
enabled: true
replicas: 3
raft: {
enabled: true
setNodeId: true
config: """
ui = true
listener \"tcp\" {
address = \"[::]:8200\"
cluster_address = \"[::]:8201\"
# mTLS is handled by the the istio sidecar
tls_disable = \"true\"
# Enable unauthenticated metrics access (necessary for Prometheus Operator)
telemetry {
unauthenticated_metrics_access = true
}
}
telemetry {
prometheus_retention_time = \"30s\"
disable_hostname = true
}
seal \"gcpckms\" {
credentials = \"/vault/userconfig/gcpkms-creds/credentials.json\"
project = \"v6-vault-f15f\"
region = \"us-west1\"
key_ring = \"vault-core\"
crypto_key = \"vault-core-unseal\"
}
# Note; the retry_join leader_api_address values come from the Stable
# Network ID feature of a Statefulset. See:
# https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#stable-network-id
storage \"raft\" {
path = \"/vault/data\"
retry_join {
leader_api_addr = \"http://vault-0.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
retry_join {
leader_api_addr = \"http://vault-1.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
retry_join {
leader_api_addr = \"http://vault-2.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
autopilot {
cleanup_dead_servers = \"true\"
last_contact_threshold = \"200ms\"
last_contact_failure_threshold = \"10m\"
max_trailing_logs = 250000
min_quorum = 3
server_stabilization_time = \"10s\"
}
}
service_registration \"kubernetes\" {}
"""
}
}
}
// Vault UI (Will be exposed via the service mesh)
ui: {
enabled: true
serviceType: "ClusterIP"
serviceNodePort: null
externalPort: 8200
}
}

View File

@@ -0,0 +1,75 @@
package holos
import "encoding/yaml"
import "list"
let Name = "vault"
#InputKeys: component: Name
#InputKeys: project: "core"
#TargetNamespace: "\(#InstancePrefix)-\(Name)"
let Vault = #OptionalServices[Name]
// Render the vault helm chart only when the vault optional service is enabled
// and its clusterNames contain #ClusterName.
if Vault.enabled && list.Contains(Vault.clusterNames, #ClusterName) {
#HelmChart & {
namespace: #TargetNamespace
chart: {
name: Name
version: "0.25.0"
repository: {
name: "hashicorp"
url: "https://helm.releases.hashicorp.com"
}
}
values: #Values
apiObjects: {
ExternalSecret: "gcpkms-creds": _
ExternalSecret: "vault-server-cert": _
// Route the common name of every vault certificate from the vault Gateway
// in istio-ingress to port 8200 of the "<Name>-active" host.
VirtualService: "\(Name)": {
metadata: name: Name
metadata: namespace: #TargetNamespace
spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
spec: gateways: ["istio-ingress/\(Name)"]
spec: http: [
{
route: [
{
destination: host: "\(Name)-active"
destination: port: number: 8200
},
]
},
]
}
}
}
// Apply EnvPatch to the StatefulSet named after the component.
#Kustomize: {
patches: [
{
target: {
group: "apps"
version: "v1"
kind: "StatefulSet"
name: Name
}
patch: yaml.Marshal(EnvPatch)
},
]
}
// EnvPatch replaces the value of the VAULT_ADDR environment variable of the
// first container. The test operation guards the replace so the patch does
// not apply if the entry at env index 4 is not named VAULT_ADDR.
let EnvPatch = [
{
op: "test"
path: "/spec/template/spec/containers/0/env/4/name"
value: "VAULT_ADDR"
},
{
op: "replace"
path: "/spec/template/spec/containers/0/env/4/value"
value: "http://$(VAULT_K8S_POD_NAME):8200"
},
]
}

View File

@@ -0,0 +1,13 @@
package holos
// When the vault optional service is enabled, render each of its certificates
// as an api object keyed by kind and then by metadata.name.
let Vault = #OptionalServices.vault
if Vault.enabled {
#KubernetesObjects & {
apiObjects: {
for k, obj in Vault.certs {
"\(obj.kind)": "\(obj.metadata.name)": obj
}
}
}
}

View File

@@ -24,6 +24,14 @@ ksObjects: []
"\(Kind)": "\(ns.name)/\(Name)": obj
}
}
for nsName, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns.namespace.metadata}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(nsName)/\(Name)": obj
}
}
}
}

View File

@@ -7,25 +7,16 @@ package holos
component: "namespaces"
}
// #PlatformNamespaceObjects defines the api objects necessary for eso SecretStores in external clusters to access secrets in a given namespace in the provisioner cluster.
#PlatformNamespaceObjects: {
_ns: #PlatformNamespace
objects: [
#Namespace & {
metadata: name: _ns.name
},
]
}
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for nsName, ns in #ManagedNamespaces {
Namespace: "\(nsName)": #Namespace & ns.namespace
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.
for ns in #PlatformNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(Name)": obj
}
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
}
}

View File

@@ -12,7 +12,7 @@ let Privileged = {
// #PlatformNamespaces is the union of all namespaces across all cluster types. Namespaces are created in all clusters regardless of if they're
// used within the cluster or not. This is important for security and consistency with IAM, RBAC, and Secrets sync between clusters.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
// TODO: Deprecate in favor of #ManagedNamespaces because it is better to add fields to an object instead of adding items to a list.
#PlatformNamespaces: [
{name: "external-secrets"},
{name: "holos-system"},

View File

@@ -152,7 +152,8 @@ _apiVersion: "holos.run/v1alpha1"
suspend?: bool
targetNamespace?: string
timeout: string | *"3m0s"
wait: bool | *true
// wait performs health checks for all reconciled resources. If set to true, .spec.healthChecks is ignored.
wait: bool | *true
dependsOn: [for k, v in #DependsOn {v}]
}
}
@@ -285,6 +286,26 @@ _apiVersion: "holos.run/v1alpha1"
}
}
// ManagedNamespace is a namespace to manage across all clusters in the holos platform.
#ManagedNamespace: {
namespace: {
metadata: {
name: string
labels: [string]: string
}
}
// clusterNames represents the set of clusters the namespace is managed on. Usually all clusters.
clusterNames: [...string]
}
// #ManagedNamespaces is the union of all namespaces across all cluster types and optional services.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
// The key of each entry is unified into the namespace's metadata.name.
#ManagedNamespaces: {
[Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
}
}
// #Backups defines backup configuration.
// TODO: Consider the best place for this, possibly as part of the site platform config. This represents the primary location for backups.
#Backups: {
@@ -393,6 +414,8 @@ _apiVersion: "holos.run/v1alpha1"
resourcesFile: ResourcesFile
// kustomizeFiles represents the files in a kustomize directory tree.
kustomizeFiles: #KustomizeFiles.Files
// enableHooks removes the --no-hooks flag from helm template
enableHooks: true | *false
}
// #KustomizeBuild is a holos component that uses plain yaml files as the source of api objects for a holos component.
@@ -448,8 +471,18 @@ _apiVersion: "holos.run/v1alpha1"
kind: "Kustomization"
resources: [ResourcesFile]
...
if len(#KustomizePatches) > 0 {
patches: [for v in #KustomizePatches {v}]
}
}
#KustomizePatches: {
[_]: #Patch
}
// #Patch is a kustomize patch
#Patch: kc.#Patch
// #DefaultSecurityContext is the holos default security context to comply with the restricted namespace policy.
// Refer to https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
#DefaultSecurityContext: {
@@ -468,6 +501,10 @@ _apiVersion: "holos.run/v1alpha1"
spec: secretName: metadata.name
}
// #IsPrimaryCluster is true if the cluster being rendered is the primary cluster
// Used by the iam project to determine where https://login.example.com is active.
#IsPrimaryCluster: bool & #ClusterName == #Platform.primaryCluster.name
// By default, render kind: Skipped so holos knows to skip over intermediate cue files.
// This enables the use of holos render ./foo/bar/baz/... when bar contains intermediary constraints which are not complete components.
// Holos skips over these intermediary cue instances.

View File

@@ -0,0 +1,305 @@
## Overview
Use this runbook to move the ZITADEL primary postgres database from one cluster to another. At the end of the process the https://zitadel.example.com service will be live on the cluster that was the standby at the start of the process.
Downtime: ~10 minutes if executed quickly. ~60 minutes if executed slowly and deliberately.
## Initial State
1. https://login.ois.run is routed to core2.
2. `postgrescluster/zitadel` on core2 is primary.
3. `postgrescluster/zitadel` on core1 is standby.
4. A recent [[#Full Backup]] has been taken and is stored in S3.
5. The standby cluster has been restored from the recent full backup.
## Process
1. [[#Edit Platform site config]]
2. [[#Render core2]]
3. [[#Demote core2]]
4. [[#Verify core2 is standby]]
5. [[#Render core1]]
6. [[#Promote core1]]
7. [[#Verify core1 is primary]]
8. [[#Reconcile Zitadel]]
9. [[#Cut over NLB]]
10. [[#Test Login]]
## Edit Platform site config
Change the primary cluster from core2 to core1 in `platforms/platform.site.cue`.
```cue
#Platform: primaryCluster: name: "core1"
```
## Render core2
Configure core2 as a standby.
```
holos render --cluster-name=core2 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index c1aa8394..45daa650 100644
--- a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,24 +85,6 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
- dataSource:
- pgbackrest:
- configuration:
- - secret:
- name: pgo-s3-creds
- global:
- repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
- repo2-cipher-type: aes-256-cbc
- options:
- - --type=time
- - --target="2024-03-11 17:10:00+00"
- repo:
- name: repo2
- s3:
- bucket: ois-zitadel-backups
- endpoint: s3.dualstack.us-east-2.amazonaws.com
- region: us-east-2
- stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -139,7 +121,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: false
+ enabled: true
repoName: repo2
users:
- databases
```
## Demote core2
```
git add -p
git commit -m 'Make core2 a standby zitadel cluster'
git push origin HEAD
flux reconcile source git flux-system
flux get ks
```
## Verify core2 is standby
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -2
```
Expected output:
```
2024-03-14 21:58:26,205 INFO: no action. I am (zitadel-pgha1-smw7-0), the standby leader with the lock
```
## Render core1
```
holos render --cluster-name=core1 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index aa69814e..290c0865 100644
--- a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,6 +85,22 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
+ dataSource:
+ pgbackrest:
+ configuration:
+ - secret:
+ name: pgo-s3-creds
+ global:
+ repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
+ repo2-cipher-type: aes-256-cbc
+ options: []
+ repo:
+ name: repo2
+ s3:
+ bucket: ois-zitadel-backups
+ endpoint: s3.dualstack.us-east-2.amazonaws.com
+ region: us-east-2
+ stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -121,7 +137,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: true
+ enabled: false
repoName: repo2
users:
- databases:
```
## Promote core1
```
flux reconcile source git flux-system
flux reconcile ks prod-iam-postgres
```
## Verify core1 is primary
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -1
```
Expected output:
```
2024-03-14 22:05:01,159 INFO: no action. I am (zitadel-pgha1-xl2b-0), the leader with the lock
```
## Reconcile Zitadel
Now that the database is back up, reconcile zitadel.
This is pretty brutal but it works:
```
flux delete ks prod-iam-zitadel
Are you sure you want to delete this kustomization: y
► deleting kustomization prod-iam-zitadel in flux-system namespace
✔ kustomization deleted
```
```
k apply --server-side=true -f deploy/clusters/core1/holos/components/prod-iam-zitadel-kustomization.gen.yaml
kustomization.kustomize.toolkit.fluxcd.io/prod-iam-zitadel serverside-applied
```
ZITADEL should come up READY 2/2
```
k get pods
NAME READY STATUS RESTARTS AGE
zitadel-76776f747f-94wft 2/2 Running 2 (30s ago) 34s
zitadel-76776f747f-fl24c 2/2 Running 1 (30s ago) 34s
zitadel-76776f747f-xx2w7 2/2 Running 2 (30s ago) 34s
zitadel-backup-49c7-h25cj 0/1 Completed 0 3m23s
zitadel-init-td2sh 0/1 Completed 0 34s
zitadel-pgbouncer-d9f8cffc-2vwk5 2/2 Running 0 22m
zitadel-pgbouncer-d9f8cffc-rrrhd 2/2 Running 0 22m
zitadel-pgha1-4npq-0 4/4 Running 0 3m21s
zitadel-pgha1-xl2b-0 4/4 Running 0 3m43s
zitadel-repo-host-0 2/2 Running 0 22m
zitadel-setup-b7zkx 0/1 Completed 1 34s
zitadel-test-connection 0/1 Error 0 34s
```
## Cut over NLB
Finally, cut over the NLB config to move to core1.
This is specific to our metal clusters.
Apply with: `git push puppet +HEAD:production -o run=netlb`
```diff
diff --git a/site-modules/profile/templates/haproxy/haproxy.cfg.epp b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
index b358a70..3ee40db 100644
--- a/site-modules/profile/templates/haproxy/haproxy.cfg.epp
+++ b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
@@ -225,7 +225,7 @@ frontend sni-ingress from tcp-ingress
use_backend core2-https if { req.ssl_sni -i web.holos.run }
use_backend core2-https if { req.ssl_sni -i -m end .web.holos.run }
# Identity provider is routed to one cluster at a time
- use_backend core2-https if { req.ssl_sni -i login.ois.run }
+ use_backend core1-https if { req.ssl_sni -i login.ois.run }
# Holos Dev Environments
use_backend k1-https if { req.ssl_sni -i holos.wtf }
use_backend k1-https if { req.ssl_sni -i -m end .holos.wtf }
@@ -358,7 +358,7 @@ frontend http-ingress from http-ingress
bind 65.102.23.41:80
bind 2602:41:6617:2ec0::80:1d:80 v6only
# Zitadel is active on one cluster at a time
- use_backend core2-http if { hdr(host) -i login.ois.run }
+ use_backend core1-http if { hdr(host) -i login.ois.run }
# Vault is active on core1 or core2
use_backend core2-http if { hdr(host) -i vault.core.ois.run }
# Infrastructure Dex OIDC ID issuer is active on core1 or core2
```
## Test Login
```
kubectl oidc-login get-token \
--oidc-extra-scope=openid \
--oidc-extra-scope=email \
--oidc-extra-scope=profile \
--oidc-extra-scope=groups \
--oidc-extra-scope=offline_access \
--oidc-extra-scope=urn:zitadel:iam:org:domain:primary:openinfrastructure.co \
--oidc-issuer-url=https://login.ois.run \
--oidc-client-id=${CLIENT_ID:-257714027772314751@holos_platform} \
--oidc-use-pkce \
--force-refresh > ~/.kube/token.json
```
```
jq -r .status.token ~/.kube/token.json | cut -d. -f2 | base64 -d | jq
```
```json
{
"amr": [
"mfa",
"user"
],
"at_hash": "5nUq6kXWT8GheCNvDNdyow",
"aud": [
"257714027772314751@holos_platform",
"257713952794870157"
],
"auth_time": 1710172186,
"azp": "257714027772314751@holos_platform",
"client_id": "257714027772314751@holos_platform",
"email": "jeff@openinfrastructure.co",
"email_verified": true,
"exp": 1710497614,
"family_name": "McCune",
"given_name": "Jeff",
"groups": [
"prod-cluster-admin"
],
"iat": 1710454404,
"iss": "https://login.ois.run",
"locale": null,
"name": "Jeff McCune",
"nickname": "Jeff",
"preferred_username": "jeff@openinfrastructure.co",
"sub": "257712562366383231",
"updated_at": 1710105084,
"urn:zitadel:iam:org:domain:primary": "openinfrastructure.co",
"urn:zitadel:iam:org:project:257713952794870157:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
},
"urn:zitadel:iam:org:project:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
}
}
```
## Tasks
### Identify the leader
Same command for primary and standby clusters.
```
k get pods -o name -l postgres-operator.crunchydata.com/role=master
```
### Full Backup
On the primary cluster:
```
kubectl -n prod-iam-zitadel annotate postgrescluster zitadel postgres-operator.crunchydata.com/pgbackrest-backup="$(date)" --overwrite
```
Watch the progress:
```
k -n prod-iam-zitadel logs -l postgres-operator.crunchydata.com/pgbackrest-backup=manual -f
```
Expected output:
```
time="2024-03-14T18:36:16Z" level=info msg="crunchy-pgbackrest starts"
time="2024-03-14T18:36:16Z" level=info msg="debug flag set to false"
time="2024-03-14T18:36:16Z" level=info msg="backrest backup command requested"
time="2024-03-14T18:36:16Z" level=info msg="command to execute is [pgbackrest backup --stanza=db --repo=2 --type=full]"
time="2024-03-14T18:39:11Z" level=info msg="output=[]"
time="2024-03-14T18:39:11Z" level=info msg="stderr=[]"
time="2024-03-14T18:39:11Z" level=info msg="crunchy-pgbackrest ends"
```
## Troubleshooting
### Full backup doesn't start
Check the controller logs. If you see `Switchover failed` errors, try editing the number of postgrescluster replicas down to 1 then back up to 2. Then try a backup again.
```
Switchover failed, details: 503, Switchover failed\n" version=5.5.1-0-amd64
```

View File

@@ -121,6 +121,7 @@ type HelmChart struct {
Namespace string `json:"namespace"`
Chart Chart `json:"chart"`
ValuesContent string `json:"valuesContent"`
EnableHooks bool `json:"enableHooks"`
// APIObjectMap holds the marshalled representation of api objects.
APIObjectMap apiObjectMap `json:"APIObjectMap"`
}
@@ -445,7 +446,12 @@ func runHelm(ctx context.Context, hc *HelmChart, r *Result, path holos.PathCompo
// Run charts
chart := hc.Chart
helmOut, err := util.RunCmd(ctx, "helm", "template", "--include-crds", "--values", valuesPath, "--namespace", hc.Namespace, "--kubeconfig", "/dev/null", "--version", chart.Version, chart.Release, cachedChartPath)
args := []string{"template"}
if !hc.EnableHooks {
args = append(args, "--no-hooks")
}
args = append(args, "--include-crds", "--values", valuesPath, "--namespace", hc.Namespace, "--kubeconfig", "/dev/null", "--version", chart.Version, chart.Release, cachedChartPath)
helmOut, err := util.RunCmd(ctx, "helm", args...)
if err != nil {
stderr := helmOut.Stderr.String()
lines := strings.Split(stderr, "\n")

View File

@@ -1 +1 @@
56
58

View File

@@ -1 +1 @@
0
1