Compare commits

...

5 Commits

Author SHA1 Message Date
Jeff McCune
fd6fbe5598 (#57) Allow gha-rs scale set to fail on all but one cluster
The effect of this patch is limited to refreshing credentials only for
namespaces that exist in the local cluster.  There is structure in place
in the CUE code to allow for namespaces bound to specific clusters, but
this is used only by the optional Vault component.
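For illustration, a minimal sketch of that structure as used by the Vault service (field names mirror the diffs below; the values are only an example):

```cue
// Sketch only: an optional service bound to specific clusters.
// Its managed namespace is created on the listed clusters plus the provisioner.
#OptionalServices: vault: {
	enabled: true
	clusters: core1: _
	clusters: core2: _
	managedNamespaces: "prod-core-vault": {
		namespace: metadata: labels: "istio-injection": "enabled"
	}
}
```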

This patch was an attempt to work around
https://github.com/actions/actions-runner-controller/issues/3351 by
deploying the runner scale sets into unique namespaces.

This effort was a waste of time: only one listener pod successfully
registered for a given scale set name / group combination.

Because we have only one group named Default, we can have only one
listener pod globally for a given scale set name.

Because we want our workflows to execute regardless of the availability
of a single cluster, we're going to let this fail for now.  The pod
retries every 3 seconds.  When a cluster is destroyed, another cluster
will quickly register.

A follow-up patch will expand this retry behavior.
2024-03-15 12:53:16 -07:00
Jeff McCune
67472e1e1c (#60) Disable flux reconciliation of deployment/zitadel on standby clusters 2024-03-14 21:58:32 -07:00
Jeff McCune
d64c3e8c66 (#58) Zitadel Failover RunBook 2024-03-14 15:25:38 -07:00
Jeff McCune
f344f97374 (#58) Restore last zitadel database backup
When the cluster is provisioned, restore the most recent backup instead
of a fixed point in time.
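In CUE terms the change reduces to dropping the point-in-time flags from the pgbackrest restore options (see the prod-iam-postgres diff below):

```cue
// Before: restore to a fixed point in time.
// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// After: an empty options list restores the most recent backup.
let RestoreOptions = []
```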
2024-03-14 11:40:17 -07:00
Jeff McCune
770088b912 (#53) Clean up nested if statements with && 2024-03-13 10:35:20 -07:00
18 changed files with 499 additions and 118 deletions

View File

@@ -1,16 +1,19 @@
package holos
// Controls optional feature flags for services distributed across multiple holos components.
// For example, enable issuing certificates in the provisioner cluster when an optional service is
// enabled for a workload cluster.
package holos
import "list"
#OptionalService: {
name: string
enabled: true | *false
clusters: [Name=_]: #Platform.clusters[Name]
clusterNames: [for k, v in clusters {k}]
namespaces: [Name=_]: #ManagedNamespace & {
name: Name
clusterNames: [for c in clusters {c.name}]
managedNamespaces: [Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
clusterNames: ["provisioner", for c in clusters {c.name}]
}
// servers represents istio Gateway.spec.servers.hosts entries
// Refer to istio/gateway/gateway.cue
@@ -34,6 +37,10 @@ package holos
}
}
for k, v in #OptionalServices {
#ManagedNamespaces: v.namespaces
for svc in #OptionalServices {
for nsName, ns in svc.managedNamespaces {
if svc.enabled && list.Contains(ns.clusterNames, #ClusterName) {
#ManagedNamespaces: "\(nsName)": ns
}
}
}

View File

@@ -8,7 +8,9 @@ let TargetNamespace = "prod-core-vault"
enabled: true
clusters: core1: _
clusters: core2: _
namespaces: "prod-core-vault": labels: "istio-injection": "enabled"
managedNamespaces: "prod-core-vault": {
namespace: metadata: labels: "istio-injection": "enabled"
}
certs: "vault-core": #Certificate & {
metadata: name: "vault-core"
metadata: namespace: "istio-ingress"

View File

@@ -13,7 +13,10 @@ let BucketRepoName = "repo2"
// Restore options. Set the timestamp to a known good point in time.
// time="2024-03-11T17:08:58Z" level=info msg="crunchy-pgbackrest ends"
let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// Restore the most recent backup.
let RestoreOptions = []
#KubernetesObjects & {
apiObjects: {
@@ -40,7 +43,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: "10Gi"
}
}]
standby: {
@@ -103,7 +106,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
name: "repo1"
volume: volumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: string | *"4Gi"
}
},
{
@@ -137,7 +140,7 @@ let HighlyAvailable = {
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: "1Gi"
resources: requests: storage: string | *"10Gi"
}
affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: [{
weight: 1

View File

@@ -61,43 +61,70 @@ let DatabaseCACertPatch = [
},
]
#Kustomize: {
patches: [
{
let CAPatch = #Patch & {
target: {
group: "apps" | "batch"
version: "v1"
kind: "Job" | "Deployment"
name: string
}
patch: yaml.Marshal(DatabaseCACertPatch)
}
#KustomizePatches: {
mesh: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
}
deploymentCA: CAPatch & {
target: group: "apps"
target: kind: "Deployment"
target: name: Name
}
initJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-init"
}
setupJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-setup"
}
if #IsPrimaryCluster == false {
fluxDisable: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
},
{
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
initDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-init"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
setupDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-setup"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
]
patch: yaml.Marshal(DisableFluxPatch)
}
}
}
let DisableFluxPatch = [{op: "replace", path: "/metadata/annotations/kustomize.toolkit.fluxcd.io~1reconcile", value: "disabled"}]

View File

@@ -4,6 +4,6 @@ package holos
#InputKeys: project: "github"
#DependsOn: Namespaces: name: "prod-secrets-namespaces"
#TargetNamespace: #InputKeys.component
#ARCSystemNamespace: "arc-system"
#HelmChart: namespace: #TargetNamespace
#HelmChart: chart: version: "0.8.3"

View File

@@ -1,5 +1,6 @@
package holos
#TargetNamespace: "arc-runner"
#InputKeys: component: "arc-runner"
#Kustomization: spec: targetNamespace: #TargetNamespace

View File

@@ -1,6 +1,6 @@
package holos
#TargetNamespace: "arc-system"
#TargetNamespace: #ARCSystemNamespace
#InputKeys: component: "arc-system"
#HelmChart & {

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#TargetNamespace: "default"
#InputKeys: {
@@ -11,7 +13,9 @@ package holos
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
if list.Contains(ns.clusterNames, #ClusterName) {
Namespace: "\(k)": #Namespace & ns.namespace
}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.

View File

@@ -34,17 +34,15 @@ let LoginCert = #PlatformCerts.login
}
for k, svc in #OptionalServices {
if svc.enabled {
if list.Contains(svc.clusterNames, #ClusterName) {
Gateway: "\(svc.name)": #Gateway & {
metadata: name: svc.name
metadata: namespace: #TargetNamespace
spec: selector: istio: "ingressgateway"
spec: servers: [for s in svc.servers {s}]
}
for k, s in svc.servers {
ExternalSecret: "\(s.tls.credentialName)": _
}
if svc.enabled && list.Contains(svc.clusterNames, #ClusterName) {
Gateway: "\(svc.name)": #Gateway & {
metadata: name: svc.name
metadata: namespace: #TargetNamespace
spec: selector: istio: "ingressgateway"
spec: servers: [for s in svc.servers {s}]
}
for k, s in svc.servers {
ExternalSecret: "\(s.tls.credentialName)": _
}
}
}

View File

@@ -93,7 +93,14 @@ provisioner get serviceaccount -A --selector=holos.run/job.name=\(NAME) --output
# Create the tokens
mkdir tokens
jq -r '.items[].metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
kubectl get namespaces -o name > namespaces.txt
# Iterate over local namespaces
while IFS= read -r NAMESPACE; do
echo "Getting token for local cluster $NAMESPACE" >&2
jq -r '.items[] | select("namespace/"+.metadata.namespace == "'${NAMESPACE}'") | .metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
done < namespaces.txt
# Create the secrets
mksecret tokens/*.jwt
@@ -124,6 +131,11 @@ kubectl apply --server-side=true -f secrets.yaml
resources: ["secrets"]
verbs: ["*"]
},
{
apiGroups: [""]
resources: ["namespaces"]
verbs: ["list"]
},
]
},
// Bind the Role to the ServiceAccount for the Job.

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#DependsOn: _ESOCreds
#TargetNamespace: "default"
@@ -31,9 +33,11 @@ package holos
}
}
for k, ns in #ManagedNamespaces {
let obj = #SecretStore & {_namespace: ns.name}
SecretStore: "\(ns.name)/\(obj.metadata.name)": obj
for nsName, ns in #ManagedNamespaces {
if list.Contains(ns.clusterNames, #ClusterName) {
let obj = #SecretStore & {_namespace: nsName}
SecretStore: "\(nsName)/\(obj.metadata.name)": obj
}
}
}
}

View File

@@ -11,67 +11,65 @@ let Name = "vault"
let Vault = #OptionalServices[Name]
if Vault.enabled {
if list.Contains(Vault.clusterNames, #ClusterName) {
#HelmChart & {
namespace: #TargetNamespace
chart: {
name: Name
version: "0.25.0"
repository: {
name: "hashicorp"
url: "https://helm.releases.hashicorp.com"
}
}
values: #Values
apiObjects: {
ExternalSecret: "gcpkms-creds": _
ExternalSecret: "vault-server-cert": _
VirtualService: "\(Name)": {
metadata: name: Name
metadata: namespace: #TargetNamespace
spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
spec: gateways: ["istio-ingress/\(Name)"]
spec: http: [
{
route: [
{
destination: host: "\(Name)-active"
destination: port: number: 8200
},
]
},
]
}
if Vault.enabled && list.Contains(Vault.clusterNames, #ClusterName) {
#HelmChart & {
namespace: #TargetNamespace
chart: {
name: Name
version: "0.25.0"
repository: {
name: "hashicorp"
url: "https://helm.releases.hashicorp.com"
}
}
values: #Values
#Kustomize: {
patches: [
{
target: {
group: "apps"
version: "v1"
kind: "StatefulSet"
name: Name
}
patch: yaml.Marshal(EnvPatch)
},
]
apiObjects: {
ExternalSecret: "gcpkms-creds": _
ExternalSecret: "vault-server-cert": _
VirtualService: "\(Name)": {
metadata: name: Name
metadata: namespace: #TargetNamespace
spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
spec: gateways: ["istio-ingress/\(Name)"]
spec: http: [
{
route: [
{
destination: host: "\(Name)-active"
destination: port: number: 8200
},
]
},
]
}
}
}
let EnvPatch = [
#Kustomize: {
patches: [
{
op: "test"
path: "/spec/template/spec/containers/0/env/4/name"
value: "VAULT_ADDR"
},
{
op: "replace"
path: "/spec/template/spec/containers/0/env/4/value"
value: "http://$(VAULT_K8S_POD_NAME):8200"
target: {
group: "apps"
version: "v1"
kind: "StatefulSet"
name: Name
}
patch: yaml.Marshal(EnvPatch)
},
]
}
let EnvPatch = [
{
op: "test"
path: "/spec/template/spec/containers/0/env/4/name"
value: "VAULT_ADDR"
},
{
op: "replace"
path: "/spec/template/spec/containers/0/env/4/value"
value: "http://$(VAULT_K8S_POD_NAME):8200"
},
]
}

View File

@@ -25,11 +25,11 @@ ksObjects: []
}
}
for k, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
for nsName, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns.namespace.metadata}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(ns.name)/\(Name)": obj
"\(Kind)": "\(nsName)/\(Name)": obj
}
}
}

View File

@@ -10,8 +10,8 @@ package holos
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
for nsName, ns in #ManagedNamespaces {
Namespace: "\(nsName)": #Namespace & ns.namespace
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.

View File

@@ -152,7 +152,8 @@ _apiVersion: "holos.run/v1alpha1"
suspend?: bool
targetNamespace?: string
timeout: string | *"3m0s"
wait: bool | *true
// wait performs health checks for all reconciled resources. If set to true, .spec.healthChecks is ignored.
wait: bool | *true
dependsOn: [for k, v in #DependsOn {v}]
}
}
@@ -287,16 +288,21 @@ _apiVersion: "holos.run/v1alpha1"
// ManagedNamespace is a namespace to manage across all clusters in the holos platform.
#ManagedNamespace: {
// TODO metadata labels and annotations
name: string
labels: [string]: string
namespace: {
metadata: {
name: string
labels: [string]: string
}
}
// clusterNames represents the set of clusters the namespace is managed on. Usually all clusters.
clusterNames: [...string]
}
// #ManagedNamepsaces is the union of all namespaces across all cluster types and optional services.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
#ManagedNamespaces: {
[Name=_]: {
name: Name
[Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
}
}
@@ -465,8 +471,18 @@ _apiVersion: "holos.run/v1alpha1"
kind: "Kustomization"
resources: [ResourcesFile]
...
if len(#KustomizePatches) > 0 {
patches: [for v in #KustomizePatches {v}]
}
}
#KustomizePatches: {
[_]: #Patch
}
// #Patch is a kustomize patch
#Patch: kc.#Patch
// #DefaultSecurityContext is the holos default security context to comply with the restricted namespace policy.
// Refer to https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
#DefaultSecurityContext: {
@@ -485,6 +501,10 @@ _apiVersion: "holos.run/v1alpha1"
spec: secretName: metadata.name
}
// #IsPrimaryCluster is true if the cluster being rendered is the primary cluster
// Used by the iam project to determine where https://login.example.com is active.
#IsPrimaryCluster: bool & #ClusterName == #Platform.primaryCluster.name
// By default, render kind: Skipped so holos knows to skip over intermediate cue files.
// This enables the use of holos render ./foo/bar/baz/... when bar contains intermediary constraints which are not complete components.
// Holos skips over these intermediary cue instances.

View File

@@ -0,0 +1,305 @@
## Overview
Use this run book to move the ZITADEL primary postgres database from one cluster to another. At the end of the process the https://zitadel.example.com service will be live on the cluster that was the standby at the start.
Downtime: ~10 minutes if executed quickly. ~60 minutes if executed slowly and deliberately.
## Initial State
1. https://login.ois.run is routed to core2.
2. `postgrescluster/zitadel` on core2 is primary.
3. `postgrescluster/zitadel` on core1 is standby.
4. A recent [[#Full Backup]] has been taken and is stored in S3.
5. The standby cluster has been restored from the recent full backup.
## Process
1. [[#Edit Platform site config]]
2. [[#Render core2]]
3. [[#Demote core2]]
4. [[#Verify core2 is standby]]
5. [[#Render core1]]
6. [[#Promote core1]]
7. [[#Verify core1 is primary]]
8. [[#Reconcile Zitadel]]
9. [[#Cut over NLB]]
10. [[#Test Login]]
## Edit Platform site config
Change the primary cluster from core2 to core1 in `platforms/platform.site.cue`.
```cue
#Platform: primaryCluster: name: "core1"
```
## Render core2
Configure core2 as a standby.
```
holos render --cluster-name=core2 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index c1aa8394..45daa650 100644
--- a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,24 +85,6 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
- dataSource:
- pgbackrest:
- configuration:
- - secret:
- name: pgo-s3-creds
- global:
- repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
- repo2-cipher-type: aes-256-cbc
- options:
- - --type=time
- - --target="2024-03-11 17:10:00+00"
- repo:
- name: repo2
- s3:
- bucket: ois-zitadel-backups
- endpoint: s3.dualstack.us-east-2.amazonaws.com
- region: us-east-2
- stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -139,7 +121,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: false
+ enabled: true
repoName: repo2
users:
- databases
```
## Demote core2
```
git add -p
git commit -m 'Make core2 a standby zitadel cluster'
git push origin HEAD
flux reconcile source git flux-system
flux get ks
```
## Verify core2 is standby
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -2
```
Expected output:
```
2024-03-14 21:58:26,205 INFO: no action. I am (zitadel-pgha1-smw7-0), the standby leader with the lock
```
## Render core1
```
holos render --cluster-name=core1 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index aa69814e..290c0865 100644
--- a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,6 +85,22 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
+ dataSource:
+ pgbackrest:
+ configuration:
+ - secret:
+ name: pgo-s3-creds
+ global:
+ repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
+ repo2-cipher-type: aes-256-cbc
+ options: []
+ repo:
+ name: repo2
+ s3:
+ bucket: ois-zitadel-backups
+ endpoint: s3.dualstack.us-east-2.amazonaws.com
+ region: us-east-2
+ stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -121,7 +137,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: true
+ enabled: false
repoName: repo2
users:
- databases:
```
## Promote core1
```
flux reconcile source git flux-system
flux reconcile ks prod-iam-postgres
```
## Verify core1 is primary
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -1
```
Expected output:
```
2024-03-14 22:05:01,159 INFO: no action. I am (zitadel-pgha1-xl2b-0), the leader with the lock
```
## Reconcile Zitadel
Now that the database is back up, reconcile zitadel.
This is pretty brutal but it works:
```
flux delete ks prod-iam-zitadel
Are you sure you want to delete this kustomization: y
► deleting kustomization prod-iam-zitadel in flux-system namespace
✔ kustomization deleted
```
```
k apply --server-side=true -f deploy/clusters/core1/holos/components/prod-iam-zitadel-kustomization.gen.yaml
kustomization.kustomize.toolkit.fluxcd.io/prod-iam-zitadel serverside-applied
```
ZITADEL should come up READY 2/2
```
k get pods
NAME READY STATUS RESTARTS AGE
zitadel-76776f747f-94wft 2/2 Running 2 (30s ago) 34s
zitadel-76776f747f-fl24c 2/2 Running 1 (30s ago) 34s
zitadel-76776f747f-xx2w7 2/2 Running 2 (30s ago) 34s
zitadel-backup-49c7-h25cj 0/1 Completed 0 3m23s
zitadel-init-td2sh 0/1 Completed 0 34s
zitadel-pgbouncer-d9f8cffc-2vwk5 2/2 Running 0 22m
zitadel-pgbouncer-d9f8cffc-rrrhd 2/2 Running 0 22m
zitadel-pgha1-4npq-0 4/4 Running 0 3m21s
zitadel-pgha1-xl2b-0 4/4 Running 0 3m43s
zitadel-repo-host-0 2/2 Running 0 22m
zitadel-setup-b7zkx 0/1 Completed 1 34s
zitadel-test-connection 0/1 Error 0 34s
```
## Cut over NLB
Finally, cut over the NLB config to move to core1.
This is specific to our metal clusters.
Apply with: `git push puppet +HEAD:production -o run=netlb`
```diff
diff --git a/site-modules/profile/templates/haproxy/haproxy.cfg.epp b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
index b358a70..3ee40db 100644
--- a/site-modules/profile/templates/haproxy/haproxy.cfg.epp
+++ b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
@@ -225,7 +225,7 @@ frontend sni-ingress from tcp-ingress
use_backend core2-https if { req.ssl_sni -i web.holos.run }
use_backend core2-https if { req.ssl_sni -i -m end .web.holos.run }
# Identity provider is routed to one cluster at a time
- use_backend core2-https if { req.ssl_sni -i login.ois.run }
+ use_backend core1-https if { req.ssl_sni -i login.ois.run }
# Holos Dev Environments
use_backend k1-https if { req.ssl_sni -i holos.wtf }
use_backend k1-https if { req.ssl_sni -i -m end .holos.wtf }
@@ -358,7 +358,7 @@ frontend http-ingress from http-ingress
bind 65.102.23.41:80
bind 2602:41:6617:2ec0::80:1d:80 v6only
# Zitadel is active on one cluster at a time
- use_backend core2-http if { hdr(host) -i login.ois.run }
+ use_backend core1-http if { hdr(host) -i login.ois.run }
# Vault is active on core1 or core2
use_backend core2-http if { hdr(host) -i vault.core.ois.run }
# Infrastructure Dex OIDC ID issuer is active on core1 or core2
```
## Test Login
```
kubectl oidc-login get-token \
--oidc-extra-scope=openid \
--oidc-extra-scope=email \
--oidc-extra-scope=profile \
--oidc-extra-scope=groups \
--oidc-extra-scope=offline_access \
--oidc-extra-scope=urn:zitadel:iam:org:domain:primary:openinfrastructure.co \
--oidc-issuer-url=https://login.ois.run \
--oidc-client-id=${CLIENT_ID:-257714027772314751@holos_platform} \
--oidc-use-pkce \
--force-refresh > ~/.kube/token.json
```
```
jq -r .status.token ~/.kube/token.json | cut -d. -f2 | base64 -d | jq
```
```json
{
"amr": [
"mfa",
"user"
],
"at_hash": "5nUq6kXWT8GheCNvDNdyow",
"aud": [
"257714027772314751@holos_platform",
"257713952794870157"
],
"auth_time": 1710172186,
"azp": "257714027772314751@holos_platform",
"client_id": "257714027772314751@holos_platform",
"email": "jeff@openinfrastructure.co",
"email_verified": true,
"exp": 1710497614,
"family_name": "McCune",
"given_name": "Jeff",
"groups": [
"prod-cluster-admin"
],
"iat": 1710454404,
"iss": "https://login.ois.run",
"locale": null,
"name": "Jeff McCune",
"nickname": "Jeff",
"preferred_username": "jeff@openinfrastructure.co",
"sub": "257712562366383231",
"updated_at": 1710105084,
"urn:zitadel:iam:org:domain:primary": "openinfrastructure.co",
"urn:zitadel:iam:org:project:257713952794870157:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
},
"urn:zitadel:iam:org:project:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
}
}
```
## Tasks
### Identify the leader
Same command for primary and standby clusters.
```
k get pods -o name -l postgres-operator.crunchydata.com/role=master
```
### Full Backup
On the primary cluster:
```
kubectl -n prod-iam-zitadel annotate postgrescluster zitadel postgres-operator.crunchydata.com/pgbackrest-backup="$(date)" --overwrite
```
Watch the progress:
```
k -n prod-iam-zitadel logs -l postgres-operator.crunchydata.com/pgbackrest-backup=manual -f
```
Expected output:
```
time="2024-03-14T18:36:16Z" level=info msg="crunchy-pgbackrest starts"
time="2024-03-14T18:36:16Z" level=info msg="debug flag set to false"
time="2024-03-14T18:36:16Z" level=info msg="backrest backup command requested"
time="2024-03-14T18:36:16Z" level=info msg="command to execute is [pgbackrest backup --stanza=db --repo=2 --type=full]"
time="2024-03-14T18:39:11Z" level=info msg="output=[]"
time="2024-03-14T18:39:11Z" level=info msg="stderr=[]"
time="2024-03-14T18:39:11Z" level=info msg="crunchy-pgbackrest ends"
```
## Troubleshooting
### Full backup doesn't start
Check the controller logs. If you see `Switchover failed` errors, try editing the number of postgrescluster replicas down to 1, then back up to 2; a sketch follows the error output below. Then try a backup again.
```
Switchover failed, details: 503, Switchover failed\n" version=5.5.1-0-amd64
```
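A minimal sketch of the replica bounce, assuming the replica count lives at `spec.instances[0].replicas` on this postgrescluster (adjust the index to match your instance set; flux may revert a direct edit on its next reconcile):
```
# Hypothetical example: drop the instance set to 1 replica, then restore it to 2.
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 1}]'
# Wait for the extra pod to terminate, then scale back up.
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 2}]'
```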

View File

@@ -1 +1 @@
57
58

View File

@@ -1 +1 @@
0
1