Compare commits

..

5 Commits

Author SHA1 Message Date
Jeff McCune
d64c3e8c66 (#58) Zitadel Failover RunBook 2024-03-14 15:25:38 -07:00
Jeff McCune
f344f97374 (#58) Restore last zitadel database backup
When the cluster is provisioned, restore the most recent backup instead
of a fixed point in time.
2024-03-14 11:40:17 -07:00
Jeff McCune
770088b912 (#53) Clean up nested if statements with && 2024-03-13 10:35:20 -07:00
Jeff McCune
cb9b39c3ca (#53) Add Vault as an optional service on the core clusters
This patch migrates the vault component from [holos-infra][1] to a cue
based component.  Vault is optional in the reference platform, so this
patch also defines an `#OptionalServices` struct to conditionally manage
a service across multiple clusters in the platform.

The primary use case for optional services is managing a namespace to
provision and provide secrets across clusters.

[1]: https://github.com/holos-run/holos-infra/tree/v0.5.0/components/core/core/vault
2024-03-12 17:18:38 -07:00
Jeff McCune
0f34b20546 (#54) Disable helm hooks when rendering components
Pods are unnecessarily created when deploying helm-based holos
components and often fail.  Prevent these test pods by disabling helm
hooks with the `--no-hooks` flag.

Closes: #54
2024-03-12 14:14:20 -07:00
19 changed files with 1972 additions and 42 deletions

View File

@@ -0,0 +1,39 @@
package holos
// Controls optional feature flags for services distributed across multiple holos components.
// For example, enable issuing certificates in the provisioner cluster when an optional service is
// enabled for a workload cluster.
#OptionalService: {
name: string
enabled: true | *false
clusters: [Name=_]: #Platform.clusters[Name]
clusterNames: [for k, v in clusters {k}]
namespaces: [Name=_]: #ManagedNamespace & {
name: Name
}
// servers represents istio Gateway.spec.servers entries
// Refer to istio/gateway/gateway.cue
servers: [Name=_]: {
hosts: [...string]
port: name: Name
port: number: 443
port: protocol: "HTTPS"
tls: credentialName: string
tls: mode: "SIMPLE"
}
// public tls certs should align to hosts.
certs: [Name=_]: #Certificate & {
metadata: name: Name
}
}
#OptionalServices: {
[Name=_]: #OptionalService & {
name: Name
}
}
for k, v in #OptionalServices {
#ManagedNamespaces: v.namespaces
}
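A minimal, self-contained sketch (simplified field types; hypothetical `example` service and `prod-example` namespace) of how the comprehension above folds an enabled service's namespaces into `#ManagedNamespaces`:

```cue
package holos

// Simplified stand-ins for the platform definitions above.
#ManagedNamespaces: [Name=_]: {
	name: Name
	labels: [string]: string
}

#OptionalServices: [Name=_]: {
	name:    Name
	enabled: true | *false
	namespaces: [NS=_]: {
		name: NS
		labels: [string]: string
	}
}

// Hypothetical optional service declaring one namespace.
#OptionalServices: example: {
	enabled: true
	namespaces: "prod-example": labels: "istio-injection": "enabled"
}

// Same comprehension as above: every service's namespaces unify into the
// platform-wide set, so `cue eval -e '#ManagedNamespaces'` shows prod-example
// carrying the istio-injection label.
for k, v in #OptionalServices {
	#ManagedNamespaces: v.namespaces
}
```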

View File

@@ -0,0 +1,54 @@
package holos
let CoreDomain = "core.\(#Platform.org.domain)"
let TargetNamespace = "prod-core-vault"
#OptionalServices: {
vault: {
enabled: true
clusters: core1: _
clusters: core2: _
namespaces: "prod-core-vault": labels: "istio-injection": "enabled"
certs: "vault-core": #Certificate & {
metadata: name: "vault-core"
metadata: namespace: "istio-ingress"
spec: {
commonName: "vault.\(CoreDomain)"
dnsNames: [commonName]
secretName: metadata.name
issuerRef: kind: "ClusterIssuer"
issuerRef: name: string | *"letsencrypt"
}
}
servers: "https-vault-core": {
hosts: ["\(TargetNamespace)/vault.\(CoreDomain)"]
tls: credentialName: certs."vault-core".spec.secretName
}
for k, v in clusters {
let obj = (Cert & {Name: "vault-core", Cluster: v.name}).APIObject
certs: "\(obj.metadata.name)": obj
servers: "https-\(obj.metadata.name)": {
hosts: [for host in obj.spec.dnsNames {"\(TargetNamespace)/\(host)"}]
tls: credentialName: obj.spec.secretName
}
}
}
}
// Cert provisions a cluster specific certificate.
let Cert = {
Name: string
Cluster: string
APIObject: #Certificate & {
metadata: name: "\(Cluster)-\(Name)"
metadata: namespace: string | *"istio-ingress"
spec: {
commonName: string | *"vault.\(Cluster).\(CoreDomain)"
dnsNames: [commonName]
secretName: metadata.name
issuerRef: kind: "ClusterIssuer"
issuerRef: name: string | *"letsencrypt"
}
}
}
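For illustration only: with `Cluster: "core1"` and assuming `#Platform.org.domain` is `ois.run` (as suggested by the haproxy config later in this compare), the `Cert` helper above evaluates to roughly:

```cue
// Hypothetical result of (Cert & {Name: "vault-core", Cluster: "core1"}).APIObject
// with the defaults above filled in.
APIObject: {
	metadata: name:      "core1-vault-core"
	metadata: namespace: "istio-ingress"
	spec: {
		commonName: "vault.core1.core.ois.run"
		dnsNames: ["vault.core1.core.ois.run"]
		secretName: "core1-vault-core"
		issuerRef: kind: "ClusterIssuer"
		issuerRef: name: "letsencrypt"
	}
}
```

The matching `servers` entry then routes `prod-core-vault/vault.core1.core.ois.run` on port 443 with `tls.credentialName: "core1-vault-core"`.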

View File

@@ -13,7 +13,10 @@ let BucketRepoName = "repo2"
// Restore options. Set the timestamp to a known good point in time.
// time="2024-03-11T17:08:58Z" level=info msg="crunchy-pgbackrest ends"
let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
// Restore the most recent backup.
let RestoreOptions = []
#KubernetesObjects & {
apiObjects: {
@@ -40,7 +43,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: "10Gi"
}
}]
standby: {
@@ -103,7 +106,7 @@ let RestoreOptions = ["--type=time", "--target=\"2024-03-11 17:10:00+00\""]
name: "repo1"
volume: volumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: string | *"1Gi"
resources: requests: storage: string | *"4Gi"
}
},
{
@@ -137,7 +140,7 @@ let HighlyAvailable = {
replicas: 2
dataVolumeClaimSpec: {
accessModes: ["ReadWriteOnce"]
resources: requests: storage: "1Gi"
resources: requests: storage: string | *"10Gi"
}
affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: [{
weight: 1

View File

@@ -10,7 +10,8 @@ let Name = "zitadel"
#Kustomization: spec: targetNamespace: #TargetNamespace
#HelmChart & {
namespace: #TargetNamespace
enableHooks: true
chart: {
name: Name
version: "7.9.0"

View File

@@ -7,26 +7,16 @@ package holos
component: "namespaces"
}
// #PlatformNamespaceObjects defines the api objects necessary for eso SecretStores in external clusters to access secrets in a given namespace in the provisioner cluster.
#PlatformNamespaceObjects: {
_ns: #PlatformNamespace
objects: [
#Namespace & {
metadata: _ns
},
]
}
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.
for ns in #PlatformNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let NS = ns.name
let Name = obj.metadata.name
"\(Kind)": "\(NS)/\(Name)": obj
}
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
}
}

View File

@@ -1,7 +1,8 @@
package holos
import "list"
// The primary istio Gateway, named default
let Name = "gateway"
#InputKeys: component: Name
@@ -31,5 +32,19 @@ let LoginCert = #PlatformCerts.login
},
]
}
for k, svc in #OptionalServices {
if svc.enabled && list.Contains(svc.clusterNames, #ClusterName) {
Gateway: "\(svc.name)": #Gateway & {
metadata: name: svc.name
metadata: namespace: #TargetNamespace
spec: selector: istio: "ingressgateway"
spec: servers: [for s in svc.servers {s}]
}
for k, s in svc.servers {
ExternalSecret: "\(s.tls.credentialName)": _
}
}
}
}
}

View File

@@ -30,5 +30,10 @@ package holos
"\(Kind)": "\(NS)/\(Name)": obj
}
}
for k, ns in #ManagedNamespaces {
let obj = #SecretStore & {_namespace: ns.name}
SecretStore: "\(ns.name)/\(obj.metadata.name)": obj
}
}
}

File diff suppressed because it is too large.

View File

@@ -0,0 +1,146 @@
package holos
#Values: {
// Vault Helm Chart Holos Values
global: {
enabled: true
// Istio handles this
tlsDisable: true
}
injector: enabled: false
server: {
image: {
// repository: "hashicorp/vault"
repository: "quay.io/holos/hashicorp/vault"
tag: "1.14.10"
// Overrides the default Image Pull Policy
pullPolicy: "IfNotPresent"
}
extraLabels: "sidecar.istio.io/inject": "true"
resources: requests: {
memory: "256Mi"
cpu: "2000m"
}
// limits:
// memory: 1024Mi
// cpu: 2000m
// For HA configuration and because we need to manually init the vault,
// we need to define custom readiness/liveness Probe settings
readinessProbe: {
enabled: true
path: "/v1/sys/health?standbyok=true&sealedcode=204&uninitcode=204"
}
livenessProbe: {
enabled: true
path: "/v1/sys/health?standbyok=true"
initialDelaySeconds: 60
}
// extraEnvironmentVars is a list of extra environment variables to set with
// the stateful set. These could be used to include variables required for
// auto-unseal.
// Vault validates an incomplete chain:
// https://github.com/hashicorp/vault/issues/11318
extraEnvironmentVars: {
GOMAXPROCS: "2"
} // Set to cpu limit, see https://github.com/uber-go/automaxprocs
// extraVolumes is a list of extra volumes to mount. These will be exposed
// to Vault in the path `/vault/userconfig/<name>/`.
extraVolumes: [{
type: "secret"
name: "gcpkms-creds"
}]
// This configures the Vault Statefulset to create a PVC for audit logs.
// See https://www.vaultproject.io/docs/audit/index.html to know more
auditStorage: {
enabled: true
mountPath: "/var/log/vault"
} // for compatibility with plain debian vm location.
standalone: {
enabled: false
}
ha: {
enabled: true
replicas: 3
raft: {
enabled: true
setNodeId: true
config: """
ui = true
listener \"tcp\" {
address = \"[::]:8200\"
cluster_address = \"[::]:8201\"
# mTLS is handled by the istio sidecar
tls_disable = \"true\"
# Enable unauthenticated metrics access (necessary for Prometheus Operator)
telemetry {
unauthenticated_metrics_access = true
}
}
telemetry {
prometheus_retention_time = \"30s\"
disable_hostname = true
}
seal \"gcpckms\" {
credentials = \"/vault/userconfig/gcpkms-creds/credentials.json\"
project = \"v6-vault-f15f\"
region = \"us-west1\"
key_ring = \"vault-core\"
crypto_key = \"vault-core-unseal\"
}
# Note: the retry_join leader_api_address values come from the Stable
# Network ID feature of a Statefulset. See:
# https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#stable-network-id
storage \"raft\" {
path = \"/vault/data\"
retry_join {
leader_api_addr = \"http://vault-0.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
retry_join {
leader_api_addr = \"http://vault-1.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
retry_join {
leader_api_addr = \"http://vault-2.vault-internal:8200\"
leader_tls_servername = \"vault\"
}
autopilot {
cleanup_dead_servers = \"true\"
last_contact_threshold = \"200ms\"
last_contact_failure_threshold = \"10m\"
max_trailing_logs = 250000
min_quorum = 3
server_stabilization_time = \"10s\"
}
}
service_registration \"kubernetes\" {}
"""
}
}
}
// Vault UI (Will be exposed via the service mesh)
ui: {
enabled: true
serviceType: "ClusterIP"
serviceNodePort: null
externalPort: 8200
}
}

View File

@@ -0,0 +1,75 @@
package holos
import "encoding/yaml"
import "list"
let Name = "vault"
#InputKeys: component: Name
#InputKeys: project: "core"
#TargetNamespace: "\(#InstancePrefix)-\(Name)"
let Vault = #OptionalServices[Name]
if Vault.enabled && list.Contains(Vault.clusterNames, #ClusterName) {
#HelmChart & {
namespace: #TargetNamespace
chart: {
name: Name
version: "0.25.0"
repository: {
name: "hashicorp"
url: "https://helm.releases.hashicorp.com"
}
}
values: #Values
apiObjects: {
ExternalSecret: "gcpkms-creds": _
ExternalSecret: "vault-server-cert": _
VirtualService: "\(Name)": {
metadata: name: Name
metadata: namespace: #TargetNamespace
spec: hosts: [for cert in Vault.certs {cert.spec.commonName}]
spec: gateways: ["istio-ingress/\(Name)"]
spec: http: [
{
route: [
{
destination: host: "\(Name)-active"
destination: port: number: 8200
},
]
},
]
}
}
}
#Kustomize: {
patches: [
{
target: {
group: "apps"
version: "v1"
kind: "StatefulSet"
name: Name
}
patch: yaml.Marshal(EnvPatch)
},
]
}
// JSON 6902 patch: the test op guards the positional index, so the replace
// below only applies when env[4] is still the VAULT_ADDR variable.
let EnvPatch = [
{
op: "test"
path: "/spec/template/spec/containers/0/env/4/name"
value: "VAULT_ADDR"
},
{
op: "replace"
path: "/spec/template/spec/containers/0/env/4/value"
value: "http://$(VAULT_K8S_POD_NAME):8200"
},
]
}

View File

@@ -0,0 +1,13 @@
package holos
let Vault = #OptionalServices.vault
if Vault.enabled {
#KubernetesObjects & {
apiObjects: {
for k, obj in Vault.certs {
"\(obj.kind)": "\(obj.metadata.name)": obj
}
}
}
}

View File

@@ -24,6 +24,14 @@ ksObjects: []
"\(Kind)": "\(ns.name)/\(Name)": obj
}
}
for k, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(ns.name)/\(Name)": obj
}
}
}
}

View File

@@ -7,25 +7,16 @@ package holos
component: "namespaces"
}
// #PlatformNamespaceObjects defines the api objects necessary for eso SecretStores in external clusters to access secrets in a given namespace in the provisioner cluster.
#PlatformNamespaceObjects: {
_ns: #PlatformNamespace
objects: [
#Namespace & {
metadata: name: _ns.name
},
]
}
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.
for ns in #PlatformNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(Name)": obj
}
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
}
}
}

View File

@@ -12,7 +12,7 @@ let Privileged = {
// #PlatformNamespaces is the union of all namespaces across all cluster types. Namespaces are created in all clusters regardless of whether they're
// used within the cluster or not. This is important for security and consistency with IAM, RBAC, and Secrets sync between clusters.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
// TODO: Deprecate in favor of #ManagedNamespaces because it is better to add fields to an object than to add items to a list.
#PlatformNamespaces: [
{name: "external-secrets"},
{name: "holos-system"},

View File

@@ -285,6 +285,21 @@ _apiVersion: "holos.run/v1alpha1"
}
}
// ManagedNamespace is a namespace to manage across all clusters in the holos platform.
#ManagedNamespace: {
// TODO metadata labels and annotations
name: string
labels: [string]: string
}
// #ManagedNamespaces is the union of all namespaces across all cluster types and optional services.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
#ManagedNamespaces: {
[Name=_]: {
name: Name
}
}
// #Backups defines backup configuration.
// TODO: Consider the best place for this, possibly as part of the site platform config. This represents the primary location for backups.
#Backups: {
@@ -393,6 +408,8 @@ _apiVersion: "holos.run/v1alpha1"
resourcesFile: ResourcesFile
// kustomizeFiles represents the files in a kustomize directory tree.
kustomizeFiles: #KustomizeFiles.Files
// enableHooks removes the --no-hooks flag from helm template
enableHooks: true | *false
}
// #KustomizeBuild is a holos component that uses plain yaml files as the source of api objects for a holos component.

View File

@@ -0,0 +1,305 @@
## Overview
Use this runbook to move the ZITADEL primary postgres database from one cluster to another. At the end of the process, the https://zitadel.example.com service will be live on the cluster that was the standby at the start of the process.
Downtime: ~10 minutes if executed quickly. ~60 minutes if executed slowly and deliberately.
## Initial State
1. https://login.ois.run is routed to core2.
2. `postgrescluster/zitadel` on core2 is primary.
3. `postgrescluster/zitadel` on core1 is standby.
4. A recent [[#Full Backup]] has been taken and is stored in S3.
5. The standby cluster has been restored from the recent full backup.
## Process
1. [[#Edit Platform site config]]
2. [[#Render core2]]
3. [[#Demote core2]]
4. [[#Verify core2 is standby]]
5. [[#Render core1]]
6. [[#Promote core1]]
7. [[#Verify core1 is primary]]
8. [[#Reconcile Zitadel]]
9. [[#Cut over NLB]]
10. [[#Test Login]]
## Edit Platform site config
Change the primary cluster from core2 to core1 in `platforms/platform.site.cue`.
```cue
#Platform: primaryCluster: name: "core1"
```
## Render core2
Configure core2 as a standby.
```
holos render --cluster-name=core2 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index c1aa8394..45daa650 100644
--- a/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core2/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,24 +85,6 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
- dataSource:
- pgbackrest:
- configuration:
- - secret:
- name: pgo-s3-creds
- global:
- repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
- repo2-cipher-type: aes-256-cbc
- options:
- - --type=time
- - --target="2024-03-11 17:10:00+00"
- repo:
- name: repo2
- s3:
- bucket: ois-zitadel-backups
- endpoint: s3.dualstack.us-east-2.amazonaws.com
- region: us-east-2
- stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -139,7 +121,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: false
+ enabled: true
repoName: repo2
users:
- databases
```
## Demote core2
```
git add -p
git commit -m 'Make core2 a standby zitadel cluster'
git push origin HEAD
flux reconcile source git flux-system
flux get ks
```
## Verify core2 is standby
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -2
```
Expected output:
```
2024-03-14 21:58:26,205 INFO: no action. I am (zitadel-pgha1-smw7-0), the standby leader with the lock
```
## Render core1
```
holos render --cluster-name=core1 ~/workspace/holos-run/holos/docs/examples/platforms/reference/clusters/accounts/...
```
Expected change:
```diff
diff --git a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
index aa69814e..290c0865 100644
--- a/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
+++ b/deploy/clusters/core1/components/prod-iam-postgres/prod-iam-postgres.gen.yaml
@@ -85,6 +85,22 @@ spec:
name: zitadel-repl-tls
customTLSSecret:
name: zitadel-primary-tls
+ dataSource:
+ pgbackrest:
+ configuration:
+ - secret:
+ name: pgo-s3-creds
+ global:
+ repo2-path: /pgbackrest/prod-iam-zitadel/zitadel/repo2
+ repo2-cipher-type: aes-256-cbc
+ options: []
+ repo:
+ name: repo2
+ s3:
+ bucket: ois-zitadel-backups
+ endpoint: s3.dualstack.us-east-2.amazonaws.com
+ region: us-east-2
+ stanza: db
image: registry.developers.crunchydata.com/crunchydata/crunchy-postgres:ubi8-16.2-0
instances:
- name: pgha1
@@ -121,7 +137,7 @@ spec:
image: registry.developers.crunchydata.com/crunchydata/crunchy-pgbouncer:ubi8-1.21-3
replicas: 2
standby:
- enabled: true
+ enabled: false
repoName: repo2
users:
- databases:
```
## Promote core1
```
flux reconcile source git flux-system
flux reconcile ks prod-iam-postgres
```
## Verify core1 is primary
```
k logs $(kubectl get pods -o name -l postgres-operator.crunchydata.com/role=master) | tail -1
```
Expected output:
```
2024-03-14 22:05:01,159 INFO: no action. I am (zitadel-pgha1-xl2b-0), the leader with the lock
```
## Reconcile Zitadel
Now that the database is back up, reconcile zitadel.
This is pretty brutal but it works:
```
flux delete ks prod-iam-zitadel
Are you sure you want to delete this kustomization: y
► deleting kustomization prod-iam-zitadel in flux-system namespace
✔ kustomization deleted
```
```
k apply --server-side=true -f deploy/clusters/core1/holos/components/prod-iam-zitadel-kustomization.gen.yaml
kustomization.kustomize.toolkit.fluxcd.io/prod-iam-zitadel serverside-applied
```
ZITADEL should come up READY 2/2
```
k get pods
NAME READY STATUS RESTARTS AGE
zitadel-76776f747f-94wft 2/2 Running 2 (30s ago) 34s
zitadel-76776f747f-fl24c 2/2 Running 1 (30s ago) 34s
zitadel-76776f747f-xx2w7 2/2 Running 2 (30s ago) 34s
zitadel-backup-49c7-h25cj 0/1 Completed 0 3m23s
zitadel-init-td2sh 0/1 Completed 0 34s
zitadel-pgbouncer-d9f8cffc-2vwk5 2/2 Running 0 22m
zitadel-pgbouncer-d9f8cffc-rrrhd 2/2 Running 0 22m
zitadel-pgha1-4npq-0 4/4 Running 0 3m21s
zitadel-pgha1-xl2b-0 4/4 Running 0 3m43s
zitadel-repo-host-0 2/2 Running 0 22m
zitadel-setup-b7zkx 0/1 Completed 1 34s
zitadel-test-connection 0/1 Error 0 34s
```
## Cut over NLB
Finally, cut over the NLB config to move to core1.
This is specific to our metal clusters.
Apply with: `git push puppet +HEAD:production -o run=netlb`
```diff
diff --git a/site-modules/profile/templates/haproxy/haproxy.cfg.epp b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
index b358a70..3ee40db 100644
--- a/site-modules/profile/templates/haproxy/haproxy.cfg.epp
+++ b/site-modules/profile/templates/haproxy/haproxy.cfg.epp
@@ -225,7 +225,7 @@ frontend sni-ingress from tcp-ingress
use_backend core2-https if { req.ssl_sni -i web.holos.run }
use_backend core2-https if { req.ssl_sni -i -m end .web.holos.run }
# Identity provider is routed to one cluster at a time
- use_backend core2-https if { req.ssl_sni -i login.ois.run }
+ use_backend core1-https if { req.ssl_sni -i login.ois.run }
# Holos Dev Environments
use_backend k1-https if { req.ssl_sni -i holos.wtf }
use_backend k1-https if { req.ssl_sni -i -m end .holos.wtf }
@@ -358,7 +358,7 @@ frontend http-ingress from http-ingress
bind 65.102.23.41:80
bind 2602:41:6617:2ec0::80:1d:80 v6only
# Zitadel is active on one cluster at a time
- use_backend core2-http if { hdr(host) -i login.ois.run }
+ use_backend core1-http if { hdr(host) -i login.ois.run }
# Vault is active on core1 or core2
use_backend core2-http if { hdr(host) -i vault.core.ois.run }
# Infrastructure Dex OIDC ID issuer is active on core1 or core2
```
## Test Login
```
kubectl oidc-login get-token \
--oidc-extra-scope=openid \
--oidc-extra-scope=email \
--oidc-extra-scope=profile \
--oidc-extra-scope=groups \
--oidc-extra-scope=offline_access \
--oidc-extra-scope=urn:zitadel:iam:org:domain:primary:openinfrastructure.co \
--oidc-issuer-url=https://login.ois.run \
--oidc-client-id=${CLIENT_ID:-257714027772314751@holos_platform} \
--oidc-use-pkce \
--force-refresh > ~/.kube/token.json
```
```
jq -r .status.token ~/.kube/token.json | cut -d. -f2 | base64 -d | jq
```
```json
{
"amr": [
"mfa",
"user"
],
"at_hash": "5nUq6kXWT8GheCNvDNdyow",
"aud": [
"257714027772314751@holos_platform",
"257713952794870157"
],
"auth_time": 1710172186,
"azp": "257714027772314751@holos_platform",
"client_id": "257714027772314751@holos_platform",
"email": "jeff@openinfrastructure.co",
"email_verified": true,
"exp": 1710497614,
"family_name": "McCune",
"given_name": "Jeff",
"groups": [
"prod-cluster-admin"
],
"iat": 1710454404,
"iss": "https://login.ois.run",
"locale": null,
"name": "Jeff McCune",
"nickname": "Jeff",
"preferred_username": "jeff@openinfrastructure.co",
"sub": "257712562366383231",
"updated_at": 1710105084,
"urn:zitadel:iam:org:domain:primary": "openinfrastructure.co",
"urn:zitadel:iam:org:project:257713952794870157:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
},
"urn:zitadel:iam:org:project:roles": {
"prod-cluster-admin": {
"257712562366317695": "openinfrastructure.co"
}
}
}
```
## Tasks
### Identify the leader
Same command for primary and standby clusters.
```
k get pods -o name -l postgres-operator.crunchydata.com/role=master
```
### Full Backup
On the primary cluster:
```
kubectl -n prod-iam-zitadel annotate postgrescluster zitadel postgres-operator.crunchydata.com/pgbackrest-backup="$(date)" --overwrite
```
Watch the progress:
```
k -n prod-iam-zitadel logs -l postgres-operator.crunchydata.com/pgbackrest-backup=manual -f
```
Expected output:
```
time="2024-03-14T18:36:16Z" level=info msg="crunchy-pgbackrest starts"
time="2024-03-14T18:36:16Z" level=info msg="debug flag set to false"
time="2024-03-14T18:36:16Z" level=info msg="backrest backup command requested"
time="2024-03-14T18:36:16Z" level=info msg="command to execute is [pgbackrest backup --stanza=db --repo=2 --type=full]"
time="2024-03-14T18:39:11Z" level=info msg="output=[]"
time="2024-03-14T18:39:11Z" level=info msg="stderr=[]"
time="2024-03-14T18:39:11Z" level=info msg="crunchy-pgbackrest ends"
```
## Troubleshooting
### Full backup doesn't start
Check the controller logs. If you see `Switchover failed` errors, try editing the number of postgrescluster replicas down to 1 then back up to 2. Then try a backup again.
```
Switchover failed, details: 503, Switchover failed\n" version=5.5.1-0-amd64
```
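A hedged sketch of the replica bounce suggested above, assuming the PGO v5 `PostgresCluster` schema with a single `instances` entry (Flux may revert the live object on its next reconcile, which is fine for this bounce):
```
# Hypothetical example: scale the instance set down to 1 replica...
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 1}]'
# ...wait for the extra pod to terminate, then scale back up to 2.
kubectl -n prod-iam-zitadel patch postgrescluster zitadel --type=json \
  -p '[{"op": "replace", "path": "/spec/instances/0/replicas", "value": 2}]'
```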

View File

@@ -121,6 +121,7 @@ type HelmChart struct {
Namespace string `json:"namespace"`
Chart Chart `json:"chart"`
ValuesContent string `json:"valuesContent"`
EnableHooks bool `json:"enableHooks"`
// APIObjectMap holds the marshalled representation of api objects.
APIObjectMap apiObjectMap `json:"APIObjectMap"`
}
@@ -445,7 +446,12 @@ func runHelm(ctx context.Context, hc *HelmChart, r *Result, path holos.PathCompo
// Run charts
chart := hc.Chart
helmOut, err := util.RunCmd(ctx, "helm", "template", "--include-crds", "--values", valuesPath, "--namespace", hc.Namespace, "--kubeconfig", "/dev/null", "--version", chart.Version, chart.Release, cachedChartPath)
args := []string{"template"}
if !hc.EnableHooks {
args = append(args, "--no-hooks")
}
args = append(args, "--include-crds", "--values", valuesPath, "--namespace", hc.Namespace, "--kubeconfig", "/dev/null", "--version", chart.Version, chart.Release, cachedChartPath)
helmOut, err := util.RunCmd(ctx, "helm", args...)
if err != nil {
stderr := helmOut.Stderr.String()
lines := strings.Split(stderr, "\n")

View File

@@ -1 +1 @@
56
58