Compare commits

...

2 Commits

Author SHA1 Message Date
Jeff McCune
bd2effa183 (#61) Improve ks prod-iam-zitadel robustness with flux health checks
Without this patch ks/prod-iam-zitadel often gets blocked waiting for
jobs that will never complete.  In addition, flux should not manage the
zitadel-test-connection Pod which is an unnecessary artifact of the
upstream helm chart.

We'd disable helm hooks, but they're necessary to create the init and
setup jobs.

This patch also changes the default behavior of Kustomizations from
wait: true to wait: false.  Waiting is expensive for the api server and
slows down the reconciliation process considerably.

Component authors should use ks.spec.healthChecks to target specific
important resources to watch and wait for.
2024-03-15 15:56:43 -07:00
Jeff McCune
562412fbe7 (#57) Run gha-rs scale set only on the primary cluster
This patch fixes the problem of the actions runner scale set listener
pod failing every 3 seconds.  See
https://github.com/actions/actions-runner-controller/issues/3351

The solution is not ideal, if the primary cluster is down workflows will
not execute.  The primary cluster shouldn't go down though so this is
the trade off.  Lower log spam and resource usage by eliminating the
failing pods on other clusters for lower availability if the primary
cluster is not available.

We could let the pods loop and if the primary is unavailable another
would quickly pick up the role, but it doesn't seem worth it.
2024-03-15 13:13:25 -07:00
6 changed files with 109 additions and 28 deletions

View File

@@ -1,13 +1,29 @@
package holos
#InputKeys: component: "postgres-certs"
let SecretNames = {
[Name=_]: {name: Name}
"\(_DBName)-primary-tls": _
"\(_DBName)-repl-tls": _
"\(_DBName)-client-tls": _
"\(_DBName)-root-ca": _
}
#Kustomization: spec: targetNamespace: #TargetNamespace
#Kustomization: spec: healthChecks: [
for s in SecretNames {
apiVersion: "external-secrets.io/v1beta1"
kind: "ExternalSecret"
name: s.name
namespace: #TargetNamespace
},
]
#KubernetesObjects & {
apiObjects: {
ExternalSecret: {
"\(_DBName)-primary-tls": _
"\(_DBName)-repl-tls": _
"\(_DBName)-client-tls": _
"\(_DBName)-root-ca": _
for s in SecretNames {
ExternalSecret: "\(s.name)": _
}
}
}

View File

@@ -18,9 +18,24 @@ let BucketRepoName = "repo2"
// Restore the most recent backup.
let RestoreOptions = []
#Kustomization: spec: healthChecks: [
{
apiVersion: "external-secrets.io/v1beta1"
kind: "ExternalSecret"
name: S3Secret
namespace: #TargetNamespace
},
{
apiVersion: "postgres-operator.crunchydata.com/v1beta1"
kind: "PostgresCluster"
name: _DBName
namespace: #TargetNamespace
},
]
#KubernetesObjects & {
apiObjects: {
ExternalSecret: "pgo-s3-creds": _
ExternalSecret: "\(S3Secret)": _
PostgresCluster: db: #PostgresCluster & HighlyAvailable & {
metadata: name: _DBName
metadata: namespace: #TargetNamespace

View File

@@ -7,7 +7,33 @@ let Name = "zitadel"
#DependsOn: postgres: _
// Upstream helm chart doesn't specify the namespace field for all resources.
#Kustomization: spec: targetNamespace: #TargetNamespace
#Kustomization: spec: {
targetNamespace: #TargetNamespace
wait: false
}
if #IsPrimaryCluster == true {
#Kustomization: spec: healthChecks: [
{
apiVersion: "apps/v1"
kind: "Deployment"
name: Name
namespace: #TargetNamespace
},
{
apiVersion: "batch/v1"
kind: "Job"
name: "\(Name)-init"
namespace: #TargetNamespace
},
{
apiVersion: "batch/v1"
kind: "Job"
name: "\(Name)-setup"
namespace: #TargetNamespace
},
]
}
#HelmChart & {
namespace: #TargetNamespace
@@ -96,6 +122,14 @@ let CAPatch = #Patch & {
target: kind: "Job"
target: name: "\(Name)-setup"
}
testDisable: {
target: {
version: "v1"
kind: "Pod"
name: "\(Name)-test-connection"
}
patch: yaml.Marshal(DisableFluxPatch)
}
if #IsPrimaryCluster == false {
fluxDisable: {
target: {

View File

@@ -4,24 +4,37 @@ package holos
#InputKeys: component: "arc-runner"
#Kustomization: spec: targetNamespace: #TargetNamespace
#HelmChart & {
values: {
#Values
controllerServiceAccount: name: "gha-rs-controller"
controllerServiceAccount: namespace: "arc-system"
githubConfigSecret: "controller-manager"
githubConfigUrl: "https://github.com/" + #Platform.org.github.orgs.primary.name
}
apiObjects: ExternalSecret: "\(values.githubConfigSecret)": _
chart: {
// Match the gha-base-name in the chart _helpers.tpl to avoid long full names.
// NOTE: Unfortunately the INSTALLATION_NAME is used as the helm release
// name and GitHub removed support for runner labels, so the only way to
// specify which runner a workflow runs on is using this helm release name.
// The quote is "Update the INSTALLATION_NAME value carefully. You will use
// the installation name as the value of runs-on in your workflows." Refer to
// https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/quickstart-for-actions-runner-controller
release: "gha-rs"
name: "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
let GitHubConfigSecret = "controller-manager"
// Just sync the external secret, don't configure the scale set
// Work around https://github.com/actions/actions-runner-controller/issues/3351
if #IsPrimaryCluster == false {
#KubernetesObjects & {
apiObjects: ExternalSecret: "\(GitHubConfigSecret)": _
}
}
// Put the scale set on the primary cluster.
if #IsPrimaryCluster == true {
#HelmChart & {
values: {
#Values
controllerServiceAccount: name: "gha-rs-controller"
controllerServiceAccount: namespace: "arc-system"
githubConfigSecret: GitHubConfigSecret
githubConfigUrl: "https://github.com/" + #Platform.org.github.orgs.primary.name
}
apiObjects: ExternalSecret: "\(values.githubConfigSecret)": _
chart: {
// Match the gha-base-name in the chart _helpers.tpl to avoid long full names.
// NOTE: Unfortunately the INSTALLATION_NAME is used as the helm release
// name and GitHub removed support for runner labels, so the only way to
// specify which runner a workflow runs on is using this helm release name.
// The quote is "Update the INSTALLATION_NAME value carefully. You will use
// the installation name as the value of runs-on in your workflows." Refer to
// https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/quickstart-for-actions-runner-controller
release: "gha-rs"
name: "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
}
}
}

View File

@@ -153,7 +153,10 @@ _apiVersion: "holos.run/v1alpha1"
targetNamespace?: string
timeout: string | *"3m0s"
// wait performs health checks for all reconciled resources. If set to true, .spec.healthChecks is ignored.
wait: bool | *true
// Setting this to true for all components generates considerable load on the api server from watches.
// Operations are additionally more complicated when all resources are watched. Consider setting wait true for
// relatively simple components, otherwise target specific resources with spec.healthChecks.
wait: true | *false
dependsOn: [for k, v in #DependsOn {v}]
}
}

View File

@@ -1 +1 @@
1
3