Compare commits


4 Commits

Author SHA1 Message Date
Jeff McCune
bd2effa183 (#61) Improve ks prod-iam-zitadel robustness with flux health checks
Without this patch, ks/prod-iam-zitadel often gets blocked waiting for
jobs that will never complete.  In addition, flux should not manage the
zitadel-test-connection Pod, which is an unnecessary artifact of the
upstream helm chart.

We'd disable helm hooks, but they're necessary to create the init and
setup jobs.

This patch also changes the default behavior of Kustomizations from
wait: true to wait: false.  Waiting is expensive for the api server and
slows down the reconciliation process considerably.

Component authors should use ks.spec.healthChecks to target specific,
important resources to watch and wait for, as sketched after this entry.
2024-03-15 15:56:43 -07:00
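
For context on the healthChecks guidance, a minimal sketch of what a
component author might write under this change; the resource name and
namespace are illustrative, not taken from this changeset:

    #Kustomization: spec: {
        // Skip the blanket wait; watch only the resource that signals the
        // component is healthy.
        wait: false
        healthChecks: [{
            apiVersion: "apps/v1"
            kind:       "Deployment"
            name:       "example-app"  // illustrative name
            namespace:  "prod-example" // illustrative namespace
        }]
    }
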
Jeff McCune
562412fbe7 (#57) Run gha-rs scale set only on the primary cluster
This patch fixes the problem of the actions runner scale set listener
pod failing every 3 seconds.  See
https://github.com/actions/actions-runner-controller/issues/3351

The solution is not ideal: if the primary cluster is down, workflows will
not execute.  The primary cluster shouldn't go down, though, so this is
the trade-off: lower log spam and resource usage, by eliminating the
failing pods on the other clusters, in exchange for lower availability if
the primary cluster is not available.

We could let the pods loop so that if the primary is unavailable another
would quickly pick up the role, but it doesn't seem worth it.  The guard
is sketched after this entry.
2024-03-15 13:13:25 -07:00
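
A minimal sketch of the guard this commit introduces, assuming the
#IsPrimaryCluster flag added in the schema change further down; the
secret name matches the arc-runner diff below:

    // Standby clusters keep only the GitHub credential in sync.
    if #IsPrimaryCluster == false {
        #KubernetesObjects & {
            apiObjects: ExternalSecret: "controller-manager": _
        }
    }

    // Only the primary cluster installs the runner scale set, so only one
    // listener pod registers for the scale set name.
    if #IsPrimaryCluster == true {
        #HelmChart & {
            ... // values and chart details elided; see the full diff below
        }
    }
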
Jeff McCune
fd6fbe5598 (#57) Allow gha-rs scale set to fail on all but one cluster
The effect of this patch is limited to refreshing credentials only for
namespaces that exist in the local cluster.  There is structure in place
in the CUE code to allow for namespaces bound to specific clusters, but
this is used only by the optional Vault component; a sketch follows this entry.

This patch was an attempt to work around
https://github.com/actions/actions-runner-controller/issues/3351 by
deploying the runner scale sets into unique namespaces.

This effort was a waste of time: only one listener pod successfully
registered for a given scale set name / group combination.

Because we have only one group, named Default, we can have only one
listener pod globally for a given scale set name.

Because we want our workflows to execute regardless of the availability
of a single cluster, we're going to let this fail for now.  The pod
retries every 3 seconds.  When a cluster is destroyed, another cluster
will quickly register.

A follow-up patch will look to expand this retry behavior.
2024-03-15 12:53:16 -07:00
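
A minimal sketch of the cluster-bound namespace structure the message
refers to, following the shape of the Vault component changed below; the
service and namespace names are illustrative:

    #OptionalServices: example: {
        enabled: true
        // The service, and therefore its managed namespaces, exists only
        // on the named clusters.
        clusters: core1: _
        clusters: core2: _
        managedNamespaces: "prod-example": _
    }
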
Jeff McCune
67472e1e1c (#60) Disable flux reconciliation of deployment/zitadel on standby clusters
2024-03-14 21:58:32 -07:00
15 changed files with 232 additions and 74 deletions

View File

@@ -1,16 +1,19 @@
package holos
// Controls optional feature flags for services distributed across multiple holos components.
// For example, enable issuing certificates in the provisioner cluster when an optional service is
// enabled for a workload cluster.
package holos
import "list"
#OptionalService: {
name: string
enabled: true | *false
clusters: [Name=_]: #Platform.clusters[Name]
clusterNames: [for k, v in clusters {k}]
namespaces: [Name=_]: #ManagedNamespace & {
name: Name
clusterNames: [for c in clusters {c.name}]
managedNamespaces: [Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
clusterNames: ["provisioner", for c in clusters {c.name}]
}
// servers represents istio Gateway.spec.servers.hosts entries
// Refer to istio/gateway/gateway.cue
@@ -34,6 +37,10 @@ package holos
}
}
for k, v in #OptionalServices {
#ManagedNamespaces: v.namespaces
for svc in #OptionalServices {
for nsName, ns in svc.managedNamespaces {
if svc.enabled && list.Contains(ns.clusterNames, #ClusterName) {
#ManagedNamespaces: "\(nsName)": ns
}
}
}

View File

@@ -8,7 +8,9 @@ let TargetNamespace = "prod-core-vault"
enabled: true
clusters: core1: _
clusters: core2: _
namespaces: "prod-core-vault": labels: "istio-injection": "enabled"
managedNamespaces: "prod-core-vault": {
namespace: metadata: labels: "istio-injection": "enabled"
}
certs: "vault-core": #Certificate & {
metadata: name: "vault-core"
metadata: namespace: "istio-ingress"

View File

@@ -1,13 +1,29 @@
package holos
#InputKeys: component: "postgres-certs"
let SecretNames = {
[Name=_]: {name: Name}
"\(_DBName)-primary-tls": _
"\(_DBName)-repl-tls": _
"\(_DBName)-client-tls": _
"\(_DBName)-root-ca": _
}
#Kustomization: spec: targetNamespace: #TargetNamespace
#Kustomization: spec: healthChecks: [
for s in SecretNames {
apiVersion: "external-secrets.io/v1beta1"
kind: "ExternalSecret"
name: s.name
namespace: #TargetNamespace
},
]
#KubernetesObjects & {
apiObjects: {
ExternalSecret: {
"\(_DBName)-primary-tls": _
"\(_DBName)-repl-tls": _
"\(_DBName)-client-tls": _
"\(_DBName)-root-ca": _
for s in SecretNames {
ExternalSecret: "\(s.name)": _
}
}
}

View File

@@ -18,9 +18,24 @@ let BucketRepoName = "repo2"
// Restore the most recent backup.
let RestoreOptions = []
#Kustomization: spec: healthChecks: [
{
apiVersion: "external-secrets.io/v1beta1"
kind: "ExternalSecret"
name: S3Secret
namespace: #TargetNamespace
},
{
apiVersion: "postgres-operator.crunchydata.com/v1beta1"
kind: "PostgresCluster"
name: _DBName
namespace: #TargetNamespace
},
]
#KubernetesObjects & {
apiObjects: {
ExternalSecret: "pgo-s3-creds": _
ExternalSecret: "\(S3Secret)": _
PostgresCluster: db: #PostgresCluster & HighlyAvailable & {
metadata: name: _DBName
metadata: namespace: #TargetNamespace

View File

@@ -7,7 +7,33 @@ let Name = "zitadel"
#DependsOn: postgres: _
// Upstream helm chart doesn't specify the namespace field for all resources.
#Kustomization: spec: targetNamespace: #TargetNamespace
#Kustomization: spec: {
targetNamespace: #TargetNamespace
wait: false
}
if #IsPrimaryCluster == true {
#Kustomization: spec: healthChecks: [
{
apiVersion: "apps/v1"
kind: "Deployment"
name: Name
namespace: #TargetNamespace
},
{
apiVersion: "batch/v1"
kind: "Job"
name: "\(Name)-init"
namespace: #TargetNamespace
},
{
apiVersion: "batch/v1"
kind: "Job"
name: "\(Name)-setup"
namespace: #TargetNamespace
},
]
}
#HelmChart & {
namespace: #TargetNamespace
@@ -61,43 +87,78 @@ let DatabaseCACertPatch = [
},
]
#Kustomize: {
patches: [
{
let CAPatch = #Patch & {
target: {
group: "apps" | "batch"
version: "v1"
kind: "Job" | "Deployment"
name: string
}
patch: yaml.Marshal(DatabaseCACertPatch)
}
#KustomizePatches: {
mesh: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
}
deploymentCA: CAPatch & {
target: group: "apps"
target: kind: "Deployment"
target: name: Name
}
initJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-init"
}
setupJob: CAPatch & {
target: group: "batch"
target: kind: "Job"
target: name: "\(Name)-setup"
}
testDisable: {
target: {
version: "v1"
kind: "Pod"
name: "\(Name)-test-connection"
}
patch: yaml.Marshal(DisableFluxPatch)
}
if #IsPrimaryCluster == false {
fluxDisable: {
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(IstioInject)
},
{
target: {
group: "apps"
version: "v1"
kind: "Deployment"
name: Name
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
initDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-init"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
{
patch: yaml.Marshal(DisableFluxPatch)
}
setupDisable: {
target: {
group: "batch"
version: "v1"
kind: "Job"
name: "\(Name)-setup"
}
patch: yaml.Marshal(DatabaseCACertPatch)
},
]
patch: yaml.Marshal(DisableFluxPatch)
}
}
}
let DisableFluxPatch = [{op: "replace", path: "/metadata/annotations/kustomize.toolkit.fluxcd.io~1reconcile", value: "disabled"}]

View File

@@ -4,6 +4,6 @@ package holos
#InputKeys: project: "github"
#DependsOn: Namespaces: name: "prod-secrets-namespaces"
#TargetNamespace: #InputKeys.component
#ARCSystemNamespace: "arc-system"
#HelmChart: namespace: #TargetNamespace
#HelmChart: chart: version: "0.8.3"

View File

@@ -1,26 +1,40 @@
package holos
#TargetNamespace: "arc-runner"
#InputKeys: component: "arc-runner"
#Kustomization: spec: targetNamespace: #TargetNamespace
#HelmChart & {
values: {
#Values
controllerServiceAccount: name: "gha-rs-controller"
controllerServiceAccount: namespace: "arc-system"
githubConfigSecret: "controller-manager"
githubConfigUrl: "https://github.com/" + #Platform.org.github.orgs.primary.name
}
apiObjects: ExternalSecret: "\(values.githubConfigSecret)": _
chart: {
// Match the gha-base-name in the chart _helpers.tpl to avoid long full names.
// NOTE: Unfortunately the INSTALLATION_NAME is used as the helm release
// name and GitHub removed support for runner labels, so the only way to
// specify which runner a workflow runs on is using this helm release name.
// The quote is "Update the INSTALLATION_NAME value carefully. You will use
// the installation name as the value of runs-on in your workflows." Refer to
// https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/quickstart-for-actions-runner-controller
release: "gha-rs"
name: "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
let GitHubConfigSecret = "controller-manager"
// Just sync the external secret, don't configure the scale set
// Work around https://github.com/actions/actions-runner-controller/issues/3351
if #IsPrimaryCluster == false {
#KubernetesObjects & {
apiObjects: ExternalSecret: "\(GitHubConfigSecret)": _
}
}
// Put the scale set on the primary cluster.
if #IsPrimaryCluster == true {
#HelmChart & {
values: {
#Values
controllerServiceAccount: name: "gha-rs-controller"
controllerServiceAccount: namespace: "arc-system"
githubConfigSecret: GitHubConfigSecret
githubConfigUrl: "https://github.com/" + #Platform.org.github.orgs.primary.name
}
apiObjects: ExternalSecret: "\(values.githubConfigSecret)": _
chart: {
// Match the gha-base-name in the chart _helpers.tpl to avoid long full names.
// NOTE: Unfortunately the INSTALLATION_NAME is used as the helm release
// name and GitHub removed support for runner labels, so the only way to
// specify which runner a workflow runs on is using this helm release name.
// The quote is "Update the INSTALLATION_NAME value carefully. You will use
// the installation name as the value of runs-on in your workflows." Refer to
// https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/quickstart-for-actions-runner-controller
release: "gha-rs"
name: "oci://ghcr.io/actions/actions-runner-controller-charts/gha-runner-scale-set"
}
}
}

View File

@@ -1,6 +1,6 @@
package holos
#TargetNamespace: "arc-system"
#TargetNamespace: #ARCSystemNamespace
#InputKeys: component: "arc-system"
#HelmChart & {

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#TargetNamespace: "default"
#InputKeys: {
@@ -11,7 +13,9 @@ package holos
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
if list.Contains(ns.clusterNames, #ClusterName) {
Namespace: "\(k)": #Namespace & ns.namespace
}
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.

View File

@@ -93,7 +93,14 @@ provisioner get serviceaccount -A --selector=holos.run/job.name=\(NAME) --output
# Create the tokens
mkdir tokens
jq -r '.items[].metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
kubectl get namespaces -o name > namespaces.txt
# Iterate over local namespaces
while IFS= read -r NAMESPACE; do
echo "Getting token for local cluster $NAMESPACE" >&2
jq -r '.items[] | select("namespace/"+.metadata.namespace == "'${NAMESPACE}'") | .metadata | "provisioner -n \\(.namespace) create token --duration=12h \\(.name) > tokens/\\(.namespace).\\(.name).jwt"' serviceaccounts.json | bash -x
done < namespaces.txt
# Create the secrets
mksecret tokens/*.jwt
@@ -124,6 +131,11 @@ kubectl apply --server-side=true -f secrets.yaml
resources: ["secrets"]
verbs: ["*"]
},
{
apiGroups: [""]
resources: ["namespaces"]
verbs: ["list"]
},
]
},
// Bind the Role to the ServiceAccount for the Job.

View File

@@ -1,5 +1,7 @@
package holos
import "list"
#DependsOn: _ESOCreds
#TargetNamespace: "default"
@@ -31,9 +33,11 @@ package holos
}
}
for k, ns in #ManagedNamespaces {
let obj = #SecretStore & {_namespace: ns.name}
SecretStore: "\(ns.name)/\(obj.metadata.name)": obj
for nsName, ns in #ManagedNamespaces {
if list.Contains(ns.clusterNames, #ClusterName) {
let obj = #SecretStore & {_namespace: nsName}
SecretStore: "\(nsName)/\(obj.metadata.name)": obj
}
}
}
}

View File

@@ -25,11 +25,11 @@ ksObjects: []
}
}
for k, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns}).objects {
for nsName, ns in #ManagedNamespaces {
for obj in (#PlatformNamespaceObjects & {_ns: ns.namespace.metadata}).objects {
let Kind = obj.kind
let Name = obj.metadata.name
"\(Kind)": "\(ns.name)/\(Name)": obj
"\(Kind)": "\(nsName)/\(Name)": obj
}
}
}

View File

@@ -10,8 +10,8 @@ package holos
#KubernetesObjects & {
apiObjects: {
// #ManagedNamespaces is the set of all namespaces across all clusters in the platform.
for k, ns in #ManagedNamespaces {
Namespace: "\(ns.name)": #Namespace & {metadata: ns}
for nsName, ns in #ManagedNamespaces {
Namespace: "\(nsName)": #Namespace & ns.namespace
}
// #PlatformNamespaces is deprecated in favor of #ManagedNamespaces.

View File

@@ -152,7 +152,11 @@ _apiVersion: "holos.run/v1alpha1"
suspend?: bool
targetNamespace?: string
timeout: string | *"3m0s"
wait: bool | *true
// wait performs health checks for all reconciled resources. If set to true, .spec.healthChecks is ignored.
// Setting this to true for all components generates considerable load on the api server from watches.
// Operations are additionally more complicated when all resources are watched. Consider setting wait to true for
// relatively simple components; otherwise, target specific resources with spec.healthChecks.
wait: true | *false
dependsOn: [for k, v in #DependsOn {v}]
}
}
@@ -287,16 +291,21 @@ _apiVersion: "holos.run/v1alpha1"
// ManagedNamespace is a namespace to manage across all clusters in the holos platform.
#ManagedNamespace: {
// TODO metadata labels and annotations
name: string
labels: [string]: string
namespace: {
metadata: {
name: string
labels: [string]: string
}
}
// clusterNames represents the set of clusters the namespace is managed on. Usually all clusters.
clusterNames: [...string]
}
// #ManagedNamespaces is the union of all namespaces across all cluster types and optional services.
// Holos adopts the namespace sameness position of SIG Multicluster, refer to https://github.com/kubernetes/community/blob/dd4c8b704ef1c9c3bfd928c6fa9234276d61ad18/sig-multicluster/namespace-sameness-position-statement.md
#ManagedNamespaces: {
[Name=_]: {
name: Name
[Name=_]: #ManagedNamespace & {
namespace: metadata: name: Name
}
}
@@ -465,8 +474,18 @@ _apiVersion: "holos.run/v1alpha1"
kind: "Kustomization"
resources: [ResourcesFile]
...
if len(#KustomizePatches) > 0 {
patches: [for v in #KustomizePatches {v}]
}
}
#KustomizePatches: {
[_]: #Patch
}
// #Patch is a kustomize patch
#Patch: kc.#Patch
// #DefaultSecurityContext is the holos default security context to comply with the restricted namespace policy.
// Refer to https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted
#DefaultSecurityContext: {
@@ -485,6 +504,10 @@ _apiVersion: "holos.run/v1alpha1"
spec: secretName: metadata.name
}
// #IsPrimaryCluster is true if the cluster being rendered is the primary cluster
// Used by the iam project to determine where https://login.example.com is active.
#IsPrimaryCluster: bool & #ClusterName == #Platform.primaryCluster.name
// By default, render kind: Skipped so holos knows to skip over intermediate cue files.
// This enables the use of holos render ./foo/bar/baz/... when bar contains intermediary constraints which are not complete components.
// Holos skips over these intermediary cue instances.

View File

@@ -1 +1 @@
0
3