diff --git a/.taskfiles/cluster/Taskfile.dist.yaml b/.taskfiles/cluster/Taskfile.dist.yaml
index 2bb0107d..c5d2e3b8 100644
--- a/.taskfiles/cluster/Taskfile.dist.yaml
+++ b/.taskfiles/cluster/Taskfile.dist.yaml
@@ -1,4 +1,5 @@
 ---
+# Tasks related to multi-cluster or cluster-level management, e.g. bootstrap
 version: "3"

 includes:
diff --git a/.taskfiles/cnpg/Taskfile.dist.yaml b/.taskfiles/cnpg/Taskfile.dist.yaml
deleted file mode 100644
index 2d0be47b..00000000
--- a/.taskfiles/cnpg/Taskfile.dist.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
----
-version: "3"
-
-vars:
-  PG: '{{.PG | default "pg-default"}}'
-  NS: '{{.NS | default "pg"}}'
-
-tasks:
-  rw:
-    desc: Get current CloudNativePG master (writable) instance to run command against.
-    vars:
-      CLUSTER: '{{ or .PG (fail "Missing `PG` environment variable!") }}'
-      CLUSTER: '{{ or .NS (fail "Missing `NS` environment variable!") }}'
-    dir: '{{.USER_WORKING_DIR}}'
-    cmds:
-      - kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% kubectl -n {{.NS}} {{.CLI_ARGS}}
diff --git a/.taskfiles/flux/Taskfile.dist.yaml b/.taskfiles/flux/Taskfile.dist.yaml
index ce4d0b78..bf5f70e6 100644
--- a/.taskfiles/flux/Taskfile.dist.yaml
+++ b/.taskfiles/flux/Taskfile.dist.yaml
@@ -1,6 +1,9 @@
 ---
 version: "3"

+# vars:
+#   DEFAULTNS: '{{.NS | default "flux-system"}}'
+
 tasks:
   install:
     cmds:
@@ -10,3 +13,12 @@ tasks:
     aliases: [ga, fga, getall]
     cmds:
       - viddy -n1 flux get all -A || watch -n1 flux get all -A
+
+  cantWait:
+    aliases: [cw]
+    vars:
+      NS: '{{ .NS | default "flux-system" }}'
+      KS: '{{ or .KS (fail "Missing `KS` environment variable!") }}'
+    cmds:
+      - kubectl patch ks -n {{.NS}} {{.KS}} --type merge -p '{"spec":{"dependsOn":[]}}'
+      - kubectl patch ks -n {{.NS}} {{.KS}} --type merge -p '{"spec":{"wait":false}}'
diff --git a/.taskfiles/flux/cantWait.yaml b/.taskfiles/flux/cantWait.yaml
new file mode 100644
index 00000000..95cd9381
--- /dev/null
+++ b/.taskfiles/flux/cantWait.yaml
@@ -0,0 +1,7 @@
+---
+# apiVersion: kustomize.toolkit.fluxcd.io/v1
+# kind: Kustomization
+# # metadata:
+# #   name: not-used
+spec:
+  dependsOn: []
\ No newline at end of file
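
The new `cantWait` task is just a pair of `kubectl patch` calls against a Flux Kustomization; a rough command-line equivalent is sketched below. The Kustomization name `1-core-storage-rook-ceph-cluster` is only an illustrative placeholder, and `task flux:cw` assumes this taskfile is included under the `flux` namespace in the root Taskfile.

    # drop dependsOn and stop waiting on health checks for a stuck Flux Kustomization
    kubectl patch ks -n flux-system 1-core-storage-rook-ceph-cluster --type merge -p '{"spec":{"dependsOn":[]}}'
    kubectl patch ks -n flux-system 1-core-storage-rook-ceph-cluster --type merge -p '{"spec":{"wait":false}}'
    # or via the task wrapper
    task flux:cw KS=1-core-storage-rook-ceph-cluster
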
diff --git a/.taskfiles/k8s/Taskfile.dist.yaml b/.taskfiles/k8s/Taskfile.dist.yaml
index 042e9c19..e20a145f 100644
--- a/.taskfiles/k8s/Taskfile.dist.yaml
+++ b/.taskfiles/k8s/Taskfile.dist.yaml
@@ -1,8 +1,9 @@
 ---
+# General Kubernetes admin tasks
 version: "3"

 vars:
-  C: '{{ .C | default "biohazard" }}'
+  # C: '{{ .C | default "admin@biohazard" }}'
   TIMENOW:
     sh: date +%Y%m%d-%H%M%S

@@ -11,7 +12,7 @@ tasks:
     aliases: [nsps]
     desc: While true loop labelling a given namespace with Pod Security labels, if external source creates namespace (e.g. Azure Arc's Helm apply with `--create-namespace`)
     vars:
-      NS: '{{ or .NS (fail "Missing `NS` environment variable!") }}'
+      NS: &ns-fail '{{ or .NS (fail "Missing `NS` environment variable!") }}'
       PS: '{{ or .PS (fail "Missing `PS` environment variable!") }}'
     cmds:
       - while true; do kubectl label namespaces {{.NS}} "pod-security.kubernetes.io/enforce={{.PS}}" >/dev/null 2>/dev/null || true; kubectl label namespaces {{.NS}} "pod-security.kubernetes.io/enforce-version=latest" >/dev/null 2>/dev/null || true; done
@@ -21,32 +22,42 @@ tasks:
     internal: true
     desc: Wait for a job's pod to change its status to pending
     vars:
-      NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
-      NS: '{{ .NS | default "default" }}'
+      NAME: &name-fail '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
+      NS: *ns-fail
     cmds:
       - until [[ $(kubectl -n "{{.NS}}" get pod "{{.NAME}}" -o jsonpath='{.items[*].status.phase}') == "Pending" ]]; do sleep 1; done

   wait-pod-running:
-    aliases: [waitp]
+    aliases: [waitr]
     internal: true
     desc: Wait for a job's pod to change its status to pending
     vars:
-      NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
-      NS: '{{ .NS | default "default" }}'
+      NAME: *name-fail
+      NS: *ns-fail
     cmds:
       - until [[ $(kubectl -n "{{.NS}}" get pod "{{.NAME}}" -o jsonpath='{.items[*].status.phase}') == "Running" ]]; do sleep 1; done

+  wait-pod-delete:
+    aliases: [waitd]
+    internal: true
+    desc: Wait for a job's pod to delete
+    vars:
+      NAME: *name-fail
+      NS: *ns-fail
+    cmds:
+      - until kubectl wait pod -n {{.NS}} {{.NAME}} --for delete --timeout=1m; do sleep 1; done
+
   wait-finish:
     internal: true
     desc: Wait for a job's pod to change its status to pending
     vars:
-      NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
-      NS: '{{ .NS | default "default" }}'
+      NAME: *name-fail
+      NS: *ns-fail
       TYPE: '{{ .TYPE | default "job" }}'
       # WAIT_ARGS: '{{.WAIT_ARGS | default "echo \"{{.NAME}} is still running, logs:\" && kubectl -n {{.NS}} logs {{.NAME}} --since 2s -f;"}}'
     cmds:
       - |-
-        until kubectl -n {{.NS}} wait {{.TYPE}}/{{.NAME}} --for condition=complete --timeout=2s; do 
+        until kubectl -n {{.NS}} wait {{.TYPE}}/{{.NAME}} --for condition=complete --timeout=2s; do
          echo "{{.NAME}} is still running, logs:" && kubectl -n {{.NS}} logs {{.TYPE}}/{{.NAME}} --since 2s -f || true;
         done
@@ -68,13 +79,33 @@ tasks:
       SECRETS_FILE: "{{.ROOT_DIR}}/kube/clusters/{{.C}}/config/secrets.sops.env"
       VARS_FILE: "{{.ROOT_DIR}}/kube/clusters/{{.C}}/config/vars.sops.env"
       FCMD: '{{ .FCMD | default "cat" }}'
+      C: '{{.C | default "admin@biohazard"}}'
     cmds:
+      #- sops exec-env {{.SECRETS_FILE}} "sops exec-env {{.VARS_FILE}} \"{{.FCMD}} {{.F}} | envsubst | kubectl apply --context {{.C}} -f -\""
      - sops exec-env {{.SECRETS_FILE}} "sops exec-env {{.VARS_FILE}} \"{{.FCMD}} {{.F}} | envsubst | kubectl apply -f -\""
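
For reference, the nested `sops exec-env` in `sops-apply` expands to roughly the shell below when run by hand; the cluster paths and manifest name are illustrative only, and the real task wraps the inner command in escaped double quotes instead of single quotes.

    # each exec-env layer decrypts one dotenv file and exports it to the inner command,
    # so envsubst sees both the secret and the non-secret cluster variables
    sops exec-env kube/clusters/biohazard/config/secrets.sops.env \
      'sops exec-env kube/clusters/biohazard/config/vars.sops.env "cat some-manifest.yaml | envsubst | kubectl apply -f -"'
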
+
+  cilium-bootstrap-apply:
+    aliases: [cilium]
+    desc: Bootstrap Cilium onto new cluster, ready for Flux to take over managing Cilium.
+    # dir: "/{{.ROOT_DIR}}/kube/clusters/{{.C}}/talos"
+    vars:
+      C: '{{ or .C (fail "Missing C environment variable for cluster!") }}'
+    cmds:
+      - kubectl delete configmap -n kube-system cilium-config || true
+      - kubectl delete daemonset -n kube-system cilium || true
+      - kubectl delete deployment -n kube-system cilium-operator || true
+      - cp /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/clusters/{{.C}}/helm-values.yaml /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
+      - task: sops-apply
+        vars:
+          C: '{{.C | default "admin@biohazard"}}'
+          F: "/{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install"
+          FCMD: "kustomize build --enable-helm"
+      - defer: rm -rf /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
+
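
Ignoring the sops wrapper that `sops-apply` adds, the bootstrap amounts to roughly the following; the cluster name `biohazard` is an assumed example.

    # remove any previously-applied Cilium bootstrap objects, then render the
    # bootstrap-install kustomization with this cluster's Helm values and apply it
    kubectl delete configmap -n kube-system cilium-config || true
    kubectl delete daemonset -n kube-system cilium || true
    kubectl delete deployment -n kube-system cilium-operator || true
    cp kube/deploy/core/_networking/cilium/clusters/biohazard/helm-values.yaml \
       kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
    kustomize build --enable-helm kube/deploy/core/_networking/cilium/app/bootstrap-install \
      | envsubst | kubectl apply -f -
    rm kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
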
   newapp:
     desc: Copy app folder structure template, substitute APPNAME, and (TODO) prompt user for variables values such as DNS, UID etc.
     vars:
-      app: '{{ or .app (fail "Missing `app` variable!") }}'
+      APP: &app-fail '{{ or .APP (fail "Missing `APP` variable!") }}'
     cmds:
       - cp -r ./kube/templates/test ./kube/deploy/apps/{{.app}}
       # lowercase, used for resource names etc
@@ -123,15 +154,75 @@
         export KBENCH=$(curl -sL https://raw.githubusercontent.com/yasker/kbench/main/deploy/fio.yaml)
         [[ ! -z "{{.SC}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").spec.storageClassName = "{{.SC}}"')
         [[ ! -z "{{.NODE}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").spec.template.spec.nodeSelector."kubernetes.io/hostname" = "{{.NODE}}"')
+        export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").metadata.name = "kbench-{{- .TIMENOW -}}"')
+        export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").metadata.name = "kbench-{{- .TIMENOW -}}"')
+        export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | (select(.kind == "Job").spec.template.spec.volumes.[] | select(.name == "vol").persistentVolumeClaim.claimName) |= "kbench-{{- .TIMENOW -}}"')
         printf '%s\n' "${KBENCH}" | kubectl delete -n {{.NS}} -f - || true
       - |-
         export KBENCH=$(curl -sL https://raw.githubusercontent.com/yasker/kbench/main/deploy/fio.yaml)
         [[ ! -z "{{.SC}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").spec.storageClassName = "{{.SC}}"')
         [[ ! -z "{{.NODE}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").spec.template.spec.nodeSelector."kubernetes.io/hostname" = "{{.NODE}}"')
+        export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").metadata.name = "kbench-{{- .TIMENOW -}}"')
+        export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").metadata.name = "kbench-{{- .TIMENOW -}}"')
+        export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | (select(.kind == "Job").spec.template.spec.volumes.[] | select(.name == "vol").persistentVolumeClaim.claimName) |= "kbench-{{- .TIMENOW -}}"')
         printf '%s\n' "${KBENCH}" | kubectl apply -n {{.NS}} -f -
       - defer: *kbench-delete
       - task: wait-finish
         vars:
           NS: '{{ .NS | default "default" }}'
-          NAME: "kbench"
+          NAME: "kbench-{{- .TIMENOW -}}"
           TYPE: "job"
+
+  scale-to-0:
+    aliases: ["0"]
+    desc: Scale given workloads to 0
+    vars: &scale-vars
+      KS:
+        sh: |-
+          [[ -z "{{.KS}}" ]] && echo "{{.APP}}-app" || echo "{{.KS}}"
+      KSNS: '{{ .KSNS | default "flux-system" }}'
+      HR: '{{ .HR | default .APP }}'
+      HRNS:
+        sh: |-
+          [[ -n "{{.HRNS}}" ]] && echo "{{.HRNS}}" && exit || [[ -n "{{.NS}}" ]] && echo "{{.NS}}" && exit || echo "{{.APP}}"
+      APP: *app-fail
+      NS: '{{ .NS | default .APP }}'
+      REASON: '{{ .REASON | default .TIMENOW }}'
+    cmds:
+      - |- # annotate Flux Kustomization
+        [[ {{.KS}} != "false" ]] && kubectl annotate --overwrite kustomizations.kustomize.toolkit.fluxcd.io -n {{.KSNS}} {{.KS}} scaledown.home.arpa/reason={{.REASON}} || true
+      - |- # suspend Flux Kustomization
+        [[ {{.KS}} != "false" ]] && flux suspend kustomization -n {{.KSNS}} {{.KS}} || true
+      - |- # annotate Flux HelmRelease
+        [[ {{.HR}} != "false" ]] && kubectl annotate --overwrite helmreleases.helm.toolkit.fluxcd.io -n {{.HRNS}} {{.HR}} scaledown.home.arpa/reason={{.REASON}} || true
+      - |- # suspend Flux HelmRelease
+        [[ {{.HR}} != "false" ]] && flux suspend helmrelease -n {{.HRNS}} {{.HR}} || true
+      - |- # annotate Kubernetes controller with reason
+        kubectl annotate --overwrite deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/reason={{.REASON}} || kubectl annotate --overwrite statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/reason={{.REASON}}
+      - |- # annotate Kubernetes controller with old replica count
+        kubectl annotate --overwrite deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas=$(kubectl get deployment -n {{.NS}} {{.APP}} -o jsonpath='{.spec.replicas}') || kubectl annotate --overwrite statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas=$(kubectl get statefulset -n {{.NS}} {{.APP}} -o jsonpath='{.spec.replicas}')
+      - |- # scale down Kubernetes controller
+        kubectl scale deployment -n {{.NS}} {{.APP}} --replicas 0 || kubectl scale statefulset -n {{.NS}} {{.APP}} --replicas 0
+      - task: wait-pod-delete
+        vars:
+          NAME: '{{.APP}}'
+          NS: '{{.NS}}'
+
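
`scale-to-0` stashes the current replica count in a `scaledown.home.arpa/replicas` annotation so that `scale-back-up` knows what to restore; a minimal sketch of that round-trip for a hypothetical `grafana` Deployment in the `monitoring` namespace:

    # record the old replica count, then scale down
    kubectl annotate --overwrite deployment -n monitoring grafana \
      scaledown.home.arpa/replicas=$(kubectl get deployment -n monitoring grafana -o jsonpath='{.spec.replicas}')
    kubectl scale deployment -n monitoring grafana --replicas 0
    # later: scale back up from the stored annotation and drop it
    kubectl scale deployment -n monitoring grafana \
      --replicas "$(kubectl get deployment -n monitoring grafana -o jsonpath='{.metadata.annotations.scaledown\.home\.arpa/replicas}')"
    kubectl annotate deployment -n monitoring grafana scaledown.home.arpa/replicas-
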
+  scale-back-up:
+    aliases: ["up"]
+    vars: *scale-vars
+    cmds:
+      - |- # scale up Kubernetes controller back to old replica count
+        kubectl scale deployment -n {{.NS}} {{.APP}} --replicas "$(kubectl get deployment -n {{.NS}} {{.APP}} -o jsonpath='{.metadata.annotations.scaledown\.home\.arpa/replicas}')" || kubectl scale statefulset -n {{.NS}} {{.APP}} --replicas "$(kubectl get statefulset -n {{.NS}} {{.APP}} -o jsonpath='{.metadata.annotations.scaledown\.home\.arpa/replicas}')"
+      - |- # remove old replica count annotation from Kubernetes controller
+        kubectl annotate deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas- || kubectl annotate statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas-
+      - |- # remove reason annotation from Kubernetes controller
+        kubectl annotate deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/reason- || kubectl annotate statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/reason-
+      - |- # resume Flux HelmRelease
+        [[ {{.HR}} != "false" ]] && flux resume helmrelease -n {{.HRNS}} {{.HR}} || true
+      - |- # remove reason annotation from Flux HelmRelease
+        [[ {{.HR}} != "false" ]] && kubectl annotate helmreleases.helm.toolkit.fluxcd.io -n {{.HRNS}} {{.HR}} scaledown.home.arpa/reason- || true
+      - |- # resume Flux Kustomization
+        [[ {{.KS}} != "false" ]] && flux resume kustomization -n {{.KSNS}} {{.KS}} || true
+      - |- # remove reason annotation from Flux Kustomization
+        [[ {{.KS}} != "false" ]] && kubectl annotate kustomizations.kustomize.toolkit.fluxcd.io -n {{.KSNS}} {{.KS}} scaledown.home.arpa/reason- || true
\ No newline at end of file
diff --git a/.taskfiles/pg/Taskfile.dist.yaml b/.taskfiles/pg/Taskfile.dist.yaml
new file mode 100644
index 00000000..4dcff05c
--- /dev/null
+++ b/.taskfiles/pg/Taskfile.dist.yaml
@@ -0,0 +1,63 @@
+---
+version: "3"
+
+# vars:
+  # PG: '{{.PG | default "default"}}'
+  # NS: '{{.NS | default "pg"}}'
+
+tasks:
+  rw:
+    desc: Get current CloudNativePG master (writable) instance to run command against.
+    dir: "{{.USER_WORKING_DIR}}"
+    vars:
+      APP: "{{.APP}}"
+      PG: &pg-default
+        sh: |-
+          [[ -n "{{.PG}}" ]] && echo "{{.PG}}" || [[ -n "{{.APP}}" ]] && echo "pg-{{.APP}}" || echo "pg-default"
+      NS: &ns-default
+        sh: |-
+          [[ -n "{{.NS}}" ]] && echo "{{.NS}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || echo "pg"
+      # PG: '{{ or .PG (fail "Missing `PG` environment variable!") }}'
+      # NS: &ns-default '{{.NS | default "pg"}}'
+    cmds:
+      - kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% kubectl -n {{.NS}} {{.CLI_ARGS}}
+
+  adminer:
+    desc: Use kubectl netshoot krew plugin to deploy adminer as sidecar to the current primary CNPG replica pod.
+    dir: "{{.USER_WORKING_DIR}}"
+    vars:
+      APP: "{{.APP}}"
+      PG: *pg-default
+      NS: *ns-default
+      PF_ADDR: '{{ .PF_ADDR | default "127.0.0.1" }}'
+      PF_PORT: '{{ .PF_PORT | default "8080" }}'
+    cmds:
+      - |
+        kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% bash -c "while kubectl netshoot debug -n {{.NS}} % --image-name adminer --image-tag 4.8.1-standalone; do kubectl port-forward -n {{.NS}} % --address {{.PF_ADDR}} {{.PF_PORT}}:8080; break; done"
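
The `rw` task resolves the current CNPG primary and substitutes it for `%` in whatever kubectl arguments follow `--`; a usage sketch, where the app name `grafana` (giving cluster `pg-grafana` in namespace `grafana` by the defaults above) and the `pg:` include namespace are assumptions:

    # open psql on the current primary
    task pg:rw APP=grafana -- exec -it % -- psql
    # equivalent raw pipeline
    kubectl get clusters.postgresql.cnpg.io -n grafana pg-grafana -o yaml \
      | yq '.status.currentPrimary' \
      | xargs -o -I% kubectl -n grafana exec -it % -- psql
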
+
+  crunchy-restore:
+    desc: Restore a pg_dump to a CrunchyData Postgres cluster, writing to the master instance.
+    dir: "{{.USER_WORKING_DIR}}"
+    vars:
+      APP: "{{.APP}}"
+      PG: *pg-default
+      NS: *ns-default
+      DUMP: '{{ or .DUMP (fail "Missing `DUMP` environment variable for specifying pg_dump file location!") }}'
+      CP_DIR: '{{ .CP_DIR | default "/pgdata" }}' # would've been /tmp or /run if Crunchy replica container didn't crash and restart when files are copied there for some reason, maybe small tmpfs size?
+      DBNAME:
+        sh: |-
+          [[ -n "{{.DBNAME}}" ]] && echo "{{.DBNAME}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing DBNAME environment variable for specifying database name!" && exit 1 )
+      DBUSER:
+        sh: |-
+          [[ -n "{{.DBUSER}}" ]] && echo "{{.DBUSER}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing DBUSER environment variable for specifying user name!" && exit 1 )
+      ARGS: '{{ .ARGS | default "--verbose --verbose --format=c --clean --if-exists --no-owner --role $DBUSER" }}' # TODO: personal defaults, please check and change accordingly if copying this task before running it!
+      CRUNCHY_PRIMARY:
+        sh: |-
+          kubectl get pods -n {{.NS}} -l postgres-operator.crunchydata.com/role=master,postgres-operator.crunchydata.com/cluster={{.PG}} -o jsonpath='{.items[0].metadata.name}'
+    env:
+      DBUSER: '{{.DBUSER}}'
+    cmds:
+      - kubectl cp --container database {{.DUMP}} {{.NS}}/{{.CRUNCHY_PRIMARY}}:{{.CP_DIR}}/restore-dump.psql
+      - kubectl exec -it --container database -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c 'echo "ALTER DATABASE {{.DBNAME}} OWNER TO {{.DBUSER}};" | psql'
+      - kubectl exec -it --container database -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c 'pg_restore --dbname {{.DBNAME}} {{.ARGS}} {{.CP_DIR}}/restore-dump.psql'
+      - defer: kubectl exec -it -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c 'rm -rf {{.CP_DIR}}/restore-dump.psql'
diff --git a/.taskfiles/rook/Taskfile.dist.yaml b/.taskfiles/rook/Taskfile.dist.yaml
index 4bbf344e..46453f6b 100644
--- a/.taskfiles/rook/Taskfile.dist.yaml
+++ b/.taskfiles/rook/Taskfile.dist.yaml
@@ -71,10 +71,10 @@ tasks:
     cmds:
       # TODO: mimic deviceFilter
       - envsubst < <(cat {{.JOB_TEMPLATE}}) | kubectl apply -f -
-      - |- 
+      - |-
         kubectl -n kube-system logs job/{{.JOB_NAME}} -f || true;
-        until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=2s; do 
-          echo "Job {{.JOB_NAME}} is still running, logs:" && 
+        until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=2s; do
+          echo "Job {{.JOB_NAME}} is still running, logs:" &&
           kubectl -n kube-system logs job/{{.JOB_NAME}} -f || true;
         done;
       # - kubectl -n kube-system logs job/{{.JOB_NAME}}
@@ -122,6 +122,19 @@ tasks:
         vars:
           NODE: '{{.NODE}}'

+  wipe-nodes-biohazard:
+    desc: Wipe all nodes in cluster "biohazard"
+    dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
+    cmds:
+      - task: wipe-node
+        vars:
+          NODE: "ange"
+          CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202L91P6HGN"
+      - task: wipe-node
+        vars:
+          NODE: "charlotte"
+          CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN" # I swear I'll forget to update this here when I have to change the disk
+
   wipe-nodes-nuclear:
     desc: Wipe all nodes in cluster "nuclear"
     dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
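
A typical `crunchy-restore` invocation might look like the following; the app name and dump path are hypothetical, and the default `ARGS` above should be reviewed before reuse:

    # copy ./authentik.dump to the Crunchy primary and pg_restore it into the "authentik" database
    task pg:crunchy-restore APP=authentik DUMP=./authentik.dump
    # override the pg_restore flags when the personal defaults don't fit
    task pg:crunchy-restore APP=authentik DUMP=./authentik.dump ARGS="--format=c --clean --if-exists"
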
@@ -135,6 +148,23 @@ tasks:
           NODE: "humming"
           CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202L91P6HGN"

+  force-delete-cluster:
+    desc: |-
+      Sometimes Rook fails to apply the cluster and you want to delete it before it even gets anything set up; this task force-removes all finalizers so the unready Ceph resources can be deleted.
+    dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
+    vars:
+      C: '{{ or .C (fail "Missing `C` environment variable for cluster!") }}'
+    cmds:
+      - helm uninstall -n rook-ceph rook-ceph-cluster-{{.C}} && true || true
+      - |-
+        for CRD in $(kubectl get crd -n rook-ceph | awk '/ceph.rook.io/ {print $1}'); do
+          kubectl get -n rook-ceph "$CRD" -o name | \
+            xargs -I {} kubectl patch -n rook-ceph {} --type merge -p '{"metadata":{"finalizers": []}}' && true || true
+        done
+      - |-
+        kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}' && true || true
+        kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}' && true || true
+
   reinstall:
     desc: |-
       For when Rook refuses to create any OSDs at all
@@ -149,23 +179,17 @@ tasks:
           C: '{{.C}}'
       - flux suspend ks 0-{{.C}}-config
       - flux suspend ks 1-core-storage-rook-ceph-app
-      - flux suspend ks 1-core-storage-rook-ceph-cluster
-      - helm uninstall -n rook-ceph rook-ceph-cluster && true || true
-      - flux delete hr -n rook-ceph rook-ceph-cluster --silent && true || true
-      - |-
-        for CRD in $(kubectl get crd -n rook-ceph | awk '/ceph.rook.io/ {print $1}'); do
-          kubectl get -n rook-ceph "$CRD" -o name | \
-            xargs -I {} kubectl patch -n rook-ceph {} --type merge -p '{"metadata":{"finalizers": []}}' && true || true
-        done
-      - |-
-        kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}' && true || true
-        kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}' && true || true
+      - flux suspend ks 1-core-storage-rook-ceph-cluster-{{.C}}
+      - task: force-delete-cluster
+        vars:
+          C: '{{.C}}'
+      # - flux delete hr -n rook-ceph rook-ceph-cluster-{{.C}} --silent && true || true
       - helm uninstall -n rook-ceph rook-ceph && true || true
-      - flux delete hr -n rook-ceph rook-ceph --silent && true || true
+      # - flux delete hr -n rook-ceph rook-ceph --silent && true || true
       - kubectl get namespaces rook-ceph && until kubectl delete namespaces rook-ceph; do kubectl get namespaces rook-ceph -o jsonpath="{.status}"; done || true
       - task: wipe-nodes-{{.C}}
       # - task: wipe-nodes-{{.C}} # run again in case wipe fails
       - flux suspend ks 0-{{.C}}-config && flux resume ks 0-{{.C}}-config
       - flux suspend ks 1-core-storage-rook-ceph-app && flux resume ks 1-core-storage-rook-ceph-app
-      - flux suspend ks 1-core-storage-rook-ceph-cluster && flux resume ks 1-core-storage-rook-ceph-cluster
-      # - task: osd-prepare-logs # uncomment to debug OSD prepare errors with while true loop, since the job re-runs 6 times and overwrites the previous runs' logs
\ No newline at end of file
+      - flux suspend ks 1-core-storage-rook-ceph-cluster-{{.C}} && flux resume ks 1-core-storage-rook-ceph-cluster-{{.C}}
+      # - task: osd-prepare-logs # uncomment to debug OSD prepare errors with while true loop, since the job re-runs 6 times and overwrites the previous runs' logs
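
Clearing `metadata.finalizers` is what lets Kubernetes finish garbage-collecting the stuck Ceph custom resources; a one-off version for a single resource might look like this (the CephCluster name `rook-ceph` is a placeholder):

    # see what is still blocking deletion, then drop the finalizers on one CephCluster
    kubectl get cephclusters.ceph.rook.io -n rook-ceph
    kubectl patch -n rook-ceph cephclusters.ceph.rook.io/rook-ceph \
      --type merge -p '{"metadata":{"finalizers": []}}'
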
diff --git a/.taskfiles/talos/Taskfile.dist.yaml b/.taskfiles/talos/Taskfile.dist.yaml
index 93bd7ac4..64478b3f 100644
--- a/.taskfiles/talos/Taskfile.dist.yaml
+++ b/.taskfiles/talos/Taskfile.dist.yaml
@@ -24,21 +24,6 @@ tasks:
       - talhelper genconfig -e ./talenv.sops.yaml
       - defer: rm -rf ./talenv.sops.yaml

-  cilium-bootstrap-apply:
-    aliases: [cilium]
-    desc: Bootstrap Cilium onto new cluster, ready for Flux to take over managing Cilium.
-    # dir: "/{{.ROOT_DIR}}/kube/clusters/{{.C}}/talos"
-    vars:
-      C: '{{ or .C (fail "Missing C environment variable for cluster!") }}'
-    cmds:
-      - kubectl delete configmap -n kube-system cilium-config || true
-      - kubectl delete daemonset -n kube-system cilium || true
-      - kubectl delete deployment -n kube-system cilium-operator || true
-      - task: k8s:sops-apply
-        vars:
-          C: '{{.C | default "biohazard"}}'
-          F: "/{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install"
-          FCMD: "kustomize build --enable-helm"

   bootstrap:
     aliases: [bs]
diff --git a/.taskfiles/volsync/Taskfile.dist.yaml b/.taskfiles/volsync/Taskfile.dist.yaml
index 0cb9b36f..32059475 100644
--- a/.taskfiles/volsync/Taskfile.dist.yaml
+++ b/.taskfiles/volsync/Taskfile.dist.yaml
@@ -1,18 +1,127 @@
 ---
 version: "3"

+vars:
+  TIMENOW:
+    sh: date +%Y%m%d-%H%M%S
+
+includes:
+  k8s:
+    internal: true
+    taskfile: ../k8s
+
 tasks:
   unlock:
     desc: Unlock Restic repo using env vars from cluster secrets.
     vars:
-      ns: '{{ or .ns (fail "Missing `ns` environment variable!") }}'
-      repo: '{{ or .repo (fail "Missing `repo` environment variable!") }}'
+      NS: &ns-fail '{{ or .NS (fail "Missing `NS` environment variable!") }}'
+      REPO: &restic-repo '{{ or .REPO (fail "Missing `REPO` environment variable!") }}'
     cmds:
-      - env $(kubectl get secrets -n {{.ns}} {{.repo}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic unlock
+      - env $(kubectl get secrets -n {{.NS}} {{.REPO}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic unlock --remove-all

   run:
     desc: Unlock Restic repo using env vars from cluster secrets.
     vars:
-      ns: '{{ or .ns (fail "Missing `ns` environment variable!") }}'
-      repo: '{{ or .repo (fail "Missing `repo` environment variable!") }}'
+      NS: *ns-fail
+      REPO: *restic-repo
     cmds:
-      - env $(kubectl get secrets -n {{.ns}} {{.repo}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic {{.CLI_ARGS}}
+      - env $(kubectl get secrets -n {{.NS}} {{.REPO}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic {{.CLI_ARGS}}
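
`unlock` and `run` reuse the in-cluster restic credentials Secret with a locally installed restic binary; roughly, assuming a hypothetical `minecraft-restic` Secret in the `minecraft` namespace and the `volsync:` include namespace:

    # turn the Secret's base64 data into KEY=VALUE pairs and hand them to restic
    env $(kubectl get secrets -n minecraft minecraft-restic -o yaml \
      | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') \
      restic snapshots
    # the same thing via the task wrapper
    task volsync:run NS=minecraft REPO=minecraft-restic -- snapshots
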
jsonpath="{.metadata.labels.app\.kubernetes\.io/name}" + CTRL: + sh: |- + ( [[ -n "{{.CTRL}}" && -n "{{.APP}}" ]] && echo "{{.CTRL}}/{{.APP}}" ) || ( [[ -z "{{.CTRL}}" ]] && kubectl get -n {{.NS}} deployments.apps/{{.APP}} 2>&1 2>/dev/null && echo "deployments.apps/{{.APP}}" || echo "statefulsets.apps/{{.APP}}" ) + RESTORE: + sh: |- + [[ -n "{{.PREV}}" ]] && echo "{{.PREV}}" || [[ -n "{{.ASOF}}" ]] && echo "{{.ASOF}}" || ( echo "Missing PREV or ASOF environment variable to select snapshot to restore!"; exit 1 ) + RSRCDEL: '{{ .RSRCDEL | default "true" }}' + RUID: + sh: |- + kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.runAsUser}' + RGID: + sh: |- + kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.runAsGroup}' + RFSG: + sh: |- + kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.fsGroup}' + RSRCYAML: + sh: |- + kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o yaml | yq 'del(.metadata["uid", "resourceVersion", "creationTimestamp", "generation"],.status,.metadata.annotations."kubectl.kubernetes.io/last-applied-configuration")' | tee ./rsrc.tmp.yaml + STSDEL: '{{ .STSDEL | default "false" }}' # delete StatefulSet PVC before recreating + STSREC: '{{ .STSREC | default "false" }}' # scale up and down StatefulSet to create PVC + env: # pass Task vars as env vars, can't anchor vars to env as values will be duplicated + NS: '{{.NS}}' + RSRC: '{{.RSRC}}' + REPO: '{{.REPO}}' + KS: '{{.KS}}' + PVC: '{{.PVC}}' + SC: '{{.SC}}' + APP: '{{.APP}}' + CTRL: '{{.CTRL}}' + RESTORE: '{{.RESTORE}}' + TIMENOW: '{{.TIMENOW}}' + RUID: '{{.RUID}}' + RGID: '{{.RGID}}' + RFSG: '{{.RFSG}}' + STSDEL: '{{.STSDEL}}' + STSREC: '{{.STSREC}}' # StatefulSet recreate PVC by scaling up and down # TODO: currently assumes 1 replica and 1 VCT PVC + # RSRCYAML: '{{.RSRCYAML}}' + cmds: + # - echo ${RDESTYAML} + # - echo ${RSRCYAML} + - |- # if true, delete StatefulSet PVC + [[ "{{.STSDEL}}" == "true" ]] && kubectl delete pvc -n {{.NS}} {{.PVC}} + - |- # if not false, scale up and down StatefulSet to recreate PVC + [[ "{{.STSREC}}" != "false" ]] && kubectl patch -n {{.NS}} {{.CTRL}} --type merge -p '{"spec":{"template":{"spec":{"nodeSelector":{"recreate-sts-pvc.volsync.home.arpa":"$TIMENOW"}}}}}' && kubectl scale -n {{.NS}} {{.CTRL}} --replicas 1 && kubectl scale -n {{.NS}} {{.CTRL}} --replicas 0 && until kubectl wait --for=jsonpath='{.status.phase}'=Bound -n {{.NS}} pvc/{{.PVC}}; do sleep 1; done && kubectl patch -n {{.NS}} {{.CTRL}} --type=json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/recreate-sts-pvc.volsync.home.arpa"}]' + - echo "{{.RSRCYAML}}" + - echo ${RSRC} ${NS} ${REPO} ${KS} ${PVC} ${SC} ${APP} ${CTRL} ${RESTORE} + - |- + [[ "{{.RSRCDEL}}" != "false" ]] && echo "{{.RSRCYAML}}" | yq | kubectl delete -f - + # [[ "{{.RSRCDEL}}" != "false" ]] && kubectl delete -f ./rsrc.tmp.yaml + - task: k8s:scale-to-0 + vars: + <<: *restore-vars + REASON: 'volsync-restore-{{.RSRC}}-{{.PVC}}-{{.TIMENOW}}' + - cat ./wipe-pvc.tmpl.yaml | envsubst | kubectl apply -f - + - task: k8s:wait-finish + vars: + NAME: 'wipe-{{.PVC}}-{{.TIMENOW}}' + NS: '{{.NS}}' + - |- + cat ./ReplicationDestination.tmpl.yaml | envsubst | ( [[ -n "{{.PREV}}" ]] && yq '.spec.restic.previous = {{.PREV}}' || [[ -n "{{.ASOF}}" ]] && yq '.spec.restic.restoreAsOf = "{{.ASOF}}"' || exit 1 ) | kubectl apply -f - + - task: k8s:wait-finish + vars: + NAME: 'volsync-dst-{{.PVC}}-{{.TIMENOW}}' + NS: 
diff --git a/.taskfiles/volsync/template/ReplicationDestination.tmpl.yaml b/.taskfiles/volsync/template/ReplicationDestination.tmpl.yaml
new file mode 100644
index 00000000..aeee5002
--- /dev/null
+++ b/.taskfiles/volsync/template/ReplicationDestination.tmpl.yaml
@@ -0,0 +1,19 @@
+---
+apiVersion: volsync.backube/v1alpha1
+kind: ReplicationDestination
+metadata:
+  name: "${PVC}-${TIMENOW}"
+  namespace: "${NS}"
+spec:
+  trigger:
+    manual: "restore-once-${TIMENOW}"
+  restic:
+    repository: "${REPO}"
+    destinationPVC: "${PVC}"
+    copyMethod: Direct
+    storageClassName: "${SC}"
+    moverSecurityContext:
+      runAsUser: ${RUID}
+      runAsGroup: ${RGID}
+      fsGroup: ${RFSG}
+
\ No newline at end of file
diff --git a/.taskfiles/volsync/template/rsrc.tmp.yaml b/.taskfiles/volsync/template/rsrc.tmp.yaml
new file mode 100644
index 00000000..c4414266
--- /dev/null
+++ b/.taskfiles/volsync/template/rsrc.tmp.yaml
@@ -0,0 +1,27 @@
+apiVersion: volsync.backube/v1alpha1
+kind: ReplicationSource
+metadata:
+  labels:
+    kustomize.toolkit.fluxcd.io/name: minecraft-app
+    kustomize.toolkit.fluxcd.io/namespace: flux-system
+  name: minecraft-restic
+  namespace: minecraft
+spec:
+  restic:
+    cacheCapacity: 2Gi
+    copyMethod: Snapshot
+    moverSecurityContext:
+      fsGroup: 25565
+      runAsGroup: 25565
+      runAsUser: 25565
+    pruneIntervalDays: 14
+    repository: minecraft-restic
+    retain:
+      daily: 14
+      within: 7d
+    storageClassName: block
+    volumeSnapshotClassName: block
+  sourcePVC: data-minecraft-0
+  trigger:
+    manual: before-move-to-bare-metal
+    schedule: 0 6 * * *
diff --git a/.taskfiles/volsync/template/wipe-pvc.tmpl.yaml b/.taskfiles/volsync/template/wipe-pvc.tmpl.yaml
new file mode 100644
index 00000000..0fc21929
--- /dev/null
+++ b/.taskfiles/volsync/template/wipe-pvc.tmpl.yaml
@@ -0,0 +1,26 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "wipe-${PVC}-${TIMENOW}"
+  namespace: "${NS}"
+spec:
+  ttlSecondsAfterFinished: 3600
+  template:
+    spec:
+      automountServiceAccountToken: false
+      restartPolicy: OnFailure
+      containers:
+        - name: wipe
+          image: public.ecr.aws/docker/library/busybox:latest
+          command: ["/bin/sh", "-c", "cd /config; find . -delete"]
+          volumeMounts:
+            - name: config
+              mountPath: /config
+          securityContext:
+            runAsUser: 0
+            runAsGroup: 0
+      volumes:
+        - name: config
+          persistentVolumeClaim:
+            claimName: "${PVC}"
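
Both `.tmpl.yaml` templates are rendered with `envsubst` from the `env:` block of the restore task; a minimal sketch of rendering and applying the wipe Job by hand, with placeholder values:

    # render and apply the PVC wipe Job for a single PVC, then wait for it to finish
    export NS=minecraft PVC=data-minecraft-0 TIMENOW=$(date +%Y%m%d-%H%M%S)
    envsubst < .taskfiles/volsync/template/wipe-pvc.tmpl.yaml | kubectl apply -f -
    kubectl -n "$NS" wait "job/wipe-${PVC}-${TIMENOW}" --for condition=complete --timeout=5m
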