feat(tasks): VolSync & Crunchy restore, others

I haven't committed my Taskfiles in so long that I honestly don't remember
what I mainly changed in them, and I just assume they all still work lol
This commit is contained in:
JJGadgets
2023-11-12 03:53:34 +08:00
parent a027a1e405
commit e811310bc4
12 changed files with 414 additions and 66 deletions

View File

@@ -1,4 +1,5 @@
---
# Tasks related to multi-cluster or cluster-level management, e.g. bootstrap
version: "3"
includes:

View File

@@ -1,16 +0,0 @@
---
version: "3"
vars:
PG: '{{.PG | default "pg-default"}}'
NS: '{{.NS | default "pg"}}'
tasks:
rw:
desc: Get current CloudNativePG master (writable) instance to run command against.
vars:
CLUSTER: '{{ or .PG (fail "Missing `PG` environment variable!") }}'
CLUSTER: '{{ or .NS (fail "Missing `NS` environment variable!") }}'
dir: '{{.USER_WORKING_DIR}}'
cmds:
- kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% kubectl -n {{.NS}} {{.CLI_ARGS}}

View File

@@ -1,6 +1,9 @@
---
version: "3"
# vars:
# DEFAULTNS: '{{.NS | default "flux-system"}}'
tasks:
install:
cmds:
@@ -10,3 +13,12 @@ tasks:
aliases: [ga, fga, getall]
cmds:
- viddy -n1 flux get all -A || watch -n1 flux get all -A
cantWait:
aliases: [cw]
vars:
NS: '{{ .NS | default "flux-system" }}'
KS: '{{ or .KS (fail "Missing `KS` environment variable!") }}'
cmds:
- kubectl patch ks -n {{.NS}} {{.KS}} --type merge -p '{"spec":{"dependsOn":[]}}'
- kubectl patch ks -n {{.NS}} {{.KS}} --type merge -p '{"spec":{"wait":false}}'
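For reference, a minimal usage sketch of the `cantWait` task, assuming this Taskfile is included under a `flux:` namespace and that a Kustomization named `myapp-app` (a hypothetical name) exists in flux-system:

task flux:cw KS=myapp-app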

View File

@@ -0,0 +1,7 @@
---
# apiVersion: kustomize.toolkit.fluxcd.io/v1
# kind: Kustomization
# # metadata:
# # name: not-used
spec:
dependsOn: []

View File

@@ -1,8 +1,9 @@
---
# General Kubernetes admin tasks
version: "3"
vars:
C: '{{ .C | default "biohazard" }}'
# C: '{{ .C | default "admin@biohazard" }}'
TIMENOW:
sh: date +%Y%m%d-%H%M%S
@@ -11,7 +12,7 @@ tasks:
aliases: [nsps]
desc: Label a given namespace with Pod Security labels in a while-true loop, for when an external source creates the namespace (e.g. Azure Arc's Helm apply with `--create-namespace`)
vars:
NS: '{{ or .NS (fail "Missing `NS` environment variable!") }}'
NS: &ns-fail '{{ or .NS (fail "Missing `NS` environment variable!") }}'
PS: '{{ or .PS (fail "Missing `PS` environment variable!") }}'
cmds:
- while true; do kubectl label namespaces {{.NS}} "pod-security.kubernetes.io/enforce={{.PS}}" >/dev/null 2>/dev/null || true; kubectl label namespaces {{.NS}} "pod-security.kubernetes.io/enforce-version=latest" >/dev/null 2>/dev/null || true; done
@@ -21,32 +22,42 @@ tasks:
internal: true
desc: Wait for a job's pod to change its status to pending
vars:
NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: '{{ .NS | default "default" }}'
NAME: &name-fail '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: *ns-fail
cmds:
- until [[ $(kubectl -n "{{.NS}}" get pod "{{.NAME}}" -o jsonpath='{.status.phase}') == "Pending" ]]; do sleep 1; done
wait-pod-running:
aliases: [waitp]
aliases: [waitr]
internal: true
desc: Wait for a job's pod to change its status to Running
vars:
NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: '{{ .NS | default "default" }}'
NAME: *name-fail
NS: *ns-fail
cmds:
- until [[ $(kubectl -n "{{.NS}}" get pod "{{.NAME}}" -o jsonpath='{.status.phase}') == "Running" ]]; do sleep 1; done
wait-pod-delete:
aliases: [waitd]
internal: true
desc: Wait for a job's pod to be deleted
vars:
NAME: *name-fail
NS: *ns-fail
cmds:
- until kubectl wait pod -n {{.NS}} {{.NAME}} --for delete --timeout=1m; do sleep 1; done
wait-finish:
internal: true
desc: Wait for a job (or other TYPE) to reach condition=complete, tailing its logs while waiting
vars:
NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: '{{ .NS | default "default" }}'
NAME: *name-fail
NS: *ns-fail
TYPE: '{{ .TYPE | default "job" }}'
# WAIT_ARGS: '{{.WAIT_ARGS | default "echo \"{{.NAME}} is still running, logs:\" && kubectl -n {{.NS}} logs {{.NAME}} --since 2s -f;"}}'
cmds:
- |-
until kubectl -n {{.NS}} wait {{.TYPE}}/{{.NAME}} --for condition=complete --timeout=2s; do
until kubectl -n {{.NS}} wait {{.TYPE}}/{{.NAME}} --for condition=complete --timeout=2s; do
echo "{{.NAME}} is still running, logs:" && kubectl -n {{.NS}} logs {{.TYPE}}/{{.NAME}} --since 2s -f || true;
done
@@ -68,13 +79,33 @@ tasks:
SECRETS_FILE: "{{.ROOT_DIR}}/kube/clusters/{{.C}}/config/secrets.sops.env"
VARS_FILE: "{{.ROOT_DIR}}/kube/clusters/{{.C}}/config/vars.sops.env"
FCMD: '{{ .FCMD | default "cat" }}'
C: '{{.C | default "admin@biohazard"}}'
cmds:
#- sops exec-env {{.SECRETS_FILE}} "sops exec-env {{.VARS_FILE}} \"{{.FCMD}} {{.F}} | envsubst | kubectl apply --context {{.C}} -f -\""
- sops exec-env {{.SECRETS_FILE}} "sops exec-env {{.VARS_FILE}} \"{{.FCMD}} {{.F}} | envsubst | kubectl apply -f -\""
cilium-bootstrap-apply:
aliases: [cilium]
desc: Bootstrap Cilium onto new cluster, ready for Flux to take over managing Cilium.
# dir: "/{{.ROOT_DIR}}/kube/clusters/{{.C}}/talos"
vars:
C: '{{ or .C (fail "Missing C environment variable for cluster!") }}'
cmds:
- kubectl delete configmap -n kube-system cilium-config || true
- kubectl delete daemonset -n kube-system cilium || true
- kubectl delete deployment -n kube-system cilium-operator || true
- cp /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/clusters/{{.C}}/helm-values.yaml /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
- task: sops-apply
vars:
C: '{{.C | default "admin@biohazard"}}'
F: "/{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install"
FCMD: "kustomize build --enable-helm"
- defer: rm -rf /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
newapp:
desc: Copy app folder structure template, substitute APPNAME, and (TODO) prompt user for variables values such as DNS, UID etc.
vars:
app: '{{ or .app (fail "Missing `app` variable!") }}'
APP: &app-fail '{{ or .APP (fail "Missing `APP` variable!") }}'
cmds:
- cp -r ./kube/templates/test ./kube/deploy/apps/{{.app}}
# lowercase, used for resource names etc
@@ -123,15 +154,75 @@ tasks:
export KBENCH=$(curl -sL https://raw.githubusercontent.com/yasker/kbench/main/deploy/fio.yaml)
[[ ! -z "{{.SC}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").spec.storageClassName = "{{.SC}}"')
[[ ! -z "{{.NODE}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").spec.template.spec.nodeSelector."kubernetes.io/hostname" = "{{.NODE}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | (select(.kind == "Job").spec.template.spec.volumes.[] | select(.name == "vol").persistentVolumeClaim.claimName) |= "kbench-{{- .TIMENOW -}}"')
printf '%s\n' "${KBENCH}" | kubectl delete -n {{.NS}} -f - || true
- |-
export KBENCH=$(curl -sL https://raw.githubusercontent.com/yasker/kbench/main/deploy/fio.yaml)
[[ ! -z "{{.SC}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").spec.storageClassName = "{{.SC}}"')
[[ ! -z "{{.NODE}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").spec.template.spec.nodeSelector."kubernetes.io/hostname" = "{{.NODE}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | (select(.kind == "Job").spec.template.spec.volumes.[] | select(.name == "vol").persistentVolumeClaim.claimName) |= "kbench-{{- .TIMENOW -}}"')
printf '%s\n' "${KBENCH}" | kubectl apply -n {{.NS}} -f -
- defer: *kbench-delete
- task: wait-finish
vars:
NS: '{{ .NS | default "default" }}'
NAME: "kbench"
NAME: "kbench-{{- .TIMENOW -}}"
TYPE: "job"
scale-to-0:
aliases: ["0"]
desc: Scale given workloads to 0
vars: &scale-vars
KS:
sh: |-
[[ -z "{{.KS}}" ]] && echo "{{.APP}}-app" || echo "{{.KS}}"
KSNS: '{{ .KSNS | default "flux-system" }}'
HR: '{{ .HR | default .APP }}'
HRNS:
sh: |-
[[ -n "{{.HRNS}}" ]] && echo "{{.HRNS}}" && exit || [[ -n "{{.NS}}" ]] && echo "{{.NS}}" && exit || echo "{{.APP}}"
APP: *app-fail
NS: '{{ .NS | default .APP }}'
REASON: '{{ .REASON | default .TIMENOW }}'
cmds:
- |- # annotate Flux Kustomization
[[ {{.KS}} != "false" ]] && kubectl annotate --overwrite kustomizations.kustomize.toolkit.fluxcd.io -n {{.KSNS}} {{.KS}} scaledown.home.arpa/reason={{.REASON}} || true
- |- # suspend Flux Kustomization
[[ {{.KS}} != "false" ]] && flux suspend kustomization -n {{.KSNS}} {{.KS}} || true
- |- # annotate Flux HelmRelease
[[ {{.HR}} != "false" ]] && kubectl annotate --overwrite helmreleases.helm.toolkit.fluxcd.io -n {{.HRNS}} {{.HR}} scaledown.home.arpa/reason={{.REASON}} || true
- |- # suspend Flux HelmRelease
[[ {{.HR}} != "false" ]] && flux suspend helmrelease -n {{.HRNS}} {{.HR}} || true
- |- # annotate Kubernetes controller with reason
kubectl annotate --overwrite deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/reason={{.REASON}} || kubectl annotate --overwrite statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/reason={{.REASON}}
- |- # annotate Kubernetes controller with old replica count
kubectl annotate --overwrite deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas=$(kubectl get deployment -n {{.NS}} {{.APP}} -o jsonpath='{.spec.replicas}') || kubectl annotate --overwrite statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas=$(kubectl get statefulset -n {{.NS}} {{.APP}} -o jsonpath='{.spec.replicas}')
- |- # scale down Kubernetes controller
kubectl scale deployment -n {{.NS}} {{.APP}} --replicas 0 || kubectl scale statefulset -n {{.NS}} {{.APP}} --replicas 0
- task: wait-pod-delete
vars:
NAME: '{{.APP}}'
NS: '{{.NS}}'
scale-back-up:
aliases: ["up"]
desc: Scale given workloads back up to their previous replica count and resume Flux reconciliation
vars: *scale-vars
cmds:
- |- # scale up Kubernetes controller back to old replica count
kubectl scale deployment -n {{.NS}} {{.APP}} --replicas "$(kubectl get deployment -n {{.NS}} {{.APP}} -o jsonpath='{.metadata.annotations.scaledown\.home\.arpa/replicas}')" || kubectl scale statefulset -n {{.NS}} {{.APP}} --replicas "$(kubectl get statefulset -n {{.NS}} {{.APP}} -o jsonpath='{.metadata.annotations.scaledown\.home\.arpa/replicas}')"
- |- # remove old replica count annotation from Kubernetes controller
kubectl annotate deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas- || kubectl annotate statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas-
- |- # remove reason annotation from Kubernetes controller
kubectl annotate deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/reason- || kubectl annotate statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/reason-
- |- # resume Flux HelmRelease
[[ {{.HR}} != "false" ]] && flux resume helmrelease -n {{.HRNS}} {{.HR}} || true
- |- # remove reason annotation from Flux HelmRelease
[[ {{.HR}} != "false" ]] && kubectl annotate helmreleases.helm.toolkit.fluxcd.io -n {{.HRNS}} {{.HR}} scaledown.home.arpa/reason- || true
- |- # resume Flux Kustomization
[[ {{.KS}} != "false" ]] && flux resume kustomization -n {{.KSNS}} {{.KS}} || true
- |- # remove reason annotation from Flux Kustomization
[[ {{.KS}} != "false" ]] && kubectl annotate kustomizations.kustomize.toolkit.fluxcd.io -n {{.KSNS}} {{.KS}} scaledown.home.arpa/reason- || true
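A hypothetical round trip with these two tasks, assuming the `k8s:` include namespace and an app named `grafana` whose namespace, HelmRelease, and Kustomization all follow the defaults derived from APP:

task k8s:0 APP=grafana REASON=pvc-maintenance
task k8s:up APP=grafana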

View File

@@ -0,0 +1,63 @@
---
version: "3"
# vars:
# PG: '{{.PG | default "default"}}'
# NS: '{{.NS | default "pg"}}'
tasks:
rw:
desc: Get current CloudNativePG master (writable) instance to run command against.
dir: "{{.USER_WORKING_DIR}}"
vars:
APP: "{{.APP}}"
PG: &pg-default
sh: |-
[[ -n "{{.PG}}" ]] && echo "{{.PG}}" || [[ -n "{{.APP}}" ]] && echo "pg-{{.APP}}" || echo "pg-default"
NS: &ns-default
sh: |-
[[ -n "{{.NS}}" ]] && echo "{{.NS}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || echo "pg"
# PG: '{{ or .PG (fail "Missing `PG` environment variable!") }}'
# NS: &ns-default '{{.NS | default "pg"}}'
cmds:
- kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% kubectl -n {{.NS}} {{.CLI_ARGS}}
adminer:
desc: Use the kubectl netshoot krew plugin to attach adminer as a debug sidecar to the current CNPG primary pod, then port-forward it locally.
dir: "{{.USER_WORKING_DIR}}"
vars:
APP: "{{.APP}}"
PG: *pg-default
NS: *ns-default
PF_ADDR: '{{ .PF_ADDR | default "127.0.0.1" }}'
PF_PORT: '{{ .PF_PORT | default "8080" }}'
cmds:
- |
kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% bash -c "while kubectl netshoot debug -n {{.NS}} % --image-name adminer --image-tag 4.8.1-standalone; do kubectl port-forward -n {{.NS}} % --address {{.PF_ADDR}} {{.PF_PORT}}:8080; break; done"
crunchy-restore:
desc: Restore a pg_dump to a CrunchyData Postgres cluster, writing to the master instance.
dir: "{{.USER_WORKING_DIR}}"
vars:
APP: "{{.APP}}"
PG: *pg-default
NS: *ns-default
DUMP: '{{ or .DUMP (fail "Missing `DUMP` environment variable for specifying pg_dump file location!") }}'
CP_DIR: '{{ .CP_DIR | default "/pgdata" }}' # would've been /tmp or /run if Crunchy replica container didn't crash and restart when files are copied there for some reason, maybe small tmpfs size?
DBNAME:
sh: |-
[[ -n "{{.DBNAME}}" ]] && echo "{{.DBNAME}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing DBNAME (or APP) environment variable for specifying database name!" && exit 1 )
DBUSER:
sh: |-
[[ -n "{{.DBUSER}}" ]] && echo "{{.DBUSER}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing DBUSER (or APP) environment variable for specifying user name!" && exit 1 )
ARGS: '{{ .ARGS | default "--verbose --verbose --format=c --clean --if-exists --no-owner --role $DBUSER" }}' # TODO: personal defaults, please check and change accordingly if copying this task before running it!
CRUNCHY_PRIMARY:
sh: |-
kubectl get pods -n {{.NS}} -l postgres-operator.crunchydata.com/role=master,postgres-operator.crunchydata.com/cluster={{.PG}} -o jsonpath='{.items[0].metadata.name}'
env:
DBUSER: '{{.DBUSER}}'
cmds:
- kubectl cp --container database {{.DUMP}} {{.NS}}/{{.CRUNCHY_PRIMARY}}:{{.CP_DIR}}/restore-dump.psql
- kubectl exec -it --container database -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c 'echo "ALTER DATABASE {{.DBNAME}} OWNER TO {{.DBUSER}};" | psql'
- kubectl exec -it --container database -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c "pg_restore --dbname {{.DBNAME}} {{.ARGS}} {{.CP_DIR}}/restore-dump.psql"
- defer: kubectl exec -it -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c 'rm -rf {{.CP_DIR}}/restore-dump.psql'
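A sketch of a restore run, assuming a `pg:` include namespace and a hypothetical app `jellyfin` (which the defaults above resolve to Crunchy cluster `pg-jellyfin` in namespace `jellyfin`):

task pg:crunchy-restore APP=jellyfin DUMP=./jellyfin.dump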

View File

@@ -71,10 +71,10 @@ tasks:
cmds:
# TODO: mimic deviceFilter
- envsubst < <(cat {{.JOB_TEMPLATE}}) | kubectl apply -f -
- |-
- |-
kubectl -n kube-system logs job/{{.JOB_NAME}} -f || true;
until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=2s; do
echo "Job {{.JOB_NAME}} is still running, logs:" &&
until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=2s; do
echo "Job {{.JOB_NAME}} is still running, logs:" &&
kubectl -n kube-system logs job/{{.JOB_NAME}} -f || true;
done;
# - kubectl -n kube-system logs job/{{.JOB_NAME}}
@@ -122,6 +122,19 @@ tasks:
vars:
NODE: '{{.NODE}}'
wipe-nodes-biohazard:
desc: Wipe all nodes in cluster "biohazard"
dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
cmds:
- task: wipe-node
vars:
NODE: "ange"
CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202L91P6HGN"
- task: wipe-node
vars:
NODE: "charlotte"
CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN" # I swear I'll forget to update this here when I have to change the disk
wipe-nodes-nuclear:
desc: Wipe all nodes in cluster "nuclear"
dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
@@ -135,6 +148,23 @@ tasks:
NODE: "humming"
CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202L91P6HGN"
force-delete-cluster:
desc: |-
Sometimes Rook fails to apply the cluster and it needs to be deleted before it ever finishes setting up; this task force-removes all finalizers so that the unready Ceph resources can be deleted.
dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
vars:
C: '{{ or .C (fail "Missing `C` environment variable for cluster!") }}'
cmds:
- helm uninstall -n rook-ceph rook-ceph-cluster-{{.C}} && true || true
- |-
for CRD in $(kubectl get crd -n rook-ceph | awk '/ceph.rook.io/ {print $1}'); do
kubectl get -n rook-ceph "$CRD" -o name | \
xargs -I {} kubectl patch -n rook-ceph {} --type merge -p '{"metadata":{"finalizers": []}}' && true || true
done
- |-
kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}' && true || true
kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}' && true || true
reinstall:
desc: |-
For when Rook refuses to create any OSDs at all
@@ -149,23 +179,17 @@ tasks:
C: '{{.C}}'
- flux suspend ks 0-{{.C}}-config
- flux suspend ks 1-core-storage-rook-ceph-app
- flux suspend ks 1-core-storage-rook-ceph-cluster
- helm uninstall -n rook-ceph rook-ceph-cluster && true || true
- flux delete hr -n rook-ceph rook-ceph-cluster --silent && true || true
- |-
for CRD in $(kubectl get crd -n rook-ceph | awk '/ceph.rook.io/ {print $1}'); do
kubectl get -n rook-ceph "$CRD" -o name | \
xargs -I {} kubectl patch -n rook-ceph {} --type merge -p '{"metadata":{"finalizers": []}}' && true || true
done
- |-
kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}' && true || true
kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}' && true || true
- flux suspend ks 1-core-storage-rook-ceph-cluster-{{.C}}
- task: force-delete-cluster
vars:
C: '{{.C}}'
# - flux delete hr -n rook-ceph rook-ceph-cluster-{{.C}} --silent && true || true
- helm uninstall -n rook-ceph rook-ceph && true || true
- flux delete hr -n rook-ceph rook-ceph --silent && true || true
# - flux delete hr -n rook-ceph rook-ceph --silent && true || true
- kubectl get namespaces rook-ceph && until kubectl delete namespaces rook-ceph; do kubectl get namespaces rook-ceph -o jsonpath="{.status}"; done || true
- task: wipe-nodes-{{.C}}
# - task: wipe-nodes-{{.C}} # run again in case wipe fails
- flux suspend ks 0-{{.C}}-config && flux resume ks 0-{{.C}}-config
- flux suspend ks 1-core-storage-rook-ceph-app && flux resume ks 1-core-storage-rook-ceph-app
- flux suspend ks 1-core-storage-rook-ceph-cluster && flux resume ks 1-core-storage-rook-ceph-cluster
# - task: osd-prepare-logs # uncomment to debug OSD prepare errors with while true loop, since the job re-runs 6 times and overwrites the previous runs' logs
- flux suspend ks 1-core-storage-rook-ceph-cluster-{{.C}} && flux resume ks 1-core-storage-rook-ceph-cluster-{{.C}}
# - task: osd-prepare-logs # uncomment to debug OSD prepare errors with while true loop, since the job re-runs 6 times and overwrites the previous runs' logs
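For reference, a full teardown-and-reinstall against the `biohazard` cluster might be invoked like this, assuming a `rook:` include namespace:

task rook:reinstall C=biohazard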

View File

@@ -24,21 +24,6 @@ tasks:
- talhelper genconfig -e ./talenv.sops.yaml
- defer: rm -rf ./talenv.sops.yaml
cilium-bootstrap-apply:
aliases: [cilium]
desc: Bootstrap Cilium onto new cluster, ready for Flux to take over managing Cilium.
# dir: "/{{.ROOT_DIR}}/kube/clusters/{{.C}}/talos"
vars:
C: '{{ or .C (fail "Missing C environment variable for cluster!") }}'
cmds:
- kubectl delete configmap -n kube-system cilium-config || true
- kubectl delete daemonset -n kube-system cilium || true
- kubectl delete deployment -n kube-system cilium-operator || true
- task: k8s:sops-apply
vars:
C: '{{.C | default "biohazard"}}'
F: "/{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install"
FCMD: "kustomize build --enable-helm"
bootstrap:
aliases: [bs]

View File

@@ -1,18 +1,127 @@
---
version: "3"
vars:
TIMENOW:
sh: date +%Y%m%d-%H%M%S
includes:
k8s:
internal: true
taskfile: ../k8s
tasks:
unlock:
desc: Unlock Restic repo using env vars from cluster secrets.
vars:
ns: '{{ or .ns (fail "Missing `ns` environment variable!") }}'
repo: '{{ or .repo (fail "Missing `repo` environment variable!") }}'
NS: &ns-fail '{{ or .NS (fail "Missing `NS` environment variable!") }}'
REPO: &restic-repo '{{ or .REPO (fail "Missing `REPO` environment variable!") }}'
cmds:
- env $(kubectl get secrets -n {{.ns}} {{.repo}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic unlock
- env $(kubectl get secrets -n {{.NS}} {{.REPO}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic unlock --remove-all
run:
desc: Run an arbitrary Restic command (passed via CLI_ARGS after --) against the repo, using env vars from cluster secrets.
vars:
ns: '{{ or .ns (fail "Missing `ns` environment variable!") }}'
repo: '{{ or .repo (fail "Missing `repo` environment variable!") }}'
NS: *ns-fail
REPO: *restic-repo
cmds:
- env $(kubectl get secrets -n {{.ns}} {{.repo}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic {{.CLI_ARGS}}
- env $(kubectl get secrets -n {{.NS}} {{.REPO}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic {{.CLI_ARGS}}
restore:
desc: Restore VolSync data into PVCs
dir: "/{{.ROOT_DIR}}/.taskfiles/volsync/template"
vars: &restore-vars
# APP: '{{ or .APP (fail "Missing `APP` environment variable!") }}'
NS:
sh: |-
[[ -n "{{.RSRC}}" ]] && echo "{{.RSRC}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing NS or APP environment variable for Namespace!"; exit 1 )
# RSRC: '{{ or .RSRC (fail "Missing `RSRC` environment variable!") }}'
RSRC:
sh: |-
[[ -n "{{.RSRC}}" ]] && echo "{{.RSRC}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}-restic" || ( echo "Missing RSRC or APP environment variable for VolSync ReplicationSource!"; exit 1 )
REPO:
sh: |-
[[ -n "{{.REPO}}" ]] && echo "{{.REPO}}" || kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.repository}'
KS:
sh: |-
[[ -n "{{.KS}}" ]] && echo "{{.KS}}" || kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.metadata.labels.kustomize\.toolkit\.fluxcd\.io/name}'
PVC:
sh: |-
[[ -n "{{.PVC}}" ]] && echo "{{.PVC}}" || kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.sourcePVC}'
SC:
sh: |-
[[ -n "{{.SC}}" ]] && echo "{{.SC}}" || kubectl get pvc -n {{.NS}} {{.PVC}} -o jsonpath='{.spec.storageClassName}'
APP:
sh: |-
[[ -n "{{.APP}}" ]] && echo "{{.APP}}" || kubectl get pvc -n {{.NS}} {{.PVC}} -o jsonpath="{.metadata.labels.app\.kubernetes\.io/name}"
CTRL:
sh: |-
( [[ -n "{{.CTRL}}" && -n "{{.APP}}" ]] && echo "{{.CTRL}}/{{.APP}}" ) || ( [[ -z "{{.CTRL}}" ]] && kubectl get -n {{.NS}} deployments.apps/{{.APP}} 2>&1 2>/dev/null && echo "deployments.apps/{{.APP}}" || echo "statefulsets.apps/{{.APP}}" )
RESTORE:
sh: |-
[[ -n "{{.PREV}}" ]] && echo "{{.PREV}}" || [[ -n "{{.ASOF}}" ]] && echo "{{.ASOF}}" || ( echo "Missing PREV or ASOF environment variable to select snapshot to restore!"; exit 1 )
RSRCDEL: '{{ .RSRCDEL | default "true" }}'
RUID:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.runAsUser}'
RGID:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.runAsGroup}'
RFSG:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.fsGroup}'
RSRCYAML:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o yaml | yq 'del(.metadata["uid", "resourceVersion", "creationTimestamp", "generation"],.status,.metadata.annotations."kubectl.kubernetes.io/last-applied-configuration")' | tee ./rsrc.tmp.yaml
STSDEL: '{{ .STSDEL | default "false" }}' # delete StatefulSet PVC before recreating
STSREC: '{{ .STSREC | default "false" }}' # scale up and down StatefulSet to create PVC
env: # pass Task vars as env vars, can't anchor vars to env as values will be duplicated
NS: '{{.NS}}'
RSRC: '{{.RSRC}}'
REPO: '{{.REPO}}'
KS: '{{.KS}}'
PVC: '{{.PVC}}'
SC: '{{.SC}}'
APP: '{{.APP}}'
CTRL: '{{.CTRL}}'
RESTORE: '{{.RESTORE}}'
TIMENOW: '{{.TIMENOW}}'
RUID: '{{.RUID}}'
RGID: '{{.RGID}}'
RFSG: '{{.RFSG}}'
STSDEL: '{{.STSDEL}}'
STSREC: '{{.STSREC}}' # StatefulSet recreate PVC by scaling up and down # TODO: currently assumes 1 replica and 1 VCT PVC
# RSRCYAML: '{{.RSRCYAML}}'
cmds:
# - echo ${RDESTYAML}
# - echo ${RSRCYAML}
- |- # if true, delete StatefulSet PVC
[[ "{{.STSDEL}}" == "true" ]] && kubectl delete pvc -n {{.NS}} {{.PVC}} || true
- |- # if not false, scale up and down StatefulSet to recreate PVC
[[ "{{.STSREC}}" != "false" ]] && kubectl patch -n {{.NS}} {{.CTRL}} --type merge -p '{"spec":{"template":{"spec":{"nodeSelector":{"recreate-sts-pvc.volsync.home.arpa":"{{.TIMENOW}}"}}}}}' && kubectl scale -n {{.NS}} {{.CTRL}} --replicas 1 && kubectl scale -n {{.NS}} {{.CTRL}} --replicas 0 && until kubectl wait --for=jsonpath='{.status.phase}'=Bound -n {{.NS}} pvc/{{.PVC}}; do sleep 1; done && kubectl patch -n {{.NS}} {{.CTRL}} --type=json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/recreate-sts-pvc.volsync.home.arpa"}]' || true
- echo "{{.RSRCYAML}}"
- echo ${RSRC} ${NS} ${REPO} ${KS} ${PVC} ${SC} ${APP} ${CTRL} ${RESTORE}
- |-
[[ "{{.RSRCDEL}}" != "false" ]] && echo "{{.RSRCYAML}}" | yq | kubectl delete -f -
# [[ "{{.RSRCDEL}}" != "false" ]] && kubectl delete -f ./rsrc.tmp.yaml
- task: k8s:scale-to-0
vars:
<<: *restore-vars
REASON: 'volsync-restore-{{.RSRC}}-{{.PVC}}-{{.TIMENOW}}'
- cat ./wipe-pvc.tmpl.yaml | envsubst | kubectl apply -f -
- task: k8s:wait-finish
vars:
NAME: 'wipe-{{.PVC}}-{{.TIMENOW}}'
NS: '{{.NS}}'
- |-
cat ./ReplicationDestination.tmpl.yaml | envsubst | ( [[ -n "{{.PREV}}" ]] && yq '.spec.restic.previous = {{.PREV}}' || [[ -n "{{.ASOF}}" ]] && yq '.spec.restic.restoreAsOf = "{{.ASOF}}"' || exit 1 ) | kubectl apply -f -
- task: k8s:wait-finish
vars:
NAME: 'volsync-dst-{{.PVC}}-{{.TIMENOW}}'
NS: '{{.NS}}'
- cat ./ReplicationDestination.tmpl.yaml | envsubst | kubectl delete -f -
- task: k8s:scale-back-up
vars:
<<: *restore-vars
- defer: |-
[[ "{{.RSRCDEL}}" != "false" ]] && echo "{{.RSRCYAML}}" | yq | kubectl apply -f -
# [[ "{{.RSRCDEL}}" != "false" ]] && kubectl apply -f ./rsrc.tmp.yaml
- defer: rm ./rsrc.tmp.yaml
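A sketch of restoring the `minecraft` PVC from the example ReplicationSource below, assuming a `volsync:` include namespace and an arbitrary restoreAsOf timestamp; with APP=minecraft the task resolves NS, RSRC, REPO, PVC, and the controller from the live ReplicationSource:

task volsync:restore APP=minecraft ASOF=2023-11-11T00:00:00+08:00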

View File

@@ -0,0 +1,19 @@
---
apiVersion: volsync.backube/v1alpha1
kind: ReplicationDestination
metadata:
name: "${PVC}-${TIMENOW}"
namespace: "${NS}"
spec:
trigger:
manual: "restore-once-${TIMENOW}"
restic:
repository: "${REPO}"
destinationPVC: "${PVC}"
copyMethod: Direct
storageClassName: "${SC}"
moverSecurityContext:
runAsUser: ${RUID}
runAsGroup: ${RGID}
fsGroup: ${RFSG}

View File

@@ -0,0 +1,27 @@
apiVersion: volsync.backube/v1alpha1
kind: ReplicationSource
metadata:
labels:
kustomize.toolkit.fluxcd.io/name: minecraft-app
kustomize.toolkit.fluxcd.io/namespace: flux-system
name: minecraft-restic
namespace: minecraft
spec:
restic:
cacheCapacity: 2Gi
copyMethod: Snapshot
moverSecurityContext:
fsGroup: 25565
runAsGroup: 25565
runAsUser: 25565
pruneIntervalDays: 14
repository: minecraft-restic
retain:
daily: 14
within: 7d
storageClassName: block
volumeSnapshotClassName: block
sourcePVC: data-minecraft-0
trigger:
manual: before-move-to-bare-metal
schedule: 0 6 * * *

View File

@@ -0,0 +1,26 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: "wipe-${PVC}-${TIMENOW}"
namespace: "${NS}"
spec:
ttlSecondsAfterFinished: 3600
template:
spec:
automountServiceAccountToken: false
restartPolicy: OnFailure
containers:
- name: wipe
image: public.ecr.aws/docker/library/busybox:latest
command: ["/bin/sh", "-c", "cd /config; find . -delete"]
volumeMounts:
- name: config
mountPath: /config
securityContext:
runAsUser: 0
runAsGroup: 0
volumes:
- name: config
persistentVolumeClaim:
claimName: "${PVC}"