feat(tasks): VolSync & Crunchy restore, others

I haven't committed my Taskfiles in so long that I honestly don't remember
what I mainly changed in them, and I just assume they all still work lol
This commit is contained in:
JJGadgets
2023-11-12 03:53:34 +08:00
parent a027a1e405
commit e811310bc4
12 changed files with 414 additions and 66 deletions

View File

@@ -1,4 +1,5 @@
---
# Tasks related to multi-cluster or cluster-level management, e.g. bootstrap
version: "3"
includes:

View File

@@ -1,16 +0,0 @@
---
version: "3"
vars:
PG: '{{.PG | default "pg-default"}}'
NS: '{{.NS | default "pg"}}'
tasks:
rw:
desc: Get current CloudNativePG master (writable) instance to run command against.
vars:
CLUSTER: '{{ or .PG (fail "Missing `PG` environment variable!") }}'
CLUSTER: '{{ or .NS (fail "Missing `NS` environment variable!") }}'
dir: '{{.USER_WORKING_DIR}}'
cmds:
- kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% kubectl -n {{.NS}} {{.CLI_ARGS}}

View File

@@ -1,6 +1,9 @@
---
version: "3"
# vars:
# DEFAULTNS: '{{.NS | default "flux-system"}}'
tasks:
install:
cmds:
@@ -10,3 +13,12 @@ tasks:
aliases: [ga, fga, getall]
cmds:
- viddy -n1 flux get all -A || watch -n1 flux get all -A
cantWait:
aliases: [cw]
vars:
NS: '{{ .NS | default "flux-system" }}'
KS: '{{ or .KS (fail "Missing `KS` environment variable!") }}'
cmds:
- kubectl patch ks -n {{.NS}} {{.KS}} --type merge -p '{"spec":{"dependsOn":[]}}'
- kubectl patch ks -n {{.NS}} {{.KS}} --type merge -p '{"spec":{"wait":false}}'
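For reference, a minimal usage sketch of the `cantWait` task, assuming this Taskfile is included under a `flux:` namespace and that a Kustomization named `myapp-app` (a hypothetical name) exists in flux-system:

task flux:cw KS=myapp-app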

View File

@@ -0,0 +1,7 @@
---
# apiVersion: kustomize.toolkit.fluxcd.io/v1
# kind: Kustomization
# # metadata:
# # name: not-used
spec:
dependsOn: []

View File

@@ -1,8 +1,9 @@
---
# General Kubernetes admin tasks
version: "3"
vars:
C: '{{ .C | default "biohazard" }}'
# C: '{{ .C | default "admin@biohazard" }}'
TIMENOW:
sh: date +%Y%m%d-%H%M%S
@@ -11,7 +12,7 @@ tasks:
aliases: [nsps]
desc: Label a given namespace with Pod Security labels in a while-true loop, for when an external source creates the namespace (e.g. Azure Arc's Helm apply with `--create-namespace`)
vars:
NS: '{{ or .NS (fail "Missing `NS` environment variable!") }}'
NS: &ns-fail '{{ or .NS (fail "Missing `NS` environment variable!") }}'
PS: '{{ or .PS (fail "Missing `PS` environment variable!") }}'
cmds:
- while true; do kubectl label namespaces {{.NS}} "pod-security.kubernetes.io/enforce={{.PS}}" >/dev/null 2>/dev/null || true; kubectl label namespaces {{.NS}} "pod-security.kubernetes.io/enforce-version=latest" >/dev/null 2>/dev/null || true; done
@@ -21,32 +22,42 @@ tasks:
internal: true
desc: Wait for a job's pod to change its status to pending
vars:
NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: '{{ .NS | default "default" }}'
NAME: &name-fail '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: *ns-fail
cmds:
- until [[ $(kubectl -n "{{.NS}}" get pod "{{.NAME}}" -o jsonpath='{.status.phase}') == "Pending" ]]; do sleep 1; done
wait-pod-running:
aliases: [waitp]
aliases: [waitr]
internal: true
desc: Wait for a job's pod to change its status to Running
vars:
NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: '{{ .NS | default "default" }}'
NAME: *name-fail
NS: *ns-fail
cmds:
- until [[ $(kubectl -n "{{.NS}}" get pod "{{.NAME}}" -o jsonpath='{.status.phase}') == "Running" ]]; do sleep 1; done
wait-pod-delete:
aliases: [waitd]
internal: true
desc: Wait for a job's pod to be deleted
vars:
NAME: *name-fail
NS: *ns-fail
cmds:
- until kubectl wait pod -n {{.NS}} {{.NAME}} --for delete --timeout=1m; do sleep 1; done
wait-finish:
internal: true
desc: Wait for a job (or other TYPE) to reach condition=complete, tailing its logs while waiting
vars:
NAME: '{{ or .NAME (fail "Missing `NAME` environment variable!") }}'
NS: '{{ .NS | default "default" }}'
NAME: *name-fail
NS: *ns-fail
TYPE: '{{ .TYPE | default "job" }}'
# WAIT_ARGS: '{{.WAIT_ARGS | default "echo \"{{.NAME}} is still running, logs:\" && kubectl -n {{.NS}} logs {{.NAME}} --since 2s -f;"}}'
cmds:
- |-
until kubectl -n {{.NS}} wait {{.TYPE}}/{{.NAME}} --for condition=complete --timeout=2s; do
until kubectl -n {{.NS}} wait {{.TYPE}}/{{.NAME}} --for condition=complete --timeout=2s; do
echo "{{.NAME}} is still running, logs:" && kubectl -n {{.NS}} logs {{.TYPE}}/{{.NAME}} --since 2s -f || true;
done
@@ -68,13 +79,33 @@ tasks:
SECRETS_FILE: "{{.ROOT_DIR}}/kube/clusters/{{.C}}/config/secrets.sops.env"
VARS_FILE: "{{.ROOT_DIR}}/kube/clusters/{{.C}}/config/vars.sops.env"
FCMD: '{{ .FCMD | default "cat" }}'
C: '{{.C | default "admin@biohazard"}}'
cmds:
#- sops exec-env {{.SECRETS_FILE}} "sops exec-env {{.VARS_FILE}} \"{{.FCMD}} {{.F}} | envsubst | kubectl apply --context {{.C}} -f -\""
- sops exec-env {{.SECRETS_FILE}} "sops exec-env {{.VARS_FILE}} \"{{.FCMD}} {{.F}} | envsubst | kubectl apply -f -\""
cilium-bootstrap-apply:
aliases: [cilium]
desc: Bootstrap Cilium onto new cluster, ready for Flux to take over managing Cilium.
# dir: "/{{.ROOT_DIR}}/kube/clusters/{{.C}}/talos"
vars:
C: '{{ or .C (fail "Missing C environment variable for cluster!") }}'
cmds:
- kubectl delete configmap -n kube-system cilium-config || true
- kubectl delete daemonset -n kube-system cilium || true
- kubectl delete deployment -n kube-system cilium-operator || true
- cp /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/clusters/{{.C}}/helm-values.yaml /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
- task: sops-apply
vars:
C: '{{.C | default "admin@biohazard"}}'
F: "/{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install"
FCMD: "kustomize build --enable-helm"
- defer: rm -rf /{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install/base-values.yaml
newapp:
desc: Copy app folder structure template, substitute APPNAME, and (TODO) prompt user for variables values such as DNS, UID etc.
vars:
app: '{{ or .app (fail "Missing `app` variable!") }}'
APP: &app-fail '{{ or .APP (fail "Missing `APP` variable!") }}'
cmds:
- cp -r ./kube/templates/test ./kube/deploy/apps/{{.app}}
# lowercase, used for resource names etc
@@ -123,15 +154,75 @@ tasks:
export KBENCH=$(curl -sL https://raw.githubusercontent.com/yasker/kbench/main/deploy/fio.yaml)
[[ ! -z "{{.SC}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").spec.storageClassName = "{{.SC}}"')
[[ ! -z "{{.NODE}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").spec.template.spec.nodeSelector."kubernetes.io/hostname" = "{{.NODE}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | (select(.kind == "Job").spec.template.spec.volumes.[] | select(.name == "vol").persistentVolumeClaim.claimName) |= "kbench-{{- .TIMENOW -}}"')
printf '%s\n' "${KBENCH}" | kubectl delete -n {{.NS}} -f - || true
- |-
export KBENCH=$(curl -sL https://raw.githubusercontent.com/yasker/kbench/main/deploy/fio.yaml)
[[ ! -z "{{.SC}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").spec.storageClassName = "{{.SC}}"')
[[ ! -z "{{.NODE}}" ]] && export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").spec.template.spec.nodeSelector."kubernetes.io/hostname" = "{{.NODE}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "PersistentVolumeClaim").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | select(.kind == "Job").metadata.name = "kbench-{{- .TIMENOW -}}"')
export KBENCH=$(printf '%s\n' "${KBENCH}" | yq '. | (select(.kind == "Job").spec.template.spec.volumes.[] | select(.name == "vol").persistentVolumeClaim.claimName) |= "kbench-{{- .TIMENOW -}}"')
printf '%s\n' "${KBENCH}" | kubectl apply -n {{.NS}} -f -
- defer: *kbench-delete
- task: wait-finish
vars:
NS: '{{ .NS | default "default" }}'
NAME: "kbench"
NAME: "kbench-{{- .TIMENOW -}}"
TYPE: "job"
scale-to-0:
aliases: ["0"]
desc: Scale given workloads to 0
vars: &scale-vars
KS:
sh: |-
[[ -z "{{.KS}}" ]] && echo "{{.APP}}-app" || echo "{{.KS}}"
KSNS: '{{ .KSNS | default "flux-system" }}'
HR: '{{ .HR | default .APP }}'
HRNS:
sh: |-
[[ -n "{{.HRNS}}" ]] && echo "{{.HRNS}}" && exit || [[ -n "{{.NS}}" ]] && echo "{{.NS}}" && exit || echo "{{.APP}}"
APP: *app-fail
NS: '{{ .NS | default .APP }}'
REASON: '{{ .REASON | default .TIMENOW }}'
cmds:
- |- # annotate Flux Kustomization
[[ {{.KS}} != "false" ]] && kubectl annotate --overwrite kustomizations.kustomize.toolkit.fluxcd.io -n {{.KSNS}} {{.KS}} scaledown.home.arpa/reason={{.REASON}} || true
- |- # suspend Flux Kustomization
[[ {{.KS}} != "false" ]] && flux suspend kustomization -n {{.KSNS}} {{.KS}} || true
- |- # annotate Flux HelmRelease
[[ {{.HR}} != "false" ]] && kubectl annotate --overwrite helmreleases.helm.toolkit.fluxcd.io -n {{.HRNS}} {{.HR}} scaledown.home.arpa/reason={{.REASON}} || true
- |- # suspend Flux HelmRelease
[[ {{.HR}} != "false" ]] && flux suspend helmrelease -n {{.HRNS}} {{.HR}} || true
- |- # annotate Kubernetes controller with reason
kubectl annotate --overwrite deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/reason={{.REASON}} || kubectl annotate --overwrite statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/reason={{.REASON}}
- |- # annotate Kubernetes controller with old replica count
kubectl annotate --overwrite deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas=$(kubectl get deployment -n {{.NS}} {{.APP}} -o jsonpath='{.spec.replicas}') || kubectl annotate --overwrite statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas=$(kubectl get statefulset -n {{.NS}} {{.APP}} -o jsonpath='{.spec.replicas}')
- |- # scale down Kubernetes controller
kubectl scale deployment -n {{.NS}} {{.APP}} --replicas 0 || kubectl scale statefulset -n {{.NS}} {{.APP}} --replicas 0
- task: wait-pod-delete
vars:
NAME: '{{.APP}}'
NS: '{{.NS}}'
scale-back-up:
aliases: ["up"]
desc: Scale given workloads back up to their previous replica count and resume Flux reconciliation
vars: *scale-vars
cmds:
- |- # scale up Kubernetes controller back to old replica count
kubectl scale deployment -n {{.NS}} {{.APP}} --replicas "$(kubectl get deployment -n {{.NS}} {{.APP}} -o jsonpath='{.metadata.annotations.scaledown\.home\.arpa/replicas}')" || kubectl scale statefulset -n {{.NS}} {{.APP}} --replicas "$(kubectl get statefulset -n {{.NS}} {{.APP}} -o jsonpath='{.metadata.annotations.scaledown\.home\.arpa/replicas}')"
- |- # remove old replica count annotation from Kubernetes controller
kubectl annotate deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas- || kubectl annotate statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/replicas-
- |- # remove reason annotation from Kubernetes controller
kubectl annotate deployment -n {{.NS}} {{.APP}} scaledown.home.arpa/reason- || kubectl annotate statefulset -n {{.NS}} {{.APP}} scaledown.home.arpa/reason-
- |- # resume Flux HelmRelease
[[ {{.HR}} != "false" ]] && flux resume helmrelease -n {{.HRNS}} {{.HR}} || true
- |- # remove reason annotation from Flux HelmRelease
[[ {{.HR}} != "false" ]] && kubectl annotate helmreleases.helm.toolkit.fluxcd.io -n {{.HRNS}} {{.HR}} scaledown.home.arpa/reason- || true
- |- # resume Flux Kustomization
[[ {{.KS}} != "false" ]] && flux resume kustomization -n {{.KSNS}} {{.KS}} || true
- |- # remove reason annotation from Flux Kustomization
[[ {{.KS}} != "false" ]] && kubectl annotate kustomizations.kustomize.toolkit.fluxcd.io -n {{.KSNS}} {{.KS}} scaledown.home.arpa/reason- || true
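A hypothetical round trip with these two tasks, assuming the `k8s:` include namespace and an app named `grafana` whose namespace, HelmRelease, and Kustomization all follow the defaults derived from APP:

task k8s:0 APP=grafana REASON=pvc-maintenance
task k8s:up APP=grafana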

View File

@@ -0,0 +1,63 @@
---
version: "3"
# vars:
# PG: '{{.PG | default "default"}}'
# NS: '{{.NS | default "pg"}}'
tasks:
rw:
desc: Get current CloudNativePG master (writable) instance to run command against.
dir: "{{.USER_WORKING_DIR}}"
vars:
APP: "{{.APP}}"
PG: &pg-default
sh: |-
[[ -n "{{.PG}}" ]] && echo "{{.PG}}" || [[ -n "{{.APP}}" ]] && echo "pg-{{.APP}}" || echo "pg-default"
NS: &ns-default
sh: |-
[[ -n "{{.NS}}" ]] && echo "{{.NS}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || echo "pg"
# PG: '{{ or .PG (fail "Missing `PG` environment variable!") }}'
# NS: &ns-default '{{.NS | default "pg"}}'
cmds:
- kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% kubectl -n {{.NS}} {{.CLI_ARGS}}
adminer:
desc: Use the kubectl netshoot krew plugin to attach adminer as a debug sidecar to the current CNPG primary pod, then port-forward it locally.
dir: "{{.USER_WORKING_DIR}}"
vars:
APP: "{{.APP}}"
PG: *pg-default
NS: *ns-default
PF_ADDR: '{{ .PF_ADDR | default "127.0.0.1" }}'
PF_PORT: '{{ .PF_PORT | default "8080" }}'
cmds:
- |
kubectl get clusters.postgresql.cnpg.io -n {{.NS}} {{.PG}} -o yaml | yq '.status.currentPrimary' | xargs -o -I% bash -c "while kubectl netshoot debug -n {{.NS}} % --image-name adminer --image-tag 4.8.1-standalone; do kubectl port-forward -n {{.NS}} % --address {{.PF_ADDR}} {{.PF_PORT}}:8080; break; done"
crunchy-restore:
desc: Restore a pg_dump to a CrunchyData Postgres cluster, writing to the master instance.
dir: "{{.USER_WORKING_DIR}}"
vars:
APP: "{{.APP}}"
PG: *pg-default
NS: *ns-default
DUMP: '{{ or .DUMP (fail "Missing `DUMP` environment variable for specifying pg_dump file location!") }}'
CP_DIR: '{{ .CP_DIR | default "/pgdata" }}' # would've been /tmp or /run if Crunchy replica container didn't crash and restart when files are copied there for some reason, maybe small tmpfs size?
DBNAME:
sh: |-
[[ -n "{{.DBNAME}}" ]] && echo "{{.DBNAME}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing DBNAME (or APP) environment variable for specifying database name!" && exit 1 )
DBUSER:
sh: |-
[[ -n "{{.DBUSER}}" ]] && echo "{{.DBUSER}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing DBUSER (or APP) environment variable for specifying user name!" && exit 1 )
ARGS: '{{ .ARGS | default "--verbose --verbose --format=c --clean --if-exists --no-owner --role $DBUSER" }}' # TODO: personal defaults, please check and change accordingly if copying this task before running it!
CRUNCHY_PRIMARY:
sh: |-
kubectl get pods -n {{.NS}} -l postgres-operator.crunchydata.com/role=master,postgres-operator.crunchydata.com/cluster={{.PG}} -o jsonpath='{.items[0].metadata.name}'
env:
DBUSER: '{{.DBUSER}}'
cmds:
- kubectl cp --container database {{.DUMP}} {{.NS}}/{{.CRUNCHY_PRIMARY}}:{{.CP_DIR}}/restore-dump.psql
- kubectl exec -it --container database -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c 'echo "ALTER DATABASE {{.DBNAME}} OWNER TO {{.DBUSER}};" | psql'
- kubectl exec -it --container database -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c "pg_restore --dbname {{.DBNAME}} {{.ARGS}} {{.CP_DIR}}/restore-dump.psql"
- defer: kubectl exec -it -n {{.NS}} {{.CRUNCHY_PRIMARY}} -- /bin/bash -c 'rm -rf {{.CP_DIR}}/restore-dump.psql'
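A sketch of a restore run, assuming a `pg:` include namespace and a hypothetical app `jellyfin` (which the defaults above resolve to Crunchy cluster `pg-jellyfin` in namespace `jellyfin`):

task pg:crunchy-restore APP=jellyfin DUMP=./jellyfin.dump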

View File

@@ -71,10 +71,10 @@ tasks:
cmds:
# TODO: mimic deviceFilter
- envsubst < <(cat {{.JOB_TEMPLATE}}) | kubectl apply -f -
- |-
- |-
kubectl -n kube-system logs job/{{.JOB_NAME}} -f || true;
until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=2s; do
echo "Job {{.JOB_NAME}} is still running, logs:" &&
until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=2s; do
echo "Job {{.JOB_NAME}} is still running, logs:" &&
kubectl -n kube-system logs job/{{.JOB_NAME}} -f || true;
done;
# - kubectl -n kube-system logs job/{{.JOB_NAME}}
@@ -122,6 +122,19 @@ tasks:
vars:
NODE: '{{.NODE}}'
wipe-nodes-biohazard:
desc: Wipe all nodes in cluster "biohazard"
dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
cmds:
- task: wipe-node
vars:
NODE: "ange"
CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202L91P6HGN"
- task: wipe-node
vars:
NODE: "charlotte"
CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN" # I swear I'll forget to update this here when I have to change the disk
wipe-nodes-nuclear:
desc: Wipe all nodes in cluster "nuclear"
dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
@@ -135,6 +148,23 @@ tasks:
NODE: "humming"
CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202L91P6HGN"
force-delete-cluster:
desc: |-
Sometimes Rook fails to apply the cluster and it needs to be deleted before it ever finishes setting up; this task force-removes all finalizers so that the unready Ceph resources can be deleted.
dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
vars:
C: '{{ or .C (fail "Missing `C` environment variable for cluster!") }}'
cmds:
- helm uninstall -n rook-ceph rook-ceph-cluster-{{.C}} && true || true
- |-
for CRD in $(kubectl get crd -n rook-ceph | awk '/ceph.rook.io/ {print $1}'); do
kubectl get -n rook-ceph "$CRD" -o name | \
xargs -I {} kubectl patch -n rook-ceph {} --type merge -p '{"metadata":{"finalizers": []}}' && true || true
done
- |-
kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}' && true || true
kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}' && true || true
reinstall:
desc: |-
For when Rook refuses to create any OSDs at all
@@ -149,23 +179,17 @@ tasks:
C: '{{.C}}'
- flux suspend ks 0-{{.C}}-config
- flux suspend ks 1-core-storage-rook-ceph-app
- flux suspend ks 1-core-storage-rook-ceph-cluster
- helm uninstall -n rook-ceph rook-ceph-cluster && true || true
- flux delete hr -n rook-ceph rook-ceph-cluster --silent && true || true
- |-
for CRD in $(kubectl get crd -n rook-ceph | awk '/ceph.rook.io/ {print $1}'); do
kubectl get -n rook-ceph "$CRD" -o name | \
xargs -I {} kubectl patch -n rook-ceph {} --type merge -p '{"metadata":{"finalizers": []}}' && true || true
done
- |-
kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}' && true || true
kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}' && true || true
- flux suspend ks 1-core-storage-rook-ceph-cluster-{{.C}}
- task: force-delete-cluster
vars:
C: '{{.C}}'
# - flux delete hr -n rook-ceph rook-ceph-cluster-{{.C}} --silent && true || true
- helm uninstall -n rook-ceph rook-ceph && true || true
- flux delete hr -n rook-ceph rook-ceph --silent && true || true
# - flux delete hr -n rook-ceph rook-ceph --silent && true || true
- kubectl get namespaces rook-ceph && until kubectl delete namespaces rook-ceph; do kubectl get namespaces rook-ceph -o jsonpath="{.status}"; done || true
- task: wipe-nodes-{{.C}}
# - task: wipe-nodes-{{.C}} # run again in case wipe fails
- flux suspend ks 0-{{.C}}-config && flux resume ks 0-{{.C}}-config
- flux suspend ks 1-core-storage-rook-ceph-app && flux resume ks 1-core-storage-rook-ceph-app
- flux suspend ks 1-core-storage-rook-ceph-cluster && flux resume ks 1-core-storage-rook-ceph-cluster
# - task: osd-prepare-logs # uncomment to debug OSD prepare errors with while true loop, since the job re-runs 6 times and overwrites the previous runs' logs
- flux suspend ks 1-core-storage-rook-ceph-cluster-{{.C}} && flux resume ks 1-core-storage-rook-ceph-cluster-{{.C}}
# - task: osd-prepare-logs # uncomment to debug OSD prepare errors with while true loop, since the job re-runs 6 times and overwrites the previous runs' logs
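For reference, a full teardown-and-reinstall against the `biohazard` cluster might be invoked like this, assuming a `rook:` include namespace:

task rook:reinstall C=biohazard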

View File

@@ -24,21 +24,6 @@ tasks:
- talhelper genconfig -e ./talenv.sops.yaml
- defer: rm -rf ./talenv.sops.yaml
cilium-bootstrap-apply:
aliases: [cilium]
desc: Bootstrap Cilium onto new cluster, ready for Flux to take over managing Cilium.
# dir: "/{{.ROOT_DIR}}/kube/clusters/{{.C}}/talos"
vars:
C: '{{ or .C (fail "Missing C environment variable for cluster!") }}'
cmds:
- kubectl delete configmap -n kube-system cilium-config || true
- kubectl delete daemonset -n kube-system cilium || true
- kubectl delete deployment -n kube-system cilium-operator || true
- task: k8s:sops-apply
vars:
C: '{{.C | default "biohazard"}}'
F: "/{{.ROOT_DIR}}/kube/deploy/core/_networking/cilium/app/bootstrap-install"
FCMD: "kustomize build --enable-helm"
bootstrap:
aliases: [bs]

View File

@@ -1,18 +1,127 @@
---
version: "3"
vars:
TIMENOW:
sh: date +%Y%m%d-%H%M%S
includes:
k8s:
internal: true
taskfile: ../k8s
tasks:
unlock:
desc: Unlock Restic repo using env vars from cluster secrets.
vars:
ns: '{{ or .ns (fail "Missing `ns` environment variable!") }}'
repo: '{{ or .repo (fail "Missing `repo` environment variable!") }}'
NS: &ns-fail '{{ or .NS (fail "Missing `NS` environment variable!") }}'
REPO: &restic-repo '{{ or .REPO (fail "Missing `REPO` environment variable!") }}'
cmds:
- env $(kubectl get secrets -n {{.ns}} {{.repo}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic unlock
- env $(kubectl get secrets -n {{.NS}} {{.REPO}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic unlock --remove-all
run:
desc: Run an arbitrary Restic command (passed via CLI_ARGS after --) against the repo, using env vars from cluster secrets.
vars:
ns: '{{ or .ns (fail "Missing `ns` environment variable!") }}'
repo: '{{ or .repo (fail "Missing `repo` environment variable!") }}'
NS: *ns-fail
REPO: *restic-repo
cmds:
- env $(kubectl get secrets -n {{.ns}} {{.repo}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic {{.CLI_ARGS}}
- env $(kubectl get secrets -n {{.NS}} {{.REPO}} -o yaml | yq '.data | to_entries | map(.key + "=" + (.value | @base64d | @sh)) | join("\n")') restic {{.CLI_ARGS}}
restore:
desc: Restore VolSync data into PVCs
dir: "/{{.ROOT_DIR}}/.taskfiles/volsync/template"
vars: &restore-vars
# APP: '{{ or .APP (fail "Missing `APP` environment variable!") }}'
NS:
sh: |-
[[ -n "{{.RSRC}}" ]] && echo "{{.RSRC}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}" || ( echo "Missing NS or APP environment variable for Namespace!"; exit 1 )
# RSRC: '{{ or .RSRC (fail "Missing `RSRC` environment variable!") }}'
RSRC:
sh: |-
[[ -n "{{.RSRC}}" ]] && echo "{{.RSRC}}" || [[ -n "{{.APP}}" ]] && echo "{{.APP}}-restic" || ( echo "Missing RSRC or APP environment variable for VolSync ReplicationSource!"; exit 1 )
REPO:
sh: |-
[[ -n "{{.REPO}}" ]] && echo "{{.REPO}}" || kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.repository}'
KS:
sh: |-
[[ -n "{{.KS}}" ]] && echo "{{.KS}}" || kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.metadata.labels.kustomize\.toolkit\.fluxcd\.io/name}'
PVC:
sh: |-
[[ -n "{{.PVC}}" ]] && echo "{{.PVC}}" || kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.sourcePVC}'
SC:
sh: |-
[[ -n "{{.SC}}" ]] && echo "{{.SC}}" || kubectl get pvc -n {{.NS}} {{.PVC}} -o jsonpath='{.spec.storageClassName}'
APP:
sh: |-
[[ -n "{{.APP}}" ]] && echo "{{.APP}}" || kubectl get pvc -n {{.NS}} {{.PVC}} -o jsonpath="{.metadata.labels.app\.kubernetes\.io/name}"
CTRL:
sh: |-
( [[ -n "{{.CTRL}}" && -n "{{.APP}}" ]] && echo "{{.CTRL}}/{{.APP}}" ) || ( [[ -z "{{.CTRL}}" ]] && kubectl get -n {{.NS}} deployments.apps/{{.APP}} 2>&1 2>/dev/null && echo "deployments.apps/{{.APP}}" || echo "statefulsets.apps/{{.APP}}" )
RESTORE:
sh: |-
[[ -n "{{.PREV}}" ]] && echo "{{.PREV}}" || [[ -n "{{.ASOF}}" ]] && echo "{{.ASOF}}" || ( echo "Missing PREV or ASOF environment variable to select snapshot to restore!"; exit 1 )
RSRCDEL: '{{ .RSRCDEL | default "true" }}'
RUID:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.runAsUser}'
RGID:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.runAsGroup}'
RFSG:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o jsonpath='{.spec.restic.moverSecurityContext.fsGroup}'
RSRCYAML:
sh: |-
kubectl get replicationsource -n {{.NS}} {{.RSRC}} -o yaml | yq 'del(.metadata["uid", "resourceVersion", "creationTimestamp", "generation"],.status,.metadata.annotations."kubectl.kubernetes.io/last-applied-configuration")' | tee ./rsrc.tmp.yaml
STSDEL: '{{ .STSDEL | default "false" }}' # delete StatefulSet PVC before recreating
STSREC: '{{ .STSREC | default "false" }}' # scale up and down StatefulSet to create PVC
env: # pass Task vars as env vars, can't anchor vars to env as values will be duplicated
NS: '{{.NS}}'
RSRC: '{{.RSRC}}'
REPO: '{{.REPO}}'
KS: '{{.KS}}'
PVC: '{{.PVC}}'
SC: '{{.SC}}'
APP: '{{.APP}}'
CTRL: '{{.CTRL}}'
RESTORE: '{{.RESTORE}}'
TIMENOW: '{{.TIMENOW}}'
RUID: '{{.RUID}}'
RGID: '{{.RGID}}'
RFSG: '{{.RFSG}}'
STSDEL: '{{.STSDEL}}'
STSREC: '{{.STSREC}}' # StatefulSet recreate PVC by scaling up and down # TODO: currently assumes 1 replica and 1 VCT PVC
# RSRCYAML: '{{.RSRCYAML}}'
cmds:
# - echo ${RDESTYAML}
# - echo ${RSRCYAML}
- |- # if true, delete StatefulSet PVC
[[ "{{.STSDEL}}" == "true" ]] && kubectl delete pvc -n {{.NS}} {{.PVC}} || true
- |- # if not false, scale up and down StatefulSet to recreate PVC
[[ "{{.STSREC}}" != "false" ]] && kubectl patch -n {{.NS}} {{.CTRL}} --type merge -p '{"spec":{"template":{"spec":{"nodeSelector":{"recreate-sts-pvc.volsync.home.arpa":"{{.TIMENOW}}"}}}}}' && kubectl scale -n {{.NS}} {{.CTRL}} --replicas 1 && kubectl scale -n {{.NS}} {{.CTRL}} --replicas 0 && until kubectl wait --for=jsonpath='{.status.phase}'=Bound -n {{.NS}} pvc/{{.PVC}}; do sleep 1; done && kubectl patch -n {{.NS}} {{.CTRL}} --type=json -p='[{"op": "remove", "path": "/spec/template/spec/nodeSelector/recreate-sts-pvc.volsync.home.arpa"}]' || true
- echo "{{.RSRCYAML}}"
- echo ${RSRC} ${NS} ${REPO} ${KS} ${PVC} ${SC} ${APP} ${CTRL} ${RESTORE}
- |-
[[ "{{.RSRCDEL}}" != "false" ]] && echo "{{.RSRCYAML}}" | yq | kubectl delete -f -
# [[ "{{.RSRCDEL}}" != "false" ]] && kubectl delete -f ./rsrc.tmp.yaml
- task: k8s:scale-to-0
vars:
<<: *restore-vars
REASON: 'volsync-restore-{{.RSRC}}-{{.PVC}}-{{.TIMENOW}}'
- cat ./wipe-pvc.tmpl.yaml | envsubst | kubectl apply -f -
- task: k8s:wait-finish
vars:
NAME: 'wipe-{{.PVC}}-{{.TIMENOW}}'
NS: '{{.NS}}'
- |-
cat ./ReplicationDestination.tmpl.yaml | envsubst | ( [[ -n "{{.PREV}}" ]] && yq '.spec.restic.previous = {{.PREV}}' || [[ -n "{{.ASOF}}" ]] && yq '.spec.restic.restoreAsOf = "{{.ASOF}}"' || exit 1 ) | kubectl apply -f -
- task: k8s:wait-finish
vars:
NAME: 'volsync-dst-{{.PVC}}-{{.TIMENOW}}'
NS: '{{.NS}}'
- cat ./ReplicationDestination.tmpl.yaml | envsubst | kubectl delete -f -
- task: k8s:scale-back-up
vars:
<<: *restore-vars
- defer: |-
[[ "{{.RSRCDEL}}" != "false" ]] && echo "{{.RSRCYAML}}" | yq | kubectl apply -f -
# [[ "{{.RSRCDEL}}" != "false" ]] && kubectl apply -f ./rsrc.tmp.yaml
- defer: rm ./rsrc.tmp.yaml
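A sketch of restoring the `minecraft` PVC from the example ReplicationSource below, assuming a `volsync:` include namespace and an arbitrary restoreAsOf timestamp; with APP=minecraft the task resolves NS, RSRC, REPO, PVC, and the controller from the live ReplicationSource:

task volsync:restore APP=minecraft ASOF=2023-11-11T00:00:00+08:00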

View File

@@ -0,0 +1,19 @@
---
apiVersion: volsync.backube/v1alpha1
kind: ReplicationDestination
metadata:
name: "${PVC}-${TIMENOW}"
namespace: "${NS}"
spec:
trigger:
manual: "restore-once-${TIMENOW}"
restic:
repository: "${REPO}"
destinationPVC: "${PVC}"
copyMethod: Direct
storageClassName: "${SC}"
moverSecurityContext:
runAsUser: ${RUID}
runAsGroup: ${RGID}
fsGroup: ${RFSG}

View File

@@ -0,0 +1,27 @@
apiVersion: volsync.backube/v1alpha1
kind: ReplicationSource
metadata:
labels:
kustomize.toolkit.fluxcd.io/name: minecraft-app
kustomize.toolkit.fluxcd.io/namespace: flux-system
name: minecraft-restic
namespace: minecraft
spec:
restic:
cacheCapacity: 2Gi
copyMethod: Snapshot
moverSecurityContext:
fsGroup: 25565
runAsGroup: 25565
runAsUser: 25565
pruneIntervalDays: 14
repository: minecraft-restic
retain:
daily: 14
within: 7d
storageClassName: block
volumeSnapshotClassName: block
sourcePVC: data-minecraft-0
trigger:
manual: before-move-to-bare-metal
schedule: 0 6 * * *

View File

@@ -0,0 +1,26 @@
---
apiVersion: batch/v1
kind: Job
metadata:
name: "wipe-${PVC}-${TIMENOW}"
namespace: "${NS}"
spec:
ttlSecondsAfterFinished: 3600
template:
spec:
automountServiceAccountToken: false
restartPolicy: OnFailure
containers:
- name: wipe
image: public.ecr.aws/docker/library/busybox:latest
command: ["/bin/sh", "-c", "cd /config; find . -delete"]
volumeMounts:
- name: config
mountPath: /config
securityContext:
runAsUser: 0
runAsGroup: 0
volumes:
- name: config
persistentVolumeClaim:
claimName: "${PVC}"