From 32a11260f6a0c130b7ec28e9ae944cece38ad5da Mon Sep 17 00:00:00 2001
From: JJGadgets
Date: Fri, 30 Jun 2023 22:47:34 +0800
Subject: [PATCH] feat(taskfiles): add rook tasks

---
 .taskfiles/cluster/Taskfile.dist.yaml         |  15 ++-
 .taskfiles/k8s/Taskfile.dist.yaml             |  10 ++
 .taskfiles/k8s/priv-pod.yaml                  |  31 +++++
 .taskfiles/rook/Taskfile.dist.yaml            | 126 ++++++++++++++++++
 .taskfiles/rook/wipe-rook-state-job.tmpl.yaml |  31 +++++
 .taskfiles/rook/zap-disk-job.tmpl.yaml        |  37 +++++
 .taskfiles/talos/Taskfile.dist.yaml           |   8 +-
 Taskfile.dist.yaml                            |   3 +
 kube/clusters/nuclear/config/vars.sops.env    |  10 +-
 .../storage/rook-ceph/cluster/app/hr.yaml     |   8 +-
 10 files changed, 266 insertions(+), 13 deletions(-)
 create mode 100644 .taskfiles/k8s/priv-pod.yaml
 create mode 100644 .taskfiles/rook/Taskfile.dist.yaml
 create mode 100644 .taskfiles/rook/wipe-rook-state-job.tmpl.yaml
 create mode 100644 .taskfiles/rook/zap-disk-job.tmpl.yaml

diff --git a/.taskfiles/cluster/Taskfile.dist.yaml b/.taskfiles/cluster/Taskfile.dist.yaml
index 8a136c15..802d71d9 100644
--- a/.taskfiles/cluster/Taskfile.dist.yaml
+++ b/.taskfiles/cluster/Taskfile.dist.yaml
@@ -6,13 +6,22 @@ vars:
   APPLY: '{{.APPLY | default "0"}}'
 
 tasks:
-  cluster-init-flux:
-    aliases: [initflux]
+  cluster-switch:
+    aliases: [ctx]
     vars:
       C: '{{ or .C (fail "Missing C environment variable for cluster!") }}'
     cmds:
       - |
-        sed -i 's/current-context: admin@.*/current-context: admin@{{.C}}/g' ~/.kube/config
+        sed -i 's/current-context: admin@.*/current-context: admin@{{.C}}/g' ~/.kube/config || true
+
+  cluster-init-flux:
+    aliases: [initflux]
+    vars:
+      C: '{{ or .C (fail "Missing `C` environment variable for cluster!") }}'
+    cmds:
+      - task: cluster-switch
+        vars:
+          C: '{{.C}}'
       - kubectl apply --server-side --kustomize ./kube/bootstrap/flux/
 
   cluster-init:
diff --git a/.taskfiles/k8s/Taskfile.dist.yaml b/.taskfiles/k8s/Taskfile.dist.yaml
index a879f5f0..663d60fa 100644
--- a/.taskfiles/k8s/Taskfile.dist.yaml
+++ b/.taskfiles/k8s/Taskfile.dist.yaml
@@ -5,6 +5,16 @@ vars:
   C: '{{ .C | default "biohazard" }}'
 
 tasks:
+  wait-job-pending:
+    aliases: [waitjob]
+    internal: true
+    desc: Wait for a job's pod to change its status to pending
+    vars:
+      NAMESPACE: '{{ or .NAMESPACE (fail "Missing `NAMESPACE` environment variable!") }}'
+      JOB_NAME: '{{ or .JOB_NAME (fail "Missing `JOB_NAME` environment variable!") }}'
+    cmds:
+      - until [[ $(kubectl -n "${NAMESPACE}" get pod -l job-name="${JOB_NAME}" -o jsonpath='{.items[*].status.phase}') == "Pending" ]]; do sleep 1; done
+
   get-public-ingresses:
     aliases: [gpi]
     desc: |
diff --git a/.taskfiles/k8s/priv-pod.yaml b/.taskfiles/k8s/priv-pod.yaml
new file mode 100644
index 00000000..f9e37d8f
--- /dev/null
+++ b/.taskfiles/k8s/priv-pod.yaml
@@ -0,0 +1,31 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "root"
+  namespace: "kube-system"
+  labels:
+    jobs.home.arpa/rook: "wipe-disk"
+spec:
+  ttlSecondsAfterFinished: 3600
+  template:
+    spec:
+      automountServiceAccountToken: false
+      restartPolicy: Never
+      nodeName: blackfish
+      containers:
+        - name: test
+          image: ghcr.io/onedr0p/alpine:3.17.3@sha256:999384960b6114496a5e4036e945141c205d064ce23b87326bd3f8d878c5a9d4
+          securityContext:
+            privileged: true
+          resources: {}
+          command: ["/bin/sh", "-c"]
+          args: ["sleep 999999999"]
+          volumeMounts:
+            - mountPath: /mnt/disk
+              name: host-disk
+      volumes:
+        - name: host-disk
+          hostPath:
+            path: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN"
+
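The priv-pod.yaml Job above is a throwaway privileged debug pod pinned to the blackfish node, with the target disk bind-mounted at /mnt/disk. As a minimal usage sketch (only the `root` Job name, `kube-system` namespace and file path come from this patch; the rest is standard kubectl):

    # create the privileged debug Job and open a shell in its pod
    kubectl apply -f .taskfiles/k8s/priv-pod.yaml
    kubectl -n kube-system wait pod -l job-name=root --for=condition=Ready --timeout=2m
    kubectl -n kube-system exec -it job/root -- /bin/sh
    # delete the Job once done inspecting the disk
    kubectl -n kube-system delete job root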
diff --git a/.taskfiles/rook/Taskfile.dist.yaml b/.taskfiles/rook/Taskfile.dist.yaml
new file mode 100644
index 00000000..32ad2247
--- /dev/null
+++ b/.taskfiles/rook/Taskfile.dist.yaml
@@ -0,0 +1,126 @@
+---
+version: "3"
+
+x-task-vars: &task-vars
+  NODE: "{{.NODE}}"
+  CEPH_DISK: "{{.CEPH_DISK}}"
+  TIME: "{{.TIME}}"
+  JOB_NAME: "{{.JOB_NAME}}"
+
+vars:
+  TIME: '{{now | date "150405"}}'
+
+includes:
+  k8s:
+    internal: true
+    taskfile: ../k8s
+  cluster:
+    internal: true
+    taskfile: ../cluster
+
+tasks:
+  zap-disk:
+    desc: Prepare a disk on the specified node for use as a Ceph OSD by zapping all data and partition tables.
+    internal: true
+    dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
+    cmds:
+      # TODO: mimic deviceFilter
+      - envsubst < <(cat {{.JOB_TEMPLATE}}) | kubectl apply -f -
+      # - task: k8s:wait-job-pending
+      #   vars:
+      #     JOB_NAME: '{{.wipeCephDiskJobName}}'
+      #     NAMESPACE: kube-system
+      - |-
+        until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=1m; do
+          echo "Job {{.JOB_NAME}} is still running, logs:" &&
+          kubectl -n kube-system logs job/{{.JOB_NAME}} -f;
+        done
+      # - kubectl -n kube-system logs job/{{.JOB_NAME}}
+      - defer: kubectl -n kube-system delete job {{.JOB_NAME}}
+    vars:
+      NODE: '{{ or .NODE (fail "`NODE` is required") }}'
+      CEPH_DISK: '{{ or .CEPH_DISK (fail "`CEPH_DISK` is required") }}'
+      JOB_NAME: 'zap-disk-{{- .NODE -}}-{{- .TIME -}}'
+      JOB_TEMPLATE: "zap-disk-job.tmpl.yaml"
+    env: *task-vars
+    preconditions:
+      - sh: test -f {{.JOB_TEMPLATE}}
+
+  wipe-state:
+    desc: Wipe all Ceph state on the specified node.
+    internal: true
+    dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
+    cmds:
+      - envsubst < <(cat {{.JOB_TEMPLATE}}) | kubectl apply -f -
+      # - task: k8s:wait-job-pending
+      #   vars:
+      #     JOB_NAME: '{{.wipeCephDiskJobName}}'
+      #     NAMESPACE: kube-system
+      - until kubectl -n kube-system wait job/{{.JOB_NAME}} --for condition=complete --timeout=1m; do echo "Job {{.JOB_NAME}} is still running, logs:" && kubectl -n kube-system logs job/{{.JOB_NAME}} -f; done
+      # - kubectl -n kube-system logs job/{{.JOB_NAME}}
+      - defer: kubectl -n kube-system delete job {{.JOB_NAME}}
+    vars:
+      NODE: '{{ or .NODE (fail "`NODE` is required") }}'
+      JOB_NAME: "wipe-rook-state-{{- .NODE -}}-{{- .TIME -}}"
+      JOB_TEMPLATE: "wipe-rook-state-job.tmpl.yaml"
+    env: *task-vars
+    preconditions:
+      - sh: test -f {{.JOB_TEMPLATE}}
+
+  wipe-node:
+    aliases: ["wn"]
+    desc: Trigger a wipe of all Rook-Ceph data on the specified node.
+    dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
+    vars:
+      NODE: '{{ or .NODE (fail "Missing `NODE` environment variable!") }}'
+      CEPH_DISK: '{{ or .CEPH_DISK (fail "Missing `CEPH_DISK` environment variable!") }}'
+    cmds:
+      - task: zap-disk
+        vars:
+          NODE: '{{.NODE}}'
+          CEPH_DISK: '{{ or .CEPH_DISK (fail "Missing `CEPH_DISK` environment variable!") }}'
+      - task: wipe-state
+        vars:
+          NODE: '{{.NODE}}'
+
+  wipe-nodes-nuclear:
+    desc: Wipe all nodes in cluster "nuclear"
+    dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
+    cmds:
+      - task: wipe-node
+        vars:
+          NODE: "blackfish"
+          CEPH_DISK: "/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN"
+
+  reinstall:
+    desc: |-
+      For when Rook refuses to create any OSDs at all.
+      Assuming Flux and this repo's resource names: suspends the master ks.yaml (Flux Kustomization) and the ks.yaml and HelmReleases for Rook-Ceph and its cluster, deletes the cluster HelmRelease, clears the Ceph CR and ConfigMap/Secret finalizers, and removes the Rook-Ceph HelmRelease and namespace.
+      Then reconciles the master, Rook-Ceph and cluster ks.yaml.
+    dir: '/{{.ROOT_DIR}}/.taskfiles/rook'
+    vars:
+      C: '{{ or .C (fail "Missing `C` environment variable for cluster!") }}'
+    cmds:
+      - task: cluster:cluster-switch
+        vars:
+          C: '{{.C}}'
+      - flux suspend ks 0-{{.C}}-config
+      - flux suspend ks 1-core-storage-rook-ceph-app
+      - flux suspend ks 1-core-storage-rook-ceph-cluster
+      - helm uninstall -n rook-ceph rook-ceph-cluster && true || true
+      - flux delete hr -n rook-ceph rook-ceph-cluster --silent && true || true
+      - |-
+        for CRD in $(kubectl get crd -n rook-ceph | awk '/ceph.rook.io/ {print $1}'); do
+          kubectl get -n rook-ceph "$CRD" -o name | \
+            xargs -I {} kubectl patch -n rook-ceph {} --type merge -p '{"metadata":{"finalizers": []}}' && true || true
+        done
+      - |-
+        kubectl -n rook-ceph patch configmap rook-ceph-mon-endpoints --type merge -p '{"metadata":{"finalizers": []}}' && true || true
+        kubectl -n rook-ceph patch secrets rook-ceph-mon --type merge -p '{"metadata":{"finalizers": []}}' && true || true
+      - helm uninstall -n rook-ceph rook-ceph && true || true
+      - flux delete hr -n rook-ceph rook-ceph --silent && true || true
+      - kubectl get namespaces rook-ceph && until kubectl delete namespaces rook-ceph; do kubectl get namespaces rook-ceph -o jsonpath="{.status}"; done || true
+      - task: wipe-nodes-{{.C}}
+      - flux suspend ks 0-{{.C}}-config && flux resume ks 0-{{.C}}-config
+      - flux suspend ks 1-core-storage-rook-ceph-app && flux resume ks 1-core-storage-rook-ceph-app
+      - flux suspend ks 1-core-storage-rook-ceph-cluster && flux resume ks 1-core-storage-rook-ceph-cluster
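Assuming the `rook` include added to the root Taskfile.dist.yaml later in this patch, the non-internal tasks above would be invoked from the repository root roughly as follows; NODE, CEPH_DISK and C are the vars the tasks fail on when unset:

    # zap the OSD disk and wipe /var/lib/rook on a single node
    task rook:wipe-node NODE=blackfish CEPH_DISK=/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN
    # full teardown and redeploy of Rook-Ceph on the "nuclear" cluster
    task rook:reinstall C=nuclear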
diff --git a/.taskfiles/rook/wipe-rook-state-job.tmpl.yaml b/.taskfiles/rook/wipe-rook-state-job.tmpl.yaml
new file mode 100644
index 00000000..53c09add
--- /dev/null
+++ b/.taskfiles/rook/wipe-rook-state-job.tmpl.yaml
@@ -0,0 +1,31 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "${JOB_NAME}"
+  namespace: "kube-system"
+  labels:
+    jobs.home.arpa/rook: "wipe-rook-state"
+spec:
+  ttlSecondsAfterFinished: 3600
+  template:
+    spec:
+      automountServiceAccountToken: false
+      restartPolicy: Never
+      nodeName: ${NODE}
+      containers:
+        - name: disk-wipe
+          image: ghcr.io/onedr0p/alpine:3.17.3@sha256:999384960b6114496a5e4036e945141c205d064ce23b87326bd3f8d878c5a9d4
+          securityContext:
+            privileged: true
+          resources: {}
+          command: ["/bin/sh", "-c"]
+          args:
+            - echo "Running 'rm -rf':"; rm -rf /mnt/host_var/lib/rook && echo "Success!" || echo "Failed!"
+          volumeMounts:
+            - mountPath: /mnt/host_var
+              name: host-var
+      volumes:
+        - name: host-var
+          hostPath:
+            path: /var
diff --git a/.taskfiles/rook/zap-disk-job.tmpl.yaml b/.taskfiles/rook/zap-disk-job.tmpl.yaml
new file mode 100644
index 00000000..0761820d
--- /dev/null
+++ b/.taskfiles/rook/zap-disk-job.tmpl.yaml
@@ -0,0 +1,37 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: "${JOB_NAME}"
+  namespace: "kube-system"
+  labels:
+    jobs.home.arpa/rook: "wipe-disk"
+spec:
+  ttlSecondsAfterFinished: 3600
+  template:
+    spec:
+      automountServiceAccountToken: false
+      restartPolicy: Never
+      nodeName: ${NODE}
+      containers:
+        - name: disk-wipe
+          image: ghcr.io/onedr0p/alpine:3.17.3@sha256:999384960b6114496a5e4036e945141c205d064ce23b87326bd3f8d878c5a9d4
+          securityContext:
+            privileged: true
+          resources: {}
+          command: ["/bin/sh", "-c"]
+          args:
+            - apk add --no-cache sgdisk util-linux parted;
+              fdisk -l /mnt/disk;
+              echo "Running sgdisk..." && sgdisk --zap-all /mnt/disk && echo "Success!" || break;
+              echo "Running wipefs..." && wipefs --all --backup /mnt/disk && echo "Success!" || break;
+              echo "Running blkdiscard..." && blkdiscard /mnt/disk && echo "Success!" || break;
+              echo "Running partprobe..." && partprobe /mnt/disk && echo "Success!" || echo "Failed!"
+              #dd if=/dev/zero bs=1M count=10000 oflag=direct of=${CEPH_DISK};
+          volumeMounts:
+            - mountPath: /mnt/disk
+              name: host-disk
+      volumes:
+        - name: host-disk
+          hostPath:
+            path: "${CEPH_DISK}"
\ No newline at end of file
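Both Job templates are rendered by the Taskfile above, which exports the task vars via `env: *task-vars` and pipes each template through envsubst into kubectl. A manual equivalent, for example when testing a template change by hand (the variable values here are only examples):

    # render and submit zap-disk-job.tmpl.yaml outside of Task
    export NODE=blackfish
    export CEPH_DISK=/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN
    export JOB_NAME=zap-disk-manual-test
    envsubst < .taskfiles/rook/zap-disk-job.tmpl.yaml | kubectl apply -f -
    kubectl -n kube-system logs job/${JOB_NAME} -f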
diff --git a/.taskfiles/talos/Taskfile.dist.yaml b/.taskfiles/talos/Taskfile.dist.yaml
index 78568bc1..d16b266c 100644
--- a/.taskfiles/talos/Taskfile.dist.yaml
+++ b/.taskfiles/talos/Taskfile.dist.yaml
@@ -2,8 +2,12 @@
 version: "3"
 
 includes:
-  k8s: ../k8s
-  cluster: ../cluster
+  k8s:
+    internal: true
+    taskfile: ../k8s
+  cluster:
+    internal: true
+    taskfile: ../cluster
 
 vars:
   C: '{{.C | default "biohazard"}}'
diff --git a/Taskfile.dist.yaml b/Taskfile.dist.yaml
index 54d0cb7d..0f57d9e8 100644
--- a/Taskfile.dist.yaml
+++ b/Taskfile.dist.yaml
@@ -23,6 +23,9 @@ includes:
   cnpg:
     aliases: [pg]
     taskfile: .taskfiles/cnpg/Taskfile.dist.yaml
+  rook:
+    aliases: [r]
+    taskfile: .taskfiles/rook
 
 tasks:
   default:
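With this include, the rook namespace should be reachable from the repository root under both its full name and the `r` alias, and the task-level aliases compose with it, e.g. (a sketch based on Task's namespace:task syntax and the aliases declared above):

    task rook:wipe-nodes-nuclear
    task r:wn NODE=blackfish CEPH_DISK=/dev/disk/by-id/ata-INTEL_SSDSC2BB016T4_BTWD709202JK1P6HGN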
diff --git a/kube/clusters/nuclear/config/vars.sops.env b/kube/clusters/nuclear/config/vars.sops.env
index b5ba09f5..5c13220b 100644
--- a/kube/clusters/nuclear/config/vars.sops.env
+++ b/kube/clusters/nuclear/config/vars.sops.env
@@ -57,7 +57,7 @@ APP_IP_K8S_GATEWAY=ENC[AES256_GCM,data:OwsOAO2jzTqrjULN,iv:EFsZhmlSDA8iHkyJgpE7B
 APP_IP_HAPROXY_PROXY_X=ENC[AES256_GCM,data:X4OtM9iKr9RKllTb,iv:tl5zMdaYKu9iZF23860bLHxyxwgFMl0Y7cCMD1mI8wY=,tag:tCkyCrYQjOYJHuUdlDnyZw==,type:str]
 APP_UID_HAPROXY_PROXY_X=ENC[AES256_GCM,data:B3G6nA==,iv:e5UIYZa52kQ8GFBD30d4/U8WMito6albh4CMgYhHOpg=,tag:8TBer2t4zGGYIC3bmO5FLA==,type:str]
 APP_DNS_FLUX_WEBHOOK=ENC[AES256_GCM,data:/vKJ0ACjgrnQdz93O0IXCy3q1Ttv8yFsJA==,iv:mbL57+d+xn9XuggscB9KiaP9t2mTZi57j+89Whhw7Ro=,tag:q37t5QXvLjTqZXhQWoT7gw==,type:str]
-APP_DNS_CEPH=ENC[AES256_GCM,data:npgEErUvNqTnZ8I=,iv:rt3ditftNw2PJJIodAUJLBQv4EIqTgmZSk8Gkg2t/lY=,tag:8adKoJ/shIVoRlsSwttwUQ==,type:str]
+APP_DNS_CEPH=ENC[AES256_GCM,data:d3Rr47Rs6vLEAGOb,iv:oRwmj7gvc9nVHg7vvPNN/EdqXKwtm+BjeEvEZx5fUvQ=,tag:EjR15rnCAc+K/cwYgwgJAA==,type:str]
 APP_DNS_RADOSGW=ENC[AES256_GCM,data:H5CKxKXf3imtuA==,iv:PKnd+pSP7eEHHUA5YLsHOEEBxApAakvJq3WzfVQGymY=,tag:2uZdzZ8ZDCkFdCQ/n9KSGQ==,type:str]
 APP_IP_RADOSGW=ENC[AES256_GCM,data:OYs5/gWPQkPbYc8=,iv:HxElYcvv/+hQCfOa+TD04Vp/tM7jzlGApK3Z/kp/q0s=,tag:gWUtQiWgIIkfG9Yd4WC0fg==,type:str]
 APP_DNS_RGW_S3=ENC[AES256_GCM,data:MD1ldDBMlPsyGNY=,iv:4lkGMXZSU6Ygsy5EkHFvg8dSDw49NfEJ0/1u48gXXmo=,tag:ICeMEHR6zPA3sbxUloXYoQ==,type:str]
@@ -123,12 +123,12 @@ CONFIG_ZEROTIER_ENDPOINT=ENC[AES256_GCM,data:tOyIlrzdn8sck7um7OSicq5T0XWAmymaRLn
 CONFIG_AUTHENTIK_REMOTE_HOST=ENC[AES256_GCM,data:K0SOweZKXdAhlKQrZQ42UQ3Eg7u3KTF5nCemjkz2XA==,iv:65Qret7KcDIZRoLzRwoY6sXGNUGfrhsMugJ+jeO5O6Y=,tag:qkv/mksEiPuMtylKvIs/Yw==,type:str]
 CONFIG_HEADSCALE_IPV4=ENC[AES256_GCM,data:EZ7GMHA6u1wWPS5g6Pg=,iv:W1hcseQ4Q6CisTXnDLI7hWTy18fIVKtZ46tudCyhfa4=,tag:2WnnNjuZhwUPG07OKTQt2g==,type:str]
 CONFIG_OVENMEDIAENGINE_NAME=ENC[AES256_GCM,data:58CuH8bcUHWXBZA=,iv:BN7x6aAJPbzIn25sNoycsHRE5pugkubLS2VrM77+g/E=,tag:6JAsRjU0L6wbZtns3rk6KQ==,type:str]
-sops_lastmodified=2023-06-29T21:15:10Z
-sops_mac=ENC[AES256_GCM,data:pCl0VZSogPmc8Mcp2zOz7nk5qvUW5UvV7R/nvaGuxTKUe/YkNgELKV5wSfw6Xlx0NJf28TKaJirYRsIHQnWCiZPp7oN7N1XObh37Ud9azZK2xxLRiz3+DGtG05i9UdhsZ0AXhzcuoo4E/u1o00UzEDCkqKyk0ZVjUQGFlZRXCyU=,iv:jusmwDbhxrSHPm0wv+aMh79g1y3RjGjYSHkOXKM+ocw=,tag:cPieIAP6hXmJIUfPTpAUHw==,type:str]
 sops_unencrypted_suffix=_unencrypted
 sops_age__list_0__map_recipient=age1u57l4s400gqstc0p485j4646cemntufr0pcyp32yudklsp90xpmszxvnkj
-sops_version=3.7.3
 sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBFSXFvLzFQaFJ0OVJKUFV5\nTWh2OUltUlpJWFlVVytFYU9VajBHSnQ4SGdjCnRVbEVXdDVyUHJrR05Ba0xvUm1l\nTkt2YmNUZy90ZFA2b3QrODFKZ01EVG8KLS0tIEw2dkd1cnFCbnI5eWxKL2o1aDVB\nN0hveXZ2dWdxQ2k2L0pGR0ROMStVTmsK4dV/hNyDjsYnVUiFQ7kqdmcVHfYyVckz\nh/rwLjcZgsup72WDVP3v6Eul8B3LKFrSb8CDFA54tyQmSdFDCQC+Zg==\n-----END AGE ENCRYPTED FILE-----\n
+sops_mac=ENC[AES256_GCM,data:POdxJILRjhilf8dMLmtK+D6uuHeLZG7HmtaU7rd9msYh0a+tEIXF+VD/zi+CZEyofsezvKJIJETj6sP0UchUJoHpf2nRN+4jiDKuDnmi64jAl8tQNjduZYvbbsCwiBcA69ThQ9i3HdOtJzmDlSHorBkyUokU1UlV9Lzqq3VfvPY=,iv:hY1teRiKLpIbfoyouNOmdZ2d2OcTmP15+npejRkbOcI=,tag:z9sZf/1LhYaAMM8p+SI2NQ==,type:str]
+sops_pgp__list_0__map_fp=31E70E5BC80C58AFF5DD649921AC5A1AC6E5B7F2
 sops_pgp__list_0__map_created_at=2023-06-01T18:01:04Z
 sops_pgp__list_0__map_enc=-----BEGIN PGP MESSAGE-----\n\nhF4DAAAAAAAAAAASAQdAbA35718t0WVKrjQFYUPviCb0lVuh8NpfSdJCHjHcWWww\n8ak4q4VL69tZLSjQHx+VsMmKooknxWz6pw0lGxyDYlZMQ81bodInjaZGFZSz8Uuh\n0l4BhDCNDBBALTrnTliz6/DAHvmavI4UxMHost5alFio9JPkTDNmXZyvcy1/R6aw\n/uhQXLUBRvm0TSOhBZb7d0SLkLfe02Um40w1TibpKXsZz1GOMbPRNBMHHra0QIuQ\n=0jA+\n-----END PGP MESSAGE-----\n
-sops_pgp__list_0__map_fp=31E70E5BC80C58AFF5DD649921AC5A1AC6E5B7F2
+sops_version=3.7.3
+sops_lastmodified=2023-06-29T21:53:05Z
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/app/hr.yaml b/kube/deploy/core/storage/rook-ceph/cluster/app/hr.yaml
index a82a8b97..9b913318 100644
--- a/kube/deploy/core/storage/rook-ceph/cluster/app/hr.yaml
+++ b/kube/deploy/core/storage/rook-ceph/cluster/app/hr.yaml
@@ -32,16 +32,18 @@ spec:
         compression:
           enabled: true
       crashCollector:
-        disable: false
+        disable: true
       dashboard:
         enabled: true
         urlPrefix: "/"
         ssl: false
       mon:
-        count: 3
+        # count: 3
+        count: 1
         allowMultiplePerNode: false
       mgr:
-        count: 2
+        # count: 2
+        count: 1
         allowMultiplePerNode: false
         modules:
           - name: "pg_autoscaler"