diff --git a/kube/bootstrap/flux/flux-install-localhost.yaml b/kube/bootstrap/flux/flux-install-localhost.yaml
index 27b47dfc..36187939 100644
--- a/kube/bootstrap/flux/flux-install-localhost.yaml
+++ b/kube/bootstrap/flux/flux-install-localhost.yaml
@@ -100,6 +100,10 @@ spec:
               labelSelector:
                 matchLabels:
                   flux: localhost
+      - op: replace
+        path: /spec/strategy
+        value:
+          type: Recreate # avoid new pods stuck in CrashLoop because localhost is already bound
     target:
       kind: Deployment
   - patch: |
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/app/hr.yaml b/kube/deploy/core/storage/rook-ceph/cluster/biohazard/hr.yaml
similarity index 99%
rename from kube/deploy/core/storage/rook-ceph/cluster/app/hr.yaml
rename to kube/deploy/core/storage/rook-ceph/cluster/biohazard/hr.yaml
index 6d27ea03..e427ccf7 100644
--- a/kube/deploy/core/storage/rook-ceph/cluster/app/hr.yaml
+++ b/kube/deploy/core/storage/rook-ceph/cluster/biohazard/hr.yaml
@@ -7,6 +7,7 @@ metadata:
   namespace: rook-ceph
   labels:
     helm.flux.home.arpa/default: "false"
+    kustomize.toolkit.fluxcd.io/prune: Disabled
 spec:
   interval: 5m
   timeout: 1h
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/app/ingress.yaml b/kube/deploy/core/storage/rook-ceph/cluster/biohazard/ingress.yaml
similarity index 100%
rename from kube/deploy/core/storage/rook-ceph/cluster/app/ingress.yaml
rename to kube/deploy/core/storage/rook-ceph/cluster/biohazard/ingress.yaml
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/app/netpol.yaml b/kube/deploy/core/storage/rook-ceph/cluster/biohazard/netpol.yaml
similarity index 100%
rename from kube/deploy/core/storage/rook-ceph/cluster/app/netpol.yaml
rename to kube/deploy/core/storage/rook-ceph/cluster/biohazard/netpol.yaml
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/biohazard/rgw-admin.yaml b/kube/deploy/core/storage/rook-ceph/cluster/biohazard/rgw-admin.yaml
new file mode 100644
index 00000000..22e26d3c
--- /dev/null
+++ b/kube/deploy/core/storage/rook-ceph/cluster/biohazard/rgw-admin.yaml
@@ -0,0 +1,24 @@
+---
+apiVersion: ceph.rook.io/v1
+kind: CephObjectStoreUser
+metadata:
+  name: rgw-biohazard-admin
+  namespace: rook-ceph
+spec:
+  store: biohazard
+  displayName: rgw-biohazard-admin
+  capabilities:
+    user: "*"
+    bucket: "*"
+    usage: "*"
+    metadata: "*"
+    zone: "*"
+    roles: "*"
+    info: "*"
+    amz-cache: "*"
+    bilog: "*"
+    mdlog: "*"
+    datalog: "*"
+    user-policy: "*"
+    oidc-provider: "*"
+    ratelimit: "*"
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/app/svc.yaml b/kube/deploy/core/storage/rook-ceph/cluster/biohazard/svc.yaml
similarity index 100%
rename from kube/deploy/core/storage/rook-ceph/cluster/app/svc.yaml
rename to kube/deploy/core/storage/rook-ceph/cluster/biohazard/svc.yaml
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/ks.yaml b/kube/deploy/core/storage/rook-ceph/cluster/ks.yaml
index 9c69bc2a..ac225921 100644
--- a/kube/deploy/core/storage/rook-ceph/cluster/ks.yaml
+++ b/kube/deploy/core/storage/rook-ceph/cluster/ks.yaml
@@ -2,16 +2,19 @@
 apiVersion: kustomize.toolkit.fluxcd.io/v1
 kind: Kustomization
 metadata:
-  name: 1-core-storage-rook-ceph-cluster
+  name: "1-core-storage-rook-ceph-cluster"
   namespace: flux-system
+  labels:
+    prune.flux.home.arpa/disabled: "true"
 spec:
   wait: true
-  path: ./kube/deploy/core/storage/rook-ceph/cluster/app
+  prune: false
+  path: "./kube/deploy/core/storage/rook-ceph/cluster/${CLUSTER_NAME:=biohazard}"
   dependsOn:
-    - name: 0-${CLUSTER_NAME}-config
+    - name: "0-${CLUSTER_NAME}-config"
     - name: 1-core-storage-rook-ceph-app
   healthChecks:
     - apiVersion: ceph.rook.io/v1
       kind: CephCluster
-      name: "${CLUSTER_NAME}"
-      namespace: rook-ceph
\ No newline at end of file
+      name: "${CLUSTER_NAME:=biohazard}"
+      namespace: rook-ceph
diff --git a/kube/deploy/core/storage/rook-ceph/cluster/sinon/hr.yaml b/kube/deploy/core/storage/rook-ceph/cluster/sinon/hr.yaml
new file mode 100644
index 00000000..b34f08b1
--- /dev/null
+++ b/kube/deploy/core/storage/rook-ceph/cluster/sinon/hr.yaml
@@ -0,0 +1,323 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2beta2
+kind: HelmRelease
+metadata:
+  #name: "rook-ceph-cluster-${CLUSTER_NAME:=biohazard}"
+  name: "rook-ceph-cluster-sinon" # TODO: envsubst in HR currently breaks flux-local, hardcode cluster name for now since not using Ceph in multi k8s clusters as of this commit
+  namespace: rook-ceph
+  labels:
+    helm.flux.home.arpa/default: "false"
+    kustomize.toolkit.fluxcd.io/prune: Disabled
+spec:
+  interval: 5m
+  timeout: 1h
+  maxHistory: 10
+  install:
+    crds: CreateReplace
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    crds: CreateReplace
+    cleanupOnFail: true
+    remediation:
+      retries: 5
+      strategy: rollback
+  rollback:
+    recreate: true
+    cleanupOnFail: true
+  uninstall:
+    keepHistory: false
+  chart:
+    spec:
+      chart: rook-ceph-cluster
+      version: "v1.13.4"
+      sourceRef:
+        name: rook-ceph
+        kind: HelmRepository
+        namespace: flux-system
+  values:
+    clusterName: "sinon"
+    operatorNamespace: "rook-ceph"
+    configOverride: |
+      [global]
+      bdev_enable_discard = true
+      osd_class_update_on_start = false
+      osd_pool_default_size = 2
+      osd_pool_default_min_size = 2
+      mon_data_avail_warn = 10
+      #bdev_async_discard = true
+    cephClusterSpec:
+      cephVersion:
+        image: "quay.io/ceph/ceph:v18.2.2" # TODO: temp rolled back to 18.2.0 for Prom metrics, 18.2.1 breaks some metrics
+      network:
+        provider: host
+        addressRanges:
+          public: ["127.0.0.0/8"]
+        connections:
+          requireMsgr2: true
+          encryption:
+            enabled: false # TODO: 2023-11-23 disabled to test speeds with compression without encryption
+          compression:
+            enabled: false
+      crashCollector:
+        disable: true
+      dashboard:
+        enabled: true
+        urlPrefix: "/"
+        ssl: false
+        prometheusEndpoint: "http://vmsingle-victoria.monitoring.svc.cluster.local:8429"
+      labels:
+        mgr:
+          ingress.home.arpa/nginx-internal: "allow"
+          prom.home.arpa/kps: "allow"
+          tailscale.com/expose: "true"
+          egress.home.arpa/kps: "allow"
+      mon:
+        count: 1
+        allowMultiplePerNode: false
+      mgr:
+        count: 1
+        allowMultiplePerNode: false
+        modules:
+          - name: "pg_autoscaler"
+            enabled: true
+          - name: "diskprediction_local"
+            enabled: true
+      removeOSDsIfOutAndSafeToRemove: false
+      storage:
+        useAllNodes: false
+        useAllDevices: false
+        config:
+          encryptedDevice: "true"
+          osdsPerDevice: "1"
+        nodes:
+          - name: "sinon"
+            devicePathFilter: &osd-wd-12tb "^/dev/disk/by-id/ata-WDC_WD120EFBX.*"
+            config:
+              deviceClass: "hdd"
+      placement:
+        all:
+          tolerations:
+            - key: "node-role.kubernetes.io/control-plane"
+              operator: "Exists"
+      resources:
+        mgr:
+          requests:
+            cpu: "150m"
+            memory: "512Mi"
+          limits:
+            cpu: "2000m"
+            memory: "4Gi"
+        mon:
+          requests:
+            cpu: "100m"
+            memory: "500Mi"
+          limits:
+            cpu: "2000m"
+            memory: "4Gi"
+        osd:
+          requests:
+            cpu: "500m"
+            memory: "3Gi"
+          limits:
+            # cpu: "999999m"
+            cpu: "2000m" # 2023-12-22: re-enabled CPU limits because 3 8th gen Intel CPUs aren't enough for the rest of my workloads apparently
+            memory: "10Gi"
+        mgr-sidecar:
+          requests:
+            cpu: "100m"
+            memory: "100Mi"
+          limits:
+            cpu: "1000m"
+            memory: "500Mi"
+        crashcollector:
+          requests:
+            cpu: "15m"
+            memory: "64Mi"
+          limits:
"64Mi" + logcollector: + requests: + cpu: "50m" + memory: "100Mi" + limits: + memory: "1Gi" + cleanup: + requests: + cpu: "250m" + memory: "100Mi" + limits: + memory: "1Gi" + exporter: + requests: + cpu: "50m" + memory: "50Mi" + limits: + cpu: "250m" + memory: "128Mi" + cephBlockPools: + #- name: "${CLUSTER_NAME}-block-k8s-ssd" + - name: "${CLUSTER_NAME}-block-k8s-hdd" + spec: + failureDomain: "host" + deviceClass: "hdd" + replicated: + size: 2 + parameters: + min_size: "2" + compression_mode: "aggressive" + compression_algorithm: "lz4" + storageClass: &rbd-sc + enabled: true + name: "block" + isDefault: true + reclaimPolicy: "Delete" + allowVolumeExpansion: true + mountOptions: ["discard"] + parameters: + imageFormat: "2" + imageFeatures: "layering,exclusive-lock,object-map,fast-diff,deep-flatten" # https://docs.ceph.com/en/quincy/rbd/rbd-config-ref/#image-features + csi.storage.k8s.io/provisioner-secret-name: "rook-csi-rbd-provisioner" + csi.storage.k8s.io/provisioner-secret-namespace: "rook-ceph" + csi.storage.k8s.io/controller-expand-secret-name: "rook-csi-rbd-provisioner" + csi.storage.k8s.io/controller-expand-secret-namespace: "rook-ceph" + csi.storage.k8s.io/node-stage-secret-name: "rook-csi-rbd-node" + csi.storage.k8s.io/node-stage-secret-namespace: "rook-ceph" + csi.storage.k8s.io/fstype: "ext4" + - name: "builtin-mgr" + spec: + name: ".mgr" # https://github.com/rook/rook/blob/bb4528d460db17909b3dec0b2f31374913345cd5/deploy/examples/pool-builtin-mgr.yaml#L11 + failureDomain: "host" + deviceClass: "ssd" + replicated: + size: 2 # https://github.com/rook/rook/discussions/13178 + requireSafeReplicaSize: true + parameters: + min_size: "2" + compression_mode: "none" + storageClass: + enabled: false # ain't nothin' touchin' the mgr pool lol + cephBlockPoolsVolumeSnapshotClass: + enabled: true + name: "block" + isDefault: true + deletionPolicy: "Delete" + cephFileSystems: + - &cephfs + name: &fs "${CLUSTER_NAME}-fs" + spec: &cephfsSpec + preserveFilesystemOnDelete: true + metadataPool: + failureDomain: "host" + #deviceClass: "ssd" + deviceClass: "hdd" + replicated: + size: 2 + dataPools: + - &cephfsData + name: &fsdata0 "${CLUSTER_NAME}-fs-data0" + failureDomain: "host" + #deviceClass: "ssd" + deviceClass: "hdd" + replicated: + size: 2 + parameters: + min_size: "2" + compression_mode: "aggressive" + compression_algorithm: "zstd" + metadataServer: + activeCount: 1 # 1 active 1 standby + activeStandby: true + resources: + requests: + cpu: 100m + memory: 2Gi + limits: + cpu: 2000m # 2 cores + memory: 4Gi + storageClass: &cephfsSC + enabled: true + isDefault: false + name: "file" + pool: *fsdata0 + reclaimPolicy: "Delete" + allowVolumeExpansion: true + # mountOptions: ["discard"] + parameters: + csi.storage.k8s.io/provisioner-secret-name: "rook-csi-cephfs-provisioner" + csi.storage.k8s.io/provisioner-secret-namespace: "rook-ceph" + csi.storage.k8s.io/controller-expand-secret-name: "rook-csi-cephfs-provisioner" + csi.storage.k8s.io/controller-expand-secret-namespace: "rook-ceph" + csi.storage.k8s.io/node-stage-secret-name: "rook-csi-cephfs-node" + csi.storage.k8s.io/node-stage-secret-namespace: "rook-ceph" + csi.storage.k8s.io/fstype: "ext4" + cephFileSystemVolumeSnapshotClass: + enabled: true + name: "file" + isDefault: false + deletionPolicy: Delete + cephObjectStores: + - name: &rgw "${CLUSTER_NAME}" + spec: + preservePoolsOnDelete: true + metadataPool: + failureDomain: "host" + #deviceClass: "ssd" + replicated: + size: 2 + dataPool: + failureDomain: "host" + #deviceClass: "ssd" + 
deviceClass: "hdd" + replicated: + size: 2 + healthCheck: + bucket: + interval: "60s" + gateway: + labels: # netpols + tailscale.com/expose: "true" + instances: 2 + hostNetwork: false + port: 6953 + securePort: 53443 + sslCertificateRef: "short-domain-tls" + priorityClassName: "system-cluster-critical" + resources: + requests: + cpu: 100m + limits: + memory: 2Gi + ingress: + enabled: true + ingressClassName: "nginx-internal" + annotations: + nginx.ingress.kubernetes.io/custom-http-errors: "404,502" + nginx.ingress.kubernetes.io/proxy-body-size: "0" + nginx.ingress.kubernetes.io/proxy-request-buffering: "off" + host: + name: &rgw-host "${APP_DNS_RGW_S3}" + path: "/" + tls: + - hosts: [*rgw-host] + storageClass: + enabled: true + name: "rgw-${CLUSTER_NAME}" + reclaimPolicy: "Delete" + volumeBindingMode: "Immediate" + parameters: + region: "us-east-1" + monitoring: + enabled: true + createPrometheusRules: true + ingress: + dashboard: + ingressClassName: "nginx-internal" + host: + name: &dashboard-host "${APP_DNS_CEPH}" + path: "/" + tls: + - hosts: + - *dashboard-host + pspEnable: false diff --git a/kube/deploy/core/storage/rook-ceph/cluster/sinon/netpol.yaml b/kube/deploy/core/storage/rook-ceph/cluster/sinon/netpol.yaml new file mode 100644 index 00000000..b1d11fbc --- /dev/null +++ b/kube/deploy/core/storage/rook-ceph/cluster/sinon/netpol.yaml @@ -0,0 +1,30 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/datreeio/CRDs-catalog/main/cilium.io/ciliumnetworkpolicy_v2.json +apiVersion: cilium.io/v2 +kind: CiliumNetworkPolicy +metadata: + name: rook-ceph-cluster + namespace: &app rook-ceph +spec: + endpointSelector: {} + ingress: + # same namespace + - fromEndpoints: + - matchLabels: + io.kubernetes.pod.namespace: *app + - fromEntities: + - "host" + - "remote-node" + - "kube-apiserver" + egress: + # same namespace + - toEndpoints: + - matchLabels: + io.kubernetes.pod.namespace: *app + # hostNet Ceph cluster + - toEntities: + - "host" + - "remote-node" + toPorts: + - ports: + - port: "3300" \ No newline at end of file diff --git a/kube/deploy/core/storage/rook-ceph/cluster/sinon/svc.yaml b/kube/deploy/core/storage/rook-ceph/cluster/sinon/svc.yaml new file mode 100644 index 00000000..e871f7ab --- /dev/null +++ b/kube/deploy/core/storage/rook-ceph/cluster/sinon/svc.yaml @@ -0,0 +1,25 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: rook-ceph-rgw-https + namespace: rook-ceph + annotations: + # this is for in-cluster apps only + io.cilium/internal: "true" + io.cilium/lb-ipam-ips: "${APP_IP_RGW_HTTPS}" + coredns.io/hostname: "${APP_DNS_RGW_HTTPS}" +spec: + type: LoadBalancer + selector: + rgw: "${CLUSTER_NAME}" + ports: + - name: https + port: 443 + targetPort: 53443 + protocol: TCP + - name: https-udp # if they ever implement QUIC or the like + port: 443 + targetPort: 53443 + protocol: UDP + allocateLoadBalancerNodePorts: false