diff --git a/cluster/apps/kustomization.yaml b/cluster/apps/kustomization.yaml
index 6ab248e3..ba93172b 100644
--- a/cluster/apps/kustomization.yaml
+++ b/cluster/apps/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
   - flux-system
   - kube-system
   - media
+  - monitoring
   - networking
   - security
   # - home
diff --git a/cluster/apps/monitoring/botkube/helm-release.yaml b/cluster/apps/monitoring/botkube/helm-release.yaml
deleted file mode 100644
index 42782dce..00000000
--- a/cluster/apps/monitoring/botkube/helm-release.yaml
+++ /dev/null
@@ -1,229 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
-  name: botkube
-  namespace: monitoring
-spec:
-  interval: 5m
-  chart:
-    spec:
-      # renovate: registryUrl=https://infracloudio.github.io/charts
-      chart: botkube
-      version: v0.12.3
-      sourceRef:
-        kind: HelmRepository
-        name: infracloudio-charts
-        namespace: flux-system
-      interval: 5m
-  values:
-    communications:
-      discord:
-        enabled: true
-        token: "${SECRET_BOTKUBE_DISCORD_TOKEN}"
-        botid: "842864233690431508"
-        channel: "842863758522974228"
-    serviceMonitor:
-      enabled: true
-    config:
-      recommendations: false
-      settings:
-        clustername: home-cluster
-      resources:
-        - name: v1/services
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: apps/v1/deployments
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - update
-            - delete
-            - error
-        - name: apps/v1/statefulsets
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - update
-            - delete
-            - error
-        - name: networking.k8s.io/v1beta1/ingresses
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/nodes
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/namespaces
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/persistentvolumes
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/persistentvolumeclaims
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: v1/configmaps
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: apps/v1/daemonsets
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: batch/v1/jobs
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - update
-            - error
-        - name: rbac.authorization.k8s.io/v1/rolebindings
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/clusterrolebindings
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
-        - name: rbac.authorization.k8s.io/v1/clusterroles
-          namespaces:
-            include:
-              - default
-              - dev
-              - home
-              - kube-system
-              - media
-              - monitoring
-              - networking
-          events:
-            - create
-            - delete
-            - error
diff --git a/cluster/apps/monitoring/grafana/helm-release.yaml b/cluster/apps/monitoring/grafana/helm-release.yaml
index edbfa34d..dbbc3a37 100644
--- a/cluster/apps/monitoring/grafana/helm-release.yaml
+++ b/cluster/apps/monitoring/grafana/helm-release.yaml
@@ -5,38 +5,37 @@ metadata:
   name: grafana
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://grafana.github.io/helm-charts
       chart: grafana
-      version: 6.17.10
+      version: 6.32.0
       sourceRef:
         kind: HelmRepository
         name: grafana-charts
         namespace: flux-system
-      interval: 5m
+      interval: 15m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
   values:
-    image:
-      repository: ghcr.io/k8s-at-home/grafana
-    replicas: 1
+    replicas: 3
     env:
       GF_EXPLORE_ENABLED: true
       GF_PANELS_DISABLE_SANITIZE_HTML: true
       GF_LOG_FILTERS: rendering:debug
-      GF_DATE_FORMATS_USE_BROWSER_LOCALE: true
       GF_DATE_FORMATS_FULL_DATE: "MMM Do, YYYY hh:mm:ss a"
-      GF_DATE_FORMATS_INTERVAL_SECOND: "hh:mm:ss a"
-      GF_DATE_FORMATS_INTERVAL_MINUTE: "hh:mm a"
-      GF_DATE_FORMATS_INTERVAL_HOUR: "DD/MM hh:mm a"
-      GF_DATE_FORMATS_INTERVAL_DAY: "DD/MM"
-      GF_DATE_FORMATS_INTERVAL_MONTH: "MM-YYYY"
-      GF_DATE_FORMATS_INTERVAL_YEAR: "YYYY"
-      VAR_BLOCKY_URL: "http://blocky.networking.svc.cluster.local:4000"
-    adminPassword: "${SECRET_GRAFANA_PASSWORD}"
+      GF_SECURITY_ALLOW_EMBEDDING: true
+      GF_SECURITY_COOKIE_SAMESITE: grafana
+    admin:
+      existingSecret: grafana-admin-creds
     grafana.ini:
       server:
-        root_url: "https://grafana.${SECRET_DOMAIN}"
+        root_url: "https://grafana.${SECRET_PUBLIC_DOMAIN}"
       paths:
         data: /var/lib/grafana/data
         logs: /var/log/grafana
@@ -81,6 +80,9 @@ spec:
             orgId: 1
       datasources.yaml:
         apiVersion: 1
+        deleteDatasources:
+          - name: Loki
+            orgId: 1
         datasources:
           - name: Prometheus
             type: prometheus
@@ -90,29 +92,9 @@ spec:
           - name: Loki
             type: loki
            access: proxy
-            url: http://loki:3100
+            url: http://loki-gateway:80
     dashboards:
       default:
-        # Ref: https://grafana.com/grafana/dashboards/12175
-        calico-felix:
-          gnetId: 12175
-          revision: 5
-          datasource: Prometheus
-        # Ref: https://grafana.com/grafana/dashboards/2842
-        ceph-cluster:
-          gnetId: 2842
-          revision: 14
-          datasource: Prometheus
-        # Ref: https://grafana.com/grafana/dashboards/5336
-        ceph-osd:
-          gnetId: 5336
-          revision: 5
-          datasource: Prometheus
-        # Ref: https://grafana.com/grafana/dashboards/5342
-        ceph-pools:
-          gnetId: 5342
-          revision: 5
-          datasource: Prometheus
         # Ref: https://grafana.com/grafana/dashboards/7845
         zfs:
           gnetId: 7845
@@ -144,18 +126,9 @@ spec:
         flux-control-plane:
           url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/grafana/dashboards/control-plane.json
           datasource: Prometheus
-        blocky:
-          url: https://raw.githubusercontent.com/0xERR0R/blocky/master/docs/blocky-grafana.json
-          datasource: Prometheus
         apc-smart-ups-1500:
           url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/apc-smart-ups-1500-snmp.json
           datasource: Prometheus
-        lidarr:
-          url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/lidarr.json
-        radarr:
-          url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/radarr.json
-        sonarr:
-          url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/sonarr.json
     sidecar:
       dashboards:
         enabled: true
@@ -174,16 +147,31 @@ spec:
         enabled: true
     ingress:
       enabled: true
-      ingressClassName: "traefik"
+      ingressClassName: "nginx"
       annotations:
-        traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
+        nginx.ingress.kubernetes.io/affinity: "cookie"
+        nginx.ingress.kubernetes.io/session-cookie-name: "grafana"
+        nginx.ingress.kubernetes.io/session-cookie-expires: "172800"
+        nginx.ingress.kubernetes.io/session-cookie-max-age: "172800"
       hosts:
-        - "grafana.${SECRET_DOMAIN}"
+        - &host "grafana.${SECRET_PUBLIC_DOMAIN}"
       tls:
         - hosts:
-            - "grafana.${SECRET_DOMAIN}"
+            - *host
     serviceAccount:
       create: true
       autoMount: true
     persistence:
       enabled: false
+    affinity:
+      podAntiAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                  - key: app.kubernetes.io/name
+                    operator: In
+                    values:
+                      - grafana
+              topologyKey: kubernetes.io/hostname
diff --git a/cluster/apps/monitoring/grafana/kustomization.yaml b/cluster/apps/monitoring/grafana/kustomization.yaml
index 761d2252..2d4a1b6e 100644
--- a/cluster/apps/monitoring/grafana/kustomization.yaml
+++ b/cluster/apps/monitoring/grafana/kustomization.yaml
@@ -1,4 +1,5 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- helm-release.yaml
+  - secret.sops.yaml
+  - helm-release.yaml
diff --git a/cluster/apps/monitoring/grafana/secret.sops.yaml b/cluster/apps/monitoring/grafana/secret.sops.yaml
new file mode 100644
index 00000000..f21a927c
--- /dev/null
+++ b/cluster/apps/monitoring/grafana/secret.sops.yaml
@@ -0,0 +1,29 @@
+# yamllint disable
+apiVersion: v1
+kind: Secret
+metadata:
+  name: grafana-admin-creds
+  namespace: monitoring
+stringData:
+  admin-user: ENC[AES256_GCM,data:YEpVd78=,iv:Ed/WWBj61OyoqGgcx3mCCK/fJj020oj4h+sHyt6eeR0=,tag:DbTj3rfV/nxnaFOfopoAmA==,type:str]
+  admin-password: ENC[AES256_GCM,data:zYt9S6FPcCTtoukltlkQnw==,iv:1jSk25EJ6tDOnRXJOL2weht10S1zJ3/SVMUN7HfW61A=,tag:R2TW5PB4iUuvl4XpUglrIQ==,type:str]
+sops:
+  kms: []
+  gcp_kms: []
+  azure_kv: []
+  hc_vault: []
+  age:
+    - recipient: age1nfn3vxpsgm49ljgs8kxevga9makhh9aply6ddgf9wplsfuwpcv2qzmqatc
+      enc: |
+        -----BEGIN AGE ENCRYPTED FILE-----
+        YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBWZ0VxRm81RkttajQvZ2c3
+        SktZTDZ6MUZFeUZXQVFGKy9KQmFvQjhCVG1NCnM4dUo0Ym0remNsOTN3SWZUMTMx
+        UmE3TkhoS1c1UGN3QklXdCtWWFNqOTQKLS0tIGxXbHNVTUxtdXNMSWJnOWo0QjVr
+        QXZMSUxFNUdaWHI2UkpOVjBxa1FHVGcK32kLlwinxQcTEbWRBhc5ju2e3ORnFxeo
+        2iDkIZepjjxd9Rwkak3o/+YGtBJa0Elvw07bpvvmsO20GZtkVCKIIg==
+        -----END AGE ENCRYPTED FILE-----
+  lastmodified: "2022-07-04T15:09:39Z"
+  mac: ENC[AES256_GCM,data:jQvHQowQatizYBWcVWd0GBmxVjWKri21h6lnhhql5nViTNpKns2J/r2h4bDkds4BLW41voQyCG9mwN/cnBrjOOBhIp55eEzgw+Jq2x5I980FvDBdRq+d5PO+5A/DCjphB1HSRyLd4mkpGS9qX6c1ieOzW2hUm797OZVf2Ov6Vvs=,iv:CYiiR7UKftaRdCiH4e+1xgqleQzgzdFZpT5tLzmZpQ4=,tag:JJR5EDNOQxX2NHGg25n4lg==,type:str]
+  pgp: []
+  encrypted_regex: ^(data|stringData)$
+  version: 3.7.3
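[Note] The Grafana chart reads the admin credentials from this secret via the `admin.existingSecret` value set above. Assuming the Flux Kustomization for this path has SOPS/age decryption enabled, the applied object has roughly this shape (placeholder values; `admin-user`/`admin-password` are the chart's default `admin.userKey`/`admin.passwordKey` lookup keys):

    ---
    apiVersion: v1
    kind: Secret
    metadata:
      name: grafana-admin-creds
      namespace: monitoring
    stringData:
      admin-user: <username>      # read via admin.userKey (chart default)
      admin-password: <password>  # read via admin.passwordKey (chart default)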
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml b/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
index c40003fa..a51f4f47 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
+++ b/cluster/apps/monitoring/kube-prometheus-stack/helm-release.yaml
@@ -5,18 +5,23 @@ metadata:
   name: kube-prometheus-stack
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://prometheus-community.github.io/helm-charts
       chart: kube-prometheus-stack
-      version: 18.1.1
+      version: 36.2.1
       sourceRef:
         kind: HelmRepository
         name: prometheus-community-charts
         namespace: flux-system
-      interval: 5m
-  timeout: 20m
+      interval: 15m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
   values:
     alertmanager:
       config:
@@ -66,22 +71,24 @@ spec:
       ingress:
         enabled: true
         pathType: Prefix
-        ingressClassName: "traefik"
+        ingressClassName: "nginx"
         annotations:
           traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
         hosts:
-          - "alert-manager.${SECRET_DOMAIN}"
+          - &host "alert-manager.${SECRET_PUBLIC_DOMAIN}"
         tls:
           - hosts:
-              - "alert-manager.${SECRET_DOMAIN}"
+              - *host
       alertmanagerSpec:
+        replicas: 3
+        podAntiAffinity: hard
         storage:
           volumeClaimTemplate:
             spec:
               storageClassName: "ceph-block"
               resources:
                 requests:
-                  storage: 10Gi
+                  storage: 250Mi
     nodeExporter:
       serviceMonitor:
         relabelings:
@@ -91,6 +98,19 @@ spec:
           sourceLabels:
             - __meta_kubernetes_pod_node_name
           targetLabel: kubernetes_node
+    kube-state-metrics:
+      metricLabelsAllowlist:
+        - "persistentvolumeclaims=[*]"
+      prometheus:
+        monitor:
+          enabled: true
+          relabelings:
+            - action: replace
+              regex: (.*)
+              replacement: $1
+              sourceLabels:
+                - __meta_kubernetes_pod_node_name
+              targetLabel: kubernetes_node
     grafana:
       enabled: false
       forceDeployDashboards: true
@@ -110,59 +130,55 @@ spec:
     kubeApiServer:
       enabled: true
     kubeControllerManager:
-      enabled: true
-      endpoints:
-        - 10.75.40.10
-        - 10.75.40.11
-        - 10.75.40.12
+      enabled: false
     kubeScheduler:
-      enabled: true
-      endpoints:
-        - 10.75.40.10
-        - 10.75.40.11
-        - 10.75.40.12
+      enabled: false
     kubeProxy:
-      enabled: true
-      endpoints:
-        - 10.75.40.10
-        - 10.75.40.11
-        - 10.75.40.12
+      enabled: false
     kubeEtcd:
-      enabled: true
-      endpoints:
-        - 10.75.40.10
-        - 10.75.40.11
-        - 10.75.40.12
-      service:
-        enabled: true
-        port: 2381
-        targetPort: 2381
+      enabled: false
     prometheus:
       ingress:
         enabled: true
         pathType: Prefix
-        ingressClassName: "traefik"
-        annotations:
-          traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
+        ingressClassName: "nginx"
         hosts:
-          - "prometheus.${SECRET_DOMAIN}"
+          - &host "prometheus.${SECRET_DOMAIN}"
         tls:
          - hosts:
-              - "prometheus.${SECRET_DOMAIN}"
+              - *host
+      thanosService:
+        enabled: true
+      thanosServiceMonitor:
+        enabled: true
+      thanosIngress:
+        enabled: true
+        pathType: Prefix
+        ingressClassName: "nginx"
+        annotations:
+          nginx.ingress.kubernetes.io/ssl-redirect: "true"
+          nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
+        hosts:
+          - &host "thanos-sidecar.${SECRET_PUBLIC_DOMAIN}"
+        tls:
+          - hosts:
+              - *host
       resources:
         requests:
           cpu: 1000m
           memory: 1000Mi
-      thanosService:
-        enabled: true
       prometheusSpec:
-        replicas: 1
-        replicaExternalLabelName: "replica"
+        replicas: 3
+        replicaExternalLabelName: __replica__
+        externalLabels:
+          cluster: ${CLUSTER_NAME}
+        podAntiAffinity: hard
+        retentionSize: "6GB"
         ruleSelectorNilUsesHelmValues: false
         serviceMonitorSelectorNilUsesHelmValues: false
         podMonitorSelectorNilUsesHelmValues: false
         probeSelectorNilUsesHelmValues: false
-        retention: 6h
+        retention: 2d
         enableAdminAPI: true
         walCompression: true
         storageSpec:
@@ -173,13 +189,16 @@ spec:
               requests:
                 storage: 10Gi
         thanos:
-          image: quay.io/thanos/thanos:v0.23.1
-          version: v0.22.0
+          image: quay.io/thanos/thanos:v0.26.0
+          # renovate: datasource=docker depName=quay.io/thanos/thanos
+          version: "v0.26.0"
          objectStorageConfig:
-            name: thanos-objstore
+            name: thanos-objstore-secret
             key: objstore.yml
         additionalScrapeConfigs:
           - job_name: 'node-exporter'
+            scrape_interval: 1m
+            scrape_timeout: 10s
             honor_timestamps: true
             static_configs:
               - targets:
@@ -188,12 +207,16 @@ spec:
                   # franxx nas
                   - 10.75.30.15:9100
           - job_name: 'minio-job'
+            scrape_interval: 1m
+            scrape_timeout: 10s
            honor_timestamps: true
            metrics_path: /minio/v2/metrics/cluster
            static_configs:
              - targets:
                  - 10.75.30.15:9000
           - job_name: "coredns"
+            scrape_interval: 1m
+            scrape_timeout: 10s
             honor_timestamps: true
             static_configs:
               - targets:
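[Note] `objectStorageConfig` now points at `thanos-objstore-secret`, which this PR does not create (the old SOPS-managed `thanos-objstore` secret is deleted below); it is assumed to be generated elsewhere from the ObjectBucketClaim credentials. For reference, the `objstore.yml` document the Thanos sidecar expects inside that secret follows the upstream S3 client format (placeholder values):

    type: s3
    config:
      bucket: <bucket name>
      endpoint: <host:port of the S3/RGW endpoint>
      access_key: <access key id>
      secret_key: <secret access key>
      insecure: true   # plain HTTP inside the cluster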
diff --git a/cluster/apps/monitoring/kube-prometheus-stack/kustomization.yaml b/cluster/apps/monitoring/kube-prometheus-stack/kustomization.yaml
index 761d2252..34a8531c 100644
--- a/cluster/apps/monitoring/kube-prometheus-stack/kustomization.yaml
+++ b/cluster/apps/monitoring/kube-prometheus-stack/kustomization.yaml
@@ -1,4 +1,4 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- helm-release.yaml
+  - helm-release.yaml
diff --git a/cluster/apps/monitoring/kustomization.yaml b/cluster/apps/monitoring/kustomization.yaml
index e70feaf2..4e5e5344 100644
--- a/cluster/apps/monitoring/kustomization.yaml
+++ b/cluster/apps/monitoring/kustomization.yaml
@@ -2,14 +2,14 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- botkube
+- namespace.yaml
 - grafana
 - kube-prometheus-stack
 - loki
-- promtail
 # - snmp-exporter-switch-0
-- snmp-exporter-ups-0
-- speedtest-exporter
+#- snmp-exporter-ups-0
+#- speedtest-exporter
 - thanos
+- vector
 #- unifi-poller
-- uptimerobot-heartbeat
+#- uptimerobot-heartbeat
diff --git a/cluster/apps/monitoring/loki/config-map.yaml b/cluster/apps/monitoring/loki/config-map.yaml
new file mode 100644
index 00000000..f3d31e95
--- /dev/null
+++ b/cluster/apps/monitoring/loki/config-map.yaml
@@ -0,0 +1,47 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki-alerting-rules
+  namespace: monitoring
+data:
+  loki-alerting-rules.yaml: |-
+    groups:
+      #
+      # SMART Failures
+      #
+      - name: smart-failure
+        rules:
+          - alert: SmartFailures
+            expr: |
+              sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
+            for: 2m
+            labels:
+              severity: critical
+              category: logs
+            annotations:
+              summary: "SMART has reported failures on host {{ $labels.hostname }}"
+      #
+      # *arr
+      #
+      - name: arr
+        rules:
+          - alert: ArrDatabaseIsLocked
+            expr: |
+              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0
+            for: 2m
+            labels:
+              severity: critical
+              category: logs
+            annotations:
+              summary: "{{ $labels.app }} is experiencing locked database issues"
+          - alert: ArrDatabaseIsMalformed
+            expr: |
+              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0
+            for: 2m
+            labels:
+              severity: critical
+              category: logs
+            annotations:
+              summary: "{{ $labels.app }} is experiencing malformed database disk image issues"
+
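[Note] Anatomy of the LogQL in these ruler expressions, for future tweaking. Each alert counts matching log lines per stream over a 2m window and fires while the count stays above zero:

    # {hostname=~".+"}                            stream selector: any host shipping journald logs
    # | json                                      parse the journald fields out of the JSON payload
    # | _SYSTEMD_UNIT = "smartmontools.service"   label filter: keep only smartd output
    # !~ "(?i)..."                                drop known-benign lines (self-test OK, Prefailure attrs)
    # |~ "(?i)(error|fail)"                       keep lines that still look like failures
    # count_over_time(...[2m])                    matching-line count per stream in the window
    # sum by (hostname) (...) > 0                 aggregate so one alert fires per host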
diff --git a/cluster/apps/monitoring/loki/helm-release.yaml b/cluster/apps/monitoring/loki/helm-release.yaml
index 3e43861e..f9f11e2c 100644
--- a/cluster/apps/monitoring/loki/helm-release.yaml
+++ b/cluster/apps/monitoring/loki/helm-release.yaml
@@ -5,33 +5,148 @@ metadata:
   name: loki
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://grafana.github.io/helm-charts
-      chart: loki
-      version: 2.8.1
+      chart: loki-simple-scalable
+      version: 1.4.3
       sourceRef:
         kind: HelmRepository
         name: grafana-charts
         namespace: flux-system
-      interval: 5m
+      interval: 15m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
   values:
-    image:
-      repository: ghcr.io/k8s-at-home/loki
-    config:
-      storage_config:
-        aws:
-          bucketnames: loki
-          endpoint: nas01:9000
-          access_key_id: "${SECRET_MINIO_ACCESS_KEY}"
-          secret_access_key: "${SECRET_MINIO_SECRET_KEY}"
-          s3forcepathstyle: true
-          insecure: true
-        boltdb_shipper:
-          active_index_directory: /data/loki/index
-          cache_location: /data/loki/index_cache
-          resync_interval: 5s
+    loki:
+      structuredConfig:
+        auth_enabled: false
+        server:
+          log_level: info
+          http_listen_port: 3100
+          grpc_listen_port: 9095
+        memberlist:
+          join_members: ["loki-memberlist"]
+        limits_config:
+          retention_period: 14d
+          enforce_metric_name: false
+          reject_old_samples: true
+          reject_old_samples_max_age: 168h
+          max_cache_freshness_per_query: 10m
+          split_queries_by_interval: 15m
+          ingestion_rate_mb: 8
+          ingestion_burst_size_mb: 16
+        schema_config:
+          configs:
+            - from: "2021-08-01"
+              store: boltdb-shipper
+              object_store: s3
+              schema: v11
+              index:
+                prefix: loki_index_
+                period: 24h
+        common:
+          path_prefix: /var/loki
+          replication_factor: 3
+          storage:
+            s3:
+              s3: null
+              insecure: true
+              s3forcepathstyle: true
+          ring:
+            kvstore:
+              store: memberlist
+        ruler:
+          enable_api: true
+          enable_alertmanager_v2: true
+          alertmanager_url: http://kube-prometheus-stack-alertmanager:9093
+          storage:
+            type: local
+            local:
+              directory: /rules
+          rule_path: /tmp/scratch
+          ring:
+            kvstore:
+              store: memberlist
+        distributor:
+          ring:
+            kvstore:
+              store: memberlist
+        compactor:
+          working_directory: /var/loki/boltdb-shipper-compactor
           shared_store: s3
+          compaction_interval: 10m
+          retention_enabled: true
+          retention_delete_delay: 2h
+          retention_delete_worker_count: 150
+        ingester:
+          max_chunk_age: 1h
+          lifecycler:
+            ring:
+              kvstore:
+                store: memberlist
+        analytics:
+          reporting_enabled: false
+    gateway:
+      enabled: true
+      replicas: 3
+      ingress:
+        enabled: true
+        ingressClassName: "nginx"
+        hosts:
+          - host: &host "loki.${SECRET_PUBLIC_DOMAIN}"
+            paths:
+              - path: /
+                pathType: Prefix
+        tls:
+          - hosts:
+              - *host
+    write:
+      replicas: 3
+      persistence:
+        size: 10Gi
+        storageClass: ceph-block
+    read:
+      replicas: 3
+      extraVolumeMounts:
+        - name: loki-rules
+          mountPath: /rules/fake
+        - name: loki-rules-tmp
+          mountPath: /tmp/scratch
+        - name: loki-tmp
+          mountPath: /tmp/loki-tmp
+      extraVolumes:
+        - name: loki-rules
+          configMap:
+            name: loki-alerting-rules
+        - name: loki-rules-tmp
+          emptyDir: {}
+        - name: loki-tmp
+          emptyDir: {}
+      persistence:
+        size: 10Gi
+        storageClass: ceph-block
     serviceMonitor:
       enabled: true
+  valuesFrom:
+    - targetPath: loki.structuredConfig.common.storage.s3.bucketnames
+      kind: ConfigMap
+      name: loki-chunks-bucket-v1
+      valuesKey: BUCKET_NAME
+    - targetPath: loki.structuredConfig.common.storage.s3.endpoint
+      kind: ConfigMap
+      name: loki-chunks-bucket-v1
+      valuesKey: BUCKET_HOST
+    - targetPath: loki.structuredConfig.common.storage.s3.access_key_id
+      kind: Secret
+      name: loki-chunks-bucket-v1
+      valuesKey: AWS_ACCESS_KEY_ID
+    - targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
+      kind: Secret
+      name: loki-chunks-bucket-v1
+      valuesKey: AWS_SECRET_ACCESS_KEY
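[Note] The `valuesFrom` entries splice the bucket coordinates into the chart values at reconcile time, so no object-store credentials live in git. The ObjectBucketClaim added below emits a ConfigMap and a Secret, both named `loki-chunks-bucket-v1` (standard keys for the lib-bucket-provisioner used by Rook-Ceph, which the `ceph-bucket` storage class suggests). After substitution the effective storage block looks roughly like this (placeholders shown):

    loki:
      structuredConfig:
        common:
          storage:
            s3:
              s3: null
              insecure: true
              s3forcepathstyle: true
              bucketnames: loki-chunks-v1    # <- ConfigMap BUCKET_NAME
              endpoint: <rgw service host>   # <- ConfigMap BUCKET_HOST
              access_key_id: <key id>        # <- Secret AWS_ACCESS_KEY_ID
              secret_access_key: <secret>    # <- Secret AWS_SECRET_ACCESS_KEY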
diff --git a/cluster/apps/monitoring/loki/kustomization.yaml b/cluster/apps/monitoring/loki/kustomization.yaml
index d3bd5723..ebcaa309 100644
--- a/cluster/apps/monitoring/loki/kustomization.yaml
+++ b/cluster/apps/monitoring/loki/kustomization.yaml
@@ -1,5 +1,7 @@
+---
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- helm-release.yaml
-- prometheus-rule.yaml
+  - object-bucket-claim.yaml
+  - config-map.yaml
+  - helm-release.yaml
\ No newline at end of file
diff --git a/cluster/apps/monitoring/loki/object-bucket-claim.yaml b/cluster/apps/monitoring/loki/object-bucket-claim.yaml
new file mode 100644
index 00000000..6af8cae9
--- /dev/null
+++ b/cluster/apps/monitoring/loki/object-bucket-claim.yaml
@@ -0,0 +1,9 @@
+---
+apiVersion: objectbucket.io/v1alpha1
+kind: ObjectBucketClaim
+metadata:
+  name: loki-chunks-bucket-v1
+  namespace: monitoring
+spec:
+  bucketName: loki-chunks-v1
+  storageClassName: ceph-bucket
\ No newline at end of file
diff --git a/cluster/apps/monitoring/loki/prometheus-rule.yaml b/cluster/apps/monitoring/loki/prometheus-rule.yaml
deleted file mode 100644
index 707c248c..00000000
--- a/cluster/apps/monitoring/loki/prometheus-rule.yaml
+++ /dev/null
@@ -1,109 +0,0 @@
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  name: loki.rules
-  namespace: monitoring
-spec:
-  groups:
-    - name: loki.rules
-      rules:
-        - alert: LokiRequestErrors
-          annotations:
-            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
-          expr: |
-            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
-              > 10
-          for: 15m
-          labels:
-            severity: critical
-        - alert: LokiRequestPanics
-          annotations:
-            message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
-          expr: |
-            sum(increase(loki_panic_total[10m])) by (namespace, job)
-              > 0
-          labels:
-            severity: critical
-        - alert: LokiRequestLatency
-          annotations:
-            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
-          expr: |
-            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
-              > 1
-          for: 15m
-          labels:
-            severity: critical
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job))
-          record: job:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job))
-          record: job:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
-          record: job:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job)
-          record: job:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
-          record: job:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
-          record: job:loki_request_duration_seconds_count:sum_rate
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job, route))
-          record: job_route:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job, route))
-          record: job_route:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, job, route)
-          record: job_route:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m]))
-            by (job, route)
-          record: job_route:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
-          record: job_route:loki_request_duration_seconds_count:sum_rate
-        - expr: |
-            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, namespace, job, route))
-          record: namespace_job_route:loki_request_duration_seconds:99quantile
-        - expr: |
-            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, namespace, job, route))
-          record: namespace_job_route:loki_request_duration_seconds:50quantile
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
-              /
-            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds:avg
-        - expr: |
-            sum(rate(loki_request_duration_seconds_bucket[1m]))
-            by (le, namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_sum[1m]))
-            by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
-        - expr: |
-            sum(rate(loki_request_duration_seconds_count[1m]))
-            by (namespace, job, route)
-          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
diff --git a/cluster/apps/monitoring/namespace.yaml b/cluster/apps/monitoring/namespace.yaml
new file mode 100644
index 00000000..8315a7ed
--- /dev/null
+++ b/cluster/apps/monitoring/namespace.yaml
@@ -0,0 +1,8 @@
+---
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring
+  labels:
+    kustomize.toolkit.fluxcd.io/prune: disabled
+    goldilocks.fairwinds.com/enabled: "true"
\ No newline at end of file
diff --git a/cluster/apps/monitoring/thanos/dashboard/kustomization.yaml b/cluster/apps/monitoring/thanos/dashboard/kustomization.yaml
new file mode 100644
index 00000000..ba0731af
--- /dev/null
+++ b/cluster/apps/monitoring/thanos/dashboard/kustomization.yaml
@@ -0,0 +1,38 @@
+---
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+namespace: monitoring
+configMapGenerator:
+  - name: thanos-bucket-replicate-dashboard
+    files:
+      - thanos-bucket-replicate-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/bucket-replicate.json
+  - name: thanos-compact-dashboard
+    files:
+      - thanos-compact-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/compact.json
+  - name: thanos-overview-dashboard
+    files:
+      - thanos-overview-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/overview.json
+  - name: thanos-query-dashboard
+    files:
+      - thanos-query-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/query.json
+  - name: thanos-query-frontend-dashboard
+    files:
+      - thanos-query-frontend-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/query-frontend.json
+  - name: thanos-receive-dashboard
+    files:
+      - thanos-receive-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/receive.json
+  - name: thanos-rule-dashboard
+    files:
+      - thanos-rule-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/rule.json
+  - name: thanos-sidecar-dashboard
+    files:
+      - thanos-sidecar-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/sidecar.json
+  - name: thanos-store-dashboard
+    files:
+      - thanos-store-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/store.json
+generatorOptions:
+  disableNameSuffixHash: true
+  annotations:
+    kustomize.toolkit.fluxcd.io/substitute: disabled
+  labels:
+    grafana_dashboard: "true"
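[Note] Two details in this generator matter for how the dashboards are consumed. `disableNameSuffixHash: true` keeps the generated ConfigMap names stable, and the `grafana_dashboard: "true"` label is what the Grafana sidecar (enabled in the grafana HelmRelease above) watches to auto-import them. The `kustomize.toolkit.fluxcd.io/substitute: disabled` annotation keeps Flux's post-build variable substitution from touching `${...}` tokens that occur naturally inside the upstream dashboard JSON.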
diff --git a/cluster/apps/monitoring/thanos/helm-release.yaml b/cluster/apps/monitoring/thanos/helm-release.yaml
index dbb55e29..1c3aacc9 100644
--- a/cluster/apps/monitoring/thanos/helm-release.yaml
+++ b/cluster/apps/monitoring/thanos/helm-release.yaml
@@ -5,49 +5,148 @@ metadata:
   name: thanos
   namespace: monitoring
 spec:
-  interval: 5m
+  interval: 15m
   chart:
     spec:
-      # renovate: registryUrl=https://charts.bitnami.com/bitnami
       chart: thanos
-      version: 8.1.2
+      version: 10.5.4
       sourceRef:
         kind: HelmRepository
         name: bitnami-charts
         namespace: flux-system
-      interval: 5m
+      interval: 15m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
+  dependsOn:
+    - name: kube-prometheus-stack
+      namespace: monitoring
   values:
+    objstoreConfig:
+      type: s3
+      config:
+        insecure: true
     query:
       enabled: true
+      replicaCount: 3
       replicaLabels:
-        - replica
+        - __replica__
       dnsDiscovery:
         sidecarsService: kube-prometheus-stack-thanos-discovery
         sidecarsNamespace: monitoring
       ingress:
         enabled: true
-        hostname: "thanos.${SECRET_DOMAIN}"
-        ingressClassName: "traefik"
-        annotations:
-          traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
+        hostname: &host "thanos.${SECRET_PUBLIC_DOMAIN}"
+        ingressClassName: "nginx"
         tls: true
-    existingObjstoreSecret: thanos-objstore
+        extraTls:
+          - hosts:
+              - *host
     queryFrontend:
       enabled: false
     bucketweb:
       enabled: true
+      replicaCount: 3
+      podAntiAffinityPreset: hard
+      ingress:
+        enabled: true
+        hostname: &host "thanos-bucketweb.${SECRET_PUBLIC_DOMAIN}"
+        ingressClassName: "nginx"
+        tls: true
+        extraTls:
+          - hosts:
+              - *host
     compactor:
       enabled: true
-      persistence:
-        enabled: false
-    storegateway:
-      enabled: true
+      extraFlags:
+        - "--compact.concurrency"
+        - "4"
+      retentionResolutionRaw: 14d
+      retentionResolution5m: 14d
+      retentionResolution1h: 14d
+      ingress:
+        enabled: true
+        hostname: &host "thanos-compactor.${SECRET_PUBLIC_DOMAIN}"
+        ingressClassName: "nginx"
+        tls: true
+        extraTls:
+          - hosts:
+              - *host
       persistence:
         enabled: true
-        size: 5Gi
+        storageClass: "ceph-block"
+        size: 50Gi
+    storegateway:
+      enabled: true
+      replicaCount: 3
+      podAntiAffinityPreset: hard
+      ingress:
+        enabled: true
+        hostname: &host "thanos-storegateway.${SECRET_PUBLIC_DOMAIN}"
+        ingressClassName: "nginx"
+        tls: true
+        extraTls:
+          - hosts:
+              - *host
+      persistence:
+        enabled: true
+        storageClass: "ceph-block"
+        size: 20Gi
     ruler:
-      enabled: false
+      enabled: true
+      replicaCount: 3
+      podAntiAffinityPreset: hard
+      dnsDiscovery:
+        enabled: true
+      alertmanagers:
+        - http://kube-prometheus-stack-alertmanager.monitoring:9093
+      clusterName: ${CLUSTER_NAME}
+      extraFlags:
+        - "--web.prefix-header=X-Forwarded-Prefix"
+      config: |-
+        groups:
+          - name: "PrometheusWatcher"
+            rules:
+              - alert: "PrometheusDown"
+                expr: absent(up{job="kube-prometheus-stack-prometheus"})
+      ingress:
+        enabled: true
+        hostname: &host "thanos-ruler.${SECRET_PUBLIC_DOMAIN}"
+        ingressClassName: "nginx"
+        tls: true
+        extraTls:
+          - hosts:
+              - *host
+      persistence:
+        enabled: true
+        storageClass: "ceph-block"
+        size: 20Gi
     metrics:
       enabled: true
       serviceMonitor:
         enabled: true
+  valuesFrom:
+    - targetPath: objstoreConfig.config.bucket
+      kind: ConfigMap
+      name: thanos-ceph-bucket-v1
+      valuesKey: BUCKET_NAME
+    - targetPath: objstoreConfig.config.endpoint
+      kind: ConfigMap
+      name: thanos-ceph-bucket-v1
+      valuesKey: BUCKET_HOST
+    - targetPath: objstoreConfig.config.region
+      kind: ConfigMap
+      name: thanos-ceph-bucket-v1
+      valuesKey: BUCKET_REGION
+    - targetPath: objstoreConfig.config.access_key
+      kind: Secret
+      name: thanos-ceph-bucket-v1
+      valuesKey: AWS_ACCESS_KEY_ID
+    - targetPath: objstoreConfig.config.secret_key
+      kind: Secret
+      name: thanos-ceph-bucket-v1
+      valuesKey: AWS_SECRET_ACCESS_KEY
\ No newline at end of file
diff --git a/cluster/apps/monitoring/thanos/kustomization.yaml b/cluster/apps/monitoring/thanos/kustomization.yaml
index 545e5413..3db2f6c7 100644
--- a/cluster/apps/monitoring/thanos/kustomization.yaml
+++ b/cluster/apps/monitoring/thanos/kustomization.yaml
@@ -2,5 +2,6 @@
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-  - secret.sops.yaml
+  - dashboard
+  - object-bucket-claim.yaml
   - helm-release.yaml
diff --git a/cluster/apps/monitoring/thanos/object-bucket-claim.yaml b/cluster/apps/monitoring/thanos/object-bucket-claim.yaml
new file mode 100644
index 00000000..6cf4a8c6
--- /dev/null
+++ b/cluster/apps/monitoring/thanos/object-bucket-claim.yaml
@@ -0,0 +1,9 @@
+---
+apiVersion: objectbucket.io/v1alpha1
+kind: ObjectBucketClaim
+metadata:
+  name: thanos-ceph-bucket-v1
+  namespace: monitoring
+spec:
+  bucketName: thanos-v1
+  storageClassName: ceph-bucket
\ No newline at end of file
diff --git a/cluster/apps/monitoring/thanos/secret.sops.yaml b/cluster/apps/monitoring/thanos/secret.sops.yaml
deleted file mode 100644
index 33287efe..00000000
--- a/cluster/apps/monitoring/thanos/secret.sops.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# yamllint disable
-apiVersion: v1
-kind: Secret
-metadata:
-  name: thanos-objstore
-  namespace: monitoring
-stringData:
-  objstore.yml: ENC[AES256_GCM,data:eeCI5cch1lOSotG2rg1iw+RJ25eGUNzYJw/dZCcx4RfZUknNI+Z72tZWgTkt547R3bpT7IxeOZmQYJ6/WFS1gJynbkzgE6zFHpHbgMkvIWYA4iecaKq+idH5dRvJ1YV+Z4mPoEVJG1XFggKVcpdOowmWZQ022822GpRXwt2/OoltsgSA2REVb37Un2tM6PZ9ofnF9hTlo5v3EfEM5wI/3JUfQQ==,iv:AkWZPzYB/+bHprHxmRHKSuLTlsLDw3f7jJkFBJd3Yxo=,tag:IwC0eN2tOlY7j8OXcnpazg==,type:str]
-sops:
-  kms: []
-  gcp_kms: []
-  azure_kv: []
-  hc_vault: []
-  age:
-    - recipient: age1nfn3vxpsgm49ljgs8kxevga9makhh9aply6ddgf9wplsfuwpcv2qzmqatc
-      enc: |
-        -----BEGIN AGE ENCRYPTED FILE-----
-        YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSB6OGkxcnZscnJ1TitwQXgv
-        ZnkwcmdnVWJLbFFGSmRHV2pqbitYQW5IWWtFCjNNdVBwTEQ0VDk2OEtHKzkwSi9h
-        cVpOR3NONE9HQXB6VDlwMUp0WUYzRGMKLS0tIGRlY0UrOVhzMndJTWFyclg3ZHBV
-        bk1tZmFPUy9FUEtiMkVHcDBGaUZwdmsKdv0wD5JNfdBN45ba8bbjpVIEHop4AqKX
-        R+Vp9//6wTxsiafO0Bp0RUls1gHuRUYKhgAcH9PP8TIjZCwbUpHEpg==
-        -----END AGE ENCRYPTED FILE-----
-  lastmodified: "2021-08-11T23:27:44Z"
-  mac: ENC[AES256_GCM,data:2z0BgAwz408+gSDfuxGtt75mF++qOSgKb/RGdm0fqTORrFB+a/Yc/alXS0NVOl43WAkxY8HpUozQooa6VhdA88OcoNFUUtz8uhpeymBj1t/xXL4gE85Be2FXmhGFHKOaIulgPIiRScwxvaYG4C289QjIHZ8T3E9ykiYnrl1/bQk=,iv:MehdXoE4gTDoF+mG9SRatebR8LHxoy+g+709/I+LHII=,tag:q97WqxBWqRHLmAekD9IzWw==,type:str]
-  pgp: []
-  encrypted_regex: ^(data|stringData)$
-  version: 3.7.1
diff --git a/cluster/apps/monitoring/unifi-poller/helm-release.yaml b/cluster/apps/monitoring/unifi-poller/helm-release.yaml
deleted file mode 100644
index d0351404..00000000
--- a/cluster/apps/monitoring/unifi-poller/helm-release.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
-  name: unifi-poller
-  namespace: monitoring
-spec:
-  interval: 5m
-  chart:
-    spec:
-      # renovate: registryUrl=https://k8s-at-home.com/charts/
-      chart: unifi-poller
-      version: 10.1.0
-      sourceRef:
-        kind: HelmRepository
-        name: k8s-at-home-charts
-        namespace: flux-system
-      interval: 5m
-  values:
-    image:
-      repository: golift/unifi-poller
-      tag: 2.1.3
-    env:
-      TZ: "America/New_York"
-      UP_UNIFI_DEFAULT_ROLE: "homelab-controller"
-      UP_UNIFI_DEFAULT_URL: "https://192.168.1.2"
-      UP_UNIFI_DEFAULT_VERIFY_SSL: false
-      UP_UNIFI_DEFAULT_USER: "unifipoller"
-      UP_UNIFI_DEFAULT_PASS: "unifipoller"
-      UP_UNIFI_DEFAULT_SAVE_SITES: true
-      UP_UNIFI_DEFAULT_SAVE_DPI: false
-      UP_INFLUXDB_DISABLE: true
-      UP_PROMETHEUS_DISABLE: false
-      UP_PROMETHEUS_NAMESPACE: "unifipoller"
-    prometheus:
-      serviceMonitor:
-        enabled: true
diff --git a/cluster/apps/monitoring/uptimerobot-operator/helm-release.yaml b/cluster/apps/monitoring/uptimerobot-operator/helm-release.yaml
deleted file mode 100644
index 95c3ed49..00000000
--- a/cluster/apps/monitoring/uptimerobot-operator/helm-release.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
----
-apiVersion: helm.toolkit.fluxcd.io/v2beta1
-kind: HelmRelease
-metadata:
-  name: uptimerobot-operator
-  namespace: monitoring
-spec:
-  interval: 5m
-  chart:
-    spec:
-      # renovate: registryUrl=https://brennerm.github.io/uptimerobot-operator/helm
-      chart: uptimerobot-operator
-      version: 0.3.0
-      sourceRef:
-        kind: HelmRepository
-        name: uptimerobot-operator-charts
-        namespace: flux-system
-      interval: 5m
-  values:
-    uptimeRobotApiKey: "${SECRET_UPTIMEROBOT_API_KEY}"
-    disableIngressHandling: true
-    resources:
-      requests:
-        cpu: 100m
-        memory: 100Mi
-      limits:
-        memory: 250Mi
diff --git a/cluster/apps/monitoring/vector/agent/helm-release.yaml b/cluster/apps/monitoring/vector/agent/helm-release.yaml
new file mode 100644
index 00000000..1aa966b3
--- /dev/null
+++ b/cluster/apps/monitoring/vector/agent/helm-release.yaml
@@ -0,0 +1,83 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2beta1
+kind: HelmRelease
+metadata:
+  name: vector-agent
+  namespace: monitoring
+spec:
+  interval: 15m
+  chart:
+    spec:
+      chart: vector
+      version: 0.13.2
+      sourceRef:
+        kind: HelmRepository
+        name: vector-charts
+        namespace: flux-system
+      interval: 15m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
+  dependsOn:
+    - name: vector-aggregator
+      namespace: monitoring
+  values:
+    image:
+      repository: timberio/vector
+      tag: 0.22.3-debian
+    role: Agent
+    customConfig:
+      data_dir: /vector-data-dir
+      api:
+        enabled: false
+      # Sources
+      sources:
+        journal_logs:
+          type: journald
+          journal_directory: /var/log/journal
+        kubernetes_logs:
+          type: kubernetes_logs
+          pod_annotation_fields:
+            container_image: "container_image"
+            container_name: "container_name"
+            pod_annotations: "pod_annotations"
+            pod_labels: "pod_labels"
+            pod_name: "pod_name"
+        vector_metrics:
+          type: internal_metrics
+      # Sinks
+      sinks:
+        loki_journal_sink:
+          type: vector
+          inputs:
+            - journal_logs
+          address: vector-aggregator:6000
+          version: "2"
+        loki_kubernetes_sink:
+          type: vector
+          inputs:
+            - kubernetes_logs
+          address: vector-aggregator:6010
+          version: "2"
+        prometheus_sink:
+          type: prometheus_exporter
+          inputs:
+            - vector_metrics
+          address: 0.0.0.0:9090
+    service:
+      enabled: false
+    podMonitor:
+      enabled: true
+      jobLabel: vector-agent
+      port: prometheus-sink
+    securityContext:
+      privileged: true
+    tolerations:
+      - effect: NoSchedule
+        operator: Exists
+      - effect: NoExecute
+        operator: Exists
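[Note] The agent DaemonSet does no parsing of its own; it forwards raw events to the aggregator over Vector's native `vector` source/sink protocol, keeping per-node overhead small. The flow assumed by the two releases (ports match the aggregator's sources below):

    # journald  -> journal_logs    -> vector sink :6000 --\
    #                                                      +--> aggregator --> loki-gateway:80
    # pod logs  -> kubernetes_logs -> vector sink :6010 --/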
diff --git a/cluster/apps/monitoring/unifi-poller/kustomization.yaml b/cluster/apps/monitoring/vector/agent/kustomization.yaml
similarity index 75%
rename from cluster/apps/monitoring/unifi-poller/kustomization.yaml
rename to cluster/apps/monitoring/vector/agent/kustomization.yaml
index 761d2252..bfe696aa 100644
--- a/cluster/apps/monitoring/unifi-poller/kustomization.yaml
+++ b/cluster/apps/monitoring/vector/agent/kustomization.yaml
@@ -1,4 +1,5 @@
+---
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- helm-release.yaml
+  - helm-release.yaml
\ No newline at end of file
diff --git a/cluster/apps/monitoring/vector/aggregator/helm-release.yaml b/cluster/apps/monitoring/vector/aggregator/helm-release.yaml
new file mode 100644
index 00000000..3301fa49
--- /dev/null
+++ b/cluster/apps/monitoring/vector/aggregator/helm-release.yaml
@@ -0,0 +1,148 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2beta1
+kind: HelmRelease
+metadata:
+  name: vector-aggregator
+  namespace: monitoring
+spec:
+  interval: 15m
+  chart:
+    spec:
+      chart: vector
+      version: 0.13.2
+      sourceRef:
+        kind: HelmRepository
+        name: vector-charts
+        namespace: flux-system
+      interval: 15m
+  install:
+    createNamespace: true
+    remediation:
+      retries: 5
+  upgrade:
+    remediation:
+      retries: 5
+  dependsOn:
+    - name: loki
+      namespace: monitoring
+  values:
+    customConfig:
+      data_dir: /vector-data-dir
+      api:
+        enabled: false
+      #
+      # Sources
+      #
+      sources:
+        journal_logs:
+          type: vector
+          address: 0.0.0.0:6000
+          version: "2"
+        kubernetes_logs:
+          type: vector
+          address: 0.0.0.0:6010
+          version: "2"
+        vector_metrics:
+          type: internal_metrics
+      #
+      # Transformations
+      #
+      transforms:
+        kubernetes_logs_remap:
+          type: remap
+          inputs:
+            - kubernetes_logs
+          source: |
+            # Standardize 'app' index
+            .custom_app_name = .pod_labels."app.kubernetes.io/name" || .pod_labels.app || .pod_labels."k8s-app" || "unknown"
+      #
+      # Sinks
+      #
+      sinks:
+        loki_journal:
+          type: loki
+          inputs:
+            - journal_logs
+          endpoint: http://loki-gateway:80
+          encoding:
+            codec: json
+          batch:
+            max_bytes: 2049000
+          out_of_order_action: accept
+          remove_label_fields: true
+          remove_timestamp: true
+          labels:
+            hostname: >-
+              {{`{{ host }}`}}
+        loki_kubernetes:
+          type: loki
+          inputs:
+            - kubernetes_logs_remap
+          endpoint: http://loki-gateway:80
+          encoding:
+            codec: json
+          batch:
+            max_bytes: 2049000
+          out_of_order_action: accept
+          remove_label_fields: true
+          remove_timestamp: true
+          labels:
+            app: >-
+              {{`{{ custom_app_name }}`}}
+            namespace: >-
+              {{`{{ kubernetes.pod_namespace }}`}}
+            node: >-
+              {{`{{ kubernetes.pod_node_name }}`}}
+        prometheus_sink:
+          type: prometheus_exporter
+          inputs:
+            - vector_metrics
+          address: 0.0.0.0:9090
+        # console_debug:
+        #   type: console
+        #   inputs:
+        #     - opnsense_filterlog_logs
+        #   target: stdout
+        #   encoding:
+        #     codec: json
+    image:
+      repository: timberio/vector
+      tag: 0.22.3-debian
+    role: Stateless-Aggregator
+    replicas: 3
+    # env:
+    #   - name: VECTOR_LOG
+    #     value: debug
+    service:
+      enabled: true
+      type: LoadBalancer
+    podMonitor:
+      enabled: true
+      jobLabel: vector-aggregator
+      port: prometheus-sink
+    affinity:
+      podAntiAffinity:
+        preferredDuringSchedulingIgnoredDuringExecution:
+          - weight: 100
+            podAffinityTerm:
+              labelSelector:
+                matchExpressions:
+                  - key: app.kubernetes.io/component
+                    operator: In
+                    values:
+                      - Stateless-Aggregator
+              topologyKey: kubernetes.io/hostname
+  postRenderers:
+    - kustomize:
+        patchesJson6902:
+          - target:
+              kind: Service
+              name: vector-aggregator
+            patch:
+              - op: add
+                path: /spec/loadBalancerIP
+                value: "${SVC_SYSLOG_ADDR}"
+              - op: replace
+                path: /spec/externalTrafficPolicy
+                value: Local
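[Note] The odd-looking `{{`{{ host }}`}}` syntax in the loki sink labels is deliberate: the Vector chart appears to run `customConfig` through Helm's `tpl`, so the backtick form escapes the braces and Vector receives the literal template `{{ host }}`, which it then resolves per event. Illustrative renderings (values are hypothetical):

    # app:       {{ custom_app_name }}           -> "sonarr"
    # namespace: {{ kubernetes.pod_namespace }}  -> "media"
    # node:      {{ kubernetes.pod_node_name }}  -> "worker-1"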
diff --git a/cluster/apps/monitoring/botkube/kustomization.yaml b/cluster/apps/monitoring/vector/aggregator/kustomization.yaml
similarity index 75%
rename from cluster/apps/monitoring/botkube/kustomization.yaml
rename to cluster/apps/monitoring/vector/aggregator/kustomization.yaml
index 761d2252..bfe696aa 100644
--- a/cluster/apps/monitoring/botkube/kustomization.yaml
+++ b/cluster/apps/monitoring/vector/aggregator/kustomization.yaml
@@ -1,4 +1,5 @@
+---
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- helm-release.yaml
+  - helm-release.yaml
\ No newline at end of file
diff --git a/cluster/apps/monitoring/uptimerobot-operator/kustomization.yaml b/cluster/apps/monitoring/vector/kustomization.yaml
similarity index 72%
rename from cluster/apps/monitoring/uptimerobot-operator/kustomization.yaml
rename to cluster/apps/monitoring/vector/kustomization.yaml
index 761d2252..d012532e 100644
--- a/cluster/apps/monitoring/uptimerobot-operator/kustomization.yaml
+++ b/cluster/apps/monitoring/vector/kustomization.yaml
@@ -1,4 +1,6 @@
+---
 apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization
 resources:
-- helm-release.yaml
+  - agent
+  - aggregator
\ No newline at end of file
diff --git a/cluster/config/cluster-settings.yaml b/cluster/config/cluster-settings.yaml
index 0e04c687..08a9ea73 100644
--- a/cluster/config/cluster-settings.yaml
+++ b/cluster/config/cluster-settings.yaml
@@ -17,7 +17,7 @@ data:
   LB_COREDNS_IP: 10.75.45.100
   LB_TRAEFIK_IP: 10.75.45.101
   SVC_NGINX_ADDR: 10.75.45.101
-  LB_LOKI_IP: 10.75.45.102
+  SVC_SYSLOG_ADDR: 10.75.45.102
   SVC_PLEX_ADDR: 10.75.45.103
   SVC_JELLYFIN_ADDR: 10.75.45.104
   LB_BLOCKY_IP: 10.75.45.105
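[Note] `${SVC_SYSLOG_ADDR}`, `${SECRET_PUBLIC_DOMAIN}`, `${CLUSTER_NAME}`, and friends are not Helm values; they are filled in by Flux's kustomize-controller post-build variable substitution before apply. A minimal sketch of the Kustomization wiring this repo presumably uses (resource names are illustrative):

    ---
    apiVersion: kustomize.toolkit.fluxcd.io/v1beta2
    kind: Kustomization
    metadata:
      name: apps
      namespace: flux-system
    spec:
      # ...
      postBuild:
        substituteFrom:
          - kind: ConfigMap
            name: cluster-settings   # provides SVC_SYSLOG_ADDR and other addresses
          - kind: Secret
            name: cluster-secrets    # provides SECRET_PUBLIC_DOMAIN, etc.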