Mirror of https://github.com/outbackdingo/home-ops.git

refactor: monitoring updates
@@ -7,6 +7,7 @@ resources:
  - flux-system
  - kube-system
  - media
  - monitoring
  - networking
  - security
  # - home
@@ -1,229 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: botkube
  namespace: monitoring
spec:
  interval: 5m
  chart:
    spec:
      # renovate: registryUrl=https://infracloudio.github.io/charts
      chart: botkube
      version: v0.12.3
      sourceRef:
        kind: HelmRepository
        name: infracloudio-charts
        namespace: flux-system
      interval: 5m
  values:
    communications:
      discord:
        enabled: true
        token: "${SECRET_BOTKUBE_DISCORD_TOKEN}"
        botid: "842864233690431508"
        channel: "842863758522974228"
    serviceMonitor:
      enabled: true
    config:
      recommendations: false
      settings:
        clustername: home-cluster
      resources:
        - name: v1/services
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: apps/v1/deployments
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - update
            - delete
            - error
        - name: apps/v1/statefulsets
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - update
            - delete
            - error
        - name: networking.k8s.io/v1beta1/ingresses
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: v1/nodes
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: v1/namespaces
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: v1/persistentvolumes
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: v1/persistentvolumeclaims
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: v1/configmaps
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: apps/v1/daemonsets
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: batch/v1/jobs
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - update
            - error
        - name: rbac.authorization.k8s.io/v1/rolebindings
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: rbac.authorization.k8s.io/v1/clusterrolebindings
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
        - name: rbac.authorization.k8s.io/v1/clusterroles
          namespaces:
            include:
              - default
              - dev
              - home
              - kube-system
              - media
              - monitoring
              - networking
          events:
            - create
            - delete
            - error
@@ -5,38 +5,37 @@ metadata:
  name: grafana
  namespace: monitoring
spec:
  interval: 5m
  interval: 15m
  chart:
    spec:
      # renovate: registryUrl=https://grafana.github.io/helm-charts
      chart: grafana
      version: 6.17.10
      version: 6.32.0
      sourceRef:
        kind: HelmRepository
        name: grafana-charts
        namespace: flux-system
      interval: 5m
      interval: 15m
  install:
    createNamespace: true
    remediation:
      retries: 5
  upgrade:
    remediation:
      retries: 5
  values:
    image:
      repository: ghcr.io/k8s-at-home/grafana
    replicas: 1
    replicas: 3
    env:
      GF_EXPLORE_ENABLED: true
      GF_PANELS_DISABLE_SANITIZE_HTML: true
      GF_LOG_FILTERS: rendering:debug
      GF_DATE_FORMATS_USE_BROWSER_LOCALE: true
      GF_DATE_FORMATS_FULL_DATE: "MMM Do, YYYY hh:mm:ss a"
      GF_DATE_FORMATS_INTERVAL_SECOND: "hh:mm:ss a"
      GF_DATE_FORMATS_INTERVAL_MINUTE: "hh:mm a"
      GF_DATE_FORMATS_INTERVAL_HOUR: "DD/MM hh:mm a"
      GF_DATE_FORMATS_INTERVAL_DAY: "DD/MM"
      GF_DATE_FORMATS_INTERVAL_MONTH: "MM-YYYY"
      GF_DATE_FORMATS_INTERVAL_YEAR: "YYYY"
      VAR_BLOCKY_URL: "http://blocky.networking.svc.cluster.local:4000"
    adminPassword: "${SECRET_GRAFANA_PASSWORD}"
      GF_SECURITY_ALLOW_EMBEDDING: true
      GF_SECURITY_COOKIE_SAMESITE: grafana
    admin:
      existingSecret: grafana-admin-creds
    grafana.ini:
      server:
        root_url: "https://grafana.${SECRET_DOMAIN}"
        root_url: "https://grafana.${SECRET_PUBLIC_DOMAIN}"
      paths:
        data: /var/lib/grafana/data
        logs: /var/log/grafana

@@ -81,6 +80,9 @@ spec:
            orgId: 1
      datasources.yaml:
        apiVersion: 1
        deleteDatasources:
          - name: Loki
            orgId: 1
        datasources:
          - name: Prometheus
            type: prometheus

@@ -90,29 +92,9 @@ spec:
          - name: Loki
            type: loki
            access: proxy
            url: http://loki:3100
            url: http://loki-gateway:80
    dashboards:
      default:
        # Ref: https://grafana.com/grafana/dashboards/12175
        calico-felix:
          gnetId: 12175
          revision: 5
          datasource: Prometheus
        # Ref: https://grafana.com/grafana/dashboards/2842
        ceph-cluster:
          gnetId: 2842
          revision: 14
          datasource: Prometheus
        # Ref: https://grafana.com/grafana/dashboards/5336
        ceph-osd:
          gnetId: 5336
          revision: 5
          datasource: Prometheus
        # Ref: https://grafana.com/grafana/dashboards/5342
        ceph-pools:
          gnetId: 5342
          revision: 5
          datasource: Prometheus
        # Ref: https://grafana.com/grafana/dashboards/7845
        zfs:
          gnetId: 7845

@@ -144,18 +126,9 @@ spec:
        flux-control-plane:
          url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/grafana/dashboards/control-plane.json
          datasource: Prometheus
        blocky:
          url: https://raw.githubusercontent.com/0xERR0R/blocky/master/docs/blocky-grafana.json
          datasource: Prometheus
        apc-smart-ups-1500:
          url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/apc-smart-ups-1500-snmp.json
          datasource: Prometheus
        lidarr:
          url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/lidarr.json
        radarr:
          url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/radarr.json
        sonarr:
          url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/sonarr.json
    sidecar:
      dashboards:
        enabled: true

@@ -174,16 +147,31 @@ spec:
        enabled: true
    ingress:
      enabled: true
      ingressClassName: "traefik"
      ingressClassName: "nginx"
      annotations:
        traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
        nginx.ingress.kubernetes.io/affinity: "cookie"
        nginx.ingress.kubernetes.io/session-cookie-name: "grafana"
        nginx.ingress.kubernetes.io/session-cookie-expires: "172800"
        nginx.ingress.kubernetes.io/session-cookie-max-age: "172800"
      hosts:
        - "grafana.${SECRET_DOMAIN}"
        - &host "grafana.${SECRET_PUBLIC_DOMAIN}"
      tls:
        - hosts:
            - "grafana.${SECRET_DOMAIN}"
            - *host
    serviceAccount:
      create: true
      autoMount: true
    persistence:
      enabled: false
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                  - key: app.kubernetes.io/name
                    operator: In
                    values:
                      - grafana
              topologyKey: kubernetes.io/hostname
@@ -1,4 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - secret.sops.yaml
  - helm-release.yaml
cluster/apps/monitoring/grafana/secrete.sops.yaml (new file, 29 lines)

@@ -0,0 +1,29 @@
# yamllint disable
apiVersion: v1
kind: Secret
metadata:
  name: grafana-admin-creds
  namespace: monitoring
stringData:
  admin-user: ENC[AES256_GCM,data:YEpVd78=,iv:Ed/WWBj61OyoqGgcx3mCCK/fJj020oj4h+sHyt6eeR0=,tag:DbTj3rfV/nxnaFOfopoAmA==,type:str]
  admin-password: ENC[AES256_GCM,data:zYt9S6FPcCTtoukltlkQnw==,iv:1jSk25EJ6tDOnRXJOL2weht10S1zJ3/SVMUN7HfW61A=,tag:R2TW5PB4iUuvl4XpUglrIQ==,type:str]
sops:
  kms: []
  gcp_kms: []
  azure_kv: []
  hc_vault: []
  age:
    - recipient: age1nfn3vxpsgm49ljgs8kxevga9makhh9aply6ddgf9wplsfuwpcv2qzmqatc
      enc: |
        -----BEGIN AGE ENCRYPTED FILE-----
        YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBWZ0VxRm81RkttajQvZ2c3
        SktZTDZ6MUZFeUZXQVFGKy9KQmFvQjhCVG1NCnM4dUo0Ym0remNsOTN3SWZUMTMx
        UmE3TkhoS1c1UGN3QklXdCtWWFNqOTQKLS0tIGxXbHNVTUxtdXNMSWJnOWo0QjVr
        QXZMSUxFNUdaWHI2UkpOVjBxa1FHVGcK32kLlwinxQcTEbWRBhc5ju2e3ORnFxeo
        2iDkIZepjjxd9Rwkak3o/+YGtBJa0Elvw07bpvvmsO20GZtkVCKIIg==
        -----END AGE ENCRYPTED FILE-----
  lastmodified: "2022-07-04T15:09:39Z"
  mac: ENC[AES256_GCM,data:jQvHQowQatizYBWcVWd0GBmxVjWKri21h6lnhhql5nViTNpKns2J/r2h4bDkds4BLW41voQyCG9mwN/cnBrjOOBhIp55eEzgw+Jq2x5I980FvDBdRq+d5PO+5A/DCjphB1HSRyLd4mkpGS9qX6c1ieOzW2hUm797OZVf2Ov6Vvs=,iv:CYiiR7UKftaRdCiH4e+1xgqleQzgzdFZpT5tLzmZpQ4=,tag:JJR5EDNOQxX2NHGg25n4lg==,type:str]
  pgp: []
  encrypted_regex: ^(data|stringData)$
  version: 3.7.3
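The admin-user and admin-password fields above are SOPS-encrypted to the age recipient shown, with only data/stringData encrypted (per encrypted_regex). For context, a creation-rules file along the following lines would produce exactly that shape; the repository's actual .sops.yaml is not part of this diff, so the path_regex below is an assumption:

# Hypothetical SOPS creation rule (illustrative only; not part of this commit)
creation_rules:
  - path_regex: cluster/.*\.sops\.ya?ml
    encrypted_regex: "^(data|stringData)$"
    age: age1nfn3vxpsgm49ljgs8kxevga9makhh9aply6ddgf9wplsfuwpcv2qzmqatc

The Grafana chart then consumes these credentials through admin.existingSecret: grafana-admin-creds, as set in the HelmRelease above.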
@@ -5,18 +5,23 @@ metadata:
  name: kube-prometheus-stack
  namespace: monitoring
spec:
  interval: 5m
  interval: 15m
  chart:
    spec:
      # renovate: registryUrl=https://prometheus-community.github.io/helm-charts
      chart: kube-prometheus-stack
      version: 18.1.1
      version: 36.2.1
      sourceRef:
        kind: HelmRepository
        name: prometheus-community-charts
        namespace: flux-system
      interval: 5m
  timeout: 20m
      interval: 15m
  install:
    createNamespace: true
    remediation:
      retries: 5
  upgrade:
    remediation:
      retries: 5
  values:
    alertmanager:
      config:

@@ -66,22 +71,24 @@ spec:
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: "traefik"
        ingressClassName: "nginx"
        annotations:
          traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
        hosts:
          - "alert-manager.${SECRET_DOMAIN}"
          - &host "alert-manager.${SECRET_PUBLIC_DOMAIN}"
        tls:
          - hosts:
              - "alert-manager.${SECRET_DOMAIN}"
              - *host
      alertmanagerSpec:
        replicas: 3
        podAntiAffinity: hard
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: "ceph-block"
              resources:
                requests:
                  storage: 10Gi
                  storage: 250Mi
    nodeExporter:
      serviceMonitor:
        relabelings:

@@ -91,6 +98,19 @@ spec:
            sourceLabels:
              - __meta_kubernetes_pod_node_name
            targetLabel: kubernetes_node
    kube-state-metrics:
      metricLabelsAllowlist:
        - "persistentvolumeclaims=[*]"
      prometheus:
        monitor:
          enabled: true
          relabelings:
            - action: replace
              regex: (.*)
              replacement: $1
              sourceLabels:
                - __meta_kubernetes_pod_node_name
              targetLabel: kubernetes_node
    grafana:
      enabled: false
      forceDeployDashboards: true

@@ -110,59 +130,55 @@ spec:
    kubeApiServer:
      enabled: true
    kubeControllerManager:
      enabled: true
      endpoints:
        - 10.75.40.10
        - 10.75.40.11
        - 10.75.40.12
      enabled: false
    kubeScheduler:
      enabled: true
      endpoints:
        - 10.75.40.10
        - 10.75.40.11
        - 10.75.40.12
      enabled: false
    kubeProxy:
      enabled: true
      endpoints:
        - 10.75.40.10
        - 10.75.40.11
        - 10.75.40.12
      enabled: false
    kubeEtcd:
      enabled: true
      endpoints:
        - 10.75.40.10
        - 10.75.40.11
        - 10.75.40.12
      service:
        enabled: true
        port: 2381
        targetPort: 2381
      enabled: false
    prometheus:
      ingress:
        enabled: true
        pathType: Prefix
        ingressClassName: "traefik"
        annotations:
          traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
        ingressClassName: "nginx"
        hosts:
          - "prometheus.${SECRET_DOMAIN}"
          - &host "prometheus.${SECRET_DOMAIN}"
        tls:
          - hosts:
              - "prometheus.${SECRET_DOMAIN}"
              - *host
      thanosService:
        enabled: true
      thanosServiceMonitor:
        enabled: true
      thanosIngress:
        enabled: true
        pathType: Prefix
        ingressClassName: "nginx"
        annotations:
          nginx.ingress.kubernetes.io/ssl-redirect: "true"
          nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
        hosts:
          - &host "thanos-sidecar.${SECRET_PUBLIC_DOMAIN}"
        tls:
          - hosts:
              - *host
      resources:
        requests:
          cpu: 1000m
          memory: 1000Mi
      thanosService:
        enabled: true
      prometheusSpec:
        replicas: 1
        replicaExternalLabelName: "replica"
        replicas: 3
        replicaExternalLabelName: __replica__
        externalLabels:
          cluster: ${CLUSTER_NAME}
        podAntiAffinity: hard
        retentionSize: "6GB"
        ruleSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        retention: 6h
        retention: 2d
        enableAdminAPI: true
        walCompression: true
        storageSpec:

@@ -173,13 +189,16 @@ spec:
              requests:
                storage: 10Gi
        thanos:
          image: quay.io/thanos/thanos:v0.23.1
          version: v0.22.0
          image: quay.io/thanos/thanos:v0.26.0
          # renovate: datasource=docker depName=quay.io/thanos/thanos
          version: "v0.26.0"
          objectStorageConfig:
            name: thanos-objstore
            name: thanos-objstore-secret
            key: objstore.yml
        additionalScrapeConfigs:
          - job_name: 'node-exporter'
            scrape_interval: 1m
            scrape_timeout: 10s
            honor_timestamps: true
            static_configs:
              - targets:

@@ -188,12 +207,16 @@ spec:
                  # franxx nas
                  - 10.75.30.15:9100
          - job_name: 'minio-job'
            scrape_interval: 1m
            scrape_timeout: 10s
            honor_timestamps: true
            metrics_path: /minio/v2/metrics/cluster
            static_configs:
              - targets:
                  - 10.75.30.15:9000
          - job_name: "coredns"
            scrape_interval: 1m
            scrape_timeout: 10s
            honor_timestamps: true
            static_configs:
              - targets:
@@ -1,4 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - helm-release.yaml
@@ -2,14 +2,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - botkube
  - namespace.yaml
  - grafana
  - kube-prometheus-stack
  - loki
  - promtail
  # - snmp-exporter-switch-0
  - snmp-exporter-ups-0
  - speedtest-exporter
  #- snmp-exporter-ups-0
  #- speedtest-exporter
  - thanos
  - vector
  #- unifi-poller
  - uptimerobot-heartbeat
  #- uptimerobot-heartbeat
cluster/apps/monitoring/loki/config-map.yaml (new file, 47 lines)

@@ -0,0 +1,47 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-alerting-rules
  namespace: monitoring
data:
  loki-alerting-rules.yaml: |-
    groups:
      #
      # SMART Failures
      #
      - name: smart-failure
        rules:
          - alert: SmartFailures
            expr: |
              sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "SMART has reported failures on host {{ $labels.hostname }}"
      #
      # *arr
      #
      - name: arr
        rules:
          - alert: ArrDatabaseIsLocked
            expr: |
              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "{{ $labels.app }} is experiencing locked database issues"
          - alert: ArrDatabaseIsMalformed
            expr: |
              sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0
            for: 2m
            labels:
              severity: critical
              category: logs
            annotations:
              summary: "{{ $labels.app }} is experiencing malformed database disk image issues"
@@ -5,33 +5,148 @@ metadata:
  name: loki
  namespace: monitoring
spec:
  interval: 5m
  interval: 15m
  chart:
    spec:
      # renovate: registryUrl=https://grafana.github.io/helm-charts
      chart: loki
      version: 2.8.1
      chart: loki-simple-scalable
      version: 1.4.3
      sourceRef:
        kind: HelmRepository
        name: grafana-charts
        namespace: flux-system
      interval: 5m
      interval: 15m
  install:
    createNamespace: true
    remediation:
      retries: 5
  upgrade:
    remediation:
      retries: 5
  values:
    image:
      repository: ghcr.io/k8s-at-home/loki
    config:
      storage_config:
        aws:
          bucketnames: loki
          endpoint: nas01:9000
          access_key_id: "${SECRET_MINIO_ACCESS_KEY}"
          secret_access_key: "${SECRET_MINIO_SECRET_KEY}"
          s3forcepathstyle: true
          insecure: true
        boltdb_shipper:
          active_index_directory: /data/loki/index
          cache_location: /data/loki/index_cache
          resync_interval: 5s
    loki:
      structuredConfig:
        auth_enabled: false
        server:
          log_level: info
          http_listen_port: 3100
          grpc_listen_port: 9095
        memberlist:
          join_members: ["loki-memberlist"]
        limits_config:
          retention_period: 14d
          enforce_metric_name: false
          reject_old_samples: true
          reject_old_samples_max_age: 168h
          max_cache_freshness_per_query: 10m
          split_queries_by_interval: 15m
          ingestion_rate_mb: 8
          ingestion_burst_size_mb: 16
        schema_config:
          configs:
            - from: "2021-08-01"
              store: boltdb-shipper
              object_store: s3
              schema: v11
              index:
                prefix: loki_index_
                period: 24h
        common:
          path_prefix: /var/loki
          replication_factor: 3
          storage:
            s3:
              s3: null
              insecure: true
              s3forcepathstyle: true
          ring:
            kvstore:
              store: memberlist
        ruler:
          enable_api: true
          enable_alertmanager_v2: true
          alertmanager_url: http://kube-prometheus-stack-alertmanager:9093
          storage:
            type: local
            local:
              directory: /rules
          rule_path: /tmp/scratch
          ring:
            kvstore:
              store: memberlist
        distributor:
          ring:
            kvstore:
              store: memberlist
        compactor:
          working_directory: /var/loki/boltdb-shipper-compactor
          shared_store: s3
          compaction_interval: 10m
          retention_enabled: true
          retention_delete_delay: 2h
          retention_delete_worker_count: 150
        ingester:
          max_chunk_age: 1h
          lifecycler:
            ring:
              kvstore:
                store: memberlist
        analytics:
          reporting_enabled: false
    gateway:
      enabled: true
      replicas: 3
      ingress:
        enabled: true
        ingressClassName: "nginx"
        hosts:
          - host: &host "loki.${SECRET_PUBLIC_DOMAIN}"
            paths:
              - path: /
                pathType: Prefix
        tls:
          - hosts:
              - *host
    write:
      replicas: 3
      persistence:
        size: 10Gi
        storageClass: ceph-block
    read:
      replicas: 3
      extraVolumeMounts:
        - name: loki-rules
          mountPath: /rules/fake
        - name: loki-rules-tmp
          mountPath: /tmp/scratch
        - name: loki-tmp
          mountPath: /tmp/loki-tmp
      extraVolumes:
        - name: loki-rules
          configMap:
            name: loki-alerting-rules
        - name: loki-rules-tmp
          emptyDir: {}
        - name: loki-tmp
          emptyDir: {}
      persistence:
        size: 10Gi
        storageClass: ceph-block
    serviceMonitor:
      enabled: true
  valuesFrom:
    - targetPath: loki.structuredConfig.common.storage.s3.bucketnames
      kind: ConfigMap
      name: loki-chunks-bucket-v1
      valuesKey: BUCKET_NAME
    - targetPath: loki.structuredConfig.common.storage.s3.endpoint
      kind: ConfigMap
      name: loki-chunks-bucket-v1
      valuesKey: BUCKET_HOST
    - targetPath: loki.structuredConfig.common.storage.s3.access_key_id
      kind: Secret
      name: loki-chunks-bucket-v1
      valuesKey: AWS_ACCESS_KEY_ID
    - targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
      kind: Secret
      name: loki-chunks-bucket-v1
      valuesKey: AWS_SECRET_ACCESS_KEY
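The valuesFrom entries above pull the S3 bucket coordinates from a ConfigMap and Secret named loki-chunks-bucket-v1, which the Ceph object-bucket provisioner generates for the ObjectBucketClaim added later in this commit. A sketch of what those generated objects typically look like under the lib-bucket-provisioner convention (values are illustrative; only the bucket name is taken from the claim below):

# Illustrative only: created by the bucket provisioner at runtime, not committed to the repository
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-chunks-bucket-v1
  namespace: monitoring
data:
  BUCKET_NAME: loki-chunks-v1
  BUCKET_HOST: rook-ceph-rgw-ceph-objectstore.rook-ceph.svc  # assumed RGW service host
  BUCKET_PORT: "80"
---
apiVersion: v1
kind: Secret
metadata:
  name: loki-chunks-bucket-v1
  namespace: monitoring
data:
  AWS_ACCESS_KEY_ID: <base64-encoded key>
  AWS_SECRET_ACCESS_KEY: <base64-encoded key>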
@@ -1,5 +1,7 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - prometheus-rule.yaml
  - object-bucket-claim.yaml
  - config-map.yaml
  - helm-release.yaml
cluster/apps/monitoring/loki/object-bucket-claim.yaml (new file, 9 lines)

@@ -0,0 +1,9 @@
---
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
  name: loki-chunks-bucket-v1
  namespace: monitoring
spec:
  bucketName: loki-chunks-v1
  storageClassName: ceph-bucket
@@ -1,109 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: loki.rules
  namespace: monitoring
spec:
  groups:
    - name: loki.rules
      rules:
        - alert: LokiRequestErrors
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
          expr: |
            100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
              > 10
          for: 15m
          labels:
            severity: critical
        - alert: LokiRequestPanics
          annotations:
            message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
          expr: |
            sum(increase(loki_panic_total[10m])) by (namespace, job)
              > 0
          labels:
            severity: critical
        - alert: LokiRequestLatency
          annotations:
            message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
          expr: |
            namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
              > 1
          for: 15m
          labels:
            severity: critical
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job))
          record: job:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job)
          record: job:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
          record: job:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job)
          record: job:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route))
          record: job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, job, route)
          record: job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
          record: job_route:loki_request_duration_seconds_count:sum_rate
        - expr: |
            histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:99quantile
        - expr: |
            histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route))
          record: namespace_job_route:loki_request_duration_seconds:50quantile
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
              /
            sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds:avg
        - expr: |
            sum(rate(loki_request_duration_seconds_bucket[1m]))
              by (le, namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_sum[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
        - expr: |
            sum(rate(loki_request_duration_seconds_count[1m]))
              by (namespace, job, route)
          record: namespace_job_route:loki_request_duration_seconds_count:sum_rate
cluster/apps/monitoring/namespace.yaml (new file, 8 lines)

@@ -0,0 +1,8 @@
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
  labels:
    kustomize.toolkit.fluxcd.io/prune: disabled
    goldilocks.fairwinds.com/enabled: "true"
cluster/apps/monitoring/thanos/dashboard/kustomization.yaml (new file, 38 lines)

@@ -0,0 +1,38 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
configMapGenerator:
  - name: thanos-bucket-replicate-dashboard
    files:
      - thanos-bucket-replicate-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/bucket-replicate.json
  - name: thanos-compact-dashboard
    files:
      - thanos-compact-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/compact.json
  - name: thanos-overview-dashboard
    files:
      - thanos-overview-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/overview.json
  - name: thanos-query-dashboard
    files:
      - thanos-query-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/query.json
  - name: thanos-query-frontend-dashboard
    files:
      - thanos-query-frontend-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/query-frontend.json
  - name: thanos-receive-dashboard
    files:
      - thanos-receive-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/receive.json
  - name: thanos-rule-dashboard
    files:
      - thanos-rule-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/rule.json
  - name: thanos-sidecar-dashboard
    files:
      - thanos-sidecar-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/sidecar.json
  - name: thanos-store-dashboard
    files:
      - thanos-store-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/store.json
generatorOptions:
  disableNameSuffixHash: true
  annotations:
    kustomize.toolkit.fluxcd.io/substitute: disabled
  labels:
    grafana_dashboard: "true"
@@ -5,49 +5,148 @@ metadata:
  name: thanos
  namespace: monitoring
spec:
  interval: 5m
  interval: 15m
  chart:
    spec:
      # renovate: registryUrl=https://charts.bitnami.com/bitnami
      chart: thanos
      version: 8.1.2
      version: 10.5.4
      sourceRef:
        kind: HelmRepository
        name: bitnami-charts
        namespace: flux-system
      interval: 5m
      interval: 15m
  install:
    createNamespace: true
    remediation:
      retries: 5
  upgrade:
    remediation:
      retries: 5
  dependsOn:
    - name: kube-prometheus-stack
      namespace: monitoring
  values:
    objstoreConfig:
      type: s3
      config:
        insecure: true
    query:
      enabled: true
      replicaCount: 3
      replicaLabels:
        - replica
        - __replica__
      dnsDiscovery:
        sidecarsService: kube-prometheus-stack-thanos-discovery
        sidecarsNamespace: monitoring
      ingress:
        enabled: true
        hostname: "thanos.${SECRET_DOMAIN}"
        ingressClassName: "traefik"
        annotations:
          traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
        hostname: &host "thanos.${SECRET_PUBLIC_DOMAIN}"
        ingressClassName: "nginx"
        tls: true
    existingObjstoreSecret: thanos-objstore
        extraTls:
          - hosts:
              - *host
    queryFrontend:
      enabled: false
    bucketweb:
      enabled: true
      replicaCount: 3
      podAntiAffinityPreset: hard
      ingress:
        enabled: true
        hostname: &host "thanos-bucketweb.${SECRET_PUBLIC_DOMAIN}"
        ingressClassName: "nginx"
        tls: true
        extraTls:
          - hosts:
              - *host
    compactor:
      enabled: true
      persistence:
        enabled: false
    storegateway:
      enabled: true
      extraFlags:
        - "--compact.concurrency"
        - "4"
      retentionResolutionRaw: 14d
      retentionResolution5m: 14d
      retentionResolution1h: 14d
      ingress:
        enabled: true
        hostname: &host "thanos-compactor.${SECRET_PUBLIC_DOMAIN}"
        ingressClassName: "nginx"
        tls: true
        extraTls:
          - hosts:
              - *host
      persistence:
        enabled: true
        size: 5Gi
        storageClass: "ceph-block"
        size: 50Gi
    storegateway:
      enabled: true
      replicaCount: 3
      podAntiAffinityPreset: hard
      ingress:
        enabled: true
        hostname: &host "thanos-storegateway.${SECRET_PUBLIC_DOMAIN}"
        ingressClassName: "nginx"
        tls: true
        extraTls:
          - hosts:
              - *host
      persistence:
        enabled: true
        storageClass: "ceph-block"
        size: 20Gi
    ruler:
      enabled: false
      enabled: true
      replicaCount: 3
      podAntiAffinityPreset: hard
      dnsDiscovery:
        enabled: true
      alertmanagers:
        - http://kube-prometheus-stack-alertmanager.monitoring:9093
      clusterName: ${CLUSTER_NAME}
      extraFlags:
        - "--web.prefix-header=X-Forwarded-Prefix"
      config: |-
        groups:
          - name: "PrometheusWatcher"
            rules:
              - alert: "PrometheusDown"
                expr: absent(up{job="kube-prometheus-stack-prometheus"})
      ingress:
        enabled: true
        hostname: &host "thanos-ruler.${SECRET_PUBLIC_DOMAIN}"
        ingressClassName: "nginx"
        tls: true
        extraTls:
          - hosts:
              - *host
      persistence:
        enabled: true
        storageClass: "ceph-block"
        size: 20Gi
    metrics:
      enabled: true
      serviceMonitor:
        enabled: true
  valuesFrom:
    - targetPath: objstoreConfig.config.bucket
      kind: ConfigMap
      name: thanos-ceph-bucket-v1
      valuesKey: BUCKET_NAME
    - targetPath: objstoreConfig.config.endpoint
      kind: ConfigMap
      name: thanos-ceph-bucket-v1
      valuesKey: BUCKET_HOST
    - targetPath: objstoreConfig.config.region
      kind: ConfigMap
      name: thanos-ceph-bucket-v1
      valuesKey: BUCKET_REGION
    - targetPath: objstoreConfig.config.access_key
      kind: Secret
      name: thanos-ceph-bucket-v1
      valuesKey: AWS_ACCESS_KEY_ID
    - targetPath: objstoreConfig.config.secret_key
      kind: Secret
      name: thanos-ceph-bucket-v1
      valuesKey: AWS_SECRET_ACCESS_KEY
@@ -2,5 +2,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - secret.sops.yaml
  - dashboard
  - object-bucket-claim.yaml
  - helm-release.yaml
cluster/apps/monitoring/thanos/object-bucket-claim.yaml (new file, 9 lines)

@@ -0,0 +1,9 @@
---
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
  name: thanos-ceph-bucket-v1
  namespace: monitoring
spec:
  bucketName: thanos-v1
  storageClassName: ceph-bucket
@@ -1,28 +0,0 @@
# yamllint disable
apiVersion: v1
kind: Secret
metadata:
  name: thanos-objstore
  namespace: monitoring
stringData:
  objstore.yml: ENC[AES256_GCM,data:eeCI5cch1lOSotG2rg1iw+RJ25eGUNzYJw/dZCcx4RfZUknNI+Z72tZWgTkt547R3bpT7IxeOZmQYJ6/WFS1gJynbkzgE6zFHpHbgMkvIWYA4iecaKq+idH5dRvJ1YV+Z4mPoEVJG1XFggKVcpdOowmWZQ022822GpRXwt2/OoltsgSA2REVb37Un2tM6PZ9ofnF9hTlo5v3EfEM5wI/3JUfQQ==,iv:AkWZPzYB/+bHprHxmRHKSuLTlsLDw3f7jJkFBJd3Yxo=,tag:IwC0eN2tOlY7j8OXcnpazg==,type:str]
sops:
  kms: []
  gcp_kms: []
  azure_kv: []
  hc_vault: []
  age:
    - recipient: age1nfn3vxpsgm49ljgs8kxevga9makhh9aply6ddgf9wplsfuwpcv2qzmqatc
      enc: |
        -----BEGIN AGE ENCRYPTED FILE-----
        YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSB6OGkxcnZscnJ1TitwQXgv
        ZnkwcmdnVWJLbFFGSmRHV2pqbitYQW5IWWtFCjNNdVBwTEQ0VDk2OEtHKzkwSi9h
        cVpOR3NONE9HQXB6VDlwMUp0WUYzRGMKLS0tIGRlY0UrOVhzMndJTWFyclg3ZHBV
        bk1tZmFPUy9FUEtiMkVHcDBGaUZwdmsKdv0wD5JNfdBN45ba8bbjpVIEHop4AqKX
        R+Vp9//6wTxsiafO0Bp0RUls1gHuRUYKhgAcH9PP8TIjZCwbUpHEpg==
        -----END AGE ENCRYPTED FILE-----
  lastmodified: "2021-08-11T23:27:44Z"
  mac: ENC[AES256_GCM,data:2z0BgAwz408+gSDfuxGtt75mF++qOSgKb/RGdm0fqTORrFB+a/Yc/alXS0NVOl43WAkxY8HpUozQooa6VhdA88OcoNFUUtz8uhpeymBj1t/xXL4gE85Be2FXmhGFHKOaIulgPIiRScwxvaYG4C289QjIHZ8T3E9ykiYnrl1/bQk=,iv:MehdXoE4gTDoF+mG9SRatebR8LHxoy+g+709/I+LHII=,tag:q97WqxBWqRHLmAekD9IzWw==,type:str]
  pgp: []
  encrypted_regex: ^(data|stringData)$
  version: 3.7.1
@@ -1,37 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: unifi-poller
  namespace: monitoring
spec:
  interval: 5m
  chart:
    spec:
      # renovate: registryUrl=https://k8s-at-home.com/charts/
      chart: unifi-poller
      version: 10.1.0
      sourceRef:
        kind: HelmRepository
        name: k8s-at-home-charts
        namespace: flux-system
      interval: 5m
  values:
    image:
      repository: golift/unifi-poller
      tag: 2.1.3
    env:
      TZ: "America/New_York"
      UP_UNIFI_DEFAULT_ROLE: "homelab-controller"
      UP_UNIFI_DEFAULT_URL: "https://192.168.1.2"
      UP_UNIFI_DEFAULT_VERIFY_SSL: false
      UP_UNIFI_DEFAULT_USER: "unifipoller"
      UP_UNIFI_DEFAULT_PASS: "unifipoller"
      UP_UNIFI_DEFAULT_SAVE_SITES: true
      UP_UNIFI_DEFAULT_SAVE_DPI: false
      UP_INFLUXDB_DISABLE: true
      UP_PROMETHEUS_DISABLE: false
      UP_PROMETHEUS_NAMESPACE: "unifipoller"
    prometheus:
      serviceMonitor:
        enabled: true
@@ -1,27 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: uptimerobot-operator
  namespace: monitoring
spec:
  interval: 5m
  chart:
    spec:
      # renovate: registryUrl=https://brennerm.github.io/uptimerobot-operator/helm
      chart: uptimerobot-operator
      version: 0.3.0
      sourceRef:
        kind: HelmRepository
        name: uptimerobot-operator-charts
        namespace: flux-system
      interval: 5m
  values:
    uptimeRobotApiKey: "${SECRET_UPTIMEROBOT_API_KEY}"
    disableIngressHandling: true
    resources:
      requests:
        cpu: 100m
        memory: 100Mi
      limits:
        memory: 250Mi
cluster/apps/monitoring/vector/agent/helm-release.yaml (new file, 83 lines)

@@ -0,0 +1,83 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: vector-agent
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: vector
      version: 0.13.2
      sourceRef:
        kind: HelmRepository
        name: vector-charts
        namespace: flux-system
      interval: 15m
  install:
    createNamespace: true
    remediation:
      retries: 5
  upgrade:
    remediation:
      retries: 5
  dependsOn:
    - name: vector-aggregator
      namespace: monitoring
  values:
    image:
      repository: timberio/vector
      tag: 0.22.3-debian
    role: Agent
    customConfig:
      data_dir: /vector-data-dir
      api:
        enabled: false
      # Sources
      sources:
        journal_logs:
          type: journald
          journal_directory: /var/log/journal
        kubernetes_logs:
          type: kubernetes_logs
          pod_annotation_fields:
            container_image: "container_image"
            container_name: "container_name"
            pod_annotations: "pod_annotations"
            pod_labels: "pod_labels"
            pod_name: "pod_name"
        vector_metrics:
          type: internal_metrics
      # Sinks
      sinks:
        loki_journal_sink:
          type: vector
          inputs:
            - journal_logs
          address: vector-aggregator:6000
          version: "2"
        loki_kubernetes_sink:
          type: vector
          inputs:
            - kubernetes_logs
          address: vector-aggregator:6010
          version: "2"
        prometheus_sink:
          type: prometheus_exporter
          inputs:
            - vector_metrics
          address: 0.0.0.0:9090
    service:
      enabled: false
    podMonitor:
      enabled: true
      jobLabel: vector-agent
      port: prometheus-sink
    securityContext:
      privileged: true
    tolerations:
      - effect: NoSchedule
        operator: Exists
      - effect: NoExecute
        operator: Exists
@@ -1,4 +1,5 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - helm-release.yaml
cluster/apps/monitoring/vector/aggregator/helm-release.yaml (new file, 148 lines)

@@ -0,0 +1,148 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: vector-aggregator
  namespace: monitoring
spec:
  interval: 15m
  chart:
    spec:
      chart: vector
      version: 0.13.2
      sourceRef:
        kind: HelmRepository
        name: vector-charts
        namespace: flux-system
      interval: 15m
  install:
    createNamespace: true
    remediation:
      retries: 5
  upgrade:
    remediation:
      retries: 5
  dependsOn:
    - name: loki
      namespace: monitoring
  values:
    customConfig:
      data_dir: /vector-data-dir
      api:
        enabled: false
      #
      # Sources
      #
      sources:
        journal_logs:
          type: vector
          address: 0.0.0.0:6000
          version: "2"
        kubernetes_logs:
          type: vector
          address: 0.0.0.0:6010
          version: "2"
        vector_metrics:
          type: internal_metrics
      #
      # Transformations
      #
      transforms:
        kubernetes_logs_remap:
          type: remap
          inputs:
            - kubernetes_logs
          source: |
            # Standardize 'app' index
            .custom_app_name = .pod_labels."app.kubernetes.io/name" || .pod_labels.app || .pod_labels."k8s-app" || "unknown"
      #
      # Sinks
      #
      sinks:
        loki_journal:
          type: loki
          inputs:
            - journal_logs
          endpoint: http://loki-gateway:80
          encoding:
            codec: json
          batch:
            max_bytes: 2049000
          out_of_order_action: accept
          remove_label_fields: true
          remove_timestamp: true
          labels:
            hostname: >-
              {{`{{ host }}`}}
        loki_kubernetes:
          type: loki
          inputs:
            - kubernetes_logs_remap
          endpoint: http://loki-gateway:80
          encoding:
            codec: json
          batch:
            max_bytes: 2049000
          out_of_order_action: accept
          remove_label_fields: true
          remove_timestamp: true
          labels:
            app: >-
              {{`{{ custom_app_name }}`}}
            namespace: >-
              {{`{{ kubernetes.pod_namespace }}`}}
            node: >-
              {{`{{ kubernetes.pod_node_name }}`}}
        prometheus_sink:
          type: prometheus_exporter
          inputs:
            - vector_metrics
          address: 0.0.0.0:9090
        # console_debug:
        #   type: console
        #   inputs:
        #     - opnsense_filterlog_logs
        #   target: stdout
        #   encoding:
        #     codec: json
    image:
      repository: timberio/vector
      tag: 0.22.3-debian
    role: Stateless-Aggregator
    replicas: 3
    # env:
    #   - name: VECTOR_LOG
    #     value: debug
    service:
      enabled: true
      type: LoadBalancer
    podMonitor:
      enabled: true
      jobLabel: vector-aggregator
      port: prometheus-sink
    affinity:
      podAntiAffinity:
        preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                  - key: app.kubernetes.io/component
                    operator: In
                    values:
                      - Stateless-Aggregator
              topologyKey: kubernetes.io/hostname
  postRenderers:
    - kustomize:
        patchesJson6902:
          - target:
              kind: Service
              name: vector-aggregator
            patch:
              - op: add
                path: /spec/loadBalancerIP
                value:
                  - "${SVC_SYSLOG_ADDR}"
              - op: replace
                path: /spec/externalTrafficPolicy
                value: Local
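After Flux substitutes ${SVC_SYSLOG_ADDR} (set to 10.75.45.102 in the cluster-settings hunk at the end of this commit) and applies the postRenderers patch above, the aggregator Service should come out roughly as below. This is only a sketch of the rendered result, not a file in the repository; spec.loadBalancerIP is shown here as the plain string the core/v1 Service API expects:

# Illustrative rendered Service (not committed); selector, labels and ports omitted
apiVersion: v1
kind: Service
metadata:
  name: vector-aggregator
  namespace: monitoring
spec:
  type: LoadBalancer
  loadBalancerIP: 10.75.45.102
  externalTrafficPolicy: Local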
@@ -1,4 +1,5 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - helm-release.yaml
@@ -1,4 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helm-release.yaml
  - agent
  - aggregator
@@ -17,7 +17,7 @@ data:
  LB_COREDNS_IP: 10.75.45.100
  LB_TRAEFIK_IP: 10.75.45.101
  SVC_NGINX_ADDR: 10.75.45.101
  LB_LOKI_IP: 10.75.45.102
  SVC_SYSLOG_ADDR: 10.75.45.102
  SVC_PLEX_ADDR: 10.75.45.103
  SVC_JELLYFIN_ADDR: 10.75.45.104
  LB_BLOCKY_IP: 10.75.45.105