refactor: monitoring updates

Toboshii Nakama
2022-07-04 10:35:38 -05:00
parent b375239301
commit cd710a8bec
27 changed files with 752 additions and 577 deletions

View File

@@ -7,6 +7,7 @@ resources:
- flux-system
- kube-system
- media
- monitoring
- networking
- security
# - home

View File

@@ -1,229 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: botkube
namespace: monitoring
spec:
interval: 5m
chart:
spec:
# renovate: registryUrl=https://infracloudio.github.io/charts
chart: botkube
version: v0.12.3
sourceRef:
kind: HelmRepository
name: infracloudio-charts
namespace: flux-system
interval: 5m
values:
communications:
discord:
enabled: true
token: "${SECRET_BOTKUBE_DISCORD_TOKEN}"
botid: "842864233690431508"
channel: "842863758522974228"
serviceMonitor:
enabled: true
config:
recommendations: false
settings:
clustername: home-cluster
resources:
- name: v1/services
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: apps/v1/deployments
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- update
- delete
- error
- name: apps/v1/statefulsets
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- update
- delete
- error
- name: networking.k8s.io/v1beta1/ingresses
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: v1/nodes
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: v1/namespaces
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: v1/persistentvolumes
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: v1/persistentvolumeclaims
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: v1/configmaps
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: apps/v1/daemonsets
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: batch/v1/jobs
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- update
- error
- name: rbac.authorization.k8s.io/v1/rolebindings
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: rbac.authorization.k8s.io/v1/clusterrolebindings
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error
- name: rbac.authorization.k8s.io/v1/clusterroles
namespaces:
include:
- default
- dev
- home
- kube-system
- media
- monitoring
- networking
events:
- create
- delete
- error

View File

@@ -5,38 +5,37 @@ metadata:
name: grafana
namespace: monitoring
spec:
interval: 5m
interval: 15m
chart:
spec:
# renovate: registryUrl=https://grafana.github.io/helm-charts
chart: grafana
version: 6.17.10
version: 6.32.0
sourceRef:
kind: HelmRepository
name: grafana-charts
namespace: flux-system
interval: 5m
interval: 15m
install:
createNamespace: true
remediation:
retries: 5
upgrade:
remediation:
retries: 5
values:
image:
repository: ghcr.io/k8s-at-home/grafana
replicas: 1
replicas: 3
env:
GF_EXPLORE_ENABLED: true
GF_PANELS_DISABLE_SANITIZE_HTML: true
GF_LOG_FILTERS: rendering:debug
GF_DATE_FORMATS_USE_BROWSER_LOCALE: true
GF_DATE_FORMATS_FULL_DATE: "MMM Do, YYYY hh:mm:ss a"
GF_DATE_FORMATS_INTERVAL_SECOND: "hh:mm:ss a"
GF_DATE_FORMATS_INTERVAL_MINUTE: "hh:mm a"
GF_DATE_FORMATS_INTERVAL_HOUR: "DD/MM hh:mm a"
GF_DATE_FORMATS_INTERVAL_DAY: "DD/MM"
GF_DATE_FORMATS_INTERVAL_MONTH: "MM-YYYY"
GF_DATE_FORMATS_INTERVAL_YEAR: "YYYY"
VAR_BLOCKY_URL: "http://blocky.networking.svc.cluster.local:4000"
adminPassword: "${SECRET_GRAFANA_PASSWORD}"
GF_SECURITY_ALLOW_EMBEDDING: true
GF_SECURITY_COOKIE_SAMESITE: grafana
admin:
existingSecret: grafana-admin-creds
grafana.ini:
server:
root_url: "https://grafana.${SECRET_DOMAIN}"
root_url: "https://grafana.${SECRET_PUBLIC_DOMAIN}"
paths:
data: /var/lib/grafana/data
logs: /var/log/grafana
@@ -81,6 +80,9 @@ spec:
orgId: 1
datasources.yaml:
apiVersion: 1
deleteDatasources:
- name: Loki
orgId: 1
datasources:
- name: Prometheus
type: prometheus
@@ -90,29 +92,9 @@ spec:
- name: Loki
type: loki
access: proxy
url: http://loki:3100
url: http://loki-gateway:80
dashboards:
default:
# Ref: https://grafana.com/grafana/dashboards/12175
calico-felix:
gnetId: 12175
revision: 5
datasource: Prometheus
# Ref: https://grafana.com/grafana/dashboards/2842
ceph-cluster:
gnetId: 2842
revision: 14
datasource: Prometheus
# Ref: https://grafana.com/grafana/dashboards/5336
ceph-osd:
gnetId: 5336
revision: 5
datasource: Prometheus
# Ref: https://grafana.com/grafana/dashboards/5342
ceph-pools:
gnetId: 5342
revision: 5
datasource: Prometheus
# Ref: https://grafana.com/grafana/dashboards/7845
zfs:
gnetId: 7845
@@ -144,18 +126,9 @@ spec:
flux-control-plane:
url: https://raw.githubusercontent.com/fluxcd/flux2/main/manifests/monitoring/grafana/dashboards/control-plane.json
datasource: Prometheus
blocky:
url: https://raw.githubusercontent.com/0xERR0R/blocky/master/docs/blocky-grafana.json
datasource: Prometheus
apc-smart-ups-1500:
url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/apc-smart-ups-1500-snmp.json
datasource: Prometheus
lidarr:
url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/lidarr.json
radarr:
url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/radarr.json
sonarr:
url: https://raw.githubusercontent.com/k8s-at-home/grafana-dashboards/main/sonarr.json
sidecar:
dashboards:
enabled: true
@@ -174,16 +147,31 @@ spec:
enabled: true
ingress:
enabled: true
ingressClassName: "traefik"
ingressClassName: "nginx"
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
nginx.ingress.kubernetes.io/affinity: "cookie"
nginx.ingress.kubernetes.io/session-cookie-name: "grafana"
nginx.ingress.kubernetes.io/session-cookie-expires: "172800"
nginx.ingress.kubernetes.io/session-cookie-max-age: "172800"
hosts:
- "grafana.${SECRET_DOMAIN}"
- &host "grafana.${SECRET_PUBLIC_DOMAIN}"
tls:
- hosts:
- "grafana.${SECRET_DOMAIN}"
- *host
serviceAccount:
create: true
autoMount: true
persistence:
enabled: false
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app.kubernetes.io/name
operator: In
values:
- grafana
topologyKey: kubernetes.io/hostname

View File

@@ -1,4 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- secret.sops.yaml
- helm-release.yaml

View File

@@ -0,0 +1,29 @@
# yamllint disable
apiVersion: v1
kind: Secret
metadata:
name: grafana-admin-creds
namespace: monitoring
stringData:
admin-user: ENC[AES256_GCM,data:YEpVd78=,iv:Ed/WWBj61OyoqGgcx3mCCK/fJj020oj4h+sHyt6eeR0=,tag:DbTj3rfV/nxnaFOfopoAmA==,type:str]
admin-password: ENC[AES256_GCM,data:zYt9S6FPcCTtoukltlkQnw==,iv:1jSk25EJ6tDOnRXJOL2weht10S1zJ3/SVMUN7HfW61A=,tag:R2TW5PB4iUuvl4XpUglrIQ==,type:str]
sops:
kms: []
gcp_kms: []
azure_kv: []
hc_vault: []
age:
- recipient: age1nfn3vxpsgm49ljgs8kxevga9makhh9aply6ddgf9wplsfuwpcv2qzmqatc
enc: |
-----BEGIN AGE ENCRYPTED FILE-----
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBWZ0VxRm81RkttajQvZ2c3
SktZTDZ6MUZFeUZXQVFGKy9KQmFvQjhCVG1NCnM4dUo0Ym0remNsOTN3SWZUMTMx
UmE3TkhoS1c1UGN3QklXdCtWWFNqOTQKLS0tIGxXbHNVTUxtdXNMSWJnOWo0QjVr
QXZMSUxFNUdaWHI2UkpOVjBxa1FHVGcK32kLlwinxQcTEbWRBhc5ju2e3ORnFxeo
2iDkIZepjjxd9Rwkak3o/+YGtBJa0Elvw07bpvvmsO20GZtkVCKIIg==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2022-07-04T15:09:39Z"
mac: ENC[AES256_GCM,data:jQvHQowQatizYBWcVWd0GBmxVjWKri21h6lnhhql5nViTNpKns2J/r2h4bDkds4BLW41voQyCG9mwN/cnBrjOOBhIp55eEzgw+Jq2x5I980FvDBdRq+d5PO+5A/DCjphB1HSRyLd4mkpGS9qX6c1ieOzW2hUm797OZVf2Ov6Vvs=,iv:CYiiR7UKftaRdCiH4e+1xgqleQzgzdFZpT5tLzmZpQ4=,tag:JJR5EDNOQxX2NHGg25n4lg==,type:str]
pgp: []
encrypted_regex: ^(data|stringData)$
version: 3.7.3

View File

@@ -5,18 +5,23 @@ metadata:
name: kube-prometheus-stack
namespace: monitoring
spec:
interval: 5m
interval: 15m
chart:
spec:
# renovate: registryUrl=https://prometheus-community.github.io/helm-charts
chart: kube-prometheus-stack
version: 18.1.1
version: 36.2.1
sourceRef:
kind: HelmRepository
name: prometheus-community-charts
namespace: flux-system
interval: 5m
timeout: 20m
interval: 15m
install:
createNamespace: true
remediation:
retries: 5
upgrade:
remediation:
retries: 5
values:
alertmanager:
config:
@@ -66,22 +71,24 @@ spec:
ingress:
enabled: true
pathType: Prefix
ingressClassName: "traefik"
ingressClassName: "nginx"
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
hosts:
- "alert-manager.${SECRET_DOMAIN}"
- &host "alert-manager.${SECRET_PUBLIC_DOMAIN}"
tls:
- hosts:
- "alert-manager.${SECRET_DOMAIN}"
- *host
alertmanagerSpec:
replicas: 3
podAntiAffinity: hard
storage:
volumeClaimTemplate:
spec:
storageClassName: "ceph-block"
resources:
requests:
storage: 10Gi
storage: 250Mi
nodeExporter:
serviceMonitor:
relabelings:
@@ -91,6 +98,19 @@ spec:
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: kubernetes_node
kube-state-metrics:
metricLabelsAllowlist:
- "persistentvolumeclaims=[*]"
prometheus:
monitor:
enabled: true
relabelings:
- action: replace
regex: (.*)
replacement: $1
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: kubernetes_node
grafana:
enabled: false
forceDeployDashboards: true
@@ -110,59 +130,55 @@ spec:
kubeApiServer:
enabled: true
kubeControllerManager:
enabled: true
endpoints:
- 10.75.40.10
- 10.75.40.11
- 10.75.40.12
enabled: false
kubeScheduler:
enabled: true
endpoints:
- 10.75.40.10
- 10.75.40.11
- 10.75.40.12
enabled: false
kubeProxy:
enabled: true
endpoints:
- 10.75.40.10
- 10.75.40.11
- 10.75.40.12
enabled: false
kubeEtcd:
enabled: true
endpoints:
- 10.75.40.10
- 10.75.40.11
- 10.75.40.12
service:
enabled: true
port: 2381
targetPort: 2381
enabled: false
prometheus:
ingress:
enabled: true
pathType: Prefix
ingressClassName: "traefik"
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
ingressClassName: "nginx"
hosts:
- "prometheus.${SECRET_DOMAIN}"
- &host "prometheus.${SECRET_DOMAIN}"
tls:
- hosts:
- "prometheus.${SECRET_DOMAIN}"
- *host
thanosService:
enabled: true
thanosServiceMonitor:
enabled: true
thanosIngress:
enabled: true
pathType: Prefix
ingressClassName: "nginx"
annotations:
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
hosts:
- &host "thanos-sidecar.${SECRET_PUBLIC_DOMAIN}"
tls:
- hosts:
- *host
resources:
requests:
cpu: 1000m
memory: 1000Mi
thanosService:
enabled: true
prometheusSpec:
replicas: 1
replicaExternalLabelName: "replica"
replicas: 3
replicaExternalLabelName: __replica__
externalLabels:
cluster: ${CLUSTER_NAME}
podAntiAffinity: hard
retentionSize: "6GB"
ruleSelectorNilUsesHelmValues: false
serviceMonitorSelectorNilUsesHelmValues: false
podMonitorSelectorNilUsesHelmValues: false
probeSelectorNilUsesHelmValues: false
retention: 6h
retention: 2d
enableAdminAPI: true
walCompression: true
storageSpec:
@@ -173,13 +189,16 @@ spec:
requests:
storage: 10Gi
thanos:
image: quay.io/thanos/thanos:v0.23.1
version: v0.22.0
image: quay.io/thanos/thanos:v0.26.0
# renovate: datasource=docker depName=quay.io/thanos/thanos
version: "v0.26.0"
objectStorageConfig:
name: thanos-objstore
name: thanos-objstore-secret
key: objstore.yml
additionalScrapeConfigs:
- job_name: 'node-exporter'
scrape_interval: 1m
scrape_timeout: 10s
honor_timestamps: true
static_configs:
- targets:
@@ -188,12 +207,16 @@ spec:
# franxx nas
- 10.75.30.15:9100
- job_name: 'minio-job'
scrape_interval: 1m
scrape_timeout: 10s
honor_timestamps: true
metrics_path: /minio/v2/metrics/cluster
static_configs:
- targets:
- 10.75.30.15:9000
- job_name: "coredns"
scrape_interval: 1m
scrape_timeout: 10s
honor_timestamps: true
static_configs:
- targets:
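
Note: the sidecar's objectStorageConfig above now points at a thanos-objstore-secret that is not included in this commit. A minimal sketch of the expected shape, following the same S3-style objstore config the Thanos chart later in this commit is wired to (bucket, endpoint and credentials are illustrative placeholders, not values from this repo):

---
apiVersion: v1
kind: Secret
metadata:
  name: thanos-objstore-secret
  namespace: monitoring
stringData:
  objstore.yml: |
    type: s3
    config:
      bucket: thanos-v1                 # illustrative; see the thanos-ceph-bucket-v1 claim below
      endpoint: <ceph-rgw-endpoint>     # illustrative placeholder
      access_key: <bucket-access-key>   # illustrative placeholder
      secret_key: <bucket-secret-key>   # illustrative placeholder
      insecure: true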

View File

@@ -1,4 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- helm-release.yaml

View File

@@ -2,14 +2,14 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- botkube
- namespace.yaml
- grafana
- kube-prometheus-stack
- loki
- promtail
# - snmp-exporter-switch-0
- snmp-exporter-ups-0
- speedtest-exporter
#- snmp-exporter-ups-0
#- speedtest-exporter
- thanos
- vector
#- unifi-poller
- uptimerobot-heartbeat
#- uptimerobot-heartbeat

View File

@@ -0,0 +1,47 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: loki-alerting-rules
namespace: monitoring
data:
loki-alerting-rules.yaml: |-
groups:
#
# SMART Failures
#
- name: smart-failure
rules:
- alert: SmartFailures
expr: |
sum by (hostname) (count_over_time({hostname=~".+"} | json | _SYSTEMD_UNIT = "smartmontools.service" !~ "(?i)previous self-test completed without error" !~ "(?i)Prefailure" |~ "(?i)(error|fail)"[2m])) > 0
for: 2m
labels:
severity: critical
category: logs
annotations:
summary: "SMART has reported failures on host {{ $labels.hostname }}"
#
# *arr
#
- name: arr
rules:
- alert: ArrDatabaseIsLocked
expr: |
sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database is locked"[2m])) > 0
for: 2m
labels:
severity: critical
category: logs
annotations:
summary: "{{ $labels.app }} is experiencing locked database issues"
- alert: ArrDatabaseIsMalformed
expr: |
sum by (app) (count_over_time({app=~".*arr"} |~ "(?i)database disk image is malformed"[2m])) > 0
for: 2m
labels:
severity: critical
category: logs
annotations:
summary: "{{ $labels.app }} is experiencing malformed database disk image issues"

View File

@@ -5,33 +5,148 @@ metadata:
name: loki
namespace: monitoring
spec:
interval: 5m
interval: 15m
chart:
spec:
# renovate: registryUrl=https://grafana.github.io/helm-charts
chart: loki
version: 2.8.1
chart: loki-simple-scalable
version: 1.4.3
sourceRef:
kind: HelmRepository
name: grafana-charts
namespace: flux-system
interval: 5m
interval: 15m
install:
createNamespace: true
remediation:
retries: 5
upgrade:
remediation:
retries: 5
values:
image:
repository: ghcr.io/k8s-at-home/loki
config:
storage_config:
aws:
bucketnames: loki
endpoint: nas01:9000
access_key_id: "${SECRET_MINIO_ACCESS_KEY}"
secret_access_key: "${SECRET_MINIO_SECRET_KEY}"
s3forcepathstyle: true
insecure: true
boltdb_shipper:
active_index_directory: /data/loki/index
cache_location: /data/loki/index_cache
resync_interval: 5s
loki:
structuredConfig:
auth_enabled: false
server:
log_level: info
http_listen_port: 3100
grpc_listen_port: 9095
memberlist:
join_members: ["loki-memberlist"]
limits_config:
retention_period: 14d
enforce_metric_name: false
reject_old_samples: true
reject_old_samples_max_age: 168h
max_cache_freshness_per_query: 10m
split_queries_by_interval: 15m
ingestion_rate_mb: 8
ingestion_burst_size_mb: 16
schema_config:
configs:
- from: "2021-08-01"
store: boltdb-shipper
object_store: s3
schema: v11
index:
prefix: loki_index_
period: 24h
common:
path_prefix: /var/loki
replication_factor: 3
storage:
s3:
s3: null
insecure: true
s3forcepathstyle: true
ring:
kvstore:
store: memberlist
ruler:
enable_api: true
enable_alertmanager_v2: true
alertmanager_url: http://kube-prometheus-stack-alertmanager:9093
storage:
type: local
local:
directory: /rules
rule_path: /tmp/scratch
ring:
kvstore:
store: memberlist
distributor:
ring:
kvstore:
store: memberlist
compactor:
working_directory: /var/loki/boltdb-shipper-compactor
shared_store: s3
compaction_interval: 10m
retention_enabled: true
retention_delete_delay: 2h
retention_delete_worker_count: 150
ingester:
max_chunk_age: 1h
lifecycler:
ring:
kvstore:
store: memberlist
analytics:
reporting_enabled: false
gateway:
enabled: true
replicas: 3
ingress:
enabled: true
ingressClassName: "nginx"
hosts:
- host: &host "loki.${SECRET_PUBLIC_DOMAIN}"
paths:
- path: /
pathType: Prefix
tls:
- hosts:
- *host
write:
replicas: 3
persistence:
size: 10Gi
storageClass: ceph-block
read:
replicas: 3
extraVolumeMounts:
- name: loki-rules
mountPath: /rules/fake
- name: loki-rules-tmp
mountPath: /tmp/scratch
- name: loki-tmp
mountPath: /tmp/loki-tmp
extraVolumes:
- name: loki-rules
configMap:
name: loki-alerting-rules
- name: loki-rules-tmp
emptyDir: {}
- name: loki-tmp
emptyDir: {}
persistence:
size: 10Gi
storageClass: ceph-block
serviceMonitor:
enabled: true
valuesFrom:
- targetPath: loki.structuredConfig.common.storage.s3.bucketnames
kind: ConfigMap
name: loki-chunks-bucket-v1
valuesKey: BUCKET_NAME
- targetPath: loki.structuredConfig.common.storage.s3.endpoint
kind: ConfigMap
name: loki-chunks-bucket-v1
valuesKey: BUCKET_HOST
- targetPath: loki.structuredConfig.common.storage.s3.access_key_id
kind: Secret
name: loki-chunks-bucket-v1
valuesKey: AWS_ACCESS_KEY_ID
- targetPath: loki.structuredConfig.common.storage.s3.secret_access_key
kind: Secret
name: loki-chunks-bucket-v1
valuesKey: AWS_SECRET_ACCESS_KEY

View File

@@ -1,5 +1,7 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- prometheus-rule.yaml
- object-bucket-claim.yaml
- config-map.yaml
- helm-release.yaml

View File

@@ -0,0 +1,9 @@
---
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: loki-chunks-bucket-v1
namespace: monitoring
spec:
bucketName: loki-chunks-v1
storageClassName: ceph-bucket
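
The valuesFrom entries in the Loki HelmRelease above pull the bucket name, host and credentials from a ConfigMap and Secret named after this claim. A rough sketch of what the bucket provisioner is expected to emit (values are illustrative placeholders):

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: loki-chunks-bucket-v1
  namespace: monitoring
data:
  BUCKET_NAME: loki-chunks-v1
  BUCKET_HOST: <ceph-rgw-service-host>   # illustrative placeholder
---
apiVersion: v1
kind: Secret
metadata:
  name: loki-chunks-bucket-v1
  namespace: monitoring
stringData:
  AWS_ACCESS_KEY_ID: <generated>         # illustrative placeholder
  AWS_SECRET_ACCESS_KEY: <generated>     # illustrative placeholder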

View File

@@ -1,109 +0,0 @@
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: loki.rules
namespace: monitoring
spec:
groups:
- name: loki.rules
rules:
- alert: LokiRequestErrors
annotations:
message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value | humanizePercentage }} errors."
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
message: "{{ $labels.job }} is experiencing {{ $value | humanizePercentage }} increase of panics."
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job)
> 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
message: "{{ $labels.job }} {{ $labels.route }} is experiencing {{ $value }}s 99th percentile latency."
expr: |
namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"}
> 1
for: 15m
labels:
severity: critical
- expr: |
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:99quantile
- expr: |
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job))
record: job:loki_request_duration_seconds:50quantile
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds:avg
- expr: |
sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job)
record: job:loki_request_duration_seconds_bucket:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job)
record: job:loki_request_duration_seconds_sum:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_count[1m])) by (job)
record: job:loki_request_duration_seconds_count:sum_rate
- expr: |
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:99quantile
- expr: |
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route))
record: job_route:loki_request_duration_seconds:50quantile
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds:avg
- expr: |
sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, job, route)
record: job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_sum:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_count[1m])) by (job, route)
record: job_route:loki_request_duration_seconds_count:sum_rate
- expr: |
histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:99quantile
- expr: |
histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route))
record: namespace_job_route:loki_request_duration_seconds:50quantile
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds:avg
- expr: |
sum(rate(loki_request_duration_seconds_bucket[1m]))
by (le, namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_bucket:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_sum[1m]))
by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_sum:sum_rate
- expr: |
sum(rate(loki_request_duration_seconds_count[1m]))
by (namespace, job, route)
record: namespace_job_route:loki_request_duration_seconds_count:sum_rate

View File

@@ -0,0 +1,8 @@
---
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
kustomize.toolkit.fluxcd.io/prune: disabled
goldilocks.fairwinds.com/enabled: "true"

View File

@@ -0,0 +1,38 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
configMapGenerator:
- name: thanos-bucket-replicate-dashboard
files:
- thanos-bucket-replicate-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/bucket-replicate.json
- name: thanos-compact-dashboard
files:
- thanos-compact-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/compact.json
- name: thanos-overview-dashboard
files:
- thanos-overview-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/overview.json
- name: thanos-query-dashboard
files:
- thanos-query-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/query.json
- name: thanos-query-frontend-dashboard
files:
- thanos-query-frontend-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/query-frontend.json
- name: thanos-receive-dashboard
files:
- thanos-receive-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/receive.json
- name: thanos-rule-dashboard
files:
- thanos-rule-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/rule.json
- name: thanos-sidecar-dashboard
files:
- thanos-sidecar-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/sidecar.json
- name: thanos-store-dashboard
files:
- thanos-store-dashboard.json=https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/thanos/dashboards/store.json
generatorOptions:
disableNameSuffixHash: true
annotations:
kustomize.toolkit.fluxcd.io/substitute: disabled
labels:
grafana_dashboard: "true"
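
For reference, a sketch of one ConfigMap this generator is expected to produce, assuming the remote JSON files can be fetched; the grafana_dashboard: "true" label is what the dashboard sidecar enabled in the Grafana release above watches for (contents abbreviated):

---
apiVersion: v1
kind: ConfigMap
metadata:
  name: thanos-overview-dashboard
  namespace: monitoring
  labels:
    grafana_dashboard: "true"
  annotations:
    kustomize.toolkit.fluxcd.io/substitute: disabled
data:
  thanos-overview-dashboard.json: |
    { "title": "Thanos / Overview", ... }   # fetched dashboard JSON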

View File

@@ -5,49 +5,148 @@ metadata:
name: thanos
namespace: monitoring
spec:
interval: 5m
interval: 15m
chart:
spec:
# renovate: registryUrl=https://charts.bitnami.com/bitnami
chart: thanos
version: 8.1.2
version: 10.5.4
sourceRef:
kind: HelmRepository
name: bitnami-charts
namespace: flux-system
interval: 5m
interval: 15m
install:
createNamespace: true
remediation:
retries: 5
upgrade:
remediation:
retries: 5
dependsOn:
- name: kube-prometheus-stack
namespace: monitoring
values:
objstoreConfig:
type: s3
config:
insecure: true
query:
enabled: true
replicaCount: 3
replicaLabels:
- replica
- __replica__
dnsDiscovery:
sidecarsService: kube-prometheus-stack-thanos-discovery
sidecarsNamespace: monitoring
ingress:
enabled: true
hostname: "thanos.${SECRET_DOMAIN}"
ingressClassName: "traefik"
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
hostname: &host "thanos.${SECRET_PUBLIC_DOMAIN}"
ingressClassName: "nginx"
tls: true
existingObjstoreSecret: thanos-objstore
extraTls:
- hosts:
- *host
queryFrontend:
enabled: false
bucketweb:
enabled: true
replicaCount: 3
podAntiAffinityPreset: hard
ingress:
enabled: true
hostname: &host "thanos-bucketweb.${SECRET_PUBLIC_DOMAIN}"
ingressClassName: "nginx"
tls: true
extraTls:
- hosts:
- *host
compactor:
enabled: true
persistence:
enabled: false
storegateway:
enabled: true
extraFlags:
- "--compact.concurrency"
- "4"
retentionResolutionRaw: 14d
retentionResolution5m: 14d
retentionResolution1h: 14d
ingress:
enabled: true
hostname: &host "thanos-compactor.${SECRET_PUBLIC_DOMAIN}"
ingressClassName: "nginx"
tls: true
extraTls:
- hosts:
- *host
persistence:
enabled: true
size: 5Gi
storageClass: "ceph-block"
size: 50Gi
storegateway:
enabled: true
replicaCount: 3
podAntiAffinityPreset: hard
ingress:
enabled: true
hostname: &host "thanos-storegateway.${SECRET_PUBLIC_DOMAIN}"
ingressClassName: "nginx"
tls: true
extraTls:
- hosts:
- *host
persistence:
enabled: true
storageClass: "ceph-block"
size: 20Gi
ruler:
enabled: false
enabled: true
replicaCount: 3
podAntiAffinityPreset: hard
dnsDiscovery:
enabled: true
alertmanagers:
- http://kube-prometheus-stack-alertmanager.monitoring:9093
clusterName: ${CLUSTER_NAME}
extraFlags:
- "--web.prefix-header=X-Forwarded-Prefix"
config: |-
groups:
- name: "PrometheusWatcher"
rules:
- alert: "PrometheusDown"
expr: absent(up{job="kube-prometheus-stack-prometheus"})
ingress:
enabled: true
hostname: &host "thanos-ruler.${SECRET_PUBLIC_DOMAIN}"
ingressClassName: "nginx"
tls: true
extraTls:
- hosts:
- *host
persistence:
enabled: true
storageClass: "ceph-block"
size: 20Gi
metrics:
enabled: true
serviceMonitor:
enabled: true
valuesFrom:
- targetPath: objstoreConfig.config.bucket
kind: ConfigMap
name: thanos-ceph-bucket-v1
valuesKey: BUCKET_NAME
- targetPath: objstoreConfig.config.endpoint
kind: ConfigMap
name: thanos-ceph-bucket-v1
valuesKey: BUCKET_HOST
- targetPath: objstoreConfig.config.region
kind: ConfigMap
name: thanos-ceph-bucket-v1
valuesKey: BUCKET_REGION
- targetPath: objstoreConfig.config.access_key
kind: Secret
name: thanos-ceph-bucket-v1
valuesKey: AWS_ACCESS_KEY_ID
- targetPath: objstoreConfig.config.secret_key
kind: Secret
name: thanos-ceph-bucket-v1
valuesKey: AWS_SECRET_ACCESS_KEY

View File

@@ -2,5 +2,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- secret.sops.yaml
- dashboard
- object-bucket-claim.yaml
- helm-release.yaml

View File

@@ -0,0 +1,9 @@
---
apiVersion: objectbucket.io/v1alpha1
kind: ObjectBucketClaim
metadata:
name: thanos-ceph-bucket-v1
namespace: monitoring
spec:
bucketName: thanos-v1
storageClassName: ceph-bucket

View File

@@ -1,28 +0,0 @@
# yamllint disable
apiVersion: v1
kind: Secret
metadata:
name: thanos-objstore
namespace: monitoring
stringData:
objstore.yml: ENC[AES256_GCM,data:eeCI5cch1lOSotG2rg1iw+RJ25eGUNzYJw/dZCcx4RfZUknNI+Z72tZWgTkt547R3bpT7IxeOZmQYJ6/WFS1gJynbkzgE6zFHpHbgMkvIWYA4iecaKq+idH5dRvJ1YV+Z4mPoEVJG1XFggKVcpdOowmWZQ022822GpRXwt2/OoltsgSA2REVb37Un2tM6PZ9ofnF9hTlo5v3EfEM5wI/3JUfQQ==,iv:AkWZPzYB/+bHprHxmRHKSuLTlsLDw3f7jJkFBJd3Yxo=,tag:IwC0eN2tOlY7j8OXcnpazg==,type:str]
sops:
kms: []
gcp_kms: []
azure_kv: []
hc_vault: []
age:
- recipient: age1nfn3vxpsgm49ljgs8kxevga9makhh9aply6ddgf9wplsfuwpcv2qzmqatc
enc: |
-----BEGIN AGE ENCRYPTED FILE-----
YWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSB6OGkxcnZscnJ1TitwQXgv
ZnkwcmdnVWJLbFFGSmRHV2pqbitYQW5IWWtFCjNNdVBwTEQ0VDk2OEtHKzkwSi9h
cVpOR3NONE9HQXB6VDlwMUp0WUYzRGMKLS0tIGRlY0UrOVhzMndJTWFyclg3ZHBV
bk1tZmFPUy9FUEtiMkVHcDBGaUZwdmsKdv0wD5JNfdBN45ba8bbjpVIEHop4AqKX
R+Vp9//6wTxsiafO0Bp0RUls1gHuRUYKhgAcH9PP8TIjZCwbUpHEpg==
-----END AGE ENCRYPTED FILE-----
lastmodified: "2021-08-11T23:27:44Z"
mac: ENC[AES256_GCM,data:2z0BgAwz408+gSDfuxGtt75mF++qOSgKb/RGdm0fqTORrFB+a/Yc/alXS0NVOl43WAkxY8HpUozQooa6VhdA88OcoNFUUtz8uhpeymBj1t/xXL4gE85Be2FXmhGFHKOaIulgPIiRScwxvaYG4C289QjIHZ8T3E9ykiYnrl1/bQk=,iv:MehdXoE4gTDoF+mG9SRatebR8LHxoy+g+709/I+LHII=,tag:q97WqxBWqRHLmAekD9IzWw==,type:str]
pgp: []
encrypted_regex: ^(data|stringData)$
version: 3.7.1

View File

@@ -1,37 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: unifi-poller
namespace: monitoring
spec:
interval: 5m
chart:
spec:
# renovate: registryUrl=https://k8s-at-home.com/charts/
chart: unifi-poller
version: 10.1.0
sourceRef:
kind: HelmRepository
name: k8s-at-home-charts
namespace: flux-system
interval: 5m
values:
image:
repository: golift/unifi-poller
tag: 2.1.3
env:
TZ: "America/New_York"
UP_UNIFI_DEFAULT_ROLE: "homelab-controller"
UP_UNIFI_DEFAULT_URL: "https://192.168.1.2"
UP_UNIFI_DEFAULT_VERIFY_SSL: false
UP_UNIFI_DEFAULT_USER: "unifipoller"
UP_UNIFI_DEFAULT_PASS: "unifipoller"
UP_UNIFI_DEFAULT_SAVE_SITES: true
UP_UNIFI_DEFAULT_SAVE_DPI: false
UP_INFLUXDB_DISABLE: true
UP_PROMETHEUS_DISABLE: false
UP_PROMETHEUS_NAMESPACE: "unifipoller"
prometheus:
serviceMonitor:
enabled: true

View File

@@ -1,27 +0,0 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: uptimerobot-operator
namespace: monitoring
spec:
interval: 5m
chart:
spec:
# renovate: registryUrl=https://brennerm.github.io/uptimerobot-operator/helm
chart: uptimerobot-operator
version: 0.3.0
sourceRef:
kind: HelmRepository
name: uptimerobot-operator-charts
namespace: flux-system
interval: 5m
values:
uptimeRobotApiKey: "${SECRET_UPTIMEROBOT_API_KEY}"
disableIngressHandling: true
resources:
requests:
cpu: 100m
memory: 100Mi
limits:
memory: 250Mi

View File

@@ -0,0 +1,83 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: vector-agent
namespace: monitoring
spec:
interval: 15m
chart:
spec:
chart: vector
version: 0.13.2
sourceRef:
kind: HelmRepository
name: vector-charts
namespace: flux-system
interval: 15m
install:
createNamespace: true
remediation:
retries: 5
upgrade:
remediation:
retries: 5
dependsOn:
- name: vector-aggregator
namespace: monitoring
values:
image:
repository: timberio/vector
tag: 0.22.3-debian
role: Agent
customConfig:
data_dir: /vector-data-dir
api:
enabled: false
# Sources
sources:
journal_logs:
type: journald
journal_directory: /var/log/journal
kubernetes_logs:
type: kubernetes_logs
pod_annotation_fields:
container_image: "container_image"
container_name: "container_name"
pod_annotations: "pod_annotations"
pod_labels: "pod_labels"
pod_name: "pod_name"
vector_metrics:
type: internal_metrics
# Sinks
sinks:
loki_journal_sink:
type: vector
inputs:
- journal_logs
address: vector-aggregator:6000
version: "2"
loki_kubernetes_sink:
type: vector
inputs:
- kubernetes_logs
address: vector-aggregator:6010
version: "2"
prometheus_sink:
type: prometheus_exporter
inputs:
- vector_metrics
address: 0.0.0.0:9090
service:
enabled: false
podMonitor:
enabled: true
jobLabel: vector-agent
port: prometheus-sink
securityContext:
privileged: true
tolerations:
- effect: NoSchedule
operator: Exists
- effect: NoExecute
operator: Exists

View File

@@ -1,4 +1,5 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- helm-release.yaml

View File

@@ -0,0 +1,148 @@
---
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
name: vector-aggregator
namespace: monitoring
spec:
interval: 15m
chart:
spec:
chart: vector
version: 0.13.2
sourceRef:
kind: HelmRepository
name: vector-charts
namespace: flux-system
interval: 15m
install:
createNamespace: true
remediation:
retries: 5
upgrade:
remediation:
retries: 5
dependsOn:
- name: loki
namespace: monitoring
values:
customConfig:
data_dir: /vector-data-dir
api:
enabled: false
#
# Sources
#
sources:
journal_logs:
type: vector
address: 0.0.0.0:6000
version: "2"
kubernetes_logs:
type: vector
address: 0.0.0.0:6010
version: "2"
vector_metrics:
type: internal_metrics
#
# Transformations
#
transforms:
kubernetes_logs_remap:
type: remap
inputs:
- kubernetes_logs
source: |
# Standardize 'app' index
.custom_app_name = .pod_labels."app.kubernetes.io/name" || .pod_labels.app || .pod_labels."k8s-app" || "unknown"
#
# Sinks
#
sinks:
loki_journal:
type: loki
inputs:
- journal_logs
endpoint: http://loki-gateway:80
encoding:
codec: json
batch:
max_bytes: 2049000
out_of_order_action: accept
remove_label_fields: true
remove_timestamp: true
labels:
hostname: >-
{{`{{ host }}`}}
loki_kubernetes:
type: loki
inputs:
- kubernetes_logs_remap
endpoint: http://loki-gateway:80
encoding:
codec: json
batch:
max_bytes: 2049000
out_of_order_action: accept
remove_label_fields: true
remove_timestamp: true
labels:
app: >-
{{`{{ custom_app_name }}`}}
namespace: >-
{{`{{ kubernetes.pod_namespace }}`}}
node: >-
{{`{{ kubernetes.pod_node_name }}`}}
prometheus_sink:
type: prometheus_exporter
inputs:
- vector_metrics
address: 0.0.0.0:9090
# console_debug:
# type: console
# inputs:
# - opnsense_filterlog_logs
# target: stdout
# encoding:
# codec: json
image:
repository: timberio/vector
tag: 0.22.3-debian
role: Stateless-Aggregator
replicas: 3
# env:
# - name: VECTOR_LOG
# value: debug
service:
enabled: true
type: LoadBalancer
podMonitor:
enabled: true
jobLabel: vector-aggregator
port: prometheus-sink
affinity:
podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution:
- weight: 100
podAffinityTerm:
labelSelector:
matchExpressions:
- key: app.kubernetes.io/component
operator: In
values:
- Stateless-Aggregator
topologyKey: kubernetes.io/hostname
postRenderers:
- kustomize:
patchesJson6902:
- target:
kind: Service
name: vector-aggregator
patch:
- op: add
path: /spec/loadBalancerIP
value: "${SVC_SYSLOG_ADDR}"
- op: replace
path: /spec/externalTrafficPolicy
value: Local

View File

@@ -1,4 +1,5 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- helm-release.yaml

View File

@@ -1,4 +1,6 @@
---
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml
- agent
- aggregator

View File

@@ -17,7 +17,7 @@ data:
LB_COREDNS_IP: 10.75.45.100
LB_TRAEFIK_IP: 10.75.45.101
SVC_NGINX_ADDR: 10.75.45.101
LB_LOKI_IP: 10.75.45.102
SVC_SYSLOG_ADDR: 10.75.45.102
SVC_PLEX_ADDR: 10.75.45.103
SVC_JELLYFIN_ADDR: 10.75.45.104
LB_BLOCKY_IP: 10.75.45.105