diff --git a/packages/extra/monitoring/README.md b/packages/extra/monitoring/README.md
index a5f85901..0aa2ed1a 100644
--- a/packages/extra/monitoring/README.md
+++ b/packages/extra/monitoring/README.md
@@ -4,9 +4,12 @@
 ### Common parameters
 
-| Name              | Description                                                                                                | Value   |
-| ----------------- | ---------------------------------------------------------------------------------------------------------- | ------- |
-| `host`            | The hostname used to access the grafana externally (defaults to 'grafana' subdomain for the tenant host). | `""`    |
-| `metricsStorages` | Configuration of metrics storage instances                                                                 | `[]`    |
-| `logsStorages`    | Configuration of logs storage instances                                                                    | `[]`    |
-| `oncall.enabled`  | Enable Grafana OnCall                                                                                      | `false` |
+| Name                            | Description                                                                                                | Value                                            |
+| ------------------------------- | ---------------------------------------------------------------------------------------------------------- | ------------------------------------------------ |
+| `host`                          | The hostname used to access the grafana externally (defaults to 'grafana' subdomain for the tenant host). | `""`                                             |
+| `metricsStorages`               | Configuration of metrics storage instances                                                                 | `[]`                                             |
+| `logsStorages`                  | Configuration of logs storage instances                                                                    | `[]`                                             |
+| `alerta.storage`                | Persistent Volume size for alerta database                                                                 | `10Gi`                                           |
+| `alerta.storageClassName`       | StorageClass used to store the data                                                                        | `""`                                             |
+| `alerta.alerts.telegram.token`  | telegram token for your bot                                                                                | `7262461387:AAGtwq16iwuVtWtzoN6TUEMpF00fpC9Xz34` |
+| `alerta.alerts.telegram.chatID` | specify multiple ID's separated by comma. Get yours in https://t.me/chatid_echo_bot                       | `-4520856007`                                    |
 
diff --git a/packages/extra/monitoring/templates/alerta/alerta-db.yaml b/packages/extra/monitoring/templates/alerta/alerta-db.yaml
new file mode 100644
index 00000000..e06c322c
--- /dev/null
+++ b/packages/extra/monitoring/templates/alerta/alerta-db.yaml
@@ -0,0 +1,16 @@
+---
+apiVersion: postgresql.cnpg.io/v1
+kind: Cluster
+metadata:
+  name: alerta-db
+spec:
+  instances: 2
+  storage:
+    size: {{ required ".Values.alerta.storage is required" .Values.alerta.storage }}
+    {{- with .Values.alerta.storageClassName }}
+    storageClass: {{ . }}
+    {{- end }}
+
+  inheritedMetadata:
+    labels:
+      policy.cozystack.io/allow-to-apiserver: "true"
diff --git a/packages/extra/monitoring/templates/alerta/alerta.yaml b/packages/extra/monitoring/templates/alerta/alerta.yaml
new file mode 100644
index 00000000..e723661e
--- /dev/null
+++ b/packages/extra/monitoring/templates/alerta/alerta.yaml
@@ -0,0 +1,214 @@
+{{- $myNS := lookup "v1" "Namespace" "" .Release.Namespace }}
+{{- $ingress := index $myNS.metadata.annotations "namespace.cozystack.io/ingress" }}
+{{- $host := index $myNS.metadata.annotations "namespace.cozystack.io/host" }}
+
+{{- $apiKey := randAlphaNum 32 }}
+{{- $existingSecret := lookup "v1" "Secret" .Release.Namespace "alerta" }}
+{{- if $existingSecret }}
+{{- $apiKey = index $existingSecret.data "alerta-api-key" | b64dec }}
+{{- end }}
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: alerta
+  labels:
+    app: alerta
+type: Opaque
+data:
+  alerta-api-key: {{ $apiKey | b64enc }}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alerta
+  labels:
+    app: alerta
+data:
+  config.js: |
+    // contents of config.js
+    'use strict';
+    angular.module('config', [])
+      .constant('config', {
+        'endpoint' : "/api",
+        'provider' : "basic"
+      })
+      .constant('colors', {});
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: alerta
+  labels:
+    app: alerta
+spec:
+  type: ClusterIP
+  ports:
+    - port: 80
+      targetPort: http
+      protocol: TCP
+      name: http
+  selector:
+    app: alerta
+    release: alerta
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alerta
+  labels:
+    app: alerta
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alerta
+      release: alerta
+  template:
+    metadata:
+      labels:
+        app: alerta
+        release: alerta
+    spec:
+      containers:
+        - name: alerta
+          image: "alerta/alerta-web:9.0.4"
+          imagePullPolicy: IfNotPresent
+          env:
+            - name: ADMIN_USERS
+              valueFrom:
+                secretKeyRef:
+                  key: user
+                  name: grafana-admin-password
+            - name: ADMIN_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  key: password
+                  name: grafana-admin-password
+            - name: ADMIN_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: alerta
+                  key: alerta-api-key
+            - name: ALERTA_API_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: alerta
+                  key: alerta-api-key
+            - name: DATABASE_URL
+              valueFrom:
+                secretKeyRef:
+                  name: alerta-db-app
+                  key: uri
+
+            - name: AUTH_REQUIRED
+              value: "True"
+
+            {{- if and .Values.alerta.alerts.telegram.chatID .Values.alerta.alerts.telegram.token }}
+            - name: "PLUGINS"
+              value: "telegram"
+            - name: TELEGRAM_CHAT_ID
+              value: "{{ .Values.alerta.alerts.telegram.chatID }}"
+            - name: TELEGRAM_TOKEN
+              value: "{{ .Values.alerta.alerts.telegram.token }}"
+            - name: TELEGRAM_WEBHOOK_URL
+              value: "https://{{ .Values.host | default (printf "alerta.%s" $host) }}/api/webhooks/telegram?api-key={{ $apiKey }}"
+            {{- end }}
+
+          ports:
+            - name: http
+              containerPort: 8080
+              protocol: TCP
+          livenessProbe:
+            httpGet:
+              path: /
+              port: http
+            initialDelaySeconds: 300
+          readinessProbe:
+            httpGet:
+              path: /
+              port: http
+            initialDelaySeconds: 5
+          volumeMounts:
+            - name: alerta-config
+              mountPath: /app/config.js
+              subPath: config.js
+      volumes:
+        - name: alerta-config
+          configMap:
+            name: alerta
+---
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  name: alerta
+  labels:
+    app: alerta
+  annotations:
+    acme.cert-manager.io/http01-ingress-class: {{ $ingress }}
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+spec:
+  ingressClassName: {{ $ingress }}
+  tls:
+    - hosts:
+        - "{{ .Values.host | default (printf "alerta.%s" $host) }}"
+      secretName: alerta-tls
+  rules:
+    - host: "{{ .Values.host | default (printf "alerta.%s" $host) }}"
+      http:
+        paths:
+          - path: /
+            pathType: ImplementationSpecific
+            backend:
+              service:
+                name: alerta
+                port:
+                  name: http
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: alertmanager
+type: Opaque
+stringData:
+  alertmanager.yaml: |
+    global:
+      resolve_timeout: 5m
+    route:
+      group_by: [alertname, namespace, cluster]
+      group_wait: 15s
+      group_interval: 15s
+      repeat_interval: 1h
+      receiver: 'alerta'
+      routes:
+        - receiver: alerta_no_resolve
+          repeat_interval: 1m
+          matchers:
+            - 'event="Heartbeat"'
+        - receiver: blackhole
+          matchers:
+            - 'severity="none"'
+        - receiver: blackhole
+          matchers:
+            - 'alertname=~"CDIDefaultStorageClassDegraded|CDINoDefaultStorageClass"'
+    receivers:
+      - name: blackhole
+      - name: "alerta"
+        webhook_configs:
+          - url: 'http://alerta/api/webhooks/prometheus?api-key={{ $apiKey }}'
+            send_resolved: true
+      - name: "alerta_no_resolve"
+        webhook_configs:
+          - url: 'http://alerta/api/webhooks/prometheus?api-key={{ $apiKey }}'
+            send_resolved: false
+---
+apiVersion: operator.victoriametrics.com/v1beta1
+kind: VMAlertmanager
+metadata:
+  name: alertmanager
+spec:
+  replicaCount: 3
+  configSecret: alertmanager
+  podMetadata:
+    labels:
+      policy.cozystack.io/allow-to-apiserver: "true"
diff --git a/packages/extra/monitoring/templates/dashboard-resourcemap.yaml b/packages/extra/monitoring/templates/dashboard-resourcemap.yaml
index e0828484..f15714da 100644
--- a/packages/extra/monitoring/templates/dashboard-resourcemap.yaml
+++ b/packages/extra/monitoring/templates/dashboard-resourcemap.yaml
@@ -9,6 +9,7 @@ rules:
       - ingresses
     resourceNames:
       - grafana-ingress
+      - alerta
     verbs: ["get", "list", "watch"]
   - apiGroups:
       - ""
@@ -23,4 +24,5 @@ rules:
       - services
     resourceNames:
       - grafana-service
+      - alerta
     verbs: ["get", "list", "watch"]
diff --git a/packages/extra/monitoring/templates/grafana/grafana.yaml b/packages/extra/monitoring/templates/grafana/grafana.yaml
index a4ae5f33..e98573be 100644
--- a/packages/extra/monitoring/templates/grafana/grafana.yaml
+++ b/packages/extra/monitoring/templates/grafana/grafana.yaml
@@ -56,7 +56,7 @@ spec:
           mountPath: /var/lib/grafana
       containers:
       - name: grafana
-        image: grafana/grafana:10.1.0
+        image: grafana/grafana:11.2.0
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: false
@@ -64,7 +64,7 @@
          failureThreshold: 3
        env:
        - name: GF_INSTALL_PLUGINS
-         value: grafana-worldmap-panel,flant-statusmap-panel,grafana-oncall-app,natel-discrete-panel
+         value: grafana-worldmap-panel,flant-statusmap-panel,natel-discrete-panel
        - name: ONCALL_API_URL
          value: http://grafana-oncall-engine:8080
        - name: GF_DATABASE_HOST
diff --git a/packages/extra/monitoring/templates/oncall/oncall-db.yaml b/packages/extra/monitoring/templates/oncall/oncall-db.yaml
deleted file mode 100644
index 048ec173..00000000
--- a/packages/extra/monitoring/templates/oncall/oncall-db.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-{{- if .Values.oncall.enabled }}
----
-apiVersion: postgresql.cnpg.io/v1
-kind: Cluster
-metadata:
-  name: grafana-oncall-db
-spec:
-  instances: 2
-  storage:
-    size: 10Gi
-
-  inheritedMetadata:
-    labels:
-      policy.cozystack.io/allow-to-apiserver: "true"
-{{- end }}
diff --git a/packages/extra/monitoring/templates/oncall/oncall-redis.yaml b/packages/extra/monitoring/templates/oncall/oncall-redis.yaml
deleted file mode 100644
index da173a81..00000000
--- a/packages/extra/monitoring/templates/oncall/oncall-redis.yaml
+++ /dev/null
@@ -1,66 +0,0 @@
-{{- if .Values.oncall.enabled }}
----
-apiVersion: databases.spotahome.com/v1
-kind: 
RedisFailover -metadata: - name: grafana-oncall - labels: - app.kubernetes.io/instance: {{ .Release.Name }} - app.kubernetes.io/managed-by: {{ .Release.Service }} -spec: - sentinel: - replicas: 3 - resources: - requests: - cpu: 100m - limits: - memory: 100Mi - redis: - replicas: 2 - resources: - requests: - cpu: 150m - memory: 400Mi - limits: - cpu: 2 - memory: 1000Mi - storage: - persistentVolumeClaim: - metadata: - name: redisfailover-persistent-data - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: 5Gi - exporter: - enabled: true - image: oliver006/redis_exporter:v1.55.0-alpine - args: - - --web.telemetry-path - - /metrics - env: - - name: REDIS_EXPORTER_LOG_FORMAT - value: txt - customConfig: - - tcp-keepalive 0 - - loglevel notice - auth: - secretPath: grafana-oncall-redis-password ---- -apiVersion: v1 -kind: Secret -metadata: - name: grafana-oncall-redis-password - labels: - app.kubernetes.io/instance: {{ .Release.Name }} - app.kubernetes.io/managed-by: {{ .Release.Service }} -data: -{{- $existingSecret := lookup "v1" "Secret" .Release.Namespace "grafana-oncall-redis-password" }} - {{- if $existingSecret }} - password: {{ index $existingSecret.data "password" }} - {{- else }} - password: {{ randAlphaNum 32 | b64enc }} - {{- end }} -{{- end }} diff --git a/packages/extra/monitoring/templates/oncall/oncall-release.yaml b/packages/extra/monitoring/templates/oncall/oncall-release.yaml deleted file mode 100644 index 7796e1a7..00000000 --- a/packages/extra/monitoring/templates/oncall/oncall-release.yaml +++ /dev/null @@ -1,40 +0,0 @@ -{{- if .Values.oncall.enabled }} -{{- $myNS := lookup "v1" "Namespace" "" .Release.Namespace }} -{{- $host := index $myNS.metadata.annotations "namespace.cozystack.io/host" }} -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: grafana-oncall-system - labels: - app.kubernetes.io/instance: {{ .Release.Name }} - app.kubernetes.io/managed-by: {{ .Release.Service }} -spec: - chart: - spec: - chart: cozy-grafana-oncall - reconcileStrategy: Revision - sourceRef: - kind: HelmRepository - name: cozystack-system - namespace: cozy-system - version: '*' - interval: 1m0s - timeout: 5m0s - values: - oncall: - fullnameOverride: grafana-oncall - externalGrafana: - url: "https://{{ .Values.host | default (printf "grafana.%s" $host) }}/" - - externalPostgresql: - host: grafana-oncall-db-rw - db_name: app - user: app - existingSecret: grafana-oncall-db-app - passwordKey: password - - externalRedis: - host: rfrm-grafana-oncall - existingSecret: {{ .Release.Name }}-oncall-redis-password - passwordKey: password -{{- end }} diff --git a/packages/extra/monitoring/templates/vm/vmalert.yaml b/packages/extra/monitoring/templates/vm/vmalert.yaml index 0e6536d3..457b82b9 100644 --- a/packages/extra/monitoring/templates/vm/vmalert.yaml +++ b/packages/extra/monitoring/templates/vm/vmalert.yaml @@ -11,7 +11,7 @@ spec: extraArgs: remoteWrite.disablePathAppend: "true" notifiers: - - url: http://vmalertmanager.{{ $.Release.Namespace }}.svc:9093 + - url: http://vmalertmanager-alertmanager.{{ $.Release.Namespace }}.svc:9093 remoteRead: url: http://vmselect-{{ .name }}.{{ $.Release.Namespace }}.svc:8481/select/0/prometheus remoteWrite: diff --git a/packages/extra/monitoring/templates/vm/vmalertmanager.yaml b/packages/extra/monitoring/templates/vm/vmalertmanager.yaml deleted file mode 100644 index eeb0f971..00000000 --- a/packages/extra/monitoring/templates/vm/vmalertmanager.yaml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: v1 -kind: Secret 
-metadata: - name: alertmanager -type: Opaque -stringData: - alertmanager.yaml: | - global: - resolve_timeout: 5m - route: - group_by: - - job - - alertname - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: 'webhook' - receivers: - - name: 'webhook' - webhook_configs: - - url: http://{{ .Release.Name }}-oncall-engine.{{ .Release.Namespace }}.svc:8080/integrations/v1/alertmanager/Kjb2NWxxSlgGtxz9F4ihovQBB/ ---- -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMAlertmanager -metadata: - name: alertmanager -spec: - replicaCount: 2 - configSecret: alertmanager - podMetadata: - labels: - policy.cozystack.io/allow-to-apiserver: "true" diff --git a/packages/extra/monitoring/values.schema.json b/packages/extra/monitoring/values.schema.json index eadf8d68..f0bb8b3b 100644 --- a/packages/extra/monitoring/values.schema.json +++ b/packages/extra/monitoring/values.schema.json @@ -23,13 +23,38 @@ "type": "object" } }, - "oncall": { + "alerta": { "type": "object", "properties": { - "enabled": { - "type": "boolean", - "description": "Enable Grafana OnCall", - "default": false + "storage": { + "type": "string", + "description": "Persistent Volume size for alerta database", + "default": "10Gi" + }, + "storageClassName": { + "type": "string", + "description": "StorageClass used to store the data", + "default": "" + }, + "alerts": { + "type": "object", + "properties": { + "telegram": { + "type": "object", + "properties": { + "token": { + "type": "string", + "description": "telegram token for your bot", + "default": "7262461387:AAGtwq16iwuVtWtzoN6TUEMpF00fpC9Xz34" + }, + "chatID": { + "type": "string", + "description": "specify multiple ID's separated by comma. Get yours in https://t.me/chatid_echo_bot", + "default": "-4520856007" + } + } + } + } } } } diff --git a/packages/extra/monitoring/values.yaml b/packages/extra/monitoring/values.yaml index 6353fef4..36862e13 100644 --- a/packages/extra/monitoring/values.yaml +++ b/packages/extra/monitoring/values.yaml @@ -25,7 +25,22 @@ logsStorages: storage: 10Gi storageClassName: replicated -## @param oncall.enabled Enable Grafana OnCall -## -oncall: - enabled: false +## Configuration for Alerta +## @param alerta.storage Persistent Volume size for alerta database +## @param alerta.storageClassName StorageClass used to store the data +## +alerta: + storage: 10Gi + storageClassName: "" + + alerts: + ## @param alerta.alerts.telegram.token telegram token for your bot + ## @param alerta.alerts.telegram.chatID specify multiple ID's separated by comma. 
Get yours in https://t.me/chatid_echo_bot + ## example: + ## telegram: + ## token: "7262461387:AAGtwq16iwuVtWtzoN6TUEMpF00fpC9Xz34" + ## chatID: "-4520856007" + ## + telegram: + token: "" + chatID: "" diff --git a/packages/system/grafana-oncall/Chart.yaml b/packages/system/grafana-oncall/Chart.yaml deleted file mode 100644 index 0f24a040..00000000 --- a/packages/system/grafana-oncall/Chart.yaml +++ /dev/null @@ -1,3 +0,0 @@ -apiVersion: v2 -name: cozy-grafana-oncall -version: 0.0.0 # Placeholder, the actual version will be automatically set during the build process diff --git a/packages/system/grafana-oncall/Makefile b/packages/system/grafana-oncall/Makefile deleted file mode 100644 index 096e34a8..00000000 --- a/packages/system/grafana-oncall/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -NAME=grafana-oncall-system - -include ../../../scripts/package.mk - -update: - rm -rf charts - helm repo add grafana https://grafana.github.io/helm-charts - helm repo update grafana - helm pull grafana/oncall --untar --untardir charts - rm -rf charts/oncall/charts diff --git a/packages/system/grafana-oncall/charts/oncall/Chart.lock b/packages/system/grafana-oncall/charts/oncall/Chart.lock deleted file mode 100644 index cab54bbf..00000000 --- a/packages/system/grafana-oncall/charts/oncall/Chart.lock +++ /dev/null @@ -1,27 +0,0 @@ -dependencies: -- name: cert-manager - repository: https://charts.jetstack.io - version: v1.8.0 -- name: mariadb - repository: https://charts.bitnami.com/bitnami - version: 12.2.5 -- name: postgresql - repository: https://charts.bitnami.com/bitnami - version: 11.9.10 -- name: rabbitmq - repository: https://charts.bitnami.com/bitnami - version: 12.0.0 -- name: redis - repository: https://charts.bitnami.com/bitnami - version: 16.13.2 -- name: grafana - repository: https://grafana.github.io/helm-charts - version: 6.57.1 -- name: ingress-nginx - repository: https://kubernetes.github.io/ingress-nginx - version: 4.1.4 -- name: prometheus - repository: https://prometheus-community.github.io/helm-charts - version: 25.8.2 -digest: sha256:edc9fef449a694cd319135e37ac84f8247ac9ad0c48ac86099dae4e428beb7b7 -generated: "2024-01-26T17:54:48.132209769Z" diff --git a/packages/system/grafana-oncall/charts/oncall/Chart.yaml b/packages/system/grafana-oncall/charts/oncall/Chart.yaml deleted file mode 100644 index 3f490e21..00000000 --- a/packages/system/grafana-oncall/charts/oncall/Chart.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: v2 -appVersion: v1.3.94 -dependencies: -- condition: cert-manager.enabled - name: cert-manager - repository: https://charts.jetstack.io - version: v1.8.0 -- condition: mariadb.enabled - name: mariadb - repository: https://charts.bitnami.com/bitnami - version: 12.2.5 -- condition: postgresql.enabled - name: postgresql - repository: https://charts.bitnami.com/bitnami - version: 11.9.10 -- condition: rabbitmq.enabled - name: rabbitmq - repository: https://charts.bitnami.com/bitnami - version: 12.0.0 -- condition: redis.enabled - name: redis - repository: https://charts.bitnami.com/bitnami - version: 16.13.2 -- condition: grafana.enabled - name: grafana - repository: https://grafana.github.io/helm-charts - version: 6.57.1 -- condition: ingress-nginx.enabled - name: ingress-nginx - repository: https://kubernetes.github.io/ingress-nginx - version: 4.1.4 -- condition: prometheus.enabled - name: prometheus - repository: https://prometheus-community.github.io/helm-charts - version: 25.8.2 -description: Developer-friendly incident response with brilliant Slack integration -name: oncall 
-type: application -version: 1.3.94 diff --git a/packages/system/grafana-oncall/charts/oncall/README.md b/packages/system/grafana-oncall/charts/oncall/README.md deleted file mode 100644 index 993f0649..00000000 --- a/packages/system/grafana-oncall/charts/oncall/README.md +++ /dev/null @@ -1,431 +0,0 @@ -# Grafana OnCall Helm Chart - -This Grafana OnCall Chart is the best way to operate Grafana OnCall on Kubernetes. -It will deploy Grafana OnCall engine and celery workers, along with RabbitMQ cluster, Redis Cluster, and MySQL 5.7 database. -It will also deploy cert manager and nginx ingress controller, as Grafana OnCall backend might need to be externally available -to receive alerts from other monitoring systems. Grafana OnCall engine acts as a backend and can be connected to the -Grafana frontend plugin named Grafana OnCall. -Architecture diagram can be found [here](https://raw.githubusercontent.com/grafana/oncall/dev/docs/img/architecture_diagram.png) - -## Production usage - -**Default helm chart configuration is not intended for production.** -The helm chart includes all the services into a single release, which is not recommended for production usage. -It is recommended to run stateful services such as MySQL and RabbitMQ separately from this release or use managed -PaaS solutions. It will significantly reduce the overhead of managing them. -Here are the instructions on how to set up your own [ingress](#set-up-external-access), [MySQL](#connect-external-mysql), -[RabbitMQ](#connect-external-rabbitmq), [Redis](#connect-external-redis) - -### Cluster requirements - -- ensure you can run x86-64/amd64 workloads. arm64 architecture is currently not supported -- kubernetes version 1.25+ is not supported, if cert-manager is enabled - -## Install - -### Prepare the repo - -```bash -# Add the repository -helm repo add grafana https://grafana.github.io/helm-charts -helm repo update -``` - -### Installing the helm chart - -```bash -# Install the chart -helm install \ - --wait \ - --set base_url=example.com \ - --set grafana."grafana\.ini".server.domain=example.com \ - release-oncall \ - grafana/oncall -``` - -Follow the `helm install` output to finish setting up Grafana OnCall backend and Grafana OnCall frontend plugin e.g. - -```bash -👋 Your Grafana OnCall instance has been successfully deployed - - ❗ Set up a DNS record for your domain (use A Record and "@" to point a root domain to the IP address) - Get the external IP address by running the following commands and point example.com to it: - - kubectl get ingress release-oncall -o jsonpath="{.status.loadBalancer.ingress[0].ip}" - - Wait until the dns record got propagated. - NOTE: Check with the following command: nslookup example.com - Try reaching https://example.com/ready/ from the browser, make sure it is not cached locally - - 🦎 Grafana was installed as a part of this helm release. Open https://example.com/grafana/plugins/grafana-oncall-app - The User is admin - Get password by running this command: - - kubectl get secret --namespace default release-oncall-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo - - 🔗 Connect Grafana OnCall Plugin to Grafana OnCall backend: - - Fill the Grafana OnCall Backend URL: - - http://release-oncall-engine:8080 - -🎉🎉🎉 Done! 
🎉🎉🎉 -``` - -## Configuration - -You can edit values.yml to make changes to the helm chart configuration and re-deploy the release with the following command: - -```bash -helm upgrade \ - --install \ - --wait \ - --set base_url=example.com \ - --set grafana."grafana\.ini".server.domain=example.com \ - release-oncall \ - grafana/oncall -``` - -### Passwords and external secrets - -As OnCall subcharts are Bitname charts, there is a common approach to secrets. Bundled charts allow specifying passwords -in values.yaml explicitly or as K8s secret value. OnCall chart refers either to secret created in sub-chart or -to specified external secret. -Similarly, if component chart is disabled, the password(s) can be supplied in `external` value -(e.g. externalMysql) explicitly or as K8s secret value. In the first case, the secret is created with the specified -value. In the second case the external secret is used. - -- If `.auth.existingSecret` is non-empty, then this secret is used. Secret keys are pre-defined by chart. -- If subchart supports password files and `.customPasswordFiles` dictionary is non-empty, then password files - are used. Dictionary keys are pre-defined per sub-chart. Password files are not supported by OnCall chart and should - not be used with bundled sub-charts. -- Passwords are specified via `auth` section values, e.g. `auth.password`. K8s secret is created. - - If `.auth.forcePassword` is `true`, then passwords MUST be specified. Otherwise, missing passwords - are generated. - -If external component is used instead of the bundled one: - -- If existingSecret within appropriate external component values is non-empty (e.g. `externalMysql.existingSecret`) then - it is used together with corresponding key names, e.g. `externalMysql.passwordKey`. -- Otherwise, corresponding password values are used, e.g. `externalMysql.password`. K8s secret is created by OnCall chart. - -Below is the summary for the dependent charts. - -MySQL/MariaDB: - -```yaml -database: - type: "mysql" # This is default -mariaDB: - enabled: true # Default - auth: - existingSecret: "" - forcePassword: false - # Secret name: `-mariadb` - rootPassword: "" # Secret key: mariadb-root-password - password: "" # Secret key: mariadb-password - replicationPassword: "" # Secret key: mariadb-replication-password -externalMysql: - password: "" - existingSecret: "" - passwordKey: "" -``` - -Postgres: - -```yaml -database: - type: postgresql -mariadb: - enabled: false # Must be set to false for Postgres -postgresql: - enabled: true # Must be set to true for bundled Postgres - auth: - existingSecret: "" - secretKeys: - adminPasswordKey: "" - userPasswordKey: "" # Not needed - replicationPasswordKey: "" # Not needed with disabled replication - # Secret name: `-postgresql` - postgresPassword: "" # password for admin user postgres. As non-admin user is not created, only this one is relevant. 
- password: "" # Not needed - replicationPassword: "" # Not needed with disabled replication -externalPostgresql: - user: "" - password: "" - existingSecret: "" - passwordKey: "" -``` - -Rabbitmq: - -```yaml -rabbitmq: - enabled: true - auth: - existingPasswordSecret: "" # Must contain `rabbitmq-password` key - existingErlangSecret: "" # Must contain `rabbitmq-erlang-cookie` key - # Secret name: `-rabbitmq` - password: "" - erlangCookie: "" -externalRabbitmq: - user: "" - password: "" - existingSecret: "" - passwordKey: "" - usernameKey: "" -``` - -Redis: - -```yaml -redis: - enabled: true - auth: - existingSecret: "" - existingSecretPasswordKey: "" - # Secret name: `-redis` - password: "" -externalRedis: - password: "" - existingSecret: "" - passwordKey: "" -``` - -### Running split ingestion and API services - -You can run a detached service for handling integrations by setting up the following variables: - -```yaml -detached_integrations: - enabled: true -detached_integrations_service: - enabled: true -``` - -This will run an integrations-only service listening by default in port 30003. - -### Set up Slack and Telegram - -You can set up Slack connection via following variables: - -```yaml -oncall: - slack: - enabled: true - commandName: oncall - clientId: ~ - clientSecret: ~ - signingSecret: ~ - existingSecret: "" - clientIdKey: "" - clientSecretKey: "" - signingSecretKey: "" - redirectHost: ~ -``` - -`oncall.slack.commandName` is used for changing default bot slash command, -`oncall`. In slack, it could be called via `/`. - -To set up Telegram token and webhook url use: - -```yaml -oncall: - telegram: - enabled: true - token: ~ - webhookUrl: ~ -``` - -To use Telegram long polling instead of webhook use: - -```yaml -telegramPolling: - enabled: true -``` - -### Set up external access - -Grafana OnCall can be connected to the external monitoring systems or grafana deployed to the other cluster. -Nginx Ingress Controller and Cert Manager charts are included in the helm chart with the default configuration. -If you set the DNS A Record pointing to the external IP address of the installation with the Hostname matching -base_url parameter, https will be automatically set up. If grafana is enabled in the chart values, it will also be -available on `https:///grafana/`. See the details in `helm install` output. - -To use a different ingress controller or tls certificate management system, set the following values to -false and edit ingress settings - -```yaml -ingress-nginx: - enabled: false - -cert-manager: - enabled: false - -ingress: - enabled: true - annotations: - kubernetes.io/ingress.class: "nginx" - cert-manager.io/issuer: "letsencrypt-prod" -``` - -### Use PostgreSQL instead of MySQL - -It is possible to use PostgreSQL instead of MySQL. To do so, set mariadb.enabled to `false`, -postgresql.enabled to `true` and database.type to `postgresql`. - -```yaml -mariadb: - enabled: false - -postgresql: - enabled: true - -database: - type: postgresql -``` - -### Connect external MySQL - -It is recommended to use the managed MySQL 5.7 database provided by your cloud provider -Make sure to create the database with the following parameters before installing this chart - -```sql -CREATE DATABASE oncall CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; -``` - -To use an external MySQL instance set mariadb.enabled to `false` and configure the `externalMysql` parameters. 
- -```yaml -mariadb: - enabled: false - -# Make sure to create the database with the following parameters: -# CREATE DATABASE oncall CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; -externalMysql: - host: - port: - db_name: - user: - password: - existingSecret: "" - usernameKey: username - passwordKey: password -``` - -### Connect external PostgreSQL - -To use an external PostgreSQL instance set mariadb.enabled to `false`, -postgresql.enabled to `false`, database.type to `postgresql` and configure -the `externalPostgresql` parameters. - -```yaml -mariadb: - enabled: false - -postgresql: - enabled: false - -database: - type: postgresql - -# Make sure to create the database with the following parameters: -# CREATE DATABASE oncall WITH ENCODING UTF8; -externalPostgresql: - host: - port: - db_name: - user: - password: - existingSecret: "" - passwordKey: password -``` - -### Connect external RabbitMQ - -Option 1. Install RabbitMQ separately into the cluster using the [official documentation](https://www.rabbitmq.com/kubernetes/operator/operator-overview.html) -Option 2. Use managed solution such as [CloudAMPQ](https://www.cloudamqp.com/) - -To use an external RabbitMQ instance set rabbitmq.enabled to `false` and configure the `externalRabbitmq` parameters. - -```yaml -rabbitmq: - enabled: false # Disable the RabbitMQ dependency from the release - -externalRabbitmq: - host: - port: - user: - password: - protocol: - vhost: - existingSecret: "" - passwordKey: password - usernameKey: username -``` - -### Connect external Redis - -To use an external Redis instance set redis.enabled to `false` and configure the `externalRedis` parameters. - -```yaml -redis: - enabled: false # Disable the Redis dependency from the release - -externalRedis: - host: - password: - existingSecret: "" - passwordKey: password -``` - -## Update - -```bash -# Add & upgrade the repository -helm repo add grafana https://grafana.github.io/helm-charts -helm repo update - -# Re-deploy -helm upgrade \ - --install \ - --wait \ - --set base_url=example.com \ - --set grafana."grafana\.ini".server.domain=example.com \ - release-oncall \ - grafana/oncall -``` - -After re-deploying, please also update the Grafana OnCall plugin on the plugin version page. -See [Grafana docs](https://grafana.com/docs/grafana/latest/administration/plugin-management/#update-a-plugin) for -more info on updating Grafana plugins. - -## Uninstall - -### Uninstalling the helm chart - -```bash -helm delete release-oncall -``` - -### Clean up PVC's - -```bash -kubectl delete pvc data-release-oncall-mariadb-0 data-release-oncall-rabbitmq-0 \ -redis-data-release-oncall-redis-master-0 redis-data-release-oncall-redis-replicas-0 \ -redis-data-release-oncall-redis-replicas-1 redis-data-release-oncall-redis-replicas-2 -``` - -### Clean up secrets - -```bash -kubectl delete secrets certificate-tls release-oncall-cert-manager-webhook-ca release-oncall-ingress-nginx-admission -``` - -## Troubleshooting - -### Issues during initial configuration - -In the event that you run into issues during initial configuration, it is possible that mismatching versions between -your OnCall backend and UI is the culprit. Ensure that the versions match, and if not, -consider updating your `helm` deployment. 
diff --git a/packages/system/grafana-oncall/charts/oncall/templates/NOTES.txt b/packages/system/grafana-oncall/charts/oncall/templates/NOTES.txt deleted file mode 100644 index 4eee164c..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/NOTES.txt +++ /dev/null @@ -1,41 +0,0 @@ -================================================================= -📞 Grafana OnCall Notes -================================================================= - -👋 Your Grafana OnCall instance has been successfully deployed - -{{- if not .Values.migrate.enabled }} - 🤖 To migrate the database run these commands: - - export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "oncall.name" . }},app.kubernetes.io/instance={{ .Release.Name }},app.kubernetes.io/component=engine" -o jsonpath="{.items[0].metadata.name}") - kubectl exec -it $POD_NAME -c wait-for-db -- bash -c "python manage.py migrate;" -{{- end }} - - ❗ Set up a DNS record for your domain (use A Record and "@" to point a root domain to the IP address) - Get the external IP address by running the following commands and point {{ .Values.base_url }} to it: - - kubectl get ingress {{ include "oncall.fullname" . }} -o jsonpath="{.status.loadBalancer.ingress[0].ip}" - - Wait until the dns record got propagated. - NOTE: Check with the following command: nslookup {{ .Values.base_url }} - Try reaching https://{{ .Values.base_url }}/ready/ from the browser, make sure it is not cached locally - -{{- if .Values.grafana.enabled }} - 🦎 Grafana was installed as a part of this helm release. Open https://{{ .Values.base_url }}/grafana/plugins/grafana-oncall-app - The User is {{ .Values.grafana.adminUser }} - Get password by running this command: - - kubectl get secret --namespace {{ .Release.Namespace }} {{ template "oncall.grafana.fullname" . }} -o jsonpath="{.data.admin-password}" | base64 --decode ; echo - -{{- else }} - 🦎 Grafana was NOT installed as a part of this helm release. Open external Grafana, go to "Configuration" - "Plugins" and find Grafana OnCall plugin - NOTE: Make sure your external Grafana is available by the network for the containers installed by this release. -{{- end }} - - 🔗 Connect Grafana OnCall Plugin to Grafana OnCall backend: - - Fill the Grafana OnCall Backend URL: - - http://{{ include "oncall.engine.fullname" . }}:8080 - -🎉🎉🎉 Done! 🎉🎉🎉 diff --git a/packages/system/grafana-oncall/charts/oncall/templates/_env.tpl b/packages/system/grafana-oncall/charts/oncall/templates/_env.tpl deleted file mode 100644 index 56dff3e5..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/_env.tpl +++ /dev/null @@ -1,656 +0,0 @@ -{{- define "snippet.oncall.env" -}} -- name: BASE_URL - value: {{ .Values.base_url_protocol }}://{{ .Values.base_url }} -- name: SECRET_KEY - valueFrom: - secretKeyRef: - name: {{ include "snippet.oncall.secret.name" . }} - key: {{ include "snippet.oncall.secret.secretKey" . | quote }} -- name: MIRAGE_SECRET_KEY - valueFrom: - secretKeyRef: - name: {{ include "snippet.oncall.secret.name" . }} - key: {{ include "snippet.oncall.secret.mirageSecretKey" . | quote }} -- name: MIRAGE_CIPHER_IV - value: {{ .Values.oncall.mirageCipherIV | default "1234567890abcdef" | quote }} -- name: DJANGO_SETTINGS_MODULE - value: "settings.helm" -- name: AMIXR_DJANGO_ADMIN_PATH - value: "admin" -- name: OSS - value: "True" -- name: DETACHED_INTEGRATIONS_SERVER - value: {{ .Values.detached_integrations.enabled | toString | title | quote }} -{{- include "snippet.oncall.uwsgi" . 
}} -- name: BROKER_TYPE - value: {{ .Values.broker.type | default "rabbitmq" }} -- name: GRAFANA_API_URL - value: {{ include "snippet.grafana.url" . | quote }} -{{- end }} - -{{- define "snippet.oncall.secret.name" -}} -{{ if .Values.oncall.secrets.existingSecret -}} - {{ .Values.oncall.secrets.existingSecret }} -{{- else -}} - {{ include "oncall.fullname" . }} -{{- end }} -{{- end }} - -{{- define "snippet.oncall.secret.secretKey" -}} -{{ if .Values.oncall.secrets.existingSecret -}} - {{ required "oncall.secrets.secretKey is required if oncall.secret.existingSecret is not empty" .Values.oncall.secrets.secretKey }} -{{- else -}} - SECRET_KEY -{{- end }} -{{- end }} - -{{- define "snippet.oncall.secret.mirageSecretKey" -}} -{{ if .Values.oncall.secrets.existingSecret -}} - {{ required "oncall.secrets.mirageSecretKey is required if oncall.secret.existingSecret is not empty" .Values.oncall.secrets.mirageSecretKey }} -{{- else -}} - MIRAGE_SECRET_KEY -{{- end }} -{{- end }} - -{{- define "snippet.oncall.uwsgi" -}} -{{- if .Values.uwsgi }} - {{- range $key, $value := .Values.uwsgi }} -- name: UWSGI_{{ $key | upper | replace "-" "_" }} - value: {{ $value | quote }} - {{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.oncall.slack.env" -}} -- name: FEATURE_SLACK_INTEGRATION_ENABLED - value: {{ .Values.oncall.slack.enabled | toString | title | quote }} -{{- if .Values.oncall.slack.enabled }} -- name: SLACK_SLASH_COMMAND_NAME - value: "/{{ .Values.oncall.slack.commandName | default "oncall" }}" -{{- if .Values.oncall.slack.existingSecret }} -- name: SLACK_CLIENT_OAUTH_ID - valueFrom: - secretKeyRef: - name: {{ .Values.oncall.slack.existingSecret }} - key: {{ required "oncall.slack.clientIdKey is required if oncall.slack.existingSecret is not empty" .Values.oncall.slack.clientIdKey | quote }} -- name: SLACK_CLIENT_OAUTH_SECRET - valueFrom: - secretKeyRef: - name: {{ .Values.oncall.slack.existingSecret }} - key: {{ required "oncall.slack.clientSecretKey is required if oncall.slack.existingSecret is not empty" .Values.oncall.slack.clientSecretKey | quote }} -- name: SLACK_SIGNING_SECRET - valueFrom: - secretKeyRef: - name: {{ .Values.oncall.slack.existingSecret }} - key: {{ required "oncall.slack.signingSecretKey is required if oncall.slack.existingSecret is not empty" .Values.oncall.slack.signingSecretKey | quote }} -{{- else }} -- name: SLACK_CLIENT_OAUTH_ID - value: {{ .Values.oncall.slack.clientId | default "" | quote }} -- name: SLACK_CLIENT_OAUTH_SECRET - value: {{ .Values.oncall.slack.clientSecret | default "" | quote }} -- name: SLACK_SIGNING_SECRET - value: {{ .Values.oncall.slack.signingSecret | default "" | quote }} -{{- end }} -- name: SLACK_INSTALL_RETURN_REDIRECT_HOST - value: {{ .Values.oncall.slack.redirectHost | default (printf "https://%s" .Values.base_url) | quote }} -{{- end }} -{{- end }} - -{{- define "snippet.oncall.telegram.env" -}} -{{- if .Values.telegramPolling.enabled -}} -{{- $_ := set .Values.oncall.telegram "enabled" true -}} -{{- end -}} -- name: FEATURE_TELEGRAM_INTEGRATION_ENABLED - value: {{ .Values.oncall.telegram.enabled | toString | title | quote }} -{{- if .Values.oncall.telegram.enabled }} -{{- if .Values.telegramPolling.enabled }} -- name: FEATURE_TELEGRAM_LONG_POLLING_ENABLED - value: {{ .Values.telegramPolling.enabled | toString | title | quote }} -{{- end }} -- name: TELEGRAM_WEBHOOK_HOST - value: {{ .Values.oncall.telegram.webhookUrl | default (printf "https://%s" .Values.base_url) | quote }} -{{- if .Values.oncall.telegram.existingSecret }} -- name: 
TELEGRAM_TOKEN - valueFrom: - secretKeyRef: - name: {{ .Values.oncall.telegram.existingSecret }} - key: {{ required "oncall.telegram.tokenKey is required if oncall.telegram.existingSecret is not empty" .Values.oncall.telegram.tokenKey | quote }} -{{- else }} -- name: TELEGRAM_TOKEN - value: {{ .Values.oncall.telegram.token | default "" | quote }} -{{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.oncall.twilio.env" }} -{{- with .Values.oncall.twilio }} -{{- if .existingSecret }} -- name: TWILIO_ACCOUNT_SID - valueFrom: - secretKeyRef: - name: {{ .existingSecret }} - key: {{ required "oncall.twilio.accountSid is required if oncall.twilio.existingSecret is not empty" .accountSid | quote }} -{{- if .authTokenKey }} -- name: TWILIO_AUTH_TOKEN - valueFrom: - secretKeyRef: - name: {{ .existingSecret }} - key: {{ required "oncall.twilio.authTokenKey is required if oncall.twilio.existingSecret is not empty" .authTokenKey | quote }} -{{- end }} -- name: TWILIO_NUMBER - valueFrom: - secretKeyRef: - name: {{ .existingSecret }} - key: {{ required "oncall.twilio.phoneNumberKey is required if oncall.twilio.existingSecret is not empty" .phoneNumberKey | quote }} -- name: TWILIO_VERIFY_SERVICE_SID - valueFrom: - secretKeyRef: - name: {{ .existingSecret }} - key: {{ required "oncall.twilio.verifySidKey is required if oncall.twilio.existingSecret is not empty" .verifySidKey | quote }} -{{- if and .apiKeySidKey .apiKeySecretKey }} -- name: TWILIO_API_KEY_SID - valueFrom: - secretKeyRef: - name: {{ .existingSecret }} - key: {{ required "oncall.twilio.apiKeySidKey is required if oncall.twilio.existingSecret is not empty" .apiKeySidKey | quote }} -- name: TWILIO_API_KEY_SECRET - valueFrom: - secretKeyRef: - name: {{ .existingSecret }} - key: {{ required "oncall.twilio.apiKeySecretKey is required if oncall.twilio.existingSecret is not empty" .apiKeySecretKey | quote }} -{{- end }} -{{- else }} -{{- if .accountSid }} -- name: TWILIO_ACCOUNT_SID - value: {{ .accountSid | quote }} -{{- end }} -{{- if .authToken }} -- name: TWILIO_AUTH_TOKEN - value: {{ .authToken | quote }} -{{- end }} -{{- if .phoneNumber }} -- name: TWILIO_NUMBER - value: {{ .phoneNumber | quote }} -{{- end }} -{{- if .verifySid }} -- name: TWILIO_VERIFY_SERVICE_SID - value: {{ .verifySid | quote }} -{{- end }} -{{- if .apiKeySid }} -- name: TWILIO_API_KEY_SID - value: {{ .apiKeySid | quote }} -{{- end }} -{{- if .apiKeySecret }} -- name: TWILIO_API_KEY_SECRET - value: {{ .apiKeySecret | quote }} -{{- end }} -{{- end }} -{{- if .limitPhone }} -- name: PHONE_NOTIFICATIONS_LIMIT - value: {{ .limitPhone | quote }} -{{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.celery.env" }} -{{- if .Values.celery.worker_queue }} -- name: CELERY_WORKER_QUEUE - value: {{ .Values.celery.worker_queue | quote }} -{{- end }} -{{- if .Values.celery.worker_concurrency }} -- name: CELERY_WORKER_CONCURRENCY - value: {{ .Values.celery.worker_concurrency | quote }} -{{- end }} -{{- if .Values.celery.worker_max_tasks_per_child }} -- name: CELERY_WORKER_MAX_TASKS_PER_CHILD - value: {{ .Values.celery.worker_max_tasks_per_child | quote }} -{{- end }} -{{- if .Values.celery.worker_beat_enabled }} -- name: CELERY_WORKER_BEAT_ENABLED - value: {{ .Values.celery.worker_beat_enabled | quote }} -{{- end }} -{{- if .Values.celery.worker_shutdown_interval }} -- name: CELERY_WORKER_SHUTDOWN_INTERVAL - value: {{ .Values.celery.worker_shutdown_interval | quote }} -{{- end }} -{{- end }} - -{{- define "snippet.grafana.url" -}} -{{ if .Values.grafana.enabled -}} - http://{{ 
include "oncall.grafana.fullname" . }} -{{- else -}} - {{ required "externalGrafana.url is required when not grafana.enabled" .Values.externalGrafana.url }} -{{- end }} -{{- end }} - -{{- define "snippet.mysql.env" -}} -- name: MYSQL_HOST - value: {{ include "snippet.mysql.host" . | quote }} -- name: MYSQL_PORT - value: {{ include "snippet.mysql.port" . | quote }} -- name: MYSQL_DB_NAME - value: {{ include "snippet.mysql.db" . | quote }} -- name: MYSQL_USER -{{- if and (not .Values.mariadb.enabled) .Values.externalMysql.existingSecret .Values.externalMysql.usernameKey (not .Values.externalMysql.user) }} - valueFrom: - secretKeyRef: - name: {{ include "snippet.mysql.password.secret.name" . }} - key: {{ .Values.externalMysql.usernameKey | quote }} -{{- else }} - value: {{ include "snippet.mysql.user" . | quote }} -{{- end }} -- name: MYSQL_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "snippet.mysql.password.secret.name" . }} - key: {{ include "snippet.mysql.password.secret.key" . | quote }} -{{- if not .Values.mariadb.enabled }} -{{- with .Values.externalMysql.options }} -- name: MYSQL_OPTIONS - value: {{ . | quote }} -{{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.mysql.password.secret.name" -}} -{{ if .Values.mariadb.enabled -}} - {{ if .Values.mariadb.auth.existingSecret -}} - {{ .Values.mariadb.auth.existingSecret }} - {{- else -}} - {{ include "oncall.mariadb.fullname" . }} - {{- end }} -{{- else -}} - {{ if .Values.externalMysql.existingSecret -}} - {{ .Values.externalMysql.existingSecret }} - {{- else -}} - {{ include "oncall.fullname" . }}-mysql-external - {{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.mysql.password.secret.key" -}} -{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.existingSecret .Values.externalMysql.passwordKey -}} - {{ .Values.externalMysql.passwordKey }} -{{- else -}} - mariadb-root-password -{{- end }} -{{- end }} - -{{- define "snippet.mysql.host" -}} -{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.host -}} - {{ .Values.externalMysql.host }} -{{- else -}} - {{ include "oncall.mariadb.fullname" . }} -{{- end }} -{{- end }} - -{{- define "snippet.mysql.port" -}} -{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.port -}} - {{ .Values.externalMysql.port }} -{{- else -}} - 3306 -{{- end }} -{{- end }} - -{{- define "snippet.mysql.db" -}} -{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.db_name -}} - {{ .Values.externalMysql.db_name }} -{{- else -}} - {{ .Values.mariadb.auth.database | default "oncall" }} -{{- end }} -{{- end }} - -{{- define "snippet.mysql.user" -}} -{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.user -}} - {{ .Values.externalMysql.user }} -{{- else -}} - {{ .Values.mariadb.auth.username | default "root" }} -{{- end }} -{{- end }} - -{{- define "snippet.postgresql.env" -}} -- name: DATABASE_TYPE - value: {{ .Values.database.type | quote }} -- name: DATABASE_HOST - value: {{ include "snippet.postgresql.host" . | quote }} -- name: DATABASE_PORT - value: {{ include "snippet.postgresql.port" . | quote }} -- name: DATABASE_NAME - value: {{ include "snippet.postgresql.db" . | quote }} -- name: DATABASE_USER - value: {{ include "snippet.postgresql.user" . | quote }} -- name: DATABASE_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "snippet.postgresql.password.secret.name" . }} - key: {{ include "snippet.postgresql.password.secret.key" . 
| quote }} -{{- if not .Values.postgresql.enabled }} -{{- with .Values.externalPostgresql.options }} -- name: DATABASE_OPTIONS - value: {{ . | quote }} -{{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.sqlite.env" -}} -- name: DATABASE_TYPE - value: sqlite3 -- name: DATABASE_NAME - value: /etc/app/oncall.db -{{- end }} - -{{- define "snippet.postgresql.password.secret.name" -}} -{{ if .Values.postgresql.enabled -}} - {{ if .Values.postgresql.auth.existingSecret -}} - {{ .Values.postgresql.auth.existingSecret }} - {{- else -}} - {{ include "oncall.postgresql.fullname" . }} - {{- end }} -{{- else -}} - {{ if .Values.externalPostgresql.existingSecret -}} - {{ .Values.externalPostgresql.existingSecret }} - {{- else -}} - {{ include "oncall.fullname" . }}-postgresql-external - {{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.postgresql.password.secret.key" -}} -{{ if .Values.postgresql.enabled -}} - {{ if .Values.postgresql.auth.existingSecret -}} - {{ required "postgresql.auth.secretKeys.adminPasswordKey is required if database.type=postgres and postgresql.enabled and postgresql.auth.existingSecret" .Values.postgresql.auth.secretKeys.adminPasswordKey }} - {{- else -}} - {{ include "postgresql.userPasswordKey" .Subcharts.postgresql }} - {{- end }} -{{- else -}} - {{ if .Values.externalPostgresql.existingSecret -}} - {{ required "externalPostgresql.passwordKey is required if database.type=postgres and not postgresql.enabled and postgresql.auth.existingSecret" .Values.externalPostgresql.passwordKey }} - {{- else -}} - postgres-password - {{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.postgresql.host" -}} -{{ if not .Values.postgresql.enabled -}} - {{ required "externalPostgresql.host is required if database.type=postgres and not postgresql.enabled" .Values.externalPostgresql.host }} -{{- else -}} - {{ include "oncall.postgresql.fullname" . }} -{{- end }} -{{- end }} - -{{- define "snippet.postgresql.port" -}} -{{ if and (not .Values.postgresql.enabled) .Values.externalPostgresql.port -}} - {{ .Values.externalPostgresql.port }} -{{- else -}} - 5432 -{{- end }} -{{- end }} - -{{- define "snippet.postgresql.db" -}} -{{ if not .Values.postgresql.enabled -}} - {{ .Values.externalPostgresql.db_name | default "oncall" }} -{{- else -}} - {{ .Values.postgresql.auth.database | default "oncall" }} -{{- end }} -{{- end }} - -{{- define "snippet.postgresql.user" -}} -{{ if not .Values.postgresql.enabled -}} - {{ .Values.externalPostgresql.user | default "postgres" }} -{{- else -}} - {{ .Values.postgresql.auth.username | default "postgres" }} -{{- end }} -{{- end }} - -{{- define "snippet.rabbitmq.env" }} -- name: RABBITMQ_USERNAME -{{- if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.existingSecret .Values.externalRabbitmq.usernameKey (not .Values.externalRabbitmq.user) }} - valueFrom: - secretKeyRef: - name: {{ include "snippet.rabbitmq.password.secret.name" . }} - key: {{ .Values.externalRabbitmq.usernameKey | quote }} -{{- else }} - value: {{ include "snippet.rabbitmq.user" . | quote }} -{{- end }} -- name: RABBITMQ_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "snippet.rabbitmq.password.secret.name" . }} - key: {{ include "snippet.rabbitmq.password.secret.key" . | quote }} -- name: RABBITMQ_HOST - value: {{ include "snippet.rabbitmq.host" . | quote }} -- name: RABBITMQ_PORT - value: {{ include "snippet.rabbitmq.port" . | quote }} -- name: RABBITMQ_PROTOCOL - value: {{ include "snippet.rabbitmq.protocol" . 
| quote }} -- name: RABBITMQ_VHOST - value: {{ include "snippet.rabbitmq.vhost" . | quote }} -{{- end }} - -{{- define "snippet.rabbitmq.user" -}} -{{ if not .Values.rabbitmq.enabled -}} - {{ required "externalRabbitmq.user is required if not rabbitmq.enabled" .Values.externalRabbitmq.user }} -{{- else -}} - user -{{- end }} -{{- end }} - -{{- define "snippet.rabbitmq.host" -}} -{{ if not .Values.rabbitmq.enabled -}} - {{ required "externalRabbitmq.host is required if not rabbitmq.enabled" .Values.externalRabbitmq.host }} -{{- else -}} - {{ include "oncall.rabbitmq.fullname" . }} -{{- end }} -{{- end }} - -{{- define "snippet.rabbitmq.port" -}} -{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.port -}} - {{ required "externalRabbitmq.port is required if not rabbitmq.enabled" .Values.externalRabbitmq.port }} -{{- else -}} - 5672 -{{- end }} -{{- end }} - -{{- define "snippet.rabbitmq.protocol" -}} -{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.protocol -}} - {{ .Values.externalRabbitmq.protocol }} -{{- else -}} - amqp -{{- end }} -{{- end }} - -{{- define "snippet.rabbitmq.vhost" -}} -{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.vhost -}} - {{ .Values.externalRabbitmq.vhost }} -{{- end }} -{{- end }} - -{{- define "snippet.rabbitmq.password.secret.name" -}} -{{ if .Values.rabbitmq.enabled -}} - {{ if .Values.rabbitmq.auth.existingPasswordSecret -}} - {{ .Values.rabbitmq.auth.existingPasswordSecret }} - {{- else -}} - {{ include "oncall.rabbitmq.fullname" . }} - {{- end }} -{{- else -}} - {{ if .Values.externalRabbitmq.existingSecret -}} - {{ .Values.externalRabbitmq.existingSecret }} - {{- else -}} - {{ include "oncall.fullname" . }}-rabbitmq-external - {{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.rabbitmq.password.secret.key" -}} -{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.passwordKey -}} - {{ .Values.externalRabbitmq.passwordKey }} -{{- else -}} - rabbitmq-password -{{- end }} -{{- end }} - -{{- define "snippet.redis.protocol" -}} -{{ default "redis" .Values.externalRedis.protocol | quote }} -{{- end }} - -{{- define "snippet.redis.host" -}} -{{ if not .Values.redis.enabled -}} - {{ required "externalRedis.host is required if not redis.enabled" .Values.externalRedis.host | quote }} -{{- else -}} - {{ include "oncall.redis.fullname" . }}-master -{{- end }} -{{- end }} - -{{- define "snippet.redis.port" -}} -{{ default 6379 .Values.externalRedis.port | quote }} -{{- end }} - -{{- define "snippet.redis.database" -}} -{{ default 0 .Values.externalRedis.database | quote }} -{{- end }} - -{{- define "snippet.redis.password.secret.name" -}} -{{ if .Values.redis.enabled -}} - {{ if .Values.redis.auth.existingSecret -}} - {{ .Values.redis.auth.existingSecret }} - {{- else -}} - {{ include "oncall.redis.fullname" . }} - {{- end }} -{{- else -}} - {{ if .Values.externalRedis.existingSecret -}} - {{ .Values.externalRedis.existingSecret }} - {{- else -}} - {{ include "oncall.fullname" . 
}}-redis-external - {{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.redis.password.secret.key" -}} -{{ if .Values.redis.enabled -}} - {{ if .Values.redis.auth.existingSecret -}} - {{ required "redis.auth.existingSecretPasswordKey is required if redis.auth.existingSecret is non-empty" .Values.redis.auth.existingSecretPasswordKey }} - {{- else -}} - redis-password - {{- end }} -{{- else -}} - {{ if .Values.externalRedis.existingSecret -}} - {{ required "externalRedis.passwordKey is required if externalRedis.existingSecret is non-empty" .Values.externalRedis.passwordKey }} - {{- else -}} - redis-password - {{- end }} -{{- end }} -{{- end }} - -{{- define "snippet.redis.env" -}} -- name: REDIS_PROTOCOL - value: {{ include "snippet.redis.protocol" . }} -- name: REDIS_HOST - value: {{ include "snippet.redis.host" . }} -- name: REDIS_PORT - value: {{ include "snippet.redis.port" . }} -- name: REDIS_DATABASE - value: {{ include "snippet.redis.database" . }} -- name: REDIS_USERNAME - value: {{ default "" .Values.externalRedis.username | quote }} -- name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "snippet.redis.password.secret.name" . }} - key: {{ include "snippet.redis.password.secret.key" . | quote}} -{{- if and (not .Values.redis.enabled) .Values.externalRedis.ssl_options.enabled }} -- name: REDIS_USE_SSL - value: "true" -{{- with .Values.externalRedis.ssl_options.ca_certs }} -- name: REDIS_SSL_CA_CERTS - value: {{ . | quote }} -{{- end }} -{{- with .Values.externalRedis.ssl_options.certfile }} -- name: REDIS_SSL_CERTFILE - value: {{ . | quote }} -{{- end }} -{{- with .Values.externalRedis.ssl_options.keyfile }} -- name: REDIS_SSL_KEYFILE - value: {{ . | quote }} -{{- end }} -{{- with .Values.externalRedis.ssl_options.cert_reqs }} -- name: REDIS_SSL_CERT_REQS - value: {{ . | quote }} -{{- end }} -{{- end }} -{{- end }} - -{{- /* -when broker.type != rabbitmq, we do not need to include rabbitmq environment variables -*/}} -{{- define "snippet.broker.env" -}} -{{- include "snippet.redis.env" . }} -{{- if eq .Values.broker.type "rabbitmq" -}} -{{- include "snippet.rabbitmq.env" . }} -{{- end }} -{{- end }} - -{{- define "snippet.db.env" -}} -{{- if eq .Values.database.type "mysql" }} -{{- include "snippet.mysql.env" . }} -{{- else if eq .Values.database.type "postgresql" }} -{{- include "snippet.postgresql.env" . }} -{{- else if eq .Values.database.type "sqlite" -}} -{{- include "snippet.sqlite.env" . }} -{{- else -}} -{{- fail "value for .Values.db.type must be either 'mysql', 'postgresql', or 'sqlite'" }} -{{- end }} -{{- end }} - -{{- define "snippet.oncall.smtp.env" -}} -- name: FEATURE_EMAIL_INTEGRATION_ENABLED - value: {{ .Values.oncall.smtp.enabled | toString | title | quote }} -{{- if .Values.oncall.smtp.enabled }} -- name: EMAIL_HOST - value: {{ .Values.oncall.smtp.host | quote }} -- name: EMAIL_PORT - value: {{ .Values.oncall.smtp.port | default "587" | quote }} -- name: EMAIL_HOST_USER - value: {{ .Values.oncall.smtp.username | quote }} -- name: EMAIL_HOST_PASSWORD - valueFrom: - secretKeyRef: - name: {{ include "oncall.fullname" . 
}}-smtp - key: smtp-password - optional: true -- name: EMAIL_USE_TLS - value: {{ .Values.oncall.smtp.tls | default true | toString | title | quote }} -- name: EMAIL_FROM_ADDRESS - value: {{ .Values.oncall.smtp.fromEmail | quote }} -- name: EMAIL_NOTIFICATIONS_LIMIT - value: {{ .Values.oncall.smtp.limitEmail | default "200" | quote }} -{{- end }} -{{- end }} - -{{- define "snippet.oncall.exporter.env" -}} -{{ if .Values.oncall.exporter.enabled -}} -- name: FEATURE_PROMETHEUS_EXPORTER_ENABLED - value: {{ .Values.oncall.exporter.enabled | toString | title | quote }} -- name: PROMETHEUS_EXPORTER_SECRET - valueFrom: - secretKeyRef: - name: {{ include "oncall.fullname" . }}-exporter - key: exporter-secret - optional: true -{{- else -}} -- name: FEATURE_PROMETHEUS_EXPORTER_ENABLED - value: {{ .Values.oncall.exporter.enabled | toString | title | quote }} -{{- end }} -{{- end }} - -{{- define "snippet.oncall.engine.env" -}} -{{ include "snippet.oncall.env" . }} -{{ include "snippet.oncall.slack.env" . }} -{{ include "snippet.oncall.telegram.env" . }} -{{ include "snippet.oncall.smtp.env" . }} -{{ include "snippet.oncall.twilio.env" . }} -{{ include "snippet.oncall.exporter.env" . }} -{{ include "snippet.db.env" . }} -{{ include "snippet.broker.env" . }} -{{ include "oncall.extraEnvs" . }} -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/_helpers.tpl b/packages/system/grafana-oncall/charts/oncall/templates/_helpers.tpl deleted file mode 100644 index 6486bfe5..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/_helpers.tpl +++ /dev/null @@ -1,121 +0,0 @@ -{{/* -Expand the name of the chart. -*/}} -{{- define "oncall.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "oncall.fullname" -}} -{{- if .Values.fullnameOverride }} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- $name := default .Chart.Name .Values.nameOverride }} -{{- if contains $name .Release.Name }} -{{- .Release.Name | trunc 63 | trimSuffix "-" }} -{{- else }} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} -{{- end }} -{{- end }} -{{- end }} - -{{/* -Create chart name and version as used by the chart label. -*/}} -{{- define "oncall.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* -Common labels -*/}} -{{- define "oncall.labels" -}} -helm.sh/chart: {{ include "oncall.chart" . }} -{{ include "oncall.selectorLabels" . }} -{{- if .Chart.AppVersion }} -app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} -{{- end }} -app.kubernetes.io/managed-by: {{ .Release.Service }} -{{- end }} - -{{/* -Selector labels -*/}} -{{- define "oncall.selectorLabels" -}} -app.kubernetes.io/name: {{ include "oncall.name" . }} -app.kubernetes.io/instance: {{ .Release.Name }} -{{- end }} - -{{/* -Create the name of the service account to use -*/}} -{{- define "oncall.serviceAccountName" -}} -{{- if .Values.serviceAccount.create }} -{{- default (include "oncall.fullname" .) 
.Values.serviceAccount.name }} -{{- else }} -{{- default "default" .Values.serviceAccount.name }} -{{- end }} -{{- end }} - -{{/* Generate the fullname of mariadb subchart */}} -{{- define "oncall.mariadb.fullname" -}} -{{- printf "%s-%s" .Release.Name "mariadb" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* Generate the fullname of postgresql subchart */}} -{{- define "oncall.postgresql.fullname" -}} -{{- printf "%s-%s" .Release.Name "postgresql" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{- define "oncall.grafana.fullname" -}} -{{- printf "%s-%s" .Release.Name "grafana" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* Generate the fullname of rabbitmq subchart */}} -{{- define "oncall.rabbitmq.fullname" -}} -{{- printf "%s-%s" .Release.Name "rabbitmq" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* Generate the fullname of redis subchart */}} -{{- define "oncall.redis.fullname" -}} -{{- printf "%s-%s" .Release.Name "redis" | trunc 63 | trimSuffix "-" }} -{{- end }} - -{{/* Generate engine image name */}} -{{- define "oncall.engine.image" -}} -{{- printf "%s:%s" .Values.image.repository (.Values.image.tag | default .Chart.AppVersion) }} -{{- end }} - -{{- define "oncall.initContainer" }} -- name: wait-for-db - image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ['sh', '-c', "until (python manage.py migrate --check); do echo Waiting for database migrations; sleep 2; done"] - securityContext: - {{ toYaml .Values.init.securityContext | nindent 4 }} - resources: - {{ toYaml .Values.init.resources | nindent 4 }} - env: - {{- include "snippet.oncall.env" . | nindent 4 }} - {{- include "snippet.db.env" . | nindent 4 }} - {{- include "snippet.broker.env" . | nindent 4 }} - {{- include "oncall.extraEnvs" . | nindent 4 }} -{{- end }} - -{{- define "oncall.extraEnvs" -}} -{{- if .Values.env }} - {{- if (kindIs "map" .Values.env) }} - {{- range $key, $value := .Values.env }} -- name: {{ $key }} - value: {{ $value }} - {{- end -}} - {{/* support previous schema */}} - {{- else }} -{{- toYaml .Values.env }} - {{- end }} -{{- end }} -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/celery/_helpers.tpl b/packages/system/grafana-oncall/charts/oncall/templates/celery/_helpers.tpl deleted file mode 100644 index 8c37e957..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/celery/_helpers.tpl +++ /dev/null @@ -1,26 +0,0 @@ -{{/* -Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -*/}} -{{- define "oncall.celery.name" -}} -{{ include "oncall.name" . | trunc 55 }}-celery -{{- end }} - -{{- define "oncall.celery.fullname" -}} -{{ include "oncall.fullname" . | trunc 55 }}-celery -{{- end }} - -{{/* -Engine common labels -*/}} -{{- define "oncall.celery.labels" -}} -{{ include "oncall.labels" . }} -app.kubernetes.io/component: celery -{{- end }} - -{{/* -Engine selector labels -*/}} -{{- define "oncall.celery.selectorLabels" -}} -{{ include "oncall.selectorLabels" . 
}} -app.kubernetes.io/component: celery -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/celery/deployment.yaml b/packages/system/grafana-oncall/charts/oncall/templates/celery/deployment.yaml deleted file mode 100644 index b2498dd1..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/celery/deployment.yaml +++ /dev/null @@ -1,89 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "oncall.celery.fullname" . }} - labels: - {{- include "oncall.celery.labels" . | nindent 4 }} -spec: - replicas: {{ .Values.celery.replicaCount }} - selector: - matchLabels: - {{- include "oncall.celery.selectorLabels" . | nindent 6 }} - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - random-annotation: {{ randAlphaNum 10 | lower }} - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "oncall.celery.selectorLabels" . | nindent 8 }} - {{- if .Values.celery.podLabels }} - {{- toYaml .Values.celery.podLabels | nindent 8}} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "oncall.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - initContainers: - {{- include "oncall.initContainer" . | indent 8 }} - {{- with .Values.celery.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.celery.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.celery.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.celery.topologySpreadConstraints }} - topologySpreadConstraints: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.celery.priorityClassName }} - priorityClassName: {{ . }} - {{- end }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "oncall.engine.image" . }} - {{- if .Values.oncall.devMode }} - command: ["python", "manage.py", "start_celery"] - {{- else }} - command: ["./celery_with_exporter.sh"] - {{- end }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - env: - {{- include "snippet.celery.env" . | nindent 12 }} - {{- include "snippet.oncall.engine.env" . | nindent 12 }} - {{- if .Values.celery.livenessProbe.enabled }} - livenessProbe: - exec: - command: [ - "bash", - "-c", - "celery -A engine inspect ping -d celery@$HOSTNAME" - ] - initialDelaySeconds: {{ .Values.celery.livenessProbe.initialDelaySeconds }} - periodSeconds: {{ .Values.celery.livenessProbe.periodSeconds }} - timeoutSeconds: {{ .Values.celery.livenessProbe.timeoutSeconds }} - {{- end }} - resources: - {{- toYaml .Values.celery.resources | nindent 12 }} - {{- with .Values.celery.extraVolumeMounts }} - volumeMounts: {{- . | toYaml | nindent 12 }} - {{- end }} - {{- with .Values.celery.extraContainers }} - {{- tpl . $ | nindent 8 }} - {{- end }} - {{- with .Values.celery.extraVolumes }} - volumes: {{- . 
| toYaml | nindent 8 }} - {{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/cert-issuer.yaml b/packages/system/grafana-oncall/charts/oncall/templates/cert-issuer.yaml deleted file mode 100644 index 8b1716f3..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/cert-issuer.yaml +++ /dev/null @@ -1,22 +0,0 @@ -{{- if (index .Values "cert-manager").enabled }} -apiVersion: cert-manager.io/v1 -kind: Issuer -metadata: - name: letsencrypt-prod - annotations: - "helm.sh/hook": post-install,post-upgrade -spec: - acme: - # The ACME server URL - server: https://acme-v02.api.letsencrypt.org/directory - # Email address used for ACME registration - email: no-reply@{{ .Values.base_url }} - # Name of a secret used to store the ACME account private key - privateKeySecretRef: - name: letsencrypt-prod - # Enable the HTTP-01 challenge provider - solvers: - - http01: - ingress: - class: nginx -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/engine/_helpers-engine.tpl b/packages/system/grafana-oncall/charts/oncall/templates/engine/_helpers-engine.tpl deleted file mode 100644 index 6d498e93..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/engine/_helpers-engine.tpl +++ /dev/null @@ -1,26 +0,0 @@ -{{/* -Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -*/}} -{{- define "oncall.engine.name" -}} -{{ include "oncall.name" . | trunc 55 }}-engine -{{- end }} - -{{- define "oncall.engine.fullname" -}} -{{ include "oncall.fullname" . | trunc 55 }}-engine -{{- end }} - -{{/* -Engine common labels -*/}} -{{- define "oncall.engine.labels" -}} -{{ include "oncall.labels" . }} -app.kubernetes.io/component: engine -{{- end }} - -{{/* -Engien selector labels -*/}} -{{- define "oncall.engine.selectorLabels" -}} -{{ include "oncall.selectorLabels" . }} -app.kubernetes.io/component: engine -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/engine/deployment.yaml b/packages/system/grafana-oncall/charts/oncall/templates/engine/deployment.yaml deleted file mode 100644 index ccb770df..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/engine/deployment.yaml +++ /dev/null @@ -1,98 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "oncall.engine.fullname" . }} - labels: - {{- include "oncall.engine.labels" . | nindent 4 }} -spec: - replicas: {{ .Values.engine.replicaCount }} - selector: - matchLabels: - {{- include "oncall.engine.selectorLabels" . | nindent 6 }} - strategy: - {{- toYaml .Values.engine.updateStrategy | nindent 4 }} - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - random-annotation: {{ randAlphaNum 10 | lower }} - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "oncall.engine.selectorLabels" . | nindent 8 }} - {{- if .Values.engine.podLabels }} - {{- toYaml .Values.engine.podLabels | nindent 8}} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "oncall.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - initContainers: - {{- include "oncall.initContainer" . | indent 8 }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "oncall.engine.image" . 
}} - imagePullPolicy: {{ .Values.image.pullPolicy }} - {{- if .Values.oncall.devMode }} - command: ["sh", "-c", "uwsgi --disable-logging --py-autoreload 3 --ini uwsgi.ini"] - {{- end }} - ports: - - name: http - containerPort: 8080 - protocol: TCP - env: - {{- include "snippet.oncall.engine.env" . | nindent 12 }} - livenessProbe: - httpGet: - path: /health/ - port: http - periodSeconds: 60 - timeoutSeconds: 3 - readinessProbe: - httpGet: - path: /ready/ - port: http - periodSeconds: 60 - timeoutSeconds: 3 - startupProbe: - httpGet: - path: /startupprobe/ - port: http - periodSeconds: 10 - timeoutSeconds: 3 - resources: - {{- toYaml .Values.engine.resources | nindent 12 }} - {{- with .Values.engine.extraVolumeMounts }} - volumeMounts: {{- . | toYaml | nindent 12 }} - {{- end }} - {{- with .Values.engine.extraContainers }} - {{- tpl . $ | nindent 8 }} - {{- end }} - {{- with .Values.engine.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.engine.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.engine.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.engine.topologySpreadConstraints }} - topologySpreadConstraints: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.engine.priorityClassName }} - priorityClassName: {{ . }} - {{- end }} - {{- with .Values.engine.extraVolumes }} - volumes: {{- . | toYaml | nindent 8 }} - {{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/engine/job-migrate.yaml b/packages/system/grafana-oncall/charts/oncall/templates/engine/job-migrate.yaml deleted file mode 100644 index 09782954..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/engine/job-migrate.yaml +++ /dev/null @@ -1,102 +0,0 @@ -{{- if .Values.migrate.enabled -}} -apiVersion: batch/v1 -kind: Job -metadata: - {{- if .Values.migrate.useHook }} - name: {{ printf "%s-migrate" (include "oncall.engine.fullname" .) }} - annotations: - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-1" - {{- with .Values.migrate.annotations }} - {{- toYaml . | nindent 4 }} - {{- end }} - {{- else }} - name: {{ printf "%s-migrate-%s" (include "oncall.engine.fullname" .) (now | date "2006-01-02-15-04-05") }} - {{- with .Values.migrate.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} - {{- end }} - labels: - {{- include "oncall.engine.labels" . | nindent 4 }} -spec: - backoffLimit: 15 - {{- if .Values.migrate.ttlSecondsAfterFinished }} - ttlSecondsAfterFinished: {{ .Values.migrate.ttlSecondsAfterFinished }} - {{- end }} - template: - metadata: - name: {{ printf "%s-migrate-%s" (include "oncall.engine.fullname" .) (now | date "2006-01-02-15-04-05") }} - {{- with .Values.podAnnotations }} - annotations: - random-annotation: {{ randAlphaNum 10 | lower }} - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "oncall.engine.selectorLabels" . | nindent 8 }} - spec: - restartPolicy: Never - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "oncall.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - {{- with .Values.migrate.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.migrate.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.migrate.tolerations }} - tolerations: - {{- toYaml . 
| nindent 8 }} - {{- end }} - containers: - - name: {{ .Chart.Name }}-migrate - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "oncall.engine.image" . }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: - - /bin/sh - - -c - {{- if eq .Values.database.type "mysql" }} - - | - until (nc -vz $MYSQL_HOST $MYSQL_PORT); - do - echo "waiting for MySQL"; sleep 1; - done - python manage.py migrate - {{- else if eq .Values.database.type "postgresql" }} - - | - until (nc -vz $DATABASE_HOST $DATABASE_PORT); - do - echo "waiting for PostgreSQL"; sleep 1; - done - python manage.py migrate - {{- else }} - - python manage.py migrate - {{- end }} - env: - {{- include "snippet.oncall.env" . | nindent 12 }} - {{- include "snippet.oncall.smtp.env" . | nindent 12 }} - {{- include "snippet.oncall.exporter.env" . | nindent 12 }} - {{- include "snippet.db.env" . | nindent 12 }} - {{- include "snippet.broker.env" . | nindent 12 }} - {{- include "oncall.extraEnvs" . | nindent 12 }} - resources: - {{- toYaml .Values.migrate.resources | nindent 12 }} - {{- with .Values.migrate.extraVolumeMounts }} - volumeMounts: {{- . | toYaml | nindent 10 }} - {{- end }} - {{- with .Values.migrate.extraContainers }} - {{- tpl . $ | nindent 6 }} - {{- end }} - {{- with .Values.migrate.extraVolumes }} - volumes: {{- . | toYaml | nindent 8 }} - {{- end }} -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/engine/service-external.yaml b/packages/system/grafana-oncall/charts/oncall/templates/engine/service-external.yaml deleted file mode 100644 index eec1f0bf..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/engine/service-external.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{{- if .Values.service.enabled }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "oncall.engine.fullname" . }}-external - labels: - {{- include "oncall.engine.labels" . | nindent 4 }} - {{- with .Values.service.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - type: {{ .Values.service.type }} - ports: - - port: {{ .Values.service.port }} - targetPort: http - protocol: TCP - name: http - {{- if and (eq .Values.service.type "NodePort") (.Values.service.nodePort) }} - nodePort: {{ .Values.service.nodePort }} - {{- end }} - selector: - {{- include "oncall.engine.selectorLabels" . | nindent 4 }} -{{- end }} \ No newline at end of file diff --git a/packages/system/grafana-oncall/charts/oncall/templates/engine/service-internal.yaml b/packages/system/grafana-oncall/charts/oncall/templates/engine/service-internal.yaml deleted file mode 100644 index 07785035..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/engine/service-internal.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: {{ include "oncall.engine.fullname" . }} - labels: - {{- include "oncall.engine.labels" . | nindent 4 }} -spec: - type: ClusterIP - ports: - - port: 8080 - targetPort: http - protocol: TCP - name: http - selector: - {{- include "oncall.engine.selectorLabels" . | nindent 4 }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/ingress-regular.yaml b/packages/system/grafana-oncall/charts/oncall/templates/ingress-regular.yaml deleted file mode 100644 index 9a5357ff..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/ingress-regular.yaml +++ /dev/null @@ -1,65 +0,0 @@ -{{- if .Values.ingress.enabled -}} -{{- $fullName := include "oncall.fullname" . 
-}} -{{- $svcPort := .Values.service.port -}} -{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} - {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} - {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} - {{- end }} -{{- end }} -{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1 -{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} -apiVersion: networking.k8s.io/v1beta1 -{{- else -}} -apiVersion: extensions/v1beta1 -{{- end }} -kind: Ingress -metadata: - name: {{ $fullName }} - labels: - {{- include "oncall.labels" . | nindent 4 }} - {{- with .Values.ingress.annotations }} - annotations: - {{- toYaml . | nindent 4 }} - {{- end }} -spec: - {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} - ingressClassName: {{ .Values.ingress.className }} - {{- end }} - {{- if .Values.ingress.tls }} - tls: - {{- tpl (toYaml .Values.ingress.tls) . | nindent 4 }} - {{- end }} - rules: - - host: {{ .Values.base_url | quote }} - http: - paths: -{{- if .Values.ingress.extraPaths }} -{{ toYaml .Values.ingress.extraPaths | indent 6}} -{{- end }} - - path: / - pathType: Prefix - backend: - service: - name: {{ include "oncall.engine.fullname" . }} - port: - number: 8080 - {{ if .Values.grafana.enabled }} - - path: /grafana - pathType: Prefix - backend: - service: - name: {{ include "oncall.grafana.fullname" . }} - port: - number: 80 - {{- end }} - {{ if .Values.detached_integrations.enabled }} - - path: /integrations - pathType: Prefix - backend: - service: - name: {{ include "oncall.detached_integrations.fullname" . }} - port: - number: 8080 - {{- end }} -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/integrations/_helpers.tpl b/packages/system/grafana-oncall/charts/oncall/templates/integrations/_helpers.tpl deleted file mode 100644 index 6727ed42..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/integrations/_helpers.tpl +++ /dev/null @@ -1,26 +0,0 @@ -{{/* -Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -*/}} -{{- define "oncall.detached_integrations.name" -}} -{{ include "oncall.name" . | trunc 55 }}-integrations -{{- end }} - -{{- define "oncall.detached_integrations.fullname" -}} -{{ include "oncall.fullname" . | trunc 55 }}-integrations -{{- end }} - -{{/* -Integrations common labels -*/}} -{{- define "oncall.detached_integrations.labels" -}} -{{ include "oncall.labels" . }} -app.kubernetes.io/component: integrations -{{- end }} - -{{/* -Integrations selector labels -*/}} -{{- define "oncall.detached_integrations.selectorLabels" -}} -{{ include "oncall.selectorLabels" . }} -app.kubernetes.io/component: integrations -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/integrations/deployment.yaml b/packages/system/grafana-oncall/charts/oncall/templates/integrations/deployment.yaml deleted file mode 100644 index 5e08eaf7..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/integrations/deployment.yaml +++ /dev/null @@ -1,99 +0,0 @@ -{{- if .Values.detached_integrations.enabled -}} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "oncall.detached_integrations.fullname" . }} - labels: - {{- include "oncall.detached_integrations.labels" . 
| nindent 4 }} -spec: - replicas: {{ .Values.detached_integrations.replicaCount }} - selector: - matchLabels: - {{- include "oncall.detached_integrations.selectorLabels" . | nindent 6 }} - strategy: - {{- toYaml .Values.detached_integrations.updateStrategy | nindent 4 }} - template: - metadata: - {{- with .Values.podAnnotations }} - annotations: - random-annotation: {{ randAlphaNum 10 | lower }} - {{- toYaml . | nindent 8 }} - {{- end }} - labels: - {{- include "oncall.detached_integrations.selectorLabels" . | nindent 8 }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "oncall.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - initContainers: - {{- include "oncall.initContainer" . | indent 8 }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "oncall.engine.image" . }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - {{- if .Values.oncall.devMode }} - command: ["sh", "-c", "uwsgi --disable-logging --py-autoreload 3 --ini uwsgi.ini"] - {{- end }} - ports: - - name: http - containerPort: 8080 - protocol: TCP - env: - {{- include "snippet.oncall.engine.env" . | nindent 12 }} - - name: ROOT_URLCONF - value: "engine.integrations_urls" - livenessProbe: - httpGet: - path: /health/ - port: http - periodSeconds: 60 - timeoutSeconds: 3 - readinessProbe: - httpGet: - path: /ready/ - port: http - periodSeconds: 60 - timeoutSeconds: 3 - startupProbe: - httpGet: - path: /startupprobe/ - port: http - periodSeconds: 10 - timeoutSeconds: 3 - resources: - {{- toYaml .Values.detached_integrations.resources | nindent 12 }} - {{- with .Values.detached_integrations.extraVolumeMounts }} - volumeMounts: {{- . | toYaml | nindent 12 }} - {{- end }} - {{- with .Values.detached_integrations.extraContainers }} - {{- tpl . $ | nindent 8 }} - {{- end }} - {{- with .Values.detached_integrations.nodeSelector }} - nodeSelector: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.detached_integrations.affinity }} - affinity: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.detached_integrations.tolerations }} - tolerations: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.detached_integrations.topologySpreadConstraints }} - topologySpreadConstraints: - {{- toYaml . | nindent 8 }} - {{- end }} - {{- with .Values.detached_integrations.priorityClassName }} - priorityClassName: {{ . }} - {{- end }} - {{- with .Values.detached_integrations.extraVolumes }} - volumes: {{- . | toYaml | nindent 8 }} - {{- end }} -{{- end -}} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/integrations/service-external.yaml b/packages/system/grafana-oncall/charts/oncall/templates/integrations/service-external.yaml deleted file mode 100644 index 455d4aa0..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/integrations/service-external.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{{- if .Values.detached_integrations_service.enabled }} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "oncall.detached_integrations.fullname" . }}-external - labels: - {{- include "oncall.detached_integrations.labels" . | nindent 4 }} - {{- with .Values.detached_integrations_service.annotations }} - annotations: - {{- toYaml . 
| nindent 4 }} - {{- end }} -spec: - type: {{ .Values.detached_integrations_service.type }} - ports: - - port: {{ .Values.detached_integrations_service.port }} - targetPort: http - protocol: TCP - name: http - {{- if and (eq .Values.detached_integrations_service.type "NodePort") (.Values.detached_integrations_service.nodePort) }} - nodePort: {{ .Values.detached_integrations_service.nodePort }} - {{- end }} - selector: - {{- include "oncall.detached_integrations.selectorLabels" . | nindent 4 }} -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/integrations/service-internal.yaml b/packages/system/grafana-oncall/charts/oncall/templates/integrations/service-internal.yaml deleted file mode 100644 index 3527b247..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/integrations/service-internal.yaml +++ /dev/null @@ -1,17 +0,0 @@ -{{- if .Values.detached_integrations.enabled -}} -apiVersion: v1 -kind: Service -metadata: - name: {{ include "oncall.detached_integrations.fullname" . }} - labels: - {{- include "oncall.detached_integrations.labels" . | nindent 4 }} -spec: - type: ClusterIP - ports: - - port: 8080 - targetPort: http - protocol: TCP - name: http - selector: - {{- include "oncall.detached_integrations.selectorLabels" . | nindent 4 }} -{{- end -}} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/secrets.yaml b/packages/system/grafana-oncall/charts/oncall/templates/secrets.yaml deleted file mode 100644 index 821592fa..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/secrets.yaml +++ /dev/null @@ -1,98 +0,0 @@ -{{- if not .Values.oncall.secrets.existingSecret }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "oncall.fullname" . }} - labels: - {{- include "oncall.labels" . | nindent 4 }} - {{- if .Values.migrate.useHook }} - annotations: - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-5" - {{- end }} -type: Opaque -data: - {{ include "snippet.oncall.secret.secretKey" . }}: {{ randAlphaNum 40 | b64enc | quote }} - {{ include "snippet.oncall.secret.mirageSecretKey" . }}: {{ randAlphaNum 40 | b64enc | quote }} ---- -{{- end }} -{{- if and (eq .Values.database.type "mysql") (not .Values.mariadb.enabled) (not .Values.externalMysql.existingSecret) }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "oncall.fullname" . }}-mysql-external - {{- if .Values.migrate.useHook }} - annotations: - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-5" - {{- end }} -type: Opaque -data: - mariadb-root-password: {{ required "externalMysql.password is required if not mariadb.enabled and not externalMysql.existingSecret" .Values.externalMysql.password | b64enc | quote }} ---- -{{- end }} -{{- if and (not .Values.postgresql.enabled) (eq .Values.database.type "postgresql") (not .Values.externalPostgresql.existingSecret) }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "oncall.fullname" . 
}}-postgresql-external - {{- if .Values.migrate.useHook }} - annotations: - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-5" - {{- end }} -type: Opaque -data: - postgres-password: {{ required "externalPostgresql.password is required if not postgresql.enabled and not externalPostgresql.existingSecret" .Values.externalPostgresql.password | b64enc | quote }} ---- -{{- end }} -{{- if and (eq .Values.broker.type "rabbitmq") (not .Values.rabbitmq.enabled) (not .Values.externalRabbitmq.existingSecret) }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "oncall.fullname" . }}-rabbitmq-external - {{- if .Values.migrate.useHook }} - annotations: - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-5" - {{- end }} -type: Opaque -data: - rabbitmq-password: {{ required "externalRabbitmq.password is required if not rabbitmq.enabled and not externalRabbitmq.existingSecret" .Values.externalRabbitmq.password | b64enc | quote }} ---- -{{- end }} -{{- if and (eq .Values.broker.type "redis") (not .Values.redis.enabled) (not .Values.externalRedis.existingSecret) }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "oncall.fullname" . }}-redis-external - {{- if .Values.migrate.useHook }} - annotations: - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-5" - {{- end }} -type: Opaque -data: - redis-password: {{ required "externalRedis.password is required if not redis.enabled and not externalRedis.existingSecret" .Values.externalRedis.password | b64enc | quote }} ---- -{{- end }} -{{- if and .Values.oncall.smtp.enabled .Values.oncall.smtp.password }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "oncall.fullname" . }}-smtp -type: Opaque -data: - smtp-password: {{ .Values.oncall.smtp.password | b64enc | quote }} ---- -{{- end }} -{{- if and .Values.oncall.exporter.enabled .Values.oncall.exporter.authToken }} -apiVersion: v1 -kind: Secret -metadata: - name: {{ include "oncall.fullname" . }}-exporter -type: Opaque -data: - exporter-secret: {{ .Values.oncall.exporter.authToken | b64enc | quote }} ---- -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/serviceaccount.yaml b/packages/system/grafana-oncall/charts/oncall/templates/serviceaccount.yaml deleted file mode 100644 index d0a5a9eb..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/serviceaccount.yaml +++ /dev/null @@ -1,18 +0,0 @@ -{{- if .Values.serviceAccount.create -}} -apiVersion: v1 -kind: ServiceAccount -metadata: - name: {{ include "oncall.serviceAccountName" . }} - labels: - {{- include "oncall.labels" . | nindent 4 }} - {{- if or (.Values.migrate.useHook) (.Values.serviceAccount.annotations) }} - annotations: - {{- if .Values.migrate.useHook }} - "helm.sh/hook": pre-install,pre-upgrade - "helm.sh/hook-weight": "-5" - {{- end }} - {{- with .Values.serviceAccount.annotations }} - {{- toYaml . | nindent 4 }} - {{- end }} - {{- end }} -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/telegram-polling/_helpers.tpl b/packages/system/grafana-oncall/charts/oncall/templates/telegram-polling/_helpers.tpl deleted file mode 100644 index d2053dc0..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/telegram-polling/_helpers.tpl +++ /dev/null @@ -1,22 +0,0 @@ -{{/* -Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -*/}} -{{- define "oncall.telegramPolling.fullname" -}} -{{ include "oncall.fullname" . 
| trunc 45 }}-telegram-polling -{{- end }} - -{{/* -Telegram polling common labels -*/}} -{{- define "oncall.telegramPolling.labels" -}} -{{ include "oncall.labels" . }} -app.kubernetes.io/component: telegram-polling -{{- end }} - -{{/* -Telegram polling selector labels -*/}} -{{- define "oncall.telegramPolling.selectorLabels" -}} -{{ include "oncall.selectorLabels" . }} -app.kubernetes.io/component: telegram-polling -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/telegram-polling/deployment.yaml b/packages/system/grafana-oncall/charts/oncall/templates/telegram-polling/deployment.yaml deleted file mode 100644 index 2e448897..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/telegram-polling/deployment.yaml +++ /dev/null @@ -1,53 +0,0 @@ -{{- if .Values.telegramPolling.enabled -}} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: {{ include "oncall.telegramPolling.fullname" . }} - labels: - {{- include "oncall.telegramPolling.labels" . | nindent 4 }} -spec: - replicas: 1 - selector: - matchLabels: - {{- include "oncall.telegramPolling.selectorLabels" . | nindent 6 }} - template: - metadata: - labels: - {{- include "oncall.telegramPolling.selectorLabels" . | nindent 8 }} - {{- if .Values.telegramPolling.podLabels }} - {{- toYaml .Values.telegramPolling.podLabels | nindent 8 }} - {{- end }} - spec: - {{- with .Values.imagePullSecrets }} - imagePullSecrets: - {{- toYaml . | nindent 8 }} - {{- end }} - serviceAccountName: {{ include "oncall.serviceAccountName" . }} - securityContext: - {{- toYaml .Values.podSecurityContext | nindent 8 }} - initContainers: - {{- include "oncall.initContainer" . | nindent 8 }} - containers: - - name: telegram-polling - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "oncall.engine.image" . }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ['sh', '-c', 'python manage.py start_telegram_polling'] - env: - {{- include "snippet.oncall.env" . | nindent 12 }} - {{- include "snippet.oncall.telegram.env" . | nindent 12 }} - {{- include "snippet.db.env" . | nindent 12 }} - {{- include "snippet.broker.env" . | nindent 12 }} - {{- include "oncall.extraEnvs" . | nindent 12 }} - {{- with .Values.telegramPolling.resources }} - resources: - {{- toYaml . | nindent 12 }} - {{- end }} - {{- with .Values.telegramPolling.extraVolumeMounts }} - volumeMounts: {{- . | toYaml | nindent 12 }} - {{- end }} - {{- with .Values.telegramPolling.extraVolumes }} - volumes: {{- . 
| toYaml | nindent 8 }} - {{- end }} -{{- end -}} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/ui/_helpers.tpl b/packages/system/grafana-oncall/charts/oncall/templates/ui/_helpers.tpl deleted file mode 100644 index 49dcc2e5..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/ui/_helpers.tpl +++ /dev/null @@ -1,8 +0,0 @@ -{{- define "ui.env" -}} -{{- if .Values.ui.env }} - {{- range $key, $value := .Values.ui.env }} -- name: {{ $key }} - value: "{{ $value }}" - {{- end -}} -{{- end }} -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/templates/ui/deployment.yaml b/packages/system/grafana-oncall/charts/oncall/templates/ui/deployment.yaml deleted file mode 100644 index c094368f..00000000 --- a/packages/system/grafana-oncall/charts/oncall/templates/ui/deployment.yaml +++ /dev/null @@ -1,31 +0,0 @@ -{{- if .Values.ui.enabled }} -apiVersion: apps/v1 -kind: Deployment -metadata: - name: oncall-ui - labels: - app.kubernetes.io/component: oncall-ui -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: oncall-ui - template: - metadata: - labels: - app.kubernetes.io/component: oncall-ui - spec: - containers: - - name: oncall-ui - image: "{{ .Values.ui.image.repository }}:{{ .Values.ui.image.tag | default .Chart.AppVersion }}" - imagePullPolicy: IfNotPresent - env: - {{- include "ui.env" . | nindent 12 }} - volumeMounts: - - mountPath: /etc/app - name: hot-reloaded-plugin - volumes: - - name: hot-reloaded-plugin - hostPath: - path: /oncall-plugin -{{- end }} diff --git a/packages/system/grafana-oncall/charts/oncall/values.yaml b/packages/system/grafana-oncall/charts/oncall/values.yaml deleted file mode 100644 index 3306f005..00000000 --- a/packages/system/grafana-oncall/charts/oncall/values.yaml +++ /dev/null @@ -1,719 +0,0 @@ -# Values for configuring the deployment of Grafana OnCall - -# Set the domain name Grafana OnCall will be installed on. -# If you want to install grafana as a part of this release make sure to configure grafana.grafana.ini.server.domain too -base_url: example.com -base_url_protocol: https - -## Optionally specify an array of imagePullSecrets. -## Secrets must be manually created in the namespace. 
-## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ -## e.g: -## imagePullSecrets: -## - name: myRegistryKeySecretName -imagePullSecrets: [] - -image: - # Grafana OnCall docker image repository - repository: grafana/oncall - tag: - pullPolicy: Always - -# Whether to create additional service for external connections -# ClusterIP service is always created -service: - enabled: false - type: LoadBalancer - port: 8080 - annotations: {} - -# Engine pods configuration -engine: - replicaCount: 1 - resources: - {} - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - - # Labels for engine pods - podLabels: {} - - ## Deployment update strategy - ## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy - updateStrategy: - rollingUpdate: - maxSurge: 25% - maxUnavailable: 0 - type: RollingUpdate - - ## Affinity for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity - affinity: {} - - ## Node labels for pod assignment - ## ref: https://kubernetes.io/docs/user-guide/node-selection/ - nodeSelector: {} - - ## Tolerations for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ - tolerations: [] - - ## Topology spread constraints for pod assignment - ## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ - topologySpreadConstraints: [] - - ## Priority class for the pods - ## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/ - priorityClassName: "" - - # Extra containers which runs as sidecar - extraContainers: "" - # extraContainers: | - # - name: cloud-sql-proxy - # image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2 - # args: - # - --private-ip - # - --port=5432 - # - example:europe-west3:grafana-oncall-db - - # Extra volume mounts for the main app container - extraVolumeMounts: [] - # - mountPath: /mnt/postgres-tls - # name: postgres-tls - # - mountPath: /mnt/redis-tls - # name: redis-tls - - # Extra volumes for the pod - extraVolumes: [] - # - name: postgres-tls - # configMap: - # name: my-postgres-tls - # defaultMode: 0640 - # - name: redis-tls - # configMap: - # name: my-redis-tls - # defaultMode: 0640 - -detached_integrations_service: - enabled: false - type: LoadBalancer - port: 8080 - annotations: {} - -# Integrations pods configuration -detached_integrations: - enabled: false - replicaCount: 1 - resources: - {} - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - - ## Deployment update strategy - ## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy - updateStrategy: - rollingUpdate: - maxSurge: 25% - maxUnavailable: 0 - type: RollingUpdate - - ## Affinity for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity - affinity: {} - - ## Node labels for pod assignment - ## ref: https://kubernetes.io/docs/user-guide/node-selection/ - nodeSelector: {} - - ## Tolerations for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ - tolerations: [] - - ## Topology spread constraints for pod assignment - ## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ - topologySpreadConstraints: [] - - ## Priority class for the pods - ## ref: 
https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/ - priorityClassName: "" - - # Extra containers which runs as sidecar - extraContainers: "" - # extraContainers: | - # - name: cloud-sql-proxy - # image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2 - # args: - # - --private-ip - # - --port=5432 - # - example:europe-west3:grafana-oncall-db - - # Extra volume mounts for the container - extraVolumeMounts: [] - # - mountPath: /mnt/postgres-tls - # name: postgres-tls - # - mountPath: /mnt/redis-tls - # name: redis-tls - - # Extra volumes for the pod - extraVolumes: [] - # - name: postgres-tls - # configMap: - # name: my-postgres-tls - # defaultMode: 0640 - # - name: redis-tls - # configMap: - # name: my-redis-tls - # defaultMode: 0640 - -# Celery workers pods configuration -celery: - replicaCount: 1 - worker_queue: "default,critical,long,slack,telegram,webhook,celery,grafana" - worker_concurrency: "1" - worker_max_tasks_per_child: "100" - worker_beat_enabled: "True" - ## Restart of the celery workers once in a given interval as an additional precaution to the probes - ## If this setting is enabled TERM signal will be sent to celery workers - ## It will lead to warm shutdown (waiting for the tasks to complete) and restart the container - ## If this setting is set numbers of pod restarts will increase - ## Comment this line out if you want to remove restarts - worker_shutdown_interval: "65m" - livenessProbe: - enabled: true - initialDelaySeconds: 30 - periodSeconds: 300 - timeoutSeconds: 10 - resources: - {} - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - - # Labels for celery pods - podLabels: {} - - ## Affinity for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity - affinity: {} - - ## Node labels for pod assignment - ## ref: https://kubernetes.io/docs/user-guide/node-selection/ - nodeSelector: {} - - ## Tolerations for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ - tolerations: [] - - ## Topology spread constraints for pod assignment - ## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/ - topologySpreadConstraints: [] - - ## Priority class for the pods - ## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/ - priorityClassName: "" - - # Extra containers which runs as sidecar - extraContainers: "" - # extraContainers: | - # - name: cloud-sql-proxy - # image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2 - # args: - # - --private-ip - # - --port=5432 - # - example:europe-west3:grafana-oncall-db - - # Extra volume mounts for the main container - extraVolumeMounts: [] - # - mountPath: /mnt/postgres-tls - # name: postgres-tls - # - mountPath: /mnt/redis-tls - # name: redis-tls - - # Extra volumes for the pod - extraVolumes: [] - # - name: postgres-tls - # configMap: - # name: my-postgres-tls - # defaultMode: 0640 - # - name: redis-tls - # configMap: - # name: my-redis-tls - # defaultMode: 0640 - -# Telegram polling pod configuration -telegramPolling: - enabled: false - resources: - {} - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - - # Labels for telegram-polling pods - podLabels: {} - - # Extra volume mounts for the main container - extraVolumeMounts: [] - # - mountPath: /mnt/postgres-tls - # name: postgres-tls - # - mountPath: /mnt/redis-tls - # name: redis-tls - - # Extra 
volumes for the pod - extraVolumes: [] - # - name: postgres-tls - # configMap: - # name: my-postgres-tls - # defaultMode: 0640 - # - name: redis-tls - # configMap: - # name: my-redis-tls - # defaultMode: 0640 - -oncall: - # this is intended to be used for local development. In short, it will mount the ./engine dir into - # any backend related containers, to allow hot-reloading + also run the containers with slightly modified - # startup commands (which configures the hot-reloading) - devMode: false - - # Override default MIRAGE_CIPHER_IV (must be 16 bytes long) - # For existing installation, this should not be changed. - # mirageCipherIV: 1234567890abcdef - # oncall secrets - secrets: - # Use existing secret. (secretKey and mirageSecretKey is required) - existingSecret: "" - # The key in the secret containing secret key - secretKey: "" - # The key in the secret containing mirage secret key - mirageSecretKey: "" - # Slack configures the Grafana Oncall Slack ChatOps integration. - slack: - # Enable the Slack ChatOps integration for the Oncall Engine. - enabled: false - # Sets the Slack bot slash-command - commandName: oncall - # clientId configures the Slack app OAuth2 client ID. - # api.slack.com/apps/ -> Basic Information -> App Credentials -> Client ID - clientId: ~ - # clientSecret configures the Slack app OAuth2 client secret. - # api.slack.com/apps/ -> Basic Information -> App Credentials -> Client Secret - clientSecret: ~ - # signingSecret - configures the Slack app signature secret used to sign - # requests comming from Slack. - # api.slack.com/apps/ -> Basic Information -> App Credentials -> Signing Secret - signingSecret: ~ - # Use existing secret for clientId, clientSecret and signingSecret. - # clientIdKey, clientSecretKey and signingSecretKey are required - existingSecret: "" - # The key in the secret containing OAuth2 client ID - clientIdKey: "" - # The key in the secret containing OAuth2 client secret - clientSecretKey: "" - # The key in the secret containing the Slack app signature secret - signingSecretKey: "" - # OnCall external URL - redirectHost: ~ - telegram: - enabled: false - token: ~ - webhookUrl: ~ - # Use existing secret. (tokenKey is required) - existingSecret: "" - # The key in the secret containing Telegram token - tokenKey: "" - smtp: - enabled: true - host: ~ - port: ~ - username: ~ - password: ~ - tls: ~ - fromEmail: ~ - exporter: - enabled: false - authToken: ~ - twilio: - # Twilio account SID/username to allow OnCall to send SMSes and make phone calls - accountSid: "" - # Twilio password to allow OnCall to send SMSes and make calls - authToken: "" - # Number from which you will receive calls and SMS - # (NOTE: must be quoted, otherwise would be rendered as float value) - phoneNumber: "" - # SID of Twilio service for number verification. You can create a service in Twilio web interface. - # twilio.com -> verify -> create new service - verifySid: "" - # Twilio API key SID/username to allow OnCall to send SMSes and make phone calls - apiKeySid: "" - # Twilio API key secret/password to allow OnCall to send SMSes and make phone calls - apiKeySecret: "" - # Use existing secret for authToken, phoneNumber, verifySid, apiKeySid and apiKeySecret. 
- existingSecret: "" - # Twilio password to allow OnCall to send SMSes and make calls - # The key in the secret containing the auth token - authTokenKey: "" - # The key in the secret containing the phone number - phoneNumberKey: "" - # The key in the secret containing verify service sid - verifySidKey: "" - # The key in the secret containing api key sid - apiKeySidKey: "" - # The key in the secret containing the api key secret - apiKeySecretKey: "" - # Phone notifications limit (the only non-secret value). - # TODO: rename to phoneNotificationLimit - limitPhone: - -# Whether to run django database migrations automatically -migrate: - enabled: true - # TTL can be unset by setting ttlSecondsAfterFinished: "" - ttlSecondsAfterFinished: 20 - # use a helm hook to manage the migration job - useHook: false - annotations: {} - - ## Affinity for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity - affinity: {} - - ## Node labels for pod assignment - ## ref: https://kubernetes.io/docs/user-guide/node-selection/ - nodeSelector: {} - - ## Tolerations for pod assignment - ## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/ - tolerations: [] - - # Extra containers which runs as sidecar - extraContainers: "" - # extraContainers: | - # - name: cloud-sql-proxy - # image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2 - # args: - # - --private-ip - # - --port=5432 - # - example:europe-west3:grafana-oncall-db - resources: - {} - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - - # Extra volume mounts for the main container - extraVolumeMounts: [] - # - mountPath: /mnt/postgres-tls - # name: postgres-tls - # - mountPath: /mnt/redis-tls - # name: redis-tls - - # Extra volumes for the pod - extraVolumes: [] - # - name: postgres-tls - # configMap: - # name: my-postgres-tls - # defaultMode: 0640 - # - name: redis-tls - # configMap: - # name: my-redis-tls - # defaultMode: 0640 - -# Sets environment variables with name capitalized and prefixed with UWSGI_, -# and dashes are substituted with underscores. -# see more: https://uwsgi-docs.readthedocs.io/en/latest/Configuration.html#environment-variables -# Set null to disable all UWSGI environment variables -uwsgi: - listen: 1024 - -# Additional env variables to add to deployments -env: {} - -# Enable ingress object for external access to the resources -ingress: - enabled: true - # className: "" - annotations: - kubernetes.io/ingress.class: "nginx" - cert-manager.io/issuer: "letsencrypt-prod" - tls: - - hosts: - - "{{ .Values.base_url }}" - secretName: certificate-tls - # Extra paths to prepend to the host configuration. 
If using something - # like an ALB ingress controller, you may want to configure SSL redirects - extraPaths: [] - # - path: /* - # backend: - # serviceName: ssl-redirect - # servicePort: use-annotation - ## Or for k8s > 1.19 - # - path: /* - # pathType: Prefix - # backend: - # service: - # name: ssl-redirect - # port: - # name: use-annotation - -# Whether to install ingress controller -ingress-nginx: - enabled: true - -# Install cert-manager as a part of the release -cert-manager: - enabled: true - # Instal CRD resources - installCRDs: true - webhook: - timeoutSeconds: 30 - # cert-manager tries to use the already used port, changing to another one - # https://github.com/cert-manager/cert-manager/issues/3237 - # https://cert-manager.io/docs/installation/compatibility/ - securePort: 10260 - # Fix self-checks https://github.com/jetstack/cert-manager/issues/4286 - podDnsPolicy: None - podDnsConfig: - nameservers: - - 8.8.8.8 - - 1.1.1.1 - -database: - # can be either mysql or postgresql - type: mysql - -# MySQL is included into this release for the convenience. -# It is recommended to host it separately from this release -# Set mariadb.enabled = false and configure externalMysql -mariadb: - enabled: true - auth: - database: oncall - existingSecret: - primary: - extraEnvVars: - - name: MARIADB_COLLATE - value: utf8mb4_unicode_ci - - name: MARIADB_CHARACTER_SET - value: utf8mb4 - secondary: - extraEnvVars: - - name: MARIADB_COLLATE - value: utf8mb4_unicode_ci - - name: MARIADB_CHARACTER_SET - value: utf8mb4 - -# Make sure to create the database with the following parameters: -# CREATE DATABASE oncall CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci; -externalMysql: - host: - port: - db_name: - user: - password: - # Use an existing secret for the mysql password. - existingSecret: - # The key in the secret containing the mysql username - usernameKey: - # The key in the secret containing the mysql password - passwordKey: - # Extra options (see example below) - # Reference: https://pymysql.readthedocs.io/en/latest/modules/connections.html - options: - # options: >- - # ssl_verify_cert=true - # ssl_verify_identity=true - # ssl_ca=/mnt/mysql-tls/ca.crt - # ssl_cert=/mnt/mysql-tls/client.crt - # ssl_key=/mnt/mysql-tls/client.key - -# PostgreSQL is included into this release for the convenience. -# It is recommended to host it separately from this release -# Set postgresql.enabled = false and configure externalPostgresql -postgresql: - enabled: false - auth: - database: oncall - existingSecret: - -# Make sure to create the database with the following parameters: -# CREATE DATABASE oncall WITH ENCODING UTF8; -externalPostgresql: - host: - port: - db_name: - user: - password: - # Use an existing secret for the database password - existingSecret: - # The key in the secret containing the database password - passwordKey: - # Extra options (see example below) - # Reference: https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS - options: - # options: >- - # sslmode=verify-full - # sslrootcert=/mnt/postgres-tls/ca.crt - # sslcert=/mnt/postgres-tls/client.crt - # sslkey=/mnt/postgres-tls/client.key - -# RabbitMQ is included into this release for the convenience. 
-# It is recommended to host it separately from this release -# Set rabbitmq.enabled = false and configure externalRabbitmq -rabbitmq: - enabled: true - auth: - existingPasswordSecret: - -broker: - type: rabbitmq - -externalRabbitmq: - host: - port: - user: - password: - protocol: - vhost: - # Use an existing secret for the rabbitmq password - existingSecret: - # The key in the secret containing the rabbitmq password - passwordKey: "" - # The key in the secret containing the rabbitmq username - usernameKey: username - -# Redis is included into this release for the convenience. -# It is recommended to host it separately from this release -redis: - enabled: true - auth: - existingSecret: - -externalRedis: - protocol: - host: - port: - database: - username: - password: - # Use an existing secret for the redis password - existingSecret: - # The key in the secret containing the redis password - passwordKey: - - # SSL options - ssl_options: - enabled: false - # CA certificate - ca_certs: - # Client SSL certs - certfile: - keyfile: - # SSL verification mode: "cert_none" | "cert_optional" | "cert_required" - cert_reqs: - -# Grafana is included into this release for the convenience. -# It is recommended to host it separately from this release -grafana: - enabled: true - grafana.ini: - server: - domain: example.com - root_url: "%(protocol)s://%(domain)s/grafana" - serve_from_sub_path: true - persistence: - enabled: true - # Disable psp as PodSecurityPolicy is deprecated in v1.21+, unavailable in v1.25+ - rbac: - pspEnabled: false - plugins: - - grafana-oncall-app - -externalGrafana: - # Example: https://grafana.mydomain.com - url: - -nameOverride: "" -fullnameOverride: "" - -serviceAccount: - # Specifies whether a service account should be created - create: true - # Annotations to add to the service account - annotations: {} - # The name of the service account to use. - # If not set and create is true, a name is generated using the fullname template - name: "" - -podAnnotations: {} - -podSecurityContext: - {} - # fsGroup: 2000 - -securityContext: - {} - # capabilities: - # drop: - # - ALL - # readOnlyRootFilesystem: true - # runAsNonRoot: true - # runAsGroup: 2000 - # runAsUser: 1000 - -init: - securityContext: - {} - # allowPrivilegeEscalation: false - # capabilities: - # drop: - # - ALL - # privileged: false - # readOnlyRootFilesystem: true - # runAsGroup: 2000 - # runAsNonRoot: true - # runAsUser: 1000 - resources: - {} - # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - -ui: - # this is intended to be used for local development. 
In short, it will spin up an additional container - # running the plugin frontend, such that hot reloading can be enabled - enabled: false - image: - repository: oncall/ui - tag: dev - # Additional env vars for the ui container - env: {} - -prometheus: - enabled: false - # extraScrapeConfigs: | - # - job_name: 'oncall-exporter' - # metrics_path: /metrics/ - # static_configs: - # - targets: - # - oncall-dev-engine.default.svc.cluster.local:8080 diff --git a/packages/system/grafana-oncall/values.yaml b/packages/system/grafana-oncall/values.yaml deleted file mode 100644 index a660e0c1..00000000 --- a/packages/system/grafana-oncall/values.yaml +++ /dev/null @@ -1,19 +0,0 @@ -oncall: - database: - type: "postgresql" - broker: - type: "redis" - ingress: - enabled: false - cert-manager: - enabled: false - mariadb: - enabled: false - postgresql: - enabled: false - rabbitmq: - enabled: false - redis: - enabled: false - grafana: - enabled: false diff --git a/packages/system/monitoring/Makefile b/packages/system/monitoring/Makefile index b0425ded..61780ca7 100644 --- a/packages/system/monitoring/Makefile +++ b/packages/system/monitoring/Makefile @@ -19,3 +19,26 @@ update: helm repo add fluent https://fluent.github.io/helm-charts helm repo update fluent helm pull fluent/fluent-bit --untar --untardir charts + # alerts from victoria-metrics-k8s-stack + helm repo add vm https://victoriametrics.github.io/helm-charts/ + helm repo update vm + helm pull vm/victoria-metrics-k8s-stack --untar --untardir charts + rm -rf charts/victoria-metrics-k8s-stack/charts + rm -rf charts/victoria-metrics-k8s-stack/hack + rm -rf charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator + rm -rf charts/victoria-metrics-k8s-stack/templates/grafana + rm -rf charts/victoria-metrics-k8s-stack/templates/ingress.yaml + rm -rf charts/victoria-metrics-k8s-stack/files/dashboards + rm -f charts/victoria-metrics-k8s-stack/templates/servicemonitors.yaml + rm -f charts/victoria-metrics-k8s-stack/templates/serviceaccount.yaml + rm -f charts/victoria-metrics-k8s-stack/templates/rules/additionalVictoriaMetricsRules.yml + sed -i '/ namespace:/d' charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml + sed -i 's|job="apiserver"|job="kube-apiserver"|g' `grep -rl 'job="apiserver"' charts/victoria-metrics-k8s-stack/files/rules/generated` + sed -i 's|severity: info|severity: informational|g' `grep -rl 'severity: info' ./charts/victoria-metrics-k8s-stack/files/rules/generated` + sed -i 's|severity: none|severity: ok|g' ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml + sed -i ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml \ + -e '/Watchdog/,/severity:/s/severity: none/severity: ok/' \ + -e '/InfoInhibitor/,/severity:/s/severity: none/severity: major/' + # TODO + rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml + rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/vm*.yaml diff --git a/packages/system/monitoring/alerts/general.yaml b/packages/system/monitoring/alerts/general.yaml deleted file mode 100644 index adf82635..00000000 --- a/packages/system/monitoring/alerts/general.yaml +++ /dev/null @@ -1,57 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: alerts-general - namespace: cozy-monitoring -spec: - groups: - - name: coreos.general - labels: - alertname: foo - rules: - - alert: DeadMansSwitch - expr: vector(1) - labels: - severity_level: "4" - annotations: - description: This is a 
DeadMansSwitch meant to ensure that the entire Alerting - pipeline is functional. - summary: Alerting DeadMansSwitch - - record: fd_utilization - expr: process_open_fds / process_max_fds - - alert: FdExhaustionClose - expr: max(predict_linear(fd_utilization{pod!=""}[1h], 3600 * 4)) BY (job, namespace, pod) > 1 - for: 10m - labels: - severity_level: "4" - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next 4 hours' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: max(predict_linear(fd_utilization{pod=""}[1h], 3600 * 4)) BY (job, instance) > 1 - for: 10m - labels: - severity_level: "4" - annotations: - description: '{{ $labels.job }}: {{ $labels.instance }} instance - will exhaust in file/socket descriptors within the next 4 hours' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: max(predict_linear(fd_utilization{pod!=""}[10m], 3600)) BY (job, namespace, pod) > 1 - for: 10m - labels: - severity_level: "3" - annotations: - description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance - will exhaust in file/socket descriptors within the next hour' - summary: file descriptors soon exhausted - - alert: FdExhaustionClose - expr: max(predict_linear(fd_utilization{pod=""}[10m], 3600)) BY (job, instance) > 1 - for: 10m - labels: - severity_level: "3" - annotations: - description: '{{ $labels.job }}: {{ $labels.instance }} instance - will exhaust in file/socket descriptors within the next hour' - summary: file descriptors soon exhausted diff --git a/packages/system/monitoring/alerts/kube-dns.yaml b/packages/system/monitoring/alerts/kube-dns.yaml deleted file mode 100644 index 4310707b..00000000 --- a/packages/system/monitoring/alerts/kube-dns.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: alerts-node - namespace: cozy-monitoring -spec: - groups: - - name: kubernetes.dns - rules: - - alert: KubernetesDnsTargetDown - expr: absent(up{job="kube-dns"} == 1) - for: 5m - labels: - severity_level: "5" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: "markdown" - description: |- - Prometheus is unable to collect metrics from kube-dns. Thus its status is unknown. - - To debug the problem, use the following commands: - 1. `kubectl -n kube-system describe deployment -l k8s-app=kube-dns` - 2. `kubectl -n kube-system describe pod -l k8s-app=kube-dns` - summary: Kube-dns or CoreDNS are not under monitoring. 
diff --git a/packages/system/monitoring/alerts/kube-state-metrics.yaml b/packages/system/monitoring/alerts/kube-state-metrics.yaml deleted file mode 100644 index a671283f..00000000 --- a/packages/system/monitoring/alerts/kube-state-metrics.yaml +++ /dev/null @@ -1,50 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: alerts-kube-state-metrics - namespace: cozy-monitoring -spec: - groups: - - name: kube-state-metrics - rules: - - alert: DeploymentGenerationMismatch - expr: max(kube_deployment_status_observed_generation != kube_deployment_metadata_generation) by (namespace, deployment) - for: 15m - labels: - severity_level: "4" - annotations: - plk_protocol_version: "1" - description: Observed deployment generation does not match expected one for - deployment {{$labels.namespace}}/{{$labels.deployment}} - summary: Deployment is outdated - - alert: KubeStateMetricsDown - expr: absent(up{job="kube-state-metrics"} == 1) - for: 5m - labels: - severity_level: "3" - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - description: |- - There are no metrics about cluster resources for 5 minutes. - - Most alerts an monitroing panels aren't working. - - To debug the problem: - 1. Check kube-state-metrics pods: `kubectl -n d8-monitoring describe pod -l app=kube-state-metrics` - 2. Check its logs: `kubectl -n d8-monitoring describe deploy kube-state-metrics` - summary: > - Kube-state-metrics is not working in the cluster. - - name: kubernetes.rules - rules: - - alert: KubePodCrashLooping - expr: | - max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1 - annotations: - description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container - }}) is in waiting state (reason: "CrashLoopBackOff").' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping - summary: Pod is crash looping. 
- for: 15m - labels: - severity: warning diff --git a/packages/system/monitoring/alerts/kubelet.yaml b/packages/system/monitoring/alerts/kubelet.yaml deleted file mode 100644 index d0719f2d..00000000 --- a/packages/system/monitoring/alerts/kubelet.yaml +++ /dev/null @@ -1,63 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: alerts-kubelet - namespace: cozy-monitoring -spec: - groups: - - name: coreos.kubelet - rules: - - alert: K8SNodeNotReady - expr: min(kube_node_status_condition{condition="Ready",status="true"}) BY (node) == 0 and - min(kube_node_spec_unschedulable == 0) by (node) - for: 1m - labels: - severity_level: "3" - annotations: - plk_protocol_version: "1" - description: The Kubelet on {{ $labels.node }} has not checked in with the API, - or has set itself to NotReady, for more than 10 minutes - summary: Node status is NotReady - - alert: K8SManyNodesNotReady - expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0 and on (node) kube_node_spec_unschedulable == 0) > 1 - and (count(kube_node_status_condition{condition="Ready",status="true"} == 0 and on (node) kube_node_spec_unschedulable == 0) / - count(kube_node_status_condition{condition="Ready",status="true"} and on (node) kube_node_spec_unschedulable == 0)) > 0.2 - for: 1m - labels: - severity_level: "3" - annotations: - plk_protocol_version: "1" - description: '{{ $value }}% of Kubernetes nodes are not ready' - summary: Too many nodes are not ready - - alert: K8SKubeletDown - expr: (count(up{job="kubelet"} == 0) or absent(up{job="kubelet"} == 1)) / count(up{job="kubelet"}) * 100 > 3 - for: 10m - labels: - severity_level: "4" - tier: "cluster" - annotations: - plk_protocol_version: "1" - plk_group_for__target_down: "TargetDown,prometheus=deckhouse,job=kubelet,kubernetes=~kubernetes" - description: Prometheus failed to scrape {{ $value }}% of kubelets. - summary: A few kubelets cannot be scraped - - alert: K8SKubeletDown - expr: (count(up{job="kubelet"} == 0) or absent(up{job="kubelet"} == 1)) / count(up{job="kubelet"}) * 100 > 10 - for: 30m - labels: - severity_level: "3" - tier: "cluster" - annotations: - plk_protocol_version: "1" - plk_group_for__target_down: "TargetDown,prometheus=deckhouse,job=kubelet,kubernetes=~kubernetes" - description: Prometheus failed to scrape {{ $value }}% of kubelets. 
- summary: Many kubelets cannot be scraped - - alert: K8SKubeletTooManyPods - expr: kubelet_running_pods > on(node) (kube_node_status_capacity{resource="pods",unit="integer"}) * 0.9 - for: 10m - labels: - severity_level: "7" - annotations: - plk_protocol_version: "1" - description: Kubelet {{ $labels.node }} is running {{ $value }} pods, close - to the limit of {{ printf "kube_node_status_capacity{resource=\"pods\",unit=\"integer\",node=\"%s\"}" $labels.node | query | first | value }} - summary: Kubelet is close to pod limit diff --git a/packages/system/monitoring/alerts/node-disk-usage.yaml b/packages/system/monitoring/alerts/node-disk-usage.yaml deleted file mode 100644 index e7ceff1f..00000000 --- a/packages/system/monitoring/alerts/node-disk-usage.yaml +++ /dev/null @@ -1,357 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: alerts-node - namespace: cozy-monitoring -spec: - groups: - - name: kubernetes.node.disk_inodes_usage - rules: - - alert: KubeletNodeFSInodesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 < - max by (node, mountpoint) (kubelet_eviction_nodefs_inodes{type="soft"}) - ) - for: 10m - labels: - severity_level: "9" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_nodefs_inodes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - - alert: KubeletNodeFSInodesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 < - max by (node, mountpoint) (kubelet_eviction_nodefs_inodes{type="hard"} + 5) - ) - for: 5m - labels: - severity_level: "7" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. - - Threshold at: {{ printf "kubelet_eviction_nodefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: > - Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. 
- - - alert: KubeletNodeFSInodesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 < - max by (node, mountpoint) (kubelet_eviction_nodefs_inodes{type="hard"}) - ) - labels: - severity_level: "6" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_nodefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - - alert: KubeletNodeFSInodesUsage - expr: | - ( - ( - max by (node, mountpoint) (node_filesystem_files_free) - ) == 0 - ) - * (max by (node, mountpoint) ({__name__=~"kubelet_eviction_nodefs_inodes"})) - labels: - severity_level: "5" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - summary: No more free inodes on nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. - - - alert: KubeletImageFSInodesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 < - max by (node, mountpoint) (kubelet_eviction_imagefs_inodes{type="soft"}) - ) - for: 10m - labels: - severity_level: "9" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Soft eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_imagefs_inodes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Soft eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. 
- - - alert: KubeletImageFSInodesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 < - max by (node, mountpoint) (kubelet_eviction_imagefs_inodes{type="hard"} + 5) - ) - for: 5m - labels: - severity_level: "7" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Close to hard eviction threshold of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. - - Threshold at: {{ printf "kubelet_eviction_imagefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Close to hard eviction threshold of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. - - - alert: KubeletImageFSInodesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 < - max by (node, mountpoint) (kubelet_eviction_imagefs_inodes{type="hard"}) - ) - labels: - severity_level: "6" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Hard eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_imagefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Hard eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - - alert: KubeletImageFSInodesUsage - expr: | - ( - ( - max by (node, mountpoint) (node_filesystem_files_free) - ) == 0 - ) - * (max by (node, mountpoint) ({__name__=~"kubelet_eviction_imagefs_inodes"})) - labels: - severity_level: "5" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - summary: No more free inodes on imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. 
- - - name: kubernetes.node.disk_bytes_usage - rules: - - alert: KubeletNodeFSBytesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < - max by (node, mountpoint) (kubelet_eviction_nodefs_bytes{type="soft"}) - ) - for: 10m - labels: - severity_level: "9" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_nodefs_bytes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - - alert: KubeletNodeFSBytesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < - max by (node, mountpoint) (kubelet_eviction_nodefs_bytes{type="hard"} + 5) - ) - for: 5m - labels: - severity_level: "7" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. - - Threshold at: {{ printf "kubelet_eviction_nodefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. - - - alert: KubeletNodeFSBytesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < - max by (node, mountpoint) (kubelet_eviction_nodefs_bytes{type="hard"}) - ) - labels: - severity_level: "6" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_nodefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. 
- - - alert: KubeletNodeFSBytesUsage - expr: | - ( - ( - max by (node, mountpoint) (node_filesystem_avail_bytes) - ) == 0 - ) - * (max by (node, mountpoint) ({__name__=~"kubelet_eviction_nodefs_bytes"})) - labels: - severity_level: "5" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - summary: No more free space on nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. - - - alert: KubeletImageFSBytesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < - max by (node, mountpoint) (kubelet_eviction_imagefs_bytes{type="soft"}) - ) - for: 10m - labels: - severity_level: "9" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Soft eviction of imagefs (filesystem that the container runtime uses for storing images and container writable layers) on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_imagefs_bytes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Soft eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - - alert: KubeletImageFSBytesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < - max by (node, mountpoint) (kubelet_eviction_imagefs_bytes{type="hard"} + 5) - ) - for: 5m - labels: - severity_level: "7" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Close to hard eviction threshold of imagefs (filesystem that the container runtime uses for storing images and container writable layers) on node {{$labels.node}} mountpoint {{$labels.mountpoint}}. - - Threshold at: {{ printf "kubelet_eviction_imagefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Close to hard eviction threshold of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. 
- - - alert: KubeletImageFSBytesUsage - expr: | - ( - max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < - max by (node, mountpoint) (kubelet_eviction_imagefs_bytes{type="hard"}) - ) - labels: - severity_level: "6" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - Hard eviction of imagefs (filesystem that the container runtime uses for storing images and container writable layers) on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - Threshold at: {{ printf "kubelet_eviction_imagefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}% - - Currently at: {{ .Value }}% - summary: Hard eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress. - - - alert: KubeletImageFSBytesUsage - expr: | - ( - ( - max by (node, mountpoint) (node_filesystem_avail_bytes) - ) == 0 - ) - * (max by (node, mountpoint) ({__name__=~"kubelet_eviction_imagefs_bytes"})) - labels: - severity_level: "5" - tier: cluster - annotations: - plk_protocol_version: "1" - plk_markup_format: markdown - plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - No more free bytes on imagefs (filesystem that the container runtime uses for storing images and container writable layers) on node {{$labels.node}} mountpoint {{$labels.mountpoint}}. - summary: No more free bytes on imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint. 
diff --git a/packages/system/monitoring/alerts/node.yaml b/packages/system/monitoring/alerts/node.yaml deleted file mode 100644 index 88b6f72b..00000000 --- a/packages/system/monitoring/alerts/node.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: alerts-node - namespace: cozy-monitoring -spec: - groups: - - name: coreos.node - rules: - - record: node:node_cpu:rate:sum - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) - BY (node) - - record: node:node_filesystem_usage:sum - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) - BY (node) - - record: node:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_receive_bytes[3m])) BY (node) - - record: node:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes[3m])) BY (node) - - record: node:node_cpu:ratio - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(node) - GROUP_LEFT() count(sum(node_cpu) BY (node, cpu)) BY (node) - - record: cluster:node_cpu:sum_rate5m - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) - - record: cluster:node_cpu:ratio - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (node, cpu)) - - alert: NodeExporterDown - expr: absent(up{job="node-exporter"} == 1) - for: 10m - labels: - severity_level: "3" - annotations: - plk_protocol_version: "1" - description: Prometheus could not scrape a node-exporter for more than 10m, - or node-exporters have disappeared from discovery - summary: Prometheus could not scrape a node-exporter diff --git a/packages/system/monitoring/alerts/pod-status.yaml b/packages/system/monitoring/alerts/pod-status.yaml deleted file mode 100644 index 9070d2e8..00000000 --- a/packages/system/monitoring/alerts/pod-status.yaml +++ /dev/null @@ -1,34 +0,0 @@ -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: alerts-node - namespace: cozy-monitoring -spec: - groups: - - name: kubernetes.pod_status_incorrect - rules: - - alert: PodStatusIsIncorrect - expr: > - (count by (node, namespace, pod) (kube_pod_status_ready{condition="true"} == 0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info))) - and - ( - (count by (namespace, pod) (kube_pod_container_status_ready==1) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info))) - unless - (count by (namespace, pod) (kube_pod_container_status_ready==0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info))) - ) - for: 10m - annotations: - plk_markup_format: markdown - plk_protocol_version: "1" - plk_create_group_if_not_exists__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - plk_grouped_by__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes" - description: | - There is a {{ $labels.namespace }}/{{ $labels.pod }} Pod in the cluster that runs on the {{ $labels.node }} and listed as NotReady while all the Pod's containers are Ready. - - This could be due to the [Kubernetes bug](https://github.com/kubernetes/kubernetes/issues/80968). - - The recommended course of action: - 1. 
Find all the Pods having this state: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | "\(.spec.nodeName)/\(.metadata.namespace)/\(.metadata.name)"'`; - 2. Find all the Nodes affected: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | .spec.nodeName' -r | sort | uniq -c`; - 3. Restart `kubelet` on each Node: `systemctl restart kubelet`. - summary: The state of the {{ $labels.namespace }}/{{ $labels.pod }} Pod running on the {{ $labels.node }} Node is incorrect. You need to restart `kubelet`. diff --git a/packages/system/grafana-oncall/charts/oncall/.helmignore b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/.helmignore similarity index 87% rename from packages/system/grafana-oncall/charts/oncall/.helmignore rename to packages/system/monitoring/charts/victoria-metrics-k8s-stack/.helmignore index fd6e5fd3..8148a4ff 100644 --- a/packages/system/grafana-oncall/charts/oncall/.helmignore +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/.helmignore @@ -22,5 +22,5 @@ *.tmproj .vscode/ -# exclude helm unit tests -tests/ +# Ignore img folder used for documentation +img/ diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/CHANGELOG.md b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/CHANGELOG.md new file mode 100644 index 00000000..d044b526 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/CHANGELOG.md @@ -0,0 +1,688 @@ +## Next release + +- TODO + +## 0.25.17 + +**Release date:** 2024-09-20 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829) +- Fixed ETCD dashboard +- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260) +- Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration. + +## 0.25.16 + +**Release date:** 2024-09-10 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Do not truncate servicemonitor, datasources, rules, dashboard, alertmanager & vmalert templates names +- Use service label for node-exporter instead of podLabel. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1458) +- Added common chart to a k8s-stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1456) +- Fixed value of custom alertmanager configSecret. 
See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1461)
+
+## 0.25.15
+
+**Release date:** 2024-09-05
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- Drop empty endpoints param from scrape configuration
+- Fixed proto when TLS is enabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1449)
+
+## 0.25.14
+
+**Release date:** 2024-09-04
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- fixed alertmanager templates
+
+## 0.25.13
+
+**Release date:** 2024-09-04
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- Use operator's own service monitor
+
+## 0.25.12
+
+**Release date:** 2024-09-03
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- Fixed dashboards rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1414)
+- Fixed service monitor label name.
+
+## 0.25.11
+
+**Release date:** 2024-09-03
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- Merged ingress templates
+- Removed custom VMServiceScrape for operator
+- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
+- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
+- Migrated Node Exporter dashboard into chart
+- Deprecated `grafana.sidecar.jsonData`, `grafana.provisionDefaultDatasource` in favour of the `grafana.sidecar.datasources.default` slice of datasources.
+- Fail if no notifiers are set, do not set `notifiers` to null if empty
+
+## 0.25.10
+
+**Release date:** 2024-08-31
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- fixed ingress extraPaths and externalVM urls rendering
+
+## 0.25.9
+
+**Release date:** 2024-08-31
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- fixed vmalert ingress name typo
+- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
+- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
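As an illustration of the `notifiers.blackhole` note from release 0.25.17 above, a minimal values sketch could look like the following; only the `vmalert.spec.extraArgs` path is taken from that entry, everything else is an assumed default.

```yaml
# values.yaml sketch — run vmalert with no notifiers configured (assumption-based example)
vmalert:
  enabled: true
  spec:
    extraArgs:
      # required when no Alertmanager/notifiers are configured (see the 0.25.17 entry)
      notifiers.blackhole: "true"
```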
+ +## 0.25.8 + +**Release date:** 2024-08-30 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- fixed external notifiers rendering, when alertmanager is disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1378) + +## 0.25.7 + +**Release date:** 2024-08-30 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- fixed extra rules template context + +## 0.25.6 + +**Release date:** 2024-08-29 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +**Update note**: Update `kubeProxy.spec` to `kubeProxy.vmScrape.spec` + +**Update note**: Update `kubeScheduler.spec` to `kubeScheduler.vmScrape.spec` + +**Update note**: Update `kubeEtcd.spec` to `kubeEtcd.vmScrape.spec` + +**Update note**: Update `coreDns.spec` to `coreDns.vmScrape.spec` + +**Update note**: Update `kubeDns.spec` to `kubeDns.vmScrape.spec` + +**Update note**: Update `kubeProxy.spec` to `kubeProxy.vmScrape.spec` + +**Update note**: Update `kubeControllerManager.spec` to `kubeControllerManager.vmScrape.spec` + +**Update note**: Update `kubeApiServer.spec` to `kubeApiServer.vmScrape.spec` + +**Update note**: Update `kubelet.spec` to `kubelet.vmScrape.spec` + +**Update note**: Update `kube-state-metrics.spec` to `kube-state-metrics.vmScrape.spec` + +**Update note**: Update `prometheus-node-exporter.spec` to `prometheus-node-exporter.vmScrape.spec` + +**Update note**: Update `grafana.spec` to `grafana.vmScrape.spec` + +- bump version of VM components to [v1.103.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.103.0) +- Added `dashboards.` bool flag to enable dashboard even if component it is for is not installed. 
+- Allow extra `vmalert.notifiers` without dropping default notifier if `alertmanager.enabled: true` +- Do not drop default notifier, when vmalert.additionalNotifierConfigs is set +- Replaced static url proto with a template, which selects proto depending on a present tls configuration +- Moved kubernetes components monitoring config from `spec` config to `vmScrape.spec` +- Merged servicemonitor templates + +## 0.25.5 + +**Release date:** 2024-08-26 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- TODO + +## 0.25.4 + +**Release date:** 2024-08-26 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- updates operator to [v0.47.2](https://github.com/VictoriaMetrics/operator/releases/tag/v0.47.2) +- kube-state-metrics - 5.16.4 -> 5.25.1 +- prometheus-node-exporter - 4.27.0 -> 4.29.0 +- grafana - 8.3.8 -> 8.4.7 +- added configurable `.Values.global.clusterLabel` to all alerting and recording rules `by` and `on` expressions + +## 0.25.3 + +**Release date:** 2024-08-23 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- updated operator to v0.47.1 release +- Build `app.kubernetes.io/instance` label consistently. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1282) + +## 0.25.2 + +**Release date:** 2024-08-21 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- fixed vmalert ingress name. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1271) +- fixed alertmanager ingress host template rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1270) + +## 0.25.1 + +**Release date:** 2024-08-21 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Added `.Values.global.license` configuration +- Fixed extraLabels rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1248) +- Fixed vmalert url to alertmanager by including its path prefix +- Removed `networking.k8s.io/v1beta1/Ingress` and `extensions/v1beta1/Ingress` support +- Fixed kubedns servicemonitor template. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1255) + +## 0.25.0 + +**Release date:** 2024-08-16 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +**Update note**: it requires to update CRD dependency manually before upgrade + +**Update note**: requires Helm 3.14+ + +- Moved dashboards templating logic out of sync script to Helm template +- Allow to disable default grafana datasource +- Synchronize Etcd dashboards and rules with mixin provided by Etcd +- Add alerting rules for VictoriaMetrics operator. 
+- Updated alerting rules for VictoriaMetrics components.
+- Fixed exact rule annotations propagation to other rules.
+- Set minimal kubernetes version to 1.25
+- updates operator to v0.47.0 version
+
+## 0.24.5
+
+**Release date:** 2024-08-01
+
+![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- bump version of VM components to [v1.102.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.1)
+
+## 0.24.4
+
+**Release date:** 2024-08-01
+
+![AppVersion: v1.102.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- Update dependencies: grafana -> 8.3.6.
+- Added `.Values.defaultRules.alerting` and `.Values.defaultRules.recording` to set up common properties for all alerting and recording rules
+
+## 0.24.3
+
+**Release date:** 2024-07-23
+
+![AppVersion: v1.102.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- bump version of VM components to [v1.102.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0)
+
+## 0.24.2
+
+**Release date:** 2024-07-15
+
+![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- fix vmalertmanager configuration when using `.VMAlertmanagerSpec.ConfigRawYaml`. See [this pull request](https://github.com/VictoriaMetrics/helm-charts/pull/1136).
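The 0.24.4 entry above introduces `.Values.defaultRules.alerting` and `.Values.defaultRules.recording` for properties shared by all alerting and recording rules. A hedged sketch follows; the nested `spec.labels`/`spec.annotations` layout is an assumption inferred from the 0.24.0 notes below, and the values are purely illustrative.

```yaml
# values.yaml sketch — common properties for all generated rules (assumed structure)
defaultRules:
  alerting:
    spec:
      labels:
        team: platform          # hypothetical label added to every alerting rule
  recording:
    spec:
      annotations:
        source: k8s-stack       # hypothetical annotation added to every recording rule
```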
+
+## 0.24.1
+
+**Release date:** 2024-07-10
+
+![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- updates operator to v0.46.4
+
+## 0.24.0
+
+**Release date:** 2024-07-10
+
+![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- added ability to override alerting rules labels and annotations:
+- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
+- for all rules in a group - `.Values.defaultRules.groups..rules.spec.labels` and `.Values.defaultRules.groups..rules.spec.annotations`
+- for each rule individually - `.Values.defaultRules.rules..spec.labels` and `.Values.defaultRules.rules..spec.annotations`
+- changed `.Values.defaultRules.rules.` to `.Values.defaultRules.groups..create`
+- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups..targetNamespace`
+- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups..spec.params`
+
+## 0.23.6
+
+**Release date:** 2024-07-08
+
+![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- added ability to override alerting rules labels and annotations:
+- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
+- for all rules in a group - `.Values.defaultRules.groups..rules.spec.labels` and `.Values.defaultRules.groups..rules.spec.annotations`
+- for each rule individually - `.Values.defaultRules.rules..spec.labels` and `.Values.defaultRules.rules..spec.annotations`
+- changed `.Values.defaultRules.rules.` to `.Values.defaultRules.groups..create`
+- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups..targetNamespace`
+- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups..spec.params`
+
+## 0.23.5
+
+**Release date:** 2024-07-04
+
+![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- Support configuring vmalert `-notifier.config` with `.Values.vmalert.additionalNotifierConfigs`.
+
+## 0.23.4
+
+**Release date:** 2024-07-02
+
+![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
+![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
+
+- Add `extraObjects` to allow deploying additional resources with the chart release.
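For the `extraObjects` feature noted in 0.23.4 above, a minimal sketch of shipping an arbitrary extra manifest with the release; the ConfigMap and its name are hypothetical.

```yaml
# values.yaml sketch — extra manifests rendered together with the chart (illustrative only)
extraObjects:
  - apiVersion: v1
    kind: ConfigMap
    metadata:
      name: example-extra-config   # hypothetical resource name
    data:
      note: "deployed alongside the release"
```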
+ +## 0.23.3 + +**Release date:** 2024-06-26 + +![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Enable [conversion of Prometheus CRDs](https://docs.victoriametrics.com/operator/migration/#objects-conversion) by default. See [this](https://github.com/VictoriaMetrics/helm-charts/pull/1069) pull request for details. +- use bitnami/kubectl image for cleanup instead of deprecated gcr.io/google_containers/hyperkube + +## 0.23.2 + +**Release date:** 2024-06-14 + +![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Do not add `cluster` external label at VMAgent by default. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/774) for the details. + +## 0.23.1 + +**Release date:** 2024-06-10 + +![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- updates operator to v0.45.0 release +- sync latest vm alerts and dashboards. + +## 0.23.0 + +**Release date:** 2024-05-30 + +![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- sync latest etcd v3.5.x rules from [upstream](https://github.com/etcd-io/etcd/blob/release-3.5/contrib/mixin/mixin.libsonnet). +- add Prometheus operator CRDs as an optional dependency. See [this PR](https://github.com/VictoriaMetrics/helm-charts/pull/1022) and [related issue](https://github.com/VictoriaMetrics/helm-charts/issues/341) for the details. + +## 0.22.1 + +**Release date:** 2024-05-14 + +![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- fix missing serviceaccounts patch permission in VM operator, see [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1012) for details. 
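Regarding the 0.23.2 entry above, which stops adding the `cluster` external label at VMAgent by default: if that label is still wanted, a sketch along these lines should bring it back, assuming the VMAgent CRD's `externalLabels` field is reachable through `vmagent.spec`; the cluster name is hypothetical.

```yaml
# values.yaml sketch — re-add a cluster external label on VMAgent (assumption-based)
vmagent:
  spec:
    externalLabels:
      cluster: my-cluster   # hypothetical cluster identifier
```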
+ +## 0.22.0 + +**Release date:** 2024-05-10 + +![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM operator to [0.44.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.44.0) + +## 0.21.3 + +**Release date:** 2024-04-26 + +![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM components to [v1.101.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.101.0) + +## 0.21.2 + +**Release date:** 2024-04-23 + +![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM operator to [0.43.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.3) + +## 0.21.1 + +**Release date:** 2024-04-18 + +![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +## 0.21.0 + +**Release date:** 2024-04-18 + +![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- TODO + +- bump version of VM operator to [0.43.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.0) +- updates CRDs definitions. + +## 0.20.1 + +**Release date:** 2024-04-16 + +![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- upgraded dashboards and alerting rules, added values file for local (Minikube) setup +- bump version of VM components to [v1.100.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.100.1) + +## 0.20.0 + +**Release date:** 2024-04-02 + +![AppVersion: v1.99.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.99.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM operator to [0.42.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.42.3) + +## 0.19.4 + +**Release date:** 2024-03-05 + +![AppVersion: v1.99.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.99.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM components to [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0) + +## 0.19.3 + +**Release date:** 2024-03-05 + +![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Commented default configuration for alertmanager. It simplifies configuration and makes it more explicit. See this [issue](https://github.com/VictoriaMetrics/helm-charts/issues/473) for details. +- Allow enabling/disabling default k8s rules when installing. 
See [#904](https://github.com/VictoriaMetrics/helm-charts/pull/904) by @passie. + +## 0.19.2 + +**Release date:** 2024-02-26 + +![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Fix templating of VMAgent `remoteWrite` in case both `VMSingle` and `VMCluster` are disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/865) for details. + +## 0.19.1 + +**Release date:** 2024-02-21 + +![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Update dependencies: victoria-metrics-operator -> 0.28.1, grafana -> 7.3.1. +- Update victoriametrics CRD resources yaml. + +## 0.19.0 + +**Release date:** 2024-02-09 + +![AppVersion: v1.97.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.97.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Do not store original labels in `vmagent`'s memory by default. This reduces memory usage of `vmagent` but makes `vmagent`'s debugging UI less informative. See [this docs](https://docs.victoriametrics.com/vmagent/#relabel-debug) for details on relabeling debug. +- Update dependencies: kube-state-metrics -> 5.16.0, prometheus-node-exporter -> 4.27.0, grafana -> 7.3.0. +- Update victoriametrics CRD resources yaml. +- Update builtin dashboards and rules. + +## 0.18.12 + +**Release date:** 2024-02-01 + +![AppVersion: v1.97.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.97.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM components to [v1.97.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.97.1) +- Fix helm lint when ingress resources enabled - split templates of resources per kind. See [#820](https://github.com/VictoriaMetrics/helm-charts/pull/820) by @MemberIT. + +## 0.18.11 + +**Release date:** 2023-12-15 + +![AppVersion: v1.96.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.96.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Fix missing `.Values.defaultRules.rules.vmcluster` value. See [#801](https://github.com/VictoriaMetrics/helm-charts/pull/801) by @MemberIT. + +## 0.18.10 + +**Release date:** 2023-12-12 + +![AppVersion: v1.96.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.96.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM components to [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0) +- Add optional allowCrossNamespaceImport to GrafanaDashboard(s) (#788) + +## 0.18.9 + +**Release date:** 2023-12-08 + +![AppVersion: v1.95.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Properly use variable from values file for Grafana datasource type. (#769) +- Update dashboards from upstream sources. 
(#780) + +## 0.18.8 + +**Release date:** 2023-11-16 + +![AppVersion: v1.95.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM components to [v1.95.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.1) + +## 0.18.7 + +**Release date:** 2023-11-15 + +![AppVersion: v1.95.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM components to [v1.95.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.0) +- Support adding extra group parameters for default vmrules. (#752) + +## 0.18.6 + +**Release date:** 2023-11-01 + +![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Fix kube scheduler default scraping port from 10251 to 10259, Kubernetes changed it since 1.23.0. See [this pr](https://github.com/VictoriaMetrics/helm-charts/pull/736) for details. +- Bump version of operator chart to [0.27.4](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.4) + +## 0.18.5 + +**Release date:** 2023-10-08 + +![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Update operator chart to [v0.27.3](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.3) for fixing [#708](https://github.com/VictoriaMetrics/helm-charts/issues/708) + +## 0.18.4 + +**Release date:** 2023-10-04 + +![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Update dependencies: [victoria-metrics-operator -> 0.27.2](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.2), prometheus-node-exporter -> 4.23.2, grafana -> 6.59.5. 
+ +## 0.18.3 + +**Release date:** 2023-10-04 + +![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- bump version of VM components to [v1.94.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.94.0) + +## 0.18.2 + +**Release date:** 2023-09-28 + +![AppVersion: v1.93.5](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.5&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Fix behavior of `vmalert.remoteWriteVMAgent` - remoteWrite.url for VMAlert is correctly generated considering endpoint, name, port and http.pathPrefix of VMAgent + +## 0.18.1 + +**Release date:** 2023-09-21 + +![AppVersion: v1.93.5](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.5&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Bump version of VM components to [v1.93.5](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.5) + +## 0.18.0 + +**Release date:** 2023-09-12 + +![AppVersion: v1.93.4](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.4&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Bump version of `grafana` helm-chart to `6.59.*` +- Bump version of `prometheus-node-exporter` helm-chart to `4.23.*` +- Bump version of `kube-state-metrics` helm-chart to `0.59.*` +- Update alerting rules +- Update grafana dashboards +- Add `make` commands `sync-rules` and `sync-dashboards` +- Add support of VictoriaMetrics datasource + +## 0.17.8 + +**Release date:** 2023-09-11 + +![AppVersion: v1.93.4](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.4&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Bump version of VM components to [v1.93.4](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.4) +- Bump version of operator chart to [0.27.0](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.0) + +## 0.17.7 + +**Release date:** 2023-09-07 + +![AppVersion: v1.93.3](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.3&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Bump version of operator helm-chart to `0.26.2` + +## 0.17.6 + +**Release date:** 2023-09-04 + +![AppVersion: v1.93.3](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.3&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Move `cleanupCRD` option to victoria-metrics-operator chart (#593) +- Disable `honorTimestamps` for cadvisor scrape job by default (#617) +- For vmalert all replicas of alertmanager are added to notifiers (only if alertmanager is enabled) (#619) +- Add `grafanaOperatorDashboardsFormat` option (#615) +- Fix query expression for memory calculation in `k8s-views-global` dashboard (#636) +- Bump version of Victoria Metrics components to `v1.93.3` +- Bump version of operator helm-chart to `0.26.0` + +## 0.17.5 + +**Release date:** 2023-08-23 + +![AppVersion: v1.93.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.0&color=success&logo=) +![Helm: 
v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Update VictoriaMetrics components from v1.93.0 to v1.93.1 + +## 0.17.4 + +**Release date:** 2023-08-12 + +![AppVersion: v1.93.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Update VictoriaMetrics components from v1.92.1 to v1.93.0 +- delete an obsolete parameter remaining by mistake (see ) (#602) + +## 0.17.3 + +**Release date:** 2023-07-28 + +![AppVersion: v1.92.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.92.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Update VictoriaMetrics components from v1.92.0 to v1.92.1 (#599) + +## 0.17.2 + +**Release date:** 2023-07-27 + +![AppVersion: v1.92.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.92.0&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Update VictoriaMetrics components from v1.91.3 to v1.92.0 diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/Chart.lock b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/Chart.lock new file mode 100644 index 00000000..c059a99c --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/Chart.lock @@ -0,0 +1,24 @@ +dependencies: +- name: victoria-metrics-common + repository: https://victoriametrics.github.io/helm-charts + version: 0.0.11 +- name: victoria-metrics-operator + repository: https://victoriametrics.github.io/helm-charts + version: 0.34.8 +- name: kube-state-metrics + repository: https://prometheus-community.github.io/helm-charts + version: 5.25.1 +- name: prometheus-node-exporter + repository: https://prometheus-community.github.io/helm-charts + version: 4.39.0 +- name: grafana + repository: https://grafana.github.io/helm-charts + version: 8.4.9 +- name: crds + repository: "" + version: 0.0.0 +- name: prometheus-operator-crds + repository: https://prometheus-community.github.io/helm-charts + version: 11.0.0 +digest: sha256:11b119ebabf4ff0ea2951e7c72f51d0223dc3f50fb061a43b01fe7856491b836 +generated: "2024-09-12T11:50:51.935071545Z" diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/Chart.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/Chart.yaml new file mode 100644 index 00000000..90e1012d --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/Chart.yaml @@ -0,0 +1,66 @@ +annotations: + artifacthub.io/category: monitoring-logging + artifacthub.io/changes: | + - Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829) + - Fixed ETCD dashboard + - Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260) + - 'Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.' 
+ artifacthub.io/license: Apache-2.0 + artifacthub.io/links: | + - name: Sources + url: https://docs.victoriametrics.com/vmgateway + - name: Charts repo + url: https://victoriametrics.github.io/helm-charts/ + - name: Docs + url: https://docs.victoriametrics.com + artifacthub.io/operator: "true" +apiVersion: v2 +appVersion: v1.102.1 +dependencies: +- name: victoria-metrics-common + repository: https://victoriametrics.github.io/helm-charts + version: 0.0.* +- condition: victoria-metrics-operator.enabled + name: victoria-metrics-operator + repository: https://victoriametrics.github.io/helm-charts + version: 0.34.* +- condition: kube-state-metrics.enabled + name: kube-state-metrics + repository: https://prometheus-community.github.io/helm-charts + version: 5.25.* +- condition: prometheus-node-exporter.enabled + name: prometheus-node-exporter + repository: https://prometheus-community.github.io/helm-charts + version: 4.39.* +- condition: grafana.enabled + name: grafana + repository: https://grafana.github.io/helm-charts + version: 8.4.* +- condition: crds.enabled + name: crds + repository: "" + version: 0.0.0 +- condition: prometheus-operator-crds.enabled + name: prometheus-operator-crds + repository: https://prometheus-community.github.io/helm-charts + version: 11.0.* +description: Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics + Operator, Grafana dashboards, ServiceScrapes and VMRules +home: https://github.com/VictoriaMetrics/helm-charts +icon: https://avatars.githubusercontent.com/u/43720803?s=200&v=4 +keywords: +- victoriametrics +- operator +- monitoring +- kubernetes +- observability +- tsdb +- metrics +- metricsql +- timeseries +kubeVersion: '>=1.25.0-0' +name: victoria-metrics-k8s-stack +sources: +- https://github.com/VictoriaMetrics/helm-charts +type: application +version: 0.25.17 diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/README.md b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/README.md new file mode 100644 index 00000000..d95202e0 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/README.md @@ -0,0 +1,2576 @@ +![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![Version: 0.25.17](https://img.shields.io/badge/Version-0.25.17-informational?style=flat-square) +[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/victoriametrics)](https://artifacthub.io/packages/helm/victoriametrics/victoria-metrics-k8s-stack) + +Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics Operator, Grafana dashboards, ServiceScrapes and VMRules + +* [Overview](#Overview) +* [Configuration](#Configuration) +* [Prerequisites](#Prerequisites) +* [Dependencies](#Dependencies) +* [Quick Start](#How-to-install) +* [Uninstall](#How-to-uninstall) +* [Version Upgrade](#Upgrade-guide) +* [Troubleshooting](#Troubleshooting) +* [Values](#Parameters) + +## Overview +This chart is an All-in-one solution to start monitoring kubernetes cluster. +It installs multiple dependency charts like [grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana), [node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter), [kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) and [victoria-metrics-operator](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator). 
+It also installs Custom Resources like [VMSingle](https://docs.victoriametrics.com/operator/quick-start#vmsingle), [VMCluster](https://docs.victoriametrics.com/operator/quick-start#vmcluster), [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent), [VMAlert](https://docs.victoriametrics.com/operator/quick-start#vmalert).
+
+By default, the operator [converts all existing prometheus-operator API objects](https://docs.victoriametrics.com/operator/quick-start#migration-from-prometheus-operator-objects) into corresponding VictoriaMetrics Operator objects.
+
+To enable metrics collection for Kubernetes, this chart installs multiple scrape configurations for Kubernetes components such as the kubelet and kube-proxy. Metrics collection is done by [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent). So if you want to ship metrics to an external VictoriaMetrics database, you can disable the VMSingle installation by setting `vmsingle.enabled` to `false` and point `vmagent.vmagentSpec.remoteWrite.url` to your external VictoriaMetrics database.
+
+This chart also installs a bunch of dashboards and recording rules from the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project.
+
+![Overview](img/k8s-stack-overview.png)
+
+## Configuration
+
+Configuration of this chart is done through helm values.
+
+### Dependencies
+
+Dependencies can be enabled or disabled by setting `enabled` to `true` or `false` in the `values.yaml` file.
+
+**!Important:** for dependency charts, anything that you can find in the values.yaml of a dependency chart can be configured in this chart under the key for that dependency. For example, if you want to configure `grafana`, you can find all possible configuration options in its [values.yaml](https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml) and set them in the values for this chart under the `grafana:` key. For example, if you want to configure `grafana.persistence.enabled`, you should set it in values.yaml like this:
+```yaml
+#################################################
+### dependencies #####
+#################################################
+# Grafana dependency chart configuration. For possible values refer to https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration
+grafana:
+  enabled: true
+  persistence:
+    type: pvc
+    enabled: false
+```
+
+### VictoriaMetrics components
+
+This chart installs multiple VictoriaMetrics components using Custom Resources that are managed by the [victoria-metrics-operator](https://docs.victoriametrics.com/operator/design).
+Each resource can be configured using the `spec` of that resource from the API docs of the [victoria-metrics-operator](https://docs.victoriametrics.com/operator/api). For example, if you want to configure `VMAgent`, you can find all possible configuration options in the [API docs](https://docs.victoriametrics.com/operator/api#vmagent) and set them in the values for this chart under the `vmagent.spec` key. For example, if you want to configure `remoteWrite.url`, you should set it in values.yaml like this:
+```yaml
+vmagent:
+  spec:
+    remoteWrite:
+      - url: "https://insert.vmcluster.domain.com/insert/0/prometheus/api/v1/write"
+```
+
+### ArgoCD issues
+
+#### Operator self signed certificates
+When deploying the K8s stack using ArgoCD without Cert Manager (`.Values.victoria-metrics-operator.admissionWebhooks.certManager.enabled: false`),
+ArgoCD will re-render the operator's webhook certificates on each sync, since the Helm `lookup` function is not respected by ArgoCD.
+To prevent this, please update your K8s stack Application `spec.syncPolicy` and `spec.ignoreDifferences` with the following:
+
+```yaml
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+...
+spec:
+  ...
+  syncPolicy:
+    syncOptions:
+    # https://argo-cd.readthedocs.io/en/stable/user-guide/sync-options/#respect-ignore-difference-configs
+    # argocd must also ignore differences during the apply stage,
+    # otherwise it'll silently override changes and cause a problem
+    - RespectIgnoreDifferences=true
+  ignoreDifferences:
+    - group: ""
+      kind: Secret
+      name: <name>-validation
+      namespace: kube-system
+      jsonPointers:
+        - /data
+    - group: admissionregistration.k8s.io
+      kind: ValidatingWebhookConfiguration
+      name: <name>-admission
+      jqPathExpressions:
+        - '.webhooks[]?.clientConfig.caBundle'
+```
+where `<name>` is the output of `{{ include "vm-operator.fullname" }}` for your setup
+
+#### `metadata.annotations: Too long: must have at most 262144 bytes` on dashboards
+
+If one of the dashboards ConfigMaps is failing with the error `Too long: must have at most 262144 bytes`, please make sure you've added the `argocd.argoproj.io/sync-options: ServerSideApply=true` annotation to your dashboards:
+
+```yaml
+grafana:
+  sidecar:
+    dashboards:
+      additionalDashboardAnnotations:
+        argocd.argoproj.io/sync-options: ServerSideApply=true
+```
+
+### Rules and dashboards
+
+This chart by default installs multiple dashboards and recording rules from [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus).
+You can disable dashboards with `defaultDashboardsEnabled: false` and `experimentalDashboardsEnabled: false`,
+and rules can be configured under `defaultRules`.
+
+### Prometheus scrape configs
+This chart installs multiple scrape configurations for Kubernetes monitoring. They are configured under the `#ServiceMonitors` section in the `values.yaml` file. For example, if you want to configure the scrape config for `kubelet`, you should set it in values.yaml like this:
+```yaml
+kubelet:
+  enabled: true
+  # spec for VMNodeScrape crd
+  # https://docs.victoriametrics.com/operator/api#vmnodescrapespec
+  spec:
+    interval: "30s"
+```
+
+### Using externally managed Grafana
+
+If you want to use an externally managed Grafana instance but still want to use the dashboards provided by this chart, you can set
+`grafana.enabled` to `false` and set `defaultDashboardsEnabled` to `true`. This will install the dashboards
+but will not install Grafana.
+
+For example:
+```yaml
+defaultDashboardsEnabled: true
+
+grafana:
+  enabled: false
+```
+
+This will create ConfigMaps with dashboards to be imported into Grafana.
+
+If additional configuration for labels or annotations is needed in order to import dashboards into an existing Grafana, you can
+set `.grafana.sidecar.dashboards.additionalDashboardLabels` or `.grafana.sidecar.dashboards.additionalDashboardAnnotations` in `values.yaml`:
+
+For example:
+```yaml
+defaultDashboardsEnabled: true
+
+grafana:
+  enabled: false
+  sidecar:
+    dashboards:
+      additionalDashboardLabels:
+        key: value
+      additionalDashboardAnnotations:
+        key: value
+```
+
+## Prerequisites
+
+* Install the following packages: ``git``, ``kubectl``, ``helm``, ``helm-docs``. See this [tutorial](../../REQUIREMENTS.md).
+
+* Add the dependency chart repositories
+
+```console
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm repo update
+```
+
+* PV support on the underlying infrastructure (see the quick check below).
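+
+A quick way to verify the PV prerequisite is to check that the cluster exposes a (default) StorageClass before installing. This is only a suggested sanity check and assumes `kubectl` access to the target cluster with dynamic provisioning in use:
+
+```console
+# one StorageClass should be marked "(default)" if dynamic provisioning is available
+kubectl get storageclass
+
+# optionally, list existing PersistentVolumes
+kubectl get pv
+```
+
+If no default StorageClass exists, either create one or set an explicit storage class in the `storage` sections of the components that request persistent volumes (see the values table in the Parameters section).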
+ +## How to install + +Access a Kubernetes cluster. + +### Setup chart repository (can be omitted for OCI repositories) + +Add a chart helm repository with follow commands: + +```console +helm repo add vm https://victoriametrics.github.io/helm-charts/ + +helm repo update +``` +List versions of `vm/victoria-metrics-k8s-stack` chart available to installation: + +```console +helm search repo vm/victoria-metrics-k8s-stack -l +``` + +### Install `victoria-metrics-k8s-stack` chart + +Export default values of `victoria-metrics-k8s-stack` chart to file `values.yaml`: + + - For HTTPS repository + + ```console + helm show values vm/victoria-metrics-k8s-stack > values.yaml + ``` + - For OCI repository + + ```console + helm show values oci://ghcr.io/victoriametrics/helm-charts/victoria-metrics-k8s-stack > values.yaml + ``` + +Change the values according to the need of the environment in ``values.yaml`` file. + +Test the installation with command: + + - For HTTPS repository + + ```console + helm install vmks vm/victoria-metrics-k8s-stack -f values.yaml -n NAMESPACE --debug --dry-run + ``` + + - For OCI repository + + ```console + helm install vmks oci://ghcr.io/victoriametrics/helm-charts/victoria-metrics-k8s-stack -f values.yaml -n NAMESPACE --debug --dry-run + ``` + +Install chart with command: + + - For HTTPS repository + + ```console + helm install vmks vm/victoria-metrics-k8s-stack -f values.yaml -n NAMESPACE + ``` + + - For OCI repository + + ```console + helm install vmks oci://ghcr.io/victoriametrics/helm-charts/victoria-metrics-k8s-stack -f values.yaml -n NAMESPACE + ``` + +Get the pods lists by running this commands: + +```console +kubectl get pods -A | grep 'vmks' +``` + +Get the application by running this command: + +```console +helm list -f vmks -n NAMESPACE +``` + +See the history of versions of `vmks` application with command. + +```console +helm history vmks -n NAMESPACE +``` + +### Install locally (Minikube) + +To run VictoriaMetrics stack locally it's possible to use [Minikube](https://github.com/kubernetes/minikube). To avoid dashboards and alert rules issues please follow the steps below: + +Run Minikube cluster + +``` +minikube start --container-runtime=containerd --extra-config=scheduler.bind-address=0.0.0.0 --extra-config=controller-manager.bind-address=0.0.0.0 +``` + +Install helm chart + +``` +helm install [RELEASE_NAME] vm/victoria-metrics-k8s-stack -f values.yaml -f values.minikube.yaml -n NAMESPACE --debug --dry-run +``` + +## How to uninstall + +Remove application with command. + +```console +helm uninstall vmks -n NAMESPACE +``` + +CRDs created by this chart are not removed by default and should be manually cleaned up: + +```console +kubectl get crd | grep victoriametrics.com | awk '{print $1 }' | xargs -i kubectl delete crd {} +``` + +## Troubleshooting + +- If you cannot install helm chart with error `configmap already exist`. It could happen because of name collisions, if you set too long release name. + Kubernetes by default, allows only 63 symbols at resource names and all resource names are trimmed by helm to 63 symbols. + To mitigate it, use shorter name for helm chart release name, like: +```bash +# stack - is short enough +helm upgrade -i stack vm/victoria-metrics-k8s-stack +``` + Or use override for helm chart release name: +```bash +helm upgrade -i some-very-long-name vm/victoria-metrics-k8s-stack --set fullnameOverride=stack +``` + +## Upgrade guide + +Usually, helm upgrade doesn't requires manual actions. 
Just execute the following command:
+
+```console
+$ helm upgrade [RELEASE_NAME] vm/victoria-metrics-k8s-stack
+```
+
+But a release with a CRD update can only be patched manually with kubectl.
+Since helm does not perform a CRD update, we recommend that you always perform this when updating the helm-charts version:
+
+```console
+# 1. check the changes in CRD
+$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl diff -f -
+
+# 2. apply the changes (update CRD)
+$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl apply -f - --server-side
+```
+
+All other manual upgrade actions are listed below:
+
+### Upgrade to 0.13.0
+
+- node-exporter starting from version 4.0.0 uses the Kubernetes recommended labels. Therefore you have to delete the daemonset before you upgrade.
+
+```bash
+kubectl delete daemonset -l app=prometheus-node-exporter
+```
+- The scrape configuration for Kubernetes components was moved from the `vmServiceScrape.spec` section to the `spec` section. If you previously modified the scrape configuration, you need to update your `values.yaml`.
+
+- `grafana.defaultDashboardsEnabled` was renamed to `defaultDashboardsEnabled` (moved to top level). You may need to update it in your `values.yaml`.
+
+### Upgrade to 0.6.0
+
+All `CRD`s must be updated to the latest version with the command:
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
+```
+
+### Upgrade to 0.4.0
+
+All `CRD`s must be updated to the `v1` version with the command:
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
+```
+
+### Upgrade from 0.2.8 to 0.2.9
+
+Update the `VMAgent` CRD with the command:
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.16.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
+```
+
+### Upgrade from 0.2.5 to 0.2.6
+
+New CRDs - `VMUser` and `VMAuth` - were added to the operator, and new fields were added to existing CRDs.
+Manual commands:
+```bash
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmusers.yaml
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmauths.yaml
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmalerts.yaml
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmsingles.yaml
+kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmclusters.yaml
+```
+
+## Documentation of Helm Chart
+
+Install ``helm-docs`` following the instructions in this [tutorial](../../REQUIREMENTS.md).
+
+Generate docs with the ``helm-docs`` command.
+
+```bash
+cd charts/victoria-metrics-k8s-stack
+
+helm-docs
+```
+
+The markdown generation is entirely go template driven. The tool parses metadata from charts and generates a number of sub-templates that can be referenced in a template file (by default ``README.md.gotmpl``).
If no template file is provided, the tool has a default internal template that will generate a reasonably formatted README.
+
+## Parameters
+
+The following table lists the configurable parameters of the chart and their default values.
+
+Change the values according to the needs of your environment in the ``victoria-metrics-k8s-stack/values.yaml`` file.
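+
+As an illustration only (the keys and defaults below are taken from the values table that follows), a small override file that enlarges the VMSingle retention period and storage request might look like this:
+
+```yaml
+vmsingle:
+  spec:
+    # default is "1"; without a unit character the value is interpreted as months (h, d, w, y are also accepted)
+    retentionPeriod: "3"
+    storage:
+      resources:
+        requests:
+          # default request is 20Gi
+          storage: 50Gi
+```
+
+Pass such a file with `-f values.yaml` to `helm install`/`helm upgrade` as shown in the install section above.
+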
KeyTypeDefaultDescription
additionalVictoriaMetricsMapstring
+null
+
+
alertmanager.annotationsobject
+{}
+
+
alertmanager.configobject
+receivers:
+    - name: blackhole
+route:
+    receiver: blackhole
+templates:
+    - /etc/vm/configs/**/*.tmpl
+
+

alertmanager configuration

+
alertmanager.enabledbool
+true
+
+
alertmanager.ingressobject
+annotations: {}
+enabled: false
+extraPaths: []
+hosts:
+    - alertmanager.domain.com
+labels: {}
+path: '{{ .Values.alertmanager.spec.routePrefix | default "/" }}'
+pathType: Prefix
+tls: []
+
+

alertmanager ingress configuration

+
alertmanager.monzoTemplate.enabledbool
+true
+
+
alertmanager.specobject
+configSecret: ""
+externalURL: ""
+image:
+    tag: v0.25.0
+port: "9093"
+routePrefix: /
+selectAllByDefault: true
+
+

full spec for VMAlertmanager CRD. Allowed values described here

+
alertmanager.spec.configSecretstring
+""
+
+

if this one defined, it will be used for alertmanager configuration and config parameter will be ignored

+
alertmanager.templateFilesobject
+{}
+
+

extra alert templates

+
argocdReleaseOverridestring
+""
+
+

For correct working need set value ‘argocdReleaseOverride=$ARGOCD_APP_NAME’

+
coreDns.enabledbool
+true
+
+
coreDns.service.enabledbool
+true
+
+
coreDns.service.portint
+9153
+
+
coreDns.service.selector.k8s-appstring
+kube-dns
+
+
coreDns.service.targetPortint
+9153
+
+
coreDns.vmScrapeobject
+spec:
+    endpoints:
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: http-metrics
+    jobLabel: jobLabel
+    namespaceSelector:
+        matchNames:
+            - kube-system
+
+

spec for VMServiceScrape crd https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec

+
crds.enabledbool
+true
+
+
dashboardsobject
+node-exporter-full: true
+operator: false
+vmalert: false
+
+

Enable dashboards despite it’s dependency is not installed

+
dashboards.node-exporter-fullbool
+true
+
+

in ArgoCD using client-side apply this dashboard reaches annotations size limit and causes k8s issues without server side apply See this issue

+
defaultDashboardsEnabledbool
+true
+
+

Create default dashboards

+
defaultRulesobject
+alerting:
+    spec:
+        annotations: {}
+        labels: {}
+annotations: {}
+create: true
+group:
+    spec:
+        params: {}
+groups:
+    alertmanager:
+        create: true
+        rules: {}
+    etcd:
+        create: true
+        rules: {}
+    general:
+        create: true
+        rules: {}
+    k8sContainerCpuUsageSecondsTotal:
+        create: true
+        rules: {}
+    k8sContainerMemoryCache:
+        create: true
+        rules: {}
+    k8sContainerMemoryRss:
+        create: true
+        rules: {}
+    k8sContainerMemorySwap:
+        create: true
+        rules: {}
+    k8sContainerMemoryWorkingSetBytes:
+        create: true
+        rules: {}
+    k8sContainerResource:
+        create: true
+        rules: {}
+    k8sPodOwner:
+        create: true
+        rules: {}
+    kubeApiserver:
+        create: true
+        rules: {}
+    kubeApiserverAvailability:
+        create: true
+        rules: {}
+    kubeApiserverBurnrate:
+        create: true
+        rules: {}
+    kubeApiserverHistogram:
+        create: true
+        rules: {}
+    kubeApiserverSlos:
+        create: true
+        rules: {}
+    kubePrometheusGeneral:
+        create: true
+        rules: {}
+    kubePrometheusNodeRecording:
+        create: true
+        rules: {}
+    kubeScheduler:
+        create: true
+        rules: {}
+    kubeStateMetrics:
+        create: true
+        rules: {}
+    kubelet:
+        create: true
+        rules: {}
+    kubernetesApps:
+        create: true
+        rules: {}
+        targetNamespace: .*
+    kubernetesResources:
+        create: true
+        rules: {}
+    kubernetesStorage:
+        create: true
+        rules: {}
+        targetNamespace: .*
+    kubernetesSystem:
+        create: true
+        rules: {}
+    kubernetesSystemApiserver:
+        create: true
+        rules: {}
+    kubernetesSystemControllerManager:
+        create: true
+        rules: {}
+    kubernetesSystemKubelet:
+        create: true
+        rules: {}
+    kubernetesSystemScheduler:
+        create: true
+        rules: {}
+    node:
+        create: true
+        rules: {}
+    nodeNetwork:
+        create: true
+        rules: {}
+    vmHealth:
+        create: true
+        rules: {}
+    vmagent:
+        create: true
+        rules: {}
+    vmcluster:
+        create: true
+        rules: {}
+    vmoperator:
+        create: true
+        rules: {}
+    vmsingle:
+        create: true
+        rules: {}
+labels: {}
+recording:
+    spec:
+        annotations: {}
+        labels: {}
+rule:
+    spec:
+        annotations: {}
+        labels: {}
+rules: {}
+runbookUrl: https://runbooks.prometheus-operator.dev/runbooks
+
+

Create default rules for monitoring the cluster

+
defaultRules.alertingobject
+spec:
+    annotations: {}
+    labels: {}
+
+

Common properties for VMRules alerts

+
defaultRules.alerting.spec.annotationsobject
+{}
+
+

Additional annotations for VMRule alerts

+
defaultRules.alerting.spec.labelsobject
+{}
+
+

Additional labels for VMRule alerts

+
defaultRules.annotationsobject
+{}
+
+

Annotations for default rules

+
defaultRules.groupobject
+spec:
+    params: {}
+
+

Common properties for VMRule groups

+
defaultRules.group.spec.paramsobject
+{}
+
+

Optional HTTP URL parameters added to each rule request

+
defaultRules.groups.etcd.rulesobject
+{}
+
+

Common properties for all rules in a group

+
defaultRules.labelsobject
+{}
+
+

Labels for default rules

+
defaultRules.recordingobject
+spec:
+    annotations: {}
+    labels: {}
+
+

Common properties for VMRules recording rules

+
defaultRules.recording.spec.annotationsobject
+{}
+
+

Additional annotations for VMRule recording rules

+
defaultRules.recording.spec.labelsobject
+{}
+
+

Additional labels for VMRule recording rules

+
defaultRules.ruleobject
+spec:
+    annotations: {}
+    labels: {}
+
+

Common properties for all VMRules

+
defaultRules.rule.spec.annotationsobject
+{}
+
+

Additional annotations for all VMRules

+
defaultRules.rule.spec.labelsobject
+{}
+
+

Additional labels for all VMRules

+
defaultRules.rulesobject
+{}
+
+

Per rule properties

+
defaultRules.runbookUrlstring
+https://runbooks.prometheus-operator.dev/runbooks
+
+

Runbook url prefix for default rules

+
experimentalDashboardsEnabledbool
+true
+
+

Create experimental dashboards

+
externalVM.read.urlstring
+""
+
+
externalVM.write.urlstring
+""
+
+
extraObjectslist
+[]
+
+

Add extra objects dynamically to this chart

+
fullnameOverridestring
+""
+
+
global.clusterLabelstring
+cluster
+
+
global.license.keystring
+""
+
+
global.license.keyRefobject
+{}
+
+
grafana.additionalDataSourceslist
+[]
+
+
grafana.defaultDashboardsTimezonestring
+utc
+
+
grafana.defaultDatasourceTypestring
+prometheus
+
+
grafana.enabledbool
+true
+
+
grafana.forceDeployDatasourcebool
+false
+
+
grafana.ingress.annotationsobject
+{}
+
+
grafana.ingress.enabledbool
+false
+
+
grafana.ingress.extraPathslist
+[]
+
+
grafana.ingress.hosts[0]string
+grafana.domain.com
+
+
grafana.ingress.labelsobject
+{}
+
+
grafana.ingress.pathstring
+/
+
+
grafana.ingress.pathTypestring
+Prefix
+
+
grafana.ingress.tlslist
+[]
+
+
grafana.sidecar.dashboards.additionalDashboardAnnotationsobject
+{}
+
+
grafana.sidecar.dashboards.additionalDashboardLabelsobject
+{}
+
+
grafana.sidecar.dashboards.defaultFolderNamestring
+default
+
+
grafana.sidecar.dashboards.enabledbool
+true
+
+
grafana.sidecar.dashboards.folderstring
+/var/lib/grafana/dashboards
+
+
grafana.sidecar.dashboards.multiclusterbool
+false
+
+
grafana.sidecar.dashboards.provider.namestring
+default
+
+
grafana.sidecar.dashboards.provider.orgidint
+1
+
+
grafana.sidecar.datasources.createVMReplicasDatasourcesbool
+false
+
+
grafana.sidecar.datasources.defaultlist
+- isDefault: true
+  name: VictoriaMetrics
+- isDefault: false
+  name: VictoriaMetrics (DS)
+  type: victoriametrics-datasource
+
+

list of default prometheus compatible datasource configurations. VM url will be added to each of them in templates and type will be set to defaultDatasourceType if not defined

+
grafana.sidecar.datasources.enabledbool
+true
+
+
grafana.sidecar.datasources.initDatasourcesbool
+true
+
+
grafana.vmScrapeobject
+enabled: true
+spec:
+    endpoints:
+        - port: '{{ .Values.grafana.service.portName }}'
+    selector:
+        matchLabels:
+            app.kubernetes.io/name: '{{ include "grafana.name" .Subcharts.grafana }}'
+
+

grafana VM scrape config

+
grafana.vmScrape.specobject
+endpoints:
+    - port: '{{ .Values.grafana.service.portName }}'
+selector:
+    matchLabels:
+        app.kubernetes.io/name: '{{ include "grafana.name" .Subcharts.grafana }}'
+
+

Scrape configuration for Grafana

+
grafanaOperatorDashboardsFormatobject
+allowCrossNamespaceImport: false
+enabled: false
+instanceSelector:
+    matchLabels:
+        dashboards: grafana
+
+

Create dashboards as CRDs (reuqires grafana-operator to be installed)

+
kube-state-metrics.enabledbool
+true
+
+
kube-state-metrics.vmScrapeobject
+enabled: true
+spec:
+    endpoints:
+        - honorLabels: true
+          metricRelabelConfigs:
+            - action: labeldrop
+              regex: (uid|container_id|image_id)
+          port: http
+    jobLabel: app.kubernetes.io/name
+    selector:
+        matchLabels:
+            app.kubernetes.io/instance: '{{ include "vm.release" . }}'
+            app.kubernetes.io/name: '{{ include "kube-state-metrics.name" (index .Subcharts "kube-state-metrics") }}'
+
+

Scrape configuration for Kube State Metrics

+
kubeApiServer.enabledbool
+true
+
+
kubeApiServer.vmScrapeobject
+spec:
+    endpoints:
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: https
+          scheme: https
+          tlsConfig:
+            caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+            serverName: kubernetes
+    jobLabel: component
+    namespaceSelector:
+        matchNames:
+            - default
+    selector:
+        matchLabels:
+            component: apiserver
+            provider: kubernetes
+
+

spec for VMServiceScrape crd https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec

+
kubeControllerManager.enabledbool
+true
+
+
kubeControllerManager.endpointslist
+[]
+
+
kubeControllerManager.service.enabledbool
+true
+
+
kubeControllerManager.service.portint
+10257
+
+
kubeControllerManager.service.selector.componentstring
+kube-controller-manager
+
+
kubeControllerManager.service.targetPortint
+10257
+
+
kubeControllerManager.vmScrapeobject
+spec:
+    endpoints:
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: http-metrics
+          scheme: https
+          tlsConfig:
+            caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+            serverName: kubernetes
+    jobLabel: jobLabel
+    namespaceSelector:
+        matchNames:
+            - kube-system
+
+

spec for VMServiceScrape crd https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec

+
kubeDns.enabledbool
+false
+
+
kubeDns.service.enabledbool
+false
+
+
kubeDns.service.ports.dnsmasq.portint
+10054
+
+
kubeDns.service.ports.dnsmasq.targetPortint
+10054
+
+
kubeDns.service.ports.skydns.portint
+10055
+
+
kubeDns.service.ports.skydns.targetPortint
+10055
+
+
kubeDns.service.selector.k8s-appstring
+kube-dns
+
+
kubeDns.vmScrapeobject
+spec:
+    endpoints:
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: http-metrics-dnsmasq
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: http-metrics-skydns
+    jobLabel: jobLabel
+    namespaceSelector:
+        matchNames:
+            - kube-system
+
+

spec for VMServiceScrape crd https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec

+
kubeEtcd.enabledbool
+true
+
+
kubeEtcd.endpointslist
+[]
+
+
kubeEtcd.service.enabledbool
+true
+
+
kubeEtcd.service.portint
+2379
+
+
kubeEtcd.service.selector.componentstring
+etcd
+
+
kubeEtcd.service.targetPortint
+2379
+
+
kubeEtcd.vmScrapeobject
+spec:
+    endpoints:
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: http-metrics
+          scheme: https
+          tlsConfig:
+            caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    jobLabel: jobLabel
+    namespaceSelector:
+        matchNames:
+            - kube-system
+
+

spec for VMServiceScrape crd https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec

+
kubeProxy.enabledbool
+false
+
+
kubeProxy.endpointslist
+[]
+
+
kubeProxy.service.enabledbool
+true
+
+
kubeProxy.service.portint
+10249
+
+
kubeProxy.service.selector.k8s-appstring
+kube-proxy
+
+
kubeProxy.service.targetPortint
+10249
+
+
kubeProxy.vmScrapeobject
+spec:
+    endpoints:
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: http-metrics
+          scheme: https
+          tlsConfig:
+            caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    jobLabel: jobLabel
+    namespaceSelector:
+        matchNames:
+            - kube-system
+
+

spec for VMServiceScrape crd https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec

+
kubeScheduler.enabledbool
+true
+
+
kubeScheduler.endpointslist
+[]
+
+
kubeScheduler.service.enabledbool
+true
+
+
kubeScheduler.service.portint
+10259
+
+
kubeScheduler.service.selector.componentstring
+kube-scheduler
+
+
kubeScheduler.service.targetPortint
+10259
+
+
kubeScheduler.vmScrapeobject
+spec:
+    endpoints:
+        - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+          port: http-metrics
+          scheme: https
+          tlsConfig:
+            caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    jobLabel: jobLabel
+    namespaceSelector:
+        matchNames:
+            - kube-system
+
+

spec for VMServiceScrape crd https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec

+
kubelet.enabledbool
+true
+
+
kubelet.vmScrapeobject
+kind: VMNodeScrape
+spec:
+    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    honorTimestamps: false
+    interval: 30s
+    metricRelabelConfigs:
+        - action: labeldrop
+          regex: (uid)
+        - action: labeldrop
+          regex: (id|name)
+        - action: drop
+          regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count)
+          source_labels:
+            - __name__
+    relabelConfigs:
+        - action: labelmap
+          regex: __meta_kubernetes_node_label_(.+)
+        - sourceLabels:
+            - __metrics_path__
+          targetLabel: metrics_path
+        - replacement: kubelet
+          targetLabel: job
+    scheme: https
+    scrapeTimeout: 5s
+    tlsConfig:
+        caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+        insecureSkipVerify: true
+
+

spec for VMNodeScrape crd https://docs.victoriametrics.com/operator/api.html#vmnodescrapespec

+
kubelet.vmScrapes.cadvisorobject
+enabled: true
+spec:
+    path: /metrics/cadvisor
+
+

Enable scraping /metrics/cadvisor from kubelet’s service

+
kubelet.vmScrapes.kubelet.specobject
+{}
+
+
kubelet.vmScrapes.probesobject
+enabled: true
+spec:
+    path: /metrics/probes
+
+

Enable scraping /metrics/probes from kubelet’s service

+
nameOverridestring
+""
+
+
prometheus-node-exporter.enabledbool
+true
+
+
prometheus-node-exporter.extraArgs[0]string
+--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)
+
+
prometheus-node-exporter.extraArgs[1]string
+--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
+
+
prometheus-node-exporter.service.labels.jobLabelstring
+node-exporter
+
+
prometheus-node-exporter.vmScrapeobject
+enabled: true
+spec:
+    endpoints:
+        - metricRelabelConfigs:
+            - action: drop
+              regex: /var/lib/kubelet/pods.+
+              source_labels:
+                - mountpoint
+          port: metrics
+    jobLabel: jobLabel
+    selector:
+        matchLabels:
+            app.kubernetes.io/name: '{{ include "prometheus-node-exporter.name" (index .Subcharts "prometheus-node-exporter") }}'
+
+

node exporter VM scrape config

+
prometheus-node-exporter.vmScrape.specobject
+endpoints:
+    - metricRelabelConfigs:
+        - action: drop
+          regex: /var/lib/kubelet/pods.+
+          source_labels:
+            - mountpoint
+      port: metrics
+jobLabel: jobLabel
+selector:
+    matchLabels:
+        app.kubernetes.io/name: '{{ include "prometheus-node-exporter.name" (index .Subcharts "prometheus-node-exporter") }}'
+
+

Scrape configuration for Node Exporter

+
prometheus-operator-crds.enabledbool
+false
+
+
serviceAccount.annotationsobject
+{}
+
+

Annotations to add to the service account

+
serviceAccount.createbool
+true
+
+

Specifies whether a service account should be created

+
serviceAccount.namestring
+""
+
+

If not set and create is true, a name is generated using the fullname template

+
tenantstring
+"0"
+
+
victoria-metrics-operatorobject
+crd:
+    cleanup:
+        enabled: true
+        image:
+            pullPolicy: IfNotPresent
+            repository: bitnami/kubectl
+    create: false
+enabled: true
+operator:
+    disable_prometheus_converter: false
+serviceMonitor:
+    enabled: true
+
+

also checkout here possible ENV variables to configure operator behaviour https://docs.victoriametrics.com/operator/vars

+
victoria-metrics-operator.crd.cleanupobject
+enabled: true
+image:
+    pullPolicy: IfNotPresent
+    repository: bitnami/kubectl
+
+

tells helm to clean up vm cr resources when uninstalling

+
victoria-metrics-operator.crd.createbool
+false
+
+

we disable crd creation by operator chart as we create them in this chart

+
victoria-metrics-operator.operator.disable_prometheus_converterbool
+false
+
+

By default, operator converts prometheus-operator objects.

+
vmagent.additionalRemoteWriteslist
+[]
+
+

remoteWrite configuration of VMAgent, allowed parameters defined in a spec

+
vmagent.annotationsobject
+{}
+
+
vmagent.enabledbool
+true
+
+
vmagent.ingressobject
+annotations: {}
+enabled: false
+extraPaths: []
+hosts:
+    - vmagent.domain.com
+labels: {}
+path: ""
+pathType: Prefix
+tls: []
+
+

vmagent ingress configuration

+
vmagent.ingress.extraPathslist
+[]
+
+

Extra paths to prepend to every host configuration. This is useful when working with annotation based services.

+
vmagent.specobject
+externalLabels: {}
+extraArgs:
+    promscrape.dropOriginalLabels: "true"
+    promscrape.streamParse: "true"
+image:
+    tag: v1.103.0
+port: "8429"
+scrapeInterval: 20s
+selectAllByDefault: true
+
+

full spec for VMAgent CRD. Allowed values described here

+
vmalert.additionalNotifierConfigsobject
+{}
+
+
vmalert.annotationsobject
+{}
+
+
vmalert.enabledbool
+true
+
+
vmalert.ingressobject
+annotations: {}
+enabled: false
+extraPaths: []
+hosts:
+    - vmalert.domain.com
+labels: {}
+path: ""
+pathType: Prefix
+tls: []
+
+

vmalert ingress config

+
vmalert.remoteWriteVMAgentbool
+false
+
+
vmalert.specobject
+evaluationInterval: 15s
+externalLabels: {}
+extraArgs:
+    http.pathPrefix: /
+image:
+    tag: v1.103.0
+port: "8080"
+selectAllByDefault: true
+
+

full spec for VMAlert CRD. Allowed values described here

+
vmalert.templateFilesobject
+{}
+
+

extra vmalert annotation templates

+
vmauth.annotationsobject
+{}
+
+
vmauth.enabledbool
+false
+
+
vmauth.specobject
+discover_backend_ips: true
+port: "8427"
+
+

full spec for VMAuth CRD. Allowed values described here

+
vmcluster.annotationsobject
+{}
+
+
vmcluster.enabledbool
+false
+
+
vmcluster.ingress.insert.annotationsobject
+{}
+
+
vmcluster.ingress.insert.enabledbool
+false
+
+
vmcluster.ingress.insert.extraPathslist
+[]
+
+
vmcluster.ingress.insert.hosts[0]string
+vminsert.domain.com
+
+
vmcluster.ingress.insert.labelsobject
+{}
+
+
vmcluster.ingress.insert.pathstring
+'{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vminsert }}'
+
+
vmcluster.ingress.insert.pathTypestring
+Prefix
+
+
vmcluster.ingress.insert.tlslist
+[]
+
+
vmcluster.ingress.select.annotationsobject
+{}
+
+
vmcluster.ingress.select.enabledbool
+false
+
+
vmcluster.ingress.select.extraPathslist
+[]
+
+
vmcluster.ingress.select.hosts[0]string
+vmselect.domain.com
+
+
vmcluster.ingress.select.labelsobject
+{}
+
+
vmcluster.ingress.select.pathstring
+'{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vmselect }}'
+
+
vmcluster.ingress.select.pathTypestring
+Prefix
+
+
vmcluster.ingress.select.tlslist
+[]
+
+
vmcluster.ingress.storage.annotationsobject
+{}
+
+
vmcluster.ingress.storage.enabledbool
+false
+
+
vmcluster.ingress.storage.extraPathslist
+[]
+
+
vmcluster.ingress.storage.hosts[0]string
+vmstorage.domain.com
+
+
vmcluster.ingress.storage.labelsobject
+{}
+
+
vmcluster.ingress.storage.pathstring
+""
+
+
vmcluster.ingress.storage.pathTypestring
+Prefix
+
+
vmcluster.ingress.storage.tlslist
+[]
+
+
vmcluster.specobject
+replicationFactor: 2
+retentionPeriod: "1"
+vminsert:
+    extraArgs: {}
+    image:
+        tag: v1.103.0-cluster
+    port: "8480"
+    replicaCount: 2
+    resources: {}
+vmselect:
+    cacheMountPath: /select-cache
+    extraArgs: {}
+    image:
+        tag: v1.103.0-cluster
+    port: "8481"
+    replicaCount: 2
+    resources: {}
+    storage:
+        volumeClaimTemplate:
+            spec:
+                resources:
+                    requests:
+                        storage: 2Gi
+vmstorage:
+    image:
+        tag: v1.103.0-cluster
+    replicaCount: 2
+    resources: {}
+    storage:
+        volumeClaimTemplate:
+            spec:
+                resources:
+                    requests:
+                        storage: 10Gi
+    storageDataPath: /vm-data
+
+

full spec for VMCluster CRD. Allowed values described here

+
vmcluster.spec.retentionPeriodstring
+"1"
+
+

Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these docs

+
vmsingle.annotationsobject
+{}
+
+
vmsingle.enabledbool
+true
+
+
vmsingle.ingress.annotationsobject
+{}
+
+
vmsingle.ingress.enabledbool
+false
+
+
vmsingle.ingress.extraPathslist
+[]
+
+
vmsingle.ingress.hosts[0]string
+vmsingle.domain.com
+
+
vmsingle.ingress.labelsobject
+{}
+
+
vmsingle.ingress.pathstring
+""
+
+
vmsingle.ingress.pathTypestring
+Prefix
+
+
vmsingle.ingress.tlslist
+[]
+
+
vmsingle.specobject
+extraArgs: {}
+image:
+    tag: v1.103.0
+port: "8429"
+replicaCount: 1
+retentionPeriod: "1"
+storage:
+    accessModes:
+        - ReadWriteOnce
+    resources:
+        requests:
+            storage: 20Gi
+
+

full spec for VMSingle CRD. Allowed values describe here

+
vmsingle.spec.retentionPeriodstring
+"1"
+
+

Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these docs

+
+ diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/README.md.gotmpl b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/README.md.gotmpl new file mode 100644 index 00000000..7ac63b6e --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/README.md.gotmpl @@ -0,0 +1,300 @@ +{{ template "chart.typeBadge" . }} {{ template "chart.versionBadge" . }} +[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/victoriametrics)](https://artifacthub.io/packages/helm/victoriametrics/victoria-metrics-k8s-stack) + +{{ template "chart.description" . }} + +* [Overview](#Overview) +* [Configuration](#Configuration) +* [Prerequisites](#Prerequisites) +* [Dependencies](#Dependencies) +* [Quick Start](#How-to-install) +* [Uninstall](#How-to-uninstall) +* [Version Upgrade](#Upgrade-guide) +* [Troubleshooting](#Troubleshooting) +* [Values](#Parameters) + + +## Overview +This chart is an All-in-one solution to start monitoring kubernetes cluster. +It installs multiple dependency charts like [grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana), [node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter), [kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) and [victoria-metrics-operator](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator). +Also it installs Custom Resources like [VMSingle](https://docs.victoriametrics.com/operator/quick-start#vmsingle), [VMCluster](https://docs.victoriametrics.com/operator/quick-start#vmcluster), [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent), [VMAlert](https://docs.victoriametrics.com/operator/quick-start#vmalert). + +By default, the operator [converts all existing prometheus-operator API objects](https://docs.victoriametrics.com/operator/quick-start#migration-from-prometheus-operator-objects) into corresponding VictoriaMetrics Operator objects. + +To enable metrics collection for kubernetes this chart installs multiple scrape configurations for kuberenetes components like kubelet and kube-proxy, etc. Metrics collection is done by [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent). So if want to ship metrics to external VictoriaMetrics database you can disable VMSingle installation by setting `vmsingle.enabled` to `false` and setting `vmagent.vmagentSpec.remoteWrite.url` to your external VictoriaMetrics database. + +This chart also installs bunch of dashboards and recording rules from [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project. + +![Overview](img/k8s-stack-overview.png) + + +## Configuration + +Configuration of this chart is done through helm values. + +### Dependencies + +Dependencies can be enabled or disabled by setting `enabled` to `true` or `false` in `values.yaml` file. + +**!Important:** for dependency charts anything that you can find in values.yaml of dependency chart can be configured in this chart under key for that dependency. For example if you want to configure `grafana` you can find all possible configuration options in [values.yaml](https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml) and you should set them in values for this chart under grafana: key. 
For example if you want to configure `grafana.persistence.enabled` you should set it in values.yaml like this: +```yaml +################################################# +### dependencies ##### +################################################# +# Grafana dependency chart configuration. For possible values refer to https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration +grafana: + enabled: true + persistence: + type: pvc + enabled: false +``` + +### VictoriaMetrics components + +This chart installs multiple VictoriaMetrics components using Custom Resources that are managed by [victoria-metrics-operator](https://docs.victoriametrics.com/operator/design) +Each resource can be configured using `spec` of that resource from API docs of [victoria-metrics-operator](https://docs.victoriametrics.com/operator/api). For example if you want to configure `VMAgent` you can find all possible configuration options in [API docs](https://docs.victoriametrics.com/operator/api#vmagent) and you should set them in values for this chart under `vmagent.spec` key. For example if you want to configure `remoteWrite.url` you should set it in values.yaml like this: +```yaml +vmagent: + spec: + remoteWrite: + - url: "https://insert.vmcluster.domain.com/insert/0/prometheus/api/v1/write" +``` + +### ArgoCD issues + +#### Operator self signed certificates +When deploying K8s stack using ArgoCD without Cert Manager (`.Values.victoria-metrics-operator.admissionWebhooks.certManager.enabled: false`) +it will rerender operator's webhook certificates on each sync since Helm `lookup` function is not respected by ArgoCD. +To prevent this please update you K8s stack Application `spec.syncPolicy` and `spec.ignoreDifferences` with a following: + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +... +spec: + ... + syncPolicy: + syncOptions: + # https://argo-cd.readthedocs.io/en/stable/user-guide/sync-options/#respect-ignore-difference-configs + # argocd must also ignore difference during apply stage + # otherwise it ll silently override changes and cause a problem + - RespectIgnoreDifferences=true + ignoreDifferences: + - group: "" + kind: Secret + name: -validation + namespace: kube-system + jsonPointers: + - /data + - group: admissionregistration.k8s.io + kind: ValidatingWebhookConfiguration + name: -admission + jqPathExpressions: + - '.webhooks[]?.clientConfig.caBundle' +``` +where `` is output of `{{"{{"}} include "vm-operator.fullname" {{"}}"}}` for your setup + +#### `metadata.annotations: Too long: must have at most 262144 bytes` on dashboards + +If one of dashboards ConfigMap is failing with error `Too long: must have at most 262144 bytes`, please make sure you've added `argocd.argoproj.io/sync-options: ServerSideApply=true` annotation to your dashboards: + +```yaml +grafana: + sidecar: + dashboards: + additionalDashboardAnnotations + argocd.argoproj.io/sync-options: ServerSideApply=true +``` + +argocd.argoproj.io/sync-options: ServerSideApply=true + +### Rules and dashboards + +This chart by default install multiple dashboards and recording rules from [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) +you can disable dashboards with `defaultDashboardsEnabled: false` and `experimentalDashboardsEnabled: false` +and rules can be configured under `defaultRules` + +### Prometheus scrape configs +This chart installs multiple scrape configurations for kubernetes monitoring. They are configured under `#ServiceMonitors` section in `values.yaml` file. 
For example if you want to configure scrape config for `kubelet` you should set it in values.yaml like this: +```yaml +kubelet: + enabled: true + # spec for VMNodeScrape crd + # https://docs.victoriametrics.com/operator/api#vmnodescrapespec + spec: + interval: "30s" +``` + +### Using externally managed Grafana + +If you want to use an externally managed Grafana instance but still want to use the dashboards provided by this chart you can set + `grafana.enabled` to `false` and set `defaultDashboardsEnabled` to `true`. This will install the dashboards + but will not install Grafana. + +For example: +```yaml +defaultDashboardsEnabled: true + +grafana: + enabled: false +``` + +This will create ConfigMaps with dashboards to be imported into Grafana. + +If additional configuration for labels or annotations is needed in order to import dashboard to an existing Grafana you can +set `.grafana.sidecar.dashboards.additionalDashboardLabels` or `.grafana.sidecar.dashboards.additionalDashboardAnnotations` in `values.yaml`: + +For example: +```yaml +defaultDashboardsEnabled: true + +grafana: + enabled: false + sidecar: + dashboards: + additionalDashboardLabels: + key: value + additionalDashboardAnnotations: + key: value +``` + +## Prerequisites + +* Install the follow packages: ``git``, ``kubectl``, ``helm``, ``helm-docs``. See this [tutorial](../../REQUIREMENTS.md). + +* Add dependency chart repositories + +```console +helm repo add grafana https://grafana.github.io/helm-charts +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm repo update +``` + +* PV support on underlying infrastructure. + +{{ include "chart.installSection" . }} + +### Install locally (Minikube) + +To run VictoriaMetrics stack locally it's possible to use [Minikube](https://github.com/kubernetes/minikube). To avoid dashboards and alert rules issues please follow the steps below: + +Run Minikube cluster + +``` +minikube start --container-runtime=containerd --extra-config=scheduler.bind-address=0.0.0.0 --extra-config=controller-manager.bind-address=0.0.0.0 +``` + +Install helm chart + +``` +helm install [RELEASE_NAME] vm/victoria-metrics-k8s-stack -f values.yaml -f values.minikube.yaml -n NAMESPACE --debug --dry-run +``` + +{{ include "chart.uninstallSection" . }} + +CRDs created by this chart are not removed by default and should be manually cleaned up: + +```console +kubectl get crd | grep victoriametrics.com | awk '{print $1 }' | xargs -i kubectl delete crd {} +``` + +## Troubleshooting + +- If you cannot install helm chart with error `configmap already exist`. It could happen because of name collisions, if you set too long release name. + Kubernetes by default, allows only 63 symbols at resource names and all resource names are trimmed by helm to 63 symbols. + To mitigate it, use shorter name for helm chart release name, like: +```bash +# stack - is short enough +helm upgrade -i stack vm/victoria-metrics-k8s-stack +``` + Or use override for helm chart release name: +```bash +helm upgrade -i some-very-long-name vm/victoria-metrics-k8s-stack --set fullnameOverride=stack +``` + + +## Upgrade guide + +Usually, helm upgrade doesn't requires manual actions. Just execute command: + +```console +$ helm upgrade [RELEASE_NAME] vm/victoria-metrics-k8s-stack +``` + +But release with CRD update can only be patched manually with kubectl. +Since helm does not perform a CRD update, we recommend that you always perform this when updating the helm-charts version: + +```console +# 1. 
check the changes in CRD +$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl diff -f - + +# 2. apply the changes (update CRD) +$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl apply -f - --server-side +``` + +All other manual upgrade actions are listed below: + + +### Upgrade to 0.13.0 + + +- node-exporter starting from version 4.0.0 uses the Kubernetes recommended labels. Therefore you have to delete the daemonset before you upgrade. + +```bash +kubectl delete daemonset -l app=prometheus-node-exporter +``` +- The scrape configuration for kubernetes components was moved from the `vmServiceScrape.spec` section to the `spec` section. If you previously modified the scrape configuration, you need to update your `values.yaml` + +- `grafana.defaultDashboardsEnabled` was renamed to `defaultDashboardsEnabled` (moved to top level). You may need to update it in your `values.yaml` + + +### Upgrade to 0.6.0 + + + All `CRD`s must be updated to the latest version with the command: + +```bash +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml + +``` + +### Upgrade to 0.4.0 + + All `CRD`s must be updated to the `v1` version with the command: + +```bash +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml + +``` + +### Upgrade from 0.2.8 to 0.2.9 + + Update the `VMAgent` CRD with the following command: +```bash +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.16.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml +``` + +### Upgrade from 0.2.5 to 0.2.6 + +New CRDs were added to the operator - `VMUser` and `VMAuth` - and new fields were added to existing CRDs. +Manual commands: +```bash +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmusers.yaml +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmauths.yaml +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmalerts.yaml +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmsingles.yaml +kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmclusters.yaml +``` + +{{ include "chart.helmDocs" . }} + +## Parameters + +The following table lists the configurable parameters of the chart and their default values. + +Change the values according to the needs of your environment in the ``victoria-metrics-k8s-stack/values.yaml`` file. + +{{ template "chart.valuesTableHtml" . }} diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/RELEASE_GUIDE.md b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/RELEASE_GUIDE.md new file mode 100644 index 00000000..12e01eb5 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/RELEASE_GUIDE.md @@ -0,0 +1,40 @@ +# Release process guidance + +## Update version for VictoriaMetrics kubernetes monitoring stack + +1. 
Update dependency requirements in [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml) +2. Apply changes via `helm dependency update` +3. Update image tag in chart values: +
+ + ```console + make sync-rules + make sync-dashboards + ``` +
+4. Bump version of the victoria-metrics-k8s-stack [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml) +5. Run linter: + +
+ + ```console + make lint + ``` + +
+6. Render templates locally to check for errors: + +
+ + ```console + helm template vm-k8s-stack ./charts/victoria-metrics-k8s-stack --output-dir out --values ./charts/victoria-metrics-k8s-stack/values.yaml --debug + ``` + +
+7. Test updated chart by installing it to your kubernetes cluster. +8. Update docs with + ```console + helm-docs + ``` +9. Commit the changes and send a [PR](https://github.com/VictoriaMetrics/helm-charts/pulls) diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/RELEASE_NOTES.md b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/RELEASE_NOTES.md new file mode 100644 index 00000000..0a413896 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/RELEASE_NOTES.md @@ -0,0 +1,12 @@ +# Release notes for version 0.25.17 + +**Release date:** 2024-09-20 + +![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=) +![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm) + +- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829) +- Fixed ETCD dashboard +- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260) +- Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration. + diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/_changelog.md b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/_changelog.md new file mode 100644 index 00000000..79e80b88 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/_changelog.md @@ -0,0 +1,13 @@ +--- +weight: 1 +title: CHANGELOG +menu: + docs: + weight: 1 + identifier: helm-victoriametrics-k8s-stack-changelog + parent: helm-victoriametrics-k8s-stack +url: /helm/victoriametrics-k8s-stack/changelog +aliases: + - /helm/victoriametrics-k8s-stack/changelog/index.html +--- +{{% content "CHANGELOG.md" %}} diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/_index.md b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/_index.md new file mode 100644 index 00000000..d23dc833 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/_index.md @@ -0,0 +1,13 @@ +--- +weight: 9 +title: VictoriaMetrics K8s Stack +menu: + docs: + parent: helm + weight: 9 + identifier: helm-victoriametrics-k8s-stack +url: /helm/victoriametrics-k8s-stack +aliases: + - /helm/victoriametrics-k8s-stack/index.html +--- +{{% content "README.md" %}} diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/etcd.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/etcd.yaml new file mode 100644 index 00000000..9484dd86 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/etcd.yaml @@ -0,0 +1,165 @@ +condition: '{{ .Values.kubeEtcd.enabled }}' +name: etcd +rules: +- alert: etcdMembersDown + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).' + summary: 'etcd cluster members are down.' 
+ condition: '{{ true }}' + expr: |- + max without (endpoint) ( + sum without (instance) (up{job=~".*etcd.*"} == bool 0) + or + count without (To) ( + sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 + ) + ) + > 0 + for: 10m + labels: + severity: critical +- alert: etcdInsufficientMembers + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' + summary: 'etcd cluster has insufficient number of members.' + condition: '{{ true }}' + expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) + for: 3m + labels: + severity: critical +- alert: etcdNoLeader + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' + summary: 'etcd cluster has no leader.' + condition: '{{ true }}' + expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical +- alert: etcdHighNumberOfLeaderChanges + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' + summary: 'etcd cluster has high number of leader changes.' + condition: '{{ true }}' + expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 + for: 5m + labels: + severity: warning +- alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: 'etcd cluster has high number of failed grpc requests.' + condition: '{{ true }}' + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) + > 1 + for: 10m + labels: + severity: warning +- alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: 'etcd cluster has high number of failed grpc requests.' + condition: '{{ true }}' + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) + > 5 + for: 5m + labels: + severity: critical +- alert: etcdGRPCRequestsSlow + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.' 
+ summary: 'etcd grpc requests are slow' + condition: '{{ true }}' + expr: |- + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) + > 0.15 + for: 10m + labels: + severity: critical +- alert: etcdMemberCommunicationSlow + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: 'etcd cluster member communication is slow.' + condition: '{{ true }}' + expr: |- + histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.15 + for: 10m + labels: + severity: warning +- alert: etcdHighNumberOfFailedProposals + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: 'etcd cluster has high number of proposal failures.' + condition: '{{ true }}' + expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning +- alert: etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: 'etcd cluster 99th percentile fsync durations are too high.' + condition: '{{ true }}' + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.5 + for: 10m + labels: + severity: warning +- alert: etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: 'etcd cluster 99th percentile fsync durations are too high.' + condition: '{{ true }}' + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 1 + for: 10m + labels: + severity: critical +- alert: etcdHighCommitDurations + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' + summary: 'etcd cluster 99th percentile commit durations are too high.' + condition: '{{ true }}' + expr: |- + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.25 + for: 10m + labels: + severity: warning +- alert: etcdDatabaseQuotaLowSpace + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' + summary: 'etcd cluster database is running full.' 
+ condition: '{{ true }}' + expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 + for: 10m + labels: + severity: critical +- alert: etcdExcessiveDatabaseGrowth + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.' + summary: 'etcd cluster database growing very fast.' + condition: '{{ true }}' + expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"} + for: 10m + labels: + severity: warning +- alert: etcdDatabaseHighFragmentationRatio + annotations: + description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.' + runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation' + summary: 'etcd database size in use is less than 50% of the actual allocated storage.' + condition: '{{ true }}' + expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 + for: 10m + labels: + severity: warning diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml new file mode 100644 index 00000000..ae0fa110 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml @@ -0,0 +1,53 @@ +condition: '{{ true }}' +name: general.rules +rules: +- alert: TargetDown + annotations: + description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/targetdown' + summary: 'One or more targets are unreachable.' + condition: '{{ true }}' + expr: 100 * (count(up == 0) BY (job,namespace,service,{{ .Values.global.clusterLabel }}) / count(up) BY (job,namespace,service,{{ .Values.global.clusterLabel }})) > 10 + for: 10m + labels: + severity: warning +- alert: Watchdog + annotations: + description: 'This is an alert meant to ensure that the entire alerting pipeline is functional. + + This alert is always firing, therefore it should always be firing in Alertmanager + + and always fire against a receiver. There are integrations with various notification + + mechanisms that send a notification when this alert is not firing. For example the + + "DeadMansSnitch" integration in PagerDuty. + + ' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/watchdog' + summary: 'An alert that should always be firing to certify that Alertmanager is working properly.' + condition: '{{ true }}' + expr: vector(1) + labels: + severity: ok +- alert: InfoInhibitor + annotations: + description: 'This is an alert that is used to inhibit info alerts. 
+ + By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with + + other alerts. + + This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a + + severity of ''warning'' or ''critical'' starts firing on the same namespace. + + This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". + + ' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor' + summary: 'Info-level alert inhibition.' + condition: '{{ true }}' + expr: ALERTS{severity = "info"} == 1 unless on (namespace,{{ .Values.global.clusterLabel }}) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 + labels: + severity: major diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_usage_seconds_total.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_usage_seconds_total.yaml new file mode 100644 index 00000000..85ed9d09 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_cpu_usage_seconds_total.yaml @@ -0,0 +1,11 @@ +condition: '{{ true }}' +name: k8s.rules.container_cpu_usage_seconds_total +rules: +- condition: '{{ true }}' + expr: |- + sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) ( + irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m]) + ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + 1, max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_cache.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_cache.yaml new file mode 100644 index 00000000..3b22ddd6 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_cache.yaml @@ -0,0 +1,10 @@ +condition: '{{ true }}' +name: k8s.rules.container_memory_cache +rules: +- condition: '{{ true }}' + expr: |- + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, + max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_rss.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_rss.yaml new file mode 100644 index 00000000..6a5f6df3 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_rss.yaml @@ -0,0 +1,10 @@ +condition: '{{ true }}' +name: k8s.rules.container_memory_rss +rules: +- condition: '{{ true }}' + expr: |- + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,{{ .Values.global.clusterLabel }}) 
group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, + max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_swap.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_swap.yaml new file mode 100644 index 00000000..481e7711 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_swap.yaml @@ -0,0 +1,10 @@ +condition: '{{ true }}' +name: k8s.rules.container_memory_swap +rules: +- condition: '{{ true }}' + expr: |- + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, + max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_working_set_bytes.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_working_set_bytes.yaml new file mode 100644 index 00000000..f5ca9504 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_memory_working_set_bytes.yaml @@ -0,0 +1,10 @@ +condition: '{{ true }}' +name: k8s.rules.container_memory_working_set_bytes +rules: +- condition: '{{ true }}' + expr: |- + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, + max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_resource.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_resource.yaml new file mode 100644 index 00000000..260a20e2 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.container_resource.yaml @@ -0,0 +1,79 @@ +condition: '{{ true }}' +name: k8s.rules.container_resource +rules: +- condition: '{{ true }}' + expr: |- + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }}) + group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests +- condition: '{{ true }}' + expr: |- + sum by (namespace,{{ .Values.global.clusterLabel }}) ( + sum by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) ( + kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} + ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + 
kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_requests:sum +- condition: '{{ true }}' + expr: |- + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }}) + group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests +- condition: '{{ true }}' + expr: |- + sum by (namespace,{{ .Values.global.clusterLabel }}) ( + sum by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) ( + kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} + ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_requests:sum +- condition: '{{ true }}' + expr: |- + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }}) + group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits +- condition: '{{ true }}' + expr: |- + sum by (namespace,{{ .Values.global.clusterLabel }}) ( + sum by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) ( + kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} + ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_memory:kube_pod_container_resource_limits:sum +- condition: '{{ true }}' + expr: |- + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }}) + group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + (kube_pod_status_phase{phase=~"Pending|Running"} == 1) + ) + record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits +- condition: '{{ true }}' + expr: |- + sum by (namespace,{{ .Values.global.clusterLabel }}) ( + sum by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) ( + kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} + ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace_cpu:kube_pod_container_resource_limits:sum diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.pod_owner.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.pod_owner.yaml new file mode 100644 index 00000000..3cefefe0 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/k8s.rules.pod_owner.yaml @@ -0,0 +1,54 @@ +condition: '{{ true }}' +name: k8s.rules.pod_owner +rules: +- condition: '{{ true }}' + expr: |- + max by 
(namespace,workload,pod,{{ .Values.global.clusterLabel }}) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on (replicaset,namespace,{{ .Values.global.clusterLabel }}) group_left(owner_name) topk by (replicaset,namespace,{{ .Values.global.clusterLabel }}) ( + 1, max by (replicaset,namespace,owner_name,{{ .Values.global.clusterLabel }}) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel +- condition: '{{ true }}' + expr: |- + max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel +- condition: '{{ true }}' + expr: |- + max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel +- condition: '{{ true }}' + expr: |- + max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) + labels: + workload_type: job + record: namespace_workload_pod:kube_pod_owner:relabel diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-availability.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-availability.rules.yaml new file mode 100644 index 00000000..aab98ce1 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-availability.rules.yaml @@ -0,0 +1,128 @@ +condition: '{{ .Values.kubeApiServer.enabled }}' +interval: 3m +name: kube-apiserver-availability.rules +rules: +- condition: '{{ true }}' + expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + record: code_verb:apiserver_request_total:increase30d +- condition: '{{ true }}' + expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + labels: + verb: read + record: code:apiserver_request_total:increase30d +- condition: '{{ true }}' + expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + labels: + verb: write + record: code:apiserver_request_total:increase30d +- condition: '{{ true }}' + expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_count{job="kube-apiserver"}[1h])) + record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h +- condition: '{{ true }}' + expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d +- condition: '{{ true }}' + expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) + record: 
cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h +- condition: '{{ true }}' + expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d +- condition: '{{ true }}' + expr: |- + 1 - ( + ( + # write too slow + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + ( + # read too slow + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + ) + + # errors + sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d) + labels: + verb: all + record: apiserver_request:availability30d +- condition: '{{ true }}' + expr: |- + 1 - ( + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) + - + ( + # too slow + ( + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"}) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"}) + + + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"}) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read"}) + labels: + verb: read + record: apiserver_request:availability30d +- condition: '{{ true }}' + expr: |- + 1 - ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - + sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"}) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write"}) + labels: + verb: write + record: 
apiserver_request:availability30d +- condition: '{{ true }}' + expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: code_resource:apiserver_request_total:rate5m +- condition: '{{ true }}' + expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: code_resource:apiserver_request_total:rate5m +- condition: '{{ true }}' + expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h +- condition: '{{ true }}' + expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h +- condition: '{{ true }}' + expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h +- condition: '{{ true }}' + expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-burnrate.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-burnrate.rules.yaml new file mode 100644 index 00000000..6a87f5a7 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-burnrate.rules.yaml @@ -0,0 +1,318 @@ +condition: '{{ .Values.kubeApiServer.enabled }}' +name: kube-apiserver-burnrate.rules +rules: +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d])) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d])) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d])) + ) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) 
(rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h])) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h])) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h])) + ) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h])) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h])) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h])) + ) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m])) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m])) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m])) + ) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) 
(rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d])) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d])) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d])) + ) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m])) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m])) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m])) + ) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + - + ( + ( + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h])) + or + vector(0) + ) + + + sum by ({{ .Values.global.clusterLabel }}) 
(rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h])) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h])) + ) + ) + + + # errors + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) + - + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d])) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) + - + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h])) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) + - + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h])) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) + - + sum by ({{ .Values.global.clusterLabel }}) 
(rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m])) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) + - + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d])) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: apiserver_request:burnrate3d +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) + - + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m])) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + labels: + verb: write + record: apiserver_request:burnrate5m +- condition: '{{ true }}' + expr: |- + ( + ( + # too slow + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) + - + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h])) + ) + + + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + labels: + verb: write + record: apiserver_request:burnrate6h diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-histogram.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-histogram.rules.yaml new file mode 100644 index 00000000..4ba999e2 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-histogram.rules.yaml @@ -0,0 +1,15 @@ +condition: '{{ .Values.kubeApiServer.enabled }}' +name: 
kube-apiserver-histogram.rules +rules: +- condition: '{{ true }}' + expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + labels: + quantile: '0.99' + verb: read + record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + labels: + quantile: '0.99' + verb: write + record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-slos.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-slos.yaml new file mode 100644 index 00000000..ed75cccd --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-apiserver-slos.yaml @@ -0,0 +1,63 @@ +condition: '{{ .Values.kubeApiServer.enabled }}' +name: kube-apiserver-slos +rules: +- alert: KubeAPIErrorBudgetBurn + annotations: + description: 'The API server is burning too much error budget.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn' + summary: 'The API server is burning too much error budget.' + condition: '{{ true }}' + expr: |- + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m +- alert: KubeAPIErrorBudgetBurn + annotations: + description: 'The API server is burning too much error budget.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn' + summary: 'The API server is burning too much error budget.' + condition: '{{ true }}' + expr: |- + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m +- alert: KubeAPIErrorBudgetBurn + annotations: + description: 'The API server is burning too much error budget.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn' + summary: 'The API server is burning too much error budget.' + condition: '{{ true }}' + expr: |- + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h + labels: + long: 1d + severity: warning + short: 2h +- alert: KubeAPIErrorBudgetBurn + annotations: + description: 'The API server is burning too much error budget.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn' + summary: 'The API server is burning too much error budget.' 
+ condition: '{{ true }}' + expr: |- + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h + labels: + long: 3d + severity: warning + short: 6h diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-general.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-general.rules.yaml new file mode 100644 index 00000000..68295151 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-general.rules.yaml @@ -0,0 +1,9 @@ +condition: '{{ true }}' +name: kube-prometheus-general.rules +rules: +- condition: '{{ true }}' + expr: count without(instance, pod, node) (up == 1) + record: count:up1 +- condition: '{{ true }}' + expr: count without(instance, pod, node) (up == 0) + record: count:up0 diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-node-recording.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-node-recording.rules.yaml new file mode 100644 index 00000000..ee7e514e --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-prometheus-node-recording.rules.yaml @@ -0,0 +1,21 @@ +condition: '{{ true }}' +name: kube-prometheus-node-recording.rules +rules: +- condition: '{{ true }}' + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum +- condition: '{{ true }}' + expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum +- condition: '{{ true }}' + expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum +- condition: '{{ true }}' + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio +- condition: '{{ true }}' + expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m +- condition: '{{ true }}' + expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) + record: cluster:node_cpu:ratio diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-scheduler.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-scheduler.rules.yaml new file mode 100644 index 00000000..51cdb0e4 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-scheduler.rules.yaml @@ -0,0 +1,48 @@ +condition: '{{ .Values.kubeScheduler.enabled }}' +name: kube-scheduler.rules +rules: +- condition: '{{ true }}' + expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: 
cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.99' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.9' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + labels: + quantile: '0.5' + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-state-metrics.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-state-metrics.yaml new file mode 100644 index 00000000..e392b7a0 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kube-state-metrics.yaml @@ -0,0 +1,55 @@ +condition: '{{ true }}' +name: kube-state-metrics +rules: +- alert: KubeStateMetricsListErrors + annotations: + description: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors' + summary: 'kube-state-metrics is experiencing errors in list operations.' + condition: '{{ true }}' + expr: |- + (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }}) + / + sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }})) + > 0.01 + for: 15m + labels: + severity: critical +- alert: KubeStateMetricsWatchErrors + annotations: + description: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. 
This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors' + summary: 'kube-state-metrics is experiencing errors in watch operations.' + condition: '{{ true }}' + expr: |- + (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }}) + / + sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }})) + > 0.01 + for: 15m + labels: + severity: critical +- alert: KubeStateMetricsShardingMismatch + annotations: + description: 'kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch' + summary: 'kube-state-metrics sharding is misconfigured.' + condition: '{{ true }}' + expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) != 0 + for: 15m + labels: + severity: critical +- alert: KubeStateMetricsShardsMissing + annotations: + description: 'kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing' + summary: 'kube-state-metrics shards are missing.' + condition: '{{ true }}' + expr: |- + 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - 1 + - + sum( 2 ^ max by (shard_ordinal,{{ .Values.global.clusterLabel }}) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by ({{ .Values.global.clusterLabel }}) + != 0 + for: 15m + labels: + severity: critical diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubelet.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubelet.rules.yaml new file mode 100644 index 00000000..98ea1a57 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubelet.rules.yaml @@ -0,0 +1,18 @@ +condition: '{{ .Values.kubelet.enabled }}' +name: kubelet.rules +rules: +- condition: '{{ true }}' + expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: '0.99' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: '0.9' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile +- condition: '{{ true }}' + expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) 
kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: '0.5' + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml new file mode 100644 index 00000000..4e398e37 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-apps.yaml @@ -0,0 +1,257 @@ +condition: '{{ true }}' +name: kubernetes-apps +rules: +- alert: KubePodCrashLooping + annotations: + description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping' + summary: 'Pod is crash looping.' + condition: '{{ true }}' + expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) >= 1 + for: 15m + labels: + severity: warning +- alert: KubePodNotReady + annotations: + description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready' + summary: 'Pod has been in a non-ready state for more than 15 minutes.' + condition: '{{ true }}' + expr: |- + sum by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + max by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}", phase=~"Pending|Unknown|Failed"} + ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(owner_kind) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) ( + 1, max by (namespace,pod,owner_kind,{{ .Values.global.clusterLabel }}) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning +- alert: KubeDeploymentGenerationMismatch + annotations: + description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch' + summary: 'Deployment generation mismatch due to possible roll-back' + condition: '{{ true }}' + expr: |- + kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + != + kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + for: 15m + labels: + severity: warning +- alert: KubeDeploymentReplicasMismatch + annotations: + description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch' + summary: 'Deployment has not matched the expected number of replicas.' 
+  condition: '{{ true }}'
+  expr: |-
+    (
+      kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
+        >
+      kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
+    ) and (
+      changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
+        ==
+      0
+    )
+  for: 15m
+  labels:
+    severity: warning
+- alert: KubeDeploymentRolloutStuck
+  annotations:
+    description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
+    summary: 'Deployment rollout is not progressing.'
+  condition: '{{ true }}'
+  expr: |-
+    kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
+    != 0
+  for: 15m
+  labels:
+    severity: warning
+- alert: KubeStatefulSetReplicasMismatch
+  annotations:
+    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
+    summary: 'StatefulSet has not matched the expected number of replicas.'
+  condition: '{{ true }}'
+  expr: |-
+    (
+      kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
+        !=
+      kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
+    ) and (
+      changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
+        ==
+      0
+    )
+  for: 15m
+  labels:
+    severity: warning
+- alert: KubeStatefulSetGenerationMismatch
+  annotations:
+    description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
+    summary: 'StatefulSet generation mismatch due to possible roll-back'
+  condition: '{{ true }}'
+  expr: |-
+    kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
+      !=
+    kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
+  for: 15m
+  labels:
+    severity: warning
+- alert: KubeStatefulSetUpdateNotRolledOut
+  annotations:
+    description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
+    summary: 'StatefulSet update has not been rolled out.'
+ condition: '{{ true }}' + expr: |- + ( + max by (namespace,statefulset,{{ .Values.global.clusterLabel }}) ( + kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning +- alert: KubeDaemonSetRolloutStuck + annotations: + description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck' + summary: 'DaemonSet rollout is stuck.' + condition: '{{ true }}' + expr: |- + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning +- alert: KubeContainerWaiting + annotations: + description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting' + summary: 'Pod container waiting longer than 1 hour' + condition: '{{ true }}' + expr: sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) > 0 + for: 1h + labels: + severity: warning +- alert: KubeDaemonSetNotScheduled + annotations: + description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled' + summary: 'DaemonSet pods are not scheduled.' 
+ condition: '{{ true }}' + expr: |- + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0 + for: 10m + labels: + severity: warning +- alert: KubeDaemonSetMisScheduled + annotations: + description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled' + summary: 'DaemonSet pods are misscheduled.' + condition: '{{ true }}' + expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0 + for: 15m + labels: + severity: warning +- alert: KubeJobNotCompleted + annotations: + description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted' + summary: 'Job did not complete in time' + condition: '{{ true }}' + expr: |- + time() - max by (namespace,job_name,{{ .Values.global.clusterLabel }}) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + and + kube_job_status_active{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0) > 43200 + labels: + severity: warning +- alert: KubeJobFailed + annotations: + description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed' + summary: 'Job failed to complete.' + condition: '{{ true }}' + expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0 + for: 15m + labels: + severity: warning +- alert: KubeHpaReplicasMismatch + annotations: + description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch' + summary: 'HPA has not matched desired number of replicas.' 
+ condition: '{{ true }}' + expr: |- + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[15m]) == 0 + for: 15m + labels: + severity: warning +- alert: KubeHpaMaxedOut + annotations: + description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout' + summary: 'HPA is running at max replicas' + condition: '{{ true }}' + expr: |- + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} + for: 15m + labels: + severity: warning diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml new file mode 100644 index 00000000..cf32b91d --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-resources.yaml @@ -0,0 +1,113 @@ +condition: '{{ true }}' +name: kubernetes-resources +rules: +- alert: KubeCPUOvercommit + annotations: + description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit' + summary: 'Cluster has overcommitted CPU resource requests.' + condition: '{{ true }}' + expr: |- + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0 + and + (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0 + for: 10m + labels: + severity: warning +- alert: KubeMemoryOvercommit + annotations: + description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit' + summary: 'Cluster has overcommitted memory resource requests.' 
+ condition: '{{ true }}' + expr: |- + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0 + and + (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0 + for: 10m + labels: + severity: warning +- alert: KubeCPUQuotaOvercommit + annotations: + description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit' + summary: 'Cluster has overcommitted CPU resource requests.' + condition: '{{ true }}' + expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ .Values.global.clusterLabel }}) + / + sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) + > 1.5 + for: 5m + labels: + severity: warning +- alert: KubeMemoryQuotaOvercommit + annotations: + description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Namespaces.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit' + summary: 'Cluster has overcommitted memory resource requests.' + condition: '{{ true }}' + expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ .Values.global.clusterLabel }}) + / + sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) + > 1.5 + for: 5m + labels: + severity: warning +- alert: KubeQuotaAlmostFull + annotations: + description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull' + summary: 'Namespace quota is going to be full.' + condition: '{{ true }}' + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: informational +- alert: KubeQuotaFullyUsed + annotations: + description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused' + summary: 'Namespace quota is fully used.' + condition: '{{ true }}' + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: informational +- alert: KubeQuotaExceeded + annotations: + description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.' 
+ runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded' + summary: 'Namespace quota has exceeded the limits.' + condition: '{{ true }}' + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning +- alert: CPUThrottlingHigh + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh' + summary: 'Processes experience elevated CPU throttling.' + condition: '{{ true }}' + expr: |- + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }}) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }}) + > ( 25 / 100 ) + for: 15m + labels: + severity: informational diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-storage.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-storage.yaml new file mode 100644 index 00000000..1e6703c0 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-storage.yaml @@ -0,0 +1,101 @@ +condition: '{{ true }}' +name: kubernetes-storage +rules: +- alert: KubePersistentVolumeFillingUp + annotations: + description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup' + summary: 'PersistentVolume is filling up.' + condition: '{{ true }}' + expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical +- alert: KubePersistentVolumeFillingUp + annotations: + description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup' + summary: 'PersistentVolume is filling up.' 
+ condition: '{{ true }}' + expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning +- alert: KubePersistentVolumeInodesFillingUp + annotations: + description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup' + summary: 'PersistentVolumeInodes are filling up.' + condition: '{{ true }}' + expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical +- alert: KubePersistentVolumeInodesFillingUp + annotations: + description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup' + summary: 'PersistentVolumeInodes are filling up.' 
+ condition: '{{ true }}' + expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }}) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning +- alert: KubePersistentVolumeErrors + annotations: + description: 'The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors' + summary: 'PersistentVolume is having issues with provisioning.' + condition: '{{ true }}' + expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-apiserver.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-apiserver.yaml new file mode 100644 index 00000000..6621da32 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-apiserver.yaml @@ -0,0 +1,62 @@ +condition: '{{ true }}' +name: kubernetes-system-apiserver +rules: +- alert: KubeClientCertificateExpiration + annotations: + description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration' + summary: 'Client certificate is about to expire.' + condition: '{{ true }}' + expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800 + for: 5m + labels: + severity: warning +- alert: KubeClientCertificateExpiration + annotations: + description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration' + summary: 'Client certificate is about to expire.' 
+ condition: '{{ true }}' + expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400 + for: 5m + labels: + severity: critical +- alert: KubeAggregatedAPIErrors + annotations: + description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors' + summary: 'Kubernetes aggregated API has reported errors.' + condition: '{{ true }}' + expr: sum by (name,namespace,{{ .Values.global.clusterLabel }})(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m])) > 4 + labels: + severity: warning +- alert: KubeAggregatedAPIDown + annotations: + description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown' + summary: 'Kubernetes aggregated API is down.' + condition: '{{ true }}' + expr: (1 - max by (name,namespace,{{ .Values.global.clusterLabel }})(avg_over_time(aggregator_unavailable_apiservice{job="kube-apiserver"}[10m]))) * 100 < 85 + for: 5m + labels: + severity: warning +- alert: KubeAPIDown + annotations: + description: 'KubeAPI has disappeared from Prometheus target discovery.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown' + summary: 'Target disappeared from Prometheus target discovery.' + condition: '{{ .Values.kubeApiServer.enabled }}' + expr: absent(up{job="kube-apiserver"} == 1) + for: 15m + labels: + severity: critical +- alert: KubeAPITerminatedRequests + annotations: + description: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests' + summary: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.' + condition: '{{ true }}' + expr: sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) ) > 0.20 + for: 5m + labels: + severity: warning diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-controller-manager.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-controller-manager.yaml new file mode 100644 index 00000000..e53aebed --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-controller-manager.yaml @@ -0,0 +1,13 @@ +condition: '{{ .Values.kubeControllerManager.enabled }}' +name: kubernetes-system-controller-manager +rules: +- alert: KubeControllerManagerDown + annotations: + description: 'KubeControllerManager has disappeared from Prometheus target discovery.' 
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown'
+    summary: 'Target disappeared from Prometheus target discovery.'
+  condition: '{{ .Values.kubeControllerManager.enabled }}'
+  expr: absent(up{job="kube-controller-manager"} == 1)
+  for: 15m
+  labels:
+    severity: critical
diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-kubelet.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-kubelet.yaml
new file mode 100644
index 00000000..77af3f58
--- /dev/null
+++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-kubelet.yaml
@@ -0,0 +1,136 @@
+condition: '{{ true }}'
+name: kubernetes-system-kubelet
+rules:
+- alert: KubeNodeNotReady
+  annotations:
+    description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready'
+    summary: 'Node is not ready.'
+  condition: '{{ true }}'
+  expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
+  for: 15m
+  labels:
+    severity: warning
+- alert: KubeNodeUnreachable
+  annotations:
+    description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable'
+    summary: 'Node is unreachable.'
+  condition: '{{ true }}'
+  expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
+  for: 15m
+  labels:
+    severity: warning
+- alert: KubeletTooManyPods
+  annotations:
+    description: 'Kubelet ''{{`{{`}} $labels.node {{`}}`}}'' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods'
+    summary: 'Kubelet is running at capacity.'
+  condition: '{{ true }}'
+  expr: |-
+    count by (node,{{ .Values.global.clusterLabel }}) (
+      (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,{{ .Values.global.clusterLabel }}) group_left(node) topk by (instance,pod,namespace,{{ .Values.global.clusterLabel }}) (1, kube_pod_info{job="kube-state-metrics"})
+    )
+    /
+    max by (node,{{ .Values.global.clusterLabel }}) (
+      kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
+    ) > 0.95
+  for: 15m
+  labels:
+    severity: informational
+- alert: KubeNodeReadinessFlapping
+  annotations:
+    description: 'The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.'
+    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping'
+    summary: 'Node readiness status is flapping.'
+  condition: '{{ true }}'
+  expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (node,{{ .Values.global.clusterLabel }}) > 2
+  for: 15m
+  labels:
+    severity: warning
+- alert: KubeletPlegDurationHigh
+  annotations:
+    description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
+ runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh' + summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.' + condition: '{{ true }}' + expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m + labels: + severity: warning +- alert: KubeletPodStartUpLatencyHigh + annotations: + description: 'Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh' + summary: 'Kubelet Pod startup latency is too high.' + condition: '{{ true }}' + expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }})) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + for: 15m + labels: + severity: warning +- alert: KubeletClientCertificateExpiration + annotations: + description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration' + summary: 'Kubelet client certificate is about to expire.' + condition: '{{ true }}' + expr: kubelet_certificate_manager_client_ttl_seconds < 604800 + labels: + severity: warning +- alert: KubeletClientCertificateExpiration + annotations: + description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration' + summary: 'Kubelet client certificate is about to expire.' + condition: '{{ true }}' + expr: kubelet_certificate_manager_client_ttl_seconds < 86400 + labels: + severity: critical +- alert: KubeletServerCertificateExpiration + annotations: + description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration' + summary: 'Kubelet server certificate is about to expire.' + condition: '{{ true }}' + expr: kubelet_certificate_manager_server_ttl_seconds < 604800 + labels: + severity: warning +- alert: KubeletServerCertificateExpiration + annotations: + description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration' + summary: 'Kubelet server certificate is about to expire.' + condition: '{{ true }}' + expr: kubelet_certificate_manager_server_ttl_seconds < 86400 + labels: + severity: critical +- alert: KubeletClientCertificateRenewalErrors + annotations: + description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors' + summary: 'Kubelet has failed to renew its client certificate.' 
+ condition: '{{ true }}' + expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning +- alert: KubeletServerCertificateRenewalErrors + annotations: + description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors' + summary: 'Kubelet has failed to renew its server certificate.' + condition: '{{ true }}' + expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0 + for: 15m + labels: + severity: warning +- alert: KubeletDown + annotations: + description: 'Kubelet has disappeared from Prometheus target discovery.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown' + summary: 'Target disappeared from Prometheus target discovery.' + condition: '{{ .Values.kubelet.enabled }}' + expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1) + for: 15m + labels: + severity: critical diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-scheduler.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-scheduler.yaml new file mode 100644 index 00000000..1a0983f2 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system-scheduler.yaml @@ -0,0 +1,13 @@ +condition: '{{ .Values.kubeScheduler.enabled }}' +name: kubernetes-system-scheduler +rules: +- alert: KubeSchedulerDown + annotations: + description: 'KubeScheduler has disappeared from Prometheus target discovery.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown' + summary: 'Target disappeared from Prometheus target discovery.' + condition: '{{ .Values.kubeScheduler.enabled }}' + expr: absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system.yaml new file mode 100644 index 00000000..677f4929 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/kubernetes-system.yaml @@ -0,0 +1,27 @@ +condition: '{{ true }}' +name: kubernetes-system +rules: +- alert: KubeVersionMismatch + annotations: + description: 'There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch' + summary: 'Different semantic versions of Kubernetes components running.' + condition: '{{ true }}' + expr: count by ({{ .Values.global.clusterLabel }}) (count by (git_version,{{ .Values.global.clusterLabel }}) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 + for: 15m + labels: + severity: warning +- alert: KubeClientErrors + annotations: + description: 'Kubernetes API server client ''{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}'' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.''' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors' + summary: 'Kubernetes API server client is experiencing errors.' 
+ condition: '{{ true }}' + expr: |- + (sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }}) + / + sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }})) + > 0.01 + for: 15m + labels: + severity: warning diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.rules.yaml new file mode 100644 index 00000000..77cfa5a1 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.rules.yaml @@ -0,0 +1,76 @@ +condition: '{{ true }}' +name: node-exporter.rules +rules: +- condition: '{{ true }}' + expr: |- + count without (cpu, mode) ( + node_cpu_seconds_total{job="node-exporter",mode="idle"} + ) + record: instance:node_num_cpu:sum +- condition: '{{ true }}' + expr: |- + 1 - avg without (cpu) ( + sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) + ) + record: instance:node_cpu_utilisation:rate5m +- condition: '{{ true }}' + expr: |- + ( + node_load1{job="node-exporter"} + / + instance:node_num_cpu:sum{job="node-exporter"} + ) + record: instance:node_load1_per_cpu:ratio +- condition: '{{ true }}' + expr: |- + 1 - ( + ( + node_memory_MemAvailable_bytes{job="node-exporter"} + or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + + node_memory_Cached_bytes{job="node-exporter"} + + + node_memory_MemFree_bytes{job="node-exporter"} + + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) + / + node_memory_MemTotal_bytes{job="node-exporter"} + ) + record: instance:node_memory_utilisation:ratio +- condition: '{{ true }}' + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) + record: instance:node_vmstat_pgmajfault:rate5m +- condition: '{{ true }}' + expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_seconds:rate5m +- condition: '{{ true }}' + expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + record: instance_device:node_disk_io_time_weighted_seconds:rate5m +- condition: '{{ true }}' + expr: |- + sum without (device) ( + rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_bytes_excluding_lo:rate5m +- condition: '{{ true }}' + expr: |- + sum without (device) ( + rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_bytes_excluding_lo:rate5m +- condition: '{{ true }}' + expr: |- + sum without (device) ( + rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_receive_drop_excluding_lo:rate5m +- condition: '{{ true }}' + expr: |- + sum without (device) ( + rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) + ) + record: instance:node_network_transmit_drop_excluding_lo:rate5m diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.yaml new file mode 
100644 index 00000000..3df8ed27 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-exporter.yaml @@ -0,0 +1,336 @@ +condition: '{{ true }}' +name: node-exporter +rules: +- alert: NodeFilesystemSpaceFillingUp + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup' + summary: 'Filesystem is predicted to run out of space within the next 24 hours.' + condition: '{{ true }}' + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning +- alert: NodeFilesystemSpaceFillingUp + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup' + summary: 'Filesystem is predicted to run out of space within the next 4 hours.' + condition: '{{ true }}' + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical +- alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace' + summary: 'Filesystem has less than 5% space left.' + condition: '{{ true }}' + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning +- alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace' + summary: 'Filesystem has less than 3% space left.' 
+ condition: '{{ true }}' + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical +- alert: NodeFilesystemFilesFillingUp + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup' + summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.' + condition: '{{ true }}' + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning +- alert: NodeFilesystemFilesFillingUp + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup' + summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.' + condition: '{{ true }}' + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical +- alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles' + summary: 'Filesystem has less than 5% inodes left.' + condition: '{{ true }}' + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning +- alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles' + summary: 'Filesystem has less than 3% inodes left.' 
+ condition: '{{ true }}' + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical +- alert: NodeNetworkReceiveErrs + annotations: + description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs' + summary: 'Network interface is reporting many receive errors.' + condition: '{{ true }}' + expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning +- alert: NodeNetworkTransmitErrs + annotations: + description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs' + summary: 'Network interface is reporting many transmit errors.' + condition: '{{ true }}' + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 + for: 1h + labels: + severity: warning +- alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused' + summary: 'Number of conntrack are getting close to the limit.' + condition: '{{ true }}' + expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 + labels: + severity: warning +- alert: NodeTextFileCollectorScrapeError + annotations: + description: 'Node Exporter text file collector on {{`{{`}} $labels.instance {{`}}`}} failed to scrape.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror' + summary: 'Node Exporter text file collector failed to scrape.' + condition: '{{ true }}' + expr: node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning +- alert: NodeClockSkewDetected + annotations: + description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected' + summary: 'Clock skew detected.' + condition: '{{ true }}' + expr: |- + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning +- alert: NodeClockNotSynchronising + annotations: + description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising' + summary: 'Clock not synchronising.' 
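+  # Fires when node_timex_sync_status has reported "not synchronised" (0) within the last 5m and
+  # the kernel's estimated maximum clock error is at least 16s, sustained for 10 minutes.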
+ condition: '{{ true }}' + expr: |- + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning +- alert: NodeRAIDDegraded + annotations: + description: 'RAID array ''{{`{{`}} $labels.device {{`}}`}}'' at {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded' + summary: 'RAID Array is degraded.' + condition: '{{ true }}' + expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 + for: 15m + labels: + severity: critical +- alert: NodeRAIDDiskFailure + annotations: + description: 'At least one device in RAID array at {{`{{`}} $labels.instance {{`}}`}} failed. Array ''{{`{{`}} $labels.device {{`}}`}}'' needs attention and possibly a disk swap.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure' + summary: 'Failed device in RAID array.' + condition: '{{ true }}' + expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 + labels: + severity: warning +- alert: NodeFileDescriptorLimit + annotations: + description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit' + summary: 'Kernel is predicted to exhaust file descriptors limit soon.' + condition: '{{ true }}' + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning +- alert: NodeFileDescriptorLimit + annotations: + description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit' + summary: 'Kernel is predicted to exhaust file descriptors limit soon.' + condition: '{{ true }}' + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical +- alert: NodeCPUHighUsage + annotations: + description: 'CPU usage at {{`{{`}} $labels.instance {{`}}`}} has been above 90% for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. + + ' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodecpuhighusage' + summary: 'High CPU usage.' + condition: '{{ true }}' + expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90 + for: 15m + labels: + severity: informational +- alert: NodeSystemSaturation + annotations: + description: 'System load per core at {{`{{`}} $labels.instance {{`}}`}} has been above 2 for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. + + This might indicate this instance resources saturation and can cause it becoming unresponsive. + + ' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemsaturation' + summary: 'System saturated, load per core is very high.' 
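+  # "Load per core": node_load1 divided by the CPU count seen by node-exporter; a sustained value
+  # above 2 for 15m means the run queue is roughly twice the number of cores.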
+ condition: '{{ true }}' + expr: |- + node_load1{job="node-exporter"} + / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 + for: 15m + labels: + severity: warning +- alert: NodeMemoryMajorPagesFaults + annotations: + description: 'Memory major pages are occurring at very high rate at {{`{{`}} $labels.instance {{`}}`}}, 500 major page faults per second for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. + + Please check that there is enough memory available at this instance. + + ' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememorymajorpagesfaults' + summary: 'Memory major page faults are occurring at very high rate.' + condition: '{{ true }}' + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 + for: 15m + labels: + severity: warning +- alert: NodeMemoryHighUtilization + annotations: + description: 'Memory is filling up at {{`{{`}} $labels.instance {{`}}`}}, has been above 90% for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%. + + ' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememoryhighutilization' + summary: 'Host is running out of memory.' + condition: '{{ true }}' + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 + for: 15m + labels: + severity: warning +- alert: NodeDiskIOSaturation + annotations: + description: 'Disk IO queue (aqu-sq) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}, has been above 10 for the last 30 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}. + + This symptom might indicate disk saturation. + + ' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodediskiosaturation' + summary: 'Disk IO queue is high.' + condition: '{{ true }}' + expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10 + for: 30m + labels: + severity: warning +- alert: NodeSystemdServiceFailed + annotations: + description: 'Systemd service {{`{{`}} $labels.name {{`}}`}} has entered failed state at {{`{{`}} $labels.instance {{`}}`}}' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemdservicefailed' + summary: 'Systemd service has entered failed state.' + condition: '{{ true }}' + expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 + for: 5m + labels: + severity: warning +- alert: NodeBondingDegraded + annotations: + description: 'Bonding interface {{`{{`}} $labels.master {{`}}`}} on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more slave failures.' 
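+  # The expression below compares configured bond slaves with active ones: any non-zero difference
+  # held for 5m means at least one member interface of the bond is down.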
+ runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodebondingdegraded' + summary: 'Bonding interface is degraded' + condition: '{{ true }}' + expr: (node_bonding_slaves - node_bonding_active) != 0 + for: 5m + labels: + severity: warning diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-network.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-network.yaml new file mode 100644 index 00000000..d785e205 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node-network.yaml @@ -0,0 +1,13 @@ +condition: '{{ true }}' +name: node-network +rules: +- alert: NodeNetworkInterfaceFlapping + annotations: + description: 'Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}' + runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping' + summary: 'Network interface is often changing its status' + condition: '{{ true }}' + expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node.rules.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node.rules.yaml new file mode 100644 index 00000000..80e9fb18 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/files/rules/generated/node.rules.yaml @@ -0,0 +1,44 @@ +condition: '{{ true }}' +name: node.rules +rules: +- condition: '{{ true }}' + expr: |- + topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, + max by (node,namespace,pod,{{ .Values.global.clusterLabel }}) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' +- condition: '{{ true }}' + expr: |- + count by (node,{{ .Values.global.clusterLabel }}) ( + node_cpu_seconds_total{mode="idle",job="node-exporter"} + * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) + topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, node_namespace_pod:kube_pod_info:) + ) + record: node:node_num_cpu:sum +- condition: '{{ true }}' + expr: |- + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by ({{ .Values.global.clusterLabel }}) + record: :node_memory_MemAvailable_bytes:sum +- condition: '{{ true }}' + expr: |- + avg by (node,{{ .Values.global.clusterLabel }}) ( + sum without (mode) ( + rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m]) + ) + ) + record: node:node_cpu_utilization:ratio_rate5m +- condition: '{{ true }}' + expr: |- + avg by ({{ .Values.global.clusterLabel }}) ( + node:node_cpu_utilization:ratio_rate5m + ) + record: cluster:node_cpu:ratio_rate5m diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl new file mode 100644 index 00000000..4429e725 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/_helpers.tpl @@ -0,0 +1,458 @@ +{{- /* Expand 
the name of the chart. */ -}} +{{- define "victoria-metrics-k8s-stack.name" -}} + {{- $Chart := (.helm).Chart | default .Chart -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- default $Chart.Name $Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end }} + +{{- /* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/ -}} +{{- define "victoria-metrics-k8s-stack.fullname" -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- $Chart := (.helm).Chart | default .Chart -}} + {{- $Release := (.helm).Release | default .Release -}} + {{- $fullname := "" -}} + {{- if .appKey -}} + {{- $appKey := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}} + {{- $values := $Values -}} + {{- $global := (index $Values.global $Chart.Name) | default dict -}} + {{- range $ak := $appKey }} + {{- $values = (index $values $ak) | default dict -}} + {{- $global = (index $global $ak) | default dict -}} + {{- if $values.name -}} + {{- $fullname = $values.name -}} + {{- else if $global.name -}} + {{- $fullname = $global.name -}} + {{- end -}} + {{- end }} + {{- end -}} + {{- if empty $fullname -}} + {{- if $Values.fullnameOverride -}} + {{- $fullname = $Values.fullnameOverride -}} + {{- else if (dig $Chart.Name "fullnameOverride" "" ($Values.global)) -}} + {{- $fullname = (dig $Chart.Name "fullnameOverride" "" ($Values.global)) -}} + {{- else -}} + {{- $name := default $Chart.Name $Values.nameOverride -}} + {{- if contains $name $Release.Name -}} + {{- $fullname = $Release.Name -}} + {{- else -}} + {{- $fullname = (printf "%s-%s" $Release.Name $name) }} + {{- end -}} + {{- end }} + {{- end -}} + {{- $fullname | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- /* Create chart name and version as used by the chart label. */ -}} +{{- define "victoria-metrics-k8s-stack.chart" -}} + {{- $Chart := (.helm).Chart | default .Chart -}} + {{- printf "%s-%s" $Chart.Name $Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end }} + +{{- /* Create the name of the service account to use */ -}} +{{- define "victoria-metrics-k8s-stack.serviceAccountName" -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- if $Values.serviceAccount.create -}} + {{- default (include "victoria-metrics-k8s-stack.fullname" .) $Values.serviceAccount.name -}} + {{- else -}} + {{- default "default" $Values.serviceAccount.name -}} + {{- end }} +{{- end }} + +{{- /* Common labels */ -}} +{{- define "victoria-metrics-k8s-stack.labels" -}} + {{- $Release := (.helm).Release | default .Release -}} + {{- $Chart := (.helm).Chart | default .Chart -}} + {{- $labels := (fromYaml (include "victoria-metrics-k8s-stack.selectorLabels" .)) -}} + {{- $_ := set $labels "helm.sh/chart" (include "victoria-metrics-k8s-stack.chart" .) -}} + {{- $_ := set $labels "app.kubernetes.io/managed-by" $Release.Service -}} + {{- with $Chart.AppVersion }} + {{- $_ := set $labels "app.kubernetes.io/version" . 
-}} + {{- end -}} + {{- toYaml $labels -}} +{{- end }} + +{{- define "vm.release" -}} + {{- $Release := (.helm).Release | default .Release -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- default $Release.Name $Values.argocdReleaseOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{- /* Selector labels */ -}} +{{- define "victoria-metrics-k8s-stack.selectorLabels" -}} + {{- $labels := .extraLabels | default dict -}} + {{- $_ := set $labels "app.kubernetes.io/name" (include "victoria-metrics-k8s-stack.name" .) -}} + {{- $_ := set $labels "app.kubernetes.io/instance" (include "vm.release" .) -}} + {{- toYaml $labels -}} +{{- end }} + +{{- /* Create the name for VM service */ -}} +{{- define "vm.service" -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- $name := (include "victoria-metrics-k8s-stack.fullname" .) -}} + {{- with .appKey -}} + {{- $prefix := . -}} + {{- if kindIs "slice" $prefix }} + {{- $prefix = last $prefix -}} + {{- end -}} + {{- $prefix = ternary $prefix (printf "vm%s" $prefix) (hasPrefix "vm" $prefix) -}} + {{- $name = printf "%s-%s" $prefix $name -}} + {{- end -}} + {{- if hasKey . "appIdx" -}} + {{- $name = (printf "%s-%d.%s" $name .appIdx $name) -}} + {{- end -}} + {{- $name -}} +{{- end }} + +{{- define "vm.url" -}} + {{- $name := (include "vm.service" .) -}} + {{- $Release := (.helm).Release | default .Release -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- $ns := include "vm.namespace" . -}} + {{- $proto := "http" -}} + {{- $port := 80 -}} + {{- $path := .appRoute | default "/" -}} + {{- $isSecure := false -}} + {{- if .appSecure -}} + {{- $isSecure = .appSecure -}} + {{- end -}} + {{- if .appKey -}} + {{- $appKey := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}} + {{- $spec := $Values -}} + {{- range $ak := $appKey -}} + {{- if hasKey $spec $ak -}} + {{- $spec = (index $spec $ak) -}} + {{- end -}} + {{- if hasKey $spec "spec" -}} + {{- $spec = $spec.spec -}} + {{- end -}} + {{- end -}} + {{- $isSecure = (eq ($spec.extraArgs).tls "true") | default $isSecure -}} + {{- $proto = (ternary "https" "http" $isSecure) -}} + {{- $port = (ternary 443 80 $isSecure) -}} + {{- $port = $spec.port | default $port -}} + {{- $path = dig "http.pathPrefix" $path ($spec.extraArgs | default dict) -}} + {{- end -}} + {{- printf "%s://%s.%s.svc:%d%s" $proto $name $ns (int $port) $path -}} +{{- end -}} + +{{- define "vm.read.endpoint" -}} + {{- $ctx := . -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- $endpoint := default dict -}} + {{- if $Values.vmsingle.enabled -}} + {{- $_ := set $ctx "appKey" "vmsingle" -}} + {{- $_ := set $endpoint "url" (include "vm.url" $ctx) -}} + {{- else if $Values.vmcluster.enabled -}} + {{- $_ := set $ctx "appKey" (list "vmcluster" "vmselect") -}} + {{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}} + {{- $tenant := ($Values.tenant | default 0) -}} + {{- $_ := set $endpoint "url" (printf "%s/select/%d/prometheus" $baseURL (int $tenant)) -}} + {{- else if $Values.externalVM.read.url -}} + {{- $endpoint = $Values.externalVM.read -}} + {{- end -}} + {{- toYaml $endpoint -}} +{{- end }} + +{{- define "vm.write.endpoint" -}} + {{- $ctx := . 
-}} + {{- $Values := (.helm).Values | default .Values -}} + {{- $endpoint := default dict -}} + {{- if $Values.vmsingle.enabled -}} + {{- $_ := set $ctx "appKey" "vmsingle" -}} + {{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}} + {{- $_ := set $endpoint "url" (printf "%s/api/v1/write" $baseURL) -}} + {{- else if $Values.vmcluster.enabled -}} + {{- $_ := set $ctx "appKey" (list "vmcluster" "vminsert") -}} + {{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}} + {{- $tenant := ($Values.tenant | default 0) -}} + {{- $_ := set $endpoint "url" (printf "%s/insert/%d/prometheus/api/v1/write" $baseURL (int $tenant)) -}} + {{- else if $Values.externalVM.write.url -}} + {{- $endpoint = $Values.externalVM.write -}} + {{- end -}} + {{- toYaml $endpoint -}} +{{- end -}} + +{{- /* VMAlert remotes */ -}} +{{- define "vm.alert.remotes" -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- $remotes := default dict -}} + {{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}} + {{- $ctx := dict "helm" . -}} + {{- $remoteWrite := (include "vm.write.endpoint" $ctx | fromYaml) -}} + {{- if $Values.vmalert.remoteWriteVMAgent -}} + {{- $ctx := dict "helm" . "appKey" "vmagent" -}} + {{- $remoteWrite = dict "url" (printf "%s/api/v1/write" (include "vm.url" $ctx)) -}} + {{- end -}} + {{- $ctx := dict "helm" . -}} + {{- $remoteRead := (fromYaml (include "vm.read.endpoint" $ctx)) -}} + {{- $_ := set $remotes "remoteWrite" $remoteWrite -}} + {{- $_ := set $remotes "remoteRead" $remoteRead -}} + {{- $_ := set $remotes "datasource" $remoteRead -}} + {{- if $Values.vmalert.additionalNotifierConfigs }} + {{- $configName := printf "%s-vmalert-additional-notifier" $fullname -}} + {{- $notifierConfigRef := dict "name" $configName "key" "notifier-configs.yaml" -}} + {{- $_ := set $remotes "notifierConfigRef" $notifierConfigRef -}} + {{- else if $Values.alertmanager.enabled -}} + {{- $notifiers := default list -}} + {{- $appSecure := (not (empty (((.Values.alertmanager).spec).webConfig).tls_server_config)) -}} + {{- $ctx := dict "helm" . "appKey" "alertmanager" "appSecure" $appSecure "appRoute" ((.Values.alertmanager).spec).routePrefix -}} + {{- $alertManagerReplicas := (.Values.alertmanager.spec.replicaCount | default 1 | int) -}} + {{- range until $alertManagerReplicas -}} + {{- $_ := set $ctx "appIdx" . -}} + {{- $notifiers = append $notifiers (dict "url" (include "vm.url" $ctx)) -}} + {{- end }} + {{- $_ := set $remotes "notifiers" $notifiers -}} + {{- end -}} + {{- toYaml $remotes -}} +{{- end -}} + +{{- /* VMAlert templates */ -}} +{{- define "vm.alert.templates" -}} + {{- $Values := (.helm).Values | default .Values}} + {{- $cms := ($Values.vmalert.spec.configMaps | default list) -}} + {{- if $Values.vmalert.templateFiles -}} + {{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) 
-}} + {{- $cms = append $cms (printf "%s-vmalert-extra-tpl" $fullname) -}} + {{- end -}} + {{- $output := dict "configMaps" (compact $cms) -}} + {{- toYaml $output -}} +{{- end -}} + +{{- define "vm.license.global" -}} + {{- $license := (deepCopy (.Values.global).license) | default dict -}} + {{- if $license.key -}} + {{- if hasKey $license "keyRef" -}} + {{- $_ := unset $license "keyRef" -}} + {{- end -}} + {{- else if $license.keyRef.name -}} + {{- if hasKey $license "key" -}} + {{- $_ := unset $license "key" -}} + {{- end -}} + {{- else -}} + {{- $license = default dict -}} + {{- end -}} + {{- toYaml $license -}} +{{- end -}} + +{{- /* VMAlert spec */ -}} +{{- define "vm.alert.spec" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $extraArgs := dict "remoteWrite.disablePathAppend" "true" -}} + {{- if $Values.vmalert.templateFiles -}} + {{- $ruleTmpl := (printf "/etc/vm/configs/%s-vmalert-extra-tpl/*.tmpl" (include "victoria-metrics-k8s-stack.fullname" .)) -}} + {{- $_ := set $extraArgs "rule.templates" $ruleTmpl -}} + {{- end -}} + {{- $vmAlertRemotes := (include "vm.alert.remotes" . | fromYaml) -}} + {{- $vmAlertTemplates := (include "vm.alert.templates" . | fromYaml) -}} + {{- $spec := dict "extraArgs" $extraArgs -}} + {{- with (include "vm.license.global" .) -}} + {{- $_ := set $spec "license" (fromYaml .) -}} + {{- end -}} + {{- with concat ($vmAlertRemotes.notifiers | default list) (.Values.vmalert.spec.notifiers | default list) }} + {{- $_ := set $vmAlertRemotes "notifiers" . }} + {{- end }} + {{- $spec := deepCopy (omit $Values.vmalert.spec "notifiers") | mergeOverwrite $vmAlertRemotes | mergeOverwrite $vmAlertTemplates | mergeOverwrite $spec }} + {{- if not (or (hasKey $spec "notifier") (hasKey $spec "notifiers") (hasKey $spec "notifierConfigRef") (hasKey $spec.extraArgs "notifier.blackhole")) }} + {{- fail "Neither `notifier`, `notifiers` nor `notifierConfigRef` is set for vmalert. If it's intentionally please consider setting `.vmalert.spec.extraArgs.['notifier.blackhole']` to `'true'`"}} + {{- end }} + {{- tpl (deepCopy (omit $Values.vmalert.spec "notifiers") | mergeOverwrite $vmAlertRemotes | mergeOverwrite $vmAlertTemplates | mergeOverwrite $spec | toYaml) . -}} +{{- end }} + +{{- /* VM Agent remoteWrites */ -}} +{{- define "vm.agent.remote.write" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $remoteWrites := $Values.vmagent.additionalRemoteWrites | default list -}} + {{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled $Values.externalVM.write.url -}} + {{- $ctx := dict "helm" . -}} + {{- $remoteWrites = append $remoteWrites (fromYaml (include "vm.write.endpoint" $ctx)) -}} + {{- end -}} + {{- toYaml (dict "remoteWrite" $remoteWrites) -}} +{{- end -}} + +{{- /* VMAgent spec */ -}} +{{- define "vm.agent.spec" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $spec := (include "vm.agent.remote.write" . | fromYaml) -}} + {{- with (include "vm.license.global" .) -}} + {{- $_ := set $spec "license" (fromYaml .) -}} + {{- end -}} + {{- tpl (deepCopy $Values.vmagent.spec | mergeOverwrite $spec | toYaml) . -}} +{{- end }} + +{{- /* VMAuth spec */ -}} +{{- define "vm.auth.spec" -}} + {{- $ctx := . 
-}} + {{- $Values := (.helm).Values | default .Values }} + {{- $unauthorizedAccessConfig := default list }} + {{- if $Values.vmsingle.enabled -}} + {{- $_ := set $ctx "appKey" (list "vmsingle") -}} + {{- $url := (include "vm.url" $ctx) }} + {{- $srcPath := clean (printf "%s/.*" (urlParse $url).path) }} + {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list $url)) }} + {{- else if $Values.vmcluster.enabled -}} + {{- $_ := set $ctx "appKey" (list "vmcluster" "vminsert") -}} + {{- $writeUrl := (include "vm.url" $ctx) }} + {{- $writeSrcPath := clean (printf "%s/insert/.*" (urlParse $writeUrl).path) }} + {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $writeSrcPath) "url_prefix" (list $writeUrl)) }} + {{- $_ := set $ctx "appKey" (list "vmcluster" "vmselect") -}} + {{- $readUrl := (include "vm.url" $ctx) }} + {{- $readSrcPath := clean (printf "%s/select/.*" (urlParse $readUrl).path) }} + {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $readSrcPath) "url_prefix" (list $readUrl)) }} + {{- else if or $Values.externalVM.read.url $Values.externalVM.write.url }} + {{- with $Values.externalVM.read.url }} + {{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" }} + {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) }} + {{- end -}} + {{- with $Values.externalVM.write.url }} + {{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" }} + {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) }} + {{- end -}} + {{- end -}} + {{- $spec := $Values.vmauth.spec }} + {{- $_ := set $spec "unauthorizedAccessConfig" (concat $unauthorizedAccessConfig ($spec.unauthorizedAccessConfig | default list)) }} + {{- with (include "vm.license.global" .) -}} + {{- $_ := set $spec "license" (fromYaml .) -}} + {{- end -}} + {{- tpl (toYaml $spec) . -}} +{{- end -}} + +{{- /* Alermanager spec */ -}} +{{- define "vm.alertmanager.spec" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}} + {{- $spec := $Values.alertmanager.spec -}} + {{- if and (not $Values.alertmanager.spec.configRawYaml) (not $Values.alertmanager.spec.configSecret) -}} + {{- $_ := set $spec "configSecret" (printf "%s-alertmanager" $fullname) -}} + {{- end -}} + {{- $templates := default list -}} + {{- if $Values.alertmanager.monzoTemplate.enabled -}} + {{- $configMap := (printf "%s-alertmanager-monzo-tpl" $fullname) -}} + {{- $templates = append $templates (dict "name" $configMap "key" "monzo.tmpl") -}} + {{- end -}} + {{- $configMap := (printf "%s-alertmanager-extra-tpl" $fullname) -}} + {{- range $key, $value := (.Values.alertmanager.templateFiles | default dict) -}} + {{- $templates = append $templates (dict "name" $configMap "key" $key) -}} + {{- end -}} + {{- $_ := set $spec "templates" $templates -}} + {{- toYaml $spec -}} +{{- end -}} + +{{- /* Single spec */ -}} +{{- define "vm.single.spec" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $extraArgs := default dict -}} + {{- if $Values.vmalert.enabled }} + {{- $ctx := dict "helm" . 
"appKey" "vmalert" -}} + {{- $_ := set $extraArgs "vmalert.proxyURL" (include "vm.url" $ctx) -}} + {{- end -}} + {{- $spec := dict "extraArgs" $extraArgs -}} + {{- with (include "vm.license.global" .) -}} + {{- $_ := set $spec "license" (fromYaml .) -}} + {{- end -}} + {{- tpl (deepCopy $Values.vmsingle.spec | mergeOverwrite $spec | toYaml) . -}} +{{- end }} + +{{- /* Cluster spec */ -}} +{{- define "vm.select.spec" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $extraArgs := default dict -}} + {{- if $Values.vmalert.enabled -}} + {{- $ctx := dict "helm" . "appKey" "vmalert" -}} + {{- $_ := set $extraArgs "vmalert.proxyURL" (include "vm.url" $ctx) -}} + {{- end -}} + {{- $spec := dict "extraArgs" $extraArgs -}} + {{- toYaml $spec -}} +{{- end -}} + +{{- define "vm.cluster.spec" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $spec := (include "vm.select.spec" . | fromYaml) -}} + {{- $clusterSpec := (deepCopy $Values.vmcluster.spec) -}} + {{- with (include "vm.license.global" .) -}} + {{- $_ := set $clusterSpec "license" (fromYaml .) -}} + {{- end -}} + {{- tpl ($clusterSpec | mergeOverwrite (dict "vmselect" $spec) | toYaml) . -}} +{{- end -}} + +{{- define "vm.data.source.enabled" -}} + {{- $Values := (.helm).Values | default .Values -}} + {{- $grafana := $Values.grafana -}} + {{- $isEnabled := false -}} + {{- if $grafana.plugins -}} + {{- range $value := $grafana.plugins -}} + {{- if contains "victoriametrics-datasource" $value -}} + {{- $isEnabled = true -}} + {{- end }} + {{- end }} + {{- end }} + {{- $unsignedPlugins := ((index $grafana "grafana.ini").plugins).allow_loading_unsigned_plugins | default "" -}} + {{- $allowUnsigned := contains "victoriametrics-datasource" $unsignedPlugins -}} + {{- ternary "true" "" (and $isEnabled $allowUnsigned) -}} +{{- end -}} + +{{- /* Datasources */ -}} +{{- define "vm.data.sources" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $grafana := $Values.grafana -}} + {{- $datasources := $Values.grafana.additionalDataSources | default list -}} + {{- $vmDatasource := "victoriametrics-datasource" -}} + {{- $allowVMDatasource := (ternary false true (empty (include "vm.data.source.enabled" .))) -}} + {{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled -}} + {{- $ctx := dict "helm" . 
-}} + {{- $readEndpoint:= (include "vm.read.endpoint" $ctx | fromYaml) -}} + {{- $defaultDatasources := default list -}} + {{- range $ds := $grafana.sidecar.datasources.default }} + {{- if not $ds.type -}} + {{- $_ := set $ds "type" $Values.grafana.defaultDatasourceType }} + {{- end -}} + {{- if or (ne $ds.type $vmDatasource) $allowVMDatasource -}} + {{- $_ := set $ds "url" $readEndpoint.url -}} + {{- $defaultDatasources = append $defaultDatasources $ds -}} + {{- end -}} + {{- end }} + {{- $datasources = concat $datasources $defaultDatasources -}} + {{- if and $grafana.sidecar.datasources.createVMReplicasDatasources $defaultDatasources -}} + {{- range $id := until (int $Values.vmsingle.spec.replicaCount) -}} + {{- $_ := set $ctx "appIdx" $id -}} + {{- $readEndpoint := (include "vm.read.endpoint" $ctx | fromYaml) -}} + {{- range $ds := $defaultDatasources -}} + {{- $ds = (deepCopy $ds) -}} + {{- $_ := set $ds "url" $readEndpoint.url -}} + {{- $_ := set $ds "name" (printf "%s-%d" $ds.name $id) -}} + {{- $_ := set $ds "isDefault" false -}} + {{- $datasources = append $datasources $ds -}} + {{- end -}} + {{- end -}} + {{- end -}} + {{- end -}} + {{- toYaml $datasources -}} +{{- end }} + +{{- /* VMRule name */ -}} +{{- define "victoria-metrics-k8s-stack.rulegroup.name" -}} + {{- printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) (.name | replace "_" "") -}} +{{- end -}} + +{{- /* VMRule labels */ -}} +{{- define "victoria-metrics-k8s-stack.rulegroup.labels" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $labels := (fromYaml (include "victoria-metrics-k8s-stack.labels" .)) -}} + {{- $_ := set $labels "app" (include "victoria-metrics-k8s-stack.name" .) -}} + {{- $labels = mergeOverwrite $labels (deepCopy $Values.defaultRules.labels) -}} + {{- toYaml $labels -}} +{{- end }} + +{{- /* VMRule key */ -}} +{{- define "victoria-metrics-k8s-stack.rulegroup.key" -}} + {{- without (regexSplit "[-_.]" .name -1) "exporter" "rules" | join "-" | camelcase | untitle -}} +{{- end -}} + +{{- /* VMAlertmanager name */ -}} +{{- define "victoria-metrics-k8s-stack.alertmanager.name" -}} + {{- $Values := (.helm).Values | default .Values }} + {{- $Values.alertmanager.name | default (printf "%s-%s" "vmalertmanager" (include "victoria-metrics-k8s-stack.fullname" .) | trunc 63 | trimSuffix "-") -}} +{{- end -}} diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/extra-objects.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/extra-objects.yaml new file mode 100644 index 00000000..a9bb3b6b --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/extra-objects.yaml @@ -0,0 +1,4 @@ +{{ range .Values.extraObjects }} +--- +{{ tpl (toYaml .) 
$ }} +{{ end }} diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml new file mode 100644 index 00000000..afa2900c --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml @@ -0,0 +1,121 @@ +{{- if .Values.defaultRules.create }} + +{{- /* +Default rules alias +*/}} +{{- $defaultRules := .Values.defaultRules -}} + +{{- /* +Exact rules overrides +*/}} +{{- $exactRules := index $defaultRules "rules" | default dict }} + +{{- /* +Create custom template context +*/}} +{{- $ctx := (dict "Values" (deepCopy .Values) "Release" (deepCopy .Release) "Chart" (deepCopy .Chart) "Template" (deepCopy .Template)) -}} + +{{- /* +Loop through all rules files, that were crafted with hack/sync_rules.py +*/}} +{{- range $groupFile, $_ := .Files.Glob "files/rules/**.yaml" -}} + +{{- /* +Get group name from file +*/}} +{{- $groupBase := base $groupFile -}} +{{- $groupNameRaw := trimSuffix (ext $groupBase) $groupBase -}} + +{{- /* +Create context for templating +*/}} +{{- $_ := set $ctx "name" $groupNameRaw -}} + +{{- /* +Create sanitized group name retrieved from file +*/}} +{{- $groupName := include "victoria-metrics-k8s-stack.rulegroup.key" $ctx -}} + +{{- /* +Merge common group with a group data of a current iteration +*/}} + +{{- $group := mergeOverwrite (deepCopy (dig "group" (default dict) $defaultRules)) (dig "groups" $groupName (default dict) $defaultRules) -}} + +{{- /* +Get group data from file +*/}} +{{- $groupCtx := mergeOverwrite (deepCopy $ctx) $group }} +{{- $groupData := fromYaml (tpl ($.Files.Get $groupFile) $groupCtx) -}} + +{{- /* +Save rules spec from file +*/}} +{{- $rulesSpec := $groupData.rules -}} + +{{- /* +Delete rules from group +*/}} +{{- $_ := unset $groupData "rules" -}} + +{{- /* +Save condition for group from file +*/}} +{{- $groupCondition := (eq $groupData.condition "true") -}} + +{{- /* +Delete condition from group +*/}} +{{- $_ := unset $groupData "condition" -}} + +{{- /* +Merge group spec +*/}} +{{- $groupSpec := mergeOverwrite (deepCopy $groupData) (dig "spec" (default dict) $group) -}} + +{{- /* +Filter out ignore rules +*/}} +{{- $commonRule := dig "rule" (default dict) $defaultRules }} +{{- $commonInGroupRule := dig "rules" (default dict) $group }} +{{- $filteredRulesSpec := default list }} +{{- range $_, $ruleSpec := $rulesSpec }} + {{- $ruleName := $ruleSpec.alert | default "" }} + {{- $ruleKey := (hasKey $ruleSpec "record" | ternary "recording" "alerting") -}} + {{- $ruleCondition := (eq $ruleSpec.condition "true") }} + {{- $_ := unset $ruleSpec "condition" }} + {{- $exactRule := index $exactRules $ruleName | default dict }} + {{- $defaultRule := deepCopy (index $defaultRules $ruleKey) }} + {{- $resultRule := mergeOverwrite (deepCopy $commonRule) $defaultRule $commonInGroupRule $exactRule }} + {{- if (and (dig "create" true $resultRule) $ruleCondition) }} + {{- $ruleSpec := mergeOverwrite (deepCopy $ruleSpec) (dig "spec" (default dict) $resultRule) }} + {{- $filteredRulesSpec = append $filteredRulesSpec $ruleSpec }} + {{- end }} +{{- end -}} +{{- $rulesSpec = $filteredRulesSpec }} + +{{- /* +Check if group is enabled +*/}} +{{- if (and $rulesSpec (dig "create" true $group) $groupCondition) }} +--- +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMRule +metadata: + name: {{ include "victoria-metrics-k8s-stack.rulegroup.name" $ctx }} + {{- with (include 
"victoria-metrics-k8s-stack.rulegroup.labels" $ctx) }} + labels: {{ . | nindent 4 }} + {{- end }} + {{- with $.Values.defaultRules.annotations }} + annotations: {{ toYaml . | nindent 4 }} + {{- end }} +spec: + groups: + - {{ toYaml $groupSpec | indent 4 | trim }} + rules: + {{- range $ruleSpec := $rulesSpec }} + - {{ toYaml $ruleSpec | indent 6 | trim }} + {{- end }} +{{- end }} +{{- end }} +{{- end }} diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/todo.md b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/todo.md new file mode 100644 index 00000000..33f0a470 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/todo.md @@ -0,0 +1,26 @@ +### plans + +* [x] VMCluster +* [x] VMSingle +* [x] VMAgent +* [x] VMAlert +* [x] AlertManager +* [x] Annotations +* [x] ServiceScrapes + * [x] Nodeexporter + * [x] Grafana + * [x] kube-state-metrics + * [x] kube-mixin + * [x] core-dns +* [x] Grafana DS +* [x] Dashboards + * [x] Nodeexporter + * [x] kube-state-metrics + * [x] kube-mixin +* [x] Rules + * [x] kube-mixin + * [x] kube-prometheus + * [x] victoria-metrics +* [ ] ServiceAccounts stuff +* [ ] SelectorOvverride for ServiceScrapes +* [ ] helm hook for uninstall crd objects before chart remove diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/values.minikube.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/values.minikube.yaml new file mode 100644 index 00000000..f4438de3 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/values.minikube.yaml @@ -0,0 +1,38 @@ +vmagent: + spec: + volumes: + - hostPath: + path: /var/lib/minikube/certs/etcd + type: DirectoryOrCreate + name: etcd-certs + volumeMounts: + - mountPath: /var/lib/minikube/certs/etcd + name: etcd-certs +kubeScheduler: + spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true +kubeControllerManager: + spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecureSkipVerify: true +kubeEtcd: + spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/lib/minikube/certs/etcd/ca.crt + certFile: /var/lib/minikube/certs/etcd/peer.crt + keyFile: /var/lib/minikube/certs/etcd/peer.key diff --git a/packages/system/monitoring/charts/victoria-metrics-k8s-stack/values.yaml b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/values.yaml new file mode 100644 index 00000000..778f7e00 --- /dev/null +++ b/packages/system/monitoring/charts/victoria-metrics-k8s-stack/values.yaml @@ -0,0 +1,1233 @@ +global: + clusterLabel: cluster + license: + key: "" + keyRef: {} + # name: secret-license + # key: license + +nameOverride: "" +fullnameOverride: "" +tenant: "0" +# -- If this chart is used in "Argocd" with "releaseName" field then +# -- VMServiceScrapes couldn't select the proper services. +# -- For correct working need set value 'argocdReleaseOverride=$ARGOCD_APP_NAME' +argocdReleaseOverride: "" + +# -- victoria-metrics-operator dependency chart configuration. 
+# -- For possible values refer to https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator#parameters +# -- also checkout here possible ENV variables to configure operator behaviour https://docs.victoriametrics.com/operator/vars +victoria-metrics-operator: + enabled: true + serviceMonitor: + enabled: true + crd: + # -- we disable crd creation by operator chart as we create them in this chart + create: false + + # -- tells helm to clean up vm cr resources when uninstalling + cleanup: + enabled: true + image: + repository: bitnami/kubectl + # use image tag that matches k8s API version by default + # tag: 1.29.6 + pullPolicy: IfNotPresent + operator: + # -- By default, operator converts prometheus-operator objects. + disable_prometheus_converter: false + +serviceAccount: + # -- Specifies whether a service account should be created + create: true + # -- Annotations to add to the service account + annotations: {} + # -- The name of the service account to use. + # -- If not set and create is true, a name is generated using the fullname template + name: "" + +# -- Enable dashboards despite it's dependency is not installed +dashboards: + vmalert: false + operator: false + # -- in ArgoCD using client-side apply this dashboard reaches annotations size limit and causes k8s issues without server side apply + # See [this issue](https://github.com/VictoriaMetrics/helm-charts/tree/disable-node-exporter-dashboard-by-default/charts/victoria-metrics-k8s-stack#metadataannotations-too-long-must-have-at-most-262144-bytes-on-dashboards) + node-exporter-full: true + +# -- Create default rules for monitoring the cluster +defaultRules: + create: true + + # -- Common properties for VMRule groups + group: + spec: + # -- Optional HTTP URL parameters added to each rule request + params: {} + + # -- Common properties for all VMRules + rule: + spec: + # -- Additional labels for all VMRules + labels: {} + # -- Additional annotations for all VMRules + annotations: {} + + # -- Common properties for VMRules alerts + alerting: + spec: + # -- Additional labels for VMRule alerts + labels: {} + # -- Additional annotations for VMRule alerts + annotations: {} + + # -- Common properties for VMRules recording rules + recording: + spec: + # -- Additional labels for VMRule recording rules + labels: {} + # -- Additional annotations for VMRule recording rules + annotations: {} + + # -- Per rule properties + rules: {} + # CPUThrottlingHigh: + # create: true + # spec: + # for: 15m + # labels: + # severity: critical + groups: + etcd: + create: true + # -- Common properties for all rules in a group + rules: {} + # spec: + # annotations: + # dashboard: https://example.com/dashboard/1 + general: + create: true + rules: {} + k8sContainerMemoryRss: + create: true + rules: {} + k8sContainerMemoryCache: + create: true + rules: {} + k8sContainerCpuUsageSecondsTotal: + create: true + rules: {} + k8sPodOwner: + create: true + rules: {} + k8sContainerResource: + create: true + rules: {} + k8sContainerMemoryWorkingSetBytes: + create: true + rules: {} + k8sContainerMemorySwap: + create: true + rules: {} + kubeApiserver: + create: true + rules: {} + kubeApiserverAvailability: + create: true + rules: {} + kubeApiserverBurnrate: + create: true + rules: {} + kubeApiserverHistogram: + create: true + rules: {} + kubeApiserverSlos: + create: true + rules: {} + kubelet: + create: true + rules: {} + kubePrometheusGeneral: + create: true + rules: {} + kubePrometheusNodeRecording: + create: true + rules: {} + kubernetesApps: + 
create: true + rules: {} + targetNamespace: ".*" + kubernetesResources: + create: true + rules: {} + kubernetesStorage: + create: true + rules: {} + targetNamespace: ".*" + kubernetesSystem: + create: true + rules: {} + kubernetesSystemKubelet: + create: true + rules: {} + kubernetesSystemApiserver: + create: true + rules: {} + kubernetesSystemControllerManager: + create: true + rules: {} + kubeScheduler: + create: true + rules: {} + kubernetesSystemScheduler: + create: true + rules: {} + kubeStateMetrics: + create: true + rules: {} + nodeNetwork: + create: true + rules: {} + node: + create: true + rules: {} + vmagent: + create: true + rules: {} + vmsingle: + create: true + rules: {} + vmcluster: + create: true + rules: {} + vmHealth: + create: true + rules: {} + vmoperator: + create: true + rules: {} + alertmanager: + create: true + rules: {} + + # -- Runbook url prefix for default rules + runbookUrl: https://runbooks.prometheus-operator.dev/runbooks + + # -- Labels for default rules + labels: {} + # -- Annotations for default rules + annotations: {} + +# -- Create default dashboards +defaultDashboardsEnabled: true + +# -- Create experimental dashboards +experimentalDashboardsEnabled: true + +# -- Create dashboards as CRDs (reuqires grafana-operator to be installed) +grafanaOperatorDashboardsFormat: + enabled: false + instanceSelector: + matchLabels: + dashboards: "grafana" + allowCrossNamespaceImport: false + +## -- Provide custom recording or alerting rules to be deployed into the cluster. +additionalVictoriaMetricsMap: +# rule-name: +# groups: +# - name: my_group +# rules: +# - record: my_record +# expr: 100 * my_record + +externalVM: + read: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + write: + url: "" + # bearerTokenSecret: + # name: dbaas-read-access-token + # key: bearerToken + +############## + +# Configures vmsingle params +vmsingle: + annotations: {} + enabled: true + # -- full spec for VMSingle CRD. Allowed values describe [here](https://docs.victoriametrics.com/operator/api#vmsinglespec) + spec: + port: "8429" + image: + tag: v1.103.0 + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicaCount: 1 + extraArgs: {} + storage: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmsingle.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmsingle-ingress-tls + # hosts: + # - vmsingle.domain.com + +vmcluster: + enabled: false + annotations: {} + # -- full spec for VMCluster CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmclusterspec) + spec: + # -- Data retention period. Possible units character: h(ours), d(ays), w(eeks), y(ears), if no unit character specified - month. The minimum retention period is 24h. See these [docs](https://docs.victoriametrics.com/single-server-victoriametrics/#retention) + retentionPeriod: "1" + replicationFactor: 2 + vmstorage: + image: + tag: v1.103.0-cluster + replicaCount: 2 + storageDataPath: "/vm-data" + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 10Gi + resources: + {} + # limits: + # cpu: "1" + # memory: 1500Mi + vmselect: + port: "8481" + image: + tag: v1.103.0-cluster + replicaCount: 2 + cacheMountPath: "/select-cache" + extraArgs: {} + storage: + volumeClaimTemplate: + spec: + resources: + requests: + storage: 2Gi + resources: + {} + # limits: + # cpu: "1" + # memory: "1000Mi" + # requests: + # cpu: "0.5" + # memory: "500Mi" + vminsert: + port: "8480" + image: + tag: v1.103.0-cluster + replicaCount: 2 + extraArgs: {} + resources: + {} + # limits: + # cpu: "1" + # memory: 1000Mi + # requests: + # cpu: "0.5" + # memory: "500Mi" + + ingress: + storage: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmstorage.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmstorage-ingress-tls + # hosts: + # - vmstorage.domain.com + select: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: '{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vmselect }}' + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmselect.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmselect-ingress-tls + # hosts: + # - vmselect.domain.com + insert: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: '{{ dig "extraArgs" "http.pathPrefix" "/" .Values.vmcluster.spec.vminsert }}' + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vminsert.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vminsert-ingress-tls + # hosts: + # - vminsert.domain.com + +alertmanager: + enabled: true + annotations: {} + # -- (object) full spec for VMAlertmanager CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmalertmanagerspec) + spec: + port: "9093" + selectAllByDefault: true + image: + tag: v0.25.0 + externalURL: "" + routePrefix: / + + # -- (string) if this one defined, it will be used for alertmanager configuration and config parameter will be ignored + configSecret: "" + # -- (object) alertmanager configuration + config: + templates: + - "/etc/vm/configs/**/*.tmpl" + route: + # group_by: ["alertgroup", "job"] + # group_wait: 30s + # group_interval: 5m + # repeat_interval: 12h + receiver: "blackhole" + ## routes: + ################################################### + ## Duplicate code_owner routes to teams + ## These will send alerts to team channels but continue + ## processing through the rest of the tree to handled by on-call + # - matchers: + # - code_owner_channel!="" + # - severity=~"info|warning|critical" + # group_by: ["code_owner_channel", "alertgroup", "job"] + # receiver: slack-code-owners + # ################################################### + # ## Standard on-call routes + # - matchers: + # - severity=~"info|warning|critical" + # receiver: slack-monitoring + # continue: true + + # inhibit_rules: + # - target_matchers: + # - severity=~"warning|info" + # source_matchers: + # - severity=critical + # equal: + # - cluster + # - namespace + # - alertname + # - target_matchers: + # - severity=info + # source_matchers: + # - severity=warning + # equal: + # - cluster + # - namespace + # - alertname + # - target_matchers: + # - severity=info + # source_matchers: + # - alertname=InfoInhibitor + # equal: + # - cluster + # - namespace + + receivers: + - name: blackhole + # - name: "slack-monitoring" + # slack_configs: + # - channel: "#channel" + # send_resolved: true + # title: '{{ template "slack.monzo.title" . }}' + # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + # color: '{{ template "slack.monzo.color" . }}' + # text: '{{ template "slack.monzo.text" . 
}}' + # actions: + # - type: button + # text: "Runbook :green_book:" + # url: "{{ (index .Alerts 0).Annotations.runbook_url }}" + # - type: button + # text: "Query :mag:" + # url: "{{ (index .Alerts 0).GeneratorURL }}" + # - type: button + # text: "Dashboard :grafana:" + # url: "{{ (index .Alerts 0).Annotations.dashboard }}" + # - type: button + # text: "Silence :no_bell:" + # url: '{{ template "__alert_silence_link" . }}' + # - type: button + # text: '{{ template "slack.monzo.link_button_text" . }}' + # url: "{{ .CommonAnnotations.link_url }}" + # - name: slack-code-owners + # slack_configs: + # - channel: "#{{ .CommonLabels.code_owner_channel }}" + # send_resolved: true + # title: '{{ template "slack.monzo.title" . }}' + # icon_emoji: '{{ template "slack.monzo.icon_emoji" . }}' + # color: '{{ template "slack.monzo.color" . }}' + # text: '{{ template "slack.monzo.text" . }}' + # actions: + # - type: button + # text: "Runbook :green_book:" + # url: "{{ (index .Alerts 0).Annotations.runbook }}" + # - type: button + # text: "Query :mag:" + # url: "{{ (index .Alerts 0).GeneratorURL }}" + # - type: button + # text: "Dashboard :grafana:" + # url: "{{ (index .Alerts 0).Annotations.dashboard }}" + # - type: button + # text: "Silence :no_bell:" + # url: '{{ template "__alert_silence_link" . }}' + # - type: button + # text: '{{ template "slack.monzo.link_button_text" . }}' + # url: "{{ .CommonAnnotations.link_url }}" + # + # better alert templates for slack + # source https://gist.github.com/milesbxf/e2744fc90e9c41b47aa47925f8ff6512 + monzoTemplate: + enabled: true + + # -- (object) extra alert templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! + # {{- end }} + # template_2.tmpl: "" + + # -- (object) alertmanager ingress configuration + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: '{{ .Values.alertmanager.spec.routePrefix | default "/" }}' + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - alertmanager.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: alertmanager-ingress-tls + # hosts: + # - alertmanager.domain.com + +vmalert: + annotations: {} + enabled: true + + # Controls whether VMAlert should use VMAgent or VMInsert as a target for remotewrite + remoteWriteVMAgent: false + # -- (object) full spec for VMAlert CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmalertspec) + spec: + port: "8080" + selectAllByDefault: true + image: + tag: v1.103.0 + evaluationInterval: 15s + extraArgs: + http.pathPrefix: "/" + + # External labels to add to all generated recording rules and alerts + externalLabels: {} + + # -- (object) extra vmalert annotation templates + templateFiles: + {} + # template_1.tmpl: |- + # {{ define "hello" -}} + # hello, Victoria! 
+ # {{- end }} + # template_2.tmpl: "" + + ## additionalNotifierConfigs allows to configure static notifiers, discover notifiers via Consul and DNS, + ## see specification in https://docs.victoriametrics.com/vmalert/#notifier-configuration-file. + ## This configuration will be created as separate secret and mounted to vmalert pod. + additionalNotifierConfigs: {} + # dns_sd_configs: + # - names: + # - my.domain.com + # type: 'A' + # port: 9093 + # -- (object) vmalert ingress config + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmalert.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmalert-ingress-tls + # hosts: + # - vmalert.domain.com + +vmauth: + enabled: false + annotations: {} + # -- (object) full spec for VMAuth CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmauthspec) + spec: + discover_backend_ips: true + port: "8427" + +vmagent: + enabled: true + annotations: {} + # -- remoteWrite configuration of VMAgent, allowed parameters defined in a [spec](https://docs.victoriametrics.com/operator/api#vmagentremotewritespec) + additionalRemoteWrites: + [] + #- url: http://some-remote-write/api/v1/write + # -- (object) full spec for VMAgent CRD. Allowed values described [here](https://docs.victoriametrics.com/operator/api#vmagentspec) + spec: + port: "8429" + selectAllByDefault: true + image: + tag: v1.103.0 + scrapeInterval: 20s + externalLabels: {} + # For multi-cluster setups it is useful to use "cluster" label to identify the metrics source. + # For example: + # cluster: cluster-name + extraArgs: + promscrape.streamParse: "true" + # Do not store original labels in vmagent's memory by default. This reduces the amount of memory used by vmagent + # but makes vmagent debugging UI less informative. See: https://docs.victoriametrics.com/vmagent/#relabel-debug + promscrape.dropOriginalLabels: "true" + # -- (object) vmagent ingress configuration + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: "" + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - vmagent.domain.com + # -- Extra paths to prepend to every host configuration. This is useful when working with annotation based services. 
+ extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: vmagent-ingress-tls + # hosts: + # - vmagent.domain.com + +################################################# +### dependencies ##### +################################################# +# Grafana dependency chart configuration. For possible values refer to https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration +grafana: + enabled: true + ## all values for grafana helm chart can be specified here + sidecar: + datasources: + enabled: true + # -- list of default prometheus compatible datasource configurations. + # VM `url` will be added to each of them in templates and `type` will be set to defaultDatasourceType if not defined + default: + - name: VictoriaMetrics + isDefault: true + - name: VictoriaMetrics (DS) + isDefault: false + type: victoriametrics-datasource + initDatasources: true + createVMReplicasDatasources: false + dashboards: + provider: + name: default + orgid: 1 + additionalDashboardLabels: {} + folder: /var/lib/grafana/dashboards + defaultFolderName: default + additionalDashboardAnnotations: {} + enabled: true + multicluster: false + + ## ForceDeployDatasource Create datasource configmap even if grafana deployment has been disabled + forceDeployDatasource: false + + ## Configure additional grafana datasources (passed through tpl) + ## ref: http://docs.grafana.org/administration/provisioning/#datasources + additionalDataSources: [] + # - name: prometheus-sample + # access: proxy + # basicAuth: true + # basicAuthPassword: pass + # basicAuthUser: daco + # editable: false + # jsonData: + # tlsSkipVerify: true + # orgId: 1 + # type: prometheus + # url: https://{{ printf "%s-prometheus.svc" .Release.Name }}:9090 + # version: 1 + + defaultDashboardsTimezone: utc + + # Enabling VictoriaMetrics Datasource in Grafana. See more details here: https://github.com/VictoriaMetrics/grafana-datasource/blob/main/README.md#victoriametrics-datasource-for-grafana + # Note that Grafana will need internet access to install the datasource plugin. + # Uncomment the block below, if you want to enable VictoriaMetrics Datasource in Grafana: + #plugins: + # - "https://github.com/VictoriaMetrics/grafana-datasource/releases/download/v0.5.0/victoriametrics-datasource-v0.5.0.zip;victoriametrics-datasource" + #grafana.ini: + # plugins: + # # Why VictoriaMetrics datasource is unsigned: https://github.com/VictoriaMetrics/grafana-datasource/blob/main/README.md#why-victoriametrics-datasource-is-unsigned + # allow_loading_unsigned_plugins: victoriametrics-datasource + + # Change datasource type in dashboards from Prometheus to VictoriaMetrics. 
+ # you can use `victoriametrics-datasource` instead of `prometheus` if enabled VictoriaMetrics Datasource above + defaultDatasourceType: "prometheus" + + ingress: + enabled: false + # For Kubernetes >= 1.18 you should specify the ingress-controller via the field ingressClassName + # See https://kubernetes.io/blog/2020/04/02/improvements-to-the-ingress-api-in-kubernetes-1.18/#specifying-the-class-of-an-ingress + # ingressClassName: nginx + # Values can be templated + annotations: + {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + labels: {} + path: / + # pathType is only for k8s > 1.19 + pathType: Prefix + + hosts: + - grafana.domain.com + ## Extra paths to prepend to every host configuration. This is useful when working with annotation based services. + extraPaths: [] + # - path: /* + # backend: + # serviceName: ssl-redirect + # servicePort: use-annotation + ## Or for k8s > 1.19 + # - path: /* + # pathType: Prefix + # backend: + # service: + # name: ssl-redirect + # port: + # name: service + tls: [] + # - secretName: grafana-ingress-tls + # hosts: + # - grafana.domain.com + + # -- grafana VM scrape config + vmScrape: + # whether we should create a service scrape resource for grafana + enabled: true + + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Grafana + spec: + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "grafana.name" .Subcharts.grafana }}' + endpoints: + - port: '{{ .Values.grafana.service.portName }}' + +# prometheus-node-exporter dependency chart configuration. For possible values refer to https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-node-exporter/values.yaml +prometheus-node-exporter: + enabled: true + + # all values for prometheus-node-exporter helm chart can be specified here + service: + ## Add the 'node-exporter' label to be used by serviceMonitor to match standard common usage in rules and grafana dashboards + ## + labels: + jobLabel: node-exporter + extraArgs: + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ + # -- node exporter VM scrape config + vmScrape: + # whether we should create a service scrape resource for node-exporter + enabled: true + + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Node Exporter + spec: + jobLabel: jobLabel + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "prometheus-node-exporter.name" (index .Subcharts "prometheus-node-exporter") }}' + endpoints: + - port: metrics + metricRelabelConfigs: + - action: drop + source_labels: [mountpoint] + regex: "/var/lib/kubelet/pods.+" +# kube-state-metrics dependency chart configuration. 
For possible values refer to https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-state-metrics/values.yaml +kube-state-metrics: + enabled: true + ## all values for kube-state-metrics helm chart can be specified here + + # -- [Scrape configuration](https://docs.victoriametrics.com/operator/api#vmservicescrapespec) for Kube State Metrics + vmScrape: + enabled: true + spec: + selector: + matchLabels: + app.kubernetes.io/name: '{{ include "kube-state-metrics.name" (index .Subcharts "kube-state-metrics") }}' + app.kubernetes.io/instance: '{{ include "vm.release" . }}' + endpoints: + - port: http + honorLabels: true + metricRelabelConfigs: + - action: labeldrop + regex: (uid|container_id|image_id) + jobLabel: app.kubernetes.io/name + +################################################# +### Service Monitors ##### +################################################# +# Component scraping the kubelets +kubelet: + enabled: true + vmScrapes: + # -- Enable scraping /metrics/cadvisor from kubelet's service + cadvisor: + enabled: true + spec: + path: /metrics/cadvisor + # -- Enable scraping /metrics/probes from kubelet's service + probes: + enabled: true + spec: + path: /metrics/probes + kubelet: + spec: {} + # -- spec for VMNodeScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmnodescrapespec + vmScrape: + kind: VMNodeScrape + spec: + scheme: "https" + honorLabels: true + interval: "30s" + scrapeTimeout: "5s" + tlsConfig: + insecureSkipVerify: true + caFile: "/var/run/secrets/kubernetes.io/serviceaccount/ca.crt" + bearerTokenFile: "/var/run/secrets/kubernetes.io/serviceaccount/token" + # drop high cardinality label and useless metrics for cadvisor and kubelet + metricRelabelConfigs: + - action: labeldrop + regex: (uid) + - action: labeldrop + regex: (id|name) + - action: drop + source_labels: [__name__] + regex: (rest_client_request_duration_seconds_bucket|rest_client_request_duration_seconds_sum|rest_client_request_duration_seconds_count) + relabelConfigs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - sourceLabels: [__metrics_path__] + targetLabel: metrics_path + - targetLabel: "job" + replacement: "kubelet" + # ignore timestamps of cadvisor's metrics by default + # more info here https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4697#issuecomment-1656540535 + honorTimestamps: false +# Component scraping the kube api server +kubeApiServer: + enabled: true + # -- spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + vmScrape: + spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes + +# Component scraping the kube controller manager +kubeControllerManager: + enabled: true + + ## If your kube controller manager is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## If using kubeControllerManager.endpoints only the port and targetPort are used + ## + service: + enabled: true + port: 10257 + targetPort: 10257 + selector: + component: kube-controller-manager + + # -- spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec 
+ vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: + - "kube-system" + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + +# Component scraping kubeDns. Use either this or coreDns +kubeDns: + enabled: false + # -- + service: + enabled: false + ports: + dnsmasq: + port: 10054 + targetPort: 10054 + skydns: + port: 10055 + targetPort: 10055 + selector: + k8s-app: kube-dns + # -- spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - port: http-metrics-dnsmasq + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + - port: http-metrics-skydns + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +# Component scraping coreDns. Use either this or kubeDns +coreDns: + enabled: true + service: + enabled: true + port: 9153 + targetPort: 9153 + selector: + k8s-app: kube-dns + + # -- spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - port: http-metrics + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + +# Component scraping etcd +kubeEtcd: + enabled: true + + ## If your etcd is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## Etcd service. If using kubeEtcd.endpoints only the port and targetPort are used + ## + service: + enabled: true + port: 2379 + targetPort: 2379 + selector: + component: etcd + + # -- spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# Component scraping kube scheduler +kubeScheduler: + enabled: true + + ## If your kube scheduler is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + ## If using kubeScheduler.endpoints only the port and targetPort are used + ## + service: + enabled: true + port: 10259 + targetPort: 10259 + selector: + component: kube-scheduler + + # -- spec for VMServiceScrape crd + # https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +# Component scraping kube proxy +kubeProxy: + enabled: false + + ## If your kube proxy is not deployed as a pod, specify IPs it can be found on + ## + endpoints: [] + # - 10.141.4.22 + # - 10.141.4.23 + # - 10.141.4.24 + + service: + enabled: true + port: 10249 + targetPort: 10249 + selector: + k8s-app: kube-proxy + + # -- spec for VMServiceScrape crd + # 
https://docs.victoriametrics.com/operator/api.html#vmservicescrapespec + vmScrape: + spec: + jobLabel: jobLabel + namespaceSelector: + matchNames: [kube-system] + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # bearerTokenSecret: + # key: "" + port: http-metrics + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + +## install vm operator crds +crds: + enabled: true + +## install prometheus operator crds +prometheus-operator-crds: + enabled: false + +# -- Add extra objects dynamically to this chart +extraObjects: [] diff --git a/packages/system/monitoring/templates/cadvisor-scrape.yaml b/packages/system/monitoring/templates/cadvisor-scrape.yaml index 4961b7b0..a3f2698a 100644 --- a/packages/system/monitoring/templates/cadvisor-scrape.yaml +++ b/packages/system/monitoring/templates/cadvisor-scrape.yaml @@ -21,6 +21,8 @@ spec: relabelConfigs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) + - action: labeldrop + regex: '.*node_kubevirt_io.*' - sourceLabels: [__metrics_path__] targetLabel: metrics_path - replacement: cadvisor @@ -30,3 +32,4 @@ spec: tlsConfig: caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt insecureSkipVerify: true + diff --git a/packages/system/monitoring/templates/kubelet-scrape.yaml b/packages/system/monitoring/templates/kubelet-scrape.yaml index 33c5c6ed..bb06b8d0 100644 --- a/packages/system/monitoring/templates/kubelet-scrape.yaml +++ b/packages/system/monitoring/templates/kubelet-scrape.yaml @@ -21,6 +21,8 @@ spec: relabelConfigs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) + - action: labeldrop + regex: '.*node_kubevirt_io.*' - sourceLabels: [__metrics_path__] targetLabel: metrics_path - replacement: kubelet @@ -51,6 +53,8 @@ spec: relabelConfigs: - action: labelmap regex: __meta_kubernetes_node_label_(.+) + - action: labeldrop + regex: '.*node_kubevirt_io.*' - sourceLabels: - __metrics_path__ targetLabel: metrics_path diff --git a/packages/system/monitoring/templates/vmagent.yaml b/packages/system/monitoring/templates/vmagent.yaml index 1c686b28..46dc9e7b 100644 --- a/packages/system/monitoring/templates/vmagent.yaml +++ b/packages/system/monitoring/templates/vmagent.yaml @@ -8,6 +8,7 @@ spec: cluster: cozystack extraArgs: promscrape.streamParse: "true" + promscrape.maxScrapeSize: 32MB remoteWrite: - url: http://vminsert-shortterm.tenant-root.svc:8480/insert/0/prometheus/api/v1/write - url: http://vminsert-longterm.tenant-root.svc:8480/insert/0/prometheus/api/v1/write diff --git a/packages/system/monitoring/values.yaml b/packages/system/monitoring/values.yaml index 6bd9064d..1b505397 100644 --- a/packages/system/monitoring/values.yaml +++ b/packages/system/monitoring/values.yaml @@ -1,3 +1,7 @@ +global: + victoria-metrics-k8s-stack: + fullnameOverride: alerts + metrics-server: defaultArgs: - --cert-dir=/tmp @@ -62,3 +66,19 @@ fluent-bit: Operation lift Nested_under kubernetes Add_prefix kubernetes_ + +victoria-metrics-k8s-stack: + externalVM: + read: + url: "http://vmselect-shortterm.tenant-root.svc:8480/select/0/prometheus/api/v1/read" + write: + url: "http://vminsert-shortterm.tenant-root.svc:8480/insert/0/prometheus/api/v1/write" + + grafana: + enabled: false + kube-state-metrics: + enabled: false + prometheus-node-exporter: + enabled: false + alertmanager: + name: vmalertmanager-alertmanager
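
Note on wiring alert receivers: the commented-out Slack receivers in the alertmanager block above can be activated through the same `victoria-metrics-k8s-stack` values passthrough this patch adds to packages/system/monitoring/values.yaml. A minimal sketch, assuming the chart merges `alertmanager.config` exactly as shown in its defaults above; the channel name and webhook URL are placeholders, not part of this patch:

victoria-metrics-k8s-stack:
  alertmanager:
    config:
      route:
        receiver: slack-monitoring   # overrides the default "blackhole" receiver
      receivers:
        - name: blackhole            # restate the default sink: Helm replaces lists, it does not merge them
        - name: slack-monitoring
          slack_configs:
            - channel: "#alerts"                                    # placeholder channel
              api_url: "https://hooks.slack.com/services/REDACTED"  # placeholder webhook
              send_resolved: true

Because Helm merges maps but replaces lists wholesale, the `receivers` list in such an override must restate every receiver that should remain active, not only the newly added one.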