Add basic alerting system (#355)

![alerta](https://github.com/user-attachments/assets/87f792c1-0e1f-4070-84b1-7335cc0e7810)


- Remove grafana-oncall
- Add Alerta
- Configure basic alerts (a minimal values sketch for Telegram notifications is shown below)
- Update Grafana 10 --> 11
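
For illustration, here is a minimal tenant `values.yaml` sketch that enables the new Alerta Telegram notifications. The parameter names follow the `alerta` section added in this PR; the token and chat IDs below are placeholders, not real credentials.

```yaml
# Sketch only: structure mirrors the new alerta values section,
# token and chat IDs are placeholders.
alerta:
  storage: 10Gi          # Persistent Volume size for the alerta-db cluster
  storageClassName: ""   # empty string -> cluster default StorageClass
  alerts:
    telegram:
      token: "<telegram-bot-token>"      # bot token issued by @BotFather
      chatID: "<chat-id-1>,<chat-id-2>"  # one or more chat IDs, comma-separated
```

When both `token` and `chatID` are set, the chart enables Alerta's `telegram` plugin and injects the corresponding environment variables into the deployment (see the template below).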

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

- **New Features**
  - Added new configuration options for the Alerta service, enhancing user customization.
  - Introduced a new Helm chart for the VictoriaMetrics Kubernetes stack, enabling comprehensive monitoring solutions.
  - Added VMAuth feature for enhanced authentication in the Kubernetes stack.

- **Bug Fixes**
  - Fixed issues with the ETCD dashboard and improved ingress path prefix handling.

- **Documentation**
  - Updated README and release guide for the VictoriaMetrics stack with installation and configuration instructions.
  - Introduced a changelog for organized tracking of changes.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
Author: Andrei Kvapil
Date: 2024-09-26 10:56:53 +02:00 (committed by GitHub)
Commit: ec27a19afb (parent b82ba4ac3c)

99 changed files with 8226 additions and 3724 deletions

View File

@@ -4,9 +4,12 @@
### Common parameters
| Name | Description | Value |
| ----------------- | --------------------------------------------------------------------------------------------------------- | ------- |
| `host` | The hostname used to access the grafana externally (defaults to 'grafana' subdomain for the tenant host). | `""` |
| `metricsStorages` | Configuration of metrics storage instances | `[]` |
| `logsStorages` | Configuration of logs storage instances | `[]` |
| `oncall.enabled` | Enable Grafana OnCall | `false` |
| Name | Description | Value |
| ------------------------------- | --------------------------------------------------------------------------------------------------------- | ------------------------------------------------ |
| `host` | The hostname used to access the grafana externally (defaults to 'grafana' subdomain for the tenant host). | `""` |
| `metricsStorages` | Configuration of metrics storage instances | `[]` |
| `logsStorages` | Configuration of logs storage instances | `[]` |
| `alerta.storage` | Persistent Volume size for alerta database | `10Gi` |
| `alerta.storageClassName` | StorageClass used to store the data | `""` |
| `alerta.alerts.telegram.token`  | Telegram token for your bot                                                          | `""` |
| `alerta.alerts.telegram.chatID` | Specify multiple IDs separated by commas. Get yours at https://t.me/chatid_echo_bot | `""` |

View File

@@ -0,0 +1,16 @@
---
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
name: alerta-db
spec:
instances: 2
storage:
size: {{ required ".Values.alerta.storage is required" .Values.alerta.storage }}
{{- with .Values.alerta.storageClassName }}
storageClass: {{ . }}
{{- end }}
inheritedMetadata:
labels:
policy.cozystack.io/allow-to-apiserver: "true"

View File

@@ -0,0 +1,214 @@
{{- $myNS := lookup "v1" "Namespace" "" .Release.Namespace }}
{{- $ingress := index $myNS.metadata.annotations "namespace.cozystack.io/ingress" }}
{{- $host := index $myNS.metadata.annotations "namespace.cozystack.io/host" }}
{{- $apiKey := randAlphaNum 32 }}
{{- $existingSecret := lookup "v1" "Secret" .Release.Namespace "alerta" }}
{{- if $existingSecret }}
{{- $apiKey = index $existingSecret.data "alerta-api-key" | b64dec }}
{{- end }}
---
apiVersion: v1
kind: Secret
metadata:
name: alerta
labels:
app: alerta
type: Opaque
data:
alerta-api-key: {{ $apiKey | b64enc }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: alerta
labels:
app: alerta
data:
config.js: |
// contents of config.js
'use strict';
angular.module('config', [])
.constant('config', {
'endpoint' : "/api",
'provider' : "basic"
})
.constant('colors', {});
---
apiVersion: v1
kind: Service
metadata:
name: alerta
labels:
app: alerta
spec:
type: ClusterIP
ports:
- port: 80
targetPort: http
protocol: TCP
name: http
selector:
app: alerta
release: alerta
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: alerta
labels:
app: alerta
spec:
replicas: 1
selector:
matchLabels:
app: alerta
release: alerta
template:
metadata:
labels:
app: alerta
release: alerta
spec:
containers:
- name: alerta
image: "alerta/alerta-web:9.0.4"
imagePullPolicy: IfNotPresent
env:
- name: ADMIN_USERS
valueFrom:
secretKeyRef:
key: user
name: grafana-admin-password
- name: ADMIN_PASSWORD
valueFrom:
secretKeyRef:
key: password
name: grafana-admin-password
- name: ADMIN_KEY
valueFrom:
secretKeyRef:
name: alerta
key: alerta-api-key
- name: ALERTA_API_KEY
valueFrom:
secretKeyRef:
name: alerta
key: alerta-api-key
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: alerta-db-app
key: uri
- name: AUTH_REQUIRED
value: "True"
{{- if and .Values.alerta.alerts.telegram.chatID .Values.alerta.alerts.telegram.token }}
- name: "PLUGINS"
value: "telegram"
- name: TELEGRAM_CHAT_ID
value: "{{ .Values.alerta.alerts.telegram.chatID }}"
- name: TELEGRAM_TOKEN
value: "{{ .Values.alerta.alerts.telegram.token }}"
- name: TELEGRAM_WEBHOOK_URL
value: "https://alerta.infra.aenix.org/api/webhooks/telegram?api-key={{ $apiKey }}"
{{- end }}
ports:
- name: http
containerPort: 8080
protocol: TCP
livenessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 300
readinessProbe:
httpGet:
path: /
port: http
initialDelaySeconds: 5
volumeMounts:
- name: alerta-config
mountPath: /app/config.js
subPath: config.js
volumes:
- name: alerta-config
configMap:
name: alerta
---
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: alerta
labels:
app: alerta
annotations:
acme.cert-manager.io/http01-ingress-class: {{ $ingress }}
cert-manager.io/cluster-issuer: letsencrypt-prod
spec:
ingressClassName: {{ $ingress }}
tls:
- hosts:
- "{{ .Values.host | default (printf "alerta.%s" $host) }}"
secretName: alerta-tls
rules:
- host: "{{ .Values.host | default (printf "alerta.%s" $host) }}"
http:
paths:
- path: /
pathType: ImplementationSpecific
backend:
service:
name: alerta
port:
name: http
---
apiVersion: v1
kind: Secret
metadata:
name: alertmanager
type: Opaque
stringData:
alertmanager.yaml: |
global:
resolve_timeout: 5m
route:
group_by: [alertname, namespace, cluster]
group_wait: 15s
group_interval: 15s
repeat_interval: 1h
receiver: 'alerta'
routes:
- receiver: alerta_no_resolve
repeat_interval: 1m
matchers:
- 'event="Heartbeat"'
- receiver: blackhole
matchers:
- 'severity="none"'
- receiver: blackhole
matchers:
- 'alertname=~"CDIDefaultStorageClassDegraded|CDINoDefaultStorageClass"'
receivers:
- name: blackhole
- name: "alerta"
webhook_configs:
- url: 'http://alerta/api/webhooks/prometheus?api-key={{ $apiKey }}'
send_resolved: true
- name: "alerta_no_resolve"
webhook_configs:
- url: 'http://alerta/api/webhooks/prometheus?api-key={{ $apiKey }}'
send_resolved: false
---
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMAlertmanager
metadata:
name: alertmanager
spec:
replicaCount: 3
configSecret: alertmanager
podMetadata:
labels:
policy.cozystack.io/allow-to-apiserver: "true"
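
Note: the `event="Heartbeat"` matcher in the Alertmanager config above assumes a continuously firing heartbeat alert defined elsewhere in the stack. As a hedged sketch (not part of this diff), such a rule could be expressed as a VictoriaMetrics `VMRule`:

```yaml
# Illustrative only; the actual heartbeat rule lives outside this file.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: heartbeat
spec:
  groups:
    - name: heartbeat
      interval: 30s
      rules:
        - alert: Heartbeat
          expr: vector(1)        # always firing
          labels:
            event: Heartbeat     # picked up by the alerta_no_resolve route above
```

With `send_resolved: false` and a 1m repeat interval on that route, Alerta keeps receiving the alert regularly, which is the usual way to drive a heartbeat/watchdog check.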

View File

@@ -9,6 +9,7 @@ rules:
- ingresses
resourceNames:
- grafana-ingress
- alerta
verbs: ["get", "list", "watch"]
- apiGroups:
- ""
@@ -23,4 +24,5 @@ rules:
- services
resourceNames:
- grafana-service
- alerta
verbs: ["get", "list", "watch"]

View File

@@ -56,7 +56,7 @@ spec:
mountPath: /var/lib/grafana
containers:
- name: grafana
image: grafana/grafana:10.1.0
image: grafana/grafana:11.2.0
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: false
@@ -64,7 +64,7 @@ spec:
failureThreshold: 3
env:
- name: GF_INSTALL_PLUGINS
value: grafana-worldmap-panel,flant-statusmap-panel,grafana-oncall-app,natel-discrete-panel
value: grafana-worldmap-panel,flant-statusmap-panel,natel-discrete-panel
- name: ONCALL_API_URL
value: http://grafana-oncall-engine:8080
- name: GF_DATABASE_HOST

View File

@@ -1,15 +0,0 @@
{{- if .Values.oncall.enabled }}
---
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
name: grafana-oncall-db
spec:
instances: 2
storage:
size: 10Gi
inheritedMetadata:
labels:
policy.cozystack.io/allow-to-apiserver: "true"
{{- end }}

View File

@@ -1,66 +0,0 @@
{{- if .Values.oncall.enabled }}
---
apiVersion: databases.spotahome.com/v1
kind: RedisFailover
metadata:
name: grafana-oncall
labels:
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
sentinel:
replicas: 3
resources:
requests:
cpu: 100m
limits:
memory: 100Mi
redis:
replicas: 2
resources:
requests:
cpu: 150m
memory: 400Mi
limits:
cpu: 2
memory: 1000Mi
storage:
persistentVolumeClaim:
metadata:
name: redisfailover-persistent-data
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 5Gi
exporter:
enabled: true
image: oliver006/redis_exporter:v1.55.0-alpine
args:
- --web.telemetry-path
- /metrics
env:
- name: REDIS_EXPORTER_LOG_FORMAT
value: txt
customConfig:
- tcp-keepalive 0
- loglevel notice
auth:
secretPath: grafana-oncall-redis-password
---
apiVersion: v1
kind: Secret
metadata:
name: grafana-oncall-redis-password
labels:
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
data:
{{- $existingSecret := lookup "v1" "Secret" .Release.Namespace "grafana-oncall-redis-password" }}
{{- if $existingSecret }}
password: {{ index $existingSecret.data "password" }}
{{- else }}
password: {{ randAlphaNum 32 | b64enc }}
{{- end }}
{{- end }}

View File

@@ -1,40 +0,0 @@
{{- if .Values.oncall.enabled }}
{{- $myNS := lookup "v1" "Namespace" "" .Release.Namespace }}
{{- $host := index $myNS.metadata.annotations "namespace.cozystack.io/host" }}
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: grafana-oncall-system
labels:
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
spec:
chart:
spec:
chart: cozy-grafana-oncall
reconcileStrategy: Revision
sourceRef:
kind: HelmRepository
name: cozystack-system
namespace: cozy-system
version: '*'
interval: 1m0s
timeout: 5m0s
values:
oncall:
fullnameOverride: grafana-oncall
externalGrafana:
url: "https://{{ .Values.host | default (printf "grafana.%s" $host) }}/"
externalPostgresql:
host: grafana-oncall-db-rw
db_name: app
user: app
existingSecret: grafana-oncall-db-app
passwordKey: password
externalRedis:
host: rfrm-grafana-oncall
existingSecret: {{ .Release.Name }}-oncall-redis-password
passwordKey: password
{{- end }}

View File

@@ -11,7 +11,7 @@ spec:
extraArgs:
remoteWrite.disablePathAppend: "true"
notifiers:
- url: http://vmalertmanager.{{ $.Release.Namespace }}.svc:9093
- url: http://vmalertmanager-alertmanager.{{ $.Release.Namespace }}.svc:9093
remoteRead:
url: http://vmselect-{{ .name }}.{{ $.Release.Namespace }}.svc:8481/select/0/prometheus
remoteWrite:

View File

@@ -1,32 +0,0 @@
apiVersion: v1
kind: Secret
metadata:
name: alertmanager
type: Opaque
stringData:
alertmanager.yaml: |
global:
resolve_timeout: 5m
route:
group_by:
- job
- alertname
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: 'webhook'
receivers:
- name: 'webhook'
webhook_configs:
- url: http://{{ .Release.Name }}-oncall-engine.{{ .Release.Namespace }}.svc:8080/integrations/v1/alertmanager/Kjb2NWxxSlgGtxz9F4ihovQBB/
---
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMAlertmanager
metadata:
name: alertmanager
spec:
replicaCount: 2
configSecret: alertmanager
podMetadata:
labels:
policy.cozystack.io/allow-to-apiserver: "true"

View File

@@ -23,13 +23,38 @@
"type": "object"
}
},
"oncall": {
"alerta": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean",
"description": "Enable Grafana OnCall",
"default": false
"storage": {
"type": "string",
"description": "Persistent Volume size for alerta database",
"default": "10Gi"
},
"storageClassName": {
"type": "string",
"description": "StorageClass used to store the data",
"default": ""
},
"alerts": {
"type": "object",
"properties": {
"telegram": {
"type": "object",
"properties": {
"token": {
"type": "string",
"description": "telegram token for your bot",
"default": "7262461387:AAGtwq16iwuVtWtzoN6TUEMpF00fpC9Xz34"
},
"chatID": {
"type": "string",
"description": "specify multiple ID's separated by comma. Get yours in https://t.me/chatid_echo_bot",
"default": "-4520856007"
}
}
}
}
}
}
}

View File

@@ -25,7 +25,22 @@ logsStorages:
storage: 10Gi
storageClassName: replicated
## @param oncall.enabled Enable Grafana OnCall
##
oncall:
enabled: false
## Configuration for Alerta
## @param alerta.storage Persistent Volume size for alerta database
## @param alerta.storageClassName StorageClass used to store the data
##
alerta:
storage: 10Gi
storageClassName: ""
alerts:
## @param alerta.alerts.telegram.token Telegram token for your bot
## @param alerta.alerts.telegram.chatID Specify multiple IDs separated by commas. Get yours at https://t.me/chatid_echo_bot
## example:
## telegram:
## token: "7262461387:AAGtwq16iwuVtWtzoN6TUEMpF00fpC9Xz34"
## chatID: "-4520856007"
##
telegram:
token: ""
chatID: ""

View File

@@ -1,3 +0,0 @@
apiVersion: v2
name: cozy-grafana-oncall
version: 0.0.0 # Placeholder, the actual version will be automatically set during the build process

View File

@@ -1,10 +0,0 @@
NAME=grafana-oncall-system
include ../../../scripts/package.mk
update:
rm -rf charts
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update grafana
helm pull grafana/oncall --untar --untardir charts
rm -rf charts/oncall/charts

View File

@@ -1,27 +0,0 @@
dependencies:
- name: cert-manager
repository: https://charts.jetstack.io
version: v1.8.0
- name: mariadb
repository: https://charts.bitnami.com/bitnami
version: 12.2.5
- name: postgresql
repository: https://charts.bitnami.com/bitnami
version: 11.9.10
- name: rabbitmq
repository: https://charts.bitnami.com/bitnami
version: 12.0.0
- name: redis
repository: https://charts.bitnami.com/bitnami
version: 16.13.2
- name: grafana
repository: https://grafana.github.io/helm-charts
version: 6.57.1
- name: ingress-nginx
repository: https://kubernetes.github.io/ingress-nginx
version: 4.1.4
- name: prometheus
repository: https://prometheus-community.github.io/helm-charts
version: 25.8.2
digest: sha256:edc9fef449a694cd319135e37ac84f8247ac9ad0c48ac86099dae4e428beb7b7
generated: "2024-01-26T17:54:48.132209769Z"

View File

@@ -1,39 +0,0 @@
apiVersion: v2
appVersion: v1.3.94
dependencies:
- condition: cert-manager.enabled
name: cert-manager
repository: https://charts.jetstack.io
version: v1.8.0
- condition: mariadb.enabled
name: mariadb
repository: https://charts.bitnami.com/bitnami
version: 12.2.5
- condition: postgresql.enabled
name: postgresql
repository: https://charts.bitnami.com/bitnami
version: 11.9.10
- condition: rabbitmq.enabled
name: rabbitmq
repository: https://charts.bitnami.com/bitnami
version: 12.0.0
- condition: redis.enabled
name: redis
repository: https://charts.bitnami.com/bitnami
version: 16.13.2
- condition: grafana.enabled
name: grafana
repository: https://grafana.github.io/helm-charts
version: 6.57.1
- condition: ingress-nginx.enabled
name: ingress-nginx
repository: https://kubernetes.github.io/ingress-nginx
version: 4.1.4
- condition: prometheus.enabled
name: prometheus
repository: https://prometheus-community.github.io/helm-charts
version: 25.8.2
description: Developer-friendly incident response with brilliant Slack integration
name: oncall
type: application
version: 1.3.94

View File

@@ -1,431 +0,0 @@
# Grafana OnCall Helm Chart
This Grafana OnCall Chart is the best way to operate Grafana OnCall on Kubernetes.
It will deploy Grafana OnCall engine and celery workers, along with RabbitMQ cluster, Redis Cluster, and MySQL 5.7 database.
It will also deploy cert manager and nginx ingress controller, as Grafana OnCall backend might need to be externally available
to receive alerts from other monitoring systems. Grafana OnCall engine acts as a backend and can be connected to the
Grafana frontend plugin named Grafana OnCall.
Architecture diagram can be found [here](https://raw.githubusercontent.com/grafana/oncall/dev/docs/img/architecture_diagram.png)
## Production usage
**Default helm chart configuration is not intended for production.**
The helm chart includes all the services into a single release, which is not recommended for production usage.
It is recommended to run stateful services such as MySQL and RabbitMQ separately from this release or use managed
PaaS solutions. It will significantly reduce the overhead of managing them.
Here are the instructions on how to set up your own [ingress](#set-up-external-access), [MySQL](#connect-external-mysql),
[RabbitMQ](#connect-external-rabbitmq), [Redis](#connect-external-redis)
### Cluster requirements
- ensure you can run x86-64/amd64 workloads. arm64 architecture is currently not supported
- kubernetes version 1.25+ is not supported, if cert-manager is enabled
## Install
### Prepare the repo
```bash
# Add the repository
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
```
### Installing the helm chart
```bash
# Install the chart
helm install \
--wait \
--set base_url=example.com \
--set grafana."grafana\.ini".server.domain=example.com \
release-oncall \
grafana/oncall
```
Follow the `helm install` output to finish setting up Grafana OnCall backend and Grafana OnCall frontend plugin e.g.
```bash
👋 Your Grafana OnCall instance has been successfully deployed
❗ Set up a DNS record for your domain (use A Record and "@" to point a root domain to the IP address)
Get the external IP address by running the following commands and point example.com to it:
kubectl get ingress release-oncall -o jsonpath="{.status.loadBalancer.ingress[0].ip}"
Wait until the dns record got propagated.
NOTE: Check with the following command: nslookup example.com
Try reaching https://example.com/ready/ from the browser, make sure it is not cached locally
🦎 Grafana was installed as a part of this helm release. Open https://example.com/grafana/plugins/grafana-oncall-app
The User is admin
Get password by running this command:
kubectl get secret --namespace default release-oncall-grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
🔗 Connect Grafana OnCall Plugin to Grafana OnCall backend:
Fill the Grafana OnCall Backend URL:
http://release-oncall-engine:8080
🎉🎉🎉 Done! 🎉🎉🎉
```
## Configuration
You can edit values.yml to make changes to the helm chart configuration and re-deploy the release with the following command:
```bash
helm upgrade \
--install \
--wait \
--set base_url=example.com \
--set grafana."grafana\.ini".server.domain=example.com \
release-oncall \
grafana/oncall
```
### Passwords and external secrets
As OnCall subcharts are Bitnami charts, there is a common approach to secrets. Bundled charts allow specifying passwords
in values.yaml explicitly or as K8s secret value. OnCall chart refers either to secret created in sub-chart or
to specified external secret.
Similarly, if component chart is disabled, the password(s) can be supplied in `external<Component>` value
(e.g. externalMysql) explicitly or as K8s secret value. In the first case, the secret is created with the specified
value. In the second case the external secret is used.
- If `<subchart>.auth.existingSecret` is non-empty, then this secret is used. Secret keys are pre-defined by chart.
- If subchart supports password files and `<subchart>.customPasswordFiles` dictionary is non-empty, then password files
are used. Dictionary keys are pre-defined per sub-chart. Password files are not supported by OnCall chart and should
not be used with bundled sub-charts.
- Passwords are specified via `auth` section values, e.g. `auth.password`. K8s secret is created.
- If `<subchart>.auth.forcePassword` is `true`, then passwords MUST be specified. Otherwise, missing passwords
are generated.
If external component is used instead of the bundled one:
- If existingSecret within appropriate external component values is non-empty (e.g. `externalMysql.existingSecret`) then
it is used together with corresponding key names, e.g. `externalMysql.passwordKey`.
- Otherwise, corresponding password values are used, e.g. `externalMysql.password`. K8s secret is created by OnCall chart.
Below is the summary for the dependent charts.
MySQL/MariaDB:
```yaml
database:
type: "mysql" # This is default
mariaDB:
enabled: true # Default
auth:
existingSecret: ""
forcePassword: false
# Secret name: `<release>-mariadb`
rootPassword: "" # Secret key: mariadb-root-password
password: "" # Secret key: mariadb-password
replicationPassword: "" # Secret key: mariadb-replication-password
externalMysql:
password: ""
existingSecret: ""
passwordKey: ""
```
Postgres:
```yaml
database:
type: postgresql
mariadb:
enabled: false # Must be set to false for Postgres
postgresql:
enabled: true # Must be set to true for bundled Postgres
auth:
existingSecret: ""
secretKeys:
adminPasswordKey: ""
userPasswordKey: "" # Not needed
replicationPasswordKey: "" # Not needed with disabled replication
# Secret name: `<release>-postgresql`
postgresPassword: "" # password for admin user postgres. As non-admin user is not created, only this one is relevant.
password: "" # Not needed
replicationPassword: "" # Not needed with disabled replication
externalPostgresql:
user: ""
password: ""
existingSecret: ""
passwordKey: ""
```
Rabbitmq:
```yaml
rabbitmq:
enabled: true
auth:
existingPasswordSecret: "" # Must contain `rabbitmq-password` key
existingErlangSecret: "" # Must contain `rabbitmq-erlang-cookie` key
# Secret name: `<release>-rabbitmq`
password: ""
erlangCookie: ""
externalRabbitmq:
user: ""
password: ""
existingSecret: ""
passwordKey: ""
usernameKey: ""
```
Redis:
```yaml
redis:
enabled: true
auth:
existingSecret: ""
existingSecretPasswordKey: ""
# Secret name: `<release>-redis`
password: ""
externalRedis:
password: ""
existingSecret: ""
passwordKey: ""
```
### Running split ingestion and API services
You can run a detached service for handling integrations by setting up the following variables:
```yaml
detached_integrations:
enabled: true
detached_integrations_service:
enabled: true
```
This will run an integrations-only service listening by default in port 30003.
### Set up Slack and Telegram
You can set up Slack connection via following variables:
```yaml
oncall:
slack:
enabled: true
commandName: oncall
clientId: ~
clientSecret: ~
signingSecret: ~
existingSecret: ""
clientIdKey: ""
clientSecretKey: ""
signingSecretKey: ""
redirectHost: ~
```
`oncall.slack.commandName` is used for changing default bot slash command,
`oncall`. In slack, it could be called via `/<oncall.slack.commandName>`.
To set up Telegram token and webhook url use:
```yaml
oncall:
telegram:
enabled: true
token: ~
webhookUrl: ~
```
To use Telegram long polling instead of webhook use:
```yaml
telegramPolling:
enabled: true
```
### Set up external access
Grafana OnCall can be connected to the external monitoring systems or grafana deployed to the other cluster.
Nginx Ingress Controller and Cert Manager charts are included in the helm chart with the default configuration.
If you set the DNS A Record pointing to the external IP address of the installation with the Hostname matching
base_url parameter, https will be automatically set up. If grafana is enabled in the chart values, it will also be
available on `https://<base_url>/grafana/`. See the details in `helm install` output.
To use a different ingress controller or tls certificate management system, set the following values to
false and edit ingress settings
```yaml
ingress-nginx:
enabled: false
cert-manager:
enabled: false
ingress:
enabled: true
annotations:
kubernetes.io/ingress.class: "nginx"
cert-manager.io/issuer: "letsencrypt-prod"
```
### Use PostgreSQL instead of MySQL
It is possible to use PostgreSQL instead of MySQL. To do so, set mariadb.enabled to `false`,
postgresql.enabled to `true` and database.type to `postgresql`.
```yaml
mariadb:
enabled: false
postgresql:
enabled: true
database:
type: postgresql
```
### Connect external MySQL
It is recommended to use the managed MySQL 5.7 database provided by your cloud provider
Make sure to create the database with the following parameters before installing this chart
```sql
CREATE DATABASE oncall CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
```
To use an external MySQL instance set mariadb.enabled to `false` and configure the `externalMysql` parameters.
```yaml
mariadb:
enabled: false
# Make sure to create the database with the following parameters:
# CREATE DATABASE oncall CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
externalMysql:
host:
port:
db_name:
user:
password:
existingSecret: ""
usernameKey: username
passwordKey: password
```
### Connect external PostgreSQL
To use an external PostgreSQL instance set mariadb.enabled to `false`,
postgresql.enabled to `false`, database.type to `postgresql` and configure
the `externalPostgresql` parameters.
```yaml
mariadb:
enabled: false
postgresql:
enabled: false
database:
type: postgresql
# Make sure to create the database with the following parameters:
# CREATE DATABASE oncall WITH ENCODING UTF8;
externalPostgresql:
host:
port:
db_name:
user:
password:
existingSecret: ""
passwordKey: password
```
### Connect external RabbitMQ
Option 1. Install RabbitMQ separately into the cluster using the [official documentation](https://www.rabbitmq.com/kubernetes/operator/operator-overview.html)
Option 2. Use a managed solution such as [CloudAMQP](https://www.cloudamqp.com/)
To use an external RabbitMQ instance set rabbitmq.enabled to `false` and configure the `externalRabbitmq` parameters.
```yaml
rabbitmq:
enabled: false # Disable the RabbitMQ dependency from the release
externalRabbitmq:
host:
port:
user:
password:
protocol:
vhost:
existingSecret: ""
passwordKey: password
usernameKey: username
```
### Connect external Redis
To use an external Redis instance set redis.enabled to `false` and configure the `externalRedis` parameters.
```yaml
redis:
enabled: false # Disable the Redis dependency from the release
externalRedis:
host:
password:
existingSecret: ""
passwordKey: password
```
## Update
```bash
# Add & upgrade the repository
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
# Re-deploy
helm upgrade \
--install \
--wait \
--set base_url=example.com \
--set grafana."grafana\.ini".server.domain=example.com \
release-oncall \
grafana/oncall
```
After re-deploying, please also update the Grafana OnCall plugin on the plugin version page.
See [Grafana docs](https://grafana.com/docs/grafana/latest/administration/plugin-management/#update-a-plugin) for
more info on updating Grafana plugins.
## Uninstall
### Uninstalling the helm chart
```bash
helm delete release-oncall
```
### Clean up PVC's
```bash
kubectl delete pvc data-release-oncall-mariadb-0 data-release-oncall-rabbitmq-0 \
redis-data-release-oncall-redis-master-0 redis-data-release-oncall-redis-replicas-0 \
redis-data-release-oncall-redis-replicas-1 redis-data-release-oncall-redis-replicas-2
```
### Clean up secrets
```bash
kubectl delete secrets certificate-tls release-oncall-cert-manager-webhook-ca release-oncall-ingress-nginx-admission
```
## Troubleshooting
### Issues during initial configuration
In the event that you run into issues during initial configuration, it is possible that mismatching versions between
your OnCall backend and UI is the culprit. Ensure that the versions match, and if not,
consider updating your `helm` deployment.

View File

@@ -1,41 +0,0 @@
=================================================================
📞 Grafana OnCall Notes
=================================================================
👋 Your Grafana OnCall instance has been successfully deployed
{{- if not .Values.migrate.enabled }}
🤖 To migrate the database run these commands:
export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "oncall.name" . }},app.kubernetes.io/instance={{ .Release.Name }},app.kubernetes.io/component=engine" -o jsonpath="{.items[0].metadata.name}")
kubectl exec -it $POD_NAME -c wait-for-db -- bash -c "python manage.py migrate;"
{{- end }}
❗ Set up a DNS record for your domain (use A Record and "@" to point a root domain to the IP address)
Get the external IP address by running the following commands and point {{ .Values.base_url }} to it:
kubectl get ingress {{ include "oncall.fullname" . }} -o jsonpath="{.status.loadBalancer.ingress[0].ip}"
Wait until the dns record got propagated.
NOTE: Check with the following command: nslookup {{ .Values.base_url }}
Try reaching https://{{ .Values.base_url }}/ready/ from the browser, make sure it is not cached locally
{{- if .Values.grafana.enabled }}
🦎 Grafana was installed as a part of this helm release. Open https://{{ .Values.base_url }}/grafana/plugins/grafana-oncall-app
The User is {{ .Values.grafana.adminUser }}
Get password by running this command:
kubectl get secret --namespace {{ .Release.Namespace }} {{ template "oncall.grafana.fullname" . }} -o jsonpath="{.data.admin-password}" | base64 --decode ; echo
{{- else }}
🦎 Grafana was NOT installed as a part of this helm release. Open external Grafana, go to "Configuration" - "Plugins" and find Grafana OnCall plugin
NOTE: Make sure your external Grafana is available by the network for the containers installed by this release.
{{- end }}
🔗 Connect Grafana OnCall Plugin to Grafana OnCall backend:
Fill the Grafana OnCall Backend URL:
http://{{ include "oncall.engine.fullname" . }}:8080
🎉🎉🎉 Done! 🎉🎉🎉

View File

@@ -1,656 +0,0 @@
{{- define "snippet.oncall.env" -}}
- name: BASE_URL
value: {{ .Values.base_url_protocol }}://{{ .Values.base_url }}
- name: SECRET_KEY
valueFrom:
secretKeyRef:
name: {{ include "snippet.oncall.secret.name" . }}
key: {{ include "snippet.oncall.secret.secretKey" . | quote }}
- name: MIRAGE_SECRET_KEY
valueFrom:
secretKeyRef:
name: {{ include "snippet.oncall.secret.name" . }}
key: {{ include "snippet.oncall.secret.mirageSecretKey" . | quote }}
- name: MIRAGE_CIPHER_IV
value: {{ .Values.oncall.mirageCipherIV | default "1234567890abcdef" | quote }}
- name: DJANGO_SETTINGS_MODULE
value: "settings.helm"
- name: AMIXR_DJANGO_ADMIN_PATH
value: "admin"
- name: OSS
value: "True"
- name: DETACHED_INTEGRATIONS_SERVER
value: {{ .Values.detached_integrations.enabled | toString | title | quote }}
{{- include "snippet.oncall.uwsgi" . }}
- name: BROKER_TYPE
value: {{ .Values.broker.type | default "rabbitmq" }}
- name: GRAFANA_API_URL
value: {{ include "snippet.grafana.url" . | quote }}
{{- end }}
{{- define "snippet.oncall.secret.name" -}}
{{ if .Values.oncall.secrets.existingSecret -}}
{{ .Values.oncall.secrets.existingSecret }}
{{- else -}}
{{ include "oncall.fullname" . }}
{{- end }}
{{- end }}
{{- define "snippet.oncall.secret.secretKey" -}}
{{ if .Values.oncall.secrets.existingSecret -}}
{{ required "oncall.secrets.secretKey is required if oncall.secret.existingSecret is not empty" .Values.oncall.secrets.secretKey }}
{{- else -}}
SECRET_KEY
{{- end }}
{{- end }}
{{- define "snippet.oncall.secret.mirageSecretKey" -}}
{{ if .Values.oncall.secrets.existingSecret -}}
{{ required "oncall.secrets.mirageSecretKey is required if oncall.secret.existingSecret is not empty" .Values.oncall.secrets.mirageSecretKey }}
{{- else -}}
MIRAGE_SECRET_KEY
{{- end }}
{{- end }}
{{- define "snippet.oncall.uwsgi" -}}
{{- if .Values.uwsgi }}
{{- range $key, $value := .Values.uwsgi }}
- name: UWSGI_{{ $key | upper | replace "-" "_" }}
value: {{ $value | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.oncall.slack.env" -}}
- name: FEATURE_SLACK_INTEGRATION_ENABLED
value: {{ .Values.oncall.slack.enabled | toString | title | quote }}
{{- if .Values.oncall.slack.enabled }}
- name: SLACK_SLASH_COMMAND_NAME
value: "/{{ .Values.oncall.slack.commandName | default "oncall" }}"
{{- if .Values.oncall.slack.existingSecret }}
- name: SLACK_CLIENT_OAUTH_ID
valueFrom:
secretKeyRef:
name: {{ .Values.oncall.slack.existingSecret }}
key: {{ required "oncall.slack.clientIdKey is required if oncall.slack.existingSecret is not empty" .Values.oncall.slack.clientIdKey | quote }}
- name: SLACK_CLIENT_OAUTH_SECRET
valueFrom:
secretKeyRef:
name: {{ .Values.oncall.slack.existingSecret }}
key: {{ required "oncall.slack.clientSecretKey is required if oncall.slack.existingSecret is not empty" .Values.oncall.slack.clientSecretKey | quote }}
- name: SLACK_SIGNING_SECRET
valueFrom:
secretKeyRef:
name: {{ .Values.oncall.slack.existingSecret }}
key: {{ required "oncall.slack.signingSecretKey is required if oncall.slack.existingSecret is not empty" .Values.oncall.slack.signingSecretKey | quote }}
{{- else }}
- name: SLACK_CLIENT_OAUTH_ID
value: {{ .Values.oncall.slack.clientId | default "" | quote }}
- name: SLACK_CLIENT_OAUTH_SECRET
value: {{ .Values.oncall.slack.clientSecret | default "" | quote }}
- name: SLACK_SIGNING_SECRET
value: {{ .Values.oncall.slack.signingSecret | default "" | quote }}
{{- end }}
- name: SLACK_INSTALL_RETURN_REDIRECT_HOST
value: {{ .Values.oncall.slack.redirectHost | default (printf "https://%s" .Values.base_url) | quote }}
{{- end }}
{{- end }}
{{- define "snippet.oncall.telegram.env" -}}
{{- if .Values.telegramPolling.enabled -}}
{{- $_ := set .Values.oncall.telegram "enabled" true -}}
{{- end -}}
- name: FEATURE_TELEGRAM_INTEGRATION_ENABLED
value: {{ .Values.oncall.telegram.enabled | toString | title | quote }}
{{- if .Values.oncall.telegram.enabled }}
{{- if .Values.telegramPolling.enabled }}
- name: FEATURE_TELEGRAM_LONG_POLLING_ENABLED
value: {{ .Values.telegramPolling.enabled | toString | title | quote }}
{{- end }}
- name: TELEGRAM_WEBHOOK_HOST
value: {{ .Values.oncall.telegram.webhookUrl | default (printf "https://%s" .Values.base_url) | quote }}
{{- if .Values.oncall.telegram.existingSecret }}
- name: TELEGRAM_TOKEN
valueFrom:
secretKeyRef:
name: {{ .Values.oncall.telegram.existingSecret }}
key: {{ required "oncall.telegram.tokenKey is required if oncall.telegram.existingSecret is not empty" .Values.oncall.telegram.tokenKey | quote }}
{{- else }}
- name: TELEGRAM_TOKEN
value: {{ .Values.oncall.telegram.token | default "" | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.oncall.twilio.env" }}
{{- with .Values.oncall.twilio }}
{{- if .existingSecret }}
- name: TWILIO_ACCOUNT_SID
valueFrom:
secretKeyRef:
name: {{ .existingSecret }}
key: {{ required "oncall.twilio.accountSid is required if oncall.twilio.existingSecret is not empty" .accountSid | quote }}
{{- if .authTokenKey }}
- name: TWILIO_AUTH_TOKEN
valueFrom:
secretKeyRef:
name: {{ .existingSecret }}
key: {{ required "oncall.twilio.authTokenKey is required if oncall.twilio.existingSecret is not empty" .authTokenKey | quote }}
{{- end }}
- name: TWILIO_NUMBER
valueFrom:
secretKeyRef:
name: {{ .existingSecret }}
key: {{ required "oncall.twilio.phoneNumberKey is required if oncall.twilio.existingSecret is not empty" .phoneNumberKey | quote }}
- name: TWILIO_VERIFY_SERVICE_SID
valueFrom:
secretKeyRef:
name: {{ .existingSecret }}
key: {{ required "oncall.twilio.verifySidKey is required if oncall.twilio.existingSecret is not empty" .verifySidKey | quote }}
{{- if and .apiKeySidKey .apiKeySecretKey }}
- name: TWILIO_API_KEY_SID
valueFrom:
secretKeyRef:
name: {{ .existingSecret }}
key: {{ required "oncall.twilio.apiKeySidKey is required if oncall.twilio.existingSecret is not empty" .apiKeySidKey | quote }}
- name: TWILIO_API_KEY_SECRET
valueFrom:
secretKeyRef:
name: {{ .existingSecret }}
key: {{ required "oncall.twilio.apiKeySecretKey is required if oncall.twilio.existingSecret is not empty" .apiKeySecretKey | quote }}
{{- end }}
{{- else }}
{{- if .accountSid }}
- name: TWILIO_ACCOUNT_SID
value: {{ .accountSid | quote }}
{{- end }}
{{- if .authToken }}
- name: TWILIO_AUTH_TOKEN
value: {{ .authToken | quote }}
{{- end }}
{{- if .phoneNumber }}
- name: TWILIO_NUMBER
value: {{ .phoneNumber | quote }}
{{- end }}
{{- if .verifySid }}
- name: TWILIO_VERIFY_SERVICE_SID
value: {{ .verifySid | quote }}
{{- end }}
{{- if .apiKeySid }}
- name: TWILIO_API_KEY_SID
value: {{ .apiKeySid | quote }}
{{- end }}
{{- if .apiKeySecret }}
- name: TWILIO_API_KEY_SECRET
value: {{ .apiKeySecret | quote }}
{{- end }}
{{- end }}
{{- if .limitPhone }}
- name: PHONE_NOTIFICATIONS_LIMIT
value: {{ .limitPhone | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.celery.env" }}
{{- if .Values.celery.worker_queue }}
- name: CELERY_WORKER_QUEUE
value: {{ .Values.celery.worker_queue | quote }}
{{- end }}
{{- if .Values.celery.worker_concurrency }}
- name: CELERY_WORKER_CONCURRENCY
value: {{ .Values.celery.worker_concurrency | quote }}
{{- end }}
{{- if .Values.celery.worker_max_tasks_per_child }}
- name: CELERY_WORKER_MAX_TASKS_PER_CHILD
value: {{ .Values.celery.worker_max_tasks_per_child | quote }}
{{- end }}
{{- if .Values.celery.worker_beat_enabled }}
- name: CELERY_WORKER_BEAT_ENABLED
value: {{ .Values.celery.worker_beat_enabled | quote }}
{{- end }}
{{- if .Values.celery.worker_shutdown_interval }}
- name: CELERY_WORKER_SHUTDOWN_INTERVAL
value: {{ .Values.celery.worker_shutdown_interval | quote }}
{{- end }}
{{- end }}
{{- define "snippet.grafana.url" -}}
{{ if .Values.grafana.enabled -}}
http://{{ include "oncall.grafana.fullname" . }}
{{- else -}}
{{ required "externalGrafana.url is required when not grafana.enabled" .Values.externalGrafana.url }}
{{- end }}
{{- end }}
{{- define "snippet.mysql.env" -}}
- name: MYSQL_HOST
value: {{ include "snippet.mysql.host" . | quote }}
- name: MYSQL_PORT
value: {{ include "snippet.mysql.port" . | quote }}
- name: MYSQL_DB_NAME
value: {{ include "snippet.mysql.db" . | quote }}
- name: MYSQL_USER
{{- if and (not .Values.mariadb.enabled) .Values.externalMysql.existingSecret .Values.externalMysql.usernameKey (not .Values.externalMysql.user) }}
valueFrom:
secretKeyRef:
name: {{ include "snippet.mysql.password.secret.name" . }}
key: {{ .Values.externalMysql.usernameKey | quote }}
{{- else }}
value: {{ include "snippet.mysql.user" . | quote }}
{{- end }}
- name: MYSQL_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "snippet.mysql.password.secret.name" . }}
key: {{ include "snippet.mysql.password.secret.key" . | quote }}
{{- if not .Values.mariadb.enabled }}
{{- with .Values.externalMysql.options }}
- name: MYSQL_OPTIONS
value: {{ . | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.mysql.password.secret.name" -}}
{{ if .Values.mariadb.enabled -}}
{{ if .Values.mariadb.auth.existingSecret -}}
{{ .Values.mariadb.auth.existingSecret }}
{{- else -}}
{{ include "oncall.mariadb.fullname" . }}
{{- end }}
{{- else -}}
{{ if .Values.externalMysql.existingSecret -}}
{{ .Values.externalMysql.existingSecret }}
{{- else -}}
{{ include "oncall.fullname" . }}-mysql-external
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.mysql.password.secret.key" -}}
{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.existingSecret .Values.externalMysql.passwordKey -}}
{{ .Values.externalMysql.passwordKey }}
{{- else -}}
mariadb-root-password
{{- end }}
{{- end }}
{{- define "snippet.mysql.host" -}}
{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.host -}}
{{ .Values.externalMysql.host }}
{{- else -}}
{{ include "oncall.mariadb.fullname" . }}
{{- end }}
{{- end }}
{{- define "snippet.mysql.port" -}}
{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.port -}}
{{ .Values.externalMysql.port }}
{{- else -}}
3306
{{- end }}
{{- end }}
{{- define "snippet.mysql.db" -}}
{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.db_name -}}
{{ .Values.externalMysql.db_name }}
{{- else -}}
{{ .Values.mariadb.auth.database | default "oncall" }}
{{- end }}
{{- end }}
{{- define "snippet.mysql.user" -}}
{{ if and (not .Values.mariadb.enabled) .Values.externalMysql.user -}}
{{ .Values.externalMysql.user }}
{{- else -}}
{{ .Values.mariadb.auth.username | default "root" }}
{{- end }}
{{- end }}
{{- define "snippet.postgresql.env" -}}
- name: DATABASE_TYPE
value: {{ .Values.database.type | quote }}
- name: DATABASE_HOST
value: {{ include "snippet.postgresql.host" . | quote }}
- name: DATABASE_PORT
value: {{ include "snippet.postgresql.port" . | quote }}
- name: DATABASE_NAME
value: {{ include "snippet.postgresql.db" . | quote }}
- name: DATABASE_USER
value: {{ include "snippet.postgresql.user" . | quote }}
- name: DATABASE_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "snippet.postgresql.password.secret.name" . }}
key: {{ include "snippet.postgresql.password.secret.key" . | quote }}
{{- if not .Values.postgresql.enabled }}
{{- with .Values.externalPostgresql.options }}
- name: DATABASE_OPTIONS
value: {{ . | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.sqlite.env" -}}
- name: DATABASE_TYPE
value: sqlite3
- name: DATABASE_NAME
value: /etc/app/oncall.db
{{- end }}
{{- define "snippet.postgresql.password.secret.name" -}}
{{ if .Values.postgresql.enabled -}}
{{ if .Values.postgresql.auth.existingSecret -}}
{{ .Values.postgresql.auth.existingSecret }}
{{- else -}}
{{ include "oncall.postgresql.fullname" . }}
{{- end }}
{{- else -}}
{{ if .Values.externalPostgresql.existingSecret -}}
{{ .Values.externalPostgresql.existingSecret }}
{{- else -}}
{{ include "oncall.fullname" . }}-postgresql-external
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.postgresql.password.secret.key" -}}
{{ if .Values.postgresql.enabled -}}
{{ if .Values.postgresql.auth.existingSecret -}}
{{ required "postgresql.auth.secretKeys.adminPasswordKey is required if database.type=postgres and postgresql.enabled and postgresql.auth.existingSecret" .Values.postgresql.auth.secretKeys.adminPasswordKey }}
{{- else -}}
{{ include "postgresql.userPasswordKey" .Subcharts.postgresql }}
{{- end }}
{{- else -}}
{{ if .Values.externalPostgresql.existingSecret -}}
{{ required "externalPostgresql.passwordKey is required if database.type=postgres and not postgresql.enabled and postgresql.auth.existingSecret" .Values.externalPostgresql.passwordKey }}
{{- else -}}
postgres-password
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.postgresql.host" -}}
{{ if not .Values.postgresql.enabled -}}
{{ required "externalPostgresql.host is required if database.type=postgres and not postgresql.enabled" .Values.externalPostgresql.host }}
{{- else -}}
{{ include "oncall.postgresql.fullname" . }}
{{- end }}
{{- end }}
{{- define "snippet.postgresql.port" -}}
{{ if and (not .Values.postgresql.enabled) .Values.externalPostgresql.port -}}
{{ .Values.externalPostgresql.port }}
{{- else -}}
5432
{{- end }}
{{- end }}
{{- define "snippet.postgresql.db" -}}
{{ if not .Values.postgresql.enabled -}}
{{ .Values.externalPostgresql.db_name | default "oncall" }}
{{- else -}}
{{ .Values.postgresql.auth.database | default "oncall" }}
{{- end }}
{{- end }}
{{- define "snippet.postgresql.user" -}}
{{ if not .Values.postgresql.enabled -}}
{{ .Values.externalPostgresql.user | default "postgres" }}
{{- else -}}
{{ .Values.postgresql.auth.username | default "postgres" }}
{{- end }}
{{- end }}
{{- define "snippet.rabbitmq.env" }}
- name: RABBITMQ_USERNAME
{{- if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.existingSecret .Values.externalRabbitmq.usernameKey (not .Values.externalRabbitmq.user) }}
valueFrom:
secretKeyRef:
name: {{ include "snippet.rabbitmq.password.secret.name" . }}
key: {{ .Values.externalRabbitmq.usernameKey | quote }}
{{- else }}
value: {{ include "snippet.rabbitmq.user" . | quote }}
{{- end }}
- name: RABBITMQ_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "snippet.rabbitmq.password.secret.name" . }}
key: {{ include "snippet.rabbitmq.password.secret.key" . | quote }}
- name: RABBITMQ_HOST
value: {{ include "snippet.rabbitmq.host" . | quote }}
- name: RABBITMQ_PORT
value: {{ include "snippet.rabbitmq.port" . | quote }}
- name: RABBITMQ_PROTOCOL
value: {{ include "snippet.rabbitmq.protocol" . | quote }}
- name: RABBITMQ_VHOST
value: {{ include "snippet.rabbitmq.vhost" . | quote }}
{{- end }}
{{- define "snippet.rabbitmq.user" -}}
{{ if not .Values.rabbitmq.enabled -}}
{{ required "externalRabbitmq.user is required if not rabbitmq.enabled" .Values.externalRabbitmq.user }}
{{- else -}}
user
{{- end }}
{{- end }}
{{- define "snippet.rabbitmq.host" -}}
{{ if not .Values.rabbitmq.enabled -}}
{{ required "externalRabbitmq.host is required if not rabbitmq.enabled" .Values.externalRabbitmq.host }}
{{- else -}}
{{ include "oncall.rabbitmq.fullname" . }}
{{- end }}
{{- end }}
{{- define "snippet.rabbitmq.port" -}}
{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.port -}}
{{ required "externalRabbitmq.port is required if not rabbitmq.enabled" .Values.externalRabbitmq.port }}
{{- else -}}
5672
{{- end }}
{{- end }}
{{- define "snippet.rabbitmq.protocol" -}}
{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.protocol -}}
{{ .Values.externalRabbitmq.protocol }}
{{- else -}}
amqp
{{- end }}
{{- end }}
{{- define "snippet.rabbitmq.vhost" -}}
{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.vhost -}}
{{ .Values.externalRabbitmq.vhost }}
{{- end }}
{{- end }}
{{- define "snippet.rabbitmq.password.secret.name" -}}
{{ if .Values.rabbitmq.enabled -}}
{{ if .Values.rabbitmq.auth.existingPasswordSecret -}}
{{ .Values.rabbitmq.auth.existingPasswordSecret }}
{{- else -}}
{{ include "oncall.rabbitmq.fullname" . }}
{{- end }}
{{- else -}}
{{ if .Values.externalRabbitmq.existingSecret -}}
{{ .Values.externalRabbitmq.existingSecret }}
{{- else -}}
{{ include "oncall.fullname" . }}-rabbitmq-external
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.rabbitmq.password.secret.key" -}}
{{ if and (not .Values.rabbitmq.enabled) .Values.externalRabbitmq.passwordKey -}}
{{ .Values.externalRabbitmq.passwordKey }}
{{- else -}}
rabbitmq-password
{{- end }}
{{- end }}
{{- define "snippet.redis.protocol" -}}
{{ default "redis" .Values.externalRedis.protocol | quote }}
{{- end }}
{{- define "snippet.redis.host" -}}
{{ if not .Values.redis.enabled -}}
{{ required "externalRedis.host is required if not redis.enabled" .Values.externalRedis.host | quote }}
{{- else -}}
{{ include "oncall.redis.fullname" . }}-master
{{- end }}
{{- end }}
{{- define "snippet.redis.port" -}}
{{ default 6379 .Values.externalRedis.port | quote }}
{{- end }}
{{- define "snippet.redis.database" -}}
{{ default 0 .Values.externalRedis.database | quote }}
{{- end }}
{{- define "snippet.redis.password.secret.name" -}}
{{ if .Values.redis.enabled -}}
{{ if .Values.redis.auth.existingSecret -}}
{{ .Values.redis.auth.existingSecret }}
{{- else -}}
{{ include "oncall.redis.fullname" . }}
{{- end }}
{{- else -}}
{{ if .Values.externalRedis.existingSecret -}}
{{ .Values.externalRedis.existingSecret }}
{{- else -}}
{{ include "oncall.fullname" . }}-redis-external
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.redis.password.secret.key" -}}
{{ if .Values.redis.enabled -}}
{{ if .Values.redis.auth.existingSecret -}}
{{ required "redis.auth.existingSecretPasswordKey is required if redis.auth.existingSecret is non-empty" .Values.redis.auth.existingSecretPasswordKey }}
{{- else -}}
redis-password
{{- end }}
{{- else -}}
{{ if .Values.externalRedis.existingSecret -}}
{{ required "externalRedis.passwordKey is required if externalRedis.existingSecret is non-empty" .Values.externalRedis.passwordKey }}
{{- else -}}
redis-password
{{- end }}
{{- end }}
{{- end }}
{{- define "snippet.redis.env" -}}
- name: REDIS_PROTOCOL
value: {{ include "snippet.redis.protocol" . }}
- name: REDIS_HOST
value: {{ include "snippet.redis.host" . }}
- name: REDIS_PORT
value: {{ include "snippet.redis.port" . }}
- name: REDIS_DATABASE
value: {{ include "snippet.redis.database" . }}
- name: REDIS_USERNAME
value: {{ default "" .Values.externalRedis.username | quote }}
- name: REDIS_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "snippet.redis.password.secret.name" . }}
key: {{ include "snippet.redis.password.secret.key" . | quote}}
{{- if and (not .Values.redis.enabled) .Values.externalRedis.ssl_options.enabled }}
- name: REDIS_USE_SSL
value: "true"
{{- with .Values.externalRedis.ssl_options.ca_certs }}
- name: REDIS_SSL_CA_CERTS
value: {{ . | quote }}
{{- end }}
{{- with .Values.externalRedis.ssl_options.certfile }}
- name: REDIS_SSL_CERTFILE
value: {{ . | quote }}
{{- end }}
{{- with .Values.externalRedis.ssl_options.keyfile }}
- name: REDIS_SSL_KEYFILE
value: {{ . | quote }}
{{- end }}
{{- with .Values.externalRedis.ssl_options.cert_reqs }}
- name: REDIS_SSL_CERT_REQS
value: {{ . | quote }}
{{- end }}
{{- end }}
{{- end }}
{{- /*
when broker.type != rabbitmq, we do not need to include rabbitmq environment variables
*/}}
{{- define "snippet.broker.env" -}}
{{- include "snippet.redis.env" . }}
{{- if eq .Values.broker.type "rabbitmq" -}}
{{- include "snippet.rabbitmq.env" . }}
{{- end }}
{{- end }}
{{- define "snippet.db.env" -}}
{{- if eq .Values.database.type "mysql" }}
{{- include "snippet.mysql.env" . }}
{{- else if eq .Values.database.type "postgresql" }}
{{- include "snippet.postgresql.env" . }}
{{- else if eq .Values.database.type "sqlite" -}}
{{- include "snippet.sqlite.env" . }}
{{- else -}}
{{- fail "value for .Values.db.type must be either 'mysql', 'postgresql', or 'sqlite'" }}
{{- end }}
{{- end }}
{{- define "snippet.oncall.smtp.env" -}}
- name: FEATURE_EMAIL_INTEGRATION_ENABLED
value: {{ .Values.oncall.smtp.enabled | toString | title | quote }}
{{- if .Values.oncall.smtp.enabled }}
- name: EMAIL_HOST
value: {{ .Values.oncall.smtp.host | quote }}
- name: EMAIL_PORT
value: {{ .Values.oncall.smtp.port | default "587" | quote }}
- name: EMAIL_HOST_USER
value: {{ .Values.oncall.smtp.username | quote }}
- name: EMAIL_HOST_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "oncall.fullname" . }}-smtp
key: smtp-password
optional: true
- name: EMAIL_USE_TLS
value: {{ .Values.oncall.smtp.tls | default true | toString | title | quote }}
- name: EMAIL_FROM_ADDRESS
value: {{ .Values.oncall.smtp.fromEmail | quote }}
- name: EMAIL_NOTIFICATIONS_LIMIT
value: {{ .Values.oncall.smtp.limitEmail | default "200" | quote }}
{{- end }}
{{- end }}
{{- define "snippet.oncall.exporter.env" -}}
{{ if .Values.oncall.exporter.enabled -}}
- name: FEATURE_PROMETHEUS_EXPORTER_ENABLED
value: {{ .Values.oncall.exporter.enabled | toString | title | quote }}
- name: PROMETHEUS_EXPORTER_SECRET
valueFrom:
secretKeyRef:
name: {{ include "oncall.fullname" . }}-exporter
key: exporter-secret
optional: true
{{- else -}}
- name: FEATURE_PROMETHEUS_EXPORTER_ENABLED
value: {{ .Values.oncall.exporter.enabled | toString | title | quote }}
{{- end }}
{{- end }}
{{- define "snippet.oncall.engine.env" -}}
{{ include "snippet.oncall.env" . }}
{{ include "snippet.oncall.slack.env" . }}
{{ include "snippet.oncall.telegram.env" . }}
{{ include "snippet.oncall.smtp.env" . }}
{{ include "snippet.oncall.twilio.env" . }}
{{ include "snippet.oncall.exporter.env" . }}
{{ include "snippet.db.env" . }}
{{ include "snippet.broker.env" . }}
{{ include "oncall.extraEnvs" . }}
{{- end }}

View File

@@ -1,121 +0,0 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "oncall.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "oncall.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "oncall.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "oncall.labels" -}}
helm.sh/chart: {{ include "oncall.chart" . }}
{{ include "oncall.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "oncall.selectorLabels" -}}
app.kubernetes.io/name: {{ include "oncall.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "oncall.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "oncall.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
{{/* Generate the fullname of mariadb subchart */}}
{{- define "oncall.mariadb.fullname" -}}
{{- printf "%s-%s" .Release.Name "mariadb" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/* Generate the fullname of postgresql subchart */}}
{{- define "oncall.postgresql.fullname" -}}
{{- printf "%s-%s" .Release.Name "postgresql" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- define "oncall.grafana.fullname" -}}
{{- printf "%s-%s" .Release.Name "grafana" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/* Generate the fullname of rabbitmq subchart */}}
{{- define "oncall.rabbitmq.fullname" -}}
{{- printf "%s-%s" .Release.Name "rabbitmq" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/* Generate the fullname of redis subchart */}}
{{- define "oncall.redis.fullname" -}}
{{- printf "%s-%s" .Release.Name "redis" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/* Generate engine image name */}}
{{- define "oncall.engine.image" -}}
{{- printf "%s:%s" .Values.image.repository (.Values.image.tag | default .Chart.AppVersion) }}
{{- end }}
{{- define "oncall.initContainer" }}
- name: wait-for-db
image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: {{ .Values.image.pullPolicy }}
command: ['sh', '-c', "until (python manage.py migrate --check); do echo Waiting for database migrations; sleep 2; done"]
securityContext:
{{ toYaml .Values.init.securityContext | nindent 4 }}
resources:
{{ toYaml .Values.init.resources | nindent 4 }}
env:
{{- include "snippet.oncall.env" . | nindent 4 }}
{{- include "snippet.db.env" . | nindent 4 }}
{{- include "snippet.broker.env" . | nindent 4 }}
{{- include "oncall.extraEnvs" . | nindent 4 }}
{{- end }}
{{- define "oncall.extraEnvs" -}}
{{- if .Values.env }}
{{- if (kindIs "map" .Values.env) }}
{{- range $key, $value := .Values.env }}
- name: {{ $key }}
value: {{ $value }}
{{- end -}}
{{/* support previous schema */}}
{{- else }}
{{- toYaml .Values.env }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -1,26 +0,0 @@
{{/*
Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "oncall.celery.name" -}}
{{ include "oncall.name" . | trunc 55 }}-celery
{{- end }}
{{- define "oncall.celery.fullname" -}}
{{ include "oncall.fullname" . | trunc 55 }}-celery
{{- end }}
{{/*
Engine common labels
*/}}
{{- define "oncall.celery.labels" -}}
{{ include "oncall.labels" . }}
app.kubernetes.io/component: celery
{{- end }}
{{/*
Engine selector labels
*/}}
{{- define "oncall.celery.selectorLabels" -}}
{{ include "oncall.selectorLabels" . }}
app.kubernetes.io/component: celery
{{- end }}

View File

@@ -1,89 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "oncall.celery.fullname" . }}
labels:
{{- include "oncall.celery.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.celery.replicaCount }}
selector:
matchLabels:
{{- include "oncall.celery.selectorLabels" . | nindent 6 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
random-annotation: {{ randAlphaNum 10 | lower }}
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "oncall.celery.selectorLabels" . | nindent 8 }}
{{- if .Values.celery.podLabels }}
{{- toYaml .Values.celery.podLabels | nindent 8}}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "oncall.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
{{- include "oncall.initContainer" . | indent 8 }}
{{- with .Values.celery.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.celery.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.celery.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.celery.topologySpreadConstraints }}
topologySpreadConstraints:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.celery.priorityClassName }}
priorityClassName: {{ . }}
{{- end }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: {{ include "oncall.engine.image" . }}
{{- if .Values.oncall.devMode }}
command: ["python", "manage.py", "start_celery"]
{{- else }}
command: ["./celery_with_exporter.sh"]
{{- end }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
env:
{{- include "snippet.celery.env" . | nindent 12 }}
{{- include "snippet.oncall.engine.env" . | nindent 12 }}
{{- if .Values.celery.livenessProbe.enabled }}
livenessProbe:
exec:
command: [
"bash",
"-c",
"celery -A engine inspect ping -d celery@$HOSTNAME"
]
initialDelaySeconds: {{ .Values.celery.livenessProbe.initialDelaySeconds }}
periodSeconds: {{ .Values.celery.livenessProbe.periodSeconds }}
timeoutSeconds: {{ .Values.celery.livenessProbe.timeoutSeconds }}
{{- end }}
resources:
{{- toYaml .Values.celery.resources | nindent 12 }}
{{- with .Values.celery.extraVolumeMounts }}
volumeMounts: {{- . | toYaml | nindent 12 }}
{{- end }}
{{- with .Values.celery.extraContainers }}
{{- tpl . $ | nindent 8 }}
{{- end }}
{{- with .Values.celery.extraVolumes }}
volumes: {{- . | toYaml | nindent 8 }}
{{- end }}

View File

@@ -1,22 +0,0 @@
{{- if (index .Values "cert-manager").enabled }}
apiVersion: cert-manager.io/v1
kind: Issuer
metadata:
name: letsencrypt-prod
annotations:
"helm.sh/hook": post-install,post-upgrade
spec:
acme:
# The ACME server URL
server: https://acme-v02.api.letsencrypt.org/directory
# Email address used for ACME registration
email: no-reply@{{ .Values.base_url }}
# Name of a secret used to store the ACME account private key
privateKeySecretRef:
name: letsencrypt-prod
# Enable the HTTP-01 challenge provider
solvers:
- http01:
ingress:
class: nginx
{{- end }}

View File

@@ -1,26 +0,0 @@
{{/*
Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "oncall.engine.name" -}}
{{ include "oncall.name" . | trunc 55 }}-engine
{{- end }}
{{- define "oncall.engine.fullname" -}}
{{ include "oncall.fullname" . | trunc 55 }}-engine
{{- end }}
{{/*
Engine common labels
*/}}
{{- define "oncall.engine.labels" -}}
{{ include "oncall.labels" . }}
app.kubernetes.io/component: engine
{{- end }}
{{/*
Engine selector labels
*/}}
{{- define "oncall.engine.selectorLabels" -}}
{{ include "oncall.selectorLabels" . }}
app.kubernetes.io/component: engine
{{- end }}

View File

@@ -1,98 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "oncall.engine.fullname" . }}
labels:
{{- include "oncall.engine.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.engine.replicaCount }}
selector:
matchLabels:
{{- include "oncall.engine.selectorLabels" . | nindent 6 }}
strategy:
{{- toYaml .Values.engine.updateStrategy | nindent 4 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
random-annotation: {{ randAlphaNum 10 | lower }}
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "oncall.engine.selectorLabels" . | nindent 8 }}
{{- if .Values.engine.podLabels }}
{{- toYaml .Values.engine.podLabels | nindent 8}}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "oncall.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
{{- include "oncall.initContainer" . | indent 8 }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: {{ include "oncall.engine.image" . }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
{{- if .Values.oncall.devMode }}
command: ["sh", "-c", "uwsgi --disable-logging --py-autoreload 3 --ini uwsgi.ini"]
{{- end }}
ports:
- name: http
containerPort: 8080
protocol: TCP
env:
{{- include "snippet.oncall.engine.env" . | nindent 12 }}
livenessProbe:
httpGet:
path: /health/
port: http
periodSeconds: 60
timeoutSeconds: 3
readinessProbe:
httpGet:
path: /ready/
port: http
periodSeconds: 60
timeoutSeconds: 3
startupProbe:
httpGet:
path: /startupprobe/
port: http
periodSeconds: 10
timeoutSeconds: 3
resources:
{{- toYaml .Values.engine.resources | nindent 12 }}
{{- with .Values.engine.extraVolumeMounts }}
volumeMounts: {{- . | toYaml | nindent 12 }}
{{- end }}
{{- with .Values.engine.extraContainers }}
{{- tpl . $ | nindent 8 }}
{{- end }}
{{- with .Values.engine.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.engine.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.engine.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.engine.topologySpreadConstraints }}
topologySpreadConstraints:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.engine.priorityClassName }}
priorityClassName: {{ . }}
{{- end }}
{{- with .Values.engine.extraVolumes }}
volumes: {{- . | toYaml | nindent 8 }}
{{- end }}

View File

@@ -1,102 +0,0 @@
{{- if .Values.migrate.enabled -}}
apiVersion: batch/v1
kind: Job
metadata:
{{- if .Values.migrate.useHook }}
name: {{ printf "%s-migrate" (include "oncall.engine.fullname" .) }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-1"
{{- with .Values.migrate.annotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- else }}
name: {{ printf "%s-migrate-%s" (include "oncall.engine.fullname" .) (now | date "2006-01-02-15-04-05") }}
{{- with .Values.migrate.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
labels:
{{- include "oncall.engine.labels" . | nindent 4 }}
spec:
backoffLimit: 15
{{- if .Values.migrate.ttlSecondsAfterFinished }}
ttlSecondsAfterFinished: {{ .Values.migrate.ttlSecondsAfterFinished }}
{{- end }}
template:
metadata:
name: {{ printf "%s-migrate-%s" (include "oncall.engine.fullname" .) (now | date "2006-01-02-15-04-05") }}
{{- with .Values.podAnnotations }}
annotations:
random-annotation: {{ randAlphaNum 10 | lower }}
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "oncall.engine.selectorLabels" . | nindent 8 }}
spec:
restartPolicy: Never
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "oncall.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
{{- with .Values.migrate.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.migrate.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.migrate.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
containers:
- name: {{ .Chart.Name }}-migrate
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: {{ include "oncall.engine.image" . }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
command:
- /bin/sh
- -c
{{- if eq .Values.database.type "mysql" }}
- |
until (nc -vz $MYSQL_HOST $MYSQL_PORT);
do
echo "waiting for MySQL"; sleep 1;
done
python manage.py migrate
{{- else if eq .Values.database.type "postgresql" }}
- |
until (nc -vz $DATABASE_HOST $DATABASE_PORT);
do
echo "waiting for PostgreSQL"; sleep 1;
done
python manage.py migrate
{{- else }}
- python manage.py migrate
{{- end }}
env:
{{- include "snippet.oncall.env" . | nindent 12 }}
{{- include "snippet.oncall.smtp.env" . | nindent 12 }}
{{- include "snippet.oncall.exporter.env" . | nindent 12 }}
{{- include "snippet.db.env" . | nindent 12 }}
{{- include "snippet.broker.env" . | nindent 12 }}
{{- include "oncall.extraEnvs" . | nindent 12 }}
resources:
{{- toYaml .Values.migrate.resources | nindent 12 }}
{{- with .Values.migrate.extraVolumeMounts }}
volumeMounts: {{- . | toYaml | nindent 10 }}
{{- end }}
{{- with .Values.migrate.extraContainers }}
{{- tpl . $ | nindent 6 }}
{{- end }}
{{- with .Values.migrate.extraVolumes }}
volumes: {{- . | toYaml | nindent 8 }}
{{- end }}
{{- end }}

View File

@@ -1,24 +0,0 @@
{{- if .Values.service.enabled }}
apiVersion: v1
kind: Service
metadata:
name: {{ include "oncall.engine.fullname" . }}-external
labels:
{{- include "oncall.engine.labels" . | nindent 4 }}
{{- with .Values.service.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
type: {{ .Values.service.type }}
ports:
- port: {{ .Values.service.port }}
targetPort: http
protocol: TCP
name: http
{{- if and (eq .Values.service.type "NodePort") (.Values.service.nodePort) }}
nodePort: {{ .Values.service.nodePort }}
{{- end }}
selector:
{{- include "oncall.engine.selectorLabels" . | nindent 4 }}
{{- end }}

View File

@@ -1,15 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: {{ include "oncall.engine.fullname" . }}
labels:
{{- include "oncall.engine.labels" . | nindent 4 }}
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: http
protocol: TCP
name: http
selector:
{{- include "oncall.engine.selectorLabels" . | nindent 4 }}

View File

@@ -1,65 +0,0 @@
{{- if .Values.ingress.enabled -}}
{{- $fullName := include "oncall.fullname" . -}}
{{- $svcPort := .Values.service.port -}}
{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }}
{{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }}
{{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}}
{{- end }}
{{- end }}
{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1
{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}}
apiVersion: networking.k8s.io/v1beta1
{{- else -}}
apiVersion: extensions/v1beta1
{{- end }}
kind: Ingress
metadata:
name: {{ $fullName }}
labels:
{{- include "oncall.labels" . | nindent 4 }}
{{- with .Values.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- tpl (toYaml .Values.ingress.tls) . | nindent 4 }}
{{- end }}
rules:
- host: {{ .Values.base_url | quote }}
http:
paths:
{{- if .Values.ingress.extraPaths }}
{{ toYaml .Values.ingress.extraPaths | indent 6}}
{{- end }}
- path: /
pathType: Prefix
backend:
service:
name: {{ include "oncall.engine.fullname" . }}
port:
number: 8080
{{ if .Values.grafana.enabled }}
- path: /grafana
pathType: Prefix
backend:
service:
name: {{ include "oncall.grafana.fullname" . }}
port:
number: 80
{{- end }}
{{ if .Values.detached_integrations.enabled }}
- path: /integrations
pathType: Prefix
backend:
service:
name: {{ include "oncall.detached_integrations.fullname" . }}
port:
number: 8080
{{- end }}
{{- end }}

View File

@@ -1,26 +0,0 @@
{{/*
Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "oncall.detached_integrations.name" -}}
{{ include "oncall.name" . | trunc 55 }}-integrations
{{- end }}
{{- define "oncall.detached_integrations.fullname" -}}
{{ include "oncall.fullname" . | trunc 55 }}-integrations
{{- end }}
{{/*
Integrations common labels
*/}}
{{- define "oncall.detached_integrations.labels" -}}
{{ include "oncall.labels" . }}
app.kubernetes.io/component: integrations
{{- end }}
{{/*
Integrations selector labels
*/}}
{{- define "oncall.detached_integrations.selectorLabels" -}}
{{ include "oncall.selectorLabels" . }}
app.kubernetes.io/component: integrations
{{- end }}

View File

@@ -1,99 +0,0 @@
{{- if .Values.detached_integrations.enabled -}}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "oncall.detached_integrations.fullname" . }}
labels:
{{- include "oncall.detached_integrations.labels" . | nindent 4 }}
spec:
replicas: {{ .Values.detached_integrations.replicaCount }}
selector:
matchLabels:
{{- include "oncall.detached_integrations.selectorLabels" . | nindent 6 }}
strategy:
{{- toYaml .Values.detached_integrations.updateStrategy | nindent 4 }}
template:
metadata:
{{- with .Values.podAnnotations }}
annotations:
random-annotation: {{ randAlphaNum 10 | lower }}
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "oncall.detached_integrations.selectorLabels" . | nindent 8 }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "oncall.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
{{- include "oncall.initContainer" . | indent 8 }}
containers:
- name: {{ .Chart.Name }}
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: {{ include "oncall.engine.image" . }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
{{- if .Values.oncall.devMode }}
command: ["sh", "-c", "uwsgi --disable-logging --py-autoreload 3 --ini uwsgi.ini"]
{{- end }}
ports:
- name: http
containerPort: 8080
protocol: TCP
env:
{{- include "snippet.oncall.engine.env" . | nindent 12 }}
- name: ROOT_URLCONF
value: "engine.integrations_urls"
livenessProbe:
httpGet:
path: /health/
port: http
periodSeconds: 60
timeoutSeconds: 3
readinessProbe:
httpGet:
path: /ready/
port: http
periodSeconds: 60
timeoutSeconds: 3
startupProbe:
httpGet:
path: /startupprobe/
port: http
periodSeconds: 10
timeoutSeconds: 3
resources:
{{- toYaml .Values.detached_integrations.resources | nindent 12 }}
{{- with .Values.detached_integrations.extraVolumeMounts }}
volumeMounts: {{- . | toYaml | nindent 12 }}
{{- end }}
{{- with .Values.detached_integrations.extraContainers }}
{{- tpl . $ | nindent 8 }}
{{- end }}
{{- with .Values.detached_integrations.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.detached_integrations.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.detached_integrations.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.detached_integrations.topologySpreadConstraints }}
topologySpreadConstraints:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.detached_integrations.priorityClassName }}
priorityClassName: {{ . }}
{{- end }}
{{- with .Values.detached_integrations.extraVolumes }}
volumes: {{- . | toYaml | nindent 8 }}
{{- end }}
{{- end -}}

View File

@@ -1,24 +0,0 @@
{{- if .Values.detached_integrations_service.enabled }}
apiVersion: v1
kind: Service
metadata:
name: {{ include "oncall.detached_integrations.fullname" . }}-external
labels:
{{- include "oncall.detached_integrations.labels" . | nindent 4 }}
{{- with .Values.detached_integrations_service.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
type: {{ .Values.detached_integrations_service.type }}
ports:
- port: {{ .Values.detached_integrations_service.port }}
targetPort: http
protocol: TCP
name: http
{{- if and (eq .Values.detached_integrations_service.type "NodePort") (.Values.detached_integrations_service.nodePort) }}
nodePort: {{ .Values.detached_integrations_service.nodePort }}
{{- end }}
selector:
{{- include "oncall.detached_integrations.selectorLabels" . | nindent 4 }}
{{- end }}

View File

@@ -1,17 +0,0 @@
{{- if .Values.detached_integrations.enabled -}}
apiVersion: v1
kind: Service
metadata:
name: {{ include "oncall.detached_integrations.fullname" . }}
labels:
{{- include "oncall.detached_integrations.labels" . | nindent 4 }}
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: http
protocol: TCP
name: http
selector:
{{- include "oncall.detached_integrations.selectorLabels" . | nindent 4 }}
{{- end -}}

View File

@@ -1,98 +0,0 @@
{{- if not .Values.oncall.secrets.existingSecret }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "oncall.fullname" . }}
labels:
{{- include "oncall.labels" . | nindent 4 }}
{{- if .Values.migrate.useHook }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-5"
{{- end }}
type: Opaque
data:
{{ include "snippet.oncall.secret.secretKey" . }}: {{ randAlphaNum 40 | b64enc | quote }}
{{ include "snippet.oncall.secret.mirageSecretKey" . }}: {{ randAlphaNum 40 | b64enc | quote }}
---
{{- end }}
{{- if and (eq .Values.database.type "mysql") (not .Values.mariadb.enabled) (not .Values.externalMysql.existingSecret) }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "oncall.fullname" . }}-mysql-external
{{- if .Values.migrate.useHook }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-5"
{{- end }}
type: Opaque
data:
mariadb-root-password: {{ required "externalMysql.password is required if not mariadb.enabled and not externalMysql.existingSecret" .Values.externalMysql.password | b64enc | quote }}
---
{{- end }}
{{- if and (not .Values.postgresql.enabled) (eq .Values.database.type "postgresql") (not .Values.externalPostgresql.existingSecret) }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "oncall.fullname" . }}-postgresql-external
{{- if .Values.migrate.useHook }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-5"
{{- end }}
type: Opaque
data:
postgres-password: {{ required "externalPostgresql.password is required if not postgresql.enabled and not externalPostgresql.existingSecret" .Values.externalPostgresql.password | b64enc | quote }}
---
{{- end }}
{{- if and (eq .Values.broker.type "rabbitmq") (not .Values.rabbitmq.enabled) (not .Values.externalRabbitmq.existingSecret) }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "oncall.fullname" . }}-rabbitmq-external
{{- if .Values.migrate.useHook }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-5"
{{- end }}
type: Opaque
data:
rabbitmq-password: {{ required "externalRabbitmq.password is required if not rabbitmq.enabled and not externalRabbitmq.existingSecret" .Values.externalRabbitmq.password | b64enc | quote }}
---
{{- end }}
{{- if and (eq .Values.broker.type "redis") (not .Values.redis.enabled) (not .Values.externalRedis.existingSecret) }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "oncall.fullname" . }}-redis-external
{{- if .Values.migrate.useHook }}
annotations:
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-5"
{{- end }}
type: Opaque
data:
redis-password: {{ required "externalRedis.password is required if not redis.enabled and not externalRedis.existingSecret" .Values.externalRedis.password | b64enc | quote }}
---
{{- end }}
{{- if and .Values.oncall.smtp.enabled .Values.oncall.smtp.password }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "oncall.fullname" . }}-smtp
type: Opaque
data:
smtp-password: {{ .Values.oncall.smtp.password | b64enc | quote }}
---
{{- end }}
{{- if and .Values.oncall.exporter.enabled .Values.oncall.exporter.authToken }}
apiVersion: v1
kind: Secret
metadata:
name: {{ include "oncall.fullname" . }}-exporter
type: Opaque
data:
exporter-secret: {{ .Values.oncall.exporter.authToken | b64enc | quote }}
---
{{- end }}

View File

@@ -1,18 +0,0 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
name: {{ include "oncall.serviceAccountName" . }}
labels:
{{- include "oncall.labels" . | nindent 4 }}
{{- if or (.Values.migrate.useHook) (.Values.serviceAccount.annotations) }}
annotations:
{{- if .Values.migrate.useHook }}
"helm.sh/hook": pre-install,pre-upgrade
"helm.sh/hook-weight": "-5"
{{- end }}
{{- with .Values.serviceAccount.annotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
{{- end }}

View File

@@ -1,22 +0,0 @@
{{/*
Maximum of 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "oncall.telegramPolling.fullname" -}}
{{ include "oncall.fullname" . | trunc 45 }}-telegram-polling
{{- end }}
{{/*
Telegram polling common labels
*/}}
{{- define "oncall.telegramPolling.labels" -}}
{{ include "oncall.labels" . }}
app.kubernetes.io/component: telegram-polling
{{- end }}
{{/*
Telegram polling selector labels
*/}}
{{- define "oncall.telegramPolling.selectorLabels" -}}
{{ include "oncall.selectorLabels" . }}
app.kubernetes.io/component: telegram-polling
{{- end }}

View File

@@ -1,53 +0,0 @@
{{- if .Values.telegramPolling.enabled -}}
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "oncall.telegramPolling.fullname" . }}
labels:
{{- include "oncall.telegramPolling.labels" . | nindent 4 }}
spec:
replicas: 1
selector:
matchLabels:
{{- include "oncall.telegramPolling.selectorLabels" . | nindent 6 }}
template:
metadata:
labels:
{{- include "oncall.telegramPolling.selectorLabels" . | nindent 8 }}
{{- if .Values.telegramPolling.podLabels }}
{{- toYaml .Values.telegramPolling.podLabels | nindent 8 }}
{{- end }}
spec:
{{- with .Values.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "oncall.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
{{- include "oncall.initContainer" . | nindent 8 }}
containers:
- name: telegram-polling
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: {{ include "oncall.engine.image" . }}
imagePullPolicy: {{ .Values.image.pullPolicy }}
command: ['sh', '-c', 'python manage.py start_telegram_polling']
env:
{{- include "snippet.oncall.env" . | nindent 12 }}
{{- include "snippet.oncall.telegram.env" . | nindent 12 }}
{{- include "snippet.db.env" . | nindent 12 }}
{{- include "snippet.broker.env" . | nindent 12 }}
{{- include "oncall.extraEnvs" . | nindent 12 }}
{{- with .Values.telegramPolling.resources }}
resources:
{{- toYaml . | nindent 12 }}
{{- end }}
{{- with .Values.telegramPolling.extraVolumeMounts }}
volumeMounts: {{- . | toYaml | nindent 12 }}
{{- end }}
{{- with .Values.telegramPolling.extraVolumes }}
volumes: {{- . | toYaml | nindent 8 }}
{{- end }}
{{- end -}}

View File

@@ -1,8 +0,0 @@
{{- define "ui.env" -}}
{{- if .Values.ui.env }}
{{- range $key, $value := .Values.ui.env }}
- name: {{ $key }}
value: "{{ $value }}"
{{- end -}}
{{- end }}
{{- end }}

View File

@@ -1,31 +0,0 @@
{{- if .Values.ui.enabled }}
apiVersion: apps/v1
kind: Deployment
metadata:
name: oncall-ui
labels:
app.kubernetes.io/component: oncall-ui
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/component: oncall-ui
template:
metadata:
labels:
app.kubernetes.io/component: oncall-ui
spec:
containers:
- name: oncall-ui
image: "{{ .Values.ui.image.repository }}:{{ .Values.ui.image.tag | default .Chart.AppVersion }}"
imagePullPolicy: IfNotPresent
env:
{{- include "ui.env" . | nindent 12 }}
volumeMounts:
- mountPath: /etc/app
name: hot-reloaded-plugin
volumes:
- name: hot-reloaded-plugin
hostPath:
path: /oncall-plugin
{{- end }}

View File

@@ -1,719 +0,0 @@
# Values for configuring the deployment of Grafana OnCall
# Set the domain name Grafana OnCall will be installed on.
# If you want to install Grafana as a part of this release, make sure to configure grafana.grafana.ini.server.domain too
base_url: example.com
base_url_protocol: https
## Optionally specify an array of imagePullSecrets.
## Secrets must be manually created in the namespace.
## ref: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
## e.g:
## imagePullSecrets:
## - name: myRegistryKeySecretName
imagePullSecrets: []
image:
# Grafana OnCall docker image repository
repository: grafana/oncall
tag:
pullPolicy: Always
# Whether to create an additional service for external connections
# A ClusterIP service is always created
service:
enabled: false
type: LoadBalancer
port: 8080
annotations: {}
# Engine pods configuration
engine:
replicaCount: 1
resources:
{}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Labels for engine pods
podLabels: {}
## Deployment update strategy
## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy
updateStrategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 0
type: RollingUpdate
## Affinity for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
affinity: {}
## Node labels for pod assignment
## ref: https://kubernetes.io/docs/user-guide/node-selection/
nodeSelector: {}
## Tolerations for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
tolerations: []
## Topology spread constraints for pod assignment
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
topologySpreadConstraints: []
## Priority class for the pods
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/
priorityClassName: ""
# Extra containers which run as sidecars
extraContainers: ""
# extraContainers: |
# - name: cloud-sql-proxy
# image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2
# args:
# - --private-ip
# - --port=5432
# - example:europe-west3:grafana-oncall-db
# Extra volume mounts for the main app container
extraVolumeMounts: []
# - mountPath: /mnt/postgres-tls
# name: postgres-tls
# - mountPath: /mnt/redis-tls
# name: redis-tls
# Extra volumes for the pod
extraVolumes: []
# - name: postgres-tls
# configMap:
# name: my-postgres-tls
# defaultMode: 0640
# - name: redis-tls
# configMap:
# name: my-redis-tls
# defaultMode: 0640
detached_integrations_service:
enabled: false
type: LoadBalancer
port: 8080
annotations: {}
# Integrations pods configuration
detached_integrations:
enabled: false
replicaCount: 1
resources:
{}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
## Deployment update strategy
## ref: https://kubernetes.io/docs/concepts/workloads/controllers/deployment/#strategy
updateStrategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 0
type: RollingUpdate
## Affinity for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
affinity: {}
## Node labels for pod assignment
## ref: https://kubernetes.io/docs/user-guide/node-selection/
nodeSelector: {}
## Tolerations for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
tolerations: []
## Topology spread constraints for pod assignment
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
topologySpreadConstraints: []
## Priority class for the pods
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/
priorityClassName: ""
# Extra containers which run as sidecars
extraContainers: ""
# extraContainers: |
# - name: cloud-sql-proxy
# image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2
# args:
# - --private-ip
# - --port=5432
# - example:europe-west3:grafana-oncall-db
# Extra volume mounts for the container
extraVolumeMounts: []
# - mountPath: /mnt/postgres-tls
# name: postgres-tls
# - mountPath: /mnt/redis-tls
# name: redis-tls
# Extra volumes for the pod
extraVolumes: []
# - name: postgres-tls
# configMap:
# name: my-postgres-tls
# defaultMode: 0640
# - name: redis-tls
# configMap:
# name: my-redis-tls
# defaultMode: 0640
# Celery workers pods configuration
celery:
replicaCount: 1
worker_queue: "default,critical,long,slack,telegram,webhook,celery,grafana"
worker_concurrency: "1"
worker_max_tasks_per_child: "100"
worker_beat_enabled: "True"
## Restart the celery workers once per given interval as an additional precaution to the probes
## If this setting is enabled, a TERM signal will be sent to the celery workers
## It will lead to a warm shutdown (waiting for the tasks to complete) and a restart of the container
## If this setting is set, the number of pod restarts will increase
## Comment this line out if you want to disable these restarts
worker_shutdown_interval: "65m"
livenessProbe:
enabled: true
initialDelaySeconds: 30
periodSeconds: 300
timeoutSeconds: 10
resources:
{}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Labels for celery pods
podLabels: {}
## Affinity for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
affinity: {}
## Node labels for pod assignment
## ref: https://kubernetes.io/docs/user-guide/node-selection/
nodeSelector: {}
## Tolerations for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
tolerations: []
## Topology spread constraints for pod assignment
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
topologySpreadConstraints: []
## Priority class for the pods
## ref: https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/
priorityClassName: ""
# Extra containers which run as sidecars
extraContainers: ""
# extraContainers: |
# - name: cloud-sql-proxy
# image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2
# args:
# - --private-ip
# - --port=5432
# - example:europe-west3:grafana-oncall-db
# Extra volume mounts for the main container
extraVolumeMounts: []
# - mountPath: /mnt/postgres-tls
# name: postgres-tls
# - mountPath: /mnt/redis-tls
# name: redis-tls
# Extra volumes for the pod
extraVolumes: []
# - name: postgres-tls
# configMap:
# name: my-postgres-tls
# defaultMode: 0640
# - name: redis-tls
# configMap:
# name: my-redis-tls
# defaultMode: 0640
# Telegram polling pod configuration
telegramPolling:
enabled: false
resources:
{}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Labels for telegram-polling pods
podLabels: {}
# Extra volume mounts for the main container
extraVolumeMounts: []
# - mountPath: /mnt/postgres-tls
# name: postgres-tls
# - mountPath: /mnt/redis-tls
# name: redis-tls
# Extra volumes for the pod
extraVolumes: []
# - name: postgres-tls
# configMap:
# name: my-postgres-tls
# defaultMode: 0640
# - name: redis-tls
# configMap:
# name: my-redis-tls
# defaultMode: 0640
oncall:
# this is intended to be used for local development. In short, it will mount the ./engine dir into
# any backend-related containers to allow hot-reloading, and also run the containers with slightly modified
# startup commands (which configure the hot-reloading)
devMode: false
# Override default MIRAGE_CIPHER_IV (must be 16 bytes long)
# For existing installation, this should not be changed.
# mirageCipherIV: 1234567890abcdef
# oncall secrets
secrets:
# Use existing secret. (secretKey and mirageSecretKey is required)
existingSecret: ""
# The key in the secret containing secret key
secretKey: ""
# The key in the secret containing mirage secret key
mirageSecretKey: ""
# Slack configures the Grafana Oncall Slack ChatOps integration.
slack:
# Enable the Slack ChatOps integration for the Oncall Engine.
enabled: false
# Sets the Slack bot slash-command
commandName: oncall
# clientId configures the Slack app OAuth2 client ID.
# api.slack.com/apps/<yourApp> -> Basic Information -> App Credentials -> Client ID
clientId: ~
# clientSecret configures the Slack app OAuth2 client secret.
# api.slack.com/apps/<yourApp> -> Basic Information -> App Credentials -> Client Secret
clientSecret: ~
# signingSecret - configures the Slack app signature secret used to sign
# requests coming from Slack.
# api.slack.com/apps/<yourApp> -> Basic Information -> App Credentials -> Signing Secret
signingSecret: ~
# Use existing secret for clientId, clientSecret and signingSecret.
# clientIdKey, clientSecretKey and signingSecretKey are required
existingSecret: ""
# The key in the secret containing OAuth2 client ID
clientIdKey: ""
# The key in the secret containing OAuth2 client secret
clientSecretKey: ""
# The key in the secret containing the Slack app signature secret
signingSecretKey: ""
# OnCall external URL
redirectHost: ~
telegram:
enabled: false
token: ~
webhookUrl: ~
# Use existing secret. (tokenKey is required)
existingSecret: ""
# The key in the secret containing Telegram token
tokenKey: ""
smtp:
enabled: true
host: ~
port: ~
username: ~
password: ~
tls: ~
fromEmail: ~
exporter:
enabled: false
authToken: ~
twilio:
# Twilio account SID/username to allow OnCall to send SMSes and make phone calls
accountSid: ""
# Twilio password to allow OnCall to send SMSes and make calls
authToken: ""
# Number from which you will receive calls and SMS
# (NOTE: must be quoted, otherwise would be rendered as float value)
phoneNumber: ""
# SID of Twilio service for number verification. You can create a service in Twilio web interface.
# twilio.com -> verify -> create new service
verifySid: ""
# Twilio API key SID/username to allow OnCall to send SMSes and make phone calls
apiKeySid: ""
# Twilio API key secret/password to allow OnCall to send SMSes and make phone calls
apiKeySecret: ""
# Use existing secret for authToken, phoneNumber, verifySid, apiKeySid and apiKeySecret.
existingSecret: ""
# Twilio password to allow OnCall to send SMSes and make calls
# The key in the secret containing the auth token
authTokenKey: ""
# The key in the secret containing the phone number
phoneNumberKey: ""
# The key in the secret containing verify service sid
verifySidKey: ""
# The key in the secret containing api key sid
apiKeySidKey: ""
# The key in the secret containing the api key secret
apiKeySecretKey: ""
# Phone notifications limit (the only non-secret value).
# TODO: rename to phoneNotificationLimit
limitPhone:
# Whether to run django database migrations automatically
migrate:
enabled: true
# TTL can be unset by setting ttlSecondsAfterFinished: ""
ttlSecondsAfterFinished: 20
# use a helm hook to manage the migration job
useHook: false
annotations: {}
## Affinity for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
affinity: {}
## Node labels for pod assignment
## ref: https://kubernetes.io/docs/user-guide/node-selection/
nodeSelector: {}
## Tolerations for pod assignment
## ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
tolerations: []
# Extra containers which run as sidecars
extraContainers: ""
# extraContainers: |
# - name: cloud-sql-proxy
# image: gcr.io/cloud-sql-connectors/cloud-sql-proxy:2.1.2
# args:
# - --private-ip
# - --port=5432
# - example:europe-west3:grafana-oncall-db
resources:
{}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
# Extra volume mounts for the main container
extraVolumeMounts: []
# - mountPath: /mnt/postgres-tls
# name: postgres-tls
# - mountPath: /mnt/redis-tls
# name: redis-tls
# Extra volumes for the pod
extraVolumes: []
# - name: postgres-tls
# configMap:
# name: my-postgres-tls
# defaultMode: 0640
# - name: redis-tls
# configMap:
# name: my-redis-tls
# defaultMode: 0640
# Sets environment variables with names capitalized and prefixed with UWSGI_,
# and with dashes substituted with underscores.
# see more: https://uwsgi-docs.readthedocs.io/en/latest/Configuration.html#environment-variables
# Set null to disable all UWSGI environment variables
uwsgi:
listen: 1024
# Additional env variables to add to deployments
env: {}
# Enable ingress object for external access to the resources
ingress:
enabled: true
# className: ""
annotations:
kubernetes.io/ingress.class: "nginx"
cert-manager.io/issuer: "letsencrypt-prod"
tls:
- hosts:
- "{{ .Values.base_url }}"
secretName: certificate-tls
# Extra paths to prepend to the host configuration. If using something
# like an ALB ingress controller, you may want to configure SSL redirects
extraPaths: []
# - path: /*
# backend:
# serviceName: ssl-redirect
# servicePort: use-annotation
## Or for k8s > 1.19
# - path: /*
# pathType: Prefix
# backend:
# service:
# name: ssl-redirect
# port:
# name: use-annotation
# Whether to install ingress controller
ingress-nginx:
enabled: true
# Install cert-manager as a part of the release
cert-manager:
enabled: true
# Install CRD resources
installCRDs: true
webhook:
timeoutSeconds: 30
# cert-manager tries to use a port that is already in use, so we change it to another one
# https://github.com/cert-manager/cert-manager/issues/3237
# https://cert-manager.io/docs/installation/compatibility/
securePort: 10260
# Fix self-checks https://github.com/jetstack/cert-manager/issues/4286
podDnsPolicy: None
podDnsConfig:
nameservers:
- 8.8.8.8
- 1.1.1.1
database:
# can be either mysql or postgresql
type: mysql
# MySQL is included in this release for convenience.
# It is recommended to host it separately from this release.
# Set mariadb.enabled = false and configure externalMysql
mariadb:
enabled: true
auth:
database: oncall
existingSecret:
primary:
extraEnvVars:
- name: MARIADB_COLLATE
value: utf8mb4_unicode_ci
- name: MARIADB_CHARACTER_SET
value: utf8mb4
secondary:
extraEnvVars:
- name: MARIADB_COLLATE
value: utf8mb4_unicode_ci
- name: MARIADB_CHARACTER_SET
value: utf8mb4
# Make sure to create the database with the following parameters:
# CREATE DATABASE oncall CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
externalMysql:
host:
port:
db_name:
user:
password:
# Use an existing secret for the mysql password.
existingSecret:
# The key in the secret containing the mysql username
usernameKey:
# The key in the secret containing the mysql password
passwordKey:
# Extra options (see example below)
# Reference: https://pymysql.readthedocs.io/en/latest/modules/connections.html
options:
# options: >-
# ssl_verify_cert=true
# ssl_verify_identity=true
# ssl_ca=/mnt/mysql-tls/ca.crt
# ssl_cert=/mnt/mysql-tls/client.crt
# ssl_key=/mnt/mysql-tls/client.key
# PostgreSQL is included in this release for convenience.
# It is recommended to host it separately from this release.
# Set postgresql.enabled = false and configure externalPostgresql
postgresql:
enabled: false
auth:
database: oncall
existingSecret:
# Make sure to create the database with the following parameters:
# CREATE DATABASE oncall WITH ENCODING UTF8;
externalPostgresql:
host:
port:
db_name:
user:
password:
# Use an existing secret for the database password
existingSecret:
# The key in the secret containing the database password
passwordKey:
# Extra options (see example below)
# Reference: https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS
options:
# options: >-
# sslmode=verify-full
# sslrootcert=/mnt/postgres-tls/ca.crt
# sslcert=/mnt/postgres-tls/client.crt
# sslkey=/mnt/postgres-tls/client.key
# RabbitMQ is included in this release for convenience.
# It is recommended to host it separately from this release.
# Set rabbitmq.enabled = false and configure externalRabbitmq
rabbitmq:
enabled: true
auth:
existingPasswordSecret:
broker:
type: rabbitmq
externalRabbitmq:
host:
port:
user:
password:
protocol:
vhost:
# Use an existing secret for the rabbitmq password
existingSecret:
# The key in the secret containing the rabbitmq password
passwordKey: ""
# The key in the secret containing the rabbitmq username
usernameKey: username
# Redis is included in this release for convenience.
# It is recommended to host it separately from this release.
redis:
enabled: true
auth:
existingSecret:
externalRedis:
protocol:
host:
port:
database:
username:
password:
# Use an existing secret for the redis password
existingSecret:
# The key in the secret containing the redis password
passwordKey:
# SSL options
ssl_options:
enabled: false
# CA certificate
ca_certs:
# Client SSL certs
certfile:
keyfile:
# SSL verification mode: "cert_none" | "cert_optional" | "cert_required"
cert_reqs:
# Grafana is included in this release for convenience.
# It is recommended to host it separately from this release.
grafana:
enabled: true
grafana.ini:
server:
domain: example.com
root_url: "%(protocol)s://%(domain)s/grafana"
serve_from_sub_path: true
persistence:
enabled: true
# Disable psp as PodSecurityPolicy is deprecated in v1.21+, unavailable in v1.25+
rbac:
pspEnabled: false
plugins:
- grafana-oncall-app
externalGrafana:
# Example: https://grafana.mydomain.com
url:
nameOverride: ""
fullnameOverride: ""
serviceAccount:
# Specifies whether a service account should be created
create: true
# Annotations to add to the service account
annotations: {}
# The name of the service account to use.
# If not set and create is true, a name is generated using the fullname template
name: ""
podAnnotations: {}
podSecurityContext:
{}
# fsGroup: 2000
securityContext:
{}
# capabilities:
# drop:
# - ALL
# readOnlyRootFilesystem: true
# runAsNonRoot: true
# runAsGroup: 2000
# runAsUser: 1000
init:
securityContext:
{}
# allowPrivilegeEscalation: false
# capabilities:
# drop:
# - ALL
# privileged: false
# readOnlyRootFilesystem: true
# runAsGroup: 2000
# runAsNonRoot: true
# runAsUser: 1000
resources:
{}
# limits:
# cpu: 100m
# memory: 128Mi
# requests:
# cpu: 100m
# memory: 128Mi
ui:
# this is intended to be used for local development. In short, it will spin up an additional container
# running the plugin frontend, such that hot reloading can be enabled
enabled: false
image:
repository: oncall/ui
tag: dev
# Additional env vars for the ui container
env: {}
prometheus:
enabled: false
# extraScrapeConfigs: |
# - job_name: 'oncall-exporter'
# metrics_path: /metrics/
# static_configs:
# - targets:
# - oncall-dev-engine.default.svc.cluster.local:8080

View File

@@ -1,19 +0,0 @@
oncall:
database:
type: "postgresql"
broker:
type: "redis"
ingress:
enabled: false
cert-manager:
enabled: false
mariadb:
enabled: false
postgresql:
enabled: false
rabbitmq:
enabled: false
redis:
enabled: false
grafana:
enabled: false

View File

@@ -19,3 +19,26 @@ update:
helm repo add fluent https://fluent.github.io/helm-charts
helm repo update fluent
helm pull fluent/fluent-bit --untar --untardir charts
# alerts from victoria-metrics-k8s-stack
helm repo add vm https://victoriametrics.github.io/helm-charts/
helm repo update vm
helm pull vm/victoria-metrics-k8s-stack --untar --untardir charts
rm -rf charts/victoria-metrics-k8s-stack/charts
rm -rf charts/victoria-metrics-k8s-stack/hack
rm -rf charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator
rm -rf charts/victoria-metrics-k8s-stack/templates/grafana
rm -rf charts/victoria-metrics-k8s-stack/templates/ingress.yaml
rm -rf charts/victoria-metrics-k8s-stack/files/dashboards
rm -f charts/victoria-metrics-k8s-stack/templates/servicemonitors.yaml
rm -f charts/victoria-metrics-k8s-stack/templates/serviceaccount.yaml
rm -f charts/victoria-metrics-k8s-stack/templates/rules/additionalVictoriaMetricsRules.yml
sed -i '/ namespace:/d' charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml
sed -i 's|job="apiserver"|job="kube-apiserver"|g' `grep -rl 'job="apiserver"' charts/victoria-metrics-k8s-stack/files/rules/generated`
sed -i 's|severity: info|severity: informational|g' `grep -rl 'severity: info' ./charts/victoria-metrics-k8s-stack/files/rules/generated`
sed -i 's|severity: none|severity: ok|g' ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml
sed -i ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml \
-e '/Watchdog/,/severity:/s/severity: none/severity: ok/' \
-e '/InfoInhibitor/,/severity:/s/severity: none/severity: major/'
# TODO
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/vm*.yaml

View File

@@ -1,57 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-general
namespace: cozy-monitoring
spec:
groups:
- name: coreos.general
labels:
alertname: foo
rules:
- alert: DeadMansSwitch
expr: vector(1)
labels:
severity_level: "4"
annotations:
description: This is a DeadMansSwitch meant to ensure that the entire Alerting
pipeline is functional.
summary: Alerting DeadMansSwitch
- record: fd_utilization
expr: process_open_fds / process_max_fds
- alert: FdExhaustionClose
expr: max(predict_linear(fd_utilization{pod!=""}[1h], 3600 * 4)) BY (job, namespace, pod) > 1
for: 10m
labels:
severity_level: "4"
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
will exhaust its file/socket descriptors within the next 4 hours'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: max(predict_linear(fd_utilization{pod=""}[1h], 3600 * 4)) BY (job, instance) > 1
for: 10m
labels:
severity_level: "4"
annotations:
description: '{{ $labels.job }}: {{ $labels.instance }} instance
will exhaust its file/socket descriptors within the next 4 hours'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: max(predict_linear(fd_utilization{pod!=""}[10m], 3600)) BY (job, namespace, pod) > 1
for: 10m
labels:
severity_level: "3"
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
will exhaust its file/socket descriptors within the next hour'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: max(predict_linear(fd_utilization{pod=""}[10m], 3600)) BY (job, instance) > 1
for: 10m
labels:
severity_level: "3"
annotations:
description: '{{ $labels.job }}: {{ $labels.instance }} instance
will exhaust its file/socket descriptors within the next hour'
summary: file descriptors soon exhausted

View File

@@ -1,25 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node
namespace: cozy-monitoring
spec:
groups:
- name: kubernetes.dns
rules:
- alert: KubernetesDnsTargetDown
expr: absent(up{job="kube-dns"} == 1)
for: 5m
labels:
severity_level: "5"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: "markdown"
description: |-
Prometheus is unable to collect metrics from kube-dns. Thus its status is unknown.
To debug the problem, use the following commands:
1. `kubectl -n kube-system describe deployment -l k8s-app=kube-dns`
2. `kubectl -n kube-system describe pod -l k8s-app=kube-dns`
summary: Kube-dns or CoreDNS are not under monitoring.

View File

@@ -1,50 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-state-metrics
namespace: cozy-monitoring
spec:
groups:
- name: kube-state-metrics
rules:
- alert: DeploymentGenerationMismatch
expr: max(kube_deployment_status_observed_generation != kube_deployment_metadata_generation) by (namespace, deployment)
for: 15m
labels:
severity_level: "4"
annotations:
plk_protocol_version: "1"
description: Observed deployment generation does not match expected one for
deployment {{$labels.namespace}}/{{$labels.deployment}}
summary: Deployment is outdated
- alert: KubeStateMetricsDown
expr: absent(up{job="kube-state-metrics"} == 1)
for: 5m
labels:
severity_level: "3"
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
description: |-
There are no metrics about cluster resources for 5 minutes.
Most alerts and monitoring panels aren't working.
To debug the problem:
1. Check kube-state-metrics pods: `kubectl -n d8-monitoring describe pod -l app=kube-state-metrics`
2. Check its logs: `kubectl -n d8-monitoring describe deploy kube-state-metrics`
summary: >
Kube-state-metrics is not working in the cluster.
- name: kubernetes.rules
rules:
- alert: KubePodCrashLooping
expr: |
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1
annotations:
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is in waiting state (reason: "CrashLoopBackOff").'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
summary: Pod is crash looping.
for: 15m
labels:
severity: warning

View File

@@ -1,63 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubelet
namespace: cozy-monitoring
spec:
groups:
- name: coreos.kubelet
rules:
- alert: K8SNodeNotReady
expr: min(kube_node_status_condition{condition="Ready",status="true"}) BY (node) == 0 and
min(kube_node_spec_unschedulable == 0) by (node)
for: 1m
labels:
severity_level: "3"
annotations:
plk_protocol_version: "1"
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
or has set itself to NotReady, for more than 10 minutes
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0 and on (node) kube_node_spec_unschedulable == 0) > 1
and (count(kube_node_status_condition{condition="Ready",status="true"} == 0 and on (node) kube_node_spec_unschedulable == 0) /
count(kube_node_status_condition{condition="Ready",status="true"} and on (node) kube_node_spec_unschedulable == 0)) > 0.2
for: 1m
labels:
severity_level: "3"
annotations:
plk_protocol_version: "1"
description: '{{ $value }}% of Kubernetes nodes are not ready'
summary: Too many nodes are not ready
- alert: K8SKubeletDown
expr: (count(up{job="kubelet"} == 0) or absent(up{job="kubelet"} == 1)) / count(up{job="kubelet"}) * 100 > 3
for: 10m
labels:
severity_level: "4"
tier: "cluster"
annotations:
plk_protocol_version: "1"
plk_group_for__target_down: "TargetDown,prometheus=deckhouse,job=kubelet,kubernetes=~kubernetes"
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: A few kubelets cannot be scraped
- alert: K8SKubeletDown
expr: (count(up{job="kubelet"} == 0) or absent(up{job="kubelet"} == 1)) / count(up{job="kubelet"}) * 100 > 10
for: 30m
labels:
severity_level: "3"
tier: "cluster"
annotations:
plk_protocol_version: "1"
plk_group_for__target_down: "TargetDown,prometheus=deckhouse,job=kubelet,kubernetes=~kubernetes"
description: Prometheus failed to scrape {{ $value }}% of kubelets.
summary: Many kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pods > on(node) (kube_node_status_capacity{resource="pods",unit="integer"}) * 0.9
for: 10m
labels:
severity_level: "7"
annotations:
plk_protocol_version: "1"
description: Kubelet {{ $labels.node }} is running {{ $value }} pods, close
to the limit of {{ printf "kube_node_status_capacity{resource=\"pods\",unit=\"integer\",node=\"%s\"}" $labels.node | query | first | value }}
summary: Kubelet is close to pod limit

View File

@@ -1,357 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node
namespace: cozy-monitoring
spec:
groups:
- name: kubernetes.node.disk_inodes_usage
rules:
- alert: KubeletNodeFSInodesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 <
max by (node, mountpoint) (kubelet_eviction_nodefs_inodes{type="soft"})
)
for: 10m
labels:
severity_level: "9"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_nodefs_inodes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletNodeFSInodesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 <
max by (node, mountpoint) (kubelet_eviction_nodefs_inodes{type="hard"} + 5)
)
for: 5m
labels:
severity_level: "7"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
Threshold at: {{ printf "kubelet_eviction_nodefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: >
Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
- alert: KubeletNodeFSInodesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 <
max by (node, mountpoint) (kubelet_eviction_nodefs_inodes{type="hard"})
)
labels:
severity_level: "6"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_nodefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletNodeFSInodesUsage
expr: |
(
(
max by (node, mountpoint) (node_filesystem_files_free)
) == 0
)
* (max by (node, mountpoint) ({__name__=~"kubelet_eviction_nodefs_inodes"}))
labels:
severity_level: "5"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
summary: No more free inodes on nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
- alert: KubeletImageFSInodesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 <
max by (node, mountpoint) (kubelet_eviction_imagefs_inodes{type="soft"})
)
for: 10m
labels:
severity_level: "9"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Soft eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_imagefs_inodes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Soft eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletImageFSInodesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 <
max by (node, mountpoint) (kubelet_eviction_imagefs_inodes{type="hard"} + 5)
)
for: 5m
labels:
severity_level: "7"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Close to hard eviction threshold of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
Threshold at: {{ printf "kubelet_eviction_imagefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Close to hard eviction threshold of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
- alert: KubeletImageFSInodesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_files_free / node_filesystem_files) * 100 <
max by (node, mountpoint) (kubelet_eviction_imagefs_inodes{type="hard"})
)
labels:
severity_level: "6"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Hard eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_imagefs_inodes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Hard eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletImageFSInodesUsage
expr: |
(
(
max by (node, mountpoint) (node_filesystem_files_free)
) == 0
)
* (max by (node, mountpoint) ({__name__=~"kubelet_eviction_imagefs_inodes"}))
labels:
severity_level: "5"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
summary: No more free inodes on imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
- name: kubernetes.node.disk_bytes_usage
rules:
- alert: KubeletNodeFSBytesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 <
max by (node, mountpoint) (kubelet_eviction_nodefs_bytes{type="soft"})
)
for: 10m
labels:
severity_level: "9"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_nodefs_bytes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Soft eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletNodeFSBytesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 <
max by (node, mountpoint) (kubelet_eviction_nodefs_bytes{type="hard"} + 5)
)
for: 5m
labels:
severity_level: "7"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
Threshold at: {{ printf "kubelet_eviction_nodefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Close to hard eviction threshold of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
- alert: KubeletNodeFSBytesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 <
max by (node, mountpoint) (kubelet_eviction_nodefs_bytes{type="hard"})
)
labels:
severity_level: "6"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_nodefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Hard eviction of nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletNodeFSBytesUsage
expr: |
(
(
max by (node, mountpoint) (node_filesystem_avail_bytes)
) == 0
)
* (max by (node, mountpoint) ({__name__=~"kubelet_eviction_nodefs_bytes"}))
labels:
severity_level: "5"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
summary: No more free space on nodefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
- alert: KubeletImageFSBytesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 <
max by (node, mountpoint) (kubelet_eviction_imagefs_bytes{type="soft"})
)
for: 10m
labels:
severity_level: "9"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Soft eviction of imagefs (filesystem that the container runtime uses for storing images and container writable layers) on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_imagefs_bytes{type=\"soft\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Soft eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletImageFSBytesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 <
max by (node, mountpoint) (kubelet_eviction_imagefs_bytes{type="hard"} + 5)
)
for: 5m
labels:
severity_level: "7"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Close to hard eviction threshold of imagefs (filesystem that the container runtime uses for storing images and container writable layers) on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
Threshold at: {{ printf "kubelet_eviction_imagefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Close to hard eviction threshold of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
- alert: KubeletImageFSBytesUsage
expr: |
(
max by (node, mountpoint) (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 <
max by (node, mountpoint) (kubelet_eviction_imagefs_bytes{type="hard"})
)
labels:
severity_level: "6"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
Hard eviction of imagefs (filesystem that the container runtime uses for storing images and container writable layers) on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
Threshold at: {{ printf "kubelet_eviction_imagefs_bytes{type=\"hard\", node=\"%s\", mountpoint=\"%s\"}" $labels.node $labels.mountpoint | query | first | value }}%
Currently at: {{ .Value }}%
summary: Hard eviction of imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint is in progress.
- alert: KubeletImageFSBytesUsage
expr: |
(
(
max by (node, mountpoint) (node_filesystem_avail_bytes)
) == 0
)
* (max by (node, mountpoint) ({__name__=~"kubelet_eviction_imagefs_bytes"}))
labels:
severity_level: "5"
tier: cluster
annotations:
plk_protocol_version: "1"
plk_markup_format: markdown
plk_create_group_if_not_exists__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_disk_usage: "NodeDiskUsage,tier=cluster,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
No more free bytes on imagefs (filesystem that the container runtime uses for storing images and container writable layers) on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.
summary: No more free bytes on imagefs on the {{$labels.node}} Node at the {{$labels.mountpoint}} mountpoint.

View File

@@ -1,36 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node
namespace: cozy-monitoring
spec:
groups:
- name: coreos.node
rules:
- record: node:node_cpu:rate:sum
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m]))
BY (node)
- record: node:node_filesystem_usage:sum
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (node)
- record: node:node_network_receive_bytes:rate:sum
expr: sum(rate(node_network_receive_bytes[3m])) BY (node)
- record: node:node_network_transmit_bytes:rate:sum
expr: sum(rate(node_network_transmit_bytes[3m])) BY (node)
- record: node:node_cpu:ratio
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(node)
GROUP_LEFT() count(sum(node_cpu) BY (node, cpu)) BY (node)
- record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
- record: cluster:node_cpu:ratio
expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (node, cpu))
- alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
severity_level: "3"
annotations:
plk_protocol_version: "1"
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
summary: Prometheus could not scrape a node-exporter

View File

@@ -1,34 +0,0 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node
namespace: cozy-monitoring
spec:
groups:
- name: kubernetes.pod_status_incorrect
rules:
- alert: PodStatusIsIncorrect
expr: >
(count by (node, namespace, pod) (kube_pod_status_ready{condition="true"} == 0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
and
(
(count by (namespace, pod) (kube_pod_container_status_ready==1) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
unless
(count by (namespace, pod) (kube_pod_container_status_ready==0) * on (namespace, pod) group_left(node) (max by (namespace, node, pod) (kube_pod_info)))
)
for: 10m
annotations:
plk_markup_format: markdown
plk_protocol_version: "1"
plk_create_group_if_not_exists__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
plk_grouped_by__node_have_pods_with_incorrect_status: "NodeHavePodsWithIncorrectStatus,prometheus=deckhouse,node={{ $labels.node }},kubernetes=~kubernetes"
description: |
There is a {{ $labels.namespace }}/{{ $labels.pod }} Pod in the cluster that runs on the {{ $labels.node }} and listed as NotReady while all the Pod's containers are Ready.
This could be due to the [Kubernetes bug](https://github.com/kubernetes/kubernetes/issues/80968).
The recommended course of action:
1. Find all the Pods having this state: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | "\(.spec.nodeName)/\(.metadata.namespace)/\(.metadata.name)"'`;
2. Find all the Nodes affected: `kubectl get pod -o json --all-namespaces | jq '.items[] | select(.status.phase == "Running") | select(.status.conditions[] | select(.type == "ContainersReady" and .status == "True")) | select(.status.conditions[] | select(.type == "Ready" and .status == "False")) | .spec.nodeName' -r | sort | uniq -c`;
3. Restart `kubelet` on each Node: `systemctl restart kubelet`.
summary: The state of the {{ $labels.namespace }}/{{ $labels.pod }} Pod running on the {{ $labels.node }} Node is incorrect. You need to restart `kubelet`.

View File

@@ -22,5 +22,5 @@
*.tmproj
.vscode/
# exclude helm unit tests
tests/
# Ignore img folder used for documentation
img/

View File

@@ -0,0 +1,688 @@
## Next release
- TODO
## 0.25.17
**Release date:** 2024-09-20
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
- Fixed ETCD dashboard
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
- Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.
## 0.25.16
**Release date:** 2024-09-10
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Do not truncate servicemonitor, datasources, rules, dashboard, alertmanager & vmalert template names
- Use service label for node-exporter instead of podLabel. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1458)
- Added common chart to a k8s-stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1456)
- Fixed value of custom alertmanager configSecret. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1461)
## 0.25.15
**Release date:** 2024-09-05
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Drop empty endpoints param from scrape configuration
- Fixed proto when TLS is enabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1449)
## 0.25.14
**Release date:** 2024-09-04
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed alertmanager templates
## 0.25.13
**Release date:** 2024-09-04
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Use operator's own service monitor
## 0.25.12
**Release date:** 2024-09-03
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fixed dashboards rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1414)
- Fixed service monitor label name.
## 0.25.11
**Release date:** 2024-09-03
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Merged ingress templates
- Removed custom VMServiceScrape for operator
- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
- Migrated Node Exporter dashboard into chart
- Deprecated `grafana.sidecar.jsonData` and `grafana.provisionDefaultDatasource` in favour of the `grafana.sidecar.datasources.default` slice of datasources.
- Fail if no notifiers are set, do not set `notifiers` to null if empty
## 0.25.10
**Release date:** 2024-08-31
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed ingress extraPaths and externalVM urls rendering
## 0.25.9
**Release date:** 2024-08-31
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed vmalert ingress name typo
- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
## 0.25.8
**Release date:** 2024-08-30
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed external notifiers rendering, when alertmanager is disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1378)
## 0.25.7
**Release date:** 2024-08-30
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed extra rules template context
## 0.25.6
**Release date:** 2024-08-29
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
**Update note**: Update `kubeProxy.spec` to `kubeProxy.vmScrape.spec`
**Update note**: Update `kubeScheduler.spec` to `kubeScheduler.vmScrape.spec`
**Update note**: Update `kubeEtcd.spec` to `kubeEtcd.vmScrape.spec`
**Update note**: Update `coreDns.spec` to `coreDns.vmScrape.spec`
**Update note**: Update `kubeDns.spec` to `kubeDns.vmScrape.spec`
**Update note**: Update `kubeProxy.spec` to `kubeProxy.vmScrape.spec`
**Update note**: Update `kubeControllerManager.spec` to `kubeControllerManager.vmScrape.spec`
**Update note**: Update `kubeApiServer.spec` to `kubeApiServer.vmScrape.spec`
**Update note**: Update `kubelet.spec` to `kubelet.vmScrape.spec`
**Update note**: Update `kube-state-metrics.spec` to `kube-state-metrics.vmScrape.spec`
**Update note**: Update `prometheus-node-exporter.spec` to `prometheus-node-exporter.vmScrape.spec`
**Update note**: Update `grafana.spec` to `grafana.vmScrape.spec`
- bump version of VM components to [v1.103.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.103.0)
- Added `dashboards.<dashboardName>` bool flag to enable dashboard even if component it is for is not installed.
- Allow extra `vmalert.notifiers` without dropping default notifier if `alertmanager.enabled: true`
- Do not drop default notifier, when vmalert.additionalNotifierConfigs is set
- Replaced static url proto with a template, which selects proto depending on a present tls configuration
- Moved kubernetes components monitoring config from `spec` config to `vmScrape.spec`
- Merged servicemonitor templates
## 0.25.5
**Release date:** 2024-08-26
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- TODO
## 0.25.4
**Release date:** 2024-08-26
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updates operator to [v0.47.2](https://github.com/VictoriaMetrics/operator/releases/tag/v0.47.2)
- kube-state-metrics - 5.16.4 -> 5.25.1
- prometheus-node-exporter - 4.27.0 -> 4.29.0
- grafana - 8.3.8 -> 8.4.7
- added configurable `.Values.global.clusterLabel` to all alerting and recording rules `by` and `on` expressions
## 0.25.3
**Release date:** 2024-08-23
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updated operator to v0.47.1 release
- Build `app.kubernetes.io/instance` label consistently. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1282)
## 0.25.2
**Release date:** 2024-08-21
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed vmalert ingress name. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1271)
- fixed alertmanager ingress host template rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1270)
## 0.25.1
**Release date:** 2024-08-21
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Added `.Values.global.license` configuration
- Fixed extraLabels rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1248)
- Fixed vmalert url to alertmanager by including its path prefix
- Removed `networking.k8s.io/v1beta1/Ingress` and `extensions/v1beta1/Ingress` support
- Fixed kubedns servicemonitor template. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1255)
## 0.25.0
**Release date:** 2024-08-16
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
**Update note**: it requires updating the CRD dependency manually before upgrading
**Update note**: requires Helm 3.14+
- Moved dashboards templating logic out of sync script to Helm template
- Allow to disable default grafana datasource
- Synchronize Etcd dashboards and rules with mixin provided by Etcd
- Add alerting rules for VictoriaMetrics operator.
- Updated alerting rules for VictoriaMetrics components.
- Fixed exact rule annotations propagation to other rules.
- Set minimal kubernetes version to 1.25
- updates operator to v0.47.0 version
## 0.24.5
**Release date:** 2024-08-01
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.102.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.1)
## 0.24.4
**Release date:** 2024-08-01
![AppVersion: v1.102.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update dependencies: grafana -> 8.3.6.
- Added `.Values.defaultRules.alerting` and `.Values.defaultRules.recording` to set up common properties for all alerting and recording rules
## 0.24.3
**Release date:** 2024-07-23
![AppVersion: v1.102.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.102.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0)
## 0.24.2
**Release date:** 2024-07-15
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fix vmalertmanager configuration when using `.VMAlertmanagerSpec.ConfigRawYaml`. See [this pull request](https://github.com/VictoriaMetrics/helm-charts/pull/1136).
## 0.24.1
**Release date:** 2024-07-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updates operator to v0.46.4
## 0.24.0
**Release date:** 2024-07-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- added ability to override alerting rules labels and annotations:
- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
- for all rules in a group - `.Values.defaultRules.groups.<groupName>.rules.spec.labels` and `.Values.defaultRules.groups.<groupName>.rules.spec.annotations`
- for each rule individually - `.Values.defaultRules.rules.<ruleName>.spec.labels` and `.Values.defaultRules.rules.<ruleName>.spec.annotations`
- changed `.Values.defaultRules.rules.<groupName>` to `.Values.defaultRules.groups.<groupName>.create`
- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups.<groupName>.targetNamespace`
- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups.<groupName>.spec.params`
## 0.23.6
**Release date:** 2024-07-08
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- added ability to override alerting rules labels and annotations:
- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
- for all rules in a group - `.Values.defaultRules.groups.<groupName>.rules.spec.labels` and `.Values.defaultRules.groups.<groupName>.rules.spec.annotations`
- for each rule individually - `.Values.defaultRules.rules.<ruleName>.spec.labels` and `.Values.defaultRules.rules.<ruleName>.spec.annotations`
- changed `.Values.defaultRules.rules.<groupName>` to `.Values.defaultRules.groups.<groupName>.create`
- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups.<groupName>.targetNamespace`
- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups.<groupName>.spec.params`
## 0.23.5
**Release date:** 2024-07-04
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Support configuring vmalert `-notifier.config` with `.Values.vmalert.additionalNotifierConfigs`.
## 0.23.4
**Release date:** 2024-07-02
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Add `extraObjects` to allow deploying additional resources with the chart release.
## 0.23.3
**Release date:** 2024-06-26
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Enable [conversion of Prometheus CRDs](https://docs.victoriametrics.com/operator/migration/#objects-conversion) by default. See [this](https://github.com/VictoriaMetrics/helm-charts/pull/1069) pull request for details.
- use bitnami/kubectl image for cleanup instead of deprecated gcr.io/google_containers/hyperkube
## 0.23.2
**Release date:** 2024-06-14
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Do not add `cluster` external label at VMAgent by default. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/774) for the details.
## 0.23.1
**Release date:** 2024-06-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updates operator to v0.45.0 release
- sync latest vm alerts and dashboards.
## 0.23.0
**Release date:** 2024-05-30
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- sync latest etcd v3.5.x rules from [upstream](https://github.com/etcd-io/etcd/blob/release-3.5/contrib/mixin/mixin.libsonnet).
- add Prometheus operator CRDs as an optional dependency. See [this PR](https://github.com/VictoriaMetrics/helm-charts/pull/1022) and [related issue](https://github.com/VictoriaMetrics/helm-charts/issues/341) for the details.
## 0.22.1
**Release date:** 2024-05-14
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fix missing serviceaccounts patch permission in VM operator, see [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1012) for details.
## 0.22.0
**Release date:** 2024-05-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM operator to [0.44.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.44.0)
## 0.21.3
**Release date:** 2024-04-26
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.101.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.101.0)
## 0.21.2
**Release date:** 2024-04-23
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM operator to [0.43.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.3)
## 0.21.1
**Release date:** 2024-04-18
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
## 0.21.0
**Release date:** 2024-04-18
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- TODO
- bump version of VM operator to [0.43.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.0)
- updates CRDs definitions.
## 0.20.1
**Release date:** 2024-04-16
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- upgraded dashboards and alerting rules, added values file for local (Minikube) setup
- bump version of VM components to [v1.100.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.100.1)
## 0.20.0
**Release date:** 2024-04-02
![AppVersion: v1.99.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.99.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM operator to [0.42.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.42.3)
## 0.19.4
**Release date:** 2024-03-05
![AppVersion: v1.99.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.99.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0)
## 0.19.3
**Release date:** 2024-03-05
![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Commented default configuration for alertmanager. It simplifies configuration and makes it more explicit. See this [issue](https://github.com/VictoriaMetrics/helm-charts/issues/473) for details.
- Allow enabling/disabling default k8s rules when installing. See [#904](https://github.com/VictoriaMetrics/helm-charts/pull/904) by @passie.
## 0.19.2
**Release date:** 2024-02-26
![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix templating of VMAgent `remoteWrite` in case both `VMSingle` and `VMCluster` are disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/865) for details.
## 0.19.1
**Release date:** 2024-02-21
![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update dependencies: victoria-metrics-operator -> 0.28.1, grafana -> 7.3.1.
- Update victoriametrics CRD resources yaml.
## 0.19.0
**Release date:** 2024-02-09
![AppVersion: v1.97.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.97.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Do not store original labels in `vmagent`'s memory by default. This reduces memory usage of `vmagent` but makes `vmagent`'s debugging UI less informative. See [this docs](https://docs.victoriametrics.com/vmagent/#relabel-debug) for details on relabeling debug.
- Update dependencies: kube-state-metrics -> 5.16.0, prometheus-node-exporter -> 4.27.0, grafana -> 7.3.0.
- Update victoriametrics CRD resources yaml.
- Update builtin dashboards and rules.
## 0.18.12
**Release date:** 2024-02-01
![AppVersion: v1.97.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.97.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.97.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.97.1)
- Fix helm lint when ingress resources enabled - split templates of resources per kind. See [#820](https://github.com/VictoriaMetrics/helm-charts/pull/820) by @MemberIT.
## 0.18.11
**Release date:** 2023-12-15
![AppVersion: v1.96.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.96.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix missing `.Values.defaultRules.rules.vmcluster` value. See [#801](https://github.com/VictoriaMetrics/helm-charts/pull/801) by @MemberIT.
## 0.18.10
**Release date:** 2023-12-12
![AppVersion: v1.96.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.96.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0)
- Add optional allowCrossNamespaceImport to GrafanaDashboard(s) (#788)
## 0.18.9
**Release date:** 2023-12-08
![AppVersion: v1.95.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Properly use variable from values file for Grafana datasource type. (#769)
- Update dashboards from upstream sources. (#780)
## 0.18.8
**Release date:** 2023-11-16
![AppVersion: v1.95.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.95.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.1)
## 0.18.7
**Release date:** 2023-11-15
![AppVersion: v1.95.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.95.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.0)
- Support adding extra group parameters for default vmrules. (#752)
## 0.18.6
**Release date:** 2023-11-01
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix kube scheduler default scraping port from 10251 to 10259, Kubernetes changed it since 1.23.0. See [this pr](https://github.com/VictoriaMetrics/helm-charts/pull/736) for details.
- Bump version of operator chart to [0.27.4](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.4)
## 0.18.5
**Release date:** 2023-10-08
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update operator chart to [v0.27.3](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.3) for fixing [#708](https://github.com/VictoriaMetrics/helm-charts/issues/708)
## 0.18.4
**Release date:** 2023-10-04
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update dependencies: [victoria-metrics-operator -> 0.27.2](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.2), prometheus-node-exporter -> 4.23.2, grafana -> 6.59.5.
## 0.18.3
**Release date:** 2023-10-04
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.94.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.94.0)
## 0.18.2
**Release date:** 2023-09-28
![AppVersion: v1.93.5](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.5&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix behavior of `vmalert.remoteWriteVMAgent` - remoteWrite.url for VMAlert is correctly generated considering endpoint, name, port and http.pathPrefix of VMAgent
## 0.18.1
**Release date:** 2023-09-21
![AppVersion: v1.93.5](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.5&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of VM components to [v1.93.5](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.5)
## 0.18.0
**Release date:** 2023-09-12
![AppVersion: v1.93.4](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.4&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of `grafana` helm-chart to `6.59.*`
- Bump version of `prometheus-node-exporter` helm-chart to `4.23.*`
- Bump version of `kube-state-metrics` helm-chart to `0.59.*`
- Update alerting rules
- Update grafana dashboards
- Add `make` commands `sync-rules` and `sync-dashboards`
- Add support of VictoriaMetrics datasource
## 0.17.8
**Release date:** 2023-09-11
![AppVersion: v1.93.4](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.4&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of VM components to [v1.93.4](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.4)
- Bump version of operator chart to [0.27.0](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.0)
## 0.17.7
**Release date:** 2023-09-07
![AppVersion: v1.93.3](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.3&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of operator helm-chart to `0.26.2`
## 0.17.6
**Release date:** 2023-09-04
![AppVersion: v1.93.3](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.3&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Move `cleanupCRD` option to victoria-metrics-operator chart (#593)
- Disable `honorTimestamps` for cadvisor scrape job by default (#617)
- For vmalert all replicas of alertmanager are added to notifiers (only if alertmanager is enabled) (#619)
- Add `grafanaOperatorDashboardsFormat` option (#615)
- Fix query expression for memory calculation in `k8s-views-global` dashboard (#636)
- Bump version of Victoria Metrics components to `v1.93.3`
- Bump version of operator helm-chart to `0.26.0`
## 0.17.5
**Release date:** 2023-08-23
![AppVersion: v1.93.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.93.0 to v1.93.1
## 0.17.4
**Release date:** 2023-08-12
![AppVersion: v1.93.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.92.1 to v1.93.0
- delete an obsolete parameter remaining by mistake (see <https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack#upgrade-to-0130>) (#602)
## 0.17.3
**Release date:** 2023-07-28
![AppVersion: v1.92.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.92.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.92.0 to v1.92.1 (#599)
## 0.17.2
**Release date:** 2023-07-27
![AppVersion: v1.92.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.92.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.91.3 to v1.92.0

View File

@@ -0,0 +1,24 @@
dependencies:
- name: victoria-metrics-common
repository: https://victoriametrics.github.io/helm-charts
version: 0.0.11
- name: victoria-metrics-operator
repository: https://victoriametrics.github.io/helm-charts
version: 0.34.8
- name: kube-state-metrics
repository: https://prometheus-community.github.io/helm-charts
version: 5.25.1
- name: prometheus-node-exporter
repository: https://prometheus-community.github.io/helm-charts
version: 4.39.0
- name: grafana
repository: https://grafana.github.io/helm-charts
version: 8.4.9
- name: crds
repository: ""
version: 0.0.0
- name: prometheus-operator-crds
repository: https://prometheus-community.github.io/helm-charts
version: 11.0.0
digest: sha256:11b119ebabf4ff0ea2951e7c72f51d0223dc3f50fb061a43b01fe7856491b836
generated: "2024-09-12T11:50:51.935071545Z"

View File

@@ -0,0 +1,66 @@
annotations:
artifacthub.io/category: monitoring-logging
artifacthub.io/changes: |
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
- Fixed ETCD dashboard
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
- 'Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.'
artifacthub.io/license: Apache-2.0
artifacthub.io/links: |
- name: Sources
url: https://docs.victoriametrics.com/vmgateway
- name: Charts repo
url: https://victoriametrics.github.io/helm-charts/
- name: Docs
url: https://docs.victoriametrics.com
artifacthub.io/operator: "true"
apiVersion: v2
appVersion: v1.102.1
dependencies:
- name: victoria-metrics-common
repository: https://victoriametrics.github.io/helm-charts
version: 0.0.*
- condition: victoria-metrics-operator.enabled
name: victoria-metrics-operator
repository: https://victoriametrics.github.io/helm-charts
version: 0.34.*
- condition: kube-state-metrics.enabled
name: kube-state-metrics
repository: https://prometheus-community.github.io/helm-charts
version: 5.25.*
- condition: prometheus-node-exporter.enabled
name: prometheus-node-exporter
repository: https://prometheus-community.github.io/helm-charts
version: 4.39.*
- condition: grafana.enabled
name: grafana
repository: https://grafana.github.io/helm-charts
version: 8.4.*
- condition: crds.enabled
name: crds
repository: ""
version: 0.0.0
- condition: prometheus-operator-crds.enabled
name: prometheus-operator-crds
repository: https://prometheus-community.github.io/helm-charts
version: 11.0.*
description: Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics
Operator, Grafana dashboards, ServiceScrapes and VMRules
home: https://github.com/VictoriaMetrics/helm-charts
icon: https://avatars.githubusercontent.com/u/43720803?s=200&v=4
keywords:
- victoriametrics
- operator
- monitoring
- kubernetes
- observability
- tsdb
- metrics
- metricsql
- timeseries
kubeVersion: '>=1.25.0-0'
name: victoria-metrics-k8s-stack
sources:
- https://github.com/VictoriaMetrics/helm-charts
type: application
version: 0.25.17

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,300 @@
{{ template "chart.typeBadge" . }} {{ template "chart.versionBadge" . }}
[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/victoriametrics)](https://artifacthub.io/packages/helm/victoriametrics/victoria-metrics-k8s-stack)
{{ template "chart.description" . }}
* [Overview](#Overview)
* [Configuration](#Configuration)
* [Prerequisites](#Prerequisites)
* [Dependencies](#Dependencies)
* [Quick Start](#How-to-install)
* [Uninstall](#How-to-uninstall)
* [Version Upgrade](#Upgrade-guide)
* [Troubleshooting](#Troubleshooting)
* [Values](#Parameters)
## Overview
This chart is an all-in-one solution to start monitoring a Kubernetes cluster.
It installs multiple dependency charts like [grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana), [node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter), [kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) and [victoria-metrics-operator](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator).
Also it installs Custom Resources like [VMSingle](https://docs.victoriametrics.com/operator/quick-start#vmsingle), [VMCluster](https://docs.victoriametrics.com/operator/quick-start#vmcluster), [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent), [VMAlert](https://docs.victoriametrics.com/operator/quick-start#vmalert).
By default, the operator [converts all existing prometheus-operator API objects](https://docs.victoriametrics.com/operator/quick-start#migration-from-prometheus-operator-objects) into corresponding VictoriaMetrics Operator objects.
To enable metrics collection for Kubernetes, this chart installs multiple scrape configurations for Kubernetes components like kubelet, kube-proxy, etc. Metrics collection is done by [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent). So if you want to ship metrics to an external VictoriaMetrics database, you can disable the VMSingle installation by setting `vmsingle.enabled` to `false` and setting `vmagent.vmagentSpec.remoteWrite.url` to your external VictoriaMetrics database.
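A minimal `values.yaml` sketch of that setup (the endpoint URL here is a placeholder, and note that the chart's own example further down in this README uses the `vmagent.spec.remoteWrite` form):
```yaml
# Sketch: ship metrics to an external VictoriaMetrics instead of the bundled VMSingle
vmsingle:
  enabled: false
vmagent:
  spec:
    remoteWrite:
      # placeholder URL - replace with your external VictoriaMetrics write endpoint
      - url: "https://vm.example.com/api/v1/write"
```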
This chart also installs a bunch of dashboards and recording rules from the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project.
![Overview](img/k8s-stack-overview.png)
## Configuration
Configuration of this chart is done through helm values.
### Dependencies
Dependencies can be enabled or disabled by setting `enabled` to `true` or `false` in `values.yaml` file.
**!Important:** for dependency charts, anything that you can find in the values.yaml of a dependency chart can be configured in this chart under the key for that dependency. For example, if you want to configure `grafana` you can find all possible configuration options in [values.yaml](https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml) and you should set them in the values for this chart under the `grafana:` key. For example, if you want to configure `grafana.persistence.enabled` you should set it in values.yaml like this:
```yaml
#################################################
### dependencies #####
#################################################
# Grafana dependency chart configuration. For possible values refer to https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration
grafana:
enabled: true
persistence:
type: pvc
enabled: false
```
### VictoriaMetrics components
This chart installs multiple VictoriaMetrics components using Custom Resources that are managed by [victoria-metrics-operator](https://docs.victoriametrics.com/operator/design)
Each resource can be configured using the `spec` of that resource from the API docs of [victoria-metrics-operator](https://docs.victoriametrics.com/operator/api). For example, if you want to configure `VMAgent` you can find all possible configuration options in the [API docs](https://docs.victoriametrics.com/operator/api#vmagent) and you should set them in the values for this chart under the `vmagent.spec` key. For example, if you want to configure `remoteWrite.url` you should set it in values.yaml like this:
```yaml
vmagent:
spec:
remoteWrite:
- url: "https://insert.vmcluster.domain.com/insert/0/prometheus/api/v1/write"
```
### ArgoCD issues
#### Operator self signed certificates
When deploying the K8s stack using ArgoCD without Cert Manager (`.Values.victoria-metrics-operator.admissionWebhooks.certManager.enabled: false`),
ArgoCD will re-render the operator's webhook certificates on each sync, since the Helm `lookup` function is not respected by ArgoCD.
To prevent this, please update your K8s stack Application `spec.syncPolicy` and `spec.ignoreDifferences` with the following:
```yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
...
spec:
...
syncPolicy:
syncOptions:
# https://argo-cd.readthedocs.io/en/stable/user-guide/sync-options/#respect-ignore-difference-configs
# argocd must also ignore difference during apply stage
# otherwise it'll silently override changes and cause a problem
- RespectIgnoreDifferences=true
ignoreDifferences:
- group: ""
kind: Secret
name: <fullname>-validation
namespace: kube-system
jsonPointers:
- /data
- group: admissionregistration.k8s.io
kind: ValidatingWebhookConfiguration
name: <fullname>-admission
jqPathExpressions:
- '.webhooks[]?.clientConfig.caBundle'
```
where `<fullname>` is output of `{{"{{"}} include "vm-operator.fullname" {{"}}"}}` for your setup
#### `metadata.annotations: Too long: must have at most 262144 bytes` on dashboards
If one of the dashboard ConfigMaps fails with the error `Too long: must have at most 262144 bytes`, please make sure you've added the `argocd.argoproj.io/sync-options: ServerSideApply=true` annotation to your dashboards:
```yaml
grafana:
sidecar:
dashboards:
additionalDashboardAnnotations:
argocd.argoproj.io/sync-options: ServerSideApply=true
```
### Rules and dashboards
By default this chart installs multiple dashboards and recording rules from [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus).
You can disable the dashboards with `defaultDashboardsEnabled: false` and `experimentalDashboardsEnabled: false`,
and rules can be configured under `defaultRules`.
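A values sketch of these toggles (the `etcd` group name is only an illustration; check `values.yaml` for the actual rule group names and keys):
```yaml
# Sketch: disable the bundled dashboards and tune the default rule groups
defaultDashboardsEnabled: false
experimentalDashboardsEnabled: false
defaultRules:
  create: true
  groups:
    etcd:
      create: false   # example: skip a single rule group
```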
### Prometheus scrape configs
This chart installs multiple scrape configurations for Kubernetes monitoring. They are configured under the `#ServiceMonitors` section in the `values.yaml` file. For example, if you want to configure the scrape config for `kubelet` you should set it in values.yaml like this:
```yaml
kubelet:
enabled: true
# spec for VMNodeScrape crd
# https://docs.victoriametrics.com/operator/api#vmnodescrapespec
spec:
interval: "30s"
```
### Using externally managed Grafana
If you want to use an externally managed Grafana instance but still want to use the dashboards provided by this chart you can set
`grafana.enabled` to `false` and set `defaultDashboardsEnabled` to `true`. This will install the dashboards
but will not install Grafana.
For example:
```yaml
defaultDashboardsEnabled: true
grafana:
enabled: false
```
This will create ConfigMaps with dashboards to be imported into Grafana.
If additional labels or annotations are needed in order to import the dashboards into an existing Grafana, you can
set `.grafana.sidecar.dashboards.additionalDashboardLabels` or `.grafana.sidecar.dashboards.additionalDashboardAnnotations` in `values.yaml`:
For example:
```yaml
defaultDashboardsEnabled: true
grafana:
enabled: false
sidecar:
dashboards:
additionalDashboardLabels:
key: value
additionalDashboardAnnotations:
key: value
```
## Prerequisites
* Install the following packages: ``git``, ``kubectl``, ``helm``, ``helm-docs``. See this [tutorial](../../REQUIREMENTS.md).
* Add dependency chart repositories
```console
helm repo add grafana https://grafana.github.io/helm-charts
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
```
* PV support on underlying infrastructure.
{{ include "chart.installSection" . }}
### Install locally (Minikube)
To run the VictoriaMetrics stack locally it's possible to use [Minikube](https://github.com/kubernetes/minikube). To avoid issues with dashboards and alert rules, please follow the steps below:
Run a Minikube cluster
```
minikube start --container-runtime=containerd --extra-config=scheduler.bind-address=0.0.0.0 --extra-config=controller-manager.bind-address=0.0.0.0
```
Install the helm chart
```
helm install [RELEASE_NAME] vm/victoria-metrics-k8s-stack -f values.yaml -f values.minikube.yaml -n NAMESPACE --debug --dry-run
```
{{ include "chart.uninstallSection" . }}
CRDs created by this chart are not removed by default and should be manually cleaned up:
```console
kubectl get crd | grep victoriametrics.com | awk '{print $1 }' | xargs -i kubectl delete crd {}
```
## Troubleshooting
- If you cannot install the helm chart and get the error `configmap already exist`, it could be caused by a name collision from a release name that is too long.
Kubernetes allows only 63 symbols in resource names, and all resource names are trimmed by helm to 63 symbols.
To mitigate it, use a shorter helm chart release name, like:
```bash
# stack - is short enough
helm upgrade -i stack vm/victoria-metrics-k8s-stack
```
Or use an override for the helm chart release name:
```bash
helm upgrade -i some-very-long-name vm/victoria-metrics-k8s-stack --set fullnameOverride=stack
```
## Upgrade guide
Usually, helm upgrade doesn't require manual actions. Just execute the command:
```console
$ helm upgrade [RELEASE_NAME] vm/victoria-metrics-k8s-stack
```
But a release with a CRD update can only be patched manually with kubectl.
Since helm does not perform CRD updates, we recommend that you always perform this step when updating the helm chart version:
```console
# 1. check the changes in CRD
$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl diff -f -
# 2. apply the changes (update CRD)
$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl apply -f - --server-side
```
All other upgrades requiring manual actions are listed below:
### Upgrade to 0.13.0
- node-exporter, starting from version 4.0.0, uses the Kubernetes recommended labels. Therefore, you have to delete the daemonset before you upgrade:
```bash
kubectl delete daemonset -l app=prometheus-node-exporter
```
- scrape configuration for Kubernetes components was moved from the `vmServiceScrape.spec` section to the `spec` section. If you previously modified the scrape configuration, you need to update your `values.yaml`
- `grafana.defaultDashboardsEnabled` was renamed to `defaultDashboardsEnabled` (moved to the top level). You may need to update it in your `values.yaml`, as shown in the sketch below
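A minimal `values.yaml` sketch of that rename (illustrative only):
```yaml
# before (chart versions earlier than 0.13.0)
grafana:
  defaultDashboardsEnabled: true

# after (chart 0.13.0 and later): the flag lives at the top level
defaultDashboardsEnabled: true
```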
### Upgrade to 0.6.0
All CRDs must be updated to the latest version with the command:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
```
### Upgrade to 0.4.0
All CRDs must be updated to version `v1` with the command:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
```
### Upgrade from 0.2.8 to 0.2.9
Update the `VMAgent` CRD with the command:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.16.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
```
### Upgrade from 0.2.5 to 0.2.6
New CRDs were added to the operator - `VMUser` and `VMAuth` - and new fields were added to existing CRDs.
Manual commands:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmusers.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmauths.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmalerts.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmsingles.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmclusters.yaml
```
{{ include "chart.helmDocs" . }}
## Parameters
The following table lists the configurable parameters of the chart and their default values.
Change the values according to the needs of your environment in the ``victoria-metrics-k8s-stack/values.yaml`` file.
{{ template "chart.valuesTableHtml" . }}

View File

@@ -0,0 +1,40 @@
# Release process guidance
## Update version for VictoriaMetrics kubernetes monitoring stack
1. Update dependency requirements in [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml)
2. Apply changes via `helm dependency update`
3. Update image tag in chart values:
<div class="with-copy" markdown="1">
```console
make sync-rules
make sync-dashboards
```
</div>
4. Bump the version of the victoria-metrics-k8s-stack [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml)
5. Run linter:
<div class="with-copy" markdown="1">
```console
make lint
```
</div>
6. Render templates locally to check for errors:
<div class="with-copy" markdown="1">
```console
helm template vm-k8s-stack ./charts/victoria-metrics-k8s-stack --output-dir out --values ./charts/victoria-metrics-k8s-stack/values.yaml --debug
```
</div>
7. Test the updated chart by installing it into your Kubernetes cluster.
8. Update docs with:
```console
helm-docs
```
9. Commit the changes and send a [PR](https://github.com/VictoriaMetrics/helm-charts/pulls)

View File

@@ -0,0 +1,12 @@
# Release notes for version 0.25.17
**Release date:** 2024-09-20
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
- Fixed ETCD dashboard
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
- Allow using vmalert without notifiers configuration. Note that it is required to set `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration, as in the sketch below.
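A minimal `values.yaml` sketch of that setting (illustrative only; it assumes the `vmalert.spec.extraArgs` map referenced in the note above, with the flag passed as a string value):
```yaml
vmalert:
  spec:
    extraArgs:
      # start vmalert with no notifiers configured
      notifiers.blackhole: "true"
```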

View File

@@ -0,0 +1,13 @@
---
weight: 1
title: CHANGELOG
menu:
docs:
weight: 1
identifier: helm-victoriametrics-k8s-stack-changelog
parent: helm-victoriametrics-k8s-stack
url: /helm/victoriametrics-k8s-stack/changelog
aliases:
- /helm/victoriametrics-k8s-stack/changelog/index.html
---
{{% content "CHANGELOG.md" %}}

View File

@@ -0,0 +1,13 @@
---
weight: 9
title: VictoriaMetrics K8s Stack
menu:
docs:
parent: helm
weight: 9
identifier: helm-victoriametrics-k8s-stack
url: /helm/victoriametrics-k8s-stack
aliases:
- /helm/victoriametrics-k8s-stack/index.html
---
{{% content "README.md" %}}

View File

@@ -0,0 +1,165 @@
condition: '{{ .Values.kubeEtcd.enabled }}'
name: etcd
rules:
- alert: etcdMembersDown
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
summary: 'etcd cluster members are down.'
condition: '{{ true }}'
expr: |-
max without (endpoint) (
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
or
count without (To) (
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
)
)
> 0
for: 10m
labels:
severity: critical
- alert: etcdInsufficientMembers
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
summary: 'etcd cluster has insufficient number of members.'
condition: '{{ true }}'
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
summary: 'etcd cluster has no leader.'
condition: '{{ true }}'
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: 'etcd cluster has high number of leader changes.'
condition: '{{ true }}'
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster has high number of failed grpc requests.'
condition: '{{ true }}'
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster has high number of failed grpc requests.'
condition: '{{ true }}'
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.'
summary: 'etcd grpc requests are slow'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster member communication is slow.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster has high number of proposal failures.'
condition: '{{ true }}'
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster 99th percentile fsync durations are too high.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster 99th percentile fsync durations are too high.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: critical
- alert: etcdHighCommitDurations
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster 99th percentile commit durations are too high.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- alert: etcdDatabaseQuotaLowSpace
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
summary: 'etcd cluster database is running full.'
condition: '{{ true }}'
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: 10m
labels:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.'
summary: 'etcd cluster database growing very fast.'
condition: '{{ true }}'
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
labels:
severity: warning
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation'
summary: 'etcd database size in use is less than 50% of the actual allocated storage.'
condition: '{{ true }}'
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
labels:
severity: warning

View File

@@ -0,0 +1,53 @@
condition: '{{ true }}'
name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/targetdown'
summary: 'One or more targets are unreachable.'
condition: '{{ true }}'
expr: 100 * (count(up == 0) BY (job,namespace,service,{{ .Values.global.clusterLabel }}) / count(up) BY (job,namespace,service,{{ .Values.global.clusterLabel }})) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/watchdog'
summary: 'An alert that should always be firing to certify that Alertmanager is working properly.'
condition: '{{ true }}'
expr: vector(1)
labels:
severity: ok
- alert: InfoInhibitor
annotations:
description: 'This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
other alerts.
This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a
severity of ''warning'' or ''critical'' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor'
summary: 'Info-level alert inhibition.'
condition: '{{ true }}'
expr: ALERTS{severity = "info"} == 1 unless on (namespace,{{ .Values.global.clusterLabel }}) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: major

View File

@@ -0,0 +1,11 @@
condition: '{{ true }}'
name: k8s.rules.container_cpu_usage_seconds_total
rules:
- condition: '{{ true }}'
expr: |-
sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (
1, max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate

View File

@@ -0,0 +1,10 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_cache
rules:
- condition: '{{ true }}'
expr: |-
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache

View File

@@ -0,0 +1,10 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_rss
rules:
- condition: '{{ true }}'
expr: |-
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss

View File

@@ -0,0 +1,10 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_swap
rules:
- condition: '{{ true }}'
expr: |-
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap

View File

@@ -0,0 +1,10 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_working_set_bytes
rules:
- condition: '{{ true }}'
expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_working_set_bytes

View File

@@ -0,0 +1,79 @@
condition: '{{ true }}'
name: k8s.rules.container_resource
rules:
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum

View File

@@ -0,0 +1,54 @@
condition: '{{ true }}'
name: k8s.rules.pod_owner
rules:
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on (replicaset,namespace,{{ .Values.global.clusterLabel }}) group_left(owner_name) topk by (replicaset,namespace,{{ .Values.global.clusterLabel }}) (
1, max by (replicaset,namespace,owner_name,{{ .Values.global.clusterLabel }}) (
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: deployment
record: namespace_workload_pod:kube_pod_owner:relabel
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: daemonset
record: namespace_workload_pod:kube_pod_owner:relabel
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: job
record: namespace_workload_pod:kube_pod_owner:relabel

View File

@@ -0,0 +1,128 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
interval: 3m
name: kube-apiserver-availability.rules
rules:
- condition: '{{ true }}'
expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
record: code_verb:apiserver_request_total:increase30d
- condition: '{{ true }}'
expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- condition: '{{ true }}'
expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_count{job="kube-apiserver"}[1h]))
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
- condition: '{{ true }}'
expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
- condition: '{{ true }}'
expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
- condition: '{{ true }}'
expr: |-
1 - (
(
# write too slow
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
) +
(
# read too slow
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
) +
# errors
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- condition: '{{ true }}'
expr: |-
1 - (
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
# too slow
(
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- condition: '{{ true }}'
expr: |-
1 - (
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- condition: '{{ true }}'
expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: code_resource:apiserver_request_total:rate5m
- condition: '{{ true }}'
expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: code_resource:apiserver_request_total:rate5m
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h

View File

@@ -0,0 +1,318 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
name: kube-apiserver-burnrate.rules
rules:
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
record: apiserver_request:burnrate1d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
record: apiserver_request:burnrate1h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
record: apiserver_request:burnrate2h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
record: apiserver_request:burnrate30m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
record: apiserver_request:burnrate3d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: apiserver_request:burnrate5m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
record: apiserver_request:burnrate6h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
record: apiserver_request:burnrate1d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
record: apiserver_request:burnrate1h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
record: apiserver_request:burnrate2h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
record: apiserver_request:burnrate30m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
record: apiserver_request:burnrate3d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: apiserver_request:burnrate5m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
record: apiserver_request:burnrate6h

View File

@@ -0,0 +1,15 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
name: kube-apiserver-histogram.rules
rules:
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
labels:
quantile: '0.99'
verb: read
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
labels:
quantile: '0.99'
verb: write
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile

View File

@@ -0,0 +1,63 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
name: kube-apiserver-slos
rules:
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
and
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m
labels:
long: 1h
severity: critical
short: 5m
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
and
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m
labels:
long: 6h
severity: critical
short: 30m
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h
labels:
long: 1d
severity: warning
short: 2h
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
and
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h
labels:
long: 3d
severity: warning
short: 6h

View File

@@ -0,0 +1,9 @@
condition: '{{ true }}'
name: kube-prometheus-general.rules
rules:
- condition: '{{ true }}'
expr: count without(instance, pod, node) (up == 1)
record: count:up1
- condition: '{{ true }}'
expr: count without(instance, pod, node) (up == 0)
record: count:up0

View File

@@ -0,0 +1,21 @@
condition: '{{ true }}'
name: kube-prometheus-node-recording.rules
rules:
- condition: '{{ true }}'
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
record: instance:node_cpu:rate:sum
- condition: '{{ true }}'
expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- condition: '{{ true }}'
expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- condition: '{{ true }}'
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- condition: '{{ true }}'
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- condition: '{{ true }}'
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
record: cluster:node_cpu:ratio

View File

@@ -0,0 +1,48 @@
condition: '{{ .Values.kubeScheduler.enabled }}'
name: kube-scheduler.rules
rules:
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile

View File

@@ -0,0 +1,55 @@
condition: '{{ true }}'
name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors'
summary: 'kube-state-metrics is experiencing errors in list operations.'
condition: '{{ true }}'
expr: |-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }})
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }}))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors'
summary: 'kube-state-metrics is experiencing errors in watch operations.'
condition: '{{ true }}'
expr: |-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }})
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }}))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: 'kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch'
summary: 'kube-state-metrics sharding is misconfigured.'
condition: '{{ true }}'
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) != 0
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: 'kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing'
summary: 'kube-state-metrics shards are missing.'
condition: '{{ true }}'
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - 1
-
sum( 2 ^ max by (shard_ordinal,{{ .Values.global.clusterLabel }}) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by ({{ .Values.global.clusterLabel }})
!= 0
for: 15m
labels:
severity: critical

View File

@@ -0,0 +1,18 @@
condition: '{{ .Values.kubelet.enabled }}'
name: kubelet.rules
rules:
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.99'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.9'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.5'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile

View File

@@ -0,0 +1,257 @@
condition: '{{ true }}'
name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
annotations:
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping'
summary: 'Pod is crash looping.'
condition: '{{ true }}'
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) >= 1
for: 15m
labels:
severity: warning
- alert: KubePodNotReady
annotations:
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready'
summary: 'Pod has been in a non-ready state for more than 15 minutes.'
condition: '{{ true }}'
expr: |-
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}", phase=~"Pending|Unknown|Failed"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(owner_kind) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (
1, max by (namespace,pod,owner_kind,{{ .Values.global.clusterLabel }}) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
labels:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch'
summary: 'Deployment generation mismatch due to possible roll-back'
condition: '{{ true }}'
expr: |-
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
for: 15m
labels:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch'
summary: 'Deployment has not matched the expected number of replicas.'
condition: '{{ true }}'
expr: |-
(
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
>
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
summary: 'Deployment rollout is not progressing.'
condition: '{{ true }}'
expr: |-
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!= 0
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
summary: 'StatefulSet has not matched the expected number of replicas.'
condition: '{{ true }}'
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
summary: 'StatefulSet generation mismatch due to possible roll-back'
condition: '{{ true }}'
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
summary: 'StatefulSet update has not been rolled out.'
condition: '{{ true }}'
expr: |-
(
max by (namespace,statefulset,{{ .Values.global.clusterLabel }}) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck'
summary: 'DaemonSet rollout is stuck.'
condition: '{{ true }}'
expr: |-
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
0
) or (
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
)
) and (
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting'
summary: 'Pod container waiting longer than 1 hour'
condition: '{{ true }}'
expr: sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) > 0
for: 1h
labels:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled'
summary: 'DaemonSet pods are not scheduled.'
condition: '{{ true }}'
expr: |-
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled'
summary: 'DaemonSet pods are misscheduled.'
condition: '{{ true }}'
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
for: 15m
labels:
severity: warning
- alert: KubeJobNotCompleted
annotations:
description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted'
summary: 'Job did not complete in time'
condition: '{{ true }}'
expr: |-
time() - max by (namespace,job_name,{{ .Values.global.clusterLabel }}) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0) > 43200
labels:
severity: warning
- alert: KubeJobFailed
annotations:
description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed'
summary: 'Job failed to complete.'
condition: '{{ true }}'
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
for: 15m
labels:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch'
summary: 'HPA has not matched desired number of replicas.'
condition: '{{ true }}'
expr: |-
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
>
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
<
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
and
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[15m]) == 0
for: 15m
labels:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout'
summary: 'HPA is running at max replicas'
condition: '{{ true }}'
expr: |-
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
==
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
for: 15m
labels:
severity: warning
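A note on the double templating in these rule files: the backtick sequences {{`{{`}} and {{`}}`}} are Helm escapes that emit literal {{ and }}, so the annotation placeholders survive helm template and are expanded later by vmalert, while plain expressions such as {{ .Values.global.clusterLabel }} and {{ .targetNamespace }} are resolved by Helm. As a rough sketch only, assuming .targetNamespace renders to ".*" (an assumption, not a value taken from this commit; runbook_url and the per-rule condition flag are omitted for brevity), the first rule of this group would come out roughly as:

# Illustrative rendering, not generated output
- alert: KubePodCrashLooping
  annotations:
    description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").'
    summary: 'Pod is crash looping.'
  expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
  for: 15m
  labels:
    severity: warning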

View File

@@ -0,0 +1,113 @@
condition: '{{ true }}'
name: kubernetes-resources
rules:
- alert: KubeCPUOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit'
summary: 'Cluster has overcommitted CPU resource requests.'
condition: '{{ true }}'
expr: |-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0
and
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0
for: 10m
labels:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit'
summary: 'Cluster has overcommitted memory resource requests.'
condition: '{{ true }}'
expr: |-
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0
and
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0
for: 10m
labels:
severity: warning
- alert: KubeCPUQuotaOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit'
summary: 'Cluster has overcommitted CPU resource requests.'
condition: '{{ true }}'
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ .Values.global.clusterLabel }})
/
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeMemoryQuotaOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Namespaces.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit'
summary: 'Cluster has overcommitted memory resource requests.'
condition: '{{ true }}'
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ .Values.global.clusterLabel }})
/
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull'
summary: 'Namespace quota is going to be full.'
condition: '{{ true }}'
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
for: 15m
labels:
severity: informational
- alert: KubeQuotaFullyUsed
annotations:
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused'
summary: 'Namespace quota is fully used.'
condition: '{{ true }}'
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
for: 15m
labels:
severity: informational
- alert: KubeQuotaExceeded
annotations:
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded'
summary: 'Namespace quota has exceeded the limits.'
condition: '{{ true }}'
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
for: 15m
labels:
severity: warning
- alert: CPUThrottlingHigh
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh'
summary: 'Processes experience elevated CPU throttling.'
condition: '{{ true }}'
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }})
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }})
> ( 25 / 100 )
for: 15m
labels:
severity: informational
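The two overcommit alerts at the top of this group share one idea: compare the sum of Pod resource requests against the capacity that would remain if the single largest node were lost. A worked illustration with hypothetical numbers:

# Hypothetical cluster: 3 nodes x 8 allocatable cores, 22 cores requested in total
#   capacity without the largest node: 24 - 8 = 16 cores
#   requests minus remaining capacity: 22 - 16 = 6 > 0, so KubeCPUOvercommit fires
#   (KubeMemoryOvercommit applies the same check to memory bytes)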

View File

@@ -0,0 +1,101 @@
condition: '{{ true }}'
name: kubernetes-storage
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup'
summary: 'PersistentVolume is filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup'
summary: 'PersistentVolume is filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup'
summary: 'PersistentVolumeInodes are filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup'
summary: 'PersistentVolumeInodes are filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
description: 'The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors'
summary: 'PersistentVolume is having issues with provisioning.'
condition: '{{ true }}'
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m
labels:
severity: critical
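Both filling-up alert pairs above combine a usage threshold with predict_linear, which fits a linear trend to the last 6 hours of samples and extrapolates it forward (4 days here); a negative prediction means free bytes or free inodes would reach zero within that horizon. A minimal standalone query of the same shape, shown purely as an illustration (the namespace and PVC names are hypothetical):

# Free bytes of one PVC predicted 4 days ahead from the 6h trend
predict_linear(
  kubelet_volume_stats_available_bytes{job="kubelet", namespace="tenant-example", persistentvolumeclaim="data-alerta-db-1"}[6h],
  4 * 24 * 3600
) < 0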

View File

@@ -0,0 +1,62 @@
condition: '{{ true }}'
name: kubernetes-system-apiserver
rules:
- alert: KubeClientCertificateExpiration
annotations:
description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
summary: 'Client certificate is about to expire.'
condition: '{{ true }}'
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
for: 5m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
summary: 'Client certificate is about to expire.'
condition: '{{ true }}'
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
for: 5m
labels:
severity: critical
- alert: KubeAggregatedAPIErrors
annotations:
description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors'
summary: 'Kubernetes aggregated API has reported errors.'
condition: '{{ true }}'
expr: sum by (name,namespace,{{ .Values.global.clusterLabel }})(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
annotations:
description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown'
summary: 'Kubernetes aggregated API is down.'
condition: '{{ true }}'
expr: (1 - max by (name,namespace,{{ .Values.global.clusterLabel }})(avg_over_time(aggregator_unavailable_apiservice{job="kube-apiserver"}[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
- alert: KubeAPIDown
annotations:
description: 'KubeAPI has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubeApiServer.enabled }}'
expr: absent(up{job="kube-apiserver"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
description: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests'
summary: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
condition: '{{ true }}'
expr: sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
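Both KubeClientCertificateExpiration rules above key off histogram_quantile(0.01, ...), i.e. roughly the shortest-lived 1% of client certificates observed by the apiserver; only the remaining-lifetime threshold differs:

#   604800 s = 7 * 24 * 3600 -> warning when less than 7 days remain
#    86400 s = 24 * 3600     -> critical when less than 24 hours remain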

View File

@@ -0,0 +1,13 @@
condition: '{{ .Values.kubeControllerManager.enabled }}'
name: kubernetes-system-controller-manager
rules:
- alert: KubeControllerManagerDown
annotations:
description: 'KubeControllerManager has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubeControllerManager.enabled }}'
expr: absent(up{job="kube-controller-manager"} == 1)
for: 15m
labels:
severity: critical

View File

@@ -0,0 +1,136 @@
condition: '{{ true }}'
name: kubernetes-system-kubelet
rules:
- alert: KubeNodeNotReady
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready'
summary: 'Node is not ready.'
condition: '{{ true }}'
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m
labels:
severity: warning
- alert: KubeNodeUnreachable
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable'
summary: 'Node is unreachable.'
condition: '{{ true }}'
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
description: 'Kubelet ''{{`{{`}} $labels.node {{`}}`}}'' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods'
summary: 'Kubelet is running at capacity.'
condition: '{{ true }}'
expr: |-
count by (node,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,{{ .Values.global.clusterLabel }}) group_left(node) topk by (instance,pod,namespace,{{ .Values.global.clusterLabel }}) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by (node,{{ .Values.global.clusterLabel }}) (
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
) > 0.95
for: 15m
labels:
severity: informational
- alert: KubeNodeReadinessFlapping
annotations:
description: 'The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping'
summary: 'Node readiness status is flapping.'
condition: '{{ true }}'
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (node,{{ .Values.global.clusterLabel }}) > 2
for: 15m
labels:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh'
summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.'
condition: '{{ true }}'
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: 'Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh'
summary: 'Kubelet Pod startup latency is too high.'
condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }})) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
summary: 'Kubelet client certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
summary: 'Kubelet client certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
summary: 'Kubelet server certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
summary: 'Kubelet server certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors'
summary: 'Kubelet has failed to renew its client certificate.'
condition: '{{ true }}'
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors'
summary: 'Kubelet has failed to renew its server certificate.'
condition: '{{ true }}'
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletDown
annotations:
description: 'Kubelet has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubelet.enabled }}'
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical

View File

@@ -0,0 +1,13 @@
condition: '{{ .Values.kubeScheduler.enabled }}'
name: kubernetes-system-scheduler
rules:
- alert: KubeSchedulerDown
annotations:
description: 'KubeScheduler has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubeScheduler.enabled }}'
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
severity: critical

View File

@@ -0,0 +1,27 @@
condition: '{{ true }}'
name: kubernetes-system
rules:
- alert: KubeVersionMismatch
annotations:
description: 'There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch'
summary: 'Different semantic versions of Kubernetes components running.'
condition: '{{ true }}'
expr: count by ({{ .Values.global.clusterLabel }}) (count by (git_version,{{ .Values.global.clusterLabel }}) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
        description: 'Kubernetes API server client ''{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}'' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors'
summary: 'Kubernetes API server client is experiencing errors.'
condition: '{{ true }}'
expr: |-
(sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }})
/
sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }}))
> 0.01
for: 15m
labels:
severity: warning

View File

@@ -0,0 +1,76 @@
condition: '{{ true }}'
name: node-exporter.rules
rules:
- condition: '{{ true }}'
expr: |-
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
record: instance:node_num_cpu:sum
- condition: '{{ true }}'
expr: |-
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
record: instance:node_cpu_utilisation:rate5m
- condition: '{{ true }}'
expr: |-
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- condition: '{{ true }}'
expr: |-
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- condition: '{{ true }}'
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- condition: '{{ true }}'
expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- condition: '{{ true }}'
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate5m
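These recordings pre-aggregate raw node-exporter counters so dashboards and alerts can read a single cheap series per node instead of recomputing rates. For example (illustrative queries, not part of this chart), a Grafana panel could plot:

# CPU busy ratio per instance, averaged across cores over 5m windows
instance:node_cpu_utilisation:rate5m{job="node-exporter"}
# Memory utilisation ratio per instance
instance:node_memory_utilisation:ratio{job="node-exporter"}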

View File

@@ -0,0 +1,336 @@
condition: '{{ true }}'
name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup'
summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup'
summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace'
summary: 'Filesystem has less than 5% space left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace'
summary: 'Filesystem has less than 3% space left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup'
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup'
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles'
summary: 'Filesystem has less than 5% inodes left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles'
summary: 'Filesystem has less than 3% inodes left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs'
summary: 'Network interface is reporting many receive errors.'
condition: '{{ true }}'
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs'
summary: 'Network interface is reporting many transmit errors.'
condition: '{{ true }}'
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused'
summary: 'Number of conntrack are getting close to the limit.'
condition: '{{ true }}'
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: 'Node Exporter text file collector on {{`{{`}} $labels.instance {{`}}`}} failed to scrape.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror'
summary: 'Node Exporter text file collector failed to scrape.'
condition: '{{ true }}'
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected'
summary: 'Clock skew detected.'
condition: '{{ true }}'
expr: |-
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
node_timex_offset_seconds{job="node-exporter"} < -0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising'
summary: 'Clock not synchronising.'
condition: '{{ true }}'
expr: |-
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: 'RAID array ''{{`{{`}} $labels.device {{`}}`}}'' at {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded'
summary: 'RAID Array is degraded.'
condition: '{{ true }}'
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: 'At least one device in RAID array at {{`{{`}} $labels.instance {{`}}`}} failed. Array ''{{`{{`}} $labels.device {{`}}`}}'' needs attention and possibly a disk swap.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure'
summary: 'Failed device in RAID array.'
condition: '{{ true }}'
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit'
summary: 'Kernel is predicted to exhaust file descriptors limit soon.'
condition: '{{ true }}'
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
)
for: 15m
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit'
summary: 'Kernel is predicted to exhaust file descriptors limit soon.'
condition: '{{ true }}'
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
)
for: 15m
labels:
severity: critical
- alert: NodeCPUHighUsage
annotations:
        description: 'CPU usage at {{`{{`}} $labels.instance {{`}}`}} has been above 90% for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodecpuhighusage'
summary: 'High CPU usage.'
condition: '{{ true }}'
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
for: 15m
labels:
severity: informational
- alert: NodeSystemSaturation
annotations:
        description: 'System load per core at {{`{{`}} $labels.instance {{`}}`}} has been above 2 for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.
          This might indicate resource saturation on this instance and can cause it to become unresponsive.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemsaturation'
summary: 'System saturated, load per core is very high.'
condition: '{{ true }}'
expr: |-
node_load1{job="node-exporter"}
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
for: 15m
labels:
severity: warning
- alert: NodeMemoryMajorPagesFaults
annotations:
        description: 'Memory major page faults are occurring at a very high rate at {{`{{`}} $labels.instance {{`}}`}}: above 500 per second for the last 15 minutes, currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.
Please check that there is enough memory available at this instance.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememorymajorpagesfaults'
summary: 'Memory major page faults are occurring at very high rate.'
condition: '{{ true }}'
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
for: 15m
labels:
severity: warning
- alert: NodeMemoryHighUtilization
annotations:
        description: 'Memory is filling up at {{`{{`}} $labels.instance {{`}}`}}: usage has been above 90% for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememoryhighutilization'
summary: 'Host is running out of memory.'
condition: '{{ true }}'
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
for: 15m
labels:
severity: warning
- alert: NodeDiskIOSaturation
annotations:
        description: 'Disk IO queue (aqu-sz) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}: it has been above 10 for the last 30 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.
This symptom might indicate disk saturation.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodediskiosaturation'
summary: 'Disk IO queue is high.'
condition: '{{ true }}'
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
for: 30m
labels:
severity: warning
- alert: NodeSystemdServiceFailed
annotations:
description: 'Systemd service {{`{{`}} $labels.name {{`}}`}} has entered failed state at {{`{{`}} $labels.instance {{`}}`}}'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemdservicefailed'
summary: 'Systemd service has entered failed state.'
condition: '{{ true }}'
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
for: 5m
labels:
severity: warning
- alert: NodeBondingDegraded
annotations:
description: 'Bonding interface {{`{{`}} $labels.master {{`}}`}} on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more slave failures.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodebondingdegraded'
summary: 'Bonding interface is degraded'
condition: '{{ true }}'
expr: (node_bonding_slaves - node_bonding_active) != 0
for: 5m
labels:
severity: warning

View File

@@ -0,0 +1,13 @@
condition: '{{ true }}'
name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: 'Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping'
summary: 'Network interface is often changing its status'
condition: '{{ true }}'
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning

View File

@@ -0,0 +1,44 @@
condition: '{{ true }}'
name: node.rules
rules:
- condition: '{{ true }}'
expr: |-
topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (node,namespace,pod,{{ .Values.global.clusterLabel }}) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:'
- condition: '{{ true }}'
expr: |-
count by (node,{{ .Values.global.clusterLabel }}) (
node_cpu_seconds_total{mode="idle",job="node-exporter"}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node)
topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, node_namespace_pod:kube_pod_info:)
)
record: node:node_num_cpu:sum
- condition: '{{ true }}'
expr: |-
sum(
node_memory_MemAvailable_bytes{job="node-exporter"} or
(
node_memory_Buffers_bytes{job="node-exporter"} +
node_memory_Cached_bytes{job="node-exporter"} +
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
) by ({{ .Values.global.clusterLabel }})
record: :node_memory_MemAvailable_bytes:sum
- condition: '{{ true }}'
expr: |-
avg by (node,{{ .Values.global.clusterLabel }}) (
sum without (mode) (
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
)
)
record: node:node_cpu_utilization:ratio_rate5m
- condition: '{{ true }}'
expr: |-
avg by ({{ .Values.global.clusterLabel }}) (
node:node_cpu_utilization:ratio_rate5m
)
record: cluster:node_cpu:ratio_rate5m

View File

@@ -0,0 +1,458 @@
{{- /* Expand the name of the chart. */ -}}
{{- define "victoria-metrics-k8s-stack.name" -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- default $Chart.Name $Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end }}
{{- /*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/ -}}
{{- define "victoria-metrics-k8s-stack.fullname" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $fullname := "" -}}
{{- if .appKey -}}
{{- $appKey := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}}
{{- $values := $Values -}}
{{- $global := (index $Values.global $Chart.Name) | default dict -}}
{{- range $ak := $appKey }}
{{- $values = (index $values $ak) | default dict -}}
{{- $global = (index $global $ak) | default dict -}}
{{- if $values.name -}}
{{- $fullname = $values.name -}}
{{- else if $global.name -}}
{{- $fullname = $global.name -}}
{{- end -}}
{{- end }}
{{- end -}}
{{- if empty $fullname -}}
{{- if $Values.fullnameOverride -}}
{{- $fullname = $Values.fullnameOverride -}}
{{- else if (dig $Chart.Name "fullnameOverride" "" ($Values.global)) -}}
{{- $fullname = (dig $Chart.Name "fullnameOverride" "" ($Values.global)) -}}
{{- else -}}
{{- $name := default $Chart.Name $Values.nameOverride -}}
{{- if contains $name $Release.Name -}}
{{- $fullname = $Release.Name -}}
{{- else -}}
{{- $fullname = (printf "%s-%s" $Release.Name $name) }}
{{- end -}}
{{- end }}
{{- end -}}
{{- $fullname | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- /* Create chart name and version as used by the chart label. */ -}}
{{- define "victoria-metrics-k8s-stack.chart" -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- printf "%s-%s" $Chart.Name $Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end }}
{{- /* Create the name of the service account to use */ -}}
{{- define "victoria-metrics-k8s-stack.serviceAccountName" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- if $Values.serviceAccount.create -}}
{{- default (include "victoria-metrics-k8s-stack.fullname" .) $Values.serviceAccount.name -}}
{{- else -}}
{{- default "default" $Values.serviceAccount.name -}}
{{- end }}
{{- end }}
{{- /* Common labels */ -}}
{{- define "victoria-metrics-k8s-stack.labels" -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- $labels := (fromYaml (include "victoria-metrics-k8s-stack.selectorLabels" .)) -}}
{{- $_ := set $labels "helm.sh/chart" (include "victoria-metrics-k8s-stack.chart" .) -}}
{{- $_ := set $labels "app.kubernetes.io/managed-by" $Release.Service -}}
{{- with $Chart.AppVersion }}
{{- $_ := set $labels "app.kubernetes.io/version" . -}}
{{- end -}}
{{- toYaml $labels -}}
{{- end }}
{{- define "vm.release" -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- default $Release.Name $Values.argocdReleaseOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- /* Selector labels */ -}}
{{- define "victoria-metrics-k8s-stack.selectorLabels" -}}
{{- $labels := .extraLabels | default dict -}}
{{- $_ := set $labels "app.kubernetes.io/name" (include "victoria-metrics-k8s-stack.name" .) -}}
{{- $_ := set $labels "app.kubernetes.io/instance" (include "vm.release" .) -}}
{{- toYaml $labels -}}
{{- end }}
{{- /* Create the name for VM service */ -}}
{{- define "vm.service" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $name := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- with .appKey -}}
{{- $prefix := . -}}
{{- if kindIs "slice" $prefix }}
{{- $prefix = last $prefix -}}
{{- end -}}
{{- $prefix = ternary $prefix (printf "vm%s" $prefix) (hasPrefix "vm" $prefix) -}}
{{- $name = printf "%s-%s" $prefix $name -}}
{{- end -}}
{{- if hasKey . "appIdx" -}}
{{- $name = (printf "%s-%d.%s" $name .appIdx $name) -}}
{{- end -}}
{{- $name -}}
{{- end }}
{{- define "vm.url" -}}
{{- $name := (include "vm.service" .) -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $ns := include "vm.namespace" . -}}
{{- $proto := "http" -}}
{{- $port := 80 -}}
{{- $path := .appRoute | default "/" -}}
{{- $isSecure := false -}}
{{- if .appSecure -}}
{{- $isSecure = .appSecure -}}
{{- end -}}
{{- if .appKey -}}
{{- $appKey := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}}
{{- $spec := $Values -}}
{{- range $ak := $appKey -}}
{{- if hasKey $spec $ak -}}
{{- $spec = (index $spec $ak) -}}
{{- end -}}
{{- if hasKey $spec "spec" -}}
{{- $spec = $spec.spec -}}
{{- end -}}
{{- end -}}
{{- $isSecure = (eq ($spec.extraArgs).tls "true") | default $isSecure -}}
{{- $proto = (ternary "https" "http" $isSecure) -}}
{{- $port = (ternary 443 80 $isSecure) -}}
{{- $port = $spec.port | default $port -}}
{{- $path = dig "http.pathPrefix" $path ($spec.extraArgs | default dict) -}}
{{- end -}}
{{- printf "%s://%s.%s.svc:%d%s" $proto $name $ns (int $port) $path -}}
{{- end -}}
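{{- /*
Illustrative note (assumption, not rendered by the chart): given default values,
a call such as include "vm.url" (dict "helm" . "appKey" "vmsingle") resolves the
service name via "vm.service" and, assuming vmsingle.spec.port is 8429 and TLS
is disabled, yields roughly http://vmsingle-<fullname>.<namespace>.svc:8429/.
*/ -}}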
{{- define "vm.read.endpoint" -}}
{{- $ctx := . -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $endpoint := default dict -}}
{{- if $Values.vmsingle.enabled -}}
{{- $_ := set $ctx "appKey" "vmsingle" -}}
{{- $_ := set $endpoint "url" (include "vm.url" $ctx) -}}
{{- else if $Values.vmcluster.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vmselect") -}}
{{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}}
{{- $tenant := ($Values.tenant | default 0) -}}
{{- $_ := set $endpoint "url" (printf "%s/select/%d/prometheus" $baseURL (int $tenant)) -}}
{{- else if $Values.externalVM.read.url -}}
{{- $endpoint = $Values.externalVM.read -}}
{{- end -}}
{{- toYaml $endpoint -}}
{{- end }}
{{- define "vm.write.endpoint" -}}
{{- $ctx := . -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $endpoint := default dict -}}
{{- if $Values.vmsingle.enabled -}}
{{- $_ := set $ctx "appKey" "vmsingle" -}}
{{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}}
{{- $_ := set $endpoint "url" (printf "%s/api/v1/write" $baseURL) -}}
{{- else if $Values.vmcluster.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vminsert") -}}
{{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}}
{{- $tenant := ($Values.tenant | default 0) -}}
{{- $_ := set $endpoint "url" (printf "%s/insert/%d/prometheus/api/v1/write" $baseURL (int $tenant)) -}}
{{- else if $Values.externalVM.write.url -}}
{{- $endpoint = $Values.externalVM.write -}}
{{- end -}}
{{- toYaml $endpoint -}}
{{- end -}}
{{- /* VMAlert remotes */ -}}
{{- define "vm.alert.remotes" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $remotes := default dict -}}
{{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- $ctx := dict "helm" . -}}
{{- $remoteWrite := (include "vm.write.endpoint" $ctx | fromYaml) -}}
{{- if $Values.vmalert.remoteWriteVMAgent -}}
{{- $ctx := dict "helm" . "appKey" "vmagent" -}}
{{- $remoteWrite = dict "url" (printf "%s/api/v1/write" (include "vm.url" $ctx)) -}}
{{- end -}}
{{- $ctx := dict "helm" . -}}
{{- $remoteRead := (fromYaml (include "vm.read.endpoint" $ctx)) -}}
{{- $_ := set $remotes "remoteWrite" $remoteWrite -}}
{{- $_ := set $remotes "remoteRead" $remoteRead -}}
{{- $_ := set $remotes "datasource" $remoteRead -}}
{{- if $Values.vmalert.additionalNotifierConfigs }}
{{- $configName := printf "%s-vmalert-additional-notifier" $fullname -}}
{{- $notifierConfigRef := dict "name" $configName "key" "notifier-configs.yaml" -}}
{{- $_ := set $remotes "notifierConfigRef" $notifierConfigRef -}}
{{- else if $Values.alertmanager.enabled -}}
{{- $notifiers := default list -}}
{{- $appSecure := (not (empty (((.Values.alertmanager).spec).webConfig).tls_server_config)) -}}
{{- $ctx := dict "helm" . "appKey" "alertmanager" "appSecure" $appSecure "appRoute" ((.Values.alertmanager).spec).routePrefix -}}
{{- $alertManagerReplicas := (.Values.alertmanager.spec.replicaCount | default 1 | int) -}}
{{- range until $alertManagerReplicas -}}
{{- $_ := set $ctx "appIdx" . -}}
{{- $notifiers = append $notifiers (dict "url" (include "vm.url" $ctx)) -}}
{{- end }}
{{- $_ := set $remotes "notifiers" $notifiers -}}
{{- end -}}
{{- toYaml $remotes -}}
{{- end -}}
{{- /* VMAlert templates */ -}}
{{- define "vm.alert.templates" -}}
{{- $Values := (.helm).Values | default .Values}}
{{- $cms := ($Values.vmalert.spec.configMaps | default list) -}}
{{- if $Values.vmalert.templateFiles -}}
{{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- $cms = append $cms (printf "%s-vmalert-extra-tpl" $fullname) -}}
{{- end -}}
{{- $output := dict "configMaps" (compact $cms) -}}
{{- toYaml $output -}}
{{- end -}}
{{- define "vm.license.global" -}}
{{- $license := (deepCopy (.Values.global).license) | default dict -}}
{{- if $license.key -}}
{{- if hasKey $license "keyRef" -}}
{{- $_ := unset $license "keyRef" -}}
{{- end -}}
{{- else if $license.keyRef.name -}}
{{- if hasKey $license "key" -}}
{{- $_ := unset $license "key" -}}
{{- end -}}
{{- else -}}
{{- $license = default dict -}}
{{- end -}}
{{- toYaml $license -}}
{{- end -}}
{{- /* VMAlert spec */ -}}
{{- define "vm.alert.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $extraArgs := dict "remoteWrite.disablePathAppend" "true" -}}
{{- if $Values.vmalert.templateFiles -}}
{{- $ruleTmpl := (printf "/etc/vm/configs/%s-vmalert-extra-tpl/*.tmpl" (include "victoria-metrics-k8s-stack.fullname" .)) -}}
{{- $_ := set $extraArgs "rule.templates" $ruleTmpl -}}
{{- end -}}
{{- $vmAlertRemotes := (include "vm.alert.remotes" . | fromYaml) -}}
{{- $vmAlertTemplates := (include "vm.alert.templates" . | fromYaml) -}}
{{- $spec := dict "extraArgs" $extraArgs -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- with concat ($vmAlertRemotes.notifiers | default list) (.Values.vmalert.spec.notifiers | default list) }}
{{- $_ := set $vmAlertRemotes "notifiers" . }}
{{- end }}
{{- $spec := deepCopy (omit $Values.vmalert.spec "notifiers") | mergeOverwrite $vmAlertRemotes | mergeOverwrite $vmAlertTemplates | mergeOverwrite $spec }}
{{- if not (or (hasKey $spec "notifier") (hasKey $spec "notifiers") (hasKey $spec "notifierConfigRef") (hasKey $spec.extraArgs "notifier.blackhole")) }}
{{- fail "Neither `notifier`, `notifiers` nor `notifierConfigRef` is set for vmalert. If it's intentionally please consider setting `.vmalert.spec.extraArgs.['notifier.blackhole']` to `'true'`"}}
{{- end }}
{{- tpl (deepCopy (omit $Values.vmalert.spec "notifiers") | mergeOverwrite $vmAlertRemotes | mergeOverwrite $vmAlertTemplates | mergeOverwrite $spec | toYaml) . -}}
{{- end }}
{{- /* VM Agent remoteWrites */ -}}
{{- define "vm.agent.remote.write" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $remoteWrites := $Values.vmagent.additionalRemoteWrites | default list -}}
{{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled $Values.externalVM.write.url -}}
{{- $ctx := dict "helm" . -}}
{{- $remoteWrites = append $remoteWrites (fromYaml (include "vm.write.endpoint" $ctx)) -}}
{{- end -}}
{{- toYaml (dict "remoteWrite" $remoteWrites) -}}
{{- end -}}
{{- /* VMAgent spec */ -}}
{{- define "vm.agent.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $spec := (include "vm.agent.remote.write" . | fromYaml) -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl (deepCopy $Values.vmagent.spec | mergeOverwrite $spec | toYaml) . -}}
{{- end }}
{{- /* VMAuth spec */ -}}
{{- define "vm.auth.spec" -}}
{{- $ctx := . -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $unauthorizedAccessConfig := default list }}
{{- if $Values.vmsingle.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmsingle") -}}
{{- $url := (include "vm.url" $ctx) }}
{{- $srcPath := clean (printf "%s/.*" (urlParse $url).path) }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list $url)) }}
{{- else if $Values.vmcluster.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vminsert") -}}
{{- $writeUrl := (include "vm.url" $ctx) }}
{{- $writeSrcPath := clean (printf "%s/insert/.*" (urlParse $writeUrl).path) }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $writeSrcPath) "url_prefix" (list $writeUrl)) }}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vmselect") -}}
{{- $readUrl := (include "vm.url" $ctx) }}
{{- $readSrcPath := clean (printf "%s/select/.*" (urlParse $readUrl).path) }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $readSrcPath) "url_prefix" (list $readUrl)) }}
{{- else if or $Values.externalVM.read.url $Values.externalVM.write.url }}
{{- with $Values.externalVM.read.url }}
{{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) }}
{{- end -}}
{{- with $Values.externalVM.write.url }}
{{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) }}
{{- end -}}
{{- end -}}
{{- $spec := $Values.vmauth.spec }}
{{- $_ := set $spec "unauthorizedAccessConfig" (concat $unauthorizedAccessConfig ($spec.unauthorizedAccessConfig | default list)) }}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl (toYaml $spec) . -}}
{{- end -}}
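{{- /*
Sketch of the resulting routing (an assumption drawn from the logic above): with
vmcluster enabled, VMAuth proxies /insert/.* to vminsert and /select/.* to
vmselect; with vmsingle enabled, everything under the vmsingle path prefix goes
to the single-node instance; external read/write URLs are each mapped to their
own path prefix derived from the URL path.
*/ -}}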
{{- /* Alertmanager spec */ -}}
{{- define "vm.alertmanager.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- $spec := $Values.alertmanager.spec -}}
{{- if and (not $Values.alertmanager.spec.configRawYaml) (not $Values.alertmanager.spec.configSecret) -}}
{{- $_ := set $spec "configSecret" (printf "%s-alertmanager" $fullname) -}}
{{- end -}}
{{- $templates := default list -}}
{{- if $Values.alertmanager.monzoTemplate.enabled -}}
{{- $configMap := (printf "%s-alertmanager-monzo-tpl" $fullname) -}}
{{- $templates = append $templates (dict "name" $configMap "key" "monzo.tmpl") -}}
{{- end -}}
{{- $configMap := (printf "%s-alertmanager-extra-tpl" $fullname) -}}
{{- range $key, $value := (.Values.alertmanager.templateFiles | default dict) -}}
{{- $templates = append $templates (dict "name" $configMap "key" $key) -}}
{{- end -}}
{{- $_ := set $spec "templates" $templates -}}
{{- toYaml $spec -}}
{{- end -}}
{{- /* Single spec */ -}}
{{- define "vm.single.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $extraArgs := default dict -}}
{{- if $Values.vmalert.enabled }}
{{- $ctx := dict "helm" . "appKey" "vmalert" -}}
{{- $_ := set $extraArgs "vmalert.proxyURL" (include "vm.url" $ctx) -}}
{{- end -}}
{{- $spec := dict "extraArgs" $extraArgs -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl (deepCopy $Values.vmsingle.spec | mergeOverwrite $spec | toYaml) . -}}
{{- end }}
{{- /* Cluster spec */ -}}
{{- define "vm.select.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $extraArgs := default dict -}}
{{- if $Values.vmalert.enabled -}}
{{- $ctx := dict "helm" . "appKey" "vmalert" -}}
{{- $_ := set $extraArgs "vmalert.proxyURL" (include "vm.url" $ctx) -}}
{{- end -}}
{{- $spec := dict "extraArgs" $extraArgs -}}
{{- toYaml $spec -}}
{{- end -}}
{{- define "vm.cluster.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $spec := (include "vm.select.spec" . | fromYaml) -}}
{{- $clusterSpec := (deepCopy $Values.vmcluster.spec) -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $clusterSpec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl ($clusterSpec | mergeOverwrite (dict "vmselect" $spec) | toYaml) . -}}
{{- end -}}
{{- define "vm.data.source.enabled" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $grafana := $Values.grafana -}}
{{- $isEnabled := false -}}
{{- if $grafana.plugins -}}
{{- range $value := $grafana.plugins -}}
{{- if contains "victoriametrics-datasource" $value -}}
{{- $isEnabled = true -}}
{{- end }}
{{- end }}
{{- end }}
{{- $unsignedPlugins := ((index $grafana "grafana.ini").plugins).allow_loading_unsigned_plugins | default "" -}}
{{- $allowUnsigned := contains "victoriametrics-datasource" $unsignedPlugins -}}
{{- ternary "true" "" (and $isEnabled $allowUnsigned) -}}
{{- end -}}
{{- /* Datasources */ -}}
{{- define "vm.data.sources" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $grafana := $Values.grafana -}}
{{- $datasources := $Values.grafana.additionalDataSources | default list -}}
{{- $vmDatasource := "victoriametrics-datasource" -}}
{{- $allowVMDatasource := (ternary false true (empty (include "vm.data.source.enabled" .))) -}}
{{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled -}}
{{- $ctx := dict "helm" . -}}
{{- $readEndpoint:= (include "vm.read.endpoint" $ctx | fromYaml) -}}
{{- $defaultDatasources := default list -}}
{{- range $ds := $grafana.sidecar.datasources.default }}
{{- if not $ds.type -}}
{{- $_ := set $ds "type" $Values.grafana.defaultDatasourceType }}
{{- end -}}
{{- if or (ne $ds.type $vmDatasource) $allowVMDatasource -}}
{{- $_ := set $ds "url" $readEndpoint.url -}}
{{- $defaultDatasources = append $defaultDatasources $ds -}}
{{- end -}}
{{- end }}
{{- $datasources = concat $datasources $defaultDatasources -}}
{{- if and $grafana.sidecar.datasources.createVMReplicasDatasources $defaultDatasources -}}
{{- range $id := until (int $Values.vmsingle.spec.replicaCount) -}}
{{- $_ := set $ctx "appIdx" $id -}}
{{- $readEndpoint := (include "vm.read.endpoint" $ctx | fromYaml) -}}
{{- range $ds := $defaultDatasources -}}
{{- $ds = (deepCopy $ds) -}}
{{- $_ := set $ds "url" $readEndpoint.url -}}
{{- $_ := set $ds "name" (printf "%s-%d" $ds.name $id) -}}
{{- $_ := set $ds "isDefault" false -}}
{{- $datasources = append $datasources $ds -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{- toYaml $datasources -}}
{{- end }}
{{- /* VMRule name */ -}}
{{- define "victoria-metrics-k8s-stack.rulegroup.name" -}}
{{- printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) (.name | replace "_" "") -}}
{{- end -}}
{{- /* VMRule labels */ -}}
{{- define "victoria-metrics-k8s-stack.rulegroup.labels" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $labels := (fromYaml (include "victoria-metrics-k8s-stack.labels" .)) -}}
{{- $_ := set $labels "app" (include "victoria-metrics-k8s-stack.name" .) -}}
{{- $labels = mergeOverwrite $labels (deepCopy $Values.defaultRules.labels) -}}
{{- toYaml $labels -}}
{{- end }}
{{- /* VMRule key */ -}}
{{- define "victoria-metrics-k8s-stack.rulegroup.key" -}}
{{- without (regexSplit "[-_.]" .name -1) "exporter" "rules" | join "-" | camelcase | untitle -}}
{{- end -}}
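{{- /*
Illustration (assumption, not rendered): the "node-network" rules file maps to
the key "nodeNetwork" and "node.rules" maps to "node"; these keys are what
.Values.defaultRules.groups is indexed by.
*/ -}}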
{{- /* VMAlertmanager name */ -}}
{{- define "victoria-metrics-k8s-stack.alertmanager.name" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $Values.alertmanager.name | default (printf "%s-%s" "vmalertmanager" (include "victoria-metrics-k8s-stack.fullname" .) | trunc 63 | trimSuffix "-") -}}
{{- end -}}

View File

@@ -0,0 +1,4 @@
{{ range .Values.extraObjects }}
---
{{ tpl (toYaml .) $ }}
{{ end }}
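
A minimal values sketch (hypothetical names) showing how `extraObjects` feeds this template: each item is rendered through `tpl`, so release metadata can be templated inline.

```yaml
extraObjects:
  - apiVersion: v1
    kind: ConfigMap
    metadata:
      name: '{{ .Release.Name }}-extra-config'   # rendered via tpl against the root context
    data:
      example: "true"
```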

View File

@@ -0,0 +1,121 @@
{{- if .Values.defaultRules.create }}
{{- /*
Default rules alias
*/}}
{{- $defaultRules := .Values.defaultRules -}}
{{- /*
Exact rules overrides
*/}}
{{- $exactRules := index $defaultRules "rules" | default dict }}
{{- /*
Create custom template context
*/}}
{{- $ctx := (dict "Values" (deepCopy .Values) "Release" (deepCopy .Release) "Chart" (deepCopy .Chart) "Template" (deepCopy .Template)) -}}
{{- /*
Loop through all rules files that were crafted with hack/sync_rules.py
*/}}
{{- range $groupFile, $_ := .Files.Glob "files/rules/**.yaml" -}}
{{- /*
Get group name from file
*/}}
{{- $groupBase := base $groupFile -}}
{{- $groupNameRaw := trimSuffix (ext $groupBase) $groupBase -}}
{{- /*
Create context for templating
*/}}
{{- $_ := set $ctx "name" $groupNameRaw -}}
{{- /*
Create sanitized group name retrieved from file
*/}}
{{- $groupName := include "victoria-metrics-k8s-stack.rulegroup.key" $ctx -}}
{{- /*
Merge the common group config with the group data for the current iteration
*/}}
{{- $group := mergeOverwrite (deepCopy (dig "group" (default dict) $defaultRules)) (dig "groups" $groupName (default dict) $defaultRules) -}}
{{- /*
Get group data from file
*/}}
{{- $groupCtx := mergeOverwrite (deepCopy $ctx) $group }}
{{- $groupData := fromYaml (tpl ($.Files.Get $groupFile) $groupCtx) -}}
{{- /*
Save rules spec from file
*/}}
{{- $rulesSpec := $groupData.rules -}}
{{- /*
Delete rules from group
*/}}
{{- $_ := unset $groupData "rules" -}}
{{- /*
Save condition for group from file
*/}}
{{- $groupCondition := (eq $groupData.condition "true") -}}
{{- /*
Delete condition from group
*/}}
{{- $_ := unset $groupData "condition" -}}
{{- /*
Merge group spec
*/}}
{{- $groupSpec := mergeOverwrite (deepCopy $groupData) (dig "spec" (default dict) $group) -}}
{{- /*
Filter out ignored rules
*/}}
{{- $commonRule := dig "rule" (default dict) $defaultRules }}
{{- $commonInGroupRule := dig "rules" (default dict) $group }}
{{- $filteredRulesSpec := default list }}
{{- range $_, $ruleSpec := $rulesSpec }}
{{- $ruleName := $ruleSpec.alert | default "" }}
{{- $ruleKey := (hasKey $ruleSpec "record" | ternary "recording" "alerting") -}}
{{- $ruleCondition := (eq $ruleSpec.condition "true") }}
{{- $_ := unset $ruleSpec "condition" }}
{{- $exactRule := index $exactRules $ruleName | default dict }}
{{- $defaultRule := deepCopy (index $defaultRules $ruleKey) }}
{{- $resultRule := mergeOverwrite (deepCopy $commonRule) $defaultRule $commonInGroupRule $exactRule }}
{{- if (and (dig "create" true $resultRule) $ruleCondition) }}
{{- $ruleSpec := mergeOverwrite (deepCopy $ruleSpec) (dig "spec" (default dict) $resultRule) }}
{{- $filteredRulesSpec = append $filteredRulesSpec $ruleSpec }}
{{- end }}
{{- end -}}
{{- $rulesSpec = $filteredRulesSpec }}
{{- /*
Check if group is enabled
*/}}
{{- if (and $rulesSpec (dig "create" true $group) $groupCondition) }}
---
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: {{ include "victoria-metrics-k8s-stack.rulegroup.name" $ctx }}
{{- with (include "victoria-metrics-k8s-stack.rulegroup.labels" $ctx) }}
labels: {{ . | nindent 4 }}
{{- end }}
{{- with $.Values.defaultRules.annotations }}
annotations: {{ toYaml . | nindent 4 }}
{{- end }}
spec:
groups:
- {{ toYaml $groupSpec | indent 4 | trim }}
rules:
{{- range $ruleSpec := $rulesSpec }}
- {{ toYaml $ruleSpec | indent 6 | trim }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
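
A hedged values sketch of how the merge logic above can be driven: groups are keyed by the sanitized file name and individual alerts by their `alert` name, following the `defaultRules.groups.<key>` / `defaultRules.rules.<alert>` structure read by the loop (the key and alert name below come from the rules files in this change).

```yaml
defaultRules:
  create: true
  groups:
    nodeNetwork:
      create: false          # drop the whole node-network group
  rules:
    NodeBondingDegraded:
      create: false          # drop a single alert
      # or override its spec instead:
      # spec:
      #   for: 15m
```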

View File

@@ -0,0 +1,26 @@
### plans
* [x] VMCluster
* [x] VMSingle
* [x] VMAgent
* [x] VMAlert
* [x] AlertManager
* [x] Annotations
* [x] ServiceScrapes
* [x] Nodeexporter
* [x] Grafana
* [x] kube-state-metrics
* [x] kube-mixin
* [x] core-dns
* [x] Grafana DS
* [x] Dashboards
* [x] Nodeexporter
* [x] kube-state-metrics
* [x] kube-mixin
* [x] Rules
* [x] kube-mixin
* [x] kube-prometheus
* [x] victoria-metrics
* [ ] ServiceAccount configuration
* [ ] SelectorOverride for ServiceScrapes
* [ ] helm hook to uninstall CRD objects before chart removal

View File

@@ -0,0 +1,38 @@
vmagent:
spec:
volumes:
- hostPath:
path: /var/lib/minikube/certs/etcd
type: DirectoryOrCreate
name: etcd-certs
volumeMounts:
- mountPath: /var/lib/minikube/certs/etcd
name: etcd-certs
kubeScheduler:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: http-metrics
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
kubeControllerManager:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: http-metrics
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
kubeEtcd:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: http-metrics
scheme: https
tlsConfig:
caFile: /var/lib/minikube/certs/etcd/ca.crt
certFile: /var/lib/minikube/certs/etcd/peer.crt
keyFile: /var/lib/minikube/certs/etcd/peer.key

File diff suppressed because it is too large

View File

@@ -21,6 +21,8 @@ spec:
relabelConfigs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- action: labeldrop
regex: '.*node_kubevirt_io.*'
- sourceLabels: [__metrics_path__]
targetLabel: metrics_path
- replacement: cadvisor
@@ -30,3 +32,4 @@ spec:
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true

View File

@@ -21,6 +21,8 @@ spec:
relabelConfigs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- action: labeldrop
regex: '.*node_kubevirt_io.*'
- sourceLabels: [__metrics_path__]
targetLabel: metrics_path
- replacement: kubelet
@@ -51,6 +53,8 @@ spec:
relabelConfigs:
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
- action: labeldrop
regex: '.*node_kubevirt_io.*'
- sourceLabels:
- __metrics_path__
targetLabel: metrics_path

View File

@@ -8,6 +8,7 @@ spec:
cluster: cozystack
extraArgs:
promscrape.streamParse: "true"
promscrape.maxScrapeSize: 32MB
remoteWrite:
- url: http://vminsert-shortterm.tenant-root.svc:8480/insert/0/prometheus/api/v1/write
- url: http://vminsert-longterm.tenant-root.svc:8480/insert/0/prometheus/api/v1/write

View File

@@ -1,3 +1,7 @@
global:
victoria-metrics-k8s-stack:
fullnameOverride: alerts
metrics-server:
defaultArgs:
- --cert-dir=/tmp
@@ -62,3 +66,19 @@ fluent-bit:
Operation lift
Nested_under kubernetes
Add_prefix kubernetes_
victoria-metrics-k8s-stack:
externalVM:
read:
url: "http://vmselect-shortterm.tenant-root.svc:8480/select/0/prometheus/api/v1/read"
write:
url: "http://vminsert-shortterm.tenant-root.svc:8480/insert/0/prometheus/api/v1/write"
grafana:
enabled: false
kube-state-metrics:
enabled: false
prometheus-node-exporter:
enabled: false
alertmanager:
name: vmalertmanager-alertmanager