Mirror of https://github.com/outbackdingo/cozystack.git (synced 2026-03-27 22:03:14 +00:00)

Compare commits: 30 commits, kubermatic ... release-0.

Commits (SHA1):
28fca4ef12, 7db92db23c, 2c167719b4, d61ec3df04, 9cfc07353f, bb985806a0,
2179dbd05c, 7ba27f4857, 1e47fc55ca, d481d83661, 770089244e, 05de907054,
34fd00e7cd, 5fb9cfe3da, bb2e948131, bd10b18291, 4eaca42ce9, b605c85eb2,
929ab5c5eb, 4b90bf5aac, 7a1b56fa78, 7161b4db06, b6e3203446, ab8394140c,
d657ca62b8, 3d928611ed, 8cb2256042, ecfa4f8005, 01ce122ada, 00b2834efc
hack/e2e.sh

@@ -36,7 +36,7 @@ mkdir -p srv1 srv2 srv3
# Prepare cloud-init
for i in 1 2 3; do
  echo "local-hostname: srv$i" > "srv$i/meta-data"
  echo "hostname: srv$i" > "srv$i/meta-data"
  echo '#cloud-config' > "srv$i/user-data"
  cat > "srv$i/network-config" <<EOT
version: 2

@@ -114,7 +114,7 @@ machine:
      - name: zfs
      - name: spl
  install:
    image: ghcr.io/aenix-io/cozystack/talos:v1.7.1
    image: ghcr.io/aenix-io/cozystack/talos:v1.8.0
  files:
    - content: |
        [plugins]

@@ -182,7 +182,7 @@ timeout 60 sh -c 'until nc -nzv 192.168.123.11 50000 && nc -nzv 192.168.123.12 5
talosctl bootstrap -n 192.168.123.11 -e 192.168.123.11

# Wait for etcd
timeout 120 sh -c 'while talosctl etcd members -n 192.168.123.11,192.168.123.12,192.168.123.13 -e 192.168.123.10 2>&1 | grep "rpc error"; do sleep 1; done'
timeout 180 sh -c 'while talosctl etcd members -n 192.168.123.11,192.168.123.12,192.168.123.13 -e 192.168.123.10 2>&1 | grep "rpc error"; do sleep 1; done'

rm -f kubeconfig
talosctl kubeconfig kubeconfig -e 192.168.123.10 -n 192.168.123.10

@@ -217,6 +217,10 @@ timeout 60 sh -c 'until kubectl get hr -A | grep cozy; do sleep 1; done'
sleep 5

kubectl get hr -A | awk 'NR>1 {print "kubectl wait --timeout=15m --for=condition=ready -n " $1 " hr/" $2 " &"} END{print "wait"}' | sh -x

# Wait for Cluster-API providers
kubectl wait deploy --timeout=30s --for=condition=available -n cozy-cluster-api capi-controller-manager capi-kamaji-controller-manager capi-kubeadm-bootstrap-controller-manager capi-operator-cluster-api-operator capk-controller-manager

# Wait for linstor controller
kubectl wait deploy --timeout=5m --for=condition=available -n cozy-linstor linstor-controller
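Note on the HelmRelease wait one-liner above: awk turns every row of `kubectl get hr -A` into a background `kubectl wait` command and appends a final `wait`, so all releases are awaited in parallel. A rough sketch of what the generated script looks like (the namespaces and release names here are made up for illustration):

    kubectl wait --timeout=15m --for=condition=ready -n cozy-system hr/cozystack &
    kubectl wait --timeout=15m --for=condition=ready -n cozy-cilium hr/cilium &
    wait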
@@ -68,7 +68,7 @@ spec:
      serviceAccountName: cozystack
      containers:
      - name: cozystack
        image: "ghcr.io/aenix-io/cozystack/cozystack:v0.15.0"
        image: "ghcr.io/aenix-io/cozystack/cozystack:v0.16.5"
        env:
        - name: KUBERNETES_SERVICE_HOST
          value: localhost
@@ -87,7 +87,7 @@ spec:
            fieldRef:
              fieldPath: metadata.name
      - name: darkhttpd
        image: "ghcr.io/aenix-io/cozystack/cozystack:v0.15.0"
        image: "ghcr.io/aenix-io/cozystack/cozystack:v0.16.5"
        command:
        - /usr/bin/darkhttpd
        - /cozystack/assets

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/postgres-backup:0.6.2@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1
ghcr.io/aenix-io/cozystack/postgres-backup:0.7.0@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/nginx-cache:0.3.1@sha256:556bc8d29ee9e90b3d64d0481dcfc66483d055803315bba3d9ece17c0d97f32b
ghcr.io/aenix-io/cozystack/nginx-cache:0.3.1@sha256:cd744b2d1d50191f4908f2db83079b32973d1c009fe9468627be72efbfa0a107

@@ -76,3 +76,5 @@ spec:
  metadata:
    labels:
      policy.cozystack.io/allow-to-apiserver: "true"
  spec:
    enableServiceLinks: false

@@ -16,7 +16,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.11.0
version: 0.12.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/cluster-autoscaler:latest@sha256:7f617de5a24de790a15d9e97c6287ff2b390922e6e74c7a665cbf498f634514d
ghcr.io/aenix-io/cozystack/cluster-autoscaler:0.12.1@sha256:7f617de5a24de790a15d9e97c6287ff2b390922e6e74c7a665cbf498f634514d

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/kubevirt-cloud-provider:latest@sha256:735aa8092501fc0f2904b685b15bc0137ea294cb08301ca1185d3dec5f467f0f
ghcr.io/aenix-io/cozystack/kubevirt-cloud-provider:0.12.1@sha256:ca606d6039ed43a48d4dfd98a91fd3cec120f08c1e221cd4e99ea94239389742

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/kubevirt-csi-driver:latest@sha256:e56b46591cdf9140e97c3220a0c2681aadd4a4b3f7ea8473fb2504dc96e8b53a
ghcr.io/aenix-io/cozystack/kubevirt-csi-driver:0.12.1@sha256:86029548078960feecca116087b2135230d676b83c503f292eb50e1199be2790

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/ubuntu-container-disk:v1.30.1@sha256:5ce80a453073c4f44347409133fc7b15f1d2f37a564d189871a4082fc552ff0f
ghcr.io/aenix-io/cozystack/ubuntu-container-disk:v1.30.1@sha256:1f249fbe52821a62f706c6038b13401234e1b758ac498e53395b8f9a642b015f

@@ -16,6 +16,7 @@ spec:
        app: {{ .Release.Name }}-cluster-autoscaler
        policy.cozystack.io/allow-to-apiserver: "true"
    spec:
      enableServiceLinks: false
      tolerations:
      - key: CriticalAddonsOnly
        operator: Exists

@@ -210,6 +210,26 @@ spec:
        name: {{ $.Release.Name }}-{{ $groupName }}-{{ $kubevirtmachinetemplateHash }}
        namespace: default
      version: v1.30.1
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineHealthCheck
metadata:
  name: {{ $.Release.Name }}-{{ $groupName }}
  namespace: {{ $.Release.Namespace }}
spec:
  clusterName: {{ $.Release.Name }}
  nodeStartupTimeout: 10m
  selector:
    matchLabels:
      cluster.x-k8s.io/cluster-name: {{ $.Release.Name }}
      cluster.x-k8s.io/deployment-name: {{ $.Release.Name }}-{{ $groupName }}
  unhealthyConditions:
  - type: Ready
    status: Unknown
    timeout: 30s
  - type: Ready
    status: "False"
    timeout: 300s
{{- end }}
---
{{- /*

@@ -15,6 +15,7 @@ spec:
        app: {{ .Release.Name }}-kcsi-driver
        policy.cozystack.io/allow-to-apiserver: "true"
    spec:
      enableServiceLinks: false
      serviceAccountName: {{ .Release.Name }}-kcsi
      priorityClassName: system-cluster-critical
      tolerations:

@@ -30,7 +30,6 @@ spec:
  retries: -1
  values:
    cilium:
      tunnel: disabled
      k8sServiceHost: {{ .Release.Name }}.{{ .Release.Namespace }}.svc
      k8sServicePort: 6443
      routingMode: tunnel

@@ -15,6 +15,7 @@ spec:
        k8s-app: {{ .Release.Name }}-kccm
        policy.cozystack.io/allow-to-apiserver: "true"
    spec:
      enableServiceLinks: false
      tolerations:
      - key: CriticalAddonsOnly
        operator: Exists

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/mariadb-backup:0.5.1@sha256:fa2b3195521cffa55eb6d71a50b875d3c234a45e5dff71b2b9002674175bea93
ghcr.io/aenix-io/cozystack/mariadb-backup:0.5.1@sha256:793edb25a29cbc00781e40af883815ca36937e736e2b0d202ea9c9619fb6ca11

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/postgres-backup:0.6.2@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1
ghcr.io/aenix-io/cozystack/postgres-backup:0.7.0@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1

@@ -10,7 +10,9 @@ spec:
  postgresql:
    parameters:
      max_wal_senders: "30"
      max_connections: "{{ .Values.postgresql.parameters.max_connections }}"
      {{- with .Values.postgresql.parameters.max_connections }}
      max_connections: "{{ . }}"
      {{- end }}

  minSyncReplicas: {{ .Values.quorum.minSyncReplicas }}
  maxSyncReplicas: {{ .Values.quorum.maxSyncReplicas }}
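Note on the max_connections guard above: wrapping the key in `{{- with ... }}` means it is only emitted when the value is actually set, so an unset value no longer renders an empty parameter. A rough sketch of the rendered output, using a made-up value of 200:

    postgresql:
      parameters:
        max_wal_senders: "30"
        max_connections: "200"

If the value is left unset, the `max_connections` line is omitted entirely and the operator default applies.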
@@ -29,9 +29,9 @@
    "type": "object",
    "properties": {
      "max_connections": {
        "type": "string",
        "type": "number",
        "description": "Determines the maximum number of concurrent connections to the database server. The default is typically 100 connections",
        "default": "100"
        "default": 100
      }
    }
  }
@@ -103,4 +103,4 @@
}
}
}
}
}

@@ -14,7 +14,7 @@ storageClass: ""
## @param postgresql.parameters.max_connections Determines the maximum number of concurrent connections to the database server. The default is typically 100 connections
postgresql:
  parameters:
    max_connections: "100"
    max_connections: 100

## Configuration for the quorum-based synchronous replication
## @param quorum.minSyncReplicas Minimum number of synchronous replicas that must acknowledge a transaction before it is considered committed.

@@ -16,7 +16,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.4.1
version: 0.4.2

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

@@ -16,6 +16,8 @@ spec:
  statefulSet:
    spec:
      template:
        spec:
          enableServiceLinks: false
        metadata:
          labels:
            policy.cozystack.io/allow-to-apiserver: "true"
@@ -47,7 +49,7 @@ metadata:
    config: '{{ printf "%s %s" $user $password | sha256sum }}'
spec:
  importCredentialsSecret:
    name: {{ $.Release.Name }}-{{ $user }}-credentials
    name: {{ $.Release.Name }}-{{ kebabcase $user }}-credentials
  rabbitmqClusterReference:
    name: {{ $.Release.Name }}
---
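Note on the `kebabcase` change above: RabbitMQ user names may contain characters that are not valid in Kubernetes Secret names, so the referenced credentials Secret name is now passed through Sprig's kebabcase. As a rough illustration with a hypothetical release "rmq" and user "appUser", the rendered reference becomes:

    importCredentialsSecret:
      name: rmq-app-user-credentials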
@@ -31,7 +31,11 @@ kubernetes 0.8.0 ac11056e
kubernetes 0.8.1 e54608d8
kubernetes 0.8.2 5ca8823
kubernetes 0.9.0 9b6dd19
kubernetes 0.10.0 HEAD
kubernetes 0.10.0 ac5c38b
kubernetes 0.11.0 4eaca42
kubernetes 0.11.1 4f430a90
kubernetes 0.12.0 74649f8
kubernetes 0.12.1 HEAD
mysql 0.1.0 f642698
mysql 0.2.0 8b975ff0
mysql 0.3.0 5ca8823
@@ -48,12 +52,14 @@ postgres 0.4.0 ec283c33
postgres 0.4.1 5ca8823
postgres 0.5.0 c07c4bbd
postgres 0.6.0 2a4768a
postgres 0.6.2 HEAD
postgres 0.6.2 54fd61c
postgres 0.7.0 HEAD
rabbitmq 0.1.0 f642698
rabbitmq 0.2.0 5ca8823
rabbitmq 0.3.0 9e33dc0
rabbitmq 0.4.0 36d8855
rabbitmq 0.4.1 HEAD
rabbitmq 0.4.1 35536bb
rabbitmq 0.4.2 HEAD
redis 0.1.1 f642698
redis 0.2.0 5ca8823
redis 0.3.0 HEAD
Talos image profiles — the same Talos v1.7.6 to v1.8.0 bump is applied to all six profiles (initramfs, installer, iso, kernel, metal image, nocloud image); only the `platform` and `output` stanzas differ between them:

@@ -3,24 +3,24 @@
arch: amd64
platform: metal
secureboot: false
version: v1.7.6
version: v1.8.0
input:
  kernel:
    path: /usr/install/amd64/vmlinuz
  initramfs:
    path: /usr/install/amd64/initramfs.xz
  baseInstaller:
    imageRef: ghcr.io/siderolabs/installer:v1.7.6
    imageRef: ghcr.io/siderolabs/installer:v1.8.0
  systemExtensions:
    - imageRef: ghcr.io/siderolabs/amd-ucode:20240811
    - imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240811
    - imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240811
    - imageRef: ghcr.io/siderolabs/i915-ucode:20240811
    - imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240811
    - imageRef: ghcr.io/siderolabs/intel-ucode:20240813
    - imageRef: ghcr.io/siderolabs/qlogic-firmware:20240811
    - imageRef: ghcr.io/siderolabs/drbd:9.2.8-v1.7.6
    - imageRef: ghcr.io/siderolabs/zfs:2.2.4-v1.7.6
    - imageRef: ghcr.io/siderolabs/amd-ucode:20240909
    - imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240909
    - imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240909
    - imageRef: ghcr.io/siderolabs/i915-ucode:20240909
    - imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240909
    - imageRef: ghcr.io/siderolabs/intel-ucode:20240910
    - imageRef: ghcr.io/siderolabs/qlogic-firmware:20240909
    - imageRef: ghcr.io/siderolabs/drbd:9.2.11-v1.8.0
    - imageRef: ghcr.io/siderolabs/zfs:2.2.6-v1.8.0
output:
  kind: initramfs
  imageOptions: {}

Output stanzas of the remaining profiles:
  kind: installer, imageOptions: {}
  kind: iso, imageOptions: {}
  kind: kernel, imageOptions: {}
  kind: image (platform: metal), imageOptions: { diskSize: 1306525696, diskFormat: raw }
  kind: image (platform: nocloud), imageOptions: { diskSize: 1306525696, diskFormat: raw }
@@ -1,2 +1,2 @@
cozystack:
  image: ghcr.io/aenix-io/cozystack/cozystack:v0.15.0@sha256:aeff26a80f84b4323578e613b3bf03caa842d617ec8d9ca98706867c1e70609f
  image: ghcr.io/aenix-io/cozystack/cozystack:v0.16.5@sha256:5bd08ec86b8392d31a1df7cb496d7c861142771c323c302729f7728da9b49ae2

@@ -29,6 +29,7 @@ releases:
      enableIdentityMark: true
      ipv4NativeRoutingCIDR: "{{ index $cozyConfig.data "ipv4-pod-cidr" }}"
      autoDirectNodeRoutes: true
      routingMode: native

  - name: cert-manager
    releaseName: cert-manager

@@ -1,2 +1,2 @@
e2e:
  image: ghcr.io/aenix-io/cozystack/e2e-sandbox:v0.15.0@sha256:20cc84e4a11db31434881355c070113a7823501a28a6114ca02830b18607ad21
  image: ghcr.io/aenix-io/cozystack/e2e-sandbox:v0.16.5@sha256:25b298d621ec79431d106184d59849bbae634588742583d111628126ad8615c5

@@ -3,4 +3,4 @@ name: monitoring
description: Monitoring and observability stack
icon: /logos/monitoring.svg
type: application
version: 1.4.0
version: 1.5.0

@@ -151,10 +151,10 @@ spec:
  ingressClassName: {{ $ingress }}
  tls:
    - hosts:
        - "{{ .Values.host | default (printf "alerta.%s" $host) }}"
        - "{{ printf "alerta.%s" (.Values.host | default $host) }}"
      secretName: alerta-tls
  rules:
    - host: "{{ .Values.host | default (printf "alerta.%s" $host) }}"
    - host: "{{ printf "alerta.%s" (.Values.host | default $host) }}"
      http:
        paths:
          - path: /

@@ -22,7 +22,7 @@ spec:
        password: ${GF_DATABASE_PASSWORD}
        #ssl_mode: require
      server:
        root_url: "https://{{ .Values.host | default (printf "grafana.%s" $host) }}"
        root_url: "https://{{ printf "grafana.%s" (.Values.host | default $host) }}"
      security:
        admin_user: user
        admin_password: ${GF_PASSWORD}
@@ -95,7 +95,7 @@ spec:
spec:
  ingressClassName: "{{ $ingress }}"
  rules:
    - host: "{{ .Values.host | default (printf "grafana.%s" $host) }}"
    - host: "{{ printf "grafana.%s" (.Values.host | default $host) }}"
      http:
        paths:
          - backend:
@@ -107,5 +107,5 @@ spec:
          pathType: Prefix
  tls:
    - hosts:
        - "{{ .Values.host | default (printf "grafana.%s" $host) }}"
        - "{{ printf "grafana.%s" (.Values.host | default $host) }}"
      secretName: grafana-ingress-tls
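Note on the host template change above: with the old expression, setting `.Values.host` replaced the whole hostname, so a user-supplied domain was used verbatim instead of getting a per-service name; the new expression always applies the service prefix. A rough illustration with a hypothetical host value of "example.org":

    old: {{ .Values.host | default (printf "grafana.%s" $host) }}   ->  example.org
    new: {{ printf "grafana.%s" (.Values.host | default $host) }}   ->  grafana.example.org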
@@ -16,7 +16,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.2.0
version: 0.2.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

@@ -38,6 +38,10 @@ spec:
    storageClass: {{ . }}
    {{- end }}
    maxVolumes: 0

  filer:
    s3:
      domainName: {{ .Values.host | default (printf "s3.%s" $host) }}

  s3:
    ingress:

@@ -12,6 +12,8 @@ monitoring 1.1.0 15478a88
monitoring 1.2.0 c9e0d63b
monitoring 1.2.1 4471b4ba
monitoring 1.3.0 6c5cf5b
monitoring 1.4.0 HEAD
monitoring 1.4.0 adaf603b
monitoring 1.5.0 HEAD
seaweedfs 0.1.0 5ca8823
seaweedfs 0.2.0 HEAD
seaweedfs 0.2.0 9e33dc0
seaweedfs 0.2.1 HEAD
packages/system/capi-operator/values.yaml (new file)
@@ -0,0 +1,9 @@
cluster-api-operator:
  resources:
    manager:
      limits:
        cpu: 200m
        memory: 512Mi
      requests:
        cpu: 100m
        memory: 100Mi

@@ -79,7 +79,7 @@ annotations:
    Pod IP Pool\n description: |\n CiliumPodIPPool defines an IP pool that can
    be used for pooled IPAM (i.e. the multi-pool IPAM mode).\n"
apiVersion: v2
appVersion: 1.16.1
appVersion: 1.16.2
description: eBPF-based Networking, Security, and Observability
home: https://cilium.io/
icon: https://cdn.jsdelivr.net/gh/cilium/cilium@main/Documentation/images/logo-solo.svg
@@ -95,4 +95,4 @@ kubeVersion: '>= 1.21.0-0'
name: cilium
sources:
  - https://github.com/cilium/cilium
version: 1.16.1
version: 1.16.2
@@ -1,6 +1,6 @@
# cilium

[badge images]

Cilium is open source software for providing and transparently securing
network connectivity and loadbalancing between application workloads such as
@@ -83,7 +83,7 @@ contributors across the globe, there is almost always someone available to help.
| authentication.mutual.spire.install.agent.tolerations | list | `[{"effect":"NoSchedule","key":"node.kubernetes.io/not-ready"},{"effect":"NoSchedule","key":"node-role.kubernetes.io/master"},{"effect":"NoSchedule","key":"node-role.kubernetes.io/control-plane"},{"effect":"NoSchedule","key":"node.cloudprovider.kubernetes.io/uninitialized","value":"true"},{"key":"CriticalAddonsOnly","operator":"Exists"}]` | SPIRE agent tolerations configuration By default it follows the same tolerations as the agent itself to allow the Cilium agent on this node to connect to SPIRE. ref: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ |
| authentication.mutual.spire.install.enabled | bool | `true` | Enable SPIRE installation. This will only take effect only if authentication.mutual.spire.enabled is true |
| authentication.mutual.spire.install.existingNamespace | bool | `false` | SPIRE namespace already exists. Set to true if Helm should not create, manage, and import the SPIRE namespace. |
| authentication.mutual.spire.install.initImage | object | `{"digest":"sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7","override":null,"pullPolicy":"IfNotPresent","repository":"docker.io/library/busybox","tag":"1.36.1","useDigest":true}` | init container image of SPIRE agent and server |
| authentication.mutual.spire.install.initImage | object | `{"digest":"sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140","override":null,"pullPolicy":"IfNotPresent","repository":"docker.io/library/busybox","tag":"1.36.1","useDigest":true}` | init container image of SPIRE agent and server |
| authentication.mutual.spire.install.namespace | string | `"cilium-spire"` | SPIRE namespace to install into |
| authentication.mutual.spire.install.server.affinity | object | `{}` | SPIRE server affinity configuration |
| authentication.mutual.spire.install.server.annotations | object | `{}` | SPIRE server annotations |
@@ -182,7 +182,7 @@ contributors across the globe, there is almost always someone available to help.
| clustermesh.apiserver.extraVolumeMounts | list | `[]` | Additional clustermesh-apiserver volumeMounts. |
| clustermesh.apiserver.extraVolumes | list | `[]` | Additional clustermesh-apiserver volumes. |
| clustermesh.apiserver.healthPort | int | `9880` | TCP port for the clustermesh-apiserver health API. |
| clustermesh.apiserver.image | object | `{"digest":"sha256:e9c77417cd474cc943b2303a76c5cf584ac7024dd513ebb8d608cb62fe28896f","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/clustermesh-apiserver","tag":"v1.16.1","useDigest":true}` | Clustermesh API server image. |
| clustermesh.apiserver.image | object | `{"digest":"sha256:cc84190fed92e03a2b3a33bc670b2447b521ee258ad9b076baaad13be312ea73","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/clustermesh-apiserver","tag":"v1.16.2","useDigest":true}` | Clustermesh API server image. |
| clustermesh.apiserver.kvstoremesh.enabled | bool | `true` | Enable KVStoreMesh. KVStoreMesh caches the information retrieved from the remote clusters in the local etcd instance. |
| clustermesh.apiserver.kvstoremesh.extraArgs | list | `[]` | Additional KVStoreMesh arguments. |
| clustermesh.apiserver.kvstoremesh.extraEnv | list | `[]` | Additional KVStoreMesh environment variables. |
@@ -353,7 +353,7 @@ contributors across the globe, there is almost always someone available to help.
| envoy.extraVolumes | list | `[]` | Additional envoy volumes. |
| envoy.healthPort | int | `9878` | TCP port for the health API. |
| envoy.idleTimeoutDurationSeconds | int | `60` | Set Envoy upstream HTTP idle connection timeout seconds. Does not apply to connections with pending requests. Default 60s |
| envoy.image | object | `{"digest":"sha256:bd5ff8c66716080028f414ec1cb4f7dc66f40d2fb5a009fff187f4a9b90b566b","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium-envoy","tag":"v1.29.7-39a2a56bbd5b3a591f69dbca51d3e30ef97e0e51","useDigest":true}` | Envoy container image. |
| envoy.image | object | `{"digest":"sha256:9762041c3760de226a8b00cc12f27dacc28b7691ea926748f9b5c18862db503f","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium-envoy","tag":"v1.29.9-1726784081-a90146d13b4cd7d168d573396ccf2b3db5a3b047","useDigest":true}` | Envoy container image. |
| envoy.livenessProbe.failureThreshold | int | `10` | failure threshold of liveness probe |
| envoy.livenessProbe.periodSeconds | int | `30` | interval between checks of the liveness probe |
| envoy.log.format | string | `"[%Y-%m-%d %T.%e][%t][%l][%n] [%g:%#] %v"` | The format string to use for laying out the log message metadata of Envoy. |
@@ -484,7 +484,7 @@ contributors across the globe, there is almost always someone available to help.
| hubble.relay.extraVolumes | list | `[]` | Additional hubble-relay volumes. |
| hubble.relay.gops.enabled | bool | `true` | Enable gops for hubble-relay |
| hubble.relay.gops.port | int | `9893` | Configure gops listen port for hubble-relay |
| hubble.relay.image | object | `{"digest":"sha256:2e1b4c739a676ae187d4c2bfc45c3e865bda2567cc0320a90cb666657fcfcc35","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/hubble-relay","tag":"v1.16.1","useDigest":true}` | Hubble-relay container image. |
| hubble.relay.image | object | `{"digest":"sha256:4b559907b378ac18af82541dafab430a857d94f1057f2598645624e6e7ea286c","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/hubble-relay","tag":"v1.16.2","useDigest":true}` | Hubble-relay container image. |
| hubble.relay.listenHost | string | `""` | Host to listen to. Specify an empty string to bind to all the interfaces. |
| hubble.relay.listenPort | string | `"4245"` | Port to listen to. |
| hubble.relay.nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node labels for pod assignment ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector |
@@ -590,7 +590,7 @@ contributors across the globe, there is almost always someone available to help.
| hubble.ui.updateStrategy | object | `{"rollingUpdate":{"maxUnavailable":1},"type":"RollingUpdate"}` | hubble-ui update strategy. |
| identityAllocationMode | string | `"crd"` | Method to use for identity allocation (`crd` or `kvstore`). |
| identityChangeGracePeriod | string | `"5s"` | Time to wait before using new identity on endpoint identity change. |
| image | object | `{"digest":"sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.1","useDigest":true}` | Agent container image. |
| image | object | `{"digest":"sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.2","useDigest":true}` | Agent container image. |
| imagePullSecrets | list | `[]` | Configure image pull secrets for pulling container images |
| ingressController.default | bool | `false` | Set cilium ingress controller to be the default ingress controller This will let cilium ingress controller route entries without ingress class set |
| ingressController.defaultSecretName | string | `nil` | Default secret name for ingresses without .spec.tls[].secretName set. |
@@ -717,7 +717,7 @@ contributors across the globe, there is almost always someone available to help.
| operator.hostNetwork | bool | `true` | HostNetwork setting |
| operator.identityGCInterval | string | `"15m0s"` | Interval for identity garbage collection. |
| operator.identityHeartbeatTimeout | string | `"30m0s"` | Timeout for identity heartbeats. |
| operator.image | object | `{"alibabacloudDigest":"sha256:4381adf48d76ec482551183947e537d44bcac9b6c31a635a9ac63f696d978804","awsDigest":"sha256:e3876fcaf2d6ccc8d5b4aaaded7b1efa971f3f4175eaa2c8a499878d58c39df4","azureDigest":"sha256:e55c222654a44ceb52db7ade3a7b9e8ef05681ff84c14ad1d46fea34869a7a22","genericDigest":"sha256:3bc7e7a43bc4a4d8989cb7936c5d96675dd2d02c306adf925ce0a7c35aa27dc4","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/operator","suffix":"","tag":"v1.16.1","useDigest":true}` | cilium-operator image. |
| operator.image | object | `{"alibabacloudDigest":"sha256:16e33abb6b8381e2f66388b6d7141399f06c9b51b9ffa08fd159b8d321929716","awsDigest":"sha256:b6a73ec94407a56cccc8a395225e2aecc3ca3611e7acfeec86201c19fc0727dd","azureDigest":"sha256:fde7cf8bb887e106cd388bb5c3327e92682b2ec3ab4f03bb57b87f495b99f727","genericDigest":"sha256:cccfd3b886d52cb132c06acca8ca559f0fce91a6bd99016219b1a81fdbc4813a","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/operator","suffix":"","tag":"v1.16.2","useDigest":true}` | cilium-operator image. |
| operator.nodeGCInterval | string | `"5m0s"` | Interval for cilium node garbage collection. |
| operator.nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node labels for cilium-operator pod assignment ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector |
| operator.podAnnotations | object | `{}` | Annotations to be added to cilium-operator pods |
@@ -767,7 +767,7 @@ contributors across the globe, there is almost always someone available to help.
| preflight.extraEnv | list | `[]` | Additional preflight environment variables. |
| preflight.extraVolumeMounts | list | `[]` | Additional preflight volumeMounts. |
| preflight.extraVolumes | list | `[]` | Additional preflight volumes. |
| preflight.image | object | `{"digest":"sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.1","useDigest":true}` | Cilium pre-flight image. |
| preflight.image | object | `{"digest":"sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.2","useDigest":true}` | Cilium pre-flight image. |
| preflight.nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node labels for preflight pod assignment ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector |
| preflight.podAnnotations | object | `{}` | Annotations to be added to preflight pods |
| preflight.podDisruptionBudget.enabled | bool | `false` | enable PodDisruptionBudget ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/ |

@@ -26,10 +26,6 @@ spec:
  template:
    metadata:
      annotations:
        {{- if and .Values.envoy.prometheus.enabled (not .Values.envoy.prometheus.serviceMonitor.enabled) }}
        prometheus.io/port: "{{ .Values.envoy.prometheus.port }}"
        prometheus.io/scrape: "true"
        {{- end }}
        {{- if .Values.envoy.rollOutPods }}
        # ensure pods roll when configmap updates
        cilium.io/cilium-envoy-configmap-checksum: {{ include (print $.Template.BasePath "/cilium-envoy/configmap.yaml") . | sha256sum | quote }}
@@ -0,0 +1,33 @@
{{- $envoyDS := eq (include "envoyDaemonSetEnabled" .) "true" -}}
{{- if and $envoyDS (not .Values.preflight.enabled) .Values.envoy.prometheus.enabled }}
apiVersion: v1
kind: Service
metadata:
  name: cilium-envoy
  namespace: {{ .Release.Namespace }}
  {{- if or (not .Values.envoy.prometheus.serviceMonitor.enabled) .Values.envoy.annotations }}
  annotations:
    {{- if not .Values.envoy.prometheus.serviceMonitor.enabled }}
    prometheus.io/scrape: "true"
    prometheus.io/port: {{ .Values.envoy.prometheus.port | quote }}
    {{- end }}
    {{- with .Values.envoy.annotations }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- end }}
  labels:
    k8s-app: cilium-envoy
    app.kubernetes.io/name: cilium-envoy
    app.kubernetes.io/part-of: cilium
    io.cilium/app: proxy
spec:
  clusterIP: None
  type: ClusterIP
  selector:
    k8s-app: cilium-envoy
  ports:
  - name: envoy-metrics
    port: {{ .Values.envoy.prometheus.port }}
    protocol: TCP
    targetPort: envoy-metrics
{{- end }}

@@ -362,7 +362,7 @@ spec:
        name: cilium-clustermesh
        optional: true
        # note: items are not explicitly listed here, since the entries of this secret
        # depend on the peers configured, and that would cause a restart of all agents
        # depend on the peers configured, and that would cause a restart of all operators
        # at every addition/removal. Leaving the field empty makes each secret entry
        # to be automatically projected into the volume as a file whose name is the key.
    - secret:
@@ -384,5 +384,28 @@ spec:
        - key: {{ .Values.tls.caBundle.key }}
          path: common-etcd-client-ca.crt
      {{- end }}
    # note: we configure the volume for the kvstoremesh-specific certificate
    # regardless of whether KVStoreMesh is enabled or not, so that it can be
    # automatically mounted in case KVStoreMesh gets subsequently enabled,
    # without requiring an operator restart.
    - secret:
        name: clustermesh-apiserver-local-cert
        optional: true
        items:
        - key: tls.key
          path: local-etcd-client.key
        - key: tls.crt
          path: local-etcd-client.crt
        {{- if not .Values.tls.caBundle.enabled }}
        - key: ca.crt
          path: local-etcd-client-ca.crt
        {{- else }}
    - {{ .Values.tls.caBundle.useSecret | ternary "secret" "configMap" }}:
        name: {{ .Values.tls.caBundle.name }}
        optional: true
        items:
        - key: {{ .Values.tls.caBundle.key }}
          path: local-etcd-client-ca.crt
        {{- end }}
  {{- end }}
{{- end }}
@@ -1,3 +1,47 @@
{{/* validate deprecated options are not being used */}}

{{/* Options deprecated in v1.15 and removed in v1.16 */}}
{{- if or
  (dig "encryption" "keyFile" "" .Values.AsMap)
  (dig "encryption" "mountPath" "" .Values.AsMap)
  (dig "encryption" "secretName" "" .Values.AsMap)
  (dig "encryption" "interface" "" .Values.AsMap)
}}
  {{ fail "encryption.{keyFile,mountPath,secretName,interface} were deprecated in v1.14 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if or
  ((dig "proxy" "prometheus" "enabled" "" .Values.AsMap) | toString)
  (dig "proxy" "prometheus" "port" "" .Values.AsMap)
}}
  {{ fail "proxy.prometheus.enabled and proxy.prometheus.port were deprecated in v1.14 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "endpointStatus" "" .Values.AsMap) }}
  {{ fail "endpointStatus has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "remoteNodeIdentity" "" .Values.AsMap) }}
  {{ fail "remoteNodeIdentity was deprecated in v1.15 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "containerRuntime" "integration" "" .Values.AsMap) }}
  {{ fail "containerRuntime.integration was deprecated in v1.14 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "etcd" "managed" "" .Values.AsMap) }}
  {{ fail "etcd.managed was deprecated in v1.10 has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}

{{/* Options deprecated in v1.14 and removed in v1.15 */}}
{{- if .Values.tunnel }}
  {{ fail "tunnel was deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}
{{- if or (dig "clustermesh" "apiserver" "tls" "ca" "cert" "" .Values.AsMap) (dig "clustermesh" "apiserver" "tls" "ca" "key" "" .Values.AsMap) }}
  {{ fail "clustermesh.apiserver.tls.ca.cert and clustermesh.apiserver.tls.ca.key were deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}
{{- if .Values.enableK8sEventHandover }}
  {{ fail "enableK8sEventHandover was deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}
{{- if .Values.enableCnpStatusUpdates }}
  {{ fail "enableCnpStatusUpdates was deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}

{{/* validate hubble config */}}
{{- if and .Values.hubble.ui.enabled (not .Values.hubble.ui.standalone.enabled) }}
{{- if not .Values.hubble.relay.enabled }}
@@ -153,10 +153,10 @@ image:
  # @schema
  override: ~
  repository: "quay.io/cilium/cilium"
  tag: "v1.16.1"
  tag: "v1.16.2"
  pullPolicy: "IfNotPresent"
  # cilium-digest
  digest: "sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39"
  digest: "sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea"
  useDigest: true
# -- Affinity for cilium-agent.
affinity:
@@ -1309,9 +1309,9 @@ hubble:
    # @schema
    override: ~
    repository: "quay.io/cilium/hubble-relay"
    tag: "v1.16.1"
    tag: "v1.16.2"
    # hubble-relay-digest
    digest: "sha256:2e1b4c739a676ae187d4c2bfc45c3e865bda2567cc0320a90cb666657fcfcc35"
    digest: "sha256:4b559907b378ac18af82541dafab430a857d94f1057f2598645624e6e7ea286c"
    useDigest: true
    pullPolicy: "IfNotPresent"
  # -- Specifies the resources for the hubble-relay pods
@@ -2158,9 +2158,9 @@ envoy:
    # @schema
    override: ~
    repository: "quay.io/cilium/cilium-envoy"
    tag: "v1.29.7-39a2a56bbd5b3a591f69dbca51d3e30ef97e0e51"
    tag: "v1.29.9-1726784081-a90146d13b4cd7d168d573396ccf2b3db5a3b047"
    pullPolicy: "IfNotPresent"
    digest: "sha256:bd5ff8c66716080028f414ec1cb4f7dc66f40d2fb5a009fff187f4a9b90b566b"
    digest: "sha256:9762041c3760de226a8b00cc12f27dacc28b7691ea926748f9b5c18862db503f"
    useDigest: true
  # -- Additional containers added to the cilium Envoy DaemonSet.
  extraContainers: []
@@ -2474,15 +2474,15 @@ operator:
    # @schema
    override: ~
    repository: "quay.io/cilium/operator"
    tag: "v1.16.1"
    tag: "v1.16.2"
    # operator-generic-digest
    genericDigest: "sha256:3bc7e7a43bc4a4d8989cb7936c5d96675dd2d02c306adf925ce0a7c35aa27dc4"
    genericDigest: "sha256:cccfd3b886d52cb132c06acca8ca559f0fce91a6bd99016219b1a81fdbc4813a"
    # operator-azure-digest
    azureDigest: "sha256:e55c222654a44ceb52db7ade3a7b9e8ef05681ff84c14ad1d46fea34869a7a22"
    azureDigest: "sha256:fde7cf8bb887e106cd388bb5c3327e92682b2ec3ab4f03bb57b87f495b99f727"
    # operator-aws-digest
    awsDigest: "sha256:e3876fcaf2d6ccc8d5b4aaaded7b1efa971f3f4175eaa2c8a499878d58c39df4"
    awsDigest: "sha256:b6a73ec94407a56cccc8a395225e2aecc3ca3611e7acfeec86201c19fc0727dd"
    # operator-alibabacloud-digest
    alibabacloudDigest: "sha256:4381adf48d76ec482551183947e537d44bcac9b6c31a635a9ac63f696d978804"
    alibabacloudDigest: "sha256:16e33abb6b8381e2f66388b6d7141399f06c9b51b9ffa08fd159b8d321929716"
    useDigest: true
    pullPolicy: "IfNotPresent"
    suffix: ""
@@ -2756,9 +2756,9 @@ preflight:
    # @schema
    override: ~
    repository: "quay.io/cilium/cilium"
    tag: "v1.16.1"
    tag: "v1.16.2"
    # cilium-digest
    digest: "sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39"
    digest: "sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea"
    useDigest: true
    pullPolicy: "IfNotPresent"
  # -- The priority class to use for the preflight pod.
@@ -2905,9 +2905,9 @@ clustermesh:
      # @schema
      override: ~
      repository: "quay.io/cilium/clustermesh-apiserver"
      tag: "v1.16.1"
      tag: "v1.16.2"
      # clustermesh-apiserver-digest
      digest: "sha256:e9c77417cd474cc943b2303a76c5cf584ac7024dd513ebb8d608cb62fe28896f"
      digest: "sha256:cc84190fed92e03a2b3a33bc670b2447b521ee258ad9b076baaad13be312ea73"
      useDigest: true
      pullPolicy: "IfNotPresent"
  # -- TCP port for the clustermesh-apiserver health API.
@@ -3406,7 +3406,7 @@ authentication:
        override: ~
        repository: "docker.io/library/busybox"
        tag: "1.36.1"
        digest: "sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7"
        digest: "sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140"
        useDigest: true
        pullPolicy: "IfNotPresent"
      # SPIRE agent configuration
@@ -1,2 +1,2 @@
ARG VERSION=v1.16.1
ARG VERSION=v1.16.2
FROM quay.io/cilium/cilium:${VERSION}

@@ -15,4 +15,4 @@ cilium:
  enableIdentityMark: false
  enableRuntimeDeviceDetection: true
  forceDeviceDetection: true
  devices: ovn0
  devices: "ovn0 genev_sys_6081"

@@ -12,7 +12,7 @@ cilium:
    mode: "kubernetes"
  image:
    repository: ghcr.io/aenix-io/cozystack/cilium
    tag: 1.16.1
    digest: "sha256:9593dbc3bd25487b52d8f43330d4a308e450605479a8384a32117e9613289892"
    tag: 1.16.2
    digest: "sha256:534c5b04fef356a6be59234243c23c0c09702fe1e2c8872012afb391ce2965c4"
  envoy:
    enabled: false

@@ -33,11 +33,11 @@ kubeapps:
  image:
    registry: ghcr.io/aenix-io/cozystack
    repository: dashboard
    tag: v0.15.0
    tag: v0.16.5
    digest: "sha256:4818712e9fc9c57cc321512760c3226af564a04e69d4b3ec9229ab91fd39abeb"
kubeappsapis:
  image:
    registry: ghcr.io/aenix-io/cozystack
    repository: kubeapps-apis
    tag: v0.15.0
    digest: "sha256:70c095c8f7e3ecfa11433a3a2c8f57f6ff5a0053f006939a2c171c180cc50baf"
    tag: v0.16.5
    digest: "sha256:126bb6955ff142e7e00e712c037f3e97bd39b360641fba0b8ca8bc083d5e8224"

@@ -1,7 +1,11 @@
NAME=fluxcd
NAMESPACE=cozy-$(NAME)

include ../../../scripts/package.mk

apply-locally:
	helm upgrade -i -n $(NAMESPACE) $(NAME) .

include ../../../scripts/package.mk
update:
	rm -rf charts
	helm pull oci://ghcr.io/controlplaneio-fluxcd/charts/flux-instance --untar --untardir charts

@@ -21,6 +21,4 @@
.idea/
*.tmproj
.vscode/

# Ignore img folder used for documentation
img/
helmdocs.gotmpl
packages/system/fluxcd/charts/flux-instance/Chart.yaml (new file)
@@ -0,0 +1,28 @@
annotations:
  artifacthub.io/license: AGPL-3.0
  artifacthub.io/links: |
    - name: Documentation
      url: https://fluxcd.control-plane.io/operator
    - name: Chart Source
      url: https://github.com/controlplaneio-fluxcd/charts
    - name: Upstream Project
      url: https://github.com/controlplaneio-fluxcd/flux-operator
apiVersion: v2
appVersion: v0.9.0
description: 'A Helm chart for deploying a Flux instance managed by Flux Operator. '
home: https://github.com/controlplaneio-fluxcd
icon: https://raw.githubusercontent.com/cncf/artwork/main/projects/flux/icon/color/flux-icon-color.png
keywords:
  - flux
  - fluxcd
  - gitops
kubeVersion: '>=1.22.0-0'
maintainers:
  - email: flux-enterprise@control-plane.io
    name: ControlPlane Flux Team
name: flux-instance
sources:
  - https://github.com/controlplaneio-fluxcd/flux-operator
  - https://github.com/controlplaneio-fluxcd/charts
type: application
version: 0.9.0
packages/system/fluxcd/charts/flux-instance/README.md (new file)
@@ -0,0 +1,52 @@
# flux-instance

[badge images]

This chart is a thin wrapper around the `FluxInstance` custom resource, which is
used by the [Flux Operator](https://github.com/controlplaneio-fluxcd/flux-operator)
to install, configure and automatically upgrade Flux.

## Prerequisites

- Kubernetes 1.22+
- Helm 3.8+

## Installing the Chart

To deploy Flux in the `flux-system` namespace:

```console
helm -n flux-system install flux oci://ghcr.io/controlplaneio-fluxcd/charts/flux-instance
```

For more information on the available configuration options,
see the [Flux Instance documentation](https://fluxcd.control-plane.io/operator/fluxinstance/).

## Uninstalling the Chart

To uninstall Flux without affecting the resources it manages:

```console
helm -n flux-system uninstall flux
```

## Values

| Key | Type | Default | Description |
|-----|------|---------|-------------|
| commonAnnotations | object | `{}` | Common annotations to add to all deployed objects including pods. |
| commonLabels | object | `{}` | Common labels to add to all deployed objects including pods. |
| fullnameOverride | string | `"flux"` | |
| instance.cluster | object | `{"domain":"cluster.local","multitenant":false,"networkPolicy":true,"tenantDefaultServiceAccount":"default","type":"kubernetes"}` | Cluster https://fluxcd.control-plane.io/operator/fluxinstance/#cluster-configuration |
| instance.components | list | `["source-controller","kustomize-controller","helm-controller","notification-controller"]` | Components https://fluxcd.control-plane.io/operator/fluxinstance/#components-configuration |
| instance.distribution | object | `{"artifact":"oci://ghcr.io/controlplaneio-fluxcd/flux-operator-manifests:latest","imagePullSecret":"","registry":"ghcr.io/fluxcd","version":"2.x"}` | Distribution https://fluxcd.control-plane.io/operator/fluxinstance/#distribution-configuration |
| instance.kustomize.patches | list | `[]` | Kustomize patches https://fluxcd.control-plane.io/operator/fluxinstance/#kustomize-patches |
| instance.sharding | object | `{"key":"sharding.fluxcd.io/key","shards":[]}` | Sharding https://fluxcd.control-plane.io/operator/fluxinstance/#sharding-configuration |
| instance.storage | object | `{"class":"","size":""}` | Storage https://fluxcd.control-plane.io/operator/fluxinstance/#storage-configuration |
| instance.sync | object | `{"kind":"GitRepository","path":"","pullSecret":"","ref":"","url":""}` | Sync https://fluxcd.control-plane.io/operator/fluxinstance/#sync-configuration |
| nameOverride | string | `""` | |

## Source Code

* <https://github.com/controlplaneio-fluxcd/flux-operator>
* <https://github.com/controlplaneio-fluxcd/charts>

@@ -0,0 +1 @@
Documentation at https://fluxcd.control-plane.io/operator/
@@ -0,0 +1,51 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "flux-instance.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "flux-instance.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}

{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "flux-instance.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}

{{/*
Common labels
*/}}
{{- define "flux-instance.labels" -}}
helm.sh/chart: {{ include "flux-instance.chart" . }}
{{ include "flux-instance.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}

{{/*
Selector labels
*/}}
{{- define "flux-instance.selectorLabels" -}}
app.kubernetes.io/name: {{ include "flux-instance.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}
@@ -0,0 +1,43 @@
apiVersion: fluxcd.controlplane.io/v1
kind: FluxInstance
metadata:
  name: {{ include "flux-instance.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "flux-instance.labels" . | nindent 4 }}
    {{- with .Values.commonLabels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.commonAnnotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  distribution:
    version: {{ .Values.instance.distribution.version }}
    registry: {{ .Values.instance.distribution.registry }}
    artifact: {{ .Values.instance.distribution.artifact }}
    {{- if .Values.instance.distribution.imagePullSecret }}
    imagePullSecret: {{ .Values.instance.distribution.imagePullSecret }}
    {{- end }}
  components: {{ .Values.instance.components | toYaml | nindent 4 }}
  cluster: {{ .Values.instance.cluster | toYaml | nindent 4 }}
  kustomize: {{ .Values.instance.kustomize | toYaml | nindent 4 }}
  {{- if .Values.instance.sync.url }}
  sync:
    kind: {{ .Values.instance.sync.kind }}
    url: {{ .Values.instance.sync.url }}
    ref: {{ .Values.instance.sync.ref }}
    path: {{ .Values.instance.sync.path }}
    {{- if .Values.instance.sync.pullSecret }}
    pullSecret: {{ .Values.instance.sync.pullSecret }}
    {{- end }}
  {{- end }}
  {{- if .Values.instance.storage.size }}
  storage: {{ .Values.instance.storage | toYaml | nindent 4 }}
  {{- end }}
  {{- if .Values.instance.sharding.shards }}
  sharding:
    key: {{ .Values.instance.sharding.key }}
    shards: {{ .Values.instance.sharding.shards | toYaml | nindent 4 }}
  {{- end }}
153
packages/system/fluxcd/charts/flux-instance/values.schema.json
Normal file
153
packages/system/fluxcd/charts/flux-instance/values.schema.json
Normal file
@@ -0,0 +1,153 @@
|
||||
{
  "$schema": "https://json-schema.org/draft/2019-09/schema",
  "properties": {
    "commonAnnotations": {
      "properties": {},
      "type": "object"
    },
    "commonLabels": {
      "properties": {},
      "type": "object"
    },
    "fullnameOverride": {
      "type": "string"
    },
    "instance": {
      "properties": {
        "cluster": {
          "properties": {
            "domain": {
              "type": "string"
            },
            "multitenant": {
              "type": "boolean"
            },
            "networkPolicy": {
              "type": "boolean"
            },
            "tenantDefaultServiceAccount": {
              "type": "string"
            },
            "type": {
              "enum": [
                "kubernetes",
                "openshift",
                "aws",
                "azure",
                "gcp"
              ],
              "type": "string"
            }
          },
          "type": "object"
        },
        "components": {
          "items": {
            "enum": [
              "source-controller",
              "kustomize-controller",
              "helm-controller",
              "notification-controller",
              "image-reflector-controller",
              "image-automation-controller"
            ],
            "type": "string"
          },
          "type": "array",
          "uniqueItems": true
        },
        "distribution": {
          "properties": {
            "artifact": {
              "type": "string"
            },
            "imagePullSecret": {
              "type": "string"
            },
            "registry": {
              "type": "string"
            },
            "version": {
              "type": "string"
            }
          },
          "required": [
            "version",
            "registry"
          ],
          "type": "object"
        },
        "kustomize": {
          "properties": {
            "patches": {
              "items": {
                "type": "object"
              },
              "type": "array"
            }
          },
          "type": "object"
        },
        "sharding": {
          "properties": {
            "key": {
              "type": "string"
            },
            "shards": {
              "items": {
                "type": "string"
              },
              "type": "array"
            }
          },
          "type": "object"
        },
        "storage": {
          "properties": {
            "class": {
              "type": "string"
            },
            "size": {
              "type": "string"
            }
          },
          "type": "object"
        },
        "sync": {
          "properties": {
            "kind": {
              "enum": [
                "GitRepository",
                "OCIRepository",
                "Bucket"
              ],
              "type": "string"
            },
            "path": {
              "type": "string"
            },
            "pullSecret": {
              "type": "string"
            },
            "ref": {
              "type": "string"
            },
            "url": {
              "type": "string"
            }
          },
          "type": "object"
        }
      },
      "required": [
        "distribution",
        "cluster"
      ],
      "type": "object"
    },
    "nameOverride": {
      "type": "string"
    }
  },
  "type": "object"
}
49
packages/system/fluxcd/charts/flux-instance/values.yaml
Normal file
@@ -0,0 +1,49 @@
# Default values for flux-instance.

nameOverride: ""
fullnameOverride: "flux"

instance:
  # -- Distribution https://fluxcd.control-plane.io/operator/fluxinstance/#distribution-configuration
  distribution: # @schema required: true
    version: "2.x" # @schema required: true
    registry: "ghcr.io/fluxcd" # @schema required: true
    artifact: "oci://ghcr.io/controlplaneio-fluxcd/flux-operator-manifests:latest"
    imagePullSecret: ""
  # -- Components https://fluxcd.control-plane.io/operator/fluxinstance/#components-configuration
  components: # @schema item: string; uniqueItems: true; itemEnum: [source-controller,kustomize-controller,helm-controller,notification-controller,image-reflector-controller,image-automation-controller]
    - source-controller
    - kustomize-controller
    - helm-controller
    - notification-controller
  # -- Cluster https://fluxcd.control-plane.io/operator/fluxinstance/#cluster-configuration
  cluster: # @schema required: true
    type: kubernetes # @schema enum:[kubernetes,openshift,aws,azure,gcp]
    domain: "cluster.local"
    networkPolicy: true
    multitenant: false
    tenantDefaultServiceAccount: "default"
  # -- Storage https://fluxcd.control-plane.io/operator/fluxinstance/#storage-configuration
  storage: # @schema required: false
    class: ""
    size: ""
  # -- Sharding https://fluxcd.control-plane.io/operator/fluxinstance/#sharding-configuration
  sharding: # @schema required: false
    key: "sharding.fluxcd.io/key"
    shards: [] # @schema item: string
  # -- Sync https://fluxcd.control-plane.io/operator/fluxinstance/#sync-configuration
  sync: # @schema required: false
    kind: "GitRepository" # @schema enum:[GitRepository,OCIRepository,Bucket]
    url: ""
    ref: ""
    path: ""
    pullSecret: ""
  kustomize: # @schema required: false
    # -- Kustomize patches https://fluxcd.control-plane.io/operator/fluxinstance/#kustomize-patches
    patches: [] # @schema item: object

# -- Common annotations to add to all deployed objects including pods.
commonAnnotations: { }

# -- Common labels to add to all deployed objects including pods.
commonLabels: { }
@@ -1,25 +0,0 @@
apiVersion: fluxcd.controlplane.io/v1
kind: FluxInstance
metadata:
  name: flux
spec:
  {{- with .Values.cluster }}
  cluster:
    {{- with .networkPolicy }}
    networkPolicy: {{ . }}
    {{- end }}
    {{- with .domain }}
    domain: {{ . }}
    {{- end }}
  {{- end }}
  distribution:
    version: {{ .Values.distribution.version }}
    registry: {{ .Values.distribution.registry }}
  components:
    {{- if .Values.components }}
    {{- toYaml .Values.components | nindent 4 }}
    {{- end }}
  kustomize:
    {{- if .Values.kustomize }}
    {{- toYaml .Values.kustomize | nindent 4 }}
    {{- end }}
@@ -1,47 +1,49 @@
cluster:
|
||||
networkPolicy: true
|
||||
# domain: cozy.local
|
||||
distribution:
|
||||
version: 2.3.x
|
||||
registry: ghcr.io/fluxcd
|
||||
components:
|
||||
- source-controller
|
||||
- kustomize-controller
|
||||
- helm-controller
|
||||
- notification-controller
|
||||
- image-reflector-controller
|
||||
- image-automation-controller
|
||||
kustomize:
|
||||
patches:
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: "(kustomize-controller|helm-controller|source-controller)"
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --concurrent=20
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --requeue-dependency=5s
|
||||
- op: replace
|
||||
path: /spec/template/spec/containers/0/resources/limits
|
||||
value:
|
||||
cpu: 2000m
|
||||
memory: 2048Mi
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: source-controller
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --storage-adv-addr=source-controller.cozy-fluxcd.svc
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: (kustomize-controller|helm-controller|image-reflector-controller|image-automation-controller)
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/
|
||||
flux-instance:
|
||||
instance:
|
||||
cluster:
|
||||
networkPolicy: true
|
||||
domain: cozy.local # -- default value is overridden in patches
|
||||
distribution:
|
||||
version: 2.3.x
|
||||
registry: ghcr.io/fluxcd
|
||||
components:
|
||||
- source-controller
|
||||
- kustomize-controller
|
||||
- helm-controller
|
||||
- notification-controller
|
||||
- image-reflector-controller
|
||||
- image-automation-controller
|
||||
kustomize:
|
||||
patches:
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: "(kustomize-controller|helm-controller|source-controller)"
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --concurrent=20
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --requeue-dependency=5s
|
||||
- op: replace
|
||||
path: /spec/template/spec/containers/0/resources/limits
|
||||
value:
|
||||
cpu: 2000m
|
||||
memory: 2048Mi
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: source-controller
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --storage-adv-addr=source-controller.cozy-fluxcd.svc
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/
|
||||
- target:
|
||||
kind: Deployment
|
||||
name: (kustomize-controller|helm-controller|image-reflector-controller|image-automation-controller)
|
||||
patch: |
|
||||
- op: add
|
||||
path: /spec/template/spec/containers/0/args/-
|
||||
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/
|
||||
|
||||
@@ -6,7 +6,7 @@ ingress-nginx:
|
||||
registry: ghcr.io
|
||||
image: kvaps/ingress-nginx-with-protobuf-exporter/controller
|
||||
tag: v1.11.2
|
||||
digest: sha256:f4194edb06a43c82405167427ebd552b90af9698bd295845418680aebc13f600
|
||||
digest: sha256:e80856ece4e30e9646d65c8d92c25a3446a0bba1c2468cd026f17df9e60d2c0f
|
||||
allowSnippetAnnotations: true
|
||||
replicaCount: 2
|
||||
admissionWebhooks:
|
||||
|
||||
@@ -1 +1,25 @@
|
||||
FROM clastix/kamaji:edge-24.9.2
|
||||
# Build the manager binary
|
||||
FROM golang:1.22 as builder
|
||||
|
||||
ARG VERSION=edge-24.9.2
|
||||
ARG TARGETOS TARGETARCH
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN curl -sSL https://github.com/clastix/kamaji/archive/refs/tags/${VERSION}.tar.gz | tar -xzvf- --strip=1
|
||||
|
||||
COPY patches /patches
|
||||
RUN git apply /patches/disable-datastore-check.diff
|
||||
|
||||
RUN CGO_ENABLED=0 GOOS=linux GOARCH=$TARGETARCH go build \
|
||||
-ldflags "-X github.com/clastix/kamaji/internal.GitRepo=$GIT_REPO -X github.com/clastix/kamaji/internal.GitTag=$GIT_LAST_TAG -X github.com/clastix/kamaji/internal.GitCommit=$GIT_HEAD_COMMIT -X github.com/clastix/kamaji/internal.GitDirty=$GIT_MODIFIED -X github.com/clastix/kamaji/internal.BuildTime=$BUILD_DATE" \
|
||||
-a -o kamaji main.go
|
||||
|
||||
# Use distroless as minimal base image to package the manager binary
|
||||
# Refer to https://github.com/GoogleContainerTools/distroless for more details
|
||||
FROM gcr.io/distroless/static:nonroot
|
||||
WORKDIR /
|
||||
COPY --from=builder /workspace/kamaji .
|
||||
USER 65532:65532
|
||||
|
||||
ENTRYPOINT ["/kamaji"]
|
||||
|
||||
@@ -0,0 +1,23 @@
|
||||
diff --git a/cmd/manager/cmd.go b/cmd/manager/cmd.go
|
||||
index 9a24d4e..a03a4e0 100644
|
||||
--- a/cmd/manager/cmd.go
|
||||
+++ b/cmd/manager/cmd.go
|
||||
@@ -31,7 +31,6 @@ import (
|
||||
"github.com/clastix/kamaji/controllers/soot"
|
||||
"github.com/clastix/kamaji/internal"
|
||||
"github.com/clastix/kamaji/internal/builders/controlplane"
|
||||
- datastoreutils "github.com/clastix/kamaji/internal/datastore/utils"
|
||||
"github.com/clastix/kamaji/internal/webhook"
|
||||
"github.com/clastix/kamaji/internal/webhook/handlers"
|
||||
"github.com/clastix/kamaji/internal/webhook/routes"
|
||||
@@ -80,10 +79,6 @@ func NewCmd(scheme *runtime.Scheme) *cobra.Command {
|
||||
return fmt.Errorf("unable to read webhook CA: %w", err)
|
||||
}
|
||||
|
||||
- if err = datastoreutils.CheckExists(ctx, scheme, datastore); err != nil {
|
||||
- return err
|
||||
- }
|
||||
-
|
||||
if controllerReconcileTimeout.Seconds() == 0 {
|
||||
return fmt.Errorf("the controller reconcile timeout must be greater than zero")
|
||||
}
|
||||
@@ -3,7 +3,7 @@ kamaji:
|
||||
deploy: false
|
||||
image:
|
||||
pullPolicy: IfNotPresent
|
||||
tag: latest@sha256:bb45d953a8ba46a19c8941ccc9fc8498d91435c77db439d8b1d6bde9fea8802a
|
||||
tag: v0.16.5@sha256:95a9658cbbe1cbfbc42b9ab1df4f2a39342d7a8f1ff10a10b81b8656f3744c39
|
||||
repository: ghcr.io/aenix-io/cozystack/kamaji
|
||||
resources:
|
||||
limits:
|
||||
|
||||
@@ -22,4 +22,4 @@ global:
|
||||
images:
|
||||
kubeovn:
|
||||
repository: kubeovn
|
||||
tag: v1.13.0@sha256:11c4ef0f71c73df4703743c0f63b7ff0ec67af6342caf1e7db8ebd5546071855
|
||||
tag: v1.13.0@sha256:ba4e98866295db13d88b10984c230e1cb0db86782767c5b9aff452865cdd1012
|
||||
|
||||
@@ -19,26 +19,3 @@ update:
|
||||
helm repo add fluent https://fluent.github.io/helm-charts
|
||||
helm repo update fluent
|
||||
helm pull fluent/fluent-bit --untar --untardir charts
|
||||
# alerts from victoria-metrics-k8s-stack
|
||||
helm repo add vm https://victoriametrics.github.io/helm-charts/
|
||||
helm repo update vm
|
||||
helm pull vm/victoria-metrics-k8s-stack --untar --untardir charts
|
||||
rm -rf charts/victoria-metrics-k8s-stack/charts
|
||||
rm -rf charts/victoria-metrics-k8s-stack/hack
|
||||
rm -rf charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator
|
||||
rm -rf charts/victoria-metrics-k8s-stack/templates/grafana
|
||||
rm -rf charts/victoria-metrics-k8s-stack/templates/ingress.yaml
|
||||
rm -rf charts/victoria-metrics-k8s-stack/files/dashboards
|
||||
rm -f charts/victoria-metrics-k8s-stack/templates/servicemonitors.yaml
|
||||
rm -f charts/victoria-metrics-k8s-stack/templates/serviceaccount.yaml
|
||||
rm -f charts/victoria-metrics-k8s-stack/templates/rules/additionalVictoriaMetricsRules.yml
|
||||
sed -i '/ namespace:/d' charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml
|
||||
sed -i 's|job="apiserver"|job="kube-apiserver"|g' `grep -rl 'job="apiserver"' charts/victoria-metrics-k8s-stack/files/rules/generated`
|
||||
sed -i 's|severity: info|severity: informational|g' `grep -rl 'severity: info' ./charts/victoria-metrics-k8s-stack/files/rules/generated`
|
||||
sed -i 's|severity: none|severity: ok|g' ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml
|
||||
sed -i ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml \
|
||||
-e '/Watchdog/,/severity:/s/severity: none/severity: ok/' \
|
||||
-e '/InfoInhibitor/,/severity:/s/severity: none/severity: major/'
|
||||
# TODO
|
||||
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml
|
||||
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/vm*.yaml
|
||||
|
||||
221
packages/system/monitoring/alerts/etcd.yaml
Normal file
@@ -0,0 +1,221 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-etcd
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
params: {}
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: |-
|
||||
max without (endpoint) (
|
||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||
without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance
|
||||
}} has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
||||
resources, high network latency, or disruptions by other components and
|
||||
should be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
||||
>= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests
|
||||
for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests
|
||||
for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
|
||||
service: etcd
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
||||
}} method.'
|
||||
summary: etcd grpc requests are slow
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
|
||||
service: etcd
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member communication with
|
||||
{{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.member }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the
|
||||
defined quota on etcd instance {{ $labels.instance }}, please defrag or
|
||||
increase the quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: etcd cluster database is running full.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])
|
||||
/ last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100
|
||||
> 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of
|
||||
disk space in the next four hours, based on write observations within the
|
||||
past four hours on etcd instance {{ $labels.instance }}, please check as
|
||||
it might be disruptive.'
|
||||
summary: etcd cluster database growing very fast.
|
||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h],
|
||||
4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
||||
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
||||
retrieve the unused fragmented disk space.'
|
||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||
summary: etcd database size in use is less than 50% of the actual allocated
|
||||
storage.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) <
|
||||
0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
128
packages/system/monitoring/alerts/flux.yaml
Normal file
@@ -0,0 +1,128 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
annotations:
|
||||
meta.helm.sh/release-name: monitoring
|
||||
meta.helm.sh/release-namespace: cozy-monitoring
|
||||
labels:
|
||||
app: victoria-metrics-k8s-stack
|
||||
app.kubernetes.io/instance: monitoring
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
app.kubernetes.io/name: victoria-metrics-k8s-stack
|
||||
app.kubernetes.io/version: v1.102.1
|
||||
helm.sh/chart: victoria-metrics-k8s-stack-0.25.17
|
||||
name: alerts-flux-resources
|
||||
namespace: cozy-monitoring
|
||||
spec:
|
||||
groups:
|
||||
- name: flux-resources-alerts
|
||||
rules:
|
||||
- alert: HelmReleaseNotReady
|
||||
expr: gotk_resource_info{customresource_kind="HelmRelease", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready"
|
||||
description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is in an unready state for more than 15 minutes."
|
||||
|
||||
- alert: GitRepositorySyncFailed
|
||||
expr: gotk_resource_info{customresource_kind="GitRepository", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
|
||||
description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 15 minutes."
|
||||
|
||||
- alert: KustomizationNotApplied
|
||||
expr: gotk_resource_info{customresource_kind="Kustomization", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not applied"
|
||||
description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not successfully applied for more than 15 minutes."
|
||||
|
||||
- alert: ImageRepositorySyncFailed
|
||||
expr: gotk_resource_info{customresource_kind="ImageRepository", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
|
||||
description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 15 minutes."
|
||||
|
||||
- alert: HelmChartFailed
|
||||
expr: gotk_resource_info{customresource_kind="HelmChart", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has failed"
|
||||
description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready for more than 15 minutes."
|
||||
|
||||
- alert: HelmReleaseSuspended
|
||||
expr: gotk_resource_info{customresource_kind="HelmRelease", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: GitRepositorySuspended
|
||||
expr: gotk_resource_info{customresource_kind="GitRepository", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: KustomizationSuspended
|
||||
expr: gotk_resource_info{customresource_kind="Kustomization", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: ImageRepositorySuspended
|
||||
expr: gotk_resource_info{customresource_kind="ImageRepository", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: HelmChartSuspended
|
||||
expr: gotk_resource_info{customresource_kind="HelmChart", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
57
packages/system/monitoring/alerts/general.rules.yaml
Normal file
@@ -0,0 +1,57 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-general.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: general.rules
|
||||
params: {}
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr: 100 * (count(up == 0) BY (job,namespace,service,cluster) / count(up) BY
|
||||
(job,namespace,service,cluster)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: general.rules
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||
summary: An alert that should always be firing to certify that Alertmanager
|
||||
is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: ok
|
||||
exported_instance: global
|
||||
service: general.rules
|
||||
event: Heartbeat
|
||||
- alert: InfoInhibitor
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert that is used to inhibit info alerts.
|
||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
||||
other alerts.
|
||||
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
|
||||
severity of 'warning' or 'critical' starts firing on the same namespace.
|
||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
||||
summary: Info-level alert inhibition.
|
||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace,cluster) ALERTS{alertname
|
||||
!= "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} ==
|
||||
1
|
||||
labels:
|
||||
severity: major
|
||||
exported_instance: global
|
||||
service: general.rules
|
||||
@@ -0,0 +1,18 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containercpuusagesecondstotal
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_cpu_usage_seconds_total
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,pod,container,cluster) (
|
||||
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
) * on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (
|
||||
1, max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemorycache
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_cache
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemoryrss
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_rss
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemoryswap
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_swap
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemoryworkingsetbytes
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_working_set_bytes
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
@@ -0,0 +1,93 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containerresource
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_resource
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_memory:kube_pod_container_resource_requests:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_cpu:kube_pod_container_resource_requests:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_memory:kube_pod_container_resource_limits:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_cpu:kube_pod_container_resource_limits:sum
|
||||
60
packages/system/monitoring/alerts/k8s.rules.pod_owner.yaml
Normal file
@@ -0,0 +1,60 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.podowner
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.pod_owner
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on (replicaset,namespace,cluster) group_left(owner_name) topk by (replicaset,namespace,cluster) (
|
||||
1, max by (replicaset,namespace,owner_name,cluster) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: job
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
@@ -0,0 +1,146 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kube-apiserver-availability.rules
|
||||
spec:
|
||||
groups:
|
||||
- interval: 3m
|
||||
name: kube-apiserver-availability.rules
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24
|
||||
* 30
|
||||
labels: {}
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
- annotations: {}
|
||||
expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
|
||||
labels:
|
||||
verb: read
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- annotations: {}
|
||||
expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
labels:
|
||||
verb: write
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- annotations: {}
|
||||
expr: sum by (verb,scope,cluster) (increase(apiserver_request_sli_duration_seconds_count{job="kube-apiserver"}[1h]))
|
||||
labels: {}
|
||||
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
|
||||
- annotations: {}
|
||||
expr: sum by (verb,scope,cluster) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d])
|
||||
* 24 * 30)
|
||||
labels: {}
|
||||
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
|
||||
- annotations: {}
|
||||
expr: sum by (verb,scope,le,cluster) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
|
||||
labels: {}
|
||||
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
|
||||
- annotations: {}
|
||||
expr: sum by (verb,scope,le,cluster) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d])
|
||||
* 24 * 30)
|
||||
labels: {}
|
||||
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
1 - (
|
||||
(
|
||||
# write too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
-
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
|
||||
) +
|
||||
(
|
||||
# read too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
|
||||
)
|
||||
) +
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d)
|
||||
labels:
|
||||
verb: all
|
||||
record: apiserver_request:availability30d
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
1 - (
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
|
||||
-
|
||||
(
|
||||
# too slow
|
||||
(
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:availability30d
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
1 - (
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
-
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:availability30d
|
||||
- annotations: {}
|
||||
expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- annotations: {}
|
||||
expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- annotations: {}
|
||||
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
|
||||
labels: {}
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- annotations: {}
|
||||
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
|
||||
labels: {}
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- annotations: {}
|
||||
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
|
||||
labels: {}
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
- annotations: {}
|
||||
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
labels: {}
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
@@ -0,0 +1,324 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kube-apiserver-burnrate.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver-burnrate.rules
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1d
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate1h
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate2h
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate30m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate3d
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
|
||||
)
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
|
||||
labels:
|
||||
verb: read
|
||||
record: apiserver_request:burnrate6h
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1d
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate1h
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate2h
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate30m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate3d
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
|
||||
-
|
||||
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
|
||||
)
|
||||
+
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
labels:
|
||||
verb: write
|
||||
record: apiserver_request:burnrate6h
|
||||
@@ -0,0 +1,23 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-apiserver-histogram.rules
spec:
  groups:
  - name: kube-apiserver-histogram.rules
    params: {}
    rules:
    - annotations: {}
      expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])))
        > 0
      labels:
        quantile: '0.99'
        verb: read
      record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
    - annotations: {}
      expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])))
        > 0
      labels:
        quantile: '0.99'
        verb: write
      record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
packages/system/monitoring/alerts/kube-apiserver-slos.yaml (new file, 73 lines)
@@ -0,0 +1,73 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-apiserver-slos
spec:
  groups:
  - name: kube-apiserver-slos
    params: {}
    rules:
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
        and
        sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
      for: 2m
      labels:
        long: 1h
        severity: critical
        short: 5m
        exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
        service: kube-apiserver-slos
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
        and
        sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
      for: 15m
      labels:
        long: 6h
        severity: critical
        short: 30m
        exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
        service: kube-apiserver-slos
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
        and
        sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
      for: 1h
      labels:
        long: 1d
        severity: warning
        short: 2h
        exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
        service: kube-apiserver-slos
    - alert: KubeAPIErrorBudgetBurn
      annotations:
        description: The API server is burning too much error budget.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
        summary: The API server is burning too much error budget.
      expr: |-
        sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
        and
        sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
      for: 3h
      labels:
        long: 3d
        severity: warning
        short: 6h
        exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
        service: kube-apiserver-slos
@@ -0,0 +1,17 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-prometheus-general.rules
spec:
  groups:
  - name: kube-prometheus-general.rules
    params: {}
    rules:
    - annotations: {}
      expr: count without(instance, pod, node) (up == 1)
      labels: {}
      record: count:up1
    - annotations: {}
      expr: count without(instance, pod, node) (up == 0)
      labels: {}
      record: count:up0
@@ -0,0 +1,37 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-prometheus-node-recording.rules
spec:
  groups:
  - name: kube-prometheus-node-recording.rules
    params: {}
    rules:
    - annotations: {}
      expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
        BY (instance)
      labels: {}
      record: instance:node_cpu:rate:sum
    - annotations: {}
      expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      labels: {}
      record: instance:node_network_receive_bytes:rate:sum
    - annotations: {}
      expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      labels: {}
      record: instance:node_network_transmit_bytes:rate:sum
    - annotations: {}
      expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
        BY (instance, cpu)) BY (instance)
      labels: {}
      record: instance:node_cpu:ratio
    - annotations: {}
      expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
      labels: {}
      record: cluster:node_cpu:sum_rate5m
    - annotations: {}
      expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance,
        cpu))
      labels: {}
      record: cluster:node_cpu:ratio
packages/system/monitoring/alerts/kube-scheduler.rules.yaml (new file, 63 lines)
@@ -0,0 +1,63 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kube-scheduler.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-scheduler.rules
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))
|
||||
without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
packages/system/monitoring/alerts/kube-state-metrics.yaml (new file, 73 lines)
@@ -0,0 +1,73 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kube-state-metrics
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-state-metrics
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate
|
||||
in list operations. This is likely causing it to not be able to expose metrics
|
||||
about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate
|
||||
in watch operations. This is likely causing it to not be able to expose
|
||||
metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
description: kube-state-metrics pods are running with different --total-shards
|
||||
configuration, some Kubernetes objects may be exposed multiple times or
|
||||
not exposed at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
|
||||
summary: kube-state-metrics sharding is misconfigured.
|
||||
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by
|
||||
(cluster) != 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
description: kube-state-metrics shards are missing, some Kubernetes objects
|
||||
are not being exposed.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
|
||||
summary: kube-state-metrics shards are missing.
|
||||
expr: |-
|
||||
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
|
||||
-
|
||||
sum( 2 ^ max by (shard_ordinal,cluster) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
packages/system/monitoring/alerts/kubelet.rules.yaml (new file, 30 lines)
@@ -0,0 +1,30 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kubelet.rules
spec:
  groups:
  - name: kubelet.rules
    params: {}
    rules:
    - annotations: {}
      expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
        metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
        group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: '0.99'
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - annotations: {}
      expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
        metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
        group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: '0.9'
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - annotations: {}
      expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
        metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
        group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: '0.5'
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
packages/system/monitoring/alerts/kubernetes-apps.yaml (new file, 304 lines)
@@ -0,0 +1,304 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-apps
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-apps
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
|
||||
) * on (namespace,pod,cluster) group_left(owner_kind) topk by (namespace,pod,cluster) (
|
||||
1, max by (namespace,pod,owner_kind,cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} does not match, this indicates that the Deployment has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
||||
summary: Deployment generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDeploymentRolloutStuck
|
||||
annotations:
|
||||
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} is not progressing for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
||||
summary: Deployment rollout is not progressing.
|
||||
expr: |-
|
||||
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
|
||||
has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
||||
summary: StatefulSet has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
}} does not match, this indicates that the StatefulSet has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
||||
summary: StatefulSet generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
|
||||
update has not been rolled out.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
expr: |-
|
||||
(
|
||||
max by (namespace,statefulset,cluster) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
|
||||
not finished or progressed for at least 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on
|
||||
container {{ $labels.container}} has been in waiting state for longer than
|
||||
1 hour.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr: sum by (namespace,pod,container,cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||
namespace=~".*"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container
|
||||
}}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
|
||||
more than {{ "43200" | humanizeDuration }} to complete.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
||||
summary: Job did not complete in time
|
||||
expr: |-
|
||||
time() - max by (namespace,job_name,cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
||||
and
|
||||
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.job_name }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
|
||||
complete. Removing failed job after investigation should clear this alert.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.job_name }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
||||
summary: HPA has not matched desired number of replicas.
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler
|
||||
}}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler
|
||||
}}'
|
||||
service: kubernetes-apps
|
||||
packages/system/monitoring/alerts/kubernetes-resources.yaml (new file, 138 lines)
@@ -0,0 +1,138 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-resources
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-resources
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource
|
||||
requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
||||
failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource
|
||||
requests for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.namespace }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.namespace }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}'
|
||||
service: kubernetes-resources
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
|
||||
$labels.pod }}.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container,pod,namespace,cluster)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,cluster)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container
|
||||
}}'
|
||||
service: kubernetes-resources
|
||||
packages/system/monitoring/alerts/kubernetes-storage.yaml (new file, 130 lines)
@@ -0,0 +1,130 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-storage
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-storage
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{
|
||||
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{
|
||||
with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to fill
|
||||
up within four days. Currently {{ $value | humanizePercentage }} is available.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{
|
||||
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{
|
||||
with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to run
|
||||
out of inodes within four days. Currently {{ $value | humanizePercentage
|
||||
}} of its inodes are free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
description: The persistent volume {{ $labels.persistentvolume }} {{ with
|
||||
$labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase
|
||||
}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
||||
summary: PersistentVolume is having issues with provisioning.
|
||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.persistentvolume }}'
|
||||
service: kubernetes-storage
|
||||
@@ -0,0 +1,91 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-system-apiserver
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-apiserver
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: A client certificate used to authenticate to kubernetes apiserver
|
||||
is expiring in less than 7.0 days.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"}
|
||||
> 0 and on (job,cluster) histogram_quantile(0.01, sum by (job,le,cluster)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m])))
|
||||
< 604800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: A client certificate used to authenticate to kubernetes apiserver
|
||||
is expiring in less than 24.0 hours.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"}
|
||||
> 0 and on (job,cluster) histogram_quantile(0.01, sum by (job,le,cluster)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m])))
|
||||
< 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAggregatedAPIErrors
|
||||
annotations:
|
||||
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
|
||||
}} has reported errors. It has appeared unavailable {{ $value | humanize
|
||||
}} times averaged over the past 10m.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
|
||||
summary: Kubernetes aggregated API has reported errors.
|
||||
expr: sum by (name,namespace,cluster)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m]))
|
||||
> 4
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.name }}/{{ $labels.namespace }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAggregatedAPIDown
|
||||
annotations:
|
||||
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
|
||||
}} has been only {{ $value | humanize }}% available over the last 10m.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
|
||||
summary: Kubernetes aggregated API is down.
|
||||
expr: (1 - max by (name,namespace,cluster)(avg_over_time(aggregator_unavailable_apiservice{job="kube-apiserver"}[10m])))
|
||||
* 100 < 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.name }}/{{ $labels.namespace }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
description: KubeAPI has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/apiserver'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAPITerminatedRequests
|
||||
annotations:
|
||||
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage
|
||||
}} of its incoming requests.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
|
||||
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage
|
||||
}} of its incoming requests.
|
||||
expr: sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) /
|
||||
( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m]))
|
||||
) > 0.20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}/apiserver'
|
||||
service: kubernetes-system-apiserver
|
||||
@@ -0,0 +1,21 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kubernetes-system-controller-manager
spec:
  groups:
  - name: kubernetes-system-controller-manager
    params: {}
    rules:
    - alert: KubeControllerManagerDown
      annotations:
        description: KubeControllerManager has disappeared from Prometheus target
          discovery.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown
        summary: Target disappeared from Prometheus target discovery.
      expr: absent(up{job="kube-controller-manager"} == 1)
      for: 15m
      labels:
        severity: critical
        exported_instance: '{{ $labels.instance }}/controller-manager'
        service: kubernetes-system-controller-manager
175
packages/system/monitoring/alerts/kubernetes-system-kubelet.yaml
Normal file
@@ -0,0 +1,175 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system-kubelet
spec:
groups:
- name: kubernetes-system-kubelet
params: {}
rules:
- alert: KubeNodeNotReady
annotations:
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
summary: Node is not ready.
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"}
== 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeNodeUnreachable
annotations:
description: '{{ $labels.node }} is unreachable and some workloads may be
rescheduled.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
summary: Node is unreachable.
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"}
unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"})
== 1
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletTooManyPods
annotations:
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
summary: Kubelet is running at capacity.
expr: |-
count by (node,cluster) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster) group_left(node) topk by (instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by (node,cluster) (
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
) > 0.95
for: 15m
labels:
severity: informational
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeNodeReadinessFlapping
annotations:
description: The readiness status of node {{ $labels.node }} has changed {{
$value }} times in the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m]))
by (node,cluster) > 2
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletPlegDurationHigh
annotations:
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
duration of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"}
>= 10
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",
metrics_path="/metrics"}[5m])) by (instance,le,cluster)) * on (instance,cluster)
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}
> 60
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its client
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m])
> 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletDown
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
@@ -0,0 +1,20 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system-scheduler
spec:
groups:
- name: kubernetes-system-scheduler
params: {}
rules:
- alert: KubeSchedulerDown
annotations:
description: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.scheduler }}'
service: kubernetes-system-scheduler
37
packages/system/monitoring/alerts/kubernetes-system.yaml
Normal file
@@ -0,0 +1,37 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system
spec:
groups:
- name: kubernetes-system
params: {}
rules:
- alert: KubeVersionMismatch
annotations:
description: There are {{ $value }} different semantic versions of Kubernetes
components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: count by (cluster) (count by (git_version,cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*")))
> 1
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.cluster }}'
service: kubernetes-system
- alert: KubeClientErrors
annotations:
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: |-
(sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (instance,job,namespace,cluster)
/
sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (instance,job,namespace,cluster))
> 0.01
for: 15m
labels:
severity: warning
service: kubernetes-system
93
packages/system/monitoring/alerts/node-exporter.rules.yaml
Normal file
@@ -0,0 +1,93 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node-exporter.rules
spec:
groups:
- name: node-exporter.rules
params: {}
rules:
- annotations: {}
expr: |-
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
labels: {}
record: instance:node_num_cpu:sum
- annotations: {}
expr: |-
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
labels: {}
record: instance:node_cpu_utilisation:rate5m
- annotations: {}
expr: |-
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
labels: {}
record: instance:node_load1_per_cpu:ratio
- annotations: {}
expr: |-
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
labels: {}
record: instance:node_memory_utilisation:ratio
- annotations: {}
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
labels: {}
record: instance:node_vmstat_pgmajfault:rate5m
- annotations: {}
expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
labels: {}
record: instance_device:node_disk_io_time_seconds:rate5m
- annotations: {}
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
labels: {}
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_receive_drop_excluding_lo:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_transmit_drop_excluding_lo:rate5m
396
packages/system/monitoring/alerts/node-exporter.yaml
Normal file
@@ -0,0 +1,396 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-node-exporter
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter
|
||||
params: {}
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.node }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.node }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.node }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack entries are getting close to the limit.
|
||||
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||
> 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector on {{ $labels.node }} failed
|
||||
to scrape.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description: Clock at {{ $labels.node }} is out of sync by more than 0.05s.
|
||||
Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |-
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description: Clock at {{ $labels.node }} is not synchronising. Ensure
|
||||
NTP is configured on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |-
|
||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description: RAID array '{{ $labels.device }}' at {{ $labels.node }} is
|
||||
in degraded state due to one or more disk failures. Number of spare drives
|
||||
is insufficient to fix issue automatically.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||
summary: RAID Array is degraded.
|
||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description: At least one device in RAID array at {{ $labels.node }} failed.
|
||||
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array.
|
||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
> 0
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.node }} is currently
|
||||
at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.node }} is currently
|
||||
at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeCPUHighUsage
|
||||
annotations:
|
||||
description: |
|
||||
CPU usage at {{ $labels.node }} has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
||||
summary: High CPU usage.
|
||||
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||
mode!="idle"}[2m]))) * 100 > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeSystemSaturation
|
||||
annotations:
|
||||
description: |
|
||||
System load per core at {{ $labels.node }} has been above 2 for the last 15 minutes and is currently at {{ printf "%.2f" $value }}.
This might indicate resource saturation on this instance and can cause it to become unresponsive.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
|
||||
summary: System saturated, load per core is very high.
|
||||
expr: |-
|
||||
node_load1{job="node-exporter"}
|
||||
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeMemoryMajorPagesFaults
|
||||
annotations:
|
||||
description: |
|
||||
Major page faults are occurring at a very high rate at {{ $labels.node }}: above 500 major page faults per second for the last 15 minutes, currently at {{ printf "%.2f" $value }}.
Please check that there is enough memory available on this instance.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
|
||||
summary: Memory major page faults are occurring at very high rate.
|
||||
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeMemoryHighUtilization
|
||||
annotations:
|
||||
description: |
|
||||
Memory is filling up at {{ $labels.node }}: usage has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
||||
summary: Host is running out of memory.
|
||||
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
* 100) > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: node-exporter
|
||||
- alert: NodeDiskIOSaturation
|
||||
annotations:
|
||||
description: |
|
||||
Disk IO queue (aqu-sz) is high on {{ $labels.device }} at {{ $labels.node }}: it has been above 10 for the last 30 minutes and is currently at {{ printf "%.2f" $value }}.
|
||||
This symptom might indicate disk saturation.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
||||
summary: Disk IO queue is high.
|
||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
> 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeSystemdServiceFailed
|
||||
annotations:
|
||||
description: Systemd service {{ $labels.name }} has entered failed state at
|
||||
{{ $labels.node }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
||||
summary: Systemd service has entered failed state.
|
||||
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.name }}'
|
||||
service: node-exporter
|
||||
- alert: NodeBondingDegraded
|
||||
annotations:
|
||||
description: Bonding interface {{ $labels.master }} on {{ $labels.node
|
||||
}} is in degraded state due to one or more slave failures.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
||||
summary: Bonding interface is degraded
|
||||
expr: (node_bonding_slaves - node_bonding_active) != 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}/{{ $labels.master }}'
|
||||
service: node-exporter
|
||||
21
packages/system/monitoring/alerts/node-network.yaml
Normal file
@@ -0,0 +1,21 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node-network
spec:
groups:
- name: node-network
params: {}
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: Network interface "{{ $labels.device }}" is changing its up status
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
summary: Network interface is often changing its status.
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-network
55
packages/system/monitoring/alerts/node.rules.yaml
Normal file
@@ -0,0 +1,55 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node.rules
spec:
groups:
- name: node.rules
params: {}
rules:
- annotations: {}
expr: |-
topk by (namespace,pod,cluster) (1,
max by (node,namespace,pod,cluster) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
labels: {}
record: 'node_namespace_pod:kube_pod_info:'
- annotations: {}
expr: |-
count by (node,cluster) (
node_cpu_seconds_total{mode="idle",job="node-exporter"}
* on (namespace,pod,cluster) group_left(node)
topk by (namespace,pod,cluster) (1, node_namespace_pod:kube_pod_info:)
)
labels: {}
record: node:node_num_cpu:sum
- annotations: {}
expr: |-
sum(
node_memory_MemAvailable_bytes{job="node-exporter"} or
(
node_memory_Buffers_bytes{job="node-exporter"} +
node_memory_Cached_bytes{job="node-exporter"} +
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
) by (cluster)
labels: {}
record: :node_memory_MemAvailable_bytes:sum
- annotations: {}
expr: |-
avg by (node,cluster) (
sum without (mode) (
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
)
)
labels: {}
record: node:node_cpu_utilization:ratio_rate5m
- annotations: {}
expr: |-
avg by (cluster) (
node:node_cpu_utilization:ratio_rate5m
)
labels: {}
record: cluster:node_cpu:ratio_rate5m
@@ -1,3 +0,0 @@
apiVersion: v1
name: alerts
version: v0.0.0
@@ -1,48 +0,0 @@
groups:
- name: blackbox-exporter
rules:
- alert: HttpProbeFailed
annotations:
message: Probing the blackbox-exporter target {{ $labels.instance }} failed.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobefailed
expr: probe_success != 1
for: 5m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter

- alert: HttpProbeSlow
annotations:
message: '{{ $labels.instance }} takes {{ $value }} seconds to respond.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobeslow
expr: sum by (instance) (probe_http_duration_seconds) > 3
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter
runbook:
steps:
- Check the target system's resource usage for anomalies.
- Check if the target application has been recently rescheduled and is still settling.

- alert: HttpCertExpiresSoon
annotations:
message: The certificate for {{ $labels.instance }} expires in less than 3 days.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiressoon
expr: probe_ssl_earliest_cert_expiry - time() < 3*24*3600
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter

- alert: HttpCertExpiresVerySoon
annotations:
message: The certificate for {{ $labels.instance }} expires in less than 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiresverysoon
expr: probe_ssl_earliest_cert_expiry - time() < 24*3600
labels:
severity: critical
resource: '{{ $labels.instance }}'
service: blackbox-exporter
Some files were not shown because too many files have changed in this diff.