Compare commits

..

1 Commit

Author SHA1 Message Date
Andrei Kvapil
1b0a7eacca Add alerts from kubermatic
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
2024-09-27 17:29:02 +02:00
70 changed files with 1543 additions and 742 deletions

View File

@@ -114,7 +114,7 @@ machine:
- name: zfs
- name: spl
install:
image: ghcr.io/aenix-io/cozystack/talos:v1.8.0
image: ghcr.io/aenix-io/cozystack/talos:v1.7.1
files:
- content: |
[plugins]

View File

@@ -68,7 +68,7 @@ spec:
serviceAccountName: cozystack
containers:
- name: cozystack
image: "ghcr.io/aenix-io/cozystack/cozystack:v0.16.0"
image: "ghcr.io/aenix-io/cozystack/cozystack:v0.15.0"
env:
- name: KUBERNETES_SERVICE_HOST
value: localhost
@@ -87,7 +87,7 @@ spec:
fieldRef:
fieldPath: metadata.name
- name: darkhttpd
image: "ghcr.io/aenix-io/cozystack/cozystack:v0.16.0"
image: "ghcr.io/aenix-io/cozystack/cozystack:v0.15.0"
command:
- /usr/bin/darkhttpd
- /cozystack/assets

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/postgres-backup:0.7.0@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1
ghcr.io/aenix-io/cozystack/postgres-backup:0.6.2@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/nginx-cache:0.3.1@sha256:cd744b2d1d50191f4908f2db83079b32973d1c009fe9468627be72efbfa0a107
ghcr.io/aenix-io/cozystack/nginx-cache:0.3.1@sha256:556bc8d29ee9e90b3d64d0481dcfc66483d055803315bba3d9ece17c0d97f32b

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/cluster-autoscaler:0.11.0@sha256:7f617de5a24de790a15d9e97c6287ff2b390922e6e74c7a665cbf498f634514d
ghcr.io/aenix-io/cozystack/cluster-autoscaler:latest@sha256:7f617de5a24de790a15d9e97c6287ff2b390922e6e74c7a665cbf498f634514d

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/kubevirt-cloud-provider:0.11.0@sha256:ba567212f9fe5e7c32af9857edd79eb012f3eb39c2eae0fc831b14d5b7879427
ghcr.io/aenix-io/cozystack/kubevirt-cloud-provider:latest@sha256:735aa8092501fc0f2904b685b15bc0137ea294cb08301ca1185d3dec5f467f0f

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/kubevirt-csi-driver:0.11.0@sha256:1a9e6592fc035dbaae27f308b934206858c2e0025d4c99cd906b51615cc9766c
ghcr.io/aenix-io/cozystack/kubevirt-csi-driver:latest@sha256:e56b46591cdf9140e97c3220a0c2681aadd4a4b3f7ea8473fb2504dc96e8b53a

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/ubuntu-container-disk:v1.30.1@sha256:1f249fbe52821a62f706c6038b13401234e1b758ac498e53395b8f9a642b015f
ghcr.io/aenix-io/cozystack/ubuntu-container-disk:v1.30.1@sha256:5ce80a453073c4f44347409133fc7b15f1d2f37a564d189871a4082fc552ff0f

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/mariadb-backup:0.5.1@sha256:793edb25a29cbc00781e40af883815ca36937e736e2b0d202ea9c9619fb6ca11
ghcr.io/aenix-io/cozystack/mariadb-backup:0.5.1@sha256:fa2b3195521cffa55eb6d71a50b875d3c234a45e5dff71b2b9002674175bea93

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/postgres-backup:0.7.0@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1
ghcr.io/aenix-io/cozystack/postgres-backup:0.6.2@sha256:d2015c6dba92293bda652d055e97d1be80e8414c2dc78037c12812d1a2e2cba1

View File

@@ -10,9 +10,7 @@ spec:
postgresql:
parameters:
max_wal_senders: "30"
{{- with .Values.postgresql.parameters.max_connections }}
max_connections: "{{ . }}"
{{- end }}
max_connections: "{{ .Values.postgresql.parameters.max_connections }}"
minSyncReplicas: {{ .Values.quorum.minSyncReplicas }}
maxSyncReplicas: {{ .Values.quorum.maxSyncReplicas }}
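One side of this hunk wraps `max_connections` in a `{{- with }}` guard so the key is only rendered when a value is supplied, while the other side emits it unconditionally. A minimal sketch of how the guarded variant behaves, assuming the values layout from the values.yaml hunk further down:

```yaml
# values sketch (keys mirror the chart's values.yaml shown below)
postgresql:
  parameters:
    max_connections: 100   # the template quotes the value, so 100 and "100" render identically
---
# rendered parameters fragment when the value is set
parameters:
  max_wal_senders: "30"
  max_connections: "100"
---
# when the value is omitted, the `with` guard skips the key entirely,
# leaving the PostgreSQL default in effect
```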

View File

@@ -29,9 +29,9 @@
"type": "object",
"properties": {
"max_connections": {
"type": "number",
"type": "string",
"description": "Determines the maximum number of concurrent connections to the database server. The default is typically 100 connections",
"default": 100
"default": "100"
}
}
}
@@ -103,4 +103,4 @@
}
}
}
}
}

View File

@@ -14,7 +14,7 @@ storageClass: ""
## @param postgresql.parameters.max_connections Determines the maximum number of concurrent connections to the database server. The default is typically 100 connections
postgresql:
parameters:
max_connections: 100
max_connections: "100"
## Configuration for the quorum-based synchronous replication
## @param quorum.minSyncReplicas Minimum number of synchronous replicas that must acknowledge a transaction before it is considered committed.

View File

@@ -16,7 +16,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.4.2
version: 0.4.1
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

View File

@@ -47,7 +47,7 @@ metadata:
config: '{{ printf "%s %s" $user $password | sha256sum }}'
spec:
importCredentialsSecret:
name: {{ $.Release.Name }}-{{ kebabcase $user }}-credentials
name: {{ $.Release.Name }}-{{ $user }}-credentials
rabbitmqClusterReference:
name: {{ $.Release.Name }}
---

View File

@@ -31,8 +31,7 @@ kubernetes 0.8.0 ac11056e
kubernetes 0.8.1 e54608d8
kubernetes 0.8.2 5ca8823
kubernetes 0.9.0 9b6dd19
kubernetes 0.10.0 ac5c38b
kubernetes 0.11.0 HEAD
kubernetes 0.10.0 HEAD
mysql 0.1.0 f642698
mysql 0.2.0 8b975ff0
mysql 0.3.0 5ca8823
@@ -49,14 +48,12 @@ postgres 0.4.0 ec283c33
postgres 0.4.1 5ca8823
postgres 0.5.0 c07c4bbd
postgres 0.6.0 2a4768a
postgres 0.6.2 54fd61c
postgres 0.7.0 HEAD
postgres 0.6.2 HEAD
rabbitmq 0.1.0 f642698
rabbitmq 0.2.0 5ca8823
rabbitmq 0.3.0 9e33dc0
rabbitmq 0.4.0 36d8855
rabbitmq 0.4.1 35536bb
rabbitmq 0.4.2 HEAD
rabbitmq 0.4.1 HEAD
redis 0.1.1 f642698
redis 0.2.0 5ca8823
redis 0.3.0 HEAD

View File

@@ -3,24 +3,24 @@
arch: amd64
platform: metal
secureboot: false
version: v1.8.0
version: v1.7.6
input:
kernel:
path: /usr/install/amd64/vmlinuz
initramfs:
path: /usr/install/amd64/initramfs.xz
baseInstaller:
imageRef: ghcr.io/siderolabs/installer:v1.8.0
imageRef: ghcr.io/siderolabs/installer:v1.7.6
systemExtensions:
- imageRef: ghcr.io/siderolabs/amd-ucode:20240909
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240909
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240909
- imageRef: ghcr.io/siderolabs/i915-ucode:20240909
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240909
- imageRef: ghcr.io/siderolabs/intel-ucode:20240910
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240909
- imageRef: ghcr.io/siderolabs/drbd:9.2.11-v1.8.0
- imageRef: ghcr.io/siderolabs/zfs:2.2.6-v1.8.0
- imageRef: ghcr.io/siderolabs/amd-ucode:20240811
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240811
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240811
- imageRef: ghcr.io/siderolabs/i915-ucode:20240811
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240811
- imageRef: ghcr.io/siderolabs/intel-ucode:20240813
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240811
- imageRef: ghcr.io/siderolabs/drbd:9.2.8-v1.7.6
- imageRef: ghcr.io/siderolabs/zfs:2.2.4-v1.7.6
output:
kind: initramfs
imageOptions: {}

View File

@@ -3,24 +3,24 @@
arch: amd64
platform: metal
secureboot: false
version: v1.8.0
version: v1.7.6
input:
kernel:
path: /usr/install/amd64/vmlinuz
initramfs:
path: /usr/install/amd64/initramfs.xz
baseInstaller:
imageRef: ghcr.io/siderolabs/installer:v1.8.0
imageRef: ghcr.io/siderolabs/installer:v1.7.6
systemExtensions:
- imageRef: ghcr.io/siderolabs/amd-ucode:20240909
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240909
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240909
- imageRef: ghcr.io/siderolabs/i915-ucode:20240909
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240909
- imageRef: ghcr.io/siderolabs/intel-ucode:20240910
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240909
- imageRef: ghcr.io/siderolabs/drbd:9.2.11-v1.8.0
- imageRef: ghcr.io/siderolabs/zfs:2.2.6-v1.8.0
- imageRef: ghcr.io/siderolabs/amd-ucode:20240811
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240811
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240811
- imageRef: ghcr.io/siderolabs/i915-ucode:20240811
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240811
- imageRef: ghcr.io/siderolabs/intel-ucode:20240813
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240811
- imageRef: ghcr.io/siderolabs/drbd:9.2.8-v1.7.6
- imageRef: ghcr.io/siderolabs/zfs:2.2.4-v1.7.6
output:
kind: installer
imageOptions: {}

View File

@@ -3,24 +3,24 @@
arch: amd64
platform: metal
secureboot: false
version: v1.8.0
version: v1.7.6
input:
kernel:
path: /usr/install/amd64/vmlinuz
initramfs:
path: /usr/install/amd64/initramfs.xz
baseInstaller:
imageRef: ghcr.io/siderolabs/installer:v1.8.0
imageRef: ghcr.io/siderolabs/installer:v1.7.6
systemExtensions:
- imageRef: ghcr.io/siderolabs/amd-ucode:20240909
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240909
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240909
- imageRef: ghcr.io/siderolabs/i915-ucode:20240909
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240909
- imageRef: ghcr.io/siderolabs/intel-ucode:20240910
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240909
- imageRef: ghcr.io/siderolabs/drbd:9.2.11-v1.8.0
- imageRef: ghcr.io/siderolabs/zfs:2.2.6-v1.8.0
- imageRef: ghcr.io/siderolabs/amd-ucode:20240811
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240811
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240811
- imageRef: ghcr.io/siderolabs/i915-ucode:20240811
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240811
- imageRef: ghcr.io/siderolabs/intel-ucode:20240813
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240811
- imageRef: ghcr.io/siderolabs/drbd:9.2.8-v1.7.6
- imageRef: ghcr.io/siderolabs/zfs:2.2.4-v1.7.6
output:
kind: iso
imageOptions: {}

View File

@@ -3,24 +3,24 @@
arch: amd64
platform: metal
secureboot: false
version: v1.8.0
version: v1.7.6
input:
kernel:
path: /usr/install/amd64/vmlinuz
initramfs:
path: /usr/install/amd64/initramfs.xz
baseInstaller:
imageRef: ghcr.io/siderolabs/installer:v1.8.0
imageRef: ghcr.io/siderolabs/installer:v1.7.6
systemExtensions:
- imageRef: ghcr.io/siderolabs/amd-ucode:20240909
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240909
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240909
- imageRef: ghcr.io/siderolabs/i915-ucode:20240909
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240909
- imageRef: ghcr.io/siderolabs/intel-ucode:20240910
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240909
- imageRef: ghcr.io/siderolabs/drbd:9.2.11-v1.8.0
- imageRef: ghcr.io/siderolabs/zfs:2.2.6-v1.8.0
- imageRef: ghcr.io/siderolabs/amd-ucode:20240811
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240811
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240811
- imageRef: ghcr.io/siderolabs/i915-ucode:20240811
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240811
- imageRef: ghcr.io/siderolabs/intel-ucode:20240813
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240811
- imageRef: ghcr.io/siderolabs/drbd:9.2.8-v1.7.6
- imageRef: ghcr.io/siderolabs/zfs:2.2.4-v1.7.6
output:
kind: kernel
imageOptions: {}

View File

@@ -3,24 +3,24 @@
arch: amd64
platform: metal
secureboot: false
version: v1.8.0
version: v1.7.6
input:
kernel:
path: /usr/install/amd64/vmlinuz
initramfs:
path: /usr/install/amd64/initramfs.xz
baseInstaller:
imageRef: ghcr.io/siderolabs/installer:v1.8.0
imageRef: ghcr.io/siderolabs/installer:v1.7.6
systemExtensions:
- imageRef: ghcr.io/siderolabs/amd-ucode:20240909
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240909
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240909
- imageRef: ghcr.io/siderolabs/i915-ucode:20240909
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240909
- imageRef: ghcr.io/siderolabs/intel-ucode:20240910
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240909
- imageRef: ghcr.io/siderolabs/drbd:9.2.11-v1.8.0
- imageRef: ghcr.io/siderolabs/zfs:2.2.6-v1.8.0
- imageRef: ghcr.io/siderolabs/amd-ucode:20240811
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240811
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240811
- imageRef: ghcr.io/siderolabs/i915-ucode:20240811
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240811
- imageRef: ghcr.io/siderolabs/intel-ucode:20240813
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240811
- imageRef: ghcr.io/siderolabs/drbd:9.2.8-v1.7.6
- imageRef: ghcr.io/siderolabs/zfs:2.2.4-v1.7.6
output:
kind: image
imageOptions: { diskSize: 1306525696, diskFormat: raw }

View File

@@ -3,24 +3,24 @@
arch: amd64
platform: nocloud
secureboot: false
version: v1.8.0
version: v1.7.6
input:
kernel:
path: /usr/install/amd64/vmlinuz
initramfs:
path: /usr/install/amd64/initramfs.xz
baseInstaller:
imageRef: ghcr.io/siderolabs/installer:v1.8.0
imageRef: ghcr.io/siderolabs/installer:v1.7.6
systemExtensions:
- imageRef: ghcr.io/siderolabs/amd-ucode:20240909
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240909
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240909
- imageRef: ghcr.io/siderolabs/i915-ucode:20240909
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240909
- imageRef: ghcr.io/siderolabs/intel-ucode:20240910
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240909
- imageRef: ghcr.io/siderolabs/drbd:9.2.11-v1.8.0
- imageRef: ghcr.io/siderolabs/zfs:2.2.6-v1.8.0
- imageRef: ghcr.io/siderolabs/amd-ucode:20240811
- imageRef: ghcr.io/siderolabs/amdgpu-firmware:20240811
- imageRef: ghcr.io/siderolabs/bnx2-bnx2x:20240811
- imageRef: ghcr.io/siderolabs/i915-ucode:20240811
- imageRef: ghcr.io/siderolabs/intel-ice-firmware:20240811
- imageRef: ghcr.io/siderolabs/intel-ucode:20240813
- imageRef: ghcr.io/siderolabs/qlogic-firmware:20240811
- imageRef: ghcr.io/siderolabs/drbd:9.2.8-v1.7.6
- imageRef: ghcr.io/siderolabs/zfs:2.2.4-v1.7.6
output:
kind: image
imageOptions: { diskSize: 1306525696, diskFormat: raw }

View File

@@ -1,2 +1,2 @@
cozystack:
image: ghcr.io/aenix-io/cozystack/cozystack:v0.16.0@sha256:1119f30a50b3fea1ac7d8068009ca233df6214d709c7861f7ce8fbf0402cdc72
image: ghcr.io/aenix-io/cozystack/cozystack:v0.15.0@sha256:aeff26a80f84b4323578e613b3bf03caa842d617ec8d9ca98706867c1e70609f

View File

@@ -1,2 +1,2 @@
e2e:
image: ghcr.io/aenix-io/cozystack/e2e-sandbox:v0.16.0@sha256:25b298d621ec79431d106184d59849bbae634588742583d111628126ad8615c5
image: ghcr.io/aenix-io/cozystack/e2e-sandbox:v0.15.0@sha256:20cc84e4a11db31434881355c070113a7823501a28a6114ca02830b18607ad21

View File

@@ -3,4 +3,4 @@ name: monitoring
description: Monitoring and observability stack
icon: /logos/monitoring.svg
type: application
version: 1.5.0
version: 1.4.0

View File

@@ -12,7 +12,6 @@ monitoring 1.1.0 15478a88
monitoring 1.2.0 c9e0d63b
monitoring 1.2.1 4471b4ba
monitoring 1.3.0 6c5cf5b
monitoring 1.4.0 adaf603b
monitoring 1.5.0 HEAD
monitoring 1.4.0 HEAD
seaweedfs 0.1.0 5ca8823
seaweedfs 0.2.0 HEAD

View File

@@ -79,7 +79,7 @@ annotations:
Pod IP Pool\n description: |\n CiliumPodIPPool defines an IP pool that can
be used for pooled IPAM (i.e. the multi-pool IPAM mode).\n"
apiVersion: v2
appVersion: 1.16.2
appVersion: 1.16.1
description: eBPF-based Networking, Security, and Observability
home: https://cilium.io/
icon: https://cdn.jsdelivr.net/gh/cilium/cilium@main/Documentation/images/logo-solo.svg
@@ -95,4 +95,4 @@ kubeVersion: '>= 1.21.0-0'
name: cilium
sources:
- https://github.com/cilium/cilium
version: 1.16.2
version: 1.16.1

View File

@@ -1,6 +1,6 @@
# cilium
![Version: 1.16.2](https://img.shields.io/badge/Version-1.16.2-informational?style=flat-square) ![AppVersion: 1.16.2](https://img.shields.io/badge/AppVersion-1.16.2-informational?style=flat-square)
![Version: 1.16.1](https://img.shields.io/badge/Version-1.16.1-informational?style=flat-square) ![AppVersion: 1.16.1](https://img.shields.io/badge/AppVersion-1.16.1-informational?style=flat-square)
Cilium is open source software for providing and transparently securing
network connectivity and loadbalancing between application workloads such as
@@ -83,7 +83,7 @@ contributors across the globe, there is almost always someone available to help.
| authentication.mutual.spire.install.agent.tolerations | list | `[{"effect":"NoSchedule","key":"node.kubernetes.io/not-ready"},{"effect":"NoSchedule","key":"node-role.kubernetes.io/master"},{"effect":"NoSchedule","key":"node-role.kubernetes.io/control-plane"},{"effect":"NoSchedule","key":"node.cloudprovider.kubernetes.io/uninitialized","value":"true"},{"key":"CriticalAddonsOnly","operator":"Exists"}]` | SPIRE agent tolerations configuration By default it follows the same tolerations as the agent itself to allow the Cilium agent on this node to connect to SPIRE. ref: https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ |
| authentication.mutual.spire.install.enabled | bool | `true` | Enable SPIRE installation. This will only take effect only if authentication.mutual.spire.enabled is true |
| authentication.mutual.spire.install.existingNamespace | bool | `false` | SPIRE namespace already exists. Set to true if Helm should not create, manage, and import the SPIRE namespace. |
| authentication.mutual.spire.install.initImage | object | `{"digest":"sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140","override":null,"pullPolicy":"IfNotPresent","repository":"docker.io/library/busybox","tag":"1.36.1","useDigest":true}` | init container image of SPIRE agent and server |
| authentication.mutual.spire.install.initImage | object | `{"digest":"sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7","override":null,"pullPolicy":"IfNotPresent","repository":"docker.io/library/busybox","tag":"1.36.1","useDigest":true}` | init container image of SPIRE agent and server |
| authentication.mutual.spire.install.namespace | string | `"cilium-spire"` | SPIRE namespace to install into |
| authentication.mutual.spire.install.server.affinity | object | `{}` | SPIRE server affinity configuration |
| authentication.mutual.spire.install.server.annotations | object | `{}` | SPIRE server annotations |
@@ -182,7 +182,7 @@ contributors across the globe, there is almost always someone available to help.
| clustermesh.apiserver.extraVolumeMounts | list | `[]` | Additional clustermesh-apiserver volumeMounts. |
| clustermesh.apiserver.extraVolumes | list | `[]` | Additional clustermesh-apiserver volumes. |
| clustermesh.apiserver.healthPort | int | `9880` | TCP port for the clustermesh-apiserver health API. |
| clustermesh.apiserver.image | object | `{"digest":"sha256:cc84190fed92e03a2b3a33bc670b2447b521ee258ad9b076baaad13be312ea73","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/clustermesh-apiserver","tag":"v1.16.2","useDigest":true}` | Clustermesh API server image. |
| clustermesh.apiserver.image | object | `{"digest":"sha256:e9c77417cd474cc943b2303a76c5cf584ac7024dd513ebb8d608cb62fe28896f","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/clustermesh-apiserver","tag":"v1.16.1","useDigest":true}` | Clustermesh API server image. |
| clustermesh.apiserver.kvstoremesh.enabled | bool | `true` | Enable KVStoreMesh. KVStoreMesh caches the information retrieved from the remote clusters in the local etcd instance. |
| clustermesh.apiserver.kvstoremesh.extraArgs | list | `[]` | Additional KVStoreMesh arguments. |
| clustermesh.apiserver.kvstoremesh.extraEnv | list | `[]` | Additional KVStoreMesh environment variables. |
@@ -353,7 +353,7 @@ contributors across the globe, there is almost always someone available to help.
| envoy.extraVolumes | list | `[]` | Additional envoy volumes. |
| envoy.healthPort | int | `9878` | TCP port for the health API. |
| envoy.idleTimeoutDurationSeconds | int | `60` | Set Envoy upstream HTTP idle connection timeout seconds. Does not apply to connections with pending requests. Default 60s |
| envoy.image | object | `{"digest":"sha256:9762041c3760de226a8b00cc12f27dacc28b7691ea926748f9b5c18862db503f","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium-envoy","tag":"v1.29.9-1726784081-a90146d13b4cd7d168d573396ccf2b3db5a3b047","useDigest":true}` | Envoy container image. |
| envoy.image | object | `{"digest":"sha256:bd5ff8c66716080028f414ec1cb4f7dc66f40d2fb5a009fff187f4a9b90b566b","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium-envoy","tag":"v1.29.7-39a2a56bbd5b3a591f69dbca51d3e30ef97e0e51","useDigest":true}` | Envoy container image. |
| envoy.livenessProbe.failureThreshold | int | `10` | failure threshold of liveness probe |
| envoy.livenessProbe.periodSeconds | int | `30` | interval between checks of the liveness probe |
| envoy.log.format | string | `"[%Y-%m-%d %T.%e][%t][%l][%n] [%g:%#] %v"` | The format string to use for laying out the log message metadata of Envoy. |
@@ -484,7 +484,7 @@ contributors across the globe, there is almost always someone available to help.
| hubble.relay.extraVolumes | list | `[]` | Additional hubble-relay volumes. |
| hubble.relay.gops.enabled | bool | `true` | Enable gops for hubble-relay |
| hubble.relay.gops.port | int | `9893` | Configure gops listen port for hubble-relay |
| hubble.relay.image | object | `{"digest":"sha256:4b559907b378ac18af82541dafab430a857d94f1057f2598645624e6e7ea286c","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/hubble-relay","tag":"v1.16.2","useDigest":true}` | Hubble-relay container image. |
| hubble.relay.image | object | `{"digest":"sha256:2e1b4c739a676ae187d4c2bfc45c3e865bda2567cc0320a90cb666657fcfcc35","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/hubble-relay","tag":"v1.16.1","useDigest":true}` | Hubble-relay container image. |
| hubble.relay.listenHost | string | `""` | Host to listen to. Specify an empty string to bind to all the interfaces. |
| hubble.relay.listenPort | string | `"4245"` | Port to listen to. |
| hubble.relay.nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node labels for pod assignment ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector |
@@ -590,7 +590,7 @@ contributors across the globe, there is almost always someone available to help.
| hubble.ui.updateStrategy | object | `{"rollingUpdate":{"maxUnavailable":1},"type":"RollingUpdate"}` | hubble-ui update strategy. |
| identityAllocationMode | string | `"crd"` | Method to use for identity allocation (`crd` or `kvstore`). |
| identityChangeGracePeriod | string | `"5s"` | Time to wait before using new identity on endpoint identity change. |
| image | object | `{"digest":"sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.2","useDigest":true}` | Agent container image. |
| image | object | `{"digest":"sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.1","useDigest":true}` | Agent container image. |
| imagePullSecrets | list | `[]` | Configure image pull secrets for pulling container images |
| ingressController.default | bool | `false` | Set cilium ingress controller to be the default ingress controller This will let cilium ingress controller route entries without ingress class set |
| ingressController.defaultSecretName | string | `nil` | Default secret name for ingresses without .spec.tls[].secretName set. |
@@ -717,7 +717,7 @@ contributors across the globe, there is almost always someone available to help.
| operator.hostNetwork | bool | `true` | HostNetwork setting |
| operator.identityGCInterval | string | `"15m0s"` | Interval for identity garbage collection. |
| operator.identityHeartbeatTimeout | string | `"30m0s"` | Timeout for identity heartbeats. |
| operator.image | object | `{"alibabacloudDigest":"sha256:16e33abb6b8381e2f66388b6d7141399f06c9b51b9ffa08fd159b8d321929716","awsDigest":"sha256:b6a73ec94407a56cccc8a395225e2aecc3ca3611e7acfeec86201c19fc0727dd","azureDigest":"sha256:fde7cf8bb887e106cd388bb5c3327e92682b2ec3ab4f03bb57b87f495b99f727","genericDigest":"sha256:cccfd3b886d52cb132c06acca8ca559f0fce91a6bd99016219b1a81fdbc4813a","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/operator","suffix":"","tag":"v1.16.2","useDigest":true}` | cilium-operator image. |
| operator.image | object | `{"alibabacloudDigest":"sha256:4381adf48d76ec482551183947e537d44bcac9b6c31a635a9ac63f696d978804","awsDigest":"sha256:e3876fcaf2d6ccc8d5b4aaaded7b1efa971f3f4175eaa2c8a499878d58c39df4","azureDigest":"sha256:e55c222654a44ceb52db7ade3a7b9e8ef05681ff84c14ad1d46fea34869a7a22","genericDigest":"sha256:3bc7e7a43bc4a4d8989cb7936c5d96675dd2d02c306adf925ce0a7c35aa27dc4","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/operator","suffix":"","tag":"v1.16.1","useDigest":true}` | cilium-operator image. |
| operator.nodeGCInterval | string | `"5m0s"` | Interval for cilium node garbage collection. |
| operator.nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node labels for cilium-operator pod assignment ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector |
| operator.podAnnotations | object | `{}` | Annotations to be added to cilium-operator pods |
@@ -767,7 +767,7 @@ contributors across the globe, there is almost always someone available to help.
| preflight.extraEnv | list | `[]` | Additional preflight environment variables. |
| preflight.extraVolumeMounts | list | `[]` | Additional preflight volumeMounts. |
| preflight.extraVolumes | list | `[]` | Additional preflight volumes. |
| preflight.image | object | `{"digest":"sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.2","useDigest":true}` | Cilium pre-flight image. |
| preflight.image | object | `{"digest":"sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39","override":null,"pullPolicy":"IfNotPresent","repository":"quay.io/cilium/cilium","tag":"v1.16.1","useDigest":true}` | Cilium pre-flight image. |
| preflight.nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node labels for preflight pod assignment ref: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector |
| preflight.podAnnotations | object | `{}` | Annotations to be added to preflight pods |
| preflight.podDisruptionBudget.enabled | bool | `false` | enable PodDisruptionBudget ref: https://kubernetes.io/docs/concepts/workloads/pods/disruptions/ |

View File

@@ -26,6 +26,10 @@ spec:
template:
metadata:
annotations:
{{- if and .Values.envoy.prometheus.enabled (not .Values.envoy.prometheus.serviceMonitor.enabled) }}
prometheus.io/port: "{{ .Values.envoy.prometheus.port }}"
prometheus.io/scrape: "true"
{{- end }}
{{- if .Values.envoy.rollOutPods }}
# ensure pods roll when configmap updates
cilium.io/cilium-envoy-configmap-checksum: {{ include (print $.Template.BasePath "/cilium-envoy/configmap.yaml") . | sha256sum | quote }}
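When `envoy.prometheus.enabled` is true and the ServiceMonitor is disabled, this annotation block renders plain scrape hints on the Envoy pods. A sketch of the resulting pod-template annotations (the port number is an assumption taken from the usual chart default, not from this diff):

```yaml
# rendered pod-template annotations (sketch); 9964 is an assumed default for envoy.prometheus.port
annotations:
  prometheus.io/port: "9964"
  prometheus.io/scrape: "true"
```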

View File

@@ -1,33 +0,0 @@
{{- $envoyDS := eq (include "envoyDaemonSetEnabled" .) "true" -}}
{{- if and $envoyDS (not .Values.preflight.enabled) .Values.envoy.prometheus.enabled }}
apiVersion: v1
kind: Service
metadata:
name: cilium-envoy
namespace: {{ .Release.Namespace }}
{{- if or (not .Values.envoy.prometheus.serviceMonitor.enabled) .Values.envoy.annotations }}
annotations:
{{- if not .Values.envoy.prometheus.serviceMonitor.enabled }}
prometheus.io/scrape: "true"
prometheus.io/port: {{ .Values.envoy.prometheus.port | quote }}
{{- end }}
{{- with .Values.envoy.annotations }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- end }}
labels:
k8s-app: cilium-envoy
app.kubernetes.io/name: cilium-envoy
app.kubernetes.io/part-of: cilium
io.cilium/app: proxy
spec:
clusterIP: None
type: ClusterIP
selector:
k8s-app: cilium-envoy
ports:
- name: envoy-metrics
port: {{ .Values.envoy.prometheus.port }}
protocol: TCP
targetPort: envoy-metrics
{{- end }}

View File

@@ -362,7 +362,7 @@ spec:
name: cilium-clustermesh
optional: true
# note: items are not explicitly listed here, since the entries of this secret
# depend on the peers configured, and that would cause a restart of all operators
# depend on the peers configured, and that would cause a restart of all agents
# at every addition/removal. Leaving the field empty makes each secret entry
# to be automatically projected into the volume as a file whose name is the key.
- secret:
@@ -384,28 +384,5 @@ spec:
- key: {{ .Values.tls.caBundle.key }}
path: common-etcd-client-ca.crt
{{- end }}
# note: we configure the volume for the kvstoremesh-specific certificate
# regardless of whether KVStoreMesh is enabled or not, so that it can be
# automatically mounted in case KVStoreMesh gets subsequently enabled,
# without requiring an operator restart.
- secret:
name: clustermesh-apiserver-local-cert
optional: true
items:
- key: tls.key
path: local-etcd-client.key
- key: tls.crt
path: local-etcd-client.crt
{{- if not .Values.tls.caBundle.enabled }}
- key: ca.crt
path: local-etcd-client-ca.crt
{{- else }}
- {{ .Values.tls.caBundle.useSecret | ternary "secret" "configMap" }}:
name: {{ .Values.tls.caBundle.name }}
optional: true
items:
- key: {{ .Values.tls.caBundle.key }}
path: local-etcd-client-ca.crt
{{- end }}
{{- end }}
{{- end }}

View File

@@ -1,47 +1,3 @@
{{/* validate deprecated options are not being used */}}
{{/* Options deprecated in v1.15 and removed in v1.16 */}}
{{- if or
(dig "encryption" "keyFile" "" .Values.AsMap)
(dig "encryption" "mountPath" "" .Values.AsMap)
(dig "encryption" "secretName" "" .Values.AsMap)
(dig "encryption" "interface" "" .Values.AsMap)
}}
{{ fail "encryption.{keyFile,mountPath,secretName,interface} were deprecated in v1.14 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if or
((dig "proxy" "prometheus" "enabled" "" .Values.AsMap) | toString)
(dig "proxy" "prometheus" "port" "" .Values.AsMap)
}}
{{ fail "proxy.prometheus.enabled and proxy.prometheus.port were deprecated in v1.14 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "endpointStatus" "" .Values.AsMap) }}
{{ fail "endpointStatus has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "remoteNodeIdentity" "" .Values.AsMap) }}
{{ fail "remoteNodeIdentity was deprecated in v1.15 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "containerRuntime" "integration" "" .Values.AsMap) }}
{{ fail "containerRuntime.integration was deprecated in v1.14 and has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{- if (dig "etcd" "managed" "" .Values.AsMap) }}
{{ fail "etcd.managed was deprecated in v1.10 has been removed in v1.16. For details please refer to https://docs.cilium.io/en/v1.16/operations/upgrade/#helm-options" }}
{{- end }}
{{/* Options deprecated in v1.14 and removed in v1.15 */}}
{{- if .Values.tunnel }}
{{ fail "tunnel was deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}
{{- if or (dig "clustermesh" "apiserver" "tls" "ca" "cert" "" .Values.AsMap) (dig "clustermesh" "apiserver" "tls" "ca" "key" "" .Values.AsMap) }}
{{ fail "clustermesh.apiserver.tls.ca.cert and clustermesh.apiserver.tls.ca.key were deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}
{{- if .Values.enableK8sEventHandover }}
{{ fail "enableK8sEventHandover was deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}
{{- if .Values.enableCnpStatusUpdates }}
{{ fail "enableCnpStatusUpdates was deprecated in v1.14 and has been removed in v1.15. For details please refer to https://docs.cilium.io/en/v1.15/operations/upgrade/#helm-options" }}
{{- end }}
{{/* validate hubble config */}}
{{- if and .Values.hubble.ui.enabled (not .Values.hubble.ui.standalone.enabled) }}
{{- if not .Values.hubble.relay.enabled }}

View File

@@ -153,10 +153,10 @@ image:
# @schema
override: ~
repository: "quay.io/cilium/cilium"
tag: "v1.16.2"
tag: "v1.16.1"
pullPolicy: "IfNotPresent"
# cilium-digest
digest: "sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea"
digest: "sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39"
useDigest: true
# -- Affinity for cilium-agent.
affinity:
@@ -1309,9 +1309,9 @@ hubble:
# @schema
override: ~
repository: "quay.io/cilium/hubble-relay"
tag: "v1.16.2"
tag: "v1.16.1"
# hubble-relay-digest
digest: "sha256:4b559907b378ac18af82541dafab430a857d94f1057f2598645624e6e7ea286c"
digest: "sha256:2e1b4c739a676ae187d4c2bfc45c3e865bda2567cc0320a90cb666657fcfcc35"
useDigest: true
pullPolicy: "IfNotPresent"
# -- Specifies the resources for the hubble-relay pods
@@ -2158,9 +2158,9 @@ envoy:
# @schema
override: ~
repository: "quay.io/cilium/cilium-envoy"
tag: "v1.29.9-1726784081-a90146d13b4cd7d168d573396ccf2b3db5a3b047"
tag: "v1.29.7-39a2a56bbd5b3a591f69dbca51d3e30ef97e0e51"
pullPolicy: "IfNotPresent"
digest: "sha256:9762041c3760de226a8b00cc12f27dacc28b7691ea926748f9b5c18862db503f"
digest: "sha256:bd5ff8c66716080028f414ec1cb4f7dc66f40d2fb5a009fff187f4a9b90b566b"
useDigest: true
# -- Additional containers added to the cilium Envoy DaemonSet.
extraContainers: []
@@ -2474,15 +2474,15 @@ operator:
# @schema
override: ~
repository: "quay.io/cilium/operator"
tag: "v1.16.2"
tag: "v1.16.1"
# operator-generic-digest
genericDigest: "sha256:cccfd3b886d52cb132c06acca8ca559f0fce91a6bd99016219b1a81fdbc4813a"
genericDigest: "sha256:3bc7e7a43bc4a4d8989cb7936c5d96675dd2d02c306adf925ce0a7c35aa27dc4"
# operator-azure-digest
azureDigest: "sha256:fde7cf8bb887e106cd388bb5c3327e92682b2ec3ab4f03bb57b87f495b99f727"
azureDigest: "sha256:e55c222654a44ceb52db7ade3a7b9e8ef05681ff84c14ad1d46fea34869a7a22"
# operator-aws-digest
awsDigest: "sha256:b6a73ec94407a56cccc8a395225e2aecc3ca3611e7acfeec86201c19fc0727dd"
awsDigest: "sha256:e3876fcaf2d6ccc8d5b4aaaded7b1efa971f3f4175eaa2c8a499878d58c39df4"
# operator-alibabacloud-digest
alibabacloudDigest: "sha256:16e33abb6b8381e2f66388b6d7141399f06c9b51b9ffa08fd159b8d321929716"
alibabacloudDigest: "sha256:4381adf48d76ec482551183947e537d44bcac9b6c31a635a9ac63f696d978804"
useDigest: true
pullPolicy: "IfNotPresent"
suffix: ""
@@ -2756,9 +2756,9 @@ preflight:
# @schema
override: ~
repository: "quay.io/cilium/cilium"
tag: "v1.16.2"
tag: "v1.16.1"
# cilium-digest
digest: "sha256:4386a8580d8d86934908eea022b0523f812e6a542f30a86a47edd8bed90d51ea"
digest: "sha256:0b4a3ab41a4760d86b7fc945b8783747ba27f29dac30dd434d94f2c9e3679f39"
useDigest: true
pullPolicy: "IfNotPresent"
# -- The priority class to use for the preflight pod.
@@ -2905,9 +2905,9 @@ clustermesh:
# @schema
override: ~
repository: "quay.io/cilium/clustermesh-apiserver"
tag: "v1.16.2"
tag: "v1.16.1"
# clustermesh-apiserver-digest
digest: "sha256:cc84190fed92e03a2b3a33bc670b2447b521ee258ad9b076baaad13be312ea73"
digest: "sha256:e9c77417cd474cc943b2303a76c5cf584ac7024dd513ebb8d608cb62fe28896f"
useDigest: true
pullPolicy: "IfNotPresent"
# -- TCP port for the clustermesh-apiserver health API.
@@ -3406,7 +3406,7 @@ authentication:
override: ~
repository: "docker.io/library/busybox"
tag: "1.36.1"
digest: "sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140"
digest: "sha256:9ae97d36d26566ff84e8893c64a6dc4fe8ca6d1144bf5b87b2b85a32def253c7"
useDigest: true
pullPolicy: "IfNotPresent"
# SPIRE agent configuration

View File

@@ -1,2 +1,2 @@
ARG VERSION=v1.16.2
ARG VERSION=v1.16.1
FROM quay.io/cilium/cilium:${VERSION}

View File

@@ -15,4 +15,4 @@ cilium:
enableIdentityMark: false
enableRuntimeDeviceDetection: true
forceDeviceDetection: true
devices: "ovn0 genev_sys_6081"
devices: ovn0

View File

@@ -12,7 +12,7 @@ cilium:
mode: "kubernetes"
image:
repository: ghcr.io/aenix-io/cozystack/cilium
tag: 1.16.2
digest: "sha256:534c5b04fef356a6be59234243c23c0c09702fe1e2c8872012afb391ce2965c4"
tag: 1.16.1
digest: "sha256:9593dbc3bd25487b52d8f43330d4a308e450605479a8384a32117e9613289892"
envoy:
enabled: false

View File

@@ -33,11 +33,11 @@ kubeapps:
image:
registry: ghcr.io/aenix-io/cozystack
repository: dashboard
tag: v0.16.0
tag: v0.15.0
digest: "sha256:4818712e9fc9c57cc321512760c3226af564a04e69d4b3ec9229ab91fd39abeb"
kubeappsapis:
image:
registry: ghcr.io/aenix-io/cozystack
repository: kubeapps-apis
tag: v0.16.0
digest: "sha256:55bc8e2495933112c7cb4bb9e3b1fcb8df46aa14e27fa007f78388a9757e3238"
tag: v0.15.0
digest: "sha256:70c095c8f7e3ecfa11433a3a2c8f57f6ff5a0053f006939a2c171c180cc50baf"

View File

@@ -1,11 +1,7 @@
NAME=fluxcd
NAMESPACE=cozy-$(NAME)
include ../../../scripts/package.mk
apply-locally:
helm upgrade -i -n $(NAMESPACE) $(NAME) .
update:
rm -rf charts
helm pull oci://ghcr.io/controlplaneio-fluxcd/charts/flux-instance --untar --untardir charts
include ../../../scripts/package.mk

View File

@@ -1,24 +0,0 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
helmdocs.gotmpl

View File

@@ -1,28 +0,0 @@
annotations:
artifacthub.io/license: AGPL-3.0
artifacthub.io/links: |
- name: Documentation
url: https://fluxcd.control-plane.io/operator
- name: Chart Source
url: https://github.com/controlplaneio-fluxcd/charts
- name: Upstream Project
url: https://github.com/controlplaneio-fluxcd/flux-operator
apiVersion: v2
appVersion: v0.9.0
description: 'A Helm chart for deploying a Flux instance managed by Flux Operator. '
home: https://github.com/controlplaneio-fluxcd
icon: https://raw.githubusercontent.com/cncf/artwork/main/projects/flux/icon/color/flux-icon-color.png
keywords:
- flux
- fluxcd
- gitops
kubeVersion: '>=1.22.0-0'
maintainers:
- email: flux-enterprise@control-plane.io
name: ControlPlane Flux Team
name: flux-instance
sources:
- https://github.com/controlplaneio-fluxcd/flux-operator
- https://github.com/controlplaneio-fluxcd/charts
type: application
version: 0.9.0

View File

@@ -1,52 +0,0 @@
# flux-instance
![Version: 0.9.0](https://img.shields.io/badge/Version-0.9.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v0.9.0](https://img.shields.io/badge/AppVersion-v0.9.0-informational?style=flat-square)
This chart is a thin wrapper around the `FluxInstance` custom resource, which is
used by the [Flux Operator](https://github.com/controlplaneio-fluxcd/flux-operator)
to install, configure and automatically upgrade Flux.
## Prerequisites
- Kubernetes 1.22+
- Helm 3.8+
## Installing the Chart
To deploy Flux in the `flux-system` namespace:
```console
helm -n flux-system install flux oci://ghcr.io/controlplaneio-fluxcd/charts/flux-instance
```
For more information on the available configuration options,
see the [Flux Instance documentation](https://fluxcd.control-plane.io/operator/fluxinstance/).
## Uninstalling the Chart
To uninstall Flux without affecting the resources it manages:
```console
helm -n flux-system uninstall flux
```
## Values
| Key | Type | Default | Description |
|-----|------|---------|-------------|
| commonAnnotations | object | `{}` | Common annotations to add to all deployed objects including pods. |
| commonLabels | object | `{}` | Common labels to add to all deployed objects including pods. |
| fullnameOverride | string | `"flux"` | |
| instance.cluster | object | `{"domain":"cluster.local","multitenant":false,"networkPolicy":true,"tenantDefaultServiceAccount":"default","type":"kubernetes"}` | Cluster https://fluxcd.control-plane.io/operator/fluxinstance/#cluster-configuration |
| instance.components | list | `["source-controller","kustomize-controller","helm-controller","notification-controller"]` | Components https://fluxcd.control-plane.io/operator/fluxinstance/#components-configuration |
| instance.distribution | object | `{"artifact":"oci://ghcr.io/controlplaneio-fluxcd/flux-operator-manifests:latest","imagePullSecret":"","registry":"ghcr.io/fluxcd","version":"2.x"}` | Distribution https://fluxcd.control-plane.io/operator/fluxinstance/#distribution-configuration |
| instance.kustomize.patches | list | `[]` | Kustomize patches https://fluxcd.control-plane.io/operator/fluxinstance/#kustomize-patches |
| instance.sharding | object | `{"key":"sharding.fluxcd.io/key","shards":[]}` | Sharding https://fluxcd.control-plane.io/operator/fluxinstance/#sharding-configuration |
| instance.storage | object | `{"class":"","size":""}` | Storage https://fluxcd.control-plane.io/operator/fluxinstance/#storage-configuration |
| instance.sync | object | `{"kind":"GitRepository","path":"","pullSecret":"","ref":"","url":""}` | Sync https://fluxcd.control-plane.io/operator/fluxinstance/#sync-configuration |
| nameOverride | string | `""` | |
## Source Code
* <https://github.com/controlplaneio-fluxcd/flux-operator>
* <https://github.com/controlplaneio-fluxcd/charts>

View File

@@ -1 +0,0 @@
Documentation at https://fluxcd.control-plane.io/operator/

View File

@@ -1,51 +0,0 @@
{{/*
Expand the name of the chart.
*/}}
{{- define "flux-instance.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/}}
{{- define "flux-instance.fullname" -}}
{{- if .Values.fullnameOverride }}
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- $name := default .Chart.Name .Values.nameOverride }}
{{- if contains $name .Release.Name }}
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
{{- else }}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
{{- end }}
{{- end }}
{{- end }}
{{/*
Create chart name and version as used by the chart label.
*/}}
{{- define "flux-instance.chart" -}}
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
{{- end }}
{{/*
Common labels
*/}}
{{- define "flux-instance.labels" -}}
helm.sh/chart: {{ include "flux-instance.chart" . }}
{{ include "flux-instance.selectorLabels" . }}
{{- if .Chart.AppVersion }}
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
{{- end }}
app.kubernetes.io/managed-by: {{ .Release.Service }}
{{- end }}
{{/*
Selector labels
*/}}
{{- define "flux-instance.selectorLabels" -}}
app.kubernetes.io/name: {{ include "flux-instance.name" . }}
app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }}

View File

@@ -1,43 +0,0 @@
apiVersion: fluxcd.controlplane.io/v1
kind: FluxInstance
metadata:
name: {{ include "flux-instance.fullname" . }}
namespace: {{ .Release.Namespace }}
labels:
{{- include "flux-instance.labels" . | nindent 4 }}
{{- with .Values.commonLabels }}
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with .Values.commonAnnotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
distribution:
version: {{ .Values.instance.distribution.version }}
registry: {{ .Values.instance.distribution.registry }}
artifact: {{ .Values.instance.distribution.artifact }}
{{- if .Values.instance.distribution.imagePullSecret }}
imagePullSecret: {{ .Values.instance.distribution.imagePullSecret }}
{{- end }}
components: {{ .Values.instance.components | toYaml | nindent 4 }}
cluster: {{ .Values.instance.cluster | toYaml | nindent 4 }}
kustomize: {{ .Values.instance.kustomize | toYaml | nindent 4 }}
{{- if .Values.instance.sync.url }}
sync:
kind: {{ .Values.instance.sync.kind }}
url: {{ .Values.instance.sync.url }}
ref: {{ .Values.instance.sync.ref }}
path: {{ .Values.instance.sync.path }}
{{- if .Values.instance.sync.pullSecret }}
pullSecret: {{ .Values.instance.sync.pullSecret }}
{{- end }}
{{- end }}
{{- if .Values.instance.storage.size }}
storage: {{ .Values.instance.storage | toYaml | nindent 4 }}
{{- end }}
{{- if .Values.instance.sharding.shards }}
sharding:
key: {{ .Values.instance.sharding.key }}
shards: {{ .Values.instance.sharding.shards | toYaml | nindent 4 }}
{{- end }}

View File

@@ -1,153 +0,0 @@
{
"$schema": "https://json-schema.org/draft/2019-09/schema",
"properties": {
"commonAnnotations": {
"properties": {},
"type": "object"
},
"commonLabels": {
"properties": {},
"type": "object"
},
"fullnameOverride": {
"type": "string"
},
"instance": {
"properties": {
"cluster": {
"properties": {
"domain": {
"type": "string"
},
"multitenant": {
"type": "boolean"
},
"networkPolicy": {
"type": "boolean"
},
"tenantDefaultServiceAccount": {
"type": "string"
},
"type": {
"enum": [
"kubernetes",
"openshift",
"aws",
"azure",
"gcp"
],
"type": "string"
}
},
"type": "object"
},
"components": {
"items": {
"enum": [
"source-controller",
"kustomize-controller",
"helm-controller",
"notification-controller",
"image-reflector-controller",
"image-automation-controller"
],
"type": "string"
},
"type": "array",
"uniqueItems": true
},
"distribution": {
"properties": {
"artifact": {
"type": "string"
},
"imagePullSecret": {
"type": "string"
},
"registry": {
"type": "string"
},
"version": {
"type": "string"
}
},
"required": [
"version",
"registry"
],
"type": "object"
},
"kustomize": {
"properties": {
"patches": {
"items": {
"type": "object"
},
"type": "array"
}
},
"type": "object"
},
"sharding": {
"properties": {
"key": {
"type": "string"
},
"shards": {
"items": {
"type": "string"
},
"type": "array"
}
},
"type": "object"
},
"storage": {
"properties": {
"class": {
"type": "string"
},
"size": {
"type": "string"
}
},
"type": "object"
},
"sync": {
"properties": {
"kind": {
"enum": [
"GitRepository",
"OCIRepository",
"Bucket"
],
"type": "string"
},
"path": {
"type": "string"
},
"pullSecret": {
"type": "string"
},
"ref": {
"type": "string"
},
"url": {
"type": "string"
}
},
"type": "object"
}
},
"required": [
"distribution",
"cluster"
],
"type": "object"
},
"nameOverride": {
"type": "string"
}
},
"type": "object"
}

View File

@@ -1,49 +0,0 @@
# Default values for flux-instance.
nameOverride: ""
fullnameOverride: "flux"
instance:
# -- Distribution https://fluxcd.control-plane.io/operator/fluxinstance/#distribution-configuration
distribution: # @schema required: true
version: "2.x" # @schema required: true
registry: "ghcr.io/fluxcd" # @schema required: true
artifact: "oci://ghcr.io/controlplaneio-fluxcd/flux-operator-manifests:latest"
imagePullSecret: ""
# -- Components https://fluxcd.control-plane.io/operator/fluxinstance/#components-configuration
components: # @schema item: string; uniqueItems: true; itemEnum: [source-controller,kustomize-controller,helm-controller,notification-controller,image-reflector-controller,image-automation-controller]
- source-controller
- kustomize-controller
- helm-controller
- notification-controller
# -- Cluster https://fluxcd.control-plane.io/operator/fluxinstance/#cluster-configuration
cluster: # @schema required: true
type: kubernetes # @schema enum:[kubernetes,openshift,aws,azure,gcp]
domain: "cluster.local"
networkPolicy: true
multitenant: false
tenantDefaultServiceAccount: "default"
# -- Storage https://fluxcd.control-plane.io/operator/fluxinstance/#storage-configuration
storage: # @schema required: false
class: ""
size: ""
# -- Sharding https://fluxcd.control-plane.io/operator/fluxinstance/#sharding-configuration
sharding: # @schema required: false
key: "sharding.fluxcd.io/key"
shards: [] # @schema item: string
# -- Sync https://fluxcd.control-plane.io/operator/fluxinstance/#sync-configuration
sync: # @schema required: false
kind: "GitRepository" # @schema enum:[GitRepository,OCIRepository,Bucket]
url: ""
ref: ""
path: ""
pullSecret: ""
kustomize: # @schema required: false
# -- Kustomize patches https://fluxcd.control-plane.io/operator/fluxinstance/#kustomize-patches
patches: [] # @schema item: object
# -- Common annotations to add to all deployed objects including pods.
commonAnnotations: { }
# -- Common labels to add to all deployed objects including pods.
commonLabels: { }

View File

@@ -0,0 +1,25 @@
apiVersion: fluxcd.controlplane.io/v1
kind: FluxInstance
metadata:
name: flux
spec:
{{- with .Values.cluster }}
cluster:
{{- with .networkPolicy }}
networkPolicy: {{ . }}
{{- end }}
{{- with .domain }}
domain: {{ . }}
{{- end }}
{{- end }}
distribution:
version: {{ .Values.distribution.version }}
registry: {{ .Values.distribution.registry }}
components:
{{- if .Values.components }}
{{- toYaml .Values.components | nindent 4 }}
{{- end }}
kustomize:
{{- if .Values.kustomize }}
{{- toYaml .Values.kustomize | nindent 4 }}
{{- end }}
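For reference, a sketch of the FluxInstance this new template produces with values like those in the values.yaml hunk below; note that the `with` guards drop `networkPolicy` and `domain` when they are unset (or false, in the boolean case):

```yaml
# rendered FluxInstance (sketch), assuming cluster/distribution/components values as shown below
apiVersion: fluxcd.controlplane.io/v1
kind: FluxInstance
metadata:
  name: flux
spec:
  cluster:
    networkPolicy: true
    domain: cozy.local
  distribution:
    version: 2.3.x
    registry: ghcr.io/fluxcd
  components:
    - source-controller
    - kustomize-controller
    - helm-controller
    - notification-controller
    - image-reflector-controller
    - image-automation-controller
  kustomize:
    patches: []   # the patch list from .Values.kustomize is rendered here verbatim
```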

View File

@@ -1,49 +1,47 @@
flux-instance:
instance:
cluster:
networkPolicy: true
domain: cozy.local # -- default value is overridden in patches
distribution:
version: 2.3.x
registry: ghcr.io/fluxcd
components:
- source-controller
- kustomize-controller
- helm-controller
- notification-controller
- image-reflector-controller
- image-automation-controller
kustomize:
patches:
- target:
kind: Deployment
name: "(kustomize-controller|helm-controller|source-controller)"
patch: |
- op: add
path: /spec/template/spec/containers/0/args/-
value: --concurrent=20
- op: add
path: /spec/template/spec/containers/0/args/-
value: --requeue-dependency=5s
- op: replace
path: /spec/template/spec/containers/0/resources/limits
value:
cpu: 2000m
memory: 2048Mi
- target:
kind: Deployment
name: source-controller
patch: |
- op: add
path: /spec/template/spec/containers/0/args/-
value: --storage-adv-addr=source-controller.cozy-fluxcd.svc
- op: add
path: /spec/template/spec/containers/0/args/-
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/
- target:
kind: Deployment
name: (kustomize-controller|helm-controller|image-reflector-controller|image-automation-controller)
patch: |
- op: add
path: /spec/template/spec/containers/0/args/-
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/
cluster:
networkPolicy: true
# domain: cozy.local
distribution:
version: 2.3.x
registry: ghcr.io/fluxcd
components:
- source-controller
- kustomize-controller
- helm-controller
- notification-controller
- image-reflector-controller
- image-automation-controller
kustomize:
patches:
- target:
kind: Deployment
name: "(kustomize-controller|helm-controller|source-controller)"
patch: |
- op: add
path: /spec/template/spec/containers/0/args/-
value: --concurrent=20
- op: add
path: /spec/template/spec/containers/0/args/-
value: --requeue-dependency=5s
- op: replace
path: /spec/template/spec/containers/0/resources/limits
value:
cpu: 2000m
memory: 2048Mi
- target:
kind: Deployment
name: source-controller
patch: |
- op: add
path: /spec/template/spec/containers/0/args/-
value: --storage-adv-addr=source-controller.cozy-fluxcd.svc
- op: add
path: /spec/template/spec/containers/0/args/-
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/
- target:
kind: Deployment
name: (kustomize-controller|helm-controller|image-reflector-controller|image-automation-controller)
patch: |
- op: add
path: /spec/template/spec/containers/0/args/-
value: --events-addr=http://notification-controller.cozy-fluxcd.svc/

View File

@@ -6,7 +6,7 @@ ingress-nginx:
registry: ghcr.io
image: kvaps/ingress-nginx-with-protobuf-exporter/controller
tag: v1.11.2
digest: sha256:e80856ece4e30e9646d65c8d92c25a3446a0bba1c2468cd026f17df9e60d2c0f
digest: sha256:f4194edb06a43c82405167427ebd552b90af9698bd295845418680aebc13f600
allowSnippetAnnotations: true
replicaCount: 2
admissionWebhooks:

View File

@@ -1,25 +1 @@
# Build the manager binary
FROM golang:1.22 as builder
ARG VERSION=edge-24.9.2
ARG TARGETOS TARGETARCH
WORKDIR /workspace
RUN curl -sSL https://github.com/clastix/kamaji/archive/refs/tags/${VERSION}.tar.gz | tar -xzvf- --strip=1
COPY patches /patches
RUN git apply /patches/disable-datastore-check.diff
RUN CGO_ENABLED=0 GOOS=linux GOARCH=$TARGETARCH go build \
-ldflags "-X github.com/clastix/kamaji/internal.GitRepo=$GIT_REPO -X github.com/clastix/kamaji/internal.GitTag=$GIT_LAST_TAG -X github.com/clastix/kamaji/internal.GitCommit=$GIT_HEAD_COMMIT -X github.com/clastix/kamaji/internal.GitDirty=$GIT_MODIFIED -X github.com/clastix/kamaji/internal.BuildTime=$BUILD_DATE" \
-a -o kamaji main.go
# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
FROM gcr.io/distroless/static:nonroot
WORKDIR /
COPY --from=builder /workspace/kamaji .
USER 65532:65532
ENTRYPOINT ["/kamaji"]
FROM clastix/kamaji:edge-24.9.2

View File

@@ -1,23 +0,0 @@
diff --git a/cmd/manager/cmd.go b/cmd/manager/cmd.go
index 9a24d4e..a03a4e0 100644
--- a/cmd/manager/cmd.go
+++ b/cmd/manager/cmd.go
@@ -31,7 +31,6 @@ import (
"github.com/clastix/kamaji/controllers/soot"
"github.com/clastix/kamaji/internal"
"github.com/clastix/kamaji/internal/builders/controlplane"
- datastoreutils "github.com/clastix/kamaji/internal/datastore/utils"
"github.com/clastix/kamaji/internal/webhook"
"github.com/clastix/kamaji/internal/webhook/handlers"
"github.com/clastix/kamaji/internal/webhook/routes"
@@ -80,10 +79,6 @@ func NewCmd(scheme *runtime.Scheme) *cobra.Command {
return fmt.Errorf("unable to read webhook CA: %w", err)
}
- if err = datastoreutils.CheckExists(ctx, scheme, datastore); err != nil {
- return err
- }
-
if controllerReconcileTimeout.Seconds() == 0 {
return fmt.Errorf("the controller reconcile timeout must be greater than zero")
}

View File

@@ -3,7 +3,7 @@ kamaji:
deploy: false
image:
pullPolicy: IfNotPresent
tag: v0.16.0@sha256:241e6cdf60905e53f0cb47aadcab69dd8ffa97d316faac6bdeb704c1b13c24db
tag: latest@sha256:bb45d953a8ba46a19c8941ccc9fc8498d91435c77db439d8b1d6bde9fea8802a
repository: ghcr.io/aenix-io/cozystack/kamaji
resources:
limits:

View File

@@ -22,4 +22,4 @@ global:
images:
kubeovn:
repository: kubeovn
tag: v1.13.0@sha256:6676da12917c3645a08fb2adb7def5fcb68995338126b257e0a0a9570516cd5c
tag: v1.13.0@sha256:11c4ef0f71c73df4703743c0f63b7ff0ec67af6342caf1e7db8ebd5546071855

View File

@@ -0,0 +1,3 @@
apiVersion: v1
name: alerts
version: v0.0.0

View File

@@ -0,0 +1,48 @@
groups:
- name: blackbox-exporter
rules:
- alert: HttpProbeFailed
annotations:
message: Probing the blackbox-exporter target {{ $labels.instance }} failed.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobefailed
expr: probe_success != 1
for: 5m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter
- alert: HttpProbeSlow
annotations:
message: '{{ $labels.instance }} takes {{ $value }} seconds to respond.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobeslow
expr: sum by (instance) (probe_http_duration_seconds) > 3
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter
runbook:
steps:
- Check the target system's resource usage for anomalies.
- Check if the target application has been recently rescheduled and is still settling.
- alert: HttpCertExpiresSoon
annotations:
message: The certificate for {{ $labels.instance }} expires in less than 3 days.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiressoon
expr: probe_ssl_earliest_cert_expiry - time() < 3*24*3600
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter
- alert: HttpCertExpiresVerySoon
annotations:
message: The certificate for {{ $labels.instance }} expires in less than 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiresverysoon
expr: probe_ssl_earliest_cert_expiry - time() < 24*3600
labels:
severity: critical
resource: '{{ $labels.instance }}'
service: blackbox-exporter
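These rule groups can be exercised offline with `promtool test rules`. A minimal sketch (file name, input series, and label values are assumptions) that drives `HttpProbeFailed` into the firing state:

```yaml
# rules-test.yaml (hypothetical); run with: promtool test rules rules-test.yaml
rule_files:
  - blackbox-exporter.yaml   # assumed path to the rule group above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'probe_success{instance="https://example.com"}'
        values: '0x15'       # probe failing for 15 minutes
    alert_rule_test:
      - eval_time: 10m       # past the 5m "for" window
        alertname: HttpProbeFailed
        exp_alerts:
          - exp_labels:
              instance: "https://example.com"
              severity: warning
              resource: "https://example.com"
              service: blackbox-exporter
```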

View File

@@ -0,0 +1,44 @@
groups:
- name: cadvisor
rules:
- alert: CadvisorDown
annotations:
message: Cadvisor has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-cadvisordown
expr: absent(up{job="cadvisor"} == 1)
for: 15m
labels:
severity: critical
resource: cadvisor
service: cadvisor
- record: namespace:container_memory_usage_bytes:sum
expr: |
sum by (namespace) (
container_memory_usage_bytes{job="cadvisor", image!="", container!=""}
)
- record: namespace:container_cpu_usage_seconds_total:sum_rate
expr: |
sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])) by (namespace)
- record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
expr: |
sum by (namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])
)
# triggered by kernel bug, see issue kubermatic#2367
# - alert: CPUThrottlingHigh
# annotations:
# message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for {{ $labels.container }}.'
# runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-cputhrottlinghigh
# expr: |
# 100 * sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace)
# /
# sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
# > 25
# for: 15m
# labels:
# severity: warning

View File

@@ -0,0 +1,22 @@
groups:
- name: cert-manager
rules:
- alert: CertManagerCertExpiresSoon
annotations:
message: The certificate {{ $labels.name }} expires in less than 3 days.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-certmanagercertexpiressoon
expr: certmanager_certificate_expiration_timestamp_seconds - time() < 3*24*3600
labels:
severity: warning
resource: '{{ $labels.name }}'
service: cert-manager
- alert: CertManagerCertExpiresVerySoon
annotations:
message: The certificate {{ $labels.name }} expires in less than 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-certmanagercertexpiresverysoon
expr: certmanager_certificate_expiration_timestamp_seconds - time() < 24*3600
labels:
severity: critical
resource: '{{ $labels.name }}'
service: cert-manager

View File

@@ -0,0 +1,20 @@
groups:
- name: helm-exporter
rules:
- alert: HelmReleaseNotDeployed
annotations:
message:
The Helm release `{{ $labels.release }}` (`{{ $labels.chart }}` chart in namespace `{{ $labels.exported_namespace }}`)
in version {{ $labels.version }} has not been ready for more than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-helmreleasenotdeployed
expr: helm_chart_info != 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.release }}'
service: helm-exporter
runbook:
steps:
- Check the installed Helm releases via `helm --namespace monitoring ls --all`.
- If Helm cannot repair the chart automatically, delete/purge the chart (`helm delete --purge [RELEASE]`) and
        re-install it.

View File

@@ -0,0 +1,128 @@
groups:
- name: kube-apiserver
rules:
- record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
- record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
- record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
############################################################
# alerts
############################################################
- alert: KubernetesApiserverDown
annotations:
message: KubernetesApiserver has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubernetesapiserverdown
expr: absent(up{job="apiserver"} == 1)
for: 15m
labels:
severity: critical
resource: apiserver
service: kubernetes
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
for: 10m
labels:
severity: critical
resource: apiserver
service: kubernetes
- alert: KubeAPITerminatedRequests
annotations:
message: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapiterminatedrequests
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
/
(sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
resource: apiserver
service: kubernetes
- alert: KubeAPITerminatedRequests
annotations:
message: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapiterminatedrequests
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
/
(sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 10m
labels:
severity: critical
resource: apiserver
service: kubernetes
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
for: 10m
labels:
severity: warning
resource: apiserver
service: kubernetes
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7 days.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
and
histogram_quantile(0.01, sum by (job, instance, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
severity: warning
resource: apiserver
service: kubernetes
runbook:
steps:
- Check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/kubeadm-certs/) on how to renew certificates.
- If your certificate has already expired, the steps in the documentation might not work. Check [Github](https://github.com/kubernetes/kubeadm/issues/581#issuecomment-421477139)
for hints about fixing your cluster.
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
and
histogram_quantile(0.01, sum by (job, instance, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
severity: critical
resource: apiserver
service: kubernetes
runbook:
steps:
- Urgently renew your certificates. Expired certificates can make fixing the cluster difficult to begin with.
- Check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/kubeadm-certs/) on how to renew certificates.
- If your certificate has already expired, the steps in the documentation might not work. Check [Github](https://github.com/kubernetes/kubeadm/issues/581#issuecomment-421477139)
for hints about fixing your cluster.

View File

@@ -0,0 +1,143 @@
groups:
- name: kube-kubelet
rules:
- alert: KubeletDown
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletdown
expr: absent(up{job="kubelet"} == 1)
for: 15m
labels:
severity: critical
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubePersistentVolumeUsageCritical
annotations:
message:
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in namespace
{{ $labels.namespace }} is only {{ printf "%0.0f" $value }}% free.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepersistentvolumeusagecritical
expr: |
100 * kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
< 3
for: 1m
labels:
severity: critical
service: kubelet
resource: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}'
- alert: KubePersistentVolumeFullInFourDays
annotations:
message:
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
in namespace {{ $labels.namespace }} is expected to fill up within four days.
Currently {{ $value }} bytes are available.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepersistentvolumefullinfourdays
expr: |
(
kubelet_volume_stats_used_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
) > 0.85
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 5m
labels:
severity: critical
service: kubelet
resource: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}'
- alert: KubeletTooManyPods
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} pods, close to the limit of 110.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubelettoomanypods
expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubeClientErrors
annotations:
message:
The kubelet on {{ $labels.instance }} is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"(5..|<error>)",job="kubelet"}[5m])) by (instance)
/
sum(rate(rest_client_requests_total{job="kubelet"}[5m])) by (instance))
* 100 > 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
# a dedicated rule for pods to include more helpful labels in the message like the instance and job name
- alert: KubeClientErrors
annotations:
message:
The pod {{ $labels.namespace }}/{{ $labels.pod }} is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"(5..|<error>)",job="pods"}[5m])) by (namespace, pod)
/
sum(rate(rest_client_requests_total{job="pods"}[5m])) by (namespace, pod))
* 100 > 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubeletRuntimeErrors
annotations:
message:
        The kubelet on {{ $labels.instance }} is experiencing an elevated error rate for container runtime operations.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletruntimeerrors
expr: |
sum(rate(kubelet_runtime_operations_errors_total{job="kubelet"}[5m])) by (instance) > 0.1
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubeletCGroupManagerDurationHigh
annotations:
message:
The kubelet's cgroup manager duration on {{ $labels.instance }} has been elevated ({{ printf "%0.2f" $value }}ms) for more than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletcgroupmanagerlatencyhigh
expr: |
sum(rate(kubelet_cgroup_manager_duration_seconds{quantile="0.9"}[5m])) by (instance) * 1000 > 1
for: 15m
labels:
resource: '{{ $labels.instance }}'
service: kubelet
severity: warning
- alert: KubeletPodWorkerDurationHigh
annotations:
message:
The kubelet's pod worker duration for {{ $labels.operation_type }} operations on {{ $labels.instance }} has been elevated ({{ printf "%0.2f" $value }}ms) for more than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletpodworkerdurationhigh
expr: |
sum(rate(kubelet_pod_worker_duration_seconds{quantile="0.9"}[5m])) by (instance, operation_type) * 1000 > 250
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}/{{ $labels.operation_type }}'
service: kubelet
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different versions of Kubernetes components running.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeversionmismatch
expr: count(count(kubernetes_build_info{job!="dns"}) by (gitVersion)) > 1
for: 1h
labels:
severity: warning

View File

@@ -0,0 +1,339 @@
groups:
- name: kube-state-metrics
rules:
- record: ':kube_pod_info_node_count:'
expr: |
sum(min(kube_pod_info) by (node))
- record: 'node_namespace_pod:kube_pod_info:'
expr: |
max(kube_pod_info{job="kube-state-metrics"}) by (node, namespace, pod)
- record: namespace_name:container_cpu_usage_seconds_total:sum_rate
expr: |
sum by (namespace, label_name) (
sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])) by (namespace, pod)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
- record: namespace_name:container_memory_usage_bytes:sum
expr: |
sum by (namespace, label_name) (
sum(container_memory_usage_bytes{job="cadvisor",image!="", container!=""}) by (pod, namespace)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
- record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests{job="kube-state-metrics",resource="memory"}) by (namespace, pod)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
- record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests{job="kube-state-metrics",resource="cpu"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
############################################################
# alerts
############################################################
- alert: KubeStateMetricsDown
annotations:
message: KubeStateMetrics has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatemetricsdown
expr: absent(up{job="kube-state-metrics"} == 1)
for: 15m
labels:
severity: critical
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: kube-state-metrics
- alert: KubePodCrashLooping
annotations:
message:
Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting
{{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodcrashlooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
for: 1h
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.pod }}"
runbook:
steps:
- Check the pod's logs.
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodnotready
expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
for: 30m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.pod }}"
runbook:
steps:
- Check the pod via `kubectl describe pod [POD]` to find out about scheduling issues.
- alert: KubeDeploymentGenerationMismatch
annotations:
message:
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match,
this indicates that the Deployment has failed but has not been rolled back.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedeploymentgenerationmismatch
expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.deployment }}"
- alert: KubeDeploymentReplicasMismatch
annotations:
message:
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected
number of replicas for longer than an hour.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedeploymentreplicasmismatch
expr: |
kube_deployment_spec_replicas{job="kube-state-metrics"}
!=
kube_deployment_status_replicas_available{job="kube-state-metrics"}
for: 1h
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.deployment }}"
- alert: KubeStatefulSetReplicasMismatch
annotations:
message:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected
number of replicas for longer than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetreplicasmismatch
expr: |
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"
- alert: KubeStatefulSetGenerationMismatch
annotations:
message:
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match,
this indicates that the StatefulSet has failed but has not been rolled back.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetgenerationmismatch
expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetupdatenotrolledout
expr: |
max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
)
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"
- alert: KubeDaemonSetRolloutStuck
annotations:
message:
Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }}
are scheduled and ready.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetrolloutstuck
expr: |
kube_daemonset_status_number_ready{job="kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetnotscheduled
expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetmisscheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"
- alert: KubeCronJobRunning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecronjobrunning
expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.cronjob }}"
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubejobcompletion
expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.job_name }}"
- alert: KubeJobFailed
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubejobfailed
expr: kube_job_status_failed{job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.job_name }}"
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for namespaces.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecpuovercommit
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
/
sum(node:node_num_cpu:sum)
> 1.5
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for pods and cannot tolerate node failure.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecpuovercommit
expr: |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
/
sum(node:node_num_cpu:sum)
>
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for namespaces.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubememovercommit
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
/
sum(node_memory_MemTotal_bytes{app="node-exporter"})
> 1.5
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for pods and cannot tolerate node failure.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubememovercommit
expr: |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
/
sum(node_memory_MemTotal_bytes)
>
(count(node:node_num_cpu:sum)-1)
/
count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeQuotaExceeded
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubequotaexceeded
expr: |
100 * kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 90
for: 15m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubePodOOMKilled
annotations:
message:
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
has been OOMKilled {{ $value }} times in the last 30 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodoomkilled
expr: |
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 30m >= 2)
and
ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[30m]) == 1
for: 0m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}'
- alert: KubeNodeNotReady
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubenodenotready
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.node }}'

View File

@@ -0,0 +1,342 @@
groups:
- name: node-exporter
rules:
- record: node:node_num_cpu:sum
expr: |
count by (node) (sum by (node, cpu) (
node_cpu_seconds_total{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
))
- record: :node_cpu_utilisation:avg1m
expr: |
1 - avg(rate(node_cpu_seconds_total{app="node-exporter",mode="idle"}[1m]))
- record: node:node_cpu_utilisation:avg1m
expr: |
1 - avg by (node) (
rate(node_cpu_seconds_total{app="node-exporter",mode="idle"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
- record: ':node_cpu_saturation_load1:'
expr: |
sum(node_load1{app="node-exporter"})
/
sum(node:node_num_cpu:sum)
- record: 'node:node_cpu_saturation_load1:'
expr: |
sum by (node) (
node_load1{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
node:node_num_cpu:sum
- record: node:cluster_cpu_utilisation:ratio
expr: |
node:node_cpu_utilisation:avg1m
*
node:node_num_cpu:sum
/
scalar(sum(node:node_num_cpu:sum))
- record: ':node_memory_utilisation:'
expr: |
1 -
sum(node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
/
sum(node_memory_MemTotal_bytes{app="node-exporter"})
- record: node:node_memory_bytes_available:sum
expr: |
sum by (node) (
(node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: node:node_memory_bytes_total:sum
expr: |
sum by (node) (
node_memory_MemTotal_bytes{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: node:node_memory_utilisation:ratio
expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
- record: :node_memory_swap_io_bytes:sum_rate
expr: |
1e3 * sum(
(rate(node_vmstat_pgpgin{app="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{app="node-exporter"}[1m]))
)
- record: 'node:node_memory_utilisation:'
expr: |
1 -
sum by (node) (
(node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
sum by (node) (
node_memory_MemTotal_bytes{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: 'node:node_memory_utilisation_2:'
expr: |
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
- record: node:node_memory_swap_io_bytes:sum_rate
expr: |
1e3 * sum by (node) (
(rate(node_vmstat_pgpgin{app="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{app="node-exporter"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: node:cluster_memory_utilisation:ratio
expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
- record: :node_disk_utilisation:avg_irate
expr: |
avg(irate(node_disk_io_time_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
- record: node:node_disk_utilisation:avg_irate
expr: |
avg by (node) (
irate(node_disk_io_time_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: :node_disk_saturation:avg_irate
expr: |
avg(irate(node_disk_io_time_weighted_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
- record: node:node_disk_saturation:avg_irate
expr: |
avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: 'node:node_filesystem_usage:'
expr: |
max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
- record: 'node:node_filesystem_avail:'
expr: |
max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
- record: :node_net_utilisation:sum_irate
expr: |
sum(irate(node_network_receive_bytes_total{app="node-exporter",device!~"veth.+"}[1m])) +
sum(irate(node_network_transmit_bytes_total{app="node-exporter",device!~"veth.+"}[1m]))
- record: node:node_net_utilisation:sum_irate
expr: |
sum by (node) (
(irate(node_network_receive_bytes_total{app="node-exporter",device!~"veth.+"}[1m]) +
irate(node_network_transmit_bytes_total{app="node-exporter",device!~"veth.+"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: :node_net_saturation:sum_irate
expr: |
sum(irate(node_network_receive_drop_total{app="node-exporter",device!~"veth.+"}[1m])) +
sum(irate(node_network_transmit_drop_total{app="node-exporter",device!~"veth.+"}[1m]))
- record: node:node_net_saturation:sum_irate
expr: |
sum by (node) (
(irate(node_network_receive_drop_total{app="node-exporter",device!~"veth.+"}[1m]) +
irate(node_network_transmit_drop_total{app="node-exporter",device!~"veth.+"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
############################################################
# alerts
############################################################
- alert: NodeFilesystemSpaceFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of space within the next 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemspacefillingup
expr: |
predict_linear(node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"}[6h], 24*60*60) < 0
and
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} < 0.4
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemSpaceFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of space within the next 4 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemspacefillingup
expr: |
predict_linear(node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"}[6h], 4*60*60) < 0
and
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} < 0.2
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfSpace
annotations:
message:
        Filesystem on node {{ $labels.node_name }} with IP {{ $labels.instance }} has only
{{ $value }}% available space left on drive {{ $labels.device }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
expr: |
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 10
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 30m
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfSpace
annotations:
message:
        Filesystem on node {{ $labels.node_name }} with IP {{ $labels.instance }} has only
{{ $value }}% available space left on drive {{ $labels.device }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
expr: |
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 5
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 10m
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemFilesFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of files within the next 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemfilesfillingup
expr: |
predict_linear(node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"}[6h], 24*60*60) < 0
and
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} < 0.4
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemFilesFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of files within the next 4 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemfilesfillingup
expr: |
predict_linear(node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"}[6h], 4*60*60) < 0
and
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} < 0.2
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfFiles
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ $value }}% available inodes left.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutoffiles
expr: |
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 5
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfSpace
annotations:
message:
        Filesystem on node {{ $labels.node_name }} with IP {{ $labels.instance }} has only {{ $value }}% of its inodes available on drive {{ $labels.device }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
expr: |
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 10
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeNetworkReceiveErrs
annotations:
message:
'{{ $labels.instance }} interface {{ $labels.device }} shows errors
while receiving packets ({{ $value }} errors in two minutes).'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodenetworkreceiveerrs
expr: increase(node_network_receive_errs_total[2m]) > 10
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeNetworkTransmitErrs
annotations:
message:
'{{ $labels.instance }} interface {{ $labels.device }} shows errors
while transmitting packets ({{ $value }} errors in two minutes).'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodenetworktransmiterrs
expr: increase(node_network_transmit_errs_total[2m]) > 10
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'

View File

@@ -0,0 +1,95 @@
groups:
- name: prometheus
rules:
- alert: PromScrapeFailed
annotations:
message: Prometheus failed to scrape a target {{ $labels.job }} / {{ $labels.instance }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promscrapefailed
expr: up != 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check the Prometheus Service Discovery page to find out why the target is unreachable.
- alert: PromBadConfig
annotations:
message: Prometheus failed to reload config.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-prombadconfig
expr: prometheus_config_last_reload_successful{job="prometheus"} == 0
for: 15m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- Check the `prometheus-rules` configmap via `kubectl -n monitoring get configmap prometheus-rules -o yaml`.
- alert: PromAlertmanagerBadConfig
annotations:
message: Alertmanager failed to reload config.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promalertmanagerbadconfig
expr: alertmanager_config_last_reload_successful{job="alertmanager"} == 0
for: 10m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Alertmanager pod's logs via `kubectl -n monitoring logs alertmanager-0`, `-1` and `-2`.
- Check the `alertmanager` secret via `kubectl -n monitoring get secret alertmanager -o yaml`.
- alert: PromAlertsFailed
annotations:
message: Alertmanager failed to send an alert.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promalertsfailed
expr: sum(increase(alertmanager_notifications_failed_total{job="alertmanager"}[5m])) by (namespace) > 0
for: 5m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- 'Make sure the Alertmanager StatefulSet is running: `kubectl -n monitoring get pods`.'
- alert: PromRemoteStorageFailures
annotations:
message: Prometheus failed to send {{ printf "%.1f" $value }}% samples.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promremotestoragefailures
expr: |
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[1m]) * 100)
/
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[1m]))
> 1
for: 15m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Ensure that the Prometheus volume has not reached capacity.
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- alert: PromRuleFailures
annotations:
message: Prometheus failed to evaluate {{ printf "%.1f" $value }} rules/sec.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promrulefailures
expr: rate(prometheus_rule_evaluation_failures_total{job="prometheus"}[1m]) > 0
for: 15m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- Check CPU/memory pressure on the node.

View File

@@ -0,0 +1,85 @@
groups:
- name: thanos
rules:
- alert: ThanosSidecarDown
annotations:
message: The Thanos sidecar in `{{ $labels.namespace }}/{{ $labels.pod }}` is down.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanossidecardown
expr: thanos_sidecar_prometheus_up != 1
for: 5m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosSidecarNoHeartbeat
annotations:
message: The Thanos sidecar in `{{ $labels.namespace }}/{{ $labels.pod }}` didn't send a heartbeat in {{ $value }} seconds.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanossidecardown
expr: time() - thanos_sidecar_last_heartbeat_success_time_seconds > 60
for: 3m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosCompactorManyRetries
annotations:
message: The Thanos compactor in `{{ $labels.namespace }}` is experiencing a high retry rate.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanoscompactormanyretries
expr: sum(rate(thanos_compact_retries_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
runbook:
steps:
- Check the `thanos-compact` pod's logs.
- alert: ThanosShipperManyDirSyncFailures
annotations:
message: The Thanos shipper in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing a high dir-sync failure rate.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosshippermanydirsyncfailures
expr: sum(rate(thanos_shipper_dir_sync_failures_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
runbook:
steps:
      - Check the `thanos` container's logs inside the Prometheus pod.
- alert: ThanosManyPanicRecoveries
annotations:
      message: The Thanos component in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing a high panic recovery rate.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanypanicrecoveries
expr: sum(rate(thanos_grpc_req_panics_recovered_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosManyBlockLoadFailures
annotations:
      message: The Thanos store in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing many failed block loads.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanyblockloadfailures
expr: sum(rate(thanos_bucket_store_block_load_failures_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosManyBlockDropFailures
annotations:
      message: The Thanos store in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing many failed block drops.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanyblockdropfailures
expr: sum(rate(thanos_bucket_store_block_drop_failures_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos

View File

@@ -0,0 +1,34 @@
groups:
- name: velero
rules:
- alert: VeleroBackupTakesTooLong
annotations:
message: Last backup with schedule {{ $labels.schedule }} has not finished successfully within 60min.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-velerobackuptakestoolong
expr: time() - velero_backup_last_successful_timestamp{schedule!=""} > 3600
for: 5m
labels:
severity: warning
resource: '{{ $labels.schedule }}'
service: 'velero'
runbook:
steps:
- Check if a backup is really in "InProgress" state via `velero -n velero backup get`.
- Check the backup logs via `velero -n velero backup logs [BACKUP_NAME]`.
- Depending on the backup, find the pod and check the processes inside that pod or any sidecar containers.
- alert: VeleroNoRecentBackup
annotations:
message: There has not been a successful backup for schedule {{ $labels.schedule }} in the last 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-veleronorecentbackup
expr: time() - velero_backup_last_successful_timestamp{schedule!=""} > 3600*25
labels:
severity: critical
resource: '{{ $labels.schedule }}'
service: 'velero'
runbook:
steps:
- Check if really no backups happened via `velero -n velero backup get`.
- If a backup failed, check its logs via `velero -n velero backup logs [BACKUP_NAME]`.
- If a backup was not even triggered, check the Velero server's logs via `kubectl -n velero logs -l 'name=velero-server'`.
- Make sure the Velero server pod has not been rescheduled and possibly opt to schedule it on a stable node using a node affinity.

View File

@@ -0,0 +1,23 @@
groups:
- name: vertical-pod-autoscaler
rules:
# These rules provide metrics to be consumed by Kubernetes' VPA. The VPA only needs a tiny fraction
# of the labels available on the container_* metrics, so we reduce them with the inner query to
# only contain pod name, namespace and name.
  # Because the VPA does not allow changing the metric name it queries, only the job selector,
# we "cheat" by re-using the same metric name and injecting a custom job ("cadvisor-vpa") label.
- record: container_cpu_usage_seconds_total
expr: |
label_replace(
sum(container_cpu_usage_seconds_total{job="cadvisor", pod=~".+", name!="POD", name!=""}) by (pod, namespace, name),
"job", "cadvisor-vpa", "", ""
)
- record: container_memory_usage_bytes
expr: |
label_replace(
sum(container_memory_usage_bytes{job="cadvisor", pod=~".+", name!="POD", name!=""}) by (pod, namespace, name),
"job", "cadvisor-vpa", "", ""
)
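
The comment above explains why the cAdvisor series are re-recorded under a synthetic "cadvisor-vpa" job. A minimal sketch of how a consumer that can only change the job selector would read the reduced series (the 5m window is an assumption, not part of the chart):

# CPU usage per container, selected via the injected job label
rate(container_cpu_usage_seconds_total{job="cadvisor-vpa"}[5m])
# memory usage per container, selected the same way
container_memory_usage_bytes{job="cadvisor-vpa"}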

View File

@@ -0,0 +1,9 @@
{{- range $path, $file := .Files.Glob "files/rules/**/*.yaml" }}
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: {{ toString $path | trimPrefix "files/rules/" | replace "/" "-" | replace ".yaml" "" }}
spec:
{{- toString $file | nindent 2 }}
---
{{- end }}
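
For illustration, a hypothetical rules file at files/rules/kubernetes/kube-apiserver.yaml would be rendered by the template above into roughly the following object, with the file's contents nested under spec (the path and the single rule shown are placeholders, not taken from the chart):

apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: kubernetes-kube-apiserver
spec:
  groups:
  - name: kube-apiserver
    rules:
    - alert: KubernetesApiserverDown
      expr: absent(up{job="apiserver"} == 1)
      for: 15m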

View File

@@ -1,7 +1,3 @@
NAME=seaweedfs-system
include ../../../scripts/package.mk
update:
rm -rf charts
mkdir -p charts

View File

@@ -877,7 +877,7 @@ s3:
# For more information, visit: https://container-object-storage-interface.github.io/docs/deployment-guide
cosi:
enabled: false
image: "ghcr.io/seaweedfs/seaweedfs-cosi-driver:v0.1.2"
image: "ghcr.io/seaweedfs/seaweedfs-cosi-driver:v0.1.1"
driverName: "seaweedfs.objectstorage.k8s.io"
bucketClassName: "seaweedfs"
endpoint: ""

View File

@@ -114,6 +114,7 @@ seaweedfs:
policy.cozystack.io/allow-to-apiserver: "true"
driverName: "seaweedfs.objectstorage.k8s.io"
bucketClassName: "seaweedfs"
image: "ghcr.io/seaweedfs/seaweedfs-cosi-driver:v0.1.1"
region: ""
sidecar: