Compare commits

...

1 Commit

Author: Andrei Kvapil
SHA1: 1b0a7eacca
Message: Add alerts from kubermatic
Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
Date: 2024-09-27 17:29:02 +02:00

14 changed files with 1335 additions and 0 deletions


@@ -0,0 +1,3 @@
apiVersion: v1
name: alerts
version: v0.0.0


@@ -0,0 +1,48 @@
groups:
- name: blackbox-exporter
rules:
- alert: HttpProbeFailed
annotations:
message: Probing the blackbox-exporter target {{ $labels.instance }} failed.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobefailed
expr: probe_success != 1
for: 5m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter
- alert: HttpProbeSlow
annotations:
message: '{{ $labels.instance }} takes {{ $value }} seconds to respond.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobeslow
expr: sum by (instance) (probe_http_duration_seconds) > 3
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter
runbook:
steps:
- Check the target system's resource usage for anomalies.
- Check if the target application has been recently rescheduled and is still settling.
- alert: HttpCertExpiresSoon
annotations:
message: The certificate for {{ $labels.instance }} expires in less than 3 days.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiressoon
expr: probe_ssl_earliest_cert_expiry - time() < 3*24*3600
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: blackbox-exporter
- alert: HttpCertExpiresVerySoon
annotations:
message: The certificate for {{ $labels.instance }} expires in less than 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiresverysoon
expr: probe_ssl_earliest_cert_expiry - time() < 24*3600
labels:
severity: critical
resource: '{{ $labels.instance }}'
service: blackbox-exporter


@@ -0,0 +1,44 @@
groups:
- name: cadvisor
rules:
- alert: CadvisorDown
annotations:
message: Cadvisor has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-cadvisordown
expr: absent(up{job="cadvisor"} == 1)
for: 15m
labels:
severity: critical
resource: cadvisor
service: cadvisor
- record: namespace:container_memory_usage_bytes:sum
expr: |
sum by (namespace) (
container_memory_usage_bytes{job="cadvisor", image!="", container!=""}
)
- record: namespace:container_cpu_usage_seconds_total:sum_rate
expr: |
sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])) by (namespace)
- record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
expr: |
sum by (namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])
)
# triggered by kernel bug, see issue kubermatic#2367
# - alert: CPUThrottlingHigh
# annotations:
# message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for {{ $labels.container }}.'
# runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-cputhrottlinghigh
# expr: |
# 100 * sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace)
# /
# sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
# > 25
# for: 15m
# labels:
# severity: warning


@@ -0,0 +1,22 @@
groups:
- name: cert-manager
rules:
- alert: CertManagerCertExpiresSoon
annotations:
message: The certificate {{ $labels.name }} expires in less than 3 days.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-certmanagercertexpiressoon
expr: certmanager_certificate_expiration_timestamp_seconds - time() < 3*24*3600
labels:
severity: warning
resource: '{{ $labels.name }}'
service: cert-manager
- alert: CertManagerCertExpiresVerySoon
annotations:
message: The certificate {{ $labels.name }} expires in less than 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-certmanagercertexpiresverysoon
expr: certmanager_certificate_expiration_timestamp_seconds - time() < 24*3600
labels:
severity: critical
resource: '{{ $labels.name }}'
service: cert-manager


@@ -0,0 +1,20 @@
groups:
- name: helm-exporter
rules:
- alert: HelmReleaseNotDeployed
annotations:
message:
The Helm release `{{ $labels.release }}` (`{{ $labels.chart }}` chart in namespace `{{ $labels.exported_namespace }}`)
in version {{ $labels.version }} has not been ready for more than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-helmreleasenotdeployed
expr: helm_chart_info != 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.release }}'
service: helm-exporter
runbook:
steps:
- Check the installed Helm releases via `helm --namespace monitoring ls --all`.
- If Helm cannot repair the chart automatically, delete/purge the chart (`helm delete --purge [RELEASE]`) and
re-install the chart again.


@@ -0,0 +1,128 @@
groups:
- name: kube-apiserver
rules:
- record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
labels:
quantile: "0.99"
- record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
labels:
quantile: "0.9"
- record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
labels:
quantile: "0.5"
############################################################
# alerts
############################################################
- alert: KubernetesApiserverDown
annotations:
message: KubernetesApiserver has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubernetesapiserverdown
expr: absent(up{job="apiserver"} == 1)
for: 15m
labels:
severity: critical
resource: apiserver
service: kubernetes
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
for: 10m
labels:
severity: critical
resource: apiserver
service: kubernetes
- alert: KubeAPITerminatedRequests
annotations:
message: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapiterminatedrequests
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
/
(sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning
resource: apiserver
service: kubernetes
- alert: KubeAPITerminatedRequests
annotations:
message: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapiterminatedrequests
expr: |
sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
/
(sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
for: 10m
labels:
severity: critical
resource: apiserver
service: kubernetes
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
for: 10m
labels:
severity: warning
resource: apiserver
service: kubernetes
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 7 days.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
and
histogram_quantile(0.01, sum by (job, instance, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
severity: warning
resource: apiserver
service: kubernetes
runbook:
steps:
- Check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/kubeadm-certs/) on how to renew certificates.
- If your certificate has already expired, the steps in the documentation might not work. Check [Github](https://github.com/kubernetes/kubeadm/issues/581#issuecomment-421477139)
for hints about fixing your cluster.
- alert: KubeClientCertificateExpiration
annotations:
message: A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclientcertificateexpiration
expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
and
histogram_quantile(0.01, sum by (job, instance, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
severity: critical
resource: apiserver
service: kubernetes
runbook:
steps:
- Urgently renew your certificates. Expired certificates can make fixing the cluster difficult to begin with.
- Check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/kubeadm-certs/) on how to renew certificates.
- If your certificate has already expired, the steps in the documentation might not work. Check [Github](https://github.com/kubernetes/kubeadm/issues/581#issuecomment-421477139)
for hints about fixing your cluster.


@@ -0,0 +1,143 @@
groups:
- name: kube-kubelet
rules:
- alert: KubeletDown
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletdown
expr: absent(up{job="kubelet"} == 1)
for: 15m
labels:
severity: critical
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubePersistentVolumeUsageCritical
annotations:
message:
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in namespace
{{ $labels.namespace }} is only {{ printf "%0.0f" $value }}% free.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepersistentvolumeusagecritical
expr: |
100 * kubelet_volume_stats_available_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
< 3
for: 1m
labels:
severity: critical
service: kubelet
resource: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}'
- alert: KubePersistentVolumeFullInFourDays
annotations:
message:
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
in namespace {{ $labels.namespace }} is expected to fill up within four days.
Currently {{ $value }} bytes are available.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepersistentvolumefullinfourdays
expr: |
(
kubelet_volume_stats_used_bytes{job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet"}
) > 0.85
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 5m
labels:
severity: critical
service: kubelet
resource: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}'
- alert: KubeletTooManyPods
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} pods, close to the limit of 110.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubelettoomanypods
expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubeClientErrors
annotations:
message:
The kubelet on {{ $labels.instance }} is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"(5..|<error>)",job="kubelet"}[5m])) by (instance)
/
sum(rate(rest_client_requests_total{job="kubelet"}[5m])) by (instance))
* 100 > 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
# a dedicated rule for pods to include more helpful labels in the message like the instance and job name
- alert: KubeClientErrors
annotations:
message:
The pod {{ $labels.namespace }}/{{ $labels.pod }} is experiencing {{ printf "%0.0f" $value }}% errors.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclienterrors
expr: |
(sum(rate(rest_client_requests_total{code=~"(5..|<error>)",job="pods"}[5m])) by (namespace, pod)
/
sum(rate(rest_client_requests_total{job="pods"}[5m])) by (namespace, pod))
* 100 > 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubeletRuntimeErrors
annotations:
message:
The kubelet on {{ $labels.instance }} is having an elevated error rate for container runtime operations.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletruntimeerrors
expr: |
sum(rate(kubelet_runtime_operations_errors_total{job="kubelet"}[5m])) by (instance) > 0.1
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}'
service: kubelet
- alert: KubeletCGroupManagerDurationHigh
annotations:
message:
The kubelet's cgroup manager duration on {{ $labels.instance }} has been elevated ({{ printf "%0.2f" $value }}ms) for more than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletcgroupmanagerlatencyhigh
expr: |
sum(rate(kubelet_cgroup_manager_duration_seconds{quantile="0.9"}[5m])) by (instance) * 1000 > 1
for: 15m
labels:
resource: '{{ $labels.instance }}'
service: kubelet
severity: warning
- alert: KubeletPodWorkerDurationHigh
annotations:
message:
The kubelet's pod worker duration for {{ $labels.operation_type }} operations on {{ $labels.instance }} has been elevated ({{ printf "%0.2f" $value }}ms) for more than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletpodworkerdurationhigh
expr: |
sum(rate(kubelet_pod_worker_duration_seconds{quantile="0.9"}[5m])) by (instance, operation_type) * 1000 > 250
for: 15m
labels:
severity: warning
resource: '{{ $labels.instance }}/{{ $labels.operation_type }}'
service: kubelet
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different versions of Kubernetes components running.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeversionmismatch
expr: count(count(kubernetes_build_info{job!="dns"}) by (gitVersion)) > 1
for: 1h
labels:
severity: warning


@@ -0,0 +1,339 @@
groups:
- name: kube-state-metrics
rules:
- record: ':kube_pod_info_node_count:'
expr: |
sum(min(kube_pod_info) by (node))
- record: 'node_namespace_pod:kube_pod_info:'
expr: |
max(kube_pod_info{job="kube-state-metrics"}) by (node, namespace, pod)
- record: namespace_name:container_cpu_usage_seconds_total:sum_rate
expr: |
sum by (namespace, label_name) (
sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])) by (namespace, pod)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
- record: namespace_name:container_memory_usage_bytes:sum
expr: |
sum by (namespace, label_name) (
sum(container_memory_usage_bytes{job="cadvisor",image!="", container!=""}) by (pod, namespace)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
- record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests{job="kube-state-metrics",resource="memory"}) by (namespace, pod)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
- record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests{job="kube-state-metrics",resource="cpu"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
* on (namespace, pod) group_left (label_name)
kube_pod_labels{job="kube-state-metrics"}
)
############################################################
# alerts
############################################################
- alert: KubeStateMetricsDown
annotations:
message: KubeStateMetrics has disappeared from Prometheus target discovery.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatemetricsdown
expr: absent(up{job="kube-state-metrics"} == 1)
for: 15m
labels:
severity: critical
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: kube-state-metrics
- alert: KubePodCrashLooping
annotations:
message:
Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting
{{ printf "%.2f" $value }} times / 5 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodcrashlooping
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
for: 1h
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.pod }}"
runbook:
steps:
- Check the pod's logs.
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodnotready
expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
for: 30m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.pod }}"
runbook:
steps:
- Check the pod via `kubectl describe pod [POD]` to find out about scheduling issues.
- alert: KubeDeploymentGenerationMismatch
annotations:
message:
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match,
this indicates that the Deployment has failed but has not been rolled back.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedeploymentgenerationmismatch
expr: |
kube_deployment_status_observed_generation{job="kube-state-metrics"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.deployment }}"
- alert: KubeDeploymentReplicasMismatch
annotations:
message:
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected
number of replicas for longer than an hour.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedeploymentreplicasmismatch
expr: |
kube_deployment_spec_replicas{job="kube-state-metrics"}
!=
kube_deployment_status_replicas_available{job="kube-state-metrics"}
for: 1h
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.deployment }}"
- alert: KubeStatefulSetReplicasMismatch
annotations:
message:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected
number of replicas for longer than 15 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetreplicasmismatch
expr: |
kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"
- alert: KubeStatefulSetGenerationMismatch
annotations:
message:
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match,
this indicates that the StatefulSet has failed but has not been rolled back.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetgenerationmismatch
expr: |
kube_statefulset_status_observed_generation{job="kube-state-metrics"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics"}
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetupdatenotrolledout
expr: |
max without (revision) (
kube_statefulset_status_current_revision{job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
)
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"
- alert: KubeDaemonSetRolloutStuck
annotations:
message:
Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }}
are scheduled and ready.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetrolloutstuck
expr: |
kube_daemonset_status_number_ready{job="kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
for: 15m
labels:
severity: critical
resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetnotscheduled
expr: |
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetmisscheduled
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"
- alert: KubeCronJobRunning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecronjobrunning
expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.cronjob }}"
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubejobcompletion
expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.job_name }}"
- alert: KubeJobFailed
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubejobfailed
expr: kube_job_status_failed{job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
resource: "{{ $labels.namespace }}/{{ $labels.job_name }}"
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for namespaces.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecpuovercommit
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
/
sum(node:node_num_cpu:sum)
> 1.5
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for pods and cannot tolerate node failure.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecpuovercommit
expr: |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
/
sum(node:node_num_cpu:sum)
>
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for namespaces.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubememovercommit
expr: |
sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
/
sum(node_memory_MemTotal_bytes{app="node-exporter"})
> 1.5
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for pods and cannot tolerate node failure.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubememovercommit
expr: |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
/
sum(node_memory_MemTotal_bytes)
>
(count(node:node_num_cpu:sum)-1)
/
count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubeQuotaExceeded
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubequotaexceeded
expr: |
100 * kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 90
for: 15m
labels:
severity: warning
resource: cluster
service: kube-state-metrics
- alert: KubePodOOMKilled
annotations:
message:
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
has been OOMKilled {{ $value }} times in the last 30 minutes.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodoomkilled
expr: |
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 30m >= 2)
and
ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[30m]) == 1
for: 0m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}'
- alert: KubeNodeNotReady
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubenodenotready
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.node }}'


@@ -0,0 +1,342 @@
groups:
- name: node-exporter
rules:
- record: node:node_num_cpu:sum
expr: |
count by (node) (sum by (node, cpu) (
node_cpu_seconds_total{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
))
- record: :node_cpu_utilisation:avg1m
expr: |
1 - avg(rate(node_cpu_seconds_total{app="node-exporter",mode="idle"}[1m]))
- record: node:node_cpu_utilisation:avg1m
expr: |
1 - avg by (node) (
rate(node_cpu_seconds_total{app="node-exporter",mode="idle"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
- record: ':node_cpu_saturation_load1:'
expr: |
sum(node_load1{app="node-exporter"})
/
sum(node:node_num_cpu:sum)
- record: 'node:node_cpu_saturation_load1:'
expr: |
sum by (node) (
node_load1{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
node:node_num_cpu:sum
- record: node:cluster_cpu_utilisation:ratio
expr: |
node:node_cpu_utilisation:avg1m
*
node:node_num_cpu:sum
/
scalar(sum(node:node_num_cpu:sum))
- record: ':node_memory_utilisation:'
expr: |
1 -
sum(node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
/
sum(node_memory_MemTotal_bytes{app="node-exporter"})
- record: node:node_memory_bytes_available:sum
expr: |
sum by (node) (
(node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: node:node_memory_bytes_total:sum
expr: |
sum by (node) (
node_memory_MemTotal_bytes{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: node:node_memory_utilisation:ratio
expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
- record: :node_memory_swap_io_bytes:sum_rate
expr: |
1e3 * sum(
(rate(node_vmstat_pgpgin{app="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{app="node-exporter"}[1m]))
)
- record: 'node:node_memory_utilisation:'
expr: |
1 -
sum by (node) (
(node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
sum by (node) (
node_memory_MemTotal_bytes{app="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: 'node:node_memory_utilisation_2:'
expr: |
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
- record: node:node_memory_swap_io_bytes:sum_rate
expr: |
1e3 * sum by (node) (
(rate(node_vmstat_pgpgin{app="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{app="node-exporter"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: node:cluster_memory_utilisation:ratio
expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
- record: :node_disk_utilisation:avg_irate
expr: |
avg(irate(node_disk_io_time_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
- record: node:node_disk_utilisation:avg_irate
expr: |
avg by (node) (
irate(node_disk_io_time_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: :node_disk_saturation:avg_irate
expr: |
avg(irate(node_disk_io_time_weighted_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
- record: node:node_disk_saturation:avg_irate
expr: |
avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: 'node:node_filesystem_usage:'
expr: |
max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
- record: 'node:node_filesystem_avail:'
expr: |
max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
- record: :node_net_utilisation:sum_irate
expr: |
sum(irate(node_network_receive_bytes_total{app="node-exporter",device!~"veth.+"}[1m])) +
sum(irate(node_network_transmit_bytes_total{app="node-exporter",device!~"veth.+"}[1m]))
- record: node:node_net_utilisation:sum_irate
expr: |
sum by (node) (
(irate(node_network_receive_bytes_total{app="node-exporter",device!~"veth.+"}[1m]) +
irate(node_network_transmit_bytes_total{app="node-exporter",device!~"veth.+"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
- record: :node_net_saturation:sum_irate
expr: |
sum(irate(node_network_receive_drop_total{app="node-exporter",device!~"veth.+"}[1m])) +
sum(irate(node_network_transmit_drop_total{app="node-exporter",device!~"veth.+"}[1m]))
- record: node:node_net_saturation:sum_irate
expr: |
sum by (node) (
(irate(node_network_receive_drop_total{app="node-exporter",device!~"veth.+"}[1m]) +
irate(node_network_transmit_drop_total{app="node-exporter",device!~"veth.+"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
############################################################
# alerts
############################################################
- alert: NodeFilesystemSpaceFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of space within the next 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemspacefillingup
expr: |
predict_linear(node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"}[6h], 24*60*60) < 0
and
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} < 0.4
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemSpaceFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of space within the next 4 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemspacefillingup
expr: |
predict_linear(node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"}[6h], 4*60*60) < 0
and
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} < 0.2
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfSpace
annotations:
message:
Filesystem on node {{ $labels.node_name }} having IP {{ $labels.instance }} has only
{{ $value }}% available space left on drive {{ $labels.device }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
expr: |
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 10
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 30m
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfSpace
annotations:
message:
Filesystem on node {{ $labels.node_name }} having IP {{ $labels.instance }} has only
{{ $value }}% available space left on drive {{ $labels.device }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
expr: |
node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 5
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 10m
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemFilesFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of files within the next 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemfilesfillingup
expr: |
predict_linear(node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"}[6h], 24*60*60) < 0
and
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} < 0.4
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemFilesFillingUp
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
to run out of files within the next 4 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemfilesfillingup
expr: |
predict_linear(node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"}[6h], 4*60*60) < 0
and
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} < 0.2
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfFiles
annotations:
message:
Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
{{ $value }}% available inodes left.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutoffiles
expr: |
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 5
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: warning
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeFilesystemOutOfSpace
annotations:
message:
Filesystem on node {{ $labels.node_name }} having IP {{ $labels.instance }} has only {{ $value }}% inodes available on drive {{ $labels.device }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
expr: |
node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 10
and
node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeNetworkReceiveErrs
annotations:
message:
'{{ $labels.instance }} interface {{ $labels.device }} shows errors
while receiving packets ({{ $value }} errors in two minutes).'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodenetworkreceiveerrs
expr: increase(node_network_receive_errs_total[2m]) > 10
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'
- alert: NodeNetworkTransmitErrs
annotations:
message:
'{{ $labels.instance }} interface {{ $labels.device }} shows errors
while transmitting packets ({{ $value }} errors in two minutes).'
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodenetworktransmiterrs
expr: increase(node_network_transmit_errs_total[2m]) > 10
for: 1h
labels:
severity: critical
resource: '{{ $labels.instance }} {{ $labels.device }}'
service: 'node-exporter'


@@ -0,0 +1,95 @@
groups:
- name: prometheus
rules:
- alert: PromScrapeFailed
annotations:
message: Prometheus failed to scrape a target {{ $labels.job }} / {{ $labels.instance }}.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promscrapefailed
expr: up != 1
for: 15m
labels:
severity: warning
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check the Prometheus Service Discovery page to find out why the target is unreachable.
- alert: PromBadConfig
annotations:
message: Prometheus failed to reload config.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-prombadconfig
expr: prometheus_config_last_reload_successful{job="prometheus"} == 0
for: 15m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- Check the `prometheus-rules` configmap via `kubectl -n monitoring get configmap prometheus-rules -o yaml`.
- alert: PromAlertmanagerBadConfig
annotations:
message: Alertmanager failed to reload config.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promalertmanagerbadconfig
expr: alertmanager_config_last_reload_successful{job="alertmanager"} == 0
for: 10m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Alertmanager pod's logs via `kubectl -n monitoring logs alertmanager-0`, `-1` and `-2`.
- Check the `alertmanager` secret via `kubectl -n monitoring get secret alertmanager -o yaml`.
- alert: PromAlertsFailed
annotations:
message: Alertmanager failed to send an alert.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promalertsfailed
expr: sum(increase(alertmanager_notifications_failed_total{job="alertmanager"}[5m])) by (namespace) > 0
for: 5m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- 'Make sure the Alertmanager StatefulSet is running: `kubectl -n monitoring get pods`.'
- alert: PromRemoteStorageFailures
annotations:
message: Prometheus failed to send {{ printf "%.1f" $value }}% samples.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promremotestoragefailures
expr: |
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[1m]) * 100)
/
(rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[1m]))
> 1
for: 15m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Ensure that the Prometheus volume has not reached capacity.
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- alert: PromRuleFailures
annotations:
message: Prometheus failed to evaluate {{ printf "%.1f" $value }} rules/sec.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promrulefailures
expr: rate(prometheus_rule_evaluation_failures_total{job="prometheus"}[1m]) > 0
for: 15m
labels:
severity: critical
resource: '{{ $labels.job }}/{{ $labels.instance }}'
service: prometheus
runbook:
steps:
- Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
- Check CPU/memory pressure on the node.


@@ -0,0 +1,85 @@
groups:
- name: thanos
rules:
- alert: ThanosSidecarDown
annotations:
message: The Thanos sidecar in `{{ $labels.namespace }}/{{ $labels.pod }}` is down.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanossidecardown
expr: thanos_sidecar_prometheus_up != 1
for: 5m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosSidecarNoHeartbeat
annotations:
message: The Thanos sidecar in `{{ $labels.namespace }}/{{ $labels.pod }}` didn't send a heartbeat in {{ $value }} seconds.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanossidecardown
expr: time() - thanos_sidecar_last_heartbeat_success_time_seconds > 60
for: 3m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosCompactorManyRetries
annotations:
message: The Thanos compactor in `{{ $labels.namespace }}` is experiencing a high retry rate.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanoscompactormanyretries
expr: sum(rate(thanos_compact_retries_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
runbook:
steps:
- Check the `thanos-compact` pod's logs.
- alert: ThanosShipperManyDirSyncFailures
annotations:
message: The Thanos shipper in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing a high dir-sync failure rate.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosshippermanydirsyncfailures
expr: sum(rate(thanos_shipper_dir_sync_failures_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
runbook:
steps:
- Check the `thanos` container's logs inside the Prometheus pod.
- alert: ThanosManyPanicRecoveries
annotations:
message: The Thanos component in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing a high panic recovery rate.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanypanicrecoveries
expr: sum(rate(thanos_grpc_req_panics_recovered_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosManyBlockLoadFailures
annotations:
message: The Thanos store in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing many failed block loads.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanyblockloadfailures
expr: sum(rate(thanos_bucket_store_block_load_failures_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos
- alert: ThanosManyBlockDropFailures
annotations:
message: The Thanos store in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing many failed block drops.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanyblockdropfailures
expr: sum(rate(thanos_bucket_store_block_drop_failures_total[5m])) > 0.01
for: 10m
labels:
severity: warning
resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: thanos


@@ -0,0 +1,34 @@
groups:
- name: velero
rules:
- alert: VeleroBackupTakesTooLong
annotations:
message: Last backup with schedule {{ $labels.schedule }} has not finished successfully within 60min.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-velerobackuptakestoolong
expr: time() - velero_backup_last_successful_timestamp{schedule!=""} > 3600
for: 5m
labels:
severity: warning
resource: '{{ $labels.schedule }}'
service: 'velero'
runbook:
steps:
- Check if a backup is really in "InProgress" state via `velero -n velero backup get`.
- Check the backup logs via `velero -n velero backup logs [BACKUP_NAME]`.
- Depending on the backup, find the pod and check the processes inside that pod or any sidecar containers.
- alert: VeleroNoRecentBackup
annotations:
message: There has not been a successful backup for schedule {{ $labels.schedule }} in the last 24 hours.
runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-veleronorecentbackup
expr: time() - velero_backup_last_successful_timestamp{schedule!=""} > 3600*25
labels:
severity: critical
resource: '{{ $labels.schedule }}'
service: 'velero'
runbook:
steps:
- Check if really no backups happened via `velero -n velero backup get`.
- If a backup failed, check its logs via `velero -n velero backup logs [BACKUP_NAME]`.
- If a backup was not even triggered, check the Velero server's logs via `kubectl -n velero logs -l 'name=velero-server'`.
- Make sure the Velero server pod has not been rescheduled and possibly opt to schedule it on a stable node using a node affinity.


@@ -0,0 +1,23 @@
groups:
- name: vertical-pod-autoscaler
rules:
# These rules provide metrics to be consumed by Kubernetes' VPA. The VPA only needs a tiny fraction
# of the labels available on the container_* metrics, so we reduce them with the inner query to
# only contain pod name, namespace and name.
# Because the VPA does not allow changing the metric name it queries, only the job selector,
# we "cheat" by re-using the same metric name and injecting a custom job ("cadvisor-vpa") label.
- record: container_cpu_usage_seconds_total
expr: |
label_replace(
sum(container_cpu_usage_seconds_total{job="cadvisor", pod=~".+", name!="POD", name!=""}) by (pod, namespace, name),
"job", "cadvisor-vpa", "", ""
)
- record: container_memory_usage_bytes
expr: |
label_replace(
sum(container_memory_usage_bytes{job="cadvisor", pod=~".+", name!="POD", name!=""}) by (pod, namespace, name),
"job", "cadvisor-vpa", "", ""
)
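
For reference, once these recording rules are evaluated, the reduced series carry the injected job="cadvisor-vpa" label, so the VPA (or an ad-hoc check in the Prometheus UI) can select them with a query along these lines (a sketch, not part of this change; the namespace value is an arbitrary example):

container_cpu_usage_seconds_total{job="cadvisor-vpa", namespace="kube-system"}

This returns only the per-pod aggregates produced here rather than the full set of cadvisor container series.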


@@ -0,0 +1,9 @@
{{- range $path, $file := .Files.Glob "files/rules/**/*.yaml" }}
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: {{ toString $path | trimPrefix "files/rules/" | replace "/" "-" | replace ".yaml" "" }}
spec:
{{- toString $file | nindent 2 }}
---
{{- end }}
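
The template above glob-loads every rules file under files/rules/ and wraps each one in its own VMRule, deriving the object name from the file path (prefix trimmed, slashes turned into dashes, .yaml suffix dropped) and indenting the file's contents under spec. As a minimal sketch, assuming a hypothetical file at files/rules/kubermatic/velero.yaml that holds the velero group shown earlier (trimmed here for brevity), the rendered output would look roughly like:

apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: kubermatic-velero
spec:
  groups:
  - name: velero
    rules:
    - alert: VeleroNoRecentBackup
      expr: time() - velero_backup_last_successful_timestamp{schedule!=""} > 3600*25
      labels:
        severity: critical

The trailing --- emitted after each iteration separates the documents, so the rule files render into a single multi-document manifest of VMRule objects.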