Mirror of https://github.com/outbackdingo/cozystack.git, synced 2026-01-28 18:18:41 +00:00

Compare commits: v0.33.2...kubermatic (1 commit)

Commit 1b0a7eacca

packages/system/monitoring/charts/alerts/Chart.yaml (new file, 3 lines)
@@ -0,0 +1,3 @@
apiVersion: v1
name: alerts
version: v0.0.0
@@ -0,0 +1,48 @@
groups:
- name: blackbox-exporter
  rules:
  - alert: HttpProbeFailed
    annotations:
      message: Probing the blackbox-exporter target {{ $labels.instance }} failed.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobefailed
    expr: probe_success != 1
    for: 5m
    labels:
      severity: warning
      resource: '{{ $labels.instance }}'
      service: blackbox-exporter

  - alert: HttpProbeSlow
    annotations:
      message: '{{ $labels.instance }} takes {{ $value }} seconds to respond.'
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpprobeslow
    expr: sum by (instance) (probe_http_duration_seconds) > 3
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.instance }}'
      service: blackbox-exporter
    runbook:
      steps:
      - Check the target system's resource usage for anomalies.
      - Check if the target application has been recently rescheduled and is still settling.

  - alert: HttpCertExpiresSoon
    annotations:
      message: The certificate for {{ $labels.instance }} expires in less than 3 days.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiressoon
    expr: probe_ssl_earliest_cert_expiry - time() < 3*24*3600
    labels:
      severity: warning
      resource: '{{ $labels.instance }}'
      service: blackbox-exporter

  - alert: HttpCertExpiresVerySoon
    annotations:
      message: The certificate for {{ $labels.instance }} expires in less than 24 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-httpcertexpiresverysoon
    expr: probe_ssl_earliest_cert_expiry - time() < 24*3600
    labels:
      severity: critical
      resource: '{{ $labels.instance }}'
      service: blackbox-exporter
@@ -0,0 +1,44 @@
groups:
- name: cadvisor
  rules:
  - alert: CadvisorDown
    annotations:
      message: Cadvisor has disappeared from Prometheus target discovery.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-cadvisordown
    expr: absent(up{job="cadvisor"} == 1)
    for: 15m
    labels:
      severity: critical
      resource: cadvisor
      service: cadvisor

  - record: namespace:container_memory_usage_bytes:sum
    expr: |
      sum by (namespace) (
        container_memory_usage_bytes{job="cadvisor", image!="", container!=""}
      )

  - record: namespace:container_cpu_usage_seconds_total:sum_rate
    expr: |
      sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])) by (namespace)

  - record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
    expr: |
      sum by (namespace, pod, container) (
        rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])
      )

  # triggered by kernel bug, see issue kubermatic#2367

  # - alert: CPUThrottlingHigh
  #   annotations:
  #     message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for {{ $labels.container }}.'
  #     runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-cputhrottlinghigh
  #   expr: |
  #     100 * sum(increase(container_cpu_cfs_throttled_periods_total[5m])) by (container, pod, namespace)
  #     /
  #     sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace)
  #     > 25
  #   for: 15m
  #   labels:
  #     severity: warning
@@ -0,0 +1,22 @@
groups:
- name: cert-manager
  rules:
  - alert: CertManagerCertExpiresSoon
    annotations:
      message: The certificate {{ $labels.name }} expires in less than 3 days.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-certmanagercertexpiressoon
    expr: certmanager_certificate_expiration_timestamp_seconds - time() < 3*24*3600
    labels:
      severity: warning
      resource: '{{ $labels.name }}'
      service: cert-manager

  - alert: CertManagerCertExpiresVerySoon
    annotations:
      message: The certificate {{ $labels.name }} expires in less than 24 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-certmanagercertexpiresverysoon
    expr: certmanager_certificate_expiration_timestamp_seconds - time() < 24*3600
    labels:
      severity: critical
      resource: '{{ $labels.name }}'
      service: cert-manager
@@ -0,0 +1,20 @@
groups:
- name: helm-exporter
  rules:
  - alert: HelmReleaseNotDeployed
    annotations:
      message:
        The Helm release `{{ $labels.release }}` (`{{ $labels.chart }}` chart in namespace `{{ $labels.exported_namespace }}`)
        in version {{ $labels.version }} has not been ready for more than 15 minutes.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-helmreleasenotdeployed
    expr: helm_chart_info != 1
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.release }}'
      service: helm-exporter
    runbook:
      steps:
      - Check the installed Helm releases via `helm --namespace monitoring ls --all`.
      - If Helm cannot repair the chart automatically, delete/purge the chart (`helm delete --purge [RELEASE]`) and
        re-install the chart again.
@@ -0,0 +1,128 @@
groups:
- name: kube-apiserver
  rules:
  - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    expr: |
      histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
    labels:
      quantile: "0.99"

  - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    expr: |
      histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
    labels:
      quantile: "0.9"

  - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    expr: |
      histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
    labels:
      quantile: "0.5"

  ############################################################
  # alerts
  ############################################################

  - alert: KubernetesApiserverDown
    annotations:
      message: KubernetesApiserver has disappeared from Prometheus target discovery.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubernetesapiserverdown
    expr: absent(up{job="apiserver"} == 1)
    for: 15m
    labels:
      severity: critical
      resource: apiserver
      service: kubernetes

  - alert: KubeAPIErrorsHigh
    annotations:
      message: API server is returning errors for {{ $value }}% of requests.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapierrorshigh
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
      /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
    for: 10m
    labels:
      severity: critical
      resource: apiserver
      service: kubernetes

  - alert: KubeAPITerminatedRequests
    annotations:
      message: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapiterminatedrequests
    expr: |
      sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
      /
      (sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
    for: 5m
    labels:
      severity: warning
      resource: apiserver
      service: kubernetes

  - alert: KubeAPITerminatedRequests
    annotations:
      message: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapiterminatedrequests
    expr: |
      sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m]))
      /
      (sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
    for: 10m
    labels:
      severity: critical
      resource: apiserver
      service: kubernetes

  - alert: KubeAPIErrorsHigh
    annotations:
      message: API server is returning errors for {{ $value }}% of requests.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeapierrorshigh
    expr: |
      sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
      /
      sum(rate(apiserver_request_total{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
    for: 10m
    labels:
      severity: warning
      resource: apiserver
      service: kubernetes

  - alert: KubeClientCertificateExpiration
    annotations:
      message: A client certificate used to authenticate to the apiserver is expiring in less than 7 days.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclientcertificateexpiration
    expr: |
      apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
      and
      histogram_quantile(0.01, sum by (job, instance, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
    labels:
      severity: warning
      resource: apiserver
      service: kubernetes
    runbook:
      steps:
      - Check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/kubeadm-certs/) on how to renew certificates.
      - If your certificate has already expired, the steps in the documentation might not work. Check [Github](https://github.com/kubernetes/kubeadm/issues/581#issuecomment-421477139)
        for hints about fixing your cluster.

  - alert: KubeClientCertificateExpiration
    annotations:
      message: A client certificate used to authenticate to the apiserver is expiring in less than 24 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclientcertificateexpiration
    expr: |
      apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
      and
      histogram_quantile(0.01, sum by (job, instance, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
    labels:
      severity: critical
      resource: apiserver
      service: kubernetes
    runbook:
      steps:
      - Urgently renew your certificates. Expired certificates can make fixing the cluster difficult to begin with.
      - Check the [Kubernetes documentation](https://kubernetes.io/docs/tasks/administer-cluster/kubeadm/kubeadm-certs/) on how to renew certificates.
      - If your certificate has already expired, the steps in the documentation might not work. Check [Github](https://github.com/kubernetes/kubeadm/issues/581#issuecomment-421477139)
        for hints about fixing your cluster.
@@ -0,0 +1,143 @@
groups:
- name: kube-kubelet
  rules:
  - alert: KubeletDown
    annotations:
      message: Kubelet has disappeared from Prometheus target discovery.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletdown
    expr: absent(up{job="kubelet"} == 1)
    for: 15m
    labels:
      severity: critical
      resource: '{{ $labels.instance }}'
      service: kubelet

  - alert: KubePersistentVolumeUsageCritical
    annotations:
      message:
        The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in namespace
        {{ $labels.namespace }} is only {{ printf "%0.0f" $value }}% free.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepersistentvolumeusagecritical
    expr: |
      100 * kubelet_volume_stats_available_bytes{job="kubelet"}
      /
      kubelet_volume_stats_capacity_bytes{job="kubelet"}
      < 3
    for: 1m
    labels:
      severity: critical
      service: kubelet
      resource: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}'

  - alert: KubePersistentVolumeFullInFourDays
    annotations:
      message:
        Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }}
        in namespace {{ $labels.namespace }} is expected to fill up within four days.
        Currently {{ $value }} bytes are available.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepersistentvolumefullinfourdays
    expr: |
      (
        kubelet_volume_stats_used_bytes{job="kubelet"}
        /
        kubelet_volume_stats_capacity_bytes{job="kubelet"}
      ) > 0.85
      and
      predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
    for: 5m
    labels:
      severity: critical
      service: kubelet
      resource: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}'

  - alert: KubeletTooManyPods
    annotations:
      message: Kubelet {{ $labels.instance }} is running {{ $value }} pods, close to the limit of 110.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubelettoomanypods
    expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.instance }}'
      service: kubelet

  - alert: KubeClientErrors
    annotations:
      message:
        The kubelet on {{ $labels.instance }} is experiencing {{ printf "%0.0f" $value }}% errors.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclienterrors
    expr: |
      (sum(rate(rest_client_requests_total{code=~"(5..|<error>)",job="kubelet"}[5m])) by (instance)
      /
      sum(rate(rest_client_requests_total{job="kubelet"}[5m])) by (instance))
      * 100 > 1
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.instance }}'
      service: kubelet

  # a dedicated rule for pods to include more helpful labels in the message like the instance and job name
  - alert: KubeClientErrors
    annotations:
      message:
        The pod {{ $labels.namespace }}/{{ $labels.pod }} is experiencing {{ printf "%0.0f" $value }}% errors.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeclienterrors
    expr: |
      (sum(rate(rest_client_requests_total{code=~"(5..|<error>)",job="pods"}[5m])) by (namespace, pod)
      /
      sum(rate(rest_client_requests_total{job="pods"}[5m])) by (namespace, pod))
      * 100 > 1
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.instance }}'
      service: kubelet

  - alert: KubeletRuntimeErrors
    annotations:
      message:
        The kubelet on {{ $labels.instance }} is having an elevated error rate for container runtime operations.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletruntimeerrors
    expr: |
      sum(rate(kubelet_runtime_operations_errors_total{job="kubelet"}[5m])) by (instance) > 0.1
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.instance }}'
      service: kubelet

  - alert: KubeletCGroupManagerDurationHigh
    annotations:
      message:
        The kubelet's cgroup manager duration on {{ $labels.instance }} has been elevated ({{ printf "%0.2f" $value }}ms) for more than 15 minutes.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletcgroupmanagerlatencyhigh
    expr: |
      sum(rate(kubelet_cgroup_manager_duration_seconds{quantile="0.9"}[5m])) by (instance) * 1000 > 1
    for: 15m
    labels:
      resource: '{{ $labels.instance }}'
      service: kubelet
      severity: warning

  - alert: KubeletPodWorkerDurationHigh
    annotations:
      message:
        The kubelet's pod worker duration for {{ $labels.operation_type }} operations on {{ $labels.instance }} has been elevated ({{ printf "%0.2f" $value }}ms) for more than 15 minutes.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeletpodworkerdurationhigh
    expr: |
      sum(rate(kubelet_pod_worker_duration_seconds{quantile="0.9"}[5m])) by (instance, operation_type) * 1000 > 250
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.instance }}/{{ $labels.operation_type }}'
      service: kubelet

  - alert: KubeVersionMismatch
    annotations:
      message: There are {{ $value }} different versions of Kubernetes components running.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubeversionmismatch
    expr: count(count(kubernetes_build_info{job!="dns"}) by (gitVersion)) > 1
    for: 1h
    labels:
      severity: warning
@@ -0,0 +1,339 @@
groups:
- name: kube-state-metrics
  rules:
  - record: ':kube_pod_info_node_count:'
    expr: |
      sum(min(kube_pod_info) by (node))

  - record: 'node_namespace_pod:kube_pod_info:'
    expr: |
      max(kube_pod_info{job="kube-state-metrics"}) by (node, namespace, pod)

  - record: namespace_name:container_cpu_usage_seconds_total:sum_rate
    expr: |
      sum by (namespace, label_name) (
        sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!="", container!=""}[5m])) by (namespace, pod)
        * on (namespace, pod) group_left (label_name)
        kube_pod_labels{job="kube-state-metrics"}
      )

  - record: namespace_name:container_memory_usage_bytes:sum
    expr: |
      sum by (namespace, label_name) (
        sum(container_memory_usage_bytes{job="cadvisor",image!="", container!=""}) by (pod, namespace)
        * on (namespace, pod) group_left (label_name)
        kube_pod_labels{job="kube-state-metrics"}
      )

  - record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
    expr: |
      sum by (namespace, label_name) (
        sum(kube_pod_container_resource_requests{job="kube-state-metrics",resource="memory"}) by (namespace, pod)
        * on (namespace, pod) group_left (label_name)
        kube_pod_labels{job="kube-state-metrics"}
      )

  - record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
    expr: |
      sum by (namespace, label_name) (
        sum(kube_pod_container_resource_requests{job="kube-state-metrics",resource="cpu"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
        * on (namespace, pod) group_left (label_name)
        kube_pod_labels{job="kube-state-metrics"}
      )

  ############################################################
  # alerts
  ############################################################

  - alert: KubeStateMetricsDown
    annotations:
      message: KubeStateMetrics has disappeared from Prometheus target discovery.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatemetricsdown
    expr: absent(up{job="kube-state-metrics"} == 1)
    for: 15m
    labels:
      severity: critical
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: kube-state-metrics

  - alert: KubePodCrashLooping
    annotations:
      message:
        Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting
        {{ printf "%.2f" $value }} times / 5 minutes.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodcrashlooping
    expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
    for: 1h
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.pod }}"
    runbook:
      steps:
      - Check the pod's logs.

  - alert: KubePodNotReady
    annotations:
      message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodnotready
    expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
    for: 30m
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.pod }}"
    runbook:
      steps:
      - Check the pod via `kubectl describe pod [POD]` to find out about scheduling issues.

  - alert: KubeDeploymentGenerationMismatch
    annotations:
      message:
        Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match,
        this indicates that the Deployment has failed but has not been rolled back.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedeploymentgenerationmismatch
    expr: |
      kube_deployment_status_observed_generation{job="kube-state-metrics"}
      !=
      kube_deployment_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.deployment }}"

  - alert: KubeDeploymentReplicasMismatch
    annotations:
      message:
        Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected
        number of replicas for longer than an hour.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedeploymentreplicasmismatch
    expr: |
      kube_deployment_spec_replicas{job="kube-state-metrics"}
      !=
      kube_deployment_status_replicas_available{job="kube-state-metrics"}
    for: 1h
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.deployment }}"

  - alert: KubeStatefulSetReplicasMismatch
    annotations:
      message:
        StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected
        number of replicas for longer than 15 minutes.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetreplicasmismatch
    expr: |
      kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
      !=
      kube_statefulset_status_replicas{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"

  - alert: KubeStatefulSetGenerationMismatch
    annotations:
      message:
        StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match,
        this indicates that the StatefulSet has failed but has not been rolled back.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetgenerationmismatch
    expr: |
      kube_statefulset_status_observed_generation{job="kube-state-metrics"}
      !=
      kube_statefulset_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"

  - alert: KubeStatefulSetUpdateNotRolledOut
    annotations:
      message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubestatefulsetupdatenotrolledout
    expr: |
      max without (revision) (
        kube_statefulset_status_current_revision{job="kube-state-metrics"}
        unless
        kube_statefulset_status_update_revision{job="kube-state-metrics"}
      )
      *
      (
        kube_statefulset_replicas{job="kube-state-metrics"}
        !=
        kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
      )
    for: 15m
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.statefulset }}"

  - alert: KubeDaemonSetRolloutStuck
    annotations:
      message:
        Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }}
        are scheduled and ready.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetrolloutstuck
    expr: |
      kube_daemonset_status_number_ready{job="kube-state-metrics"}
      /
      kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
    for: 15m
    labels:
      severity: critical
      resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"

  - alert: KubeDaemonSetNotScheduled
    annotations:
      message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetnotscheduled
    expr: |
      kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
      -
      kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
    for: 10m
    labels:
      severity: warning
      resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"

  - alert: KubeDaemonSetMisScheduled
    annotations:
      message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubedaemonsetmisscheduled
    expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
    for: 10m
    labels:
      severity: warning
      resource: "{{ $labels.namespace }}/{{ $labels.daemonset }}"

  - alert: KubeCronJobRunning
    annotations:
      message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecronjobrunning
    expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
    for: 1h
    labels:
      severity: warning
      resource: "{{ $labels.namespace }}/{{ $labels.cronjob }}"

  - alert: KubeJobCompletion
    annotations:
      message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubejobcompletion
    expr: kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
    for: 1h
    labels:
      severity: warning
      resource: "{{ $labels.namespace }}/{{ $labels.job_name }}"

  - alert: KubeJobFailed
    annotations:
      message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubejobfailed
    expr: kube_job_status_failed{job="kube-state-metrics"} > 0
    for: 1h
    labels:
      severity: warning
      resource: "{{ $labels.namespace }}/{{ $labels.job_name }}"

  - alert: KubeCPUOvercommit
    annotations:
      message: Cluster has overcommitted CPU resource requests for namespaces.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecpuovercommit
    expr: |
      sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"})
      /
      sum(node:node_num_cpu:sum)
      > 1.5
    for: 5m
    labels:
      severity: warning
      resource: cluster
      service: kube-state-metrics

  - alert: KubeCPUOvercommit
    annotations:
      message: Cluster has overcommitted CPU resource requests for pods and cannot tolerate node failure.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubecpuovercommit
    expr: |
      sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
      /
      sum(node:node_num_cpu:sum)
      >
      (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
    for: 5m
    labels:
      severity: warning
      resource: cluster
      service: kube-state-metrics

  - alert: KubeMemOvercommit
    annotations:
      message: Cluster has overcommitted memory resource requests for namespaces.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubememovercommit
    expr: |
      sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"})
      /
      sum(node_memory_MemTotal_bytes{app="node-exporter"})
      > 1.5
    for: 5m
    labels:
      severity: warning
      resource: cluster
      service: kube-state-metrics

  - alert: KubeMemOvercommit
    annotations:
      message: Cluster has overcommitted memory resource requests for pods and cannot tolerate node failure.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubememovercommit
    expr: |
      sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
      /
      sum(node_memory_MemTotal_bytes)
      >
      (count(node:node_num_cpu:sum)-1)
      /
      count(node:node_num_cpu:sum)
    for: 5m
    labels:
      severity: warning
      resource: cluster
      service: kube-state-metrics

  - alert: KubeQuotaExceeded
    annotations:
      message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubequotaexceeded
    expr: |
      100 * kube_resourcequota{job="kube-state-metrics", type="used"}
      / ignoring(instance, job, type)
      (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
      > 90
    for: 15m
    labels:
      severity: warning
      resource: cluster
      service: kube-state-metrics

  - alert: KubePodOOMKilled
    annotations:
      message:
        Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}
        has been OOMKilled {{ $value }} times in the last 30 minutes.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubepodoomkilled
    expr: |
      (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 30m >= 2)
      and
      ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[30m]) == 1
    for: 0m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container }}'

  - alert: KubeNodeNotReady
    annotations:
      message: '{{ $labels.node }} has been unready for more than an hour.'
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-kubenodenotready
    expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
    for: 1h
    labels:
      severity: warning
      resource: '{{ $labels.node }}'
@@ -0,0 +1,342 @@
groups:
- name: node-exporter
  rules:
  - record: node:node_num_cpu:sum
    expr: |
      count by (node) (sum by (node, cpu) (
        node_cpu_seconds_total{app="node-exporter"}
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      ))

  - record: :node_cpu_utilisation:avg1m
    expr: |
      1 - avg(rate(node_cpu_seconds_total{app="node-exporter",mode="idle"}[1m]))

  - record: node:node_cpu_utilisation:avg1m
    expr: |
      1 - avg by (node) (
        rate(node_cpu_seconds_total{app="node-exporter",mode="idle"}[1m])
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:)

  - record: ':node_cpu_saturation_load1:'
    expr: |
      sum(node_load1{app="node-exporter"})
      /
      sum(node:node_num_cpu:sum)

  - record: 'node:node_cpu_saturation_load1:'
    expr: |
      sum by (node) (
        node_load1{app="node-exporter"}
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
      /
      node:node_num_cpu:sum

  - record: node:cluster_cpu_utilisation:ratio
    expr: |
      node:node_cpu_utilisation:avg1m
      *
      node:node_num_cpu:sum
      /
      scalar(sum(node:node_num_cpu:sum))

  - record: ':node_memory_utilisation:'
    expr: |
      1 -
      sum(node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
      /
      sum(node_memory_MemTotal_bytes{app="node-exporter"})

  - record: node:node_memory_bytes_available:sum
    expr: |
      sum by (node) (
        (node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  - record: node:node_memory_bytes_total:sum
    expr: |
      sum by (node) (
        node_memory_MemTotal_bytes{app="node-exporter"}
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  - record: node:node_memory_utilisation:ratio
    expr: |
      (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
      /
      scalar(sum(node:node_memory_bytes_total:sum))

  - record: :node_memory_swap_io_bytes:sum_rate
    expr: |
      1e3 * sum(
        (rate(node_vmstat_pgpgin{app="node-exporter"}[1m])
        + rate(node_vmstat_pgpgout{app="node-exporter"}[1m]))
      )

  - record: 'node:node_memory_utilisation:'
    expr: |
      1 -
      sum by (node) (
        (node_memory_MemFree_bytes{app="node-exporter"} + node_memory_Cached_bytes{app="node-exporter"} + node_memory_Buffers_bytes{app="node-exporter"})
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
      /
      sum by (node) (
        node_memory_MemTotal_bytes{app="node-exporter"}
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  - record: 'node:node_memory_utilisation_2:'
    expr: |
      1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)

  - record: node:node_memory_swap_io_bytes:sum_rate
    expr: |
      1e3 * sum by (node) (
        (rate(node_vmstat_pgpgin{app="node-exporter"}[1m])
        + rate(node_vmstat_pgpgout{app="node-exporter"}[1m]))
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  - record: node:cluster_memory_utilisation:ratio
    expr: |
      (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
      /
      scalar(sum(node:node_memory_bytes_total:sum))

  - record: :node_disk_utilisation:avg_irate
    expr: |
      avg(irate(node_disk_io_time_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))

  - record: node:node_disk_utilisation:avg_irate
    expr: |
      avg by (node) (
        irate(node_disk_io_time_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  - record: :node_disk_saturation:avg_irate
    expr: |
      avg(irate(node_disk_io_time_weighted_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)

  - record: node:node_disk_saturation:avg_irate
    expr: |
      avg by (node) (
        irate(node_disk_io_time_weighted_seconds_total{app="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  - record: 'node:node_filesystem_usage:'
    expr: |
      max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
      - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})

  - record: 'node:node_filesystem_avail:'
    expr: |
      max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})

  - record: :node_net_utilisation:sum_irate
    expr: |
      sum(irate(node_network_receive_bytes_total{app="node-exporter",device!~"veth.+"}[1m])) +
      sum(irate(node_network_transmit_bytes_total{app="node-exporter",device!~"veth.+"}[1m]))

  - record: node:node_net_utilisation:sum_irate
    expr: |
      sum by (node) (
        (irate(node_network_receive_bytes_total{app="node-exporter",device!~"veth.+"}[1m]) +
        irate(node_network_transmit_bytes_total{app="node-exporter",device!~"veth.+"}[1m]))
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  - record: :node_net_saturation:sum_irate
    expr: |
      sum(irate(node_network_receive_drop_total{app="node-exporter",device!~"veth.+"}[1m])) +
      sum(irate(node_network_transmit_drop_total{app="node-exporter",device!~"veth.+"}[1m]))

  - record: node:node_net_saturation:sum_irate
    expr: |
      sum by (node) (
        (irate(node_network_receive_drop_total{app="node-exporter",device!~"veth.+"}[1m]) +
        irate(node_network_transmit_drop_total{app="node-exporter",device!~"veth.+"}[1m]))
        * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )

  ############################################################
  # alerts
  ############################################################

  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      message:
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
        to run out of space within the next 24 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemspacefillingup
    expr: |
      predict_linear(node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"}[6h], 24*60*60) < 0
      and
      node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} < 0.4
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 1h
    labels:
      severity: warning
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeFilesystemSpaceFillingUp
    annotations:
      message:
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
        to run out of space within the next 4 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemspacefillingup
    expr: |
      predict_linear(node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"}[6h], 4*60*60) < 0
      and
      node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} < 0.2
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 1h
    labels:
      severity: critical
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeFilesystemOutOfSpace
    annotations:
      message:
        Filesystem on node {{ $labels.node_name }} having IP {{ $labels.instance }} has only
        {{ $value }}% available space left on drive {{ $labels.device }}.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
    expr: |
      node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 10
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 30m
    labels:
      severity: warning
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeFilesystemOutOfSpace
    annotations:
      message:
        Filesystem on node {{ $labels.node_name }} having IP {{ $labels.instance }} has only
        {{ $value }}% available space left on drive {{ $labels.device }}.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
    expr: |
      node_filesystem_avail_bytes{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_size_bytes{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 5
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 10m
    labels:
      severity: critical
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeFilesystemFilesFillingUp
    annotations:
      message:
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
        to run out of files within the next 24 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemfilesfillingup
    expr: |
      predict_linear(node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"}[6h], 24*60*60) < 0
      and
      node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} < 0.4
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 1h
    labels:
      severity: warning
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeFilesystemFilesFillingUp
    annotations:
      message:
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} is predicted
        to run out of files within the next 4 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemfilesfillingup
    expr: |
      predict_linear(node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"}[6h], 4*60*60) < 0
      and
      node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} < 0.2
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 1h
    labels:
      severity: warning
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeFilesystemOutOfFiles
    annotations:
      message:
        Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only
        {{ $value }}% available inodes left.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutoffiles
    expr: |
      node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 5
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 1h
    labels:
      severity: warning
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeFilesystemOutOfSpace
    annotations:
      message:
        Filesystem on node {{ $labels.node_name }} having IP {{ $labels.instance }} has only {{ $value }}% inodes available on drive {{ $labels.device }}.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodefilesystemoutofspace
    expr: |
      node_filesystem_files_free{app="node-exporter",fstype=~"ext.|xfs"} / node_filesystem_files{app="node-exporter",fstype=~"ext.|xfs"} * 100 < 10
      and
      node_filesystem_readonly{app="node-exporter",fstype=~"ext.|xfs"} == 0
    for: 1h
    labels:
      severity: critical
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeNetworkReceiveErrs
    annotations:
      message:
        '{{ $labels.instance }} interface {{ $labels.device }} shows errors
        while receiving packets ({{ $value }} errors in two minutes).'
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodenetworkreceiveerrs
    expr: increase(node_network_receive_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: critical
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'

  - alert: NodeNetworkTransmitErrs
    annotations:
      message:
        '{{ $labels.instance }} interface {{ $labels.device }} shows errors
        while transmitting packets ({{ $value }} errors in two minutes).'
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-nodenetworktransmiterrs
    expr: increase(node_network_transmit_errs_total[2m]) > 10
    for: 1h
    labels:
      severity: critical
      resource: '{{ $labels.instance }} {{ $labels.device }}'
      service: 'node-exporter'
@@ -0,0 +1,95 @@
groups:
- name: prometheus
  rules:
  - alert: PromScrapeFailed
    annotations:
      message: Prometheus failed to scrape a target {{ $labels.job }} / {{ $labels.instance }}.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promscrapefailed
    expr: up != 1
    for: 15m
    labels:
      severity: warning
      resource: '{{ $labels.job }}/{{ $labels.instance }}'
      service: prometheus
    runbook:
      steps:
      - Check the Prometheus Service Discovery page to find out why the target is unreachable.

  - alert: PromBadConfig
    annotations:
      message: Prometheus failed to reload config.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-prombadconfig
    expr: prometheus_config_last_reload_successful{job="prometheus"} == 0
    for: 15m
    labels:
      severity: critical
      resource: '{{ $labels.job }}/{{ $labels.instance }}'
      service: prometheus
    runbook:
      steps:
      - Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
      - Check the `prometheus-rules` configmap via `kubectl -n monitoring get configmap prometheus-rules -o yaml`.

  - alert: PromAlertmanagerBadConfig
    annotations:
      message: Alertmanager failed to reload config.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promalertmanagerbadconfig
    expr: alertmanager_config_last_reload_successful{job="alertmanager"} == 0
    for: 10m
    labels:
      severity: critical
      resource: '{{ $labels.job }}/{{ $labels.instance }}'
      service: prometheus
    runbook:
      steps:
      - Check Alertmanager pod's logs via `kubectl -n monitoring logs alertmanager-0`, `-1` and `-2`.
      - Check the `alertmanager` secret via `kubectl -n monitoring get secret alertmanager -o yaml`.

  - alert: PromAlertsFailed
    annotations:
      message: Alertmanager failed to send an alert.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promalertsfailed
    expr: sum(increase(alertmanager_notifications_failed_total{job="alertmanager"}[5m])) by (namespace) > 0
    for: 5m
    labels:
      severity: critical
      resource: '{{ $labels.job }}/{{ $labels.instance }}'
      service: prometheus
    runbook:
      steps:
      - Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
      - 'Make sure the Alertmanager StatefulSet is running: `kubectl -n monitoring get pods`.'

  - alert: PromRemoteStorageFailures
    annotations:
      message: Prometheus failed to send {{ printf "%.1f" $value }}% of samples.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promremotestoragefailures
    expr: |
      (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[1m]) * 100)
      /
      (rate(prometheus_remote_storage_failed_samples_total{job="prometheus"}[1m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus"}[1m]))
      > 1
    for: 15m
    labels:
      severity: critical
      resource: '{{ $labels.job }}/{{ $labels.instance }}'
      service: prometheus
    runbook:
      steps:
      - Ensure that the Prometheus volume has not reached capacity.
      - Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.

  - alert: PromRuleFailures
    annotations:
      message: Prometheus failed to evaluate {{ printf "%.1f" $value }} rules/sec.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-promrulefailures
    expr: rate(prometheus_rule_evaluation_failures_total{job="prometheus"}[1m]) > 0
    for: 15m
    labels:
      severity: critical
      resource: '{{ $labels.job }}/{{ $labels.instance }}'
      service: prometheus
    runbook:
      steps:
      - Check Prometheus pod's logs via `kubectl -n monitoring logs prometheus-0` and `-1`.
      - Check CPU/memory pressure on the node.
@@ -0,0 +1,85 @@
groups:
- name: thanos
  rules:
  - alert: ThanosSidecarDown
    annotations:
      message: The Thanos sidecar in `{{ $labels.namespace }}/{{ $labels.pod }}` is down.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanossidecardown
    expr: thanos_sidecar_prometheus_up != 1
    for: 5m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: thanos

  - alert: ThanosSidecarNoHeartbeat
    annotations:
      message: The Thanos sidecar in `{{ $labels.namespace }}/{{ $labels.pod }}` didn't send a heartbeat in {{ $value }} seconds.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanossidecardown
    expr: time() - thanos_sidecar_last_heartbeat_success_time_seconds > 60
    for: 3m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: thanos

  - alert: ThanosCompactorManyRetries
    annotations:
      message: The Thanos compactor in `{{ $labels.namespace }}` is experiencing a high retry rate.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanoscompactormanyretries
    expr: sum(rate(thanos_compact_retries_total[5m])) > 0.01
    for: 10m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: thanos
    runbook:
      steps:
      - Check the `thanos-compact` pod's logs.

  - alert: ThanosShipperManyDirSyncFailures
    annotations:
      message: The Thanos shipper in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing a high dir-sync failure rate.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosshippermanydirsyncfailures
    expr: sum(rate(thanos_shipper_dir_sync_failures_total[5m])) > 0.01
    for: 10m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: thanos
    runbook:
      steps:
      - Check the `thanos` container's logs inside the Prometheus pod.

  - alert: ThanosManyPanicRecoveries
    annotations:
      message: The Thanos component in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing an elevated panic recovery rate.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanypanicrecoveries
    expr: sum(rate(thanos_grpc_req_panics_recovered_total[5m])) > 0.01
    for: 10m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: thanos

  - alert: ThanosManyBlockLoadFailures
    annotations:
      message: The Thanos store in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing many failed block loads.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanyblockloadfailures
    expr: sum(rate(thanos_bucket_store_block_load_failures_total[5m])) > 0.01
    for: 10m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: thanos

  - alert: ThanosManyBlockDropFailures
    annotations:
      message: The Thanos store in `{{ $labels.namespace }}/{{ $labels.pod }}` is experiencing many failed block drops.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-thanosmanyblockdropfailures
    expr: sum(rate(thanos_bucket_store_block_drop_failures_total[5m])) > 0.01
    for: 10m
    labels:
      severity: warning
      resource: '{{ $labels.namespace }}/{{ $labels.pod }}'
      service: thanos
@@ -0,0 +1,34 @@
groups:
- name: velero
  rules:
  - alert: VeleroBackupTakesTooLong
    annotations:
      message: Last backup with schedule {{ $labels.schedule }} has not finished successfully within 60min.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-velerobackuptakestoolong
    expr: time() - velero_backup_last_successful_timestamp{schedule!=""} > 3600
    for: 5m
    labels:
      severity: warning
      resource: '{{ $labels.schedule }}'
      service: 'velero'
    runbook:
      steps:
      - Check if a backup is really in "InProgress" state via `velero -n velero backup get`.
      - Check the backup logs via `velero -n velero backup logs [BACKUP_NAME]`.
      - Depending on the backup, find the pod and check the processes inside that pod or any sidecar containers.

  - alert: VeleroNoRecentBackup
    annotations:
      message: There has not been a successful backup for schedule {{ $labels.schedule }} in the last 24 hours.
      runbook_url: https://docs.kubermatic.com/kubermatic/master/cheat_sheets/alerting_runbook/#alert-veleronorecentbackup
    expr: time() - velero_backup_last_successful_timestamp{schedule!=""} > 3600*25
    labels:
      severity: critical
      resource: '{{ $labels.schedule }}'
      service: 'velero'
    runbook:
      steps:
      - Check if really no backups happened via `velero -n velero backup get`.
      - If a backup failed, check its logs via `velero -n velero backup logs [BACKUP_NAME]`.
      - If a backup was not even triggered, check the Velero server's logs via `kubectl -n velero logs -l 'name=velero-server'`.
      - Make sure the Velero server pod has not been rescheduled and possibly opt to schedule it on a stable node using a node affinity.
@@ -0,0 +1,23 @@
groups:
- name: vertical-pod-autoscaler
  rules:

  # These rules provide metrics to be consumed by Kubernetes' VPA. The VPA only needs a tiny fraction
  # of the labels available on the container_* metrics, so we reduce them with the inner query to
  # only contain pod name, namespace and name.
  # Because the VPA does not allow changing the metric name it queries, but only the job selector,
  # we "cheat" by re-using the same metric name and injecting a custom job ("cadvisor-vpa") label.

  - record: container_cpu_usage_seconds_total
    expr: |
      label_replace(
        sum(container_cpu_usage_seconds_total{job="cadvisor", pod=~".+", name!="POD", name!=""}) by (pod, namespace, name),
        "job", "cadvisor-vpa", "", ""
      )

  - record: container_memory_usage_bytes
    expr: |
      label_replace(
        sum(container_memory_usage_bytes{job="cadvisor", pod=~".+", name!="POD", name!=""}) by (pod, namespace, name),
        "job", "cadvisor-vpa", "", ""
      )
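The comment above relies on pointing the VPA recommender at the injected `cadvisor-vpa` job instead of the real cadvisor scrape job. As a rough illustration (not part of this commit), a recommender reading its history from Prometheus-compatible storage might be configured as in the sketch below; the flag names, image tag, and in-cluster address are assumptions and should be verified against the VPA version actually deployed.

# Hedged sketch only -- the flags and the address below are assumptions, not taken from this commit.
containers:
- name: recommender
  image: registry.k8s.io/autoscaling/vpa-recommender:1.0.0          # hypothetical image/tag
  args:
  - --storage=prometheus                                            # assumed flag: read usage history from Prometheus-compatible storage
  - --prometheus-address=http://vmselect.monitoring.svc:8481        # hypothetical in-cluster VictoriaMetrics/Prometheus endpoint
  - --prometheus-cadvisor-job-name=cadvisor-vpa                     # assumed flag: must match the job label injected by the rules above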
@@ -0,0 +1,9 @@
{{- range $path, $file := .Files.Glob "files/rules/**/*.yaml" }}
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: {{ toString $path | trimPrefix "files/rules/" | replace "/" "-" | replace ".yaml" "" }}
spec:
  {{- toString $file | nindent 2 }}
---
{{- end }}
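This template renders one VMRule per rule file: the path under files/rules/ becomes the resource name (slashes turned into dashes, the .yaml suffix dropped) and the file body is nested under spec. For illustration only -- the actual paths under files/rules/ are not shown in this diff -- a hypothetical file files/rules/kubermatic/blackbox-exporter.yaml would render roughly as the abridged manifest below.

# Hedged, abridged example of a rendered VMRule; the "kubermatic/" subdirectory is hypothetical.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: kubermatic-blackbox-exporter
spec:
  groups:
  - name: blackbox-exporter
    rules:
    - alert: HttpProbeFailed
      expr: probe_success != 1
      for: 5m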