Rework alerts; Add fluxcd alerts (#388)

- Rework alerts
- Add fluxcd alerts

---------

Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
Author: Andrei Kvapil
Date: 2024-10-03 15:59:49 +02:00
Committed by: GitHub
Parent: 929ab5c5eb
Commit: b605c85eb2

80 changed files with 3191 additions and 7929 deletions


@@ -19,26 +19,3 @@ update:
helm repo add fluent https://fluent.github.io/helm-charts
helm repo update fluent
helm pull fluent/fluent-bit --untar --untardir charts
# alerts from victoria-metrics-k8s-stack
helm repo add vm https://victoriametrics.github.io/helm-charts/
helm repo update vm
helm pull vm/victoria-metrics-k8s-stack --untar --untardir charts
rm -rf charts/victoria-metrics-k8s-stack/charts
rm -rf charts/victoria-metrics-k8s-stack/hack
rm -rf charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator
rm -rf charts/victoria-metrics-k8s-stack/templates/grafana
rm -rf charts/victoria-metrics-k8s-stack/templates/ingress.yaml
rm -rf charts/victoria-metrics-k8s-stack/files/dashboards
rm -f charts/victoria-metrics-k8s-stack/templates/servicemonitors.yaml
rm -f charts/victoria-metrics-k8s-stack/templates/serviceaccount.yaml
rm -f charts/victoria-metrics-k8s-stack/templates/rules/additionalVictoriaMetricsRules.yml
sed -i '/ namespace:/d' charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml
sed -i 's|job="apiserver"|job="kube-apiserver"|g' `grep -rl 'job="apiserver"' charts/victoria-metrics-k8s-stack/files/rules/generated`
sed -i 's|severity: info|severity: informational|g' `grep -rl 'severity: info' ./charts/victoria-metrics-k8s-stack/files/rules/generated`
sed -i 's|severity: none|severity: ok|g' ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml
sed -i ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml \
-e '/Watchdog/,/severity:/s/severity: none/severity: ok/' \
-e '/InfoInhibitor/,/severity:/s/severity: none/severity: major/'
# TODO
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/vm*.yaml
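
For reference, the sed rewrites in this hunk normalize the severity vocabulary of the vendored rules; the committed rule files below reflect the result (Watchdog carries severity: ok, InfoInhibitor severity: major). A minimal sketch of the mapping, using a placeholder alert that is not taken from the chart:

# severity normalization applied to generated rules (sketch; alert is hypothetical)
- alert: SomeGeneratedAlert
  expr: vector(1)
  labels:
    severity: informational   # was: severity: info
# scoped rewrites in general.rules.yaml:
#   Watchdog:      severity: none -> severity: ok
#   InfoInhibitor: severity: none -> severity: major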


@@ -0,0 +1,221 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-etcd
spec:
groups:
- name: etcd
params: {}
rules:
- alert: etcdMembersDown
annotations:
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
}}).'
summary: etcd cluster members are down.
expr: |-
max without (endpoint) (
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
or
count without (To) (
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
)
)
> 0
for: 10m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdInsufficientMembers
annotations:
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
summary: etcd cluster has insufficient number of members.
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdNoLeader
annotations:
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance
}} has no leader.'
summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdHighNumberOfLeaderChanges
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
within the last 15 minutes. Frequent elections may be a sign of insufficient
resources, high network latency, or disruptions by other components and
should be investigated.'
summary: etcd cluster has high number of leader changes.
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
>= 4
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests
for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance
}}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
service: etcd
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests
for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance
}}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
service: etcd
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
}} method.'
summary: etcd grpc requests are slow.
expr: |-
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
service: etcd
- alert: etcdMemberCommunicationSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": member communication with
{{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
}}.'
summary: etcd cluster member communication is slow.
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.To }}'
service: etcd
- alert: etcdHighNumberOfFailedProposals
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdHighCommitDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
{{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdDatabaseQuotaLowSpace
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the
defined quota on etcd instance {{ $labels.instance }}, please defrag or
increase the quota as the writes to etcd will be disabled when it is full.'
summary: etcd cluster database is running full.
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])
/ last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100
> 95
for: 10m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of
disk space in the next four hours, based on write observations within the
past four hours on etcd instance {{ $labels.instance }}, please check as
it might be disruptive.'
summary: etcd cluster database growing very fast.
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h],
4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: etcd
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
retrieve the unused fragmented disk space.'
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
summary: etcd database size in use is less than 50% of the actual allocated
storage.
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) <
0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: etcd
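
The service and exported_instance labels attached to every rule here are not part of the upstream etcd mixin; they read as stable grouping and routing keys for downstream consumers. A minimal Alertmanager routing sketch that would use them (receiver names are placeholders, not defined anywhere in this commit):

route:
  group_by: [service, exported_instance, alertname]
  routes:
    - matchers:
        - service = "etcd"
      receiver: etcd-oncall     # placeholder receiver
receivers:
  - name: etcd-oncall           # placeholder; wire to a real integration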


@@ -0,0 +1,128 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
annotations:
meta.helm.sh/release-name: monitoring
meta.helm.sh/release-namespace: cozy-monitoring
labels:
app: victoria-metrics-k8s-stack
app.kubernetes.io/instance: monitoring
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/name: victoria-metrics-k8s-stack
app.kubernetes.io/version: v1.102.1
helm.sh/chart: victoria-metrics-k8s-stack-0.25.17
name: alerts-flux-resources
namespace: cozy-monitoring
spec:
groups:
- name: flux-resources-alerts
rules:
- alert: HelmReleaseNotReady
expr: gotk_resource_info{customresource_kind="HelmRelease", ready!="True"} > 0
for: 5m
labels:
severity: major
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready"
description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is in an unready state for more than 15 minutes."
- alert: GitRepositorySyncFailed
expr: gotk_resource_info{customresource_kind="GitRepository", ready!="True"} > 0
for: 5m
labels:
severity: major
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 15 minutes."
- alert: KustomizationNotApplied
expr: gotk_resource_info{customresource_kind="Kustomization", ready!="True"} > 0
for: 5m
labels:
severity: major
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not applied"
description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not successfully applied for more than 15 minutes."
- alert: ImageRepositorySyncFailed
expr: gotk_resource_info{customresource_kind="ImageRepository", ready!="True"} > 0
for: 5m
labels:
severity: major
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 15 minutes."
- alert: HelmChartFailed
expr: gotk_resource_info{customresource_kind="HelmChart", ready!="True"} > 0
for: 5m
labels:
severity: major
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has failed"
description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready for more than 15 minutes."
- alert: HelmReleaseSuspended
expr: gotk_resource_info{customresource_kind="HelmRelease", suspended="true"} > 0
for: 5m
labels:
severity: warning
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
- alert: GitRepositorySuspended
expr: gotk_resource_info{customresource_kind="GitRepository", suspended="true"} > 0
for: 5m
labels:
severity: warning
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
- alert: KustomizationSuspended
expr: gotk_resource_info{customresource_kind="Kustomization", suspended="true"} > 0
for: 5m
labels:
severity: warning
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
- alert: ImageRepositorySuspended
expr: gotk_resource_info{customresource_kind="ImageRepository", suspended="true"} > 0
for: 5m
labels:
severity: warning
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
- alert: HelmChartSuspended
expr: gotk_resource_info{customresource_kind="HelmChart", suspended="true"} > 0
for: 5m
labels:
severity: warning
service: fluxcd
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
annotations:
summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."


@@ -0,0 +1,57 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-general.rules
spec:
groups:
- name: general.rules
params: {}
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (job,namespace,service,cluster) / count(up) BY
(job,namespace,service,cluster)) > 10
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: general.rules
- alert: Watchdog
annotations:
description: |
This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager
is working properly.
expr: vector(1)
labels:
severity: ok
exported_instance: global
service: general.rules
event: Heartbeat
- alert: InfoInhibitor
annotations:
description: |
This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
other alerts.
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
severity of 'warning' or 'critical' starts firing on the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on (namespace,cluster) ALERTS{alertname
!= "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} ==
1
labels:
severity: major
exported_instance: global
service: general.rules
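
Watchdog and InfoInhibitor only do their jobs if Alertmanager is configured to match them. A minimal sketch of the corresponding Alertmanager fragments, using this repo's renamed severity (informational) and placeholder receiver names:

inhibit_rules:
  - source_matchers:
      - alertname = "InfoInhibitor"
    target_matchers:
      - severity = "informational"
    equal: [namespace]
route:
  routes:
    - matchers:
        - alertname = "Watchdog"
      receiver: heartbeat       # placeholder dead man's switch receiver
      repeat_interval: 1m
    - matchers:
        - alertname = "InfoInhibitor"
      receiver: blackhole       # placeholder null receiver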


@@ -0,0 +1,18 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-k8s.rules.containercpuusagesecondstotal
spec:
groups:
- name: k8s.rules.container_cpu_usage_seconds_total
params: {}
rules:
- annotations: {}
expr: |-
sum by (namespace,pod,container,cluster) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (
1, max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
)
labels: {}
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate


@@ -0,0 +1,17 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-k8s.rules.containermemorycache
spec:
groups:
- name: k8s.rules.container_memory_cache
params: {}
rules:
- annotations: {}
expr: |-
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
)
labels: {}
record: node_namespace_pod_container:container_memory_cache


@@ -0,0 +1,17 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-k8s.rules.containermemoryrss
spec:
groups:
- name: k8s.rules.container_memory_rss
params: {}
rules:
- annotations: {}
expr: |-
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
)
labels: {}
record: node_namespace_pod_container:container_memory_rss


@@ -0,0 +1,17 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-k8s.rules.containermemoryswap
spec:
groups:
- name: k8s.rules.container_memory_swap
params: {}
rules:
- annotations: {}
expr: |-
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
)
labels: {}
record: node_namespace_pod_container:container_memory_swap


@@ -0,0 +1,17 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-k8s.rules.containermemoryworkingsetbytes
spec:
groups:
- name: k8s.rules.container_memory_working_set_bytes
params: {}
rules:
- annotations: {}
expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
)
labels: {}
record: node_namespace_pod_container:container_memory_working_set_bytes


@@ -0,0 +1,93 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-k8s.rules.containerresource
spec:
groups:
- name: k8s.rules.container_resource
params: {}
rules:
- annotations: {}
expr: |-
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster)
group_left() max by (namespace,pod,cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
labels: {}
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
- annotations: {}
expr: |-
sum by (namespace,cluster) (
sum by (namespace,pod,cluster) (
max by (namespace,pod,container,cluster) (
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
labels: {}
record: namespace_memory:kube_pod_container_resource_requests:sum
- annotations: {}
expr: |-
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster)
group_left() max by (namespace,pod,cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
labels: {}
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
- annotations: {}
expr: |-
sum by (namespace,cluster) (
sum by (namespace,pod,cluster) (
max by (namespace,pod,container,cluster) (
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
labels: {}
record: namespace_cpu:kube_pod_container_resource_requests:sum
- annotations: {}
expr: |-
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster)
group_left() max by (namespace,pod,cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
labels: {}
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
- annotations: {}
expr: |-
sum by (namespace,cluster) (
sum by (namespace,pod,cluster) (
max by (namespace,pod,container,cluster) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
labels: {}
record: namespace_memory:kube_pod_container_resource_limits:sum
- annotations: {}
expr: |-
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster)
group_left() max by (namespace,pod,cluster) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
labels: {}
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
- annotations: {}
expr: |-
sum by (namespace,cluster) (
sum by (namespace,pod,cluster) (
max by (namespace,pod,container,cluster) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
labels: {}
record: namespace_cpu:kube_pod_container_resource_limits:sum


@@ -0,0 +1,60 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-k8s.rules.podowner
spec:
groups:
- name: k8s.rules.pod_owner
params: {}
rules:
- annotations: {}
expr: |-
max by (namespace,workload,pod,cluster) (
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on (replicaset,namespace,cluster) group_left(owner_name) topk by (replicaset,namespace,cluster) (
1, max by (replicaset,namespace,owner_name,cluster) (
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: deployment
record: namespace_workload_pod:kube_pod_owner:relabel
- annotations: {}
expr: |-
max by (namespace,workload,pod,cluster) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: daemonset
record: namespace_workload_pod:kube_pod_owner:relabel
- annotations: {}
expr: |-
max by (namespace,workload,pod,cluster) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
- annotations: {}
expr: |-
max by (namespace,workload,pod,cluster) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: job
record: namespace_workload_pod:kube_pod_owner:relabel
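
These owner-relabel series exist mainly to be joined against the per-container recording rules added above. A sketch of a per-workload CPU rollup combining two rules from this commit:

sum by (namespace, workload, workload_type, cluster) (
    node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
  * on (namespace, pod, cluster) group_left(workload, workload_type)
    namespace_workload_pod:kube_pod_owner:relabel
)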


@@ -0,0 +1,146 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-apiserver-availability.rules
spec:
groups:
- interval: 3m
name: kube-apiserver-availability.rules
params: {}
rules:
- annotations: {}
expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24
* 30
labels: {}
record: code_verb:apiserver_request_total:increase30d
- annotations: {}
expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- annotations: {}
expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- annotations: {}
expr: sum by (verb,scope,cluster) (increase(apiserver_request_sli_duration_seconds_count{job="kube-apiserver"}[1h]))
labels: {}
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
- annotations: {}
expr: sum by (verb,scope,cluster) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d])
* 24 * 30)
labels: {}
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
- annotations: {}
expr: sum by (verb,scope,le,cluster) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
labels: {}
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
- annotations: {}
expr: sum by (verb,scope,le,cluster) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d])
* 24 * 30)
labels: {}
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
- annotations: {}
expr: |-
1 - (
(
# write too slow
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
) +
(
# read too slow
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
(
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
) +
# errors
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum by (cluster) (code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- annotations: {}
expr: |-
1 - (
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
# too slow
(
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
+
# errors
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- annotations: {}
expr: |-
1 - (
(
# too slow
sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
)
+
# errors
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- annotations: {}
expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: code_resource:apiserver_request_total:rate5m
- annotations: {}
expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: code_resource:apiserver_request_total:rate5m
- annotations: {}
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
labels: {}
record: code_verb:apiserver_request_total:increase1h
- annotations: {}
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
labels: {}
record: code_verb:apiserver_request_total:increase1h
- annotations: {}
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
labels: {}
record: code_verb:apiserver_request_total:increase1h
- annotations: {}
expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
labels: {}
record: code_verb:apiserver_request_total:increase1h
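
The availability records fold slow requests and 5xx errors into a single unavailability ratio: availability = 1 - (too_slow + errors) / total, with per-scope read latency thresholds of 1s (resource), 5s (namespace) and 30s (cluster). A toy read calculation with invented numbers: out of 1,000,000 reads in 30 days, if 2,000 exceed their scope threshold and 500 return a 5xx code, then

apiserver_request:availability30d{verb="read"} = 1 - (2000 + 500) / 1000000 = 0.9975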


@@ -0,0 +1,324 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-apiserver-burnrate.rules
spec:
groups:
- name: kube-apiserver-burnrate.rules
params: {}
rules:
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
(
(
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
or
vector(0)
)
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
)
)
+
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
record: apiserver_request:burnrate1d
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
(
(
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
or
vector(0)
)
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
)
)
+
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
record: apiserver_request:burnrate1h
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
(
(
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
or
vector(0)
)
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
)
)
+
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
record: apiserver_request:burnrate2h
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
(
(
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
or
vector(0)
)
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
)
)
+
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
record: apiserver_request:burnrate30m
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
(
(
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
or
vector(0)
)
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
)
)
+
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
record: apiserver_request:burnrate3d
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
(
(
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
or
vector(0)
)
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
)
)
+
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: apiserver_request:burnrate5m
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
(
(
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
or
vector(0)
)
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
+
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
)
)
+
# errors
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
record: apiserver_request:burnrate6h
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
)
+
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
record: apiserver_request:burnrate1d
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
)
+
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
record: apiserver_request:burnrate1h
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
)
+
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
record: apiserver_request:burnrate2h
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
)
+
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
record: apiserver_request:burnrate30m
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
)
+
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
record: apiserver_request:burnrate3d
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
)
+
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: apiserver_request:burnrate5m
- annotations: {}
expr: |-
(
(
# too slow
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
)
+
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
)
/
sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
record: apiserver_request:burnrate6h


@@ -0,0 +1,23 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-apiserver-histogram.rules
spec:
groups:
- name: kube-apiserver-histogram.rules
params: {}
rules:
- annotations: {}
expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])))
> 0
labels:
quantile: '0.99'
verb: read
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])))
> 0
labels:
quantile: '0.99'
verb: write
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile


@@ -0,0 +1,73 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-apiserver-slos
spec:
groups:
- name: kube-apiserver-slos
params: {}
rules:
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
and
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m
labels:
long: 1h
severity: critical
short: 5m
exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
service: kube-apiserver-slos
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
and
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m
labels:
long: 6h
severity: critical
short: 30m
exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
service: kube-apiserver-slos
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h
labels:
long: 1d
severity: warning
short: 2h
exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
service: kube-apiserver-slos
- alert: KubeAPIErrorBudgetBurn
annotations:
description: The API server is burning too much error budget.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
summary: The API server is burning too much error budget.
expr: |-
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
and
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h
labels:
long: 3d
severity: warning
short: 6h
exported_instance: '{{ $labels.namespace }}/{{ $labels.apiserver }}'
service: kube-apiserver-slos
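
The multipliers implement the standard multi-window burn-rate pattern: with a 1% error budget (the 0.01000 factor) over 30 days (720h), a burn rate b sustained for a window w consumes b * w / 720h of the budget. Checking the four pairs against the expressions above:

14.4 * 1h / 720h = 2%   of the 30d budget   (for: 2m,  severity: critical)
 6.0 * 6h / 720h = 5%                       (for: 15m, severity: critical)
 3.0 * 1d / 720h = 10%                      (for: 1h,  severity: warning)
 1.0 * 3d / 720h = 10%                      (for: 3h,  severity: warning)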


@@ -0,0 +1,17 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-prometheus-general.rules
spec:
groups:
- name: kube-prometheus-general.rules
params: {}
rules:
- annotations: {}
expr: count without(instance, pod, node) (up == 1)
labels: {}
record: count:up1
- annotations: {}
expr: count without(instance, pod, node) (up == 0)
labels: {}
record: count:up0


@@ -0,0 +1,37 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-prometheus-node-recording.rules
spec:
groups:
- name: kube-prometheus-node-recording.rules
params: {}
rules:
- annotations: {}
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
BY (instance)
labels: {}
record: instance:node_cpu:rate:sum
- annotations: {}
expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
labels: {}
record: instance:node_network_receive_bytes:rate:sum
- annotations: {}
expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
labels: {}
record: instance:node_network_transmit_bytes:rate:sum
- annotations: {}
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance)
labels: {}
record: instance:node_cpu:ratio
- annotations: {}
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
labels: {}
record: cluster:node_cpu:sum_rate5m
- annotations: {}
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance,
cpu))
labels: {}
record: cluster:node_cpu:ratio


@@ -0,0 +1,63 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-scheduler.rules
spec:
groups:
- name: kube-scheduler.rules
params: {}
rules:
- annotations: {}
expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m]))
without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile


@@ -0,0 +1,73 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kube-state-metrics
spec:
groups:
- name: kube-state-metrics
params: {}
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
summary: kube-state-metrics is experiencing errors in list operations.
expr: |-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
service: kube-state-metrics
- alert: KubeStateMetricsWatchErrors
annotations:
description: kube-state-metrics is experiencing errors at an elevated rate
in watch operations. This is likely causing it to not be able to expose
metrics about Kubernetes objects correctly or at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
summary: kube-state-metrics is experiencing errors in watch operations.
expr: |-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
> 0.01
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
service: kube-state-metrics
- alert: KubeStateMetricsShardingMismatch
annotations:
description: kube-state-metrics pods are running with different --total-shards
configuration, some Kubernetes objects may be exposed multiple times or
not exposed at all.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
summary: kube-state-metrics sharding is misconfigured.
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by
(cluster) != 0
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
service: kube-state-metrics
- alert: KubeStateMetricsShardsMissing
annotations:
description: kube-state-metrics shards are missing, some Kubernetes objects
are not being exposed.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
summary: kube-state-metrics shards are missing.
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
-
sum( 2 ^ max by (shard_ordinal,cluster) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
!= 0
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
service: kube-state-metrics
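
The KubeStateMetricsShardsMissing expression is a bitmask check: each shard contributes 2^ordinal, and a complete set of N shards sums to 2^N - 1. A worked example with --total-shards=3 where shard 1 has gone missing:

expected: 2^3 - 1    = 7
present:  2^0 + 2^2  = 5    (only ordinals 0 and 2 reporting)
7 - 5 = 2 != 0  ->  the alert fires, and the difference (2 = 2^1) identifies shard 1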


@@ -0,0 +1,30 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubelet.rules
spec:
groups:
- name: kubelet.rules
params: {}
rules:
- annotations: {}
expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.99'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.9'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- annotations: {}
expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.5'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile


@@ -0,0 +1,304 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-apps
spec:
groups:
- name: kubernetes-apps
params: {}
rules:
- alert: KubePodCrashLooping
annotations:
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is in waiting state (reason: "CrashLoopBackOff").'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
summary: Pod is crash looping.
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: kubernetes-apps
- alert: KubePodNotReady
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
expr: |-
sum by (namespace,pod,cluster) (
max by (namespace,pod,cluster) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
) * on (namespace,pod,cluster) group_left(owner_kind) topk by (namespace,pod,cluster) (
1, max by (namespace,pod,owner_kind,cluster) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: kubernetes-apps
- alert: KubeDeploymentGenerationMismatch
annotations:
        description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
          }} does not match; this indicates that the Deployment has failed but has
          not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
summary: Deployment generation mismatch due to possible roll-back
expr: |-
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
service: kubernetes-apps
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
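      # The changes(...) == 0 guard means the updated-replicas counter has been
      # flat for 10 minutes, so the alert stays silent during an active rollout.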
expr: |-
(
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
service: kubernetes-apps
- alert: KubeDeploymentRolloutStuck
annotations:
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
}} is not progressing for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
expr: |-
        kube_deployment_status_condition{condition="Progressing", status="false", job="kube-state-metrics", namespace=~".*"}
!= 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
service: kubernetes-apps
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
has not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
service: kubernetes-apps
- alert: KubeStatefulSetGenerationMismatch
annotations:
        description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
          }} does not match; this indicates that the StatefulSet has failed but has
          not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
service: kubernetes-apps
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
update has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
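      # Fires when the current revision still differs from the update revision
      # while not all replicas are updated, and the updated-replicas counter
      # has been flat for 5 minutes.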
expr: |-
(
max by (namespace,statefulset,cluster) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
service: kubernetes-apps
- alert: KubeDaemonSetRolloutStuck
annotations:
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
not finished or progressed for at least 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
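      # Any of the four conditions (scheduled, misscheduled, updated, or
      # available counts deviating from desired) marks the rollout as unhealthy;
      # the changes(...) == 0 guard requires 5 minutes without progress.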
expr: |-
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
!=
0
) or (
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
service: kubernetes-apps
- alert: KubeContainerWaiting
annotations:
        description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on
          container {{ $labels.container }} has been in waiting state for longer than
          1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: sum by (namespace,pod,container,cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container
}}'
service: kubernetes-apps
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |-
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
service: kubernetes-apps
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
> 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
service: kubernetes-apps
- alert: KubeJobNotCompleted
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
more than {{ "43200" | humanizeDuration }} to complete.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
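      # 43200 seconds is 12 hours; only jobs that still have active pods are
      # considered, via the "and kube_job_status_active > 0" clause.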
expr: |-
time() - max by (namespace,job_name,cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.job_name }}'
service: kubernetes-apps
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
complete. Removing failed job after investigation should clear this alert.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
summary: Job failed to complete.
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.job_name }}'
service: kubernetes-apps
- alert: KubeHpaReplicasMismatch
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has not matched the desired number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
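      # Only fires while the current replica count sits strictly between the
      # HPA's min and max bounds; running pinned at max is covered separately
      # by KubeHpaMaxedOut.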
expr: |-
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
<
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
and
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler
}}'
service: kubernetes-apps
- alert: KubeHpaMaxedOut
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has been running at max replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
expr: |-
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
==
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler
}}'
service: kubernetes-apps

View File

@@ -0,0 +1,138 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-resources
spec:
groups:
- name: kubernetes-resources
params: {}
rules:
- alert: KubeCPUOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource
requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
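      # Headroom check: total CPU requests must fit on the cluster minus its
      # largest node, i.e. the cluster should survive losing any single node.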
expr: |-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.cluster }}'
service: kubernetes-resources
- alert: KubeMemoryOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
and
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.cluster }}'
service: kubernetes-resources
- alert: KubeCPUQuotaOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource
requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
/
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
> 1.5
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.cluster }}'
service: kubernetes-resources
- alert: KubeMemoryQuotaOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
/
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
> 1.5
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.cluster }}'
service: kubernetes-resources
- alert: KubeQuotaAlmostFull
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
for: 15m
labels:
severity: informational
exported_instance: '{{ $labels.namespace }}'
service: kubernetes-resources
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
summary: Namespace quota is fully used.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
for: 15m
labels:
severity: informational
exported_instance: '{{ $labels.namespace }}'
service: kubernetes-resources
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
summary: Namespace quota has exceeded the limits.
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}'
service: kubernetes-resources
- alert: CPUThrottlingHigh
annotations:
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
$labels.pod }}.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |-
        sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container,pod,namespace,cluster)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,cluster)
> ( 25 / 100 )
for: 15m
labels:
severity: informational
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container
}}'
service: kubernetes-resources

View File

@@ -0,0 +1,130 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-storage
spec:
groups:
- name: kubernetes-storage
params: {}
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
unless on (namespace,persistentvolumeclaim,cluster)
        kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,cluster)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
}}'
service: kubernetes-storage
- alert: KubePersistentVolumeFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{
with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to fill
up within four days. Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
summary: PersistentVolume is filling up.
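      # predict_linear extrapolates the 6h trend of free bytes 4 days ahead;
      # the two "unless" clauses drop ReadOnlyMany claims and PVCs labelled
      # excluded_from_alerts=true.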
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (namespace,persistentvolumeclaim,cluster)
        kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,cluster)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
}}'
service: kubernetes-storage
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
summary: PersistentVolumeInodes are filling up.
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
unless on (namespace,persistentvolumeclaim,cluster)
        kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,cluster)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
}}'
service: kubernetes-storage
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{
with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to run
out of inodes within four days. Currently {{ $value | humanizePercentage
}} of its inodes are free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
summary: PersistentVolumeInodes are filling up.
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (namespace,persistentvolumeclaim,cluster)
        kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,cluster)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
}}'
service: kubernetes-storage
- alert: KubePersistentVolumeErrors
annotations:
description: The persistent volume {{ $labels.persistentvolume }} {{ with
$labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase
}}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
> 0
for: 5m
labels:
severity: critical
exported_instance: '{{ $labels.persistentvolume }}'
service: kubernetes-storage

View File

@@ -0,0 +1,91 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system-apiserver
spec:
groups:
- name: kubernetes-system-apiserver
params: {}
rules:
- alert: KubeClientCertificateExpiration
annotations:
        description: A client certificate used to authenticate to the Kubernetes
          API server is expiring in less than 7.0 days.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"}
> 0 and on (job,cluster) histogram_quantile(0.01, sum by (job,le,cluster)
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m])))
< 604800
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: kubernetes-system-apiserver
- alert: KubeClientCertificateExpiration
annotations:
        description: A client certificate used to authenticate to the Kubernetes
          API server is expiring in less than 24.0 hours.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"}
> 0 and on (job,cluster) histogram_quantile(0.01, sum by (job,le,cluster)
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m])))
< 86400
for: 5m
labels:
severity: critical
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
service: kubernetes-system-apiserver
- alert: KubeAggregatedAPIErrors
annotations:
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
}} has reported errors. It has appeared unavailable {{ $value | humanize
}} times averaged over the past 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
summary: Kubernetes aggregated API has reported errors.
expr: sum by (name,namespace,cluster)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m]))
> 4
labels:
severity: warning
exported_instance: '{{ $labels.name }}/{{ $labels.namespace }}'
service: kubernetes-system-apiserver
- alert: KubeAggregatedAPIDown
annotations:
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
}} has been only {{ $value | humanize }}% available over the last 10m.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
summary: Kubernetes aggregated API is down.
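      # avg_over_time of the unavailability gauge over 10m approximates the
      # fraction of time the APIService was down; availability below 85% fires.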
expr: (1 - max by (name,namespace,cluster)(avg_over_time(aggregator_unavailable_apiservice{job="kube-apiserver"}[10m])))
* 100 < 85
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.name }}/{{ $labels.namespace }}'
service: kubernetes-system-apiserver
- alert: KubeAPIDown
annotations:
description: KubeAPI has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kube-apiserver"} == 1)
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.cluster }}/apiserver'
service: kubernetes-system-apiserver
- alert: KubeAPITerminatedRequests
annotations:
        description: The Kubernetes API server has terminated {{ $value | humanizePercentage
          }} of its incoming requests.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
        summary: The Kubernetes API server has terminated {{ $value | humanizePercentage
          }} of its incoming requests.
expr: sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) /
( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m]))
) > 0.20
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.cluster }}/apiserver'
service: kubernetes-system-apiserver

View File

@@ -0,0 +1,21 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system-controller-manager
spec:
groups:
- name: kubernetes-system-controller-manager
params: {}
rules:
- alert: KubeControllerManagerDown
annotations:
description: KubeControllerManager has disappeared from Prometheus target
discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kube-controller-manager"} == 1)
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/controller-manager'
service: kubernetes-system-controller-manager

View File

@@ -0,0 +1,175 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system-kubelet
spec:
groups:
- name: kubernetes-system-kubelet
params: {}
rules:
- alert: KubeNodeNotReady
annotations:
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
summary: Node is not ready.
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"}
== 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeNodeUnreachable
annotations:
description: '{{ $labels.node }} is unreachable and some workloads may be
rescheduled.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
summary: Node is unreachable.
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"}
unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"})
== 1
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletTooManyPods
annotations:
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
}} of its Pod capacity.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
summary: Kubelet is running at capacity.
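      # Counts Running pods per node (each pod is joined to its node through
      # kube_pod_info) and compares against the node's reported pod capacity;
      # above 95% of capacity the alert fires.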
expr: |-
count by (node,cluster) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster) group_left(node) topk by (instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by (node,cluster) (
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
) > 0.95
for: 15m
labels:
severity: informational
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeNodeReadinessFlapping
annotations:
description: The readiness status of node {{ $labels.node }} has changed {{
$value }} times in the last 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
summary: Node readiness status is flapping.
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m]))
by (node,cluster) > 2
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletPlegDurationHigh
annotations:
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
duration of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"}
>= 10
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
summary: Kubelet Pod startup latency is too high.
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",
metrics_path="/metrics"}[5m])) by (instance,le,cluster)) * on (instance,cluster)
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}
> 60
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletClientCertificateExpiration
annotations:
description: Client certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
summary: Kubelet client certificate is about to expire.
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletServerCertificateExpiration
annotations:
description: Server certificate for Kubelet on node {{ $labels.node }} expires
in {{ $value | humanizeDuration }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
summary: Kubelet server certificate is about to expire.
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its client
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
summary: Kubelet has failed to renew its client certificate.
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m])
> 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: Kubelet on node {{ $labels.node }} has failed to renew its server
certificate ({{ $value | humanize }} errors in the last 5 minutes).
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
summary: Kubelet has failed to renew its server certificate.
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet
- alert: KubeletDown
annotations:
description: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.node }}'
service: kubernetes-system-kubelet

View File

@@ -0,0 +1,20 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system-scheduler
spec:
groups:
- name: kubernetes-system-scheduler
params: {}
rules:
- alert: KubeSchedulerDown
annotations:
description: KubeScheduler has disappeared from Prometheus target discovery.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown
summary: Target disappeared from Prometheus target discovery.
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.scheduler }}'
service: kubernetes-system-scheduler

View File

@@ -0,0 +1,37 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-kubernetes-system
spec:
groups:
- name: kubernetes-system
params: {}
rules:
- alert: KubeVersionMismatch
annotations:
description: There are {{ $value }} different semantic versions of Kubernetes
components running.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
summary: Different semantic versions of Kubernetes components running.
expr: count by (cluster) (count by (git_version,cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*")))
> 1
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.cluster }}'
service: kubernetes-system
- alert: KubeClientErrors
annotations:
        description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
          }}' is experiencing {{ $value | humanizePercentage }} errors.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
summary: Kubernetes API server client is experiencing errors.
expr: |-
(sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (instance,job,namespace,cluster)
/
sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (instance,job,namespace,cluster))
> 0.01
for: 15m
labels:
severity: warning
service: kubernetes-system

View File

@@ -0,0 +1,93 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node-exporter.rules
spec:
groups:
- name: node-exporter.rules
params: {}
rules:
- annotations: {}
expr: |-
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
labels: {}
record: instance:node_num_cpu:sum
- annotations: {}
expr: |-
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
labels: {}
record: instance:node_cpu_utilisation:rate5m
- annotations: {}
expr: |-
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
labels: {}
record: instance:node_load1_per_cpu:ratio
- annotations: {}
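      # MemAvailable is preferred; the Buffers+Cached+MemFree+Slab sum is a
      # fallback for kernels that predate the MemAvailable field.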
expr: |-
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
labels: {}
record: instance:node_memory_utilisation:ratio
- annotations: {}
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
labels: {}
record: instance:node_vmstat_pgmajfault:rate5m
- annotations: {}
expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
labels: {}
record: instance_device:node_disk_io_time_seconds:rate5m
- annotations: {}
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
labels: {}
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_receive_drop_excluding_lo:rate5m
- annotations: {}
expr: |-
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
labels: {}
record: instance:node_network_transmit_drop_excluding_lo:rate5m

View File

@@ -0,0 +1,396 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node-exporter
spec:
groups:
- name: node-exporter
params: {}
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
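      # Two conditions combine: less than 15% free now, and predict_linear on
      # the last 6h of data projects free space reaching zero within 24h;
      # read-only filesystems are excluded.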
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
> 0.01
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
> 0.01
for: 1h
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
        summary: Number of conntrack entries is getting close to the limit.
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
> 0.75
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector on {{ $labels.instance }} failed
to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeClockSkewDetected
annotations:
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
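      # Fires only while the offset exceeds 50ms and its derivative shows the
      # clock is drifting further away rather than converging back to zero.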
expr: |-
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
node_timex_offset_seconds{job="node-exporter"} < -0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeClockNotSynchronising
annotations:
description: Clock at {{ $labels.instance }} is not synchronising. Ensure
NTP is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
expr: |-
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeRAIDDegraded
annotations:
        description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
          in a degraded state due to one or more disk failures. The number of spare
          drives is insufficient to fix the issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded.
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
> 0
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array at {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array.
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
> 0
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
)
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently
at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
)
for: 15m
labels:
severity: critical
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeCPUHighUsage
annotations:
description: |
          CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
summary: High CPU usage.
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
mode!="idle"}[2m]))) * 100 > 90
for: 15m
labels:
severity: informational
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeSystemSaturation
annotations:
description: |
          System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes and is currently at {{ printf "%.2f" $value }}.
          This might indicate resource saturation on this instance and can cause it to become unresponsive.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
        summary: System is saturated; load per core is very high.
expr: |-
node_load1{job="node-exporter"}
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeMemoryMajorPagesFaults
annotations:
description: |
          Major page faults are occurring at a very high rate at {{ $labels.instance }}: more than 500 major page faults per second for the last 15 minutes, currently at {{ printf "%.2f" $value }}.
          Please check that there is enough memory available on this instance.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
        summary: Memory major page faults are occurring at a very high rate.
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeMemoryHighUtilization
annotations:
description: |
          Memory is filling up at {{ $labels.instance }}: usage has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
summary: Host is running out of memory.
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
* 100) > 90
for: 15m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}'
service: node-exporter
- alert: NodeDiskIOSaturation
annotations:
description: |
          Disk IO queue depth (aqu-sz) on {{ $labels.device }} at {{ $labels.instance }} has been above 10 for the last 30 minutes and is currently at {{ printf "%.2f" $value }}.
          This symptom might indicate disk saturation.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
summary: Disk IO queue is high.
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
> 10
for: 30m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-exporter
- alert: NodeSystemdServiceFailed
annotations:
        description: Systemd service {{ $labels.name }} has entered failed state at
          {{ $labels.instance }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
summary: Systemd service has entered failed state.
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.name }}'
service: node-exporter
- alert: NodeBondingDegraded
annotations:
        description: Bonding interface {{ $labels.master }} on {{ $labels.instance
          }} is in a degraded state due to one or more slave failures.
        runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
        summary: Bonding interface is degraded.
expr: (node_bonding_slaves - node_bonding_active) != 0
for: 5m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.master }}'
service: node-exporter

View File

@@ -0,0 +1,21 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: alerts-node-network
spec:
groups:
- name: node-network
params: {}
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
        description: Network interface "{{ $labels.device }}" is changing its up status
          often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
        summary: Network interface is often changing its up status.
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
service: node-network

View File

@@ -0,0 +1,55 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-node.rules
spec:
  groups:
  - name: node.rules
    params: {}
    rules:
    - annotations: {}
      expr: |-
        topk by (namespace,pod,cluster) (1,
          max by (node,namespace,pod,cluster) (
            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
        ))
      labels: {}
      record: 'node_namespace_pod:kube_pod_info:'
    - annotations: {}
      expr: |-
        count by (node,cluster) (
          node_cpu_seconds_total{mode="idle",job="node-exporter"}
          * on (namespace,pod,cluster) group_left(node)
          topk by (namespace,pod,cluster) (1, node_namespace_pod:kube_pod_info:)
        )
      labels: {}
      record: node:node_num_cpu:sum
    - annotations: {}
      expr: |-
        sum(
          node_memory_MemAvailable_bytes{job="node-exporter"} or
          (
            node_memory_Buffers_bytes{job="node-exporter"} +
            node_memory_Cached_bytes{job="node-exporter"} +
            node_memory_MemFree_bytes{job="node-exporter"} +
            node_memory_Slab_bytes{job="node-exporter"}
          )
        ) by (cluster)
      labels: {}
      record: :node_memory_MemAvailable_bytes:sum
    - annotations: {}
      expr: |-
        avg by (node,cluster) (
          sum without (mode) (
            rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
          )
        )
      labels: {}
      record: node:node_cpu_utilization:ratio_rate5m
    - annotations: {}
      expr: |-
        avg by (cluster) (
          node:node_cpu_utilization:ratio_rate5m
        )
      labels: {}
      record: cluster:node_cpu:ratio_rate5m

View File

@@ -1,26 +0,0 @@
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*.orig
*~
# Various IDEs
.project
.idea/
*.tmproj
.vscode/
# Ignore img folder used for documentation
img/

View File

@@ -1,688 +0,0 @@
## Next release
- TODO
## 0.25.17
**Release date:** 2024-09-20
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
- Fixed ETCD dashboard
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
- Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.
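A hedged values sketch of that flag (the exact key placement is an assumption for your chart version):
```yaml
vmalert:
  spec:
    extraArgs:
      "notifiers.blackhole": "true"
```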
## 0.25.16
**Release date:** 2024-09-10
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Do not truncate servicemonitor, datasources, rules, dashboard, alertmanager & vmalert template names
- Use service label for node-exporter instead of podLabel. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1458)
- Added common chart to a k8s-stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1456)
- Fixed value of custom alertmanager configSecret. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1461)
## 0.25.15
**Release date:** 2024-09-05
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Drop empty endpoints param from scrape configuration
- Fixed proto when TLS is enabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1449)
## 0.25.14
**Release date:** 2024-09-04
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed alertmanager templates
## 0.25.13
**Release date:** 2024-09-04
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Use operator's own service monitor
## 0.25.12
**Release date:** 2024-09-03
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fixed dashboards rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1414)
- Fixed service monitor label name.
## 0.25.11
**Release date:** 2024-09-03
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Merged ingress templates
- Removed custom VMServiceScrape for operator
- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
- Migrated Node Exporter dashboard into chart
- Deprecated `grafana.sidecar.jsonData`, `grafana.provisionDefaultDatasource` in favour of the `grafana.sidecar.datasources.default` slice of datasources.
- Fail if no notifiers are set, do not set `notifiers` to null if empty
## 0.25.10
**Release date:** 2024-08-31
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed ingress extraPaths and externalVM urls rendering
## 0.25.9
**Release date:** 2024-08-31
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed vmalert ingress name typo
- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
## 0.25.8
**Release date:** 2024-08-30
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed external notifiers rendering, when alertmanager is disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1378)
## 0.25.7
**Release date:** 2024-08-30
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed extra rules template context
## 0.25.6
**Release date:** 2024-08-29
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
**Update note**: Update `kubeProxy.spec` to `kubeProxy.vmScrape.spec`
**Update note**: Update `kubeScheduler.spec` to `kubeScheduler.vmScrape.spec`
**Update note**: Update `kubeEtcd.spec` to `kubeEtcd.vmScrape.spec`
**Update note**: Update `coreDns.spec` to `coreDns.vmScrape.spec`
**Update note**: Update `kubeDns.spec` to `kubeDns.vmScrape.spec`
**Update note**: Update `kubeControllerManager.spec` to `kubeControllerManager.vmScrape.spec`
**Update note**: Update `kubeApiServer.spec` to `kubeApiServer.vmScrape.spec`
**Update note**: Update `kubelet.spec` to `kubelet.vmScrape.spec`
**Update note**: Update `kube-state-metrics.spec` to `kube-state-metrics.vmScrape.spec`
**Update note**: Update `prometheus-node-exporter.spec` to `prometheus-node-exporter.vmScrape.spec`
**Update note**: Update `grafana.spec` to `grafana.vmScrape.spec`
- bump version of VM components to [v1.103.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.103.0)
- Added `dashboards.<dashboardName>` bool flag to enable a dashboard even if the component it is for is not installed.
- Allow extra `vmalert.notifiers` without dropping default notifier if `alertmanager.enabled: true`
- Do not drop default notifier, when vmalert.additionalNotifierConfigs is set
- Replaced static url proto with a template, which selects the proto depending on whether a TLS configuration is present
- Moved kubernetes components monitoring config from `spec` config to `vmScrape.spec`
- Merged servicemonitor templates
## 0.25.5
**Release date:** 2024-08-26
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- TODO
## 0.25.4
**Release date:** 2024-08-26
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updates operator to [v0.47.2](https://github.com/VictoriaMetrics/operator/releases/tag/v0.47.2)
- kube-state-metrics - 5.16.4 -> 5.25.1
- prometheus-node-exporter - 4.27.0 -> 4.29.0
- grafana - 8.3.8 -> 8.4.7
- added configurable `.Values.global.clusterLabel` to all alerting and recording rules `by` and `on` expressions
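A minimal sketch of that setting (the label name is illustrative):
```yaml
global:
  clusterLabel: cluster
```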
## 0.25.3
**Release date:** 2024-08-23
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updated operator to v0.47.1 release
- Build `app.kubernetes.io/instance` label consistently. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1282)
## 0.25.2
**Release date:** 2024-08-21
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fixed vmalert ingress name. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1271)
- fixed alertmanager ingress host template rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1270)
## 0.25.1
**Release date:** 2024-08-21
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Added `.Values.global.license` configuration
- Fixed extraLabels rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1248)
- Fixed vmalert url to alertmanager by including its path prefix
- Removed `networking.k8s.io/v1beta1/Ingress` and `extensions/v1beta1/Ingress` support
- Fixed kubedns servicemonitor template. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1255)
## 0.25.0
**Release date:** 2024-08-16
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
**Update note**: it requires updating the CRD dependency manually before upgrading
**Update note**: requires Helm 3.14+
- Moved dashboards templating logic out of sync script to Helm template
- Allow to disable default grafana datasource
- Synchronize Etcd dashboards and rules with mixin provided by Etcd
- Add alerting rules for VictoriaMetrics operator.
- Updated alerting rules for VictoriaMetrics components.
- Fixed exact rule annotations propagation to other rules.
- Set minimal kubernetes version to 1.25
- updates operator to v0.47.0 version
## 0.24.5
**Release date:** 2024-08-01
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.102.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.1)
## 0.24.4
**Release date:** 2024-08-01
![AppVersion: v1.102.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update dependencies: grafana -> 8.3.6.
- Added `.Values.defaultRules.alerting` and `.Values.defaultRules.recording` to set up common properties for all alerting and recording rules
## 0.24.3
**Release date:** 2024-07-23
![AppVersion: v1.102.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.102.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0)
## 0.24.2
**Release date:** 2024-07-15
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fix vmalertmanager configuration when using `.VMAlertmanagerSpec.ConfigRawYaml`. See [this pull request](https://github.com/VictoriaMetrics/helm-charts/pull/1136).
## 0.24.1
**Release date:** 2024-07-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updates operator to v0.46.4
## 0.24.0
**Release date:** 2024-07-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- added ability to override alerting rules labels and annotations (see the sketch after this list):
- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
  - for all rules in a group - `.Values.defaultRules.groups.<groupName>.rules.spec.labels` and `.Values.defaultRules.groups.<groupName>.rules.spec.annotations`
- for each rule individually - `.Values.defaultRules.rules.<ruleName>.spec.labels` and `.Values.defaultRules.rules.<ruleName>.spec.annotations`
- changed `.Values.defaultRules.rules.<groupName>` to `.Values.defaultRules.groups.<groupName>.create`
- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups.<groupName>.targetNamespace`
- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups.<groupName>.spec.params`
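A hedged sketch of the new layout (group and rule names below are illustrative, not an exhaustive reference):
```yaml
defaultRules:
  rule:
    spec:
      labels:              # applied to every rule (was defaultRules.additionalRuleLabels)
        team: infra
  groups:
    kubernetesApps:        # illustrative group name
      create: true
      rules:
        spec:
          annotations:
            note: applies to all rules in this group
  rules:
    KubePodCrashLooping:   # illustrative rule name
      spec:
        labels:
          severity: critical
```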
## 0.23.6
**Release date:** 2024-07-08
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- added ability to override alerting rules labels and annotations:
- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
  - for all rules in a group - `.Values.defaultRules.groups.<groupName>.rules.spec.labels` and `.Values.defaultRules.groups.<groupName>.rules.spec.annotations`
- for each rule individually - `.Values.defaultRules.rules.<ruleName>.spec.labels` and `.Values.defaultRules.rules.<ruleName>.spec.annotations`
- changed `.Values.defaultRules.rules.<groupName>` to `.Values.defaultRules.groups.<groupName>.create`
- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups.<groupName>.targetNamespace`
- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups.<groupName>.spec.params`
## 0.23.5
**Release date:** 2024-07-04
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Support configuring vmalert `-notifier.config` with `.Values.vmalert.additionalNotifierConfigs`.
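A hedged sketch of such a configuration (the `dns_sd_configs` section below is an assumption based on vmalert's `-notifier.config` file format):
```yaml
vmalert:
  additionalNotifierConfigs:
    dns_sd_configs:
      - names:
          - alertmanager.example.com
        type: A
        port: 9093
```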
## 0.23.4
**Release date:** 2024-07-02
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Add `extraObjects` to allow deploying additional resources with the chart release.
## 0.23.3
**Release date:** 2024-06-26
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Enable [conversion of Prometheus CRDs](https://docs.victoriametrics.com/operator/migration/#objects-conversion) by default. See [this](https://github.com/VictoriaMetrics/helm-charts/pull/1069) pull request for details.
- use bitnami/kubectl image for cleanup instead of deprecated gcr.io/google_containers/hyperkube
## 0.23.2
**Release date:** 2024-06-14
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Do not add `cluster` external label at VMAgent by default. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/774) for the details.
## 0.23.1
**Release date:** 2024-06-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- updates operator to v0.45.0 release
- sync latest vm alerts and dashboards.
## 0.23.0
**Release date:** 2024-05-30
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- sync latest etcd v3.5.x rules from [upstream](https://github.com/etcd-io/etcd/blob/release-3.5/contrib/mixin/mixin.libsonnet).
- add Prometheus operator CRDs as an optional dependency. See [this PR](https://github.com/VictoriaMetrics/helm-charts/pull/1022) and [related issue](https://github.com/VictoriaMetrics/helm-charts/issues/341) for the details.
## 0.22.1
**Release date:** 2024-05-14
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- fix missing serviceaccounts patch permission in VM operator, see [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1012) for details.
## 0.22.0
**Release date:** 2024-05-10
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM operator to [0.44.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.44.0)
## 0.21.3
**Release date:** 2024-04-26
![AppVersion: v1.101.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.101.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.101.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.101.0)
## 0.21.2
**Release date:** 2024-04-23
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM operator to [0.43.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.3)
## 0.21.1
**Release date:** 2024-04-18
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
## 0.21.0
**Release date:** 2024-04-18
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- TODO
- bump version of VM operator to [0.43.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.0)
- updates CRDs definitions.
## 0.20.1
**Release date:** 2024-04-16
![AppVersion: v1.100.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.100.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- upgraded dashboards and alerting rules, added values file for local (Minikube) setup
- bump version of VM components to [v1.100.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.100.1)
## 0.20.0
**Release date:** 2024-04-02
![AppVersion: v1.99.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.99.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM operator to [0.42.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.42.3)
## 0.19.4
**Release date:** 2024-03-05
![AppVersion: v1.99.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.99.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0)
## 0.19.3
**Release date:** 2024-03-05
![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Commented out the default configuration for alertmanager. It simplifies the configuration and makes it more explicit. See this [issue](https://github.com/VictoriaMetrics/helm-charts/issues/473) for details.
- Allow enabling/disabling default k8s rules when installing. See [#904](https://github.com/VictoriaMetrics/helm-charts/pull/904) by @passie.
## 0.19.2
**Release date:** 2024-02-26
![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix templating of VMAgent `remoteWrite` in case both `VMSingle` and `VMCluster` are disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/865) for details.
## 0.19.1
**Release date:** 2024-02-21
![AppVersion: v1.98.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.98.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update dependencies: victoria-metrics-operator -> 0.28.1, grafana -> 7.3.1.
- Update victoriametrics CRD resources yaml.
## 0.19.0
**Release date:** 2024-02-09
![AppVersion: v1.97.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.97.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Do not store original labels in `vmagent`'s memory by default. This reduces memory usage of `vmagent` but makes `vmagent`'s debugging UI less informative. See [this docs](https://docs.victoriametrics.com/vmagent/#relabel-debug) for details on relabeling debug.
- Update dependencies: kube-state-metrics -> 5.16.0, prometheus-node-exporter -> 4.27.0, grafana -> 7.3.0.
- Update victoriametrics CRD resources yaml.
- Update builtin dashboards and rules.
## 0.18.12
**Release date:** 2024-02-01
![AppVersion: v1.97.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.97.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.97.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.97.1)
- Fix helm lint when ingress resources enabled - split templates of resources per kind. See [#820](https://github.com/VictoriaMetrics/helm-charts/pull/820) by @MemberIT.
## 0.18.11
**Release date:** 2023-12-15
![AppVersion: v1.96.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.96.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix missing `.Values.defaultRules.rules.vmcluster` value. See [#801](https://github.com/VictoriaMetrics/helm-charts/pull/801) by @MemberIT.
## 0.18.10
**Release date:** 2023-12-12
![AppVersion: v1.96.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.96.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0)
- Add optional allowCrossNamespaceImport to GrafanaDashboard(s) (#788)
## 0.18.9
**Release date:** 2023-12-08
![AppVersion: v1.95.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Properly use variable from values file for Grafana datasource type. (#769)
- Update dashboards from upstream sources. (#780)
## 0.18.8
**Release date:** 2023-11-16
![AppVersion: v1.95.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.95.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.1)
## 0.18.7
**Release date:** 2023-11-15
![AppVersion: v1.95.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.95.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.95.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.0)
- Support adding extra group parameters for default vmrules. (#752)
## 0.18.6
**Release date:** 2023-11-01
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix kube scheduler default scraping port from 10251 to 10259, which Kubernetes changed in 1.23.0. See [this pr](https://github.com/VictoriaMetrics/helm-charts/pull/736) for details.
- Bump version of operator chart to [0.27.4](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.4)
## 0.18.5
**Release date:** 2023-10-08
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update operator chart to [v0.27.3](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.3) for fixing [#708](https://github.com/VictoriaMetrics/helm-charts/issues/708)
## 0.18.4
**Release date:** 2023-10-04
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update dependencies: [victoria-metrics-operator -> 0.27.2](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.2), prometheus-node-exporter -> 4.23.2, grafana -> 6.59.5.
## 0.18.3
**Release date:** 2023-10-04
![AppVersion: v1.94.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.94.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- bump version of VM components to [v1.94.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.94.0)
## 0.18.2
**Release date:** 2023-09-28
![AppVersion: v1.93.5](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.5&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Fix behavior of `vmalert.remoteWriteVMAgent` - remoteWrite.url for VMAlert is correctly generated considering endpoint, name, port and http.pathPrefix of VMAgent
## 0.18.1
**Release date:** 2023-09-21
![AppVersion: v1.93.5](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.5&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of VM components to [v1.93.5](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.5)
## 0.18.0
**Release date:** 2023-09-12
![AppVersion: v1.93.4](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.4&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of `grafana` helm-chart to `6.59.*`
- Bump version of `prometheus-node-exporter` helm-chart to `4.23.*`
- Bump version of `kube-state-metrics` helm-chart to `0.59.*`
- Update alerting rules
- Update grafana dashboards
- Add `make` commands `sync-rules` and `sync-dashboards`
- Add support of VictoriaMetrics datasource
## 0.17.8
**Release date:** 2023-09-11
![AppVersion: v1.93.4](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.4&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of VM components to [v1.93.4](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.4)
- Bump version of operator chart to [0.27.0](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.0)
## 0.17.7
**Release date:** 2023-09-07
![AppVersion: v1.93.3](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.3&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Bump version of operator helm-chart to `0.26.2`
## 0.17.6
**Release date:** 2023-09-04
![AppVersion: v1.93.3](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.3&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Move `cleanupCRD` option to victoria-metrics-operator chart (#593)
- Disable `honorTimestamps` for cadvisor scrape job by default (#617)
- For vmalert all replicas of alertmanager are added to notifiers (only if alertmanager is enabled) (#619)
- Add `grafanaOperatorDashboardsFormat` option (#615)
- Fix query expression for memory calculation in `k8s-views-global` dashboard (#636)
- Bump version of Victoria Metrics components to `v1.93.3`
- Bump version of operator helm-chart to `0.26.0`
## 0.17.5
**Release date:** 2023-08-23
![AppVersion: v1.93.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.93.0 to v1.93.1
## 0.17.4
**Release date:** 2023-08-12
![AppVersion: v1.93.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.93.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.92.1 to v1.93.0
- delete an obsolete parameter remaining by mistake (see <https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack#upgrade-to-0130>) (#602)
## 0.17.3
**Release date:** 2023-07-28
![AppVersion: v1.92.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.92.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.92.0 to v1.92.1 (#599)
## 0.17.2
**Release date:** 2023-07-27
![AppVersion: v1.92.0](https://img.shields.io/static/v1?label=AppVersion&message=v1.92.0&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Update VictoriaMetrics components from v1.91.3 to v1.92.0

View File

@@ -1,24 +0,0 @@
dependencies:
- name: victoria-metrics-common
  repository: https://victoriametrics.github.io/helm-charts
  version: 0.0.11
- name: victoria-metrics-operator
  repository: https://victoriametrics.github.io/helm-charts
  version: 0.34.8
- name: kube-state-metrics
  repository: https://prometheus-community.github.io/helm-charts
  version: 5.25.1
- name: prometheus-node-exporter
  repository: https://prometheus-community.github.io/helm-charts
  version: 4.39.0
- name: grafana
  repository: https://grafana.github.io/helm-charts
  version: 8.4.9
- name: crds
  repository: ""
  version: 0.0.0
- name: prometheus-operator-crds
  repository: https://prometheus-community.github.io/helm-charts
  version: 11.0.0
digest: sha256:11b119ebabf4ff0ea2951e7c72f51d0223dc3f50fb061a43b01fe7856491b836
generated: "2024-09-12T11:50:51.935071545Z"

View File

@@ -1,66 +0,0 @@
annotations:
  artifacthub.io/category: monitoring-logging
  artifacthub.io/changes: |
    - Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
    - Fixed ETCD dashboard
    - Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
    - 'Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.'
  artifacthub.io/license: Apache-2.0
  artifacthub.io/links: |
    - name: Sources
      url: https://docs.victoriametrics.com/vmgateway
    - name: Charts repo
      url: https://victoriametrics.github.io/helm-charts/
    - name: Docs
      url: https://docs.victoriametrics.com
  artifacthub.io/operator: "true"
apiVersion: v2
appVersion: v1.102.1
dependencies:
- name: victoria-metrics-common
  repository: https://victoriametrics.github.io/helm-charts
  version: 0.0.*
- condition: victoria-metrics-operator.enabled
  name: victoria-metrics-operator
  repository: https://victoriametrics.github.io/helm-charts
  version: 0.34.*
- condition: kube-state-metrics.enabled
  name: kube-state-metrics
  repository: https://prometheus-community.github.io/helm-charts
  version: 5.25.*
- condition: prometheus-node-exporter.enabled
  name: prometheus-node-exporter
  repository: https://prometheus-community.github.io/helm-charts
  version: 4.39.*
- condition: grafana.enabled
  name: grafana
  repository: https://grafana.github.io/helm-charts
  version: 8.4.*
- condition: crds.enabled
  name: crds
  repository: ""
  version: 0.0.0
- condition: prometheus-operator-crds.enabled
  name: prometheus-operator-crds
  repository: https://prometheus-community.github.io/helm-charts
  version: 11.0.*
description: Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics
  Operator, Grafana dashboards, ServiceScrapes and VMRules
home: https://github.com/VictoriaMetrics/helm-charts
icon: https://avatars.githubusercontent.com/u/43720803?s=200&v=4
keywords:
- victoriametrics
- operator
- monitoring
- kubernetes
- observability
- tsdb
- metrics
- metricsql
- timeseries
kubeVersion: '>=1.25.0-0'
name: victoria-metrics-k8s-stack
sources:
- https://github.com/VictoriaMetrics/helm-charts
type: application
version: 0.25.17

View File

@@ -1,300 +0,0 @@
{{ template "chart.typeBadge" . }} {{ template "chart.versionBadge" . }}
[![Artifact Hub](https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/victoriametrics)](https://artifacthub.io/packages/helm/victoriametrics/victoria-metrics-k8s-stack)
{{ template "chart.description" . }}
* [Overview](#Overview)
* [Configuration](#Configuration)
* [Prerequisites](#Prerequisites)
* [Dependencies](#Dependencies)
* [Quick Start](#How-to-install)
* [Uninstall](#How-to-uninstall)
* [Version Upgrade](#Upgrade-guide)
* [Troubleshooting](#Troubleshooting)
* [Values](#Parameters)
## Overview
This chart is an all-in-one solution for monitoring a kubernetes cluster.
It installs multiple dependency charts like [grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana), [node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter), [kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) and [victoria-metrics-operator](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator).
It also installs Custom Resources like [VMSingle](https://docs.victoriametrics.com/operator/quick-start#vmsingle), [VMCluster](https://docs.victoriametrics.com/operator/quick-start#vmcluster), [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent), [VMAlert](https://docs.victoriametrics.com/operator/quick-start#vmalert).
By default, the operator [converts all existing prometheus-operator API objects](https://docs.victoriametrics.com/operator/quick-start#migration-from-prometheus-operator-objects) into corresponding VictoriaMetrics Operator objects.
To enable metrics collection for kubernetes, this chart installs multiple scrape configurations for kubernetes components like kubelet and kube-proxy. Metrics collection is done by [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent). So if you want to ship metrics to an external VictoriaMetrics database, you can disable the VMSingle installation by setting `vmsingle.enabled` to `false` and setting `vmagent.vmagentSpec.remoteWrite.url` to your external VictoriaMetrics database, as sketched below.
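A minimal `values.yaml` sketch of that setup (the URL is a placeholder, and the `vmagent.spec` form used elsewhere in this document is assumed):
```yaml
vmsingle:
  enabled: false
vmagent:
  spec:
    remoteWrite:
      - url: "https://external-vm.example.com/api/v1/write"
```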
This chart also installs a bunch of dashboards and recording rules from the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project.
![Overview](img/k8s-stack-overview.png)
## Configuration
Configuration of this chart is done through helm values.
### Dependencies
Dependencies can be enabled or disabled by setting `enabled` to `true` or `false` in the `values.yaml` file.
**!Important:** for dependency charts, anything that you can find in the `values.yaml` of the dependency chart can be configured in this chart under the key for that dependency. For example, if you want to configure `grafana`, you can find all possible configuration options in its [values.yaml](https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml) and you should set them in the values for this chart under the `grafana:` key. For example, to configure `grafana.persistence.enabled`, set it in values.yaml like this:
```yaml
#################################################
### dependencies #####
#################################################
# Grafana dependency chart configuration. For possible values refer to https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration
grafana:
  enabled: true
  persistence:
    type: pvc
    enabled: false
```
### VictoriaMetrics components
This chart installs multiple VictoriaMetrics components using Custom Resources that are managed by [victoria-metrics-operator](https://docs.victoriametrics.com/operator/design)
Each resource can be configured using the `spec` of that resource from the API docs of [victoria-metrics-operator](https://docs.victoriametrics.com/operator/api). For example, if you want to configure `VMAgent`, you can find all possible configuration options in the [API docs](https://docs.victoriametrics.com/operator/api#vmagent) and set them in the values for this chart under the `vmagent.spec` key. For example, to configure `remoteWrite.url`, set it in values.yaml like this:
```yaml
vmagent:
  spec:
    remoteWrite:
      - url: "https://insert.vmcluster.domain.com/insert/0/prometheus/api/v1/write"
```
### ArgoCD issues
#### Operator self signed certificates
When deploying the K8s stack using ArgoCD without Cert Manager (`.Values.victoria-metrics-operator.admissionWebhooks.certManager.enabled: false`),
it will rerender the operator's webhook certificates on each sync, since the Helm `lookup` function is not respected by ArgoCD.
To prevent this, please update your K8s stack Application `spec.syncPolicy` and `spec.ignoreDifferences` with the following:
```yaml
apiVersion: argoproj.io/v1alpha1
kind: Application
...
spec:
  ...
  syncPolicy:
    syncOptions:
      # https://argo-cd.readthedocs.io/en/stable/user-guide/sync-options/#respect-ignore-difference-configs
      # argocd must also ignore differences during the apply stage,
      # otherwise it'll silently override changes and cause problems
      - RespectIgnoreDifferences=true
  ignoreDifferences:
    - group: ""
      kind: Secret
      name: <fullname>-validation
      namespace: kube-system
      jsonPointers:
        - /data
    - group: admissionregistration.k8s.io
      kind: ValidatingWebhookConfiguration
      name: <fullname>-admission
      jqPathExpressions:
        - '.webhooks[]?.clientConfig.caBundle'
```
where `<fullname>` is the output of `{{"{{"}} include "vm-operator.fullname" {{"}}"}}` for your setup
#### `metadata.annotations: Too long: must have at most 262144 bytes` on dashboards
If one of the dashboard ConfigMaps fails with the error `Too long: must have at most 262144 bytes`, please make sure you've added the `argocd.argoproj.io/sync-options: ServerSideApply=true` annotation to your dashboards:
```yaml
grafana:
  sidecar:
    dashboards:
      additionalDashboardAnnotations:
        argocd.argoproj.io/sync-options: ServerSideApply=true
```
### Rules and dashboards
This chart by default installs multiple dashboards and recording rules from [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus).
You can disable the dashboards with `defaultDashboardsEnabled: false` and `experimentalDashboardsEnabled: false`,
and the rules can be configured under `defaultRules`, as sketched below.
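A hedged example of both knobs together (the group name is illustrative; check your chart version for the exact keys):
```yaml
defaultDashboardsEnabled: false
experimentalDashboardsEnabled: false
defaultRules:
  groups:
    etcd:
      create: false
```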
### Prometheus scrape configs
This chart installs multiple scrape configurations for kubernetes monitoring. They are configured under the `#ServiceMonitors` section in the `values.yaml` file. For example, to configure the scrape config for `kubelet`, set it in values.yaml like this:
```yaml
kubelet:
  enabled: true
  # spec for VMNodeScrape crd
  # https://docs.victoriametrics.com/operator/api#vmnodescrapespec
  spec:
    interval: "30s"
```
### Using externally managed Grafana
If you want to use an externally managed Grafana instance but still want to use the dashboards provided by this chart you can set
`grafana.enabled` to `false` and set `defaultDashboardsEnabled` to `true`. This will install the dashboards
but will not install Grafana.
For example:
```yaml
defaultDashboardsEnabled: true
grafana:
  enabled: false
```
This will create ConfigMaps with dashboards to be imported into Grafana.
If additional configuration for labels or annotations is needed in order to import the dashboards into an existing Grafana, you can
set `.grafana.sidecar.dashboards.additionalDashboardLabels` or `.grafana.sidecar.dashboards.additionalDashboardAnnotations` in `values.yaml`:
For example:
```yaml
defaultDashboardsEnabled: true
grafana:
  enabled: false
  sidecar:
    dashboards:
      additionalDashboardLabels:
        key: value
      additionalDashboardAnnotations:
        key: value
```
## Prerequisites
* Install the following packages: ``git``, ``kubectl``, ``helm``, ``helm-docs``. See this [tutorial](../../REQUIREMENTS.md).
* Add dependency chart repositories
```console
helm repo add grafana https://grafana.github.io/helm-charts
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
```
* PV support on underlying infrastructure.
{{ include "chart.installSection" . }}
### Install locally (Minikube)
To run the VictoriaMetrics stack locally it's possible to use [Minikube](https://github.com/kubernetes/minikube). To avoid issues with dashboards and alert rules, please follow the steps below:
Run Minikube cluster
```
minikube start --container-runtime=containerd --extra-config=scheduler.bind-address=0.0.0.0 --extra-config=controller-manager.bind-address=0.0.0.0
```
Install helm chart
```
helm install [RELEASE_NAME] vm/victoria-metrics-k8s-stack -f values.yaml -f values.minikube.yaml -n NAMESPACE --debug --dry-run
```
{{ include "chart.uninstallSection" . }}
CRDs created by this chart are not removed by default and should be manually cleaned up:
```console
kubectl get crd | grep victoriametrics.com | awk '{print $1 }' | xargs -i kubectl delete crd {}
```
## Troubleshooting
- If you cannot install the helm chart and get the error `configmap already exist`, it could be caused by name collisions when the release name is too long.
  Kubernetes by default allows only 63 characters in resource names, and helm trims all resource names to 63 characters.
  To mitigate this, use a shorter helm release name, like:
```bash
# stack - is short enough
helm upgrade -i stack vm/victoria-metrics-k8s-stack
```
Or use override for helm chart release name:
```bash
helm upgrade -i some-very-long-name vm/victoria-metrics-k8s-stack --set fullnameOverride=stack
```
## Upgrade guide
Usually, helm upgrade doesn't require manual actions. Just execute the command:
```console
$ helm upgrade [RELEASE_NAME] vm/victoria-metrics-k8s-stack
```
But a release with a CRD update can only be patched manually with kubectl.
Since helm does not perform CRD updates, we recommend always performing this step when updating the helm-chart version:
```console
# 1. check the changes in CRD
$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl diff -f -
# 2. apply the changes (update CRD)
$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl apply -f - --server-side
```
All other manual upgrade actions are listed below:
### Upgrade to 0.13.0
- node-exporter starting from version 4.0.0 is using the Kubernetes recommended labels. Therefore you have to delete the daemonset before you upgrade.
```bash
kubectl delete daemonset -l app=prometheus-node-exporter
```
- scrape configuration for kubernetes components was moved from the `vmServiceScrape.spec` section to the `spec` section. If you previously modified the scrape configuration, you need to update your `values.yaml` (a sketch follows this list)
- `grafana.defaultDashboardsEnabled` was renamed to `defaultDashboardsEnabled` (moved to top level). You may need to update it in your `values.yaml`
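A hedged before/after sketch of the scrape configuration move (the `kubelet` keys are illustrative):
```yaml
# before 0.13.0
kubelet:
  vmServiceScrape:
    spec:
      interval: "30s"
---
# since 0.13.0
kubelet:
  spec:
    interval: "30s"
```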
### Upgrade to 0.6.0
All `CRD`s must be updated to the latest version with the command:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
```
### Upgrade to 0.4.0
All `CRD`s must be updated to the `v1` version with the command:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
```
### Upgrade from 0.2.8 to 0.2.9
Update the `VMAgent` CRD with the command:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.16.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
```
### Upgrade from 0.2.5 to 0.2.6
New CRDs were added to the operator - `VMUser` and `VMAuth` - and new fields were added to existing CRDs.
Manual commands:
```bash
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmusers.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmauths.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmalerts.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmsingles.yaml
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmclusters.yaml
```
{{ include "chart.helmDocs" . }}
## Parameters
The following tables lists the configurable parameters of the chart and their default values.
Change the values according to the needs of your environment in the ``victoria-metrics-k8s-stack/values.yaml`` file.
{{ template "chart.valuesTableHtml" . }}

View File

@@ -1,40 +0,0 @@
# Release process guidance
## Update version for VictoriaMetrics kubernetes monitoring stack
1. Update dependency requirements in [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml)
2. Apply changes via `helm dependency update`
3. Sync rules and dashboards in chart values:
<div class="with-copy" markdown="1">
```console
make sync-rules
make sync-dashboards
```
</div>
4. Bump the version of victoria-metrics-k8s-stack in [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml)
5. Run linter:
<div class="with-copy" markdown="1">
```console
make lint
```
</div>
6. Render templates locally to check for errors:
<div class="with-copy" markdown="1">
```console
helm template vm-k8s-stack ./charts/victoria-metrics-k8s-stack --output-dir out --values ./charts/victoria-metrics-k8s-stack/values.yaml --debug
```
</div>
7. Test the updated chart by installing it into your kubernetes cluster.
8. Update docs with
```console
helm-docs
```
9. Commit the changes and send a [PR](https://github.com/VictoriaMetrics/helm-charts/pulls)

View File

@@ -1,12 +0,0 @@
# Release notes for version 0.25.17
**Release date:** 2024-09-20
![AppVersion: v1.102.1](https://img.shields.io/static/v1?label=AppVersion&message=v1.102.1&color=success&logo=)
![Helm: v3](https://img.shields.io/static/v1?label=Helm&message=v3&color=informational&logo=helm)
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
- Fixed ETCD dashboard
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
- Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.

View File

@@ -1,13 +0,0 @@
---
weight: 1
title: CHANGELOG
menu:
docs:
weight: 1
identifier: helm-victoriametrics-k8s-stack-changelog
parent: helm-victoriametrics-k8s-stack
url: /helm/victoriametrics-k8s-stack/changelog
aliases:
- /helm/victoriametrics-k8s-stack/changelog/index.html
---
{{% content "CHANGELOG.md" %}}

View File

@@ -1,13 +0,0 @@
---
weight: 9
title: VictoriaMetrics K8s Stack
menu:
docs:
parent: helm
weight: 9
identifier: helm-victoriametrics-k8s-stack
url: /helm/victoriametrics-k8s-stack
aliases:
- /helm/victoriametrics-k8s-stack/index.html
---
{{% content "README.md" %}}

View File

@@ -1,165 +0,0 @@
condition: '{{ .Values.kubeEtcd.enabled }}'
name: etcd
rules:
- alert: etcdMembersDown
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
summary: 'etcd cluster members are down.'
condition: '{{ true }}'
expr: |-
max without (endpoint) (
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
or
count without (To) (
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
)
)
> 0
for: 10m
labels:
severity: critical
- alert: etcdInsufficientMembers
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
summary: 'etcd cluster has insufficient number of members.'
condition: '{{ true }}'
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
summary: 'etcd cluster has no leader.'
condition: '{{ true }}'
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
summary: 'etcd cluster has high number of leader changes.'
condition: '{{ true }}'
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
for: 5m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster has high number of failed grpc requests.'
condition: '{{ true }}'
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster has high number of failed grpc requests.'
condition: '{{ true }}'
expr: |-
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.'
summary: 'etcd grpc requests are slow'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster member communication is slow.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster has high number of proposal failures.'
condition: '{{ true }}'
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster 99th percentile fsync durations are too high.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster 99th percentile fsync durations are too high.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 1
for: 10m
labels:
severity: critical
- alert: etcdHighCommitDurations
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
summary: 'etcd cluster 99th percentile commit durations are too high.'
condition: '{{ true }}'
expr: |-
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
- alert: etcdDatabaseQuotaLowSpace
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}; please defragment or increase the quota, as writes to etcd will be disabled when it is full.'
summary: 'etcd cluster database is running full.'
condition: '{{ true }}'
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: 10m
labels:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": disk space is predicted to run out within the next four hours, based on writes observed over the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}; please investigate, as this might be disruptive.'
summary: 'etcd cluster database growing very fast.'
condition: '{{ true }}'
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
labels:
severity: warning
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space; please run defragmentation (e.g. etcdctl defrag) to reclaim the unused fragmented disk space.'
runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation'
summary: 'etcd database size in use is less than 50% of the actual allocated storage.'
condition: '{{ true }}'
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
labels:
severity: warning


@@ -1,53 +0,0 @@
condition: '{{ true }}'
name: general.rules
rules:
- alert: TargetDown
annotations:
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/targetdown'
summary: 'One or more targets are unreachable.'
condition: '{{ true }}'
expr: 100 * (count(up == 0) BY (job,namespace,service,{{ .Values.global.clusterLabel }}) / count(up) BY (job,namespace,service,{{ .Values.global.clusterLabel }})) > 10
for: 10m
labels:
severity: warning
- alert: Watchdog
annotations:
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing; therefore, it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/watchdog'
summary: 'An alert that should always be firing to certify that Alertmanager is working properly.'
condition: '{{ true }}'
expr: vector(1)
labels:
severity: ok
- alert: InfoInhibitor
annotations:
description: 'This is an alert that is used to inhibit info alerts.
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
other alerts.
This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a
severity of ''warning'' or ''critical'' starts firing in the same namespace.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor'
summary: 'Info-level alert inhibition.'
condition: '{{ true }}'
expr: ALERTS{severity = "info"} == 1 unless on (namespace,{{ .Values.global.clusterLabel }}) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: major


@@ -1,11 +0,0 @@
condition: '{{ true }}'
name: k8s.rules.container_cpu_usage_seconds_total
rules:
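# Joins cAdvisor CPU usage to its node via kube_pod_info; topk(1, ...) picks a single
# node per pod to avoid many-to-many matches. The container_memory_* recording rules
# in the following files use the same join.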
- condition: '{{ true }}'
expr: |-
sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (
1, max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate


@@ -1,10 +0,0 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_cache
rules:
- condition: '{{ true }}'
expr: |-
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_cache


@@ -1,10 +0,0 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_rss
rules:
- condition: '{{ true }}'
expr: |-
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_rss


@@ -1,10 +0,0 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_swap
rules:
- condition: '{{ true }}'
expr: |-
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_swap


@@ -1,10 +0,0 @@
condition: '{{ true }}'
name: k8s.rules.container_memory_working_set_bytes
rules:
- condition: '{{ true }}'
expr: |-
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
)
record: node_namespace_pod_container:container_memory_working_set_bytes


@@ -1,79 +0,0 @@
condition: '{{ true }}'
name: k8s.rules.container_resource
rules:
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_requests:sum
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_requests:sum
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_memory:kube_pod_container_resource_limits:sum
- condition: '{{ true }}'
expr: |-
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,{{ .Values.global.clusterLabel }})
group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
)
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
- condition: '{{ true }}'
expr: |-
sum by (namespace,{{ .Values.global.clusterLabel }}) (
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
)
record: namespace_cpu:kube_pod_container_resource_limits:sum


@@ -1,54 +0,0 @@
condition: '{{ true }}'
name: k8s.rules.pod_owner
rules:
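# Maps each pod to its top-level workload. For ReplicaSets the owner chain is walked
# one step further (via kube_replicaset_owner) so pods resolve to the owning Deployment;
# DaemonSets, StatefulSets and Jobs are taken directly from kube_pod_owner.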
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on (replicaset,namespace,{{ .Values.global.clusterLabel }}) group_left(owner_name) topk by (replicaset,namespace,{{ .Values.global.clusterLabel }}) (
1, max by (replicaset,namespace,owner_name,{{ .Values.global.clusterLabel }}) (
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: deployment
record: namespace_workload_pod:kube_pod_owner:relabel
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: daemonset
record: namespace_workload_pod:kube_pod_owner:relabel
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: statefulset
record: namespace_workload_pod:kube_pod_owner:relabel
- condition: '{{ true }}'
expr: |-
max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
"workload", "$1", "owner_name", "(.*)"
)
)
labels:
workload_type: job
record: namespace_workload_pod:kube_pod_owner:relabel


@@ -1,128 +0,0 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
interval: 3m
name: kube-apiserver-availability.rules
rules:
- condition: '{{ true }}'
expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
record: code_verb:apiserver_request_total:increase30d
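# The 30d increase is estimated by extrapolation: average the hourly increase over
# 30 days and multiply by 24 * 30 hours, which avoids an expensive 30d range query.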
- condition: '{{ true }}'
expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
labels:
verb: read
record: code:apiserver_request_total:increase30d
- condition: '{{ true }}'
expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
labels:
verb: write
record: code:apiserver_request_total:increase30d
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_count{job="kube-apiserver"}[1h]))
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
- condition: '{{ true }}'
expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
- condition: '{{ true }}'
expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
- condition: '{{ true }}'
expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
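# Availability below is 1 - (too-slow requests + 5xx errors) / total requests over 30d,
# with per-scope latency thresholds: 1s for resource, 5s for namespace, 30s for cluster scope.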
- condition: '{{ true }}'
expr: |-
1 - (
(
# write too slow
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
) +
(
# read too slow
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
) +
# errors
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
)
/
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d)
labels:
verb: all
record: apiserver_request:availability30d
- condition: '{{ true }}'
expr: |-
1 - (
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
-
(
# too slow
(
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
+
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
)
/
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read"})
labels:
verb: read
record: apiserver_request:availability30d
- condition: '{{ true }}'
expr: |-
1 - (
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
-
sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
)
/
sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write"})
labels:
verb: write
record: apiserver_request:availability30d
- condition: '{{ true }}'
expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: code_resource:apiserver_request_total:rate5m
- condition: '{{ true }}'
expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: code_resource:apiserver_request_total:rate5m
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
record: code_verb:apiserver_request_total:increase1h
- condition: '{{ true }}'
expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
record: code_verb:apiserver_request_total:increase1h


@@ -1,318 +0,0 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
name: kube-apiserver-burnrate.rules
rules:
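# Each burnrate window records (too-slow + 5xx errors) / total for read or write verbs;
# the kube-apiserver-slos group pairs long and short windows from these recordings.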
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
labels:
verb: read
record: apiserver_request:burnrate1d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
labels:
verb: read
record: apiserver_request:burnrate1h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
labels:
verb: read
record: apiserver_request:burnrate2h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
labels:
verb: read
record: apiserver_request:burnrate30m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
labels:
verb: read
record: apiserver_request:burnrate3d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
labels:
verb: read
record: apiserver_request:burnrate5m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
(
(
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
or
vector(0)
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
)
)
+
# errors
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
labels:
verb: read
record: apiserver_request:burnrate6h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
labels:
verb: write
record: apiserver_request:burnrate1d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
labels:
verb: write
record: apiserver_request:burnrate1h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
labels:
verb: write
record: apiserver_request:burnrate2h
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
labels:
verb: write
record: apiserver_request:burnrate30m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
labels:
verb: write
record: apiserver_request:burnrate3d
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
labels:
verb: write
record: apiserver_request:burnrate5m
- condition: '{{ true }}'
expr: |-
(
(
# too slow
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
-
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
)
+
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
)
/
sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
labels:
verb: write
record: apiserver_request:burnrate6h


@@ -1,15 +0,0 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
name: kube-apiserver-histogram.rules
rules:
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
labels:
quantile: '0.99'
verb: read
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
labels:
quantile: '0.99'
verb: write
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile


@@ -1,63 +0,0 @@
condition: '{{ .Values.kubeApiServer.enabled }}'
name: kube-apiserver-slos
rules:
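# Multiwindow, multi-burn-rate SLO alerts (the pattern popularized by the Google SRE
# Workbook): the factors 14.4, 6, 3 and 1 are the burn rates at which a 30d error
# budget would be exhausted in roughly 2, 5, 10 and 30 days respectively; each long
# window is paired with a short one so alerts reset quickly once the burn stops.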
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
and
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
for: 2m
labels:
long: 1h
severity: critical
short: 5m
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
and
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
for: 15m
labels:
long: 6h
severity: critical
short: 30m
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
and
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
for: 1h
labels:
long: 1d
severity: warning
short: 2h
- alert: KubeAPIErrorBudgetBurn
annotations:
description: 'The API server is burning too much error budget.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
summary: 'The API server is burning too much error budget.'
condition: '{{ true }}'
expr: |-
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
and
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
for: 3h
labels:
long: 3d
severity: warning
short: 6h


@@ -1,9 +0,0 @@
condition: '{{ true }}'
name: kube-prometheus-general.rules
rules:
- condition: '{{ true }}'
expr: count without(instance, pod, node) (up == 1)
record: count:up1
- condition: '{{ true }}'
expr: count without(instance, pod, node) (up == 0)
record: count:up0


@@ -1,21 +0,0 @@
condition: '{{ true }}'
name: kube-prometheus-node-recording.rules
rules:
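# Pre-aggregated per-instance CPU and network rates, typically consumed by dashboards.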
- condition: '{{ true }}'
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
record: instance:node_cpu:rate:sum
- condition: '{{ true }}'
expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- condition: '{{ true }}'
expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- condition: '{{ true }}'
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- condition: '{{ true }}'
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
record: cluster:node_cpu:sum_rate5m
- condition: '{{ true }}'
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
record: cluster:node_cpu:ratio


@@ -1,48 +0,0 @@
condition: '{{ .Values.kubeScheduler.enabled }}'
name: kube-scheduler.rules
rules:
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.99'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.9'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
labels:
quantile: '0.5'
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile


@@ -1,55 +0,0 @@
condition: '{{ true }}'
name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
description: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely preventing it from exposing metrics about Kubernetes objects correctly, or at all.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors'
summary: 'kube-state-metrics is experiencing errors in list operations.'
condition: '{{ true }}'
expr: |-
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }})
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }}))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
description: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely preventing it from exposing metrics about Kubernetes objects correctly, or at all.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors'
summary: 'kube-state-metrics is experiencing errors in watch operations.'
condition: '{{ true }}'
expr: |-
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }})
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }}))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardingMismatch
annotations:
description: 'kube-state-metrics pods are running with different --total-shards configuration; some Kubernetes objects may be exposed multiple times or not exposed at all.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch'
summary: 'kube-state-metrics sharding is misconfigured.'
condition: '{{ true }}'
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) != 0
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsShardsMissing
annotations:
description: 'kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing'
summary: 'kube-state-metrics shards are missing.'
condition: '{{ true }}'
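# Bitmask check: 2^max(total_shards) - 1 is the mask of all expected shards, and the
# sum of 2^shard_ordinal is the mask of shards actually reporting; a nonzero
# difference means some ordinals are missing or out of range.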
expr: |-
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - 1
-
sum( 2 ^ max by (shard_ordinal,{{ .Values.global.clusterLabel }}) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by ({{ .Values.global.clusterLabel }})
!= 0
for: 15m
labels:
severity: critical


@@ -1,18 +0,0 @@
condition: '{{ .Values.kubelet.enabled }}'
name: kubelet.rules
rules:
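# PLEG = Pod Lifecycle Event Generator; these record p99/p90/p50 relist latency per
# kubelet instance, joined to kubelet_node_name so the series carry the node label.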
- condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.99'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.9'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- condition: '{{ true }}'
expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: '0.5'
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile


@@ -1,257 +0,0 @@
condition: '{{ true }}'
name: kubernetes-apps
rules:
- alert: KubePodCrashLooping
annotations:
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping'
summary: 'Pod is crash looping.'
condition: '{{ true }}'
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) >= 1
for: 15m
labels:
severity: warning
- alert: KubePodNotReady
annotations:
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready'
summary: 'Pod has been in a non-ready state for more than 15 minutes.'
condition: '{{ true }}'
expr: |-
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}", phase=~"Pending|Unknown|Failed"}
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(owner_kind) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (
1, max by (namespace,pod,owner_kind,{{ .Values.global.clusterLabel }}) (kube_pod_owner{owner_kind!="Job"})
)
) > 0
for: 15m
labels:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match; this indicates that the Deployment has failed but has not been rolled back.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch'
summary: 'Deployment generation mismatch due to possible roll-back'
condition: '{{ true }}'
expr: |-
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
for: 15m
labels:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch'
summary: 'Deployment has not matched the expected number of replicas.'
condition: '{{ true }}'
expr: |-
(
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
>
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
summary: 'Deployment rollout is not progressing.'
condition: '{{ true }}'
expr: |-
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!= 0
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
summary: 'StatefulSet has not matched the expected number of replicas.'
condition: '{{ true }}'
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match; this indicates that the StatefulSet has failed but has not been rolled back.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
summary: 'StatefulSet generation mismatch due to possible roll-back'
condition: '{{ true }}'
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
summary: 'StatefulSet update has not been rolled out.'
condition: '{{ true }}'
expr: |-
(
max by (namespace,statefulset,{{ .Values.global.clusterLabel }}) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck'
summary: 'DaemonSet rollout is stuck.'
condition: '{{ true }}'
expr: |-
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
0
) or (
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
)
) and (
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container {{`}}`}} has been in waiting state for longer than 1 hour.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting'
summary: 'Pod container waiting longer than 1 hour'
condition: '{{ true }}'
expr: sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) > 0
for: 1h
labels:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled'
summary: 'DaemonSet pods are not scheduled.'
condition: '{{ true }}'
expr: |-
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
-
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled'
summary: 'DaemonSet pods are misscheduled.'
condition: '{{ true }}'
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
for: 15m
labels:
severity: warning
- alert: KubeJobNotCompleted
annotations:
description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted'
summary: 'Job did not complete in time'
condition: '{{ true }}'
expr: |-
time() - max by (namespace,job_name,{{ .Values.global.clusterLabel }}) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
and
kube_job_status_active{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0) > 43200
labels:
severity: warning
- alert: KubeJobFailed
annotations:
description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed'
summary: 'Job failed to complete.'
condition: '{{ true }}'
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
for: 15m
labels:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch'
summary: 'HPA has not matched desired number of replicas.'
condition: '{{ true }}'
expr: |-
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
!=
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
>
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
<
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
and
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[15m]) == 0
for: 15m
labels:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout'
summary: 'HPA is running at max replicas'
condition: '{{ true }}'
expr: |-
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
==
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
for: 15m
labels:
severity: warning


@@ -1,113 +0,0 @@
condition: '{{ true }}'
name: kubernetes-resources
rules:
- alert: KubeCPUOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit'
summary: 'Cluster has overcommitted CPU resource requests.'
condition: '{{ true }}'
expr: |-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0
and
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0
for: 10m
labels:
severity: warning
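# Worked example (illustrative numbers, not from this chart): with three nodes exposing
# 4 allocatable CPUs each, the headroom term is 12 - 4 = 8 CPUs, so KubeCPUOvercommit
# fires once summed Pod CPU requests exceed 8, i.e. the cluster could no longer
# reschedule every Pod after losing its largest node.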
- alert: KubeMemoryOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit'
summary: 'Cluster has overcommitted memory resource requests.'
condition: '{{ true }}'
expr: |-
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0
and
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0
for: 10m
labels:
severity: warning
- alert: KubeCPUQuotaOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit'
summary: 'Cluster has overcommitted CPU resource requests.'
condition: '{{ true }}'
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ .Values.global.clusterLabel }})
/
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeMemoryQuotaOvercommit
annotations:
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Namespaces.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit'
summary: 'Cluster has overcommitted memory resource requests.'
condition: '{{ true }}'
expr: |-
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ .Values.global.clusterLabel }})
/
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull'
summary: 'Namespace quota is going to be full.'
condition: '{{ true }}'
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 0.9 < 1
for: 15m
labels:
severity: informational
- alert: KubeQuotaFullyUsed
annotations:
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused'
summary: 'Namespace quota is fully used.'
condition: '{{ true }}'
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
== 1
for: 15m
labels:
severity: informational
- alert: KubeQuotaExceeded
annotations:
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded'
summary: 'Namespace quota has exceeded the limits.'
condition: '{{ true }}'
expr: |-
kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 1
for: 15m
labels:
severity: warning
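# Note on the chained comparisons above (illustrative): in PromQL/MetricsQL a chain like
# `> 0.9 < 1` acts as two successive filters on the used/hard ratio, so
# KubeQuotaAlmostFull matches ratios in (0.9, 1), KubeQuotaFullyUsed exactly 1, and
# KubeQuotaExceeded anything above 1.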
- alert: CPUThrottlingHigh
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh'
summary: 'Processes experience elevated CPU throttling.'
condition: '{{ true }}'
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }})
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }})
> ( 25 / 100 )
for: 15m
labels:
severity: informational
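# Worked example (illustrative): a container whose cgroup saw 150 throttled CFS periods
# out of 500 elapsed periods over 5m has a throttling ratio of 0.3 > 0.25, so
# CPUThrottlingHigh would fire after the 15m hold.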

View File

@@ -1,101 +0,0 @@
condition: '{{ true }}'
name: kubernetes-storage
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup'
summary: 'PersistentVolume is filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup'
summary: 'PersistentVolume is filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
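# Note (illustrative): predict_linear(...[6h], 4 * 24 * 3600) fits a linear trend over
# the last 6h of samples and extrapolates it 4 days ahead; a negative predicted value
# for available bytes means the volume is on track to fill within that window, which is
# why this warning variant can trigger long before the 3%-free critical rule above.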
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup'
summary: 'PersistentVolumeInodes are filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.03
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1m
labels:
severity: critical
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup'
summary: 'PersistentVolumeInodes are filling up.'
condition: '{{ true }}'
expr: |-
(
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
/
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
) < 0.15
and
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
and
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
for: 1h
labels:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
description: 'The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors'
summary: 'PersistentVolume is having issues with provisioning.'
condition: '{{ true }}'
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
for: 5m
labels:
severity: critical

View File

@@ -1,62 +0,0 @@
condition: '{{ true }}'
name: kubernetes-system-apiserver
rules:
- alert: KubeClientCertificateExpiration
annotations:
description: 'A client certificate used to authenticate to the Kubernetes API server is expiring in less than 7.0 days.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
summary: 'Client certificate is about to expire.'
condition: '{{ true }}'
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
for: 5m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: 'A client certificate used to authenticate to the Kubernetes API server is expiring in less than 24.0 hours.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
summary: 'Client certificate is about to expire.'
condition: '{{ true }}'
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
for: 5m
labels:
severity: critical
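# Note (illustrative): histogram_quantile(0.01, ...) over the expiration histogram
# approximates the 1st percentile of remaining client-certificate lifetimes seen by the
# API server, so these alerts fire when even the shortest-lived ~1% of certificates
# drop below 7 days (warning) or 24 hours (critical).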
- alert: KubeAggregatedAPIErrors
annotations:
description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors'
summary: 'Kubernetes aggregated API has reported errors.'
condition: '{{ true }}'
expr: sum by (name,namespace,{{ .Values.global.clusterLabel }})(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m])) > 4
labels:
severity: warning
- alert: KubeAggregatedAPIDown
annotations:
description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown'
summary: 'Kubernetes aggregated API is down.'
condition: '{{ true }}'
expr: (1 - max by (name,namespace,{{ .Values.global.clusterLabel }})(avg_over_time(aggregator_unavailable_apiservice{job="kube-apiserver"}[10m]))) * 100 < 85
for: 5m
labels:
severity: warning
- alert: KubeAPIDown
annotations:
description: 'KubeAPI has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubeApiServer.enabled }}'
expr: absent(up{job="kube-apiserver"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeAPITerminatedRequests
annotations:
description: 'The Kubernetes API server has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests'
summary: 'The Kubernetes API server has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
condition: '{{ true }}'
expr: sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) ) > 0.20
for: 5m
labels:
severity: warning

View File

@@ -1,13 +0,0 @@
condition: '{{ .Values.kubeControllerManager.enabled }}'
name: kubernetes-system-controller-manager
rules:
- alert: KubeControllerManagerDown
annotations:
description: 'KubeControllerManager has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubeControllerManager.enabled }}'
expr: absent(up{job="kube-controller-manager"} == 1)
for: 15m
labels:
severity: critical
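# Note (illustrative): the absent(up{...} == 1) pattern used by the *Down alerts returns
# a single 1-valued sample only when no matching target reports up == 1, so it covers
# both "all targets down" and "job missing from service discovery entirely".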

View File

@@ -1,136 +0,0 @@
condition: '{{ true }}'
name: kubernetes-system-kubelet
rules:
- alert: KubeNodeNotReady
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready'
summary: 'Node is not ready.'
condition: '{{ true }}'
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 15m
labels:
severity: warning
- alert: KubeNodeUnreachable
annotations:
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable'
summary: 'Node is unreachable.'
condition: '{{ true }}'
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
description: 'Kubelet ''{{`{{`}} $labels.node {{`}}`}}'' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods'
summary: 'Kubelet is running at capacity.'
condition: '{{ true }}'
expr: |-
count by (node,{{ .Values.global.clusterLabel }}) (
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,{{ .Values.global.clusterLabel }}) group_left(node) topk by (instance,pod,namespace,{{ .Values.global.clusterLabel }}) (1, kube_pod_info{job="kube-state-metrics"})
)
/
max by (node,{{ .Values.global.clusterLabel }}) (
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
) > 0.95
for: 15m
labels:
severity: informational
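# Note (illustrative): the topk(...) * on(...) group_left(node) join deduplicates
# kube_pod_info and attaches its node label to each Running Pod exactly once, so the
# per-node Pod count can be divided by that node's Pod capacity; 0.95 means the kubelet
# is at 95% of its --max-pods limit.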
- alert: KubeNodeReadinessFlapping
annotations:
description: 'The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping'
summary: 'Node readiness status is flapping.'
condition: '{{ true }}'
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (node,{{ .Values.global.clusterLabel }}) > 2
for: 15m
labels:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh'
summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.'
condition: '{{ true }}'
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
description: 'Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh'
summary: 'Kubelet Pod startup latency is too high.'
condition: '{{ true }}'
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }})) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
for: 15m
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
summary: 'Kubelet client certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletClientCertificateExpiration
annotations:
description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
summary: 'Kubelet client certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletServerCertificateExpiration
annotations:
description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
summary: 'Kubelet server certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
labels:
severity: warning
- alert: KubeletServerCertificateExpiration
annotations:
description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
summary: 'Kubelet server certificate is about to expire.'
condition: '{{ true }}'
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
labels:
severity: critical
- alert: KubeletClientCertificateRenewalErrors
annotations:
description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors'
summary: 'Kubelet has failed to renew its client certificate.'
condition: '{{ true }}'
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletServerCertificateRenewalErrors
annotations:
description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors'
summary: 'Kubelet has failed to renew its server certificate.'
condition: '{{ true }}'
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
for: 15m
labels:
severity: warning
- alert: KubeletDown
annotations:
description: 'Kubelet has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubelet.enabled }}'
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m
labels:
severity: critical

View File

@@ -1,13 +0,0 @@
condition: '{{ .Values.kubeScheduler.enabled }}'
name: kubernetes-system-scheduler
rules:
- alert: KubeSchedulerDown
annotations:
description: 'KubeScheduler has disappeared from Prometheus target discovery.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown'
summary: 'Target disappeared from Prometheus target discovery.'
condition: '{{ .Values.kubeScheduler.enabled }}'
expr: absent(up{job="kube-scheduler"} == 1)
for: 15m
labels:
severity: critical

View File

@@ -1,27 +0,0 @@
condition: '{{ true }}'
name: kubernetes-system
rules:
- alert: KubeVersionMismatch
annotations:
description: 'There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch'
summary: 'Different semantic versions of Kubernetes components running.'
condition: '{{ true }}'
expr: count by ({{ .Values.global.clusterLabel }}) (count by (git_version,{{ .Values.global.clusterLabel }}) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
description: 'Kubernetes API server client ''{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}'' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors'
summary: 'Kubernetes API server client is experiencing errors.'
condition: '{{ true }}'
expr: |-
(sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }})
/
sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }}))
> 0.01
for: 15m
labels:
severity: warning

View File

@@ -1,76 +0,0 @@
condition: '{{ true }}'
name: node-exporter.rules
rules:
- condition: '{{ true }}'
expr: |-
count without (cpu, mode) (
node_cpu_seconds_total{job="node-exporter",mode="idle"}
)
record: instance:node_num_cpu:sum
- condition: '{{ true }}'
expr: |-
1 - avg without (cpu) (
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
)
record: instance:node_cpu_utilisation:rate5m
- condition: '{{ true }}'
expr: |-
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- condition: '{{ true }}'
expr: |-
1 - (
(
node_memory_MemAvailable_bytes{job="node-exporter"}
or
(
node_memory_Buffers_bytes{job="node-exporter"}
+
node_memory_Cached_bytes{job="node-exporter"}
+
node_memory_MemFree_bytes{job="node-exporter"}
+
node_memory_Slab_bytes{job="node-exporter"}
)
)
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- condition: '{{ true }}'
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
record: instance:node_vmstat_pgmajfault:rate5m
- condition: '{{ true }}'
expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_seconds:rate5m
- condition: '{{ true }}'
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_receive_drop_excluding_lo:rate5m
- condition: '{{ true }}'
expr: |-
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate5m
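# Usage sketch (hypothetical queries, not part of the generated rules): the recorded
# series above can be queried directly in vmui or Grafana, e.g.
#   instance:node_cpu_utilisation:rate5m{job="node-exporter"} > 0.9
#   instance:node_network_receive_drop_excluding_lo:rate5m > 0
# which is cheaper than re-evaluating the raw node_cpu_seconds_total expressions.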

View File

@@ -1,336 +0,0 @@
condition: '{{ true }}'
name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup'
summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup'
summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace'
summary: 'Filesystem has less than 5% space left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace'
summary: 'Filesystem has less than 3% space left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 30m
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup'
summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup'
summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles'
summary: 'Filesystem has less than 5% inodes left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles'
summary: 'Filesystem has less than 3% inodes left.'
condition: '{{ true }}'
expr: |-
(
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs'
summary: 'Network interface is reporting many receive errors.'
condition: '{{ true }}'
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs'
summary: 'Network interface is reporting many transmit errors.'
condition: '{{ true }}'
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused'
summary: 'Number of conntrack entries is getting close to the limit.'
condition: '{{ true }}'
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: 'Node Exporter text file collector on {{`{{`}} $labels.instance {{`}}`}} failed to scrape.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror'
summary: 'Node Exporter text file collector failed to scrape.'
condition: '{{ true }}'
expr: node_textfile_scrape_error{job="node-exporter"} == 1
labels:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected'
summary: 'Clock skew detected.'
condition: '{{ true }}'
expr: |-
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
)
or
(
node_timex_offset_seconds{job="node-exporter"} < -0.05
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
for: 10m
labels:
severity: warning
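# Note (illustrative): the deriv() guards make this alert fire only while the skew is
# not recovering, e.g. an offset of +0.08s with a non-negative 5m derivative (still
# drifting away from zero) alerts, while the same offset already trending back toward
# zero does not.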
- alert: NodeClockNotSynchronising
annotations:
description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising'
summary: 'Clock not synchronising.'
condition: '{{ true }}'
expr: |-
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
for: 10m
labels:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: 'RAID array ''{{`{{`}} $labels.device {{`}}`}}'' at {{`{{`}} $labels.instance {{`}}`}} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded'
summary: 'RAID Array is degraded.'
condition: '{{ true }}'
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
for: 15m
labels:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: 'At least one device in RAID array at {{`{{`}} $labels.instance {{`}}`}} failed. Array ''{{`{{`}} $labels.device {{`}}`}}'' needs attention and possibly a disk swap.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure'
summary: 'Failed device in RAID array.'
condition: '{{ true }}'
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit'
summary: 'Kernel is predicted to exhaust file descriptors limit soon.'
condition: '{{ true }}'
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
)
for: 15m
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit'
summary: 'Kernel is predicted to exhaust file descriptors limit soon.'
condition: '{{ true }}'
expr: |-
(
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
)
for: 15m
labels:
severity: critical
- alert: NodeCPUHighUsage
annotations:
description: 'CPU usage at {{`{{`}} $labels.instance {{`}}`}} has been above 90% for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodecpuhighusage'
summary: 'High CPU usage.'
condition: '{{ true }}'
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
for: 15m
labels:
severity: informational
- alert: NodeSystemSaturation
annotations:
description: 'System load per core at {{`{{`}} $labels.instance {{`}}`}} has been above 2 for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.
This might indicate resource saturation on this instance and can cause it to become unresponsive.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemsaturation'
summary: 'System saturated, load per core is very high.'
condition: '{{ true }}'
expr: |-
node_load1{job="node-exporter"}
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
for: 15m
labels:
severity: warning
- alert: NodeMemoryMajorPagesFaults
annotations:
description: 'Major memory page faults are occurring at a very high rate at {{`{{`}} $labels.instance {{`}}`}}: above 500 major page faults per second for the last 15 minutes, currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.
Please check that there is enough memory available on this instance.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememorymajorpagesfaults'
summary: 'Memory major page faults are occurring at very high rate.'
condition: '{{ true }}'
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
for: 15m
labels:
severity: warning
- alert: NodeMemoryHighUtilization
annotations:
description: 'Memory is filling up at {{`{{`}} $labels.instance {{`}}`}}, has been above 90% for the last 15 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememoryhighutilization'
summary: 'Host is running out of memory.'
condition: '{{ true }}'
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
for: 15m
labels:
severity: warning
- alert: NodeDiskIOSaturation
annotations:
description: 'Disk IO queue (aqu-sz) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}, has been above 10 for the last 30 minutes and is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.
This symptom might indicate disk saturation.
'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodediskiosaturation'
summary: 'Disk IO queue is high.'
condition: '{{ true }}'
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
for: 30m
labels:
severity: warning
- alert: NodeSystemdServiceFailed
annotations:
description: 'Systemd service {{`{{`}} $labels.name {{`}}`}} has entered failed state at {{`{{`}} $labels.instance {{`}}`}}.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemdservicefailed'
summary: 'Systemd service has entered failed state.'
condition: '{{ true }}'
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
for: 5m
labels:
severity: warning
- alert: NodeBondingDegraded
annotations:
description: 'Bonding interface {{`{{`}} $labels.master {{`}}`}} on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more slave failures.'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodebondingdegraded'
summary: 'Bonding interface is degraded.'
condition: '{{ true }}'
expr: (node_bonding_slaves - node_bonding_active) != 0
for: 5m
labels:
severity: warning

View File

@@ -1,13 +0,0 @@
condition: '{{ true }}'
name: node-network
rules:
- alert: NodeNetworkInterfaceFlapping
annotations:
description: 'Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}'
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping'
summary: 'Network interface is frequently changing its up status.'
condition: '{{ true }}'
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
for: 2m
labels:
severity: warning

View File

@@ -1,44 +0,0 @@
condition: '{{ true }}'
name: node.rules
rules:
- condition: '{{ true }}'
expr: |-
topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
max by (node,namespace,pod,{{ .Values.global.clusterLabel }}) (
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:'
- condition: '{{ true }}'
expr: |-
count by (node,{{ .Values.global.clusterLabel }}) (
node_cpu_seconds_total{mode="idle",job="node-exporter"}
* on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node)
topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, node_namespace_pod:kube_pod_info:)
)
record: node:node_num_cpu:sum
- condition: '{{ true }}'
expr: |-
sum(
node_memory_MemAvailable_bytes{job="node-exporter"} or
(
node_memory_Buffers_bytes{job="node-exporter"} +
node_memory_Cached_bytes{job="node-exporter"} +
node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"}
)
) by ({{ .Values.global.clusterLabel }})
record: :node_memory_MemAvailable_bytes:sum
- condition: '{{ true }}'
expr: |-
avg by (node,{{ .Values.global.clusterLabel }}) (
sum without (mode) (
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
)
)
record: node:node_cpu_utilization:ratio_rate5m
- condition: '{{ true }}'
expr: |-
avg by ({{ .Values.global.clusterLabel }}) (
node:node_cpu_utilization:ratio_rate5m
)
record: cluster:node_cpu:ratio_rate5m
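# Usage sketch (hypothetical): cluster:node_cpu:ratio_rate5m averages the per-node
# utilisation ratio across the whole cluster, so e.g. `cluster:node_cpu:ratio_rate5m > 0.8`
# could back a capacity dashboard panel or an ad-hoc alert on top of the recorded series.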

View File

@@ -1,458 +0,0 @@
{{- /* Expand the name of the chart. */ -}}
{{- define "victoria-metrics-k8s-stack.name" -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- default $Chart.Name $Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end }}
{{- /*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.
*/ -}}
{{- define "victoria-metrics-k8s-stack.fullname" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $fullname := "" -}}
{{- if .appKey -}}
{{- $appKey := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}}
{{- $values := $Values -}}
{{- $global := (index $Values.global $Chart.Name) | default dict -}}
{{- range $ak := $appKey }}
{{- $values = (index $values $ak) | default dict -}}
{{- $global = (index $global $ak) | default dict -}}
{{- if $values.name -}}
{{- $fullname = $values.name -}}
{{- else if $global.name -}}
{{- $fullname = $global.name -}}
{{- end -}}
{{- end }}
{{- end -}}
{{- if empty $fullname -}}
{{- if $Values.fullnameOverride -}}
{{- $fullname = $Values.fullnameOverride -}}
{{- else if (dig $Chart.Name "fullnameOverride" "" ($Values.global)) -}}
{{- $fullname = (dig $Chart.Name "fullnameOverride" "" ($Values.global)) -}}
{{- else -}}
{{- $name := default $Chart.Name $Values.nameOverride -}}
{{- if contains $name $Release.Name -}}
{{- $fullname = $Release.Name -}}
{{- else -}}
{{- $fullname = (printf "%s-%s" $Release.Name $name) }}
{{- end -}}
{{- end }}
{{- end -}}
{{- $fullname | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- /* Create chart name and version as used by the chart label. */ -}}
{{- define "victoria-metrics-k8s-stack.chart" -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- printf "%s-%s" $Chart.Name $Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end }}
{{- /* Create the name of the service account to use */ -}}
{{- define "victoria-metrics-k8s-stack.serviceAccountName" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- if $Values.serviceAccount.create -}}
{{- default (include "victoria-metrics-k8s-stack.fullname" .) $Values.serviceAccount.name -}}
{{- else -}}
{{- default "default" $Values.serviceAccount.name -}}
{{- end }}
{{- end }}
{{- /* Common labels */ -}}
{{- define "victoria-metrics-k8s-stack.labels" -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $Chart := (.helm).Chart | default .Chart -}}
{{- $labels := (fromYaml (include "victoria-metrics-k8s-stack.selectorLabels" .)) -}}
{{- $_ := set $labels "helm.sh/chart" (include "victoria-metrics-k8s-stack.chart" .) -}}
{{- $_ := set $labels "app.kubernetes.io/managed-by" $Release.Service -}}
{{- with $Chart.AppVersion }}
{{- $_ := set $labels "app.kubernetes.io/version" . -}}
{{- end -}}
{{- toYaml $labels -}}
{{- end }}
{{- define "vm.release" -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- default $Release.Name $Values.argocdReleaseOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}
{{- /* Selector labels */ -}}
{{- define "victoria-metrics-k8s-stack.selectorLabels" -}}
{{- $labels := .extraLabels | default dict -}}
{{- $_ := set $labels "app.kubernetes.io/name" (include "victoria-metrics-k8s-stack.name" .) -}}
{{- $_ := set $labels "app.kubernetes.io/instance" (include "vm.release" .) -}}
{{- toYaml $labels -}}
{{- end }}
{{- /* Create the name for VM service */ -}}
{{- define "vm.service" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $name := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- with .appKey -}}
{{- $prefix := . -}}
{{- if kindIs "slice" $prefix }}
{{- $prefix = last $prefix -}}
{{- end -}}
{{- $prefix = ternary $prefix (printf "vm%s" $prefix) (hasPrefix "vm" $prefix) -}}
{{- $name = printf "%s-%s" $prefix $name -}}
{{- end -}}
{{- if hasKey . "appIdx" -}}
{{- $name = (printf "%s-%d.%s" $name .appIdx $name) -}}
{{- end -}}
{{- $name -}}
{{- end }}
{{- define "vm.url" -}}
{{- $name := (include "vm.service" .) -}}
{{- $Release := (.helm).Release | default .Release -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $ns := include "vm.namespace" . -}}
{{- $proto := "http" -}}
{{- $port := 80 -}}
{{- $path := .appRoute | default "/" -}}
{{- $isSecure := false -}}
{{- if .appSecure -}}
{{- $isSecure = .appSecure -}}
{{- end -}}
{{- if .appKey -}}
{{- $appKey := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}}
{{- $spec := $Values -}}
{{- range $ak := $appKey -}}
{{- if hasKey $spec $ak -}}
{{- $spec = (index $spec $ak) -}}
{{- end -}}
{{- if hasKey $spec "spec" -}}
{{- $spec = $spec.spec -}}
{{- end -}}
{{- end -}}
{{- $isSecure = (eq ($spec.extraArgs).tls "true") | default $isSecure -}}
{{- $proto = (ternary "https" "http" $isSecure) -}}
{{- $port = (ternary 443 80 $isSecure) -}}
{{- $port = $spec.port | default $port -}}
{{- $path = dig "http.pathPrefix" $path ($spec.extraArgs | default dict) -}}
{{- end -}}
{{- printf "%s://%s.%s.svc:%d%s" $proto $name $ns (int $port) $path -}}
{{- end -}}
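{{- /* Illustrative rendering of "vm.url" (hypothetical values, not asserted by the chart):
for appKey "vmsingle", release "stack" in namespace "monitoring" and vmsingle.spec.port 8429,
the helper would yield "http://vmsingle-stack-victoria-metrics-k8s-stack.monitoring.svc:8429/";
setting extraArgs tls to "true" flips the scheme to https and the default port to 443. */ -}}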
{{- define "vm.read.endpoint" -}}
{{- $ctx := . -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $endpoint := default dict -}}
{{- if $Values.vmsingle.enabled -}}
{{- $_ := set $ctx "appKey" "vmsingle" -}}
{{- $_ := set $endpoint "url" (include "vm.url" $ctx) -}}
{{- else if $Values.vmcluster.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vmselect") -}}
{{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}}
{{- $tenant := ($Values.tenant | default 0) -}}
{{- $_ := set $endpoint "url" (printf "%s/select/%d/prometheus" $baseURL (int $tenant)) -}}
{{- else if $Values.externalVM.read.url -}}
{{- $endpoint = $Values.externalVM.read -}}
{{- end -}}
{{- toYaml $endpoint -}}
{{- end }}
{{- define "vm.write.endpoint" -}}
{{- $ctx := . -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $endpoint := default dict -}}
{{- if $Values.vmsingle.enabled -}}
{{- $_ := set $ctx "appKey" "vmsingle" -}}
{{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}}
{{- $_ := set $endpoint "url" (printf "%s/api/v1/write" $baseURL) -}}
{{- else if $Values.vmcluster.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vminsert") -}}
{{- $baseURL := (trimSuffix "/" (include "vm.url" $ctx)) -}}
{{- $tenant := ($Values.tenant | default 0) -}}
{{- $_ := set $endpoint "url" (printf "%s/insert/%d/prometheus/api/v1/write" $baseURL (int $tenant)) -}}
{{- else if $Values.externalVM.write.url -}}
{{- $endpoint = $Values.externalVM.write -}}
{{- end -}}
{{- toYaml $endpoint -}}
{{- end -}}
{{- /* VMAlert remotes */ -}}
{{- define "vm.alert.remotes" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $remotes := default dict -}}
{{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- $ctx := dict "helm" . -}}
{{- $remoteWrite := (include "vm.write.endpoint" $ctx | fromYaml) -}}
{{- if $Values.vmalert.remoteWriteVMAgent -}}
{{- $ctx := dict "helm" . "appKey" "vmagent" -}}
{{- $remoteWrite = dict "url" (printf "%s/api/v1/write" (include "vm.url" $ctx)) -}}
{{- end -}}
{{- $ctx := dict "helm" . -}}
{{- $remoteRead := (fromYaml (include "vm.read.endpoint" $ctx)) -}}
{{- $_ := set $remotes "remoteWrite" $remoteWrite -}}
{{- $_ := set $remotes "remoteRead" $remoteRead -}}
{{- $_ := set $remotes "datasource" $remoteRead -}}
{{- if $Values.vmalert.additionalNotifierConfigs }}
{{- $configName := printf "%s-vmalert-additional-notifier" $fullname -}}
{{- $notifierConfigRef := dict "name" $configName "key" "notifier-configs.yaml" -}}
{{- $_ := set $remotes "notifierConfigRef" $notifierConfigRef -}}
{{- else if $Values.alertmanager.enabled -}}
{{- $notifiers := default list -}}
{{- $appSecure := (not (empty (((.Values.alertmanager).spec).webConfig).tls_server_config)) -}}
{{- $ctx := dict "helm" . "appKey" "alertmanager" "appSecure" $appSecure "appRoute" ((.Values.alertmanager).spec).routePrefix -}}
{{- $alertManagerReplicas := (.Values.alertmanager.spec.replicaCount | default 1 | int) -}}
{{- range until $alertManagerReplicas -}}
{{- $_ := set $ctx "appIdx" . -}}
{{- $notifiers = append $notifiers (dict "url" (include "vm.url" $ctx)) -}}
{{- end }}
{{- $_ := set $remotes "notifiers" $notifiers -}}
{{- end -}}
{{- toYaml $remotes -}}
{{- end -}}
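{{- /* Illustrative rendering of "vm.alert.remotes" (hypothetical values): with
alertmanager.enabled and spec.replicaCount 2, the range over "appIdx" resolves one
notifier per replica, e.g. "http://vmalertmanager-<fullname>-0.vmalertmanager-<fullname>.<ns>.svc:9093/"
and the matching "-1" peer, i.e. the per-Pod DNS names of the Alertmanager service
(assuming spec.port 9093). */ -}}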
{{- /* VMAlert templates */ -}}
{{- define "vm.alert.templates" -}}
{{- $Values := (.helm).Values | default .Values}}
{{- $cms := ($Values.vmalert.spec.configMaps | default list) -}}
{{- if $Values.vmalert.templateFiles -}}
{{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- $cms = append $cms (printf "%s-vmalert-extra-tpl" $fullname) -}}
{{- end -}}
{{- $output := dict "configMaps" (compact $cms) -}}
{{- toYaml $output -}}
{{- end -}}
{{- define "vm.license.global" -}}
{{- $license := (deepCopy (.Values.global).license) | default dict -}}
{{- if $license.key -}}
{{- if hasKey $license "keyRef" -}}
{{- $_ := unset $license "keyRef" -}}
{{- end -}}
{{- else if $license.keyRef.name -}}
{{- if hasKey $license "key" -}}
{{- $_ := unset $license "key" -}}
{{- end -}}
{{- else -}}
{{- $license = default dict -}}
{{- end -}}
{{- toYaml $license -}}
{{- end -}}
{{- /* VMAlert spec */ -}}
{{- define "vm.alert.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $extraArgs := dict "remoteWrite.disablePathAppend" "true" -}}
{{- if $Values.vmalert.templateFiles -}}
{{- $ruleTmpl := (printf "/etc/vm/configs/%s-vmalert-extra-tpl/*.tmpl" (include "victoria-metrics-k8s-stack.fullname" .)) -}}
{{- $_ := set $extraArgs "rule.templates" $ruleTmpl -}}
{{- end -}}
{{- $vmAlertRemotes := (include "vm.alert.remotes" . | fromYaml) -}}
{{- $vmAlertTemplates := (include "vm.alert.templates" . | fromYaml) -}}
{{- $spec := dict "extraArgs" $extraArgs -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- with concat ($vmAlertRemotes.notifiers | default list) (.Values.vmalert.spec.notifiers | default list) }}
{{- $_ := set $vmAlertRemotes "notifiers" . }}
{{- end }}
{{- $spec := deepCopy (omit $Values.vmalert.spec "notifiers") | mergeOverwrite $vmAlertRemotes | mergeOverwrite $vmAlertTemplates | mergeOverwrite $spec }}
{{- if not (or (hasKey $spec "notifier") (hasKey $spec "notifiers") (hasKey $spec "notifierConfigRef") (hasKey $spec.extraArgs "notifier.blackhole")) }}
{{- fail "Neither `notifier`, `notifiers` nor `notifierConfigRef` is set for vmalert. If it's intentionally please consider setting `.vmalert.spec.extraArgs.['notifier.blackhole']` to `'true'`"}}
{{- end }}
{{- tpl (toYaml $spec) . -}}
{{- end }}
{{- /* VM Agent remoteWrites */ -}}
{{- define "vm.agent.remote.write" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $remoteWrites := $Values.vmagent.additionalRemoteWrites | default list -}}
{{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled $Values.externalVM.write.url -}}
{{- $ctx := dict "helm" . -}}
{{- $remoteWrites = append $remoteWrites (fromYaml (include "vm.write.endpoint" $ctx)) -}}
{{- end -}}
{{- toYaml (dict "remoteWrite" $remoteWrites) -}}
{{- end -}}
{{- /* VMAgent spec */ -}}
{{- define "vm.agent.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $spec := (include "vm.agent.remote.write" . | fromYaml) -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl (deepCopy $Values.vmagent.spec | mergeOverwrite $spec | toYaml) . -}}
{{- end }}
{{- /* VMAuth spec */ -}}
{{- define "vm.auth.spec" -}}
{{- $ctx := . -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $unauthorizedAccessConfig := default list }}
{{- if $Values.vmsingle.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmsingle") -}}
{{- $url := (include "vm.url" $ctx) }}
{{- $srcPath := clean (printf "%s/.*" (urlParse $url).path) }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list $url)) }}
{{- else if $Values.vmcluster.enabled -}}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vminsert") -}}
{{- $writeUrl := (include "vm.url" $ctx) }}
{{- $writeSrcPath := clean (printf "%s/insert/.*" (urlParse $writeUrl).path) }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $writeSrcPath) "url_prefix" (list $writeUrl)) }}
{{- $_ := set $ctx "appKey" (list "vmcluster" "vmselect") -}}
{{- $readUrl := (include "vm.url" $ctx) }}
{{- $readSrcPath := clean (printf "%s/select/.*" (urlParse $readUrl).path) }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $readSrcPath) "url_prefix" (list $readUrl)) }}
{{- else if or $Values.externalVM.read.url $Values.externalVM.write.url }}
{{- with $Values.externalVM.read.url }}
{{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) }}
{{- end -}}
{{- with $Values.externalVM.write.url }}
{{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" }}
{{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) }}
{{- end -}}
{{- end -}}
{{- $spec := $Values.vmauth.spec }}
{{- $_ := set $spec "unauthorizedAccessConfig" (concat $unauthorizedAccessConfig ($spec.unauthorizedAccessConfig | default list)) }}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl (toYaml $spec) . -}}
{{- end -}}
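{{- /*
Sketch of the output for a vmsingle setup, assuming a hypothetical read URL
of http://vmsingle-example:8429 with an empty path:

  unauthorizedAccessConfig:
    - src_paths: ["/.*"]
      url_prefix: ["http://vmsingle-example:8429"]

Entries supplied via vmauth.spec.unauthorizedAccessConfig are appended after
the generated ones.
*/ -}}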
{{- /* Alertmanager spec */ -}}
{{- define "vm.alertmanager.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $fullname := (include "victoria-metrics-k8s-stack.fullname" .) -}}
{{- $spec := $Values.alertmanager.spec -}}
{{- if and (not $Values.alertmanager.spec.configRawYaml) (not $Values.alertmanager.spec.configSecret) -}}
{{- $_ := set $spec "configSecret" (printf "%s-alertmanager" $fullname) -}}
{{- end -}}
{{- $templates := default list -}}
{{- if $Values.alertmanager.monzoTemplate.enabled -}}
{{- $configMap := (printf "%s-alertmanager-monzo-tpl" $fullname) -}}
{{- $templates = append $templates (dict "name" $configMap "key" "monzo.tmpl") -}}
{{- end -}}
{{- $configMap := (printf "%s-alertmanager-extra-tpl" $fullname) -}}
{{- range $key, $value := (.Values.alertmanager.templateFiles | default dict) -}}
{{- $templates = append $templates (dict "name" $configMap "key" $key) -}}
{{- end -}}
{{- $_ := set $spec "templates" $templates -}}
{{- toYaml $spec -}}
{{- end -}}
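{{- /*
Illustrative values (file name assumed): every alertmanager.templateFiles key
becomes an entry of the "<fullname>-alertmanager-extra-tpl" ConfigMap that the
helper above registers under spec.templates:

  alertmanager:
    monzoTemplate:
      enabled: true
    templateFiles:
      slack.tmpl: '...'
*/ -}}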
{{- /* Single spec */ -}}
{{- define "vm.single.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $extraArgs := default dict -}}
{{- if $Values.vmalert.enabled }}
{{- $ctx := dict "helm" . "appKey" "vmalert" -}}
{{- $_ := set $extraArgs "vmalert.proxyURL" (include "vm.url" $ctx) -}}
{{- end -}}
{{- $spec := dict "extraArgs" $extraArgs -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $spec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl (deepCopy $Values.vmsingle.spec | mergeOverwrite $spec | toYaml) . -}}
{{- end }}
{{- /* Cluster spec */ -}}
{{- define "vm.select.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $extraArgs := default dict -}}
{{- if $Values.vmalert.enabled -}}
{{- $ctx := dict "helm" . "appKey" "vmalert" -}}
{{- $_ := set $extraArgs "vmalert.proxyURL" (include "vm.url" $ctx) -}}
{{- end -}}
{{- $spec := dict "extraArgs" $extraArgs -}}
{{- toYaml $spec -}}
{{- end -}}
{{- define "vm.cluster.spec" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $spec := (include "vm.select.spec" . | fromYaml) -}}
{{- $clusterSpec := (deepCopy $Values.vmcluster.spec) -}}
{{- with (include "vm.license.global" .) -}}
{{- $_ := set $clusterSpec "license" (fromYaml .) -}}
{{- end -}}
{{- tpl ($clusterSpec | mergeOverwrite (dict "vmselect" $spec) | toYaml) . -}}
{{- end -}}
{{- define "vm.data.source.enabled" -}}
{{- $Values := (.helm).Values | default .Values -}}
{{- $grafana := $Values.grafana -}}
{{- $isEnabled := false -}}
{{- if $grafana.plugins -}}
{{- range $value := $grafana.plugins -}}
{{- if contains "victoriametrics-datasource" $value -}}
{{- $isEnabled = true -}}
{{- end }}
{{- end }}
{{- end }}
{{- $unsignedPlugins := ((index $grafana "grafana.ini").plugins).allow_loading_unsigned_plugins | default "" -}}
{{- $allowUnsigned := contains "victoriametrics-datasource" $unsignedPlugins -}}
{{- ternary "true" "" (and $isEnabled $allowUnsigned) -}}
{{- end -}}
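{{- /*
For the helper above to return "true" the plugin must be both installed and
allow-listed as unsigned, e.g. (illustrative values):

  grafana:
    plugins:
      - victoriametrics-datasource
    grafana.ini:
      plugins:
        allow_loading_unsigned_plugins: victoriametrics-datasource
*/ -}}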
{{- /* Datasources */ -}}
{{- define "vm.data.sources" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $grafana := $Values.grafana -}}
{{- $datasources := $Values.grafana.additionalDataSources | default list -}}
{{- $vmDatasource := "victoriametrics-datasource" -}}
{{- $allowVMDatasource := (ternary false true (empty (include "vm.data.source.enabled" .))) -}}
{{- if or $Values.vmsingle.enabled $Values.vmcluster.enabled -}}
{{- $ctx := dict "helm" . -}}
{{- $readEndpoint:= (include "vm.read.endpoint" $ctx | fromYaml) -}}
{{- $defaultDatasources := default list -}}
{{- range $ds := $grafana.sidecar.datasources.default }}
{{- if not $ds.type -}}
{{- $_ := set $ds "type" $Values.grafana.defaultDatasourceType }}
{{- end -}}
{{- if or (ne $ds.type $vmDatasource) $allowVMDatasource -}}
{{- $_ := set $ds "url" $readEndpoint.url -}}
{{- $defaultDatasources = append $defaultDatasources $ds -}}
{{- end -}}
{{- end }}
{{- $datasources = concat $datasources $defaultDatasources -}}
{{- if and $grafana.sidecar.datasources.createVMReplicasDatasources $defaultDatasources -}}
{{- range $id := until (int $Values.vmsingle.spec.replicaCount) -}}
{{- $_ := set $ctx "appIdx" $id -}}
{{- $readEndpoint := (include "vm.read.endpoint" $ctx | fromYaml) -}}
{{- range $ds := $defaultDatasources -}}
{{- $ds = (deepCopy $ds) -}}
{{- $_ := set $ds "url" $readEndpoint.url -}}
{{- $_ := set $ds "name" (printf "%s-%d" $ds.name $id) -}}
{{- $_ := set $ds "isDefault" false -}}
{{- $datasources = append $datasources $ds -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{- end -}}
{{- toYaml $datasources -}}
{{- end }}
{{- /* VMRule name */ -}}
{{- define "victoria-metrics-k8s-stack.rulegroup.name" -}}
{{- printf "%s-%s" (include "victoria-metrics-k8s-stack.fullname" .) (.name | replace "_" "") -}}
{{- end -}}
{{- /* VMRule labels */ -}}
{{- define "victoria-metrics-k8s-stack.rulegroup.labels" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $labels := (fromYaml (include "victoria-metrics-k8s-stack.labels" .)) -}}
{{- $_ := set $labels "app" (include "victoria-metrics-k8s-stack.name" .) -}}
{{- $labels = mergeOverwrite $labels (deepCopy $Values.defaultRules.labels) -}}
{{- toYaml $labels -}}
{{- end }}
{{- /* VMRule key */ -}}
{{- define "victoria-metrics-k8s-stack.rulegroup.key" -}}
{{- without (regexSplit "[-_.]" .name -1) "exporter" "rules" | join "-" | camelcase | untitle -}}
{{- end -}}
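{{- /*
Worked example for the key helper above: a file named
"node-exporter.rules.yaml" gives the raw name "node-exporter.rules", which
splits into ["node", "exporter", "rules"]; dropping "exporter" and "rules"
leaves "node", so the resulting group key is "node".
*/ -}}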
{{- /* VMAlertmanager name */ -}}
{{- define "victoria-metrics-k8s-stack.alertmanager.name" -}}
{{- $Values := (.helm).Values | default .Values }}
{{- $Values.alertmanager.name | default (printf "%s-%s" "vmalertmanager" (include "victoria-metrics-k8s-stack.fullname" .) | trunc 63 | trimSuffix "-") -}}
{{- end -}}

View File

@@ -1,4 +0,0 @@
{{ range .Values.extraObjects }}
---
{{ tpl (toYaml .) $ }}
{{ end }}

View File

@@ -1,121 +0,0 @@
{{- if .Values.defaultRules.create }}
{{- /*
Default rules alias
*/}}
{{- $defaultRules := .Values.defaultRules -}}
{{- /*
Exact rules overrides
*/}}
{{- $exactRules := index $defaultRules "rules" | default dict }}
{{- /*
Create custom template context
*/}}
{{- $ctx := (dict "Values" (deepCopy .Values) "Release" (deepCopy .Release) "Chart" (deepCopy .Chart) "Template" (deepCopy .Template)) -}}
{{- /*
Loop through all rules files generated by hack/sync_rules.py
*/}}
{{- range $groupFile, $_ := .Files.Glob "files/rules/**.yaml" -}}
{{- /*
Get group name from file
*/}}
{{- $groupBase := base $groupFile -}}
{{- $groupNameRaw := trimSuffix (ext $groupBase) $groupBase -}}
{{- /*
Create context for templating
*/}}
{{- $_ := set $ctx "name" $groupNameRaw -}}
{{- /*
Derive a sanitized group name from the file name
*/}}
{{- $groupName := include "victoria-metrics-k8s-stack.rulegroup.key" $ctx -}}
{{- /*
Merge the common group settings with the group data of the current iteration
*/}}
{{- $group := mergeOverwrite (deepCopy (dig "group" (default dict) $defaultRules)) (dig "groups" $groupName (default dict) $defaultRules) -}}
{{- /*
Get group data from file
*/}}
{{- $groupCtx := mergeOverwrite (deepCopy $ctx) $group }}
{{- $groupData := fromYaml (tpl ($.Files.Get $groupFile) $groupCtx) -}}
{{- /*
Save rules spec from file
*/}}
{{- $rulesSpec := $groupData.rules -}}
{{- /*
Delete rules from group
*/}}
{{- $_ := unset $groupData "rules" -}}
{{- /*
Save condition for group from file
*/}}
{{- $groupCondition := (eq $groupData.condition "true") -}}
{{- /*
Delete condition from group
*/}}
{{- $_ := unset $groupData "condition" -}}
{{- /*
Merge group spec
*/}}
{{- $groupSpec := mergeOverwrite (deepCopy $groupData) (dig "spec" (default dict) $group) -}}
{{- /*
Filter out ignored rules
*/}}
{{- $commonRule := dig "rule" (default dict) $defaultRules }}
{{- $commonInGroupRule := dig "rules" (default dict) $group }}
{{- $filteredRulesSpec := default list }}
{{- range $_, $ruleSpec := $rulesSpec }}
{{- $ruleName := $ruleSpec.alert | default "" }}
{{- $ruleKey := (hasKey $ruleSpec "record" | ternary "recording" "alerting") -}}
{{- $ruleCondition := (eq $ruleSpec.condition "true") }}
{{- $_ := unset $ruleSpec "condition" }}
{{- $exactRule := index $exactRules $ruleName | default dict }}
{{- $defaultRule := deepCopy (index $defaultRules $ruleKey) }}
{{- $resultRule := mergeOverwrite (deepCopy $commonRule) $defaultRule $commonInGroupRule $exactRule }}
{{- if (and (dig "create" true $resultRule) $ruleCondition) }}
{{- $ruleSpec := mergeOverwrite (deepCopy $ruleSpec) (dig "spec" (default dict) $resultRule) }}
{{- $filteredRulesSpec = append $filteredRulesSpec $ruleSpec }}
{{- end }}
{{- end -}}
{{- $rulesSpec = $filteredRulesSpec }}
{{- /*
Check if group is enabled
*/}}
{{- if (and $rulesSpec (dig "create" true $group) $groupCondition) }}
---
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
name: {{ include "victoria-metrics-k8s-stack.rulegroup.name" $ctx }}
{{- with (include "victoria-metrics-k8s-stack.rulegroup.labels" $ctx) }}
labels: {{ . | nindent 4 }}
{{- end }}
{{- with $.Values.defaultRules.annotations }}
annotations: {{ toYaml . | nindent 4 }}
{{- end }}
spec:
groups:
- {{ toYaml $groupSpec | indent 4 | trim }}
rules:
{{- range $ruleSpec := $rulesSpec }}
- {{ toYaml $ruleSpec | indent 6 | trim }}
{{- end }}
{{- end }}
{{- end }}
{{- end }}
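{{- /*
Illustrative defaultRules overrides (an assumed values layout) resolved by the
template above, from most generic to most specific:

  defaultRules:
    group:              # settings applied to every group
      spec:
        params: {}
    groups:             # per-group overrides, keyed by the sanitized name
      etcd:
        create: true
    rules:              # per-alert overrides, keyed by the alert name
      etcdMembersDown:
        create: false
*/ -}}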

View File

@@ -1,26 +0,0 @@
### Plans
* [x] VMCluster
* [x] VMSingle
* [x] VMAgent
* [x] VMAlert
* [x] AlertManager
* [x] Annotations
* [x] ServiceScrapes
* [x] node-exporter
* [x] Grafana
* [x] kube-state-metrics
* [x] kube-mixin
* [x] core-dns
* [x] Grafana DS
* [x] Dashboards
* [x] node-exporter
* [x] kube-state-metrics
* [x] kube-mixin
* [x] Rules
* [x] kube-mixin
* [x] kube-prometheus
* [x] victoria-metrics
* [ ] ServiceAccount configuration
* [ ] SelectorOverride for ServiceScrapes
* [ ] helm hook to uninstall CRD objects before chart removal

View File

@@ -1,38 +0,0 @@
vmagent:
spec:
volumes:
- hostPath:
path: /var/lib/minikube/certs/etcd
type: DirectoryOrCreate
name: etcd-certs
volumeMounts:
- mountPath: /var/lib/minikube/certs/etcd
name: etcd-certs
kubeScheduler:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: http-metrics
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
kubeControllerManager:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: http-metrics
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
insecureSkipVerify: true
kubeEtcd:
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
port: http-metrics
scheme: https
tlsConfig:
caFile: /var/lib/minikube/certs/etcd/ca.crt
certFile: /var/lib/minikube/certs/etcd/peer.crt
keyFile: /var/lib/minikube/certs/etcd/peer.key

View File

@@ -0,0 +1,7 @@
{{- $files := .Files.Glob "alerts/*.yaml" -}}
{{- range $path, $file := $files }}
---
# from: {{ $path }}
{{ toString $file }}
{{- end -}}
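{{- /*
Each YAML file under alerts/ is emitted verbatim as its own manifest,
separated by "---" and annotated with its source path; e.g. a (hypothetical)
alerts/etcd.yaml would land in the release output unchanged.
*/ -}}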

View File

@@ -82,3 +82,275 @@ victoria-metrics-k8s-stack:
enabled: false
alertmanager:
name: vmalertmanager-alertmanager
kube-state-metrics:
rbac:
extraRules:
- apiGroups:
- source.toolkit.fluxcd.io
- kustomize.toolkit.fluxcd.io
- helm.toolkit.fluxcd.io
- notification.toolkit.fluxcd.io
- image.toolkit.fluxcd.io
resources:
- gitrepositories
- buckets
- helmrepositories
- helmcharts
- ocirepositories
- kustomizations
- helmreleases
- alerts
- providers
- receivers
- imagerepositories
- imagepolicies
- imageupdateautomations
verbs: [ "list", "watch" ]
customResourceState:
enabled: true
config:
spec:
resources:
- groupVersionKind:
group: kustomize.toolkit.fluxcd.io
version: v1
kind: Kustomization
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux Kustomization resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, lastAppliedRevision ]
source_name: [ spec, sourceRef, name ]
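# The block above yields one info series per Kustomization, roughly (sample
# output, all label values illustrative):
#   gotk_resource_info{name="apps", exported_namespace="flux-system",
#     ready="True", suspended="false", revision="main@sha1:abc123",
#     source_name="flux-system"} 1
# The remaining Flux kinds below follow the same pattern.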
- groupVersionKind:
group: helm.toolkit.fluxcd.io
version: v2
kind: HelmRelease
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux HelmRelease resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, history, "0", chartVersion ]
chart_name: [ status, history, "0", chartName ]
chart_app_version: [ status, history, "0", appVersion ]
chart_ref_name: [ spec, chartRef, name ]
chart_source_name: [ spec, chart, spec, sourceRef, name ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1
kind: GitRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux GitRepository resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
url: [ spec, url ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1beta2
kind: Bucket
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux Bucket resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
endpoint: [ spec, endpoint ]
bucket_name: [ spec, bucketName ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1
kind: HelmRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux HelmRepository resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
url: [ spec, url ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1
kind: HelmChart
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux HelmChart resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
chart_name: [ spec, chart ]
chart_version: [ spec, version ]
- groupVersionKind:
group: source.toolkit.fluxcd.io
version: v1beta2
kind: OCIRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux OCIRepository resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
revision: [ status, artifact, revision ]
url: [ spec, url ]
- groupVersionKind:
group: notification.toolkit.fluxcd.io
version: v1beta3
kind: Alert
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux Alert resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
suspended: [ spec, suspend ]
- groupVersionKind:
group: notification.toolkit.fluxcd.io
version: v1beta3
kind: Provider
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux Provider resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
suspended: [ spec, suspend ]
- groupVersionKind:
group: notification.toolkit.fluxcd.io
version: v1
kind: Receiver
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux Receiver resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
webhook_path: [ status, webhookPath ]
- groupVersionKind:
group: image.toolkit.fluxcd.io
version: v1beta2
kind: ImageRepository
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux ImageRepository resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
image: [ spec, image ]
- groupVersionKind:
group: image.toolkit.fluxcd.io
version: v1beta2
kind: ImagePolicy
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux ImagePolicy resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
source_name: [ spec, imageRepositoryRef, name ]
- groupVersionKind:
group: image.toolkit.fluxcd.io
version: v1beta2
kind: ImageUpdateAutomation
metricNamePrefix: gotk
metrics:
- name: "resource_info"
help: "The current state of a Flux ImageUpdateAutomation resource."
each:
type: Info
info:
labelsFromPath:
name: [ metadata, name ]
labelsFromPath:
exported_namespace: [ metadata, namespace ]
ready: [ status, conditions, "[type=Ready]", status ]
suspended: [ spec, suspend ]
source_name: [ spec, sourceRef, name ]