mirror of
https://github.com/outbackdingo/cozystack.git
synced 2026-01-28 02:18:36 +00:00
Rework alerts; Add fluxcd alerts (#388)
- Rework alerts - Add fluxcd alerts --------- Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
This commit is contained in:
@@ -19,26 +19,3 @@ update:
|
||||
helm repo add fluent https://fluent.github.io/helm-charts
|
||||
helm repo update fluent
|
||||
helm pull fluent/fluent-bit --untar --untardir charts
|
||||
# alerts from victoria-metrics-k8s-stack
|
||||
helm repo add vm https://victoriametrics.github.io/helm-charts/
|
||||
helm repo update vm
|
||||
helm pull vm/victoria-metrics-k8s-stack --untar --untardir charts
|
||||
rm -rf charts/victoria-metrics-k8s-stack/charts
|
||||
rm -rf charts/victoria-metrics-k8s-stack/hack
|
||||
rm -rf charts/victoria-metrics-k8s-stack/templates/victoria-metrics-operator
|
||||
rm -rf charts/victoria-metrics-k8s-stack/templates/grafana
|
||||
rm -rf charts/victoria-metrics-k8s-stack/templates/ingress.yaml
|
||||
rm -rf charts/victoria-metrics-k8s-stack/files/dashboards
|
||||
rm -f charts/victoria-metrics-k8s-stack/templates/servicemonitors.yaml
|
||||
rm -f charts/victoria-metrics-k8s-stack/templates/serviceaccount.yaml
|
||||
rm -f charts/victoria-metrics-k8s-stack/templates/rules/additionalVictoriaMetricsRules.yml
|
||||
sed -i '/ namespace:/d' charts/victoria-metrics-k8s-stack/templates/rules/rule.yaml
|
||||
sed -i 's|job="apiserver"|job="kube-apiserver"|g' `grep -rl 'job="apiserver"' charts/victoria-metrics-k8s-stack/files/rules/generated`
|
||||
sed -i 's|severity: info|severity: informational|g' `grep -rl 'severity: info' ./charts/victoria-metrics-k8s-stack/files/rules/generated`
|
||||
sed -i 's|severity: none|severity: ok|g' ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml
|
||||
sed -i ./charts/victoria-metrics-k8s-stack/files/rules/generated/general.rules.yaml \
|
||||
-e '/Watchdog/,/severity:/s/severity: none/severity: ok/' \
|
||||
-e '/InfoInhibitor/,/severity:/s/severity: none/severity: major/'
|
||||
# TODO
|
||||
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/alertmanager.rules.yaml
|
||||
rm -f charts/victoria-metrics-k8s-stack/files/rules/generated/vm*.yaml
|
||||
|
||||
221
packages/system/monitoring/alerts/etcd.yaml
Normal file
221
packages/system/monitoring/alerts/etcd.yaml
Normal file
@@ -0,0 +1,221 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-etcd
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
params: {}
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: |-
|
||||
max without (endpoint) (
|
||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||
without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance
|
||||
}} has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
||||
resources, high network latency, or disruptions by other components and
|
||||
should be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
||||
>= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests
|
||||
for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests
|
||||
for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
|
||||
service: etcd
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
||||
}} method.'
|
||||
summary: etcd grpc requests are slow
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.grpc_method }}'
|
||||
service: etcd
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member communication with
|
||||
{{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.To }}'
|
||||
service: etcd
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||
within the last 15 minutes on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the
|
||||
defined quota on etcd instance {{ $labels.instance }}, please defrag or
|
||||
increase the quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: etcd cluster database is running full.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])
|
||||
/ last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100
|
||||
> 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of
|
||||
disk space in the next four hours, based on write observations within the
|
||||
past four hours on etcd instance {{ $labels.instance }}, please check as
|
||||
it might be disruptive.'
|
||||
summary: etcd cluster database growing very fast.
|
||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h],
|
||||
4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
||||
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
||||
retrieve the unused fragmented disk space.'
|
||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||
summary: etcd database size in use is less than 50% of the actual allocated
|
||||
storage.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) <
|
||||
0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: etcd
|
||||
128
packages/system/monitoring/alerts/flux.yaml
Normal file
128
packages/system/monitoring/alerts/flux.yaml
Normal file
@@ -0,0 +1,128 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
annotations:
|
||||
meta.helm.sh/release-name: monitoring
|
||||
meta.helm.sh/release-namespace: cozy-monitoring
|
||||
labels:
|
||||
app: victoria-metrics-k8s-stack
|
||||
app.kubernetes.io/instance: monitoring
|
||||
app.kubernetes.io/managed-by: Helm
|
||||
app.kubernetes.io/name: victoria-metrics-k8s-stack
|
||||
app.kubernetes.io/version: v1.102.1
|
||||
helm.sh/chart: victoria-metrics-k8s-stack-0.25.17
|
||||
name: alerts-flux-resources
|
||||
namespace: cozy-monitoring
|
||||
spec:
|
||||
groups:
|
||||
- name: flux-resources-alerts
|
||||
rules:
|
||||
- alert: HelmReleaseNotReady
|
||||
expr: gotk_resource_info{customresource_kind="HelmRelease", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready"
|
||||
description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is in an unready state for more than 5 minutes."
|
||||
|
||||
- alert: GitRepositorySyncFailed
|
||||
expr: gotk_resource_info{customresource_kind="GitRepository", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
|
||||
description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."
|
||||
|
||||
- alert: KustomizationNotApplied
|
||||
expr: gotk_resource_info{customresource_kind="Kustomization", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not applied"
|
||||
description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not successfully applied for more than 5 minutes."
|
||||
|
||||
- alert: ImageRepositorySyncFailed
|
||||
expr: gotk_resource_info{customresource_kind="ImageRepository", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} sync failed"
|
||||
description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has not been successfully synced for more than 5 minutes."
|
||||
|
||||
- alert: HelmChartFailed
|
||||
expr: gotk_resource_info{customresource_kind="HelmChart", ready!="True"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: major
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has failed"
|
||||
description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is not ready for more than 5 minutes."
|
||||
|
||||
- alert: HelmReleaseSuspended
|
||||
expr: gotk_resource_info{customresource_kind="HelmRelease", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "HelmRelease {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: GitRepositorySuspended
|
||||
expr: gotk_resource_info{customresource_kind="GitRepository", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "GitRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: KustomizationSuspended
|
||||
expr: gotk_resource_info{customresource_kind="Kustomization", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "Kustomization {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: ImageRepositorySuspended
|
||||
expr: gotk_resource_info{customresource_kind="ImageRepository", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "ImageRepository {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
|
||||
- alert: HelmChartSuspended
|
||||
expr: gotk_resource_info{customresource_kind="HelmChart", suspended="true"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
service: fluxcd
|
||||
exported_instance: '{{ $labels.exported_namespace }}/{{ $labels.name }}'
|
||||
annotations:
|
||||
summary: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} is suspended"
|
||||
description: "HelmChart {{ $labels.name }} in namespace {{ $labels.exported_namespace }} has been suspended."
|
||||
57
packages/system/monitoring/alerts/general.rules.yaml
Normal file
57
packages/system/monitoring/alerts/general.rules.yaml
Normal file
@@ -0,0 +1,57 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-general.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: general.rules
|
||||
params: {}
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr: 100 * (count(up == 0) BY (job,namespace,service,cluster) / count(up) BY
|
||||
(job,namespace,service,cluster)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: general.rules
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||
summary: An alert that should always be firing to certify that Alertmanager
|
||||
is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: ok
|
||||
exported_instance: global
|
||||
service: general.rules
|
||||
event: Heartbeat
|
||||
- alert: InfoInhibitor
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert that is used to inhibit info alerts.
|
||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
||||
other alerts.
|
||||
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
|
||||
severity of 'warning' or 'critical' starts firing on the same namespace.
|
||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
||||
summary: Info-level alert inhibition.
|
||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace,cluster) ALERTS{alertname
|
||||
!= "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} ==
|
||||
1
|
||||
labels:
|
||||
severity: major
|
||||
exported_instance: global
|
||||
service: general.rules
|
||||
@@ -0,0 +1,18 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containercpuusagesecondstotal
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_cpu_usage_seconds_total
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,pod,container,cluster) (
|
||||
irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
) * on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (
|
||||
1, max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemorycache
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_cache
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_cache
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemoryrss
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_rss
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_rss
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemoryswap
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_swap
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_swap
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containermemoryworkingsetbytes
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_memory_working_set_bytes
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
* on (namespace,pod,cluster) group_left(node) topk by (namespace,pod,cluster) (1,
|
||||
max by (namespace,pod,node,cluster) (kube_pod_info{node!=""})
|
||||
)
|
||||
labels: {}
|
||||
record: node_namespace_pod_container:container_memory_working_set_bytes
|
||||
@@ -0,0 +1,93 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.containerresource
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.container_resource
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_memory:kube_pod_container_resource_requests:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_cpu:kube_pod_container_resource_requests:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_memory:kube_pod_container_resource_limits:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace,pod,cluster)
|
||||
group_left() max by (namespace,pod,cluster) (
|
||||
(kube_pod_status_phase{phase=~"Pending|Running"} == 1)
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum by (namespace,cluster) (
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,container,cluster) (
|
||||
kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
|
||||
) * on (namespace,pod,cluster) group_left() max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{phase=~"Pending|Running"} == 1
|
||||
)
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: namespace_cpu:kube_pod_container_resource_limits:sum
|
||||
60
packages/system/monitoring/alerts/k8s.rules.pod_owner.yaml
Normal file
60
packages/system/monitoring/alerts/k8s.rules.pod_owner.yaml
Normal file
@@ -0,0 +1,60 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-k8s.rules.podowner
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules.pod_owner
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on (replicaset,namespace,cluster) group_left(owner_name) topk by (replicaset,namespace,cluster) (
|
||||
1, max by (replicaset,namespace,owner_name,cluster) (
|
||||
kube_replicaset_owner{job="kube-state-metrics"}
|
||||
)
|
||||
),
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
max by (namespace,workload,pod,cluster) (
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
)
|
||||
labels:
|
||||
workload_type: job
|
||||
record: namespace_workload_pod:kube_pod_owner:relabel
|
||||
@@ -0,0 +1,146 @@
|
||||
# Recording rules behind the 30-day API server availability figures.
# The chain is: 1h increases -> 30d extrapolation (avg over 30d * 24 * 30)
# -> apiserver_request:availability30d for verb=all/read/write.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-apiserver-availability.rules
spec:
  groups:
    - name: kube-apiserver-availability.rules
      interval: 3m
      params: {}
      rules:
        # 30d request volume, extrapolated from the hourly increase series
        # recorded at the bottom of this group.
        - record: code_verb:apiserver_request_total:increase30d
          labels: {}
          annotations: {}
          expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
        - record: code:apiserver_request_total:increase30d
          labels:
            verb: read
          annotations: {}
          expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
        - record: code:apiserver_request_total:increase30d
          labels:
            verb: write
          annotations: {}
          expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
        # Hourly increase of the SLI duration histogram: total observations ...
        - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
          labels: {}
          annotations: {}
          expr: sum by (verb,scope,cluster) (increase(apiserver_request_sli_duration_seconds_count{job="kube-apiserver"}[1h]))
        - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
          labels: {}
          annotations: {}
          expr: sum by (verb,scope,cluster) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30)
        # ... and per-bucket observations. The job selector matches the _count
        # rule above: the availability expressions subtract bucket from count,
        # so both sides must be built from the same set of series.
        - record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
          labels: {}
          annotations: {}
          expr: sum by (verb,scope,le,cluster) (increase(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver"}[1h]))
        - record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
          labels: {}
          annotations: {}
          expr: sum by (verb,scope,le,cluster) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
        # Availability over all verbs: 1 - (slow writes + slow reads + 5xx) / all requests.
        - record: apiserver_request:availability30d
          labels:
            verb: all
          annotations: {}
          expr: |-
            1 - (
              (
                # write too slow
                sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
                -
                sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
              ) +
              (
                # read too slow
                sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
                -
                (
                  (
                    sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
                  +
                  sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
                )
              ) +
              # errors
              sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
            )
            /
            sum by (cluster) (code:apiserver_request_total:increase30d)
        # Availability of read requests (LIST|GET) only.
        - record: apiserver_request:availability30d
          labels:
            verb: read
          annotations: {}
          expr: |-
            1 - (
              sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
              -
              (
                # too slow
                (
                  sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
                  or
                  vector(0)
                )
                +
                sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
                +
                sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
              )
              +
              # errors
              sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
            )
            /
            sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
        # Availability of write requests (POST|PUT|PATCH|DELETE) only.
        - record: apiserver_request:availability30d
          labels:
            verb: write
          annotations: {}
          expr: |-
            1 - (
              (
                # too slow
                sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
                -
                sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
              )
              +
              # errors
              sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
            )
            /
            sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
        # 5m request rates per response code and resource.
        - record: code_resource:apiserver_request_total:rate5m
          labels:
            verb: read
          annotations: {}
          expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
        - record: code_resource:apiserver_request_total:rate5m
          labels:
            verb: write
          annotations: {}
          expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
        # Hourly request increase, recorded once per response-code class
        # (2xx, 3xx, 4xx, 5xx) under the same series name.
        - record: code_verb:apiserver_request_total:increase1h
          labels: {}
          annotations: {}
          expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
        - record: code_verb:apiserver_request_total:increase1h
          labels: {}
          annotations: {}
          expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
        - record: code_verb:apiserver_request_total:increase1h
          labels: {}
          annotations: {}
          expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
        - record: code_verb:apiserver_request_total:increase1h
          labels: {}
          annotations: {}
          expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
@@ -0,0 +1,324 @@
|
||||
# Burn-rate recording rules for the API server: (too-slow requests + 5xx
# responses) / all requests, recorded per window (1d, 1h, 2h, 30m, 3d, 5m, 6h)
# and split by the static `verb` label into read (LIST|GET) and
# write (POST|PUT|PATCH|DELETE).
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-apiserver-burnrate.rules
spec:
  groups:
    - name: kube-apiserver-burnrate.rules
      params: {}
      rules:
        # ---- read requests -------------------------------------------------
        # A read counts as fast when it lands in le="1" (scope resource or
        # empty), le="5" (scope namespace) or le="30" (scope cluster).
        - record: apiserver_request:burnrate1d
          labels:
            verb: read
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
                -
                (
                  (
                    sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
                )
              )
              +
              # errors
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))
        - record: apiserver_request:burnrate1h
          labels:
            verb: read
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
                -
                (
                  (
                    sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
                )
              )
              +
              # errors
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))
        - record: apiserver_request:burnrate2h
          labels:
            verb: read
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
                -
                (
                  (
                    sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
                )
              )
              +
              # errors
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))
        - record: apiserver_request:burnrate30m
          labels:
            verb: read
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
                -
                (
                  (
                    sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
                )
              )
              +
              # errors
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))
        - record: apiserver_request:burnrate3d
          labels:
            verb: read
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
                -
                (
                  (
                    sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
                )
              )
              +
              # errors
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))
        - record: apiserver_request:burnrate5m
          labels:
            verb: read
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
                -
                (
                  (
                    sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
                )
              )
              +
              # errors
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
        - record: apiserver_request:burnrate6h
          labels:
            verb: read
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
                -
                (
                  (
                    sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
                    or
                    vector(0)
                  )
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
                  +
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
                )
              )
              +
              # errors
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))
        # ---- write requests ------------------------------------------------
        # A write counts as fast when it lands in the le="1" bucket.
        - record: apiserver_request:burnrate1d
          labels:
            verb: write
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
                -
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
              )
              +
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
        - record: apiserver_request:burnrate1h
          labels:
            verb: write
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
                -
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
              )
              +
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
        - record: apiserver_request:burnrate2h
          labels:
            verb: write
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
                -
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
              )
              +
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
        - record: apiserver_request:burnrate30m
          labels:
            verb: write
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
                -
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
              )
              +
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
        - record: apiserver_request:burnrate3d
          labels:
            verb: write
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
                -
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
              )
              +
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
        - record: apiserver_request:burnrate5m
          labels:
            verb: write
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
                -
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
              )
              +
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
        - record: apiserver_request:burnrate6h
          labels:
            verb: write
          annotations: {}
          expr: |-
            (
              (
                # too slow
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
                -
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
              )
              +
              sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
            )
            /
            sum by (cluster) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
@@ -0,0 +1,23 @@
|
||||
# 99th-percentile API server request latency per resource, computed from the
# 5m rate of the SLI duration histogram. Results that are not > 0 are dropped.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-apiserver-histogram.rules
spec:
  groups:
    - name: kube-apiserver-histogram.rules
      params: {}
      rules:
        # Read verbs (LIST|GET).
        - record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
          labels:
            quantile: "0.99"
            verb: read
          annotations: {}
          expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
        # Write verbs (POST|PUT|PATCH|DELETE).
        - record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
          labels:
            quantile: "0.99"
            verb: write
          annotations: {}
          expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
|
||||
73
packages/system/monitoring/alerts/kube-apiserver-slos.yaml
Normal file
73
packages/system/monitoring/alerts/kube-apiserver-slos.yaml
Normal file
@@ -0,0 +1,73 @@
|
||||
# Multi-window error-budget burn alerts for the Kubernetes API server.
# Each rule fires only when BOTH a long and a short burn-rate window exceed
# the same threshold (burn factor * 0.01000); the two windows are exposed as
# the `long` and `short` labels.
#
# The burn rates are aggregated `by (cluster)` and joined `on (cluster)` so the
# resulting alert keeps a label that `exported_instance` can be templated from.
# A bare `sum(...)` drops every label, which made the previous
# '{{ $labels.namespace }}/{{ $labels.apiserver }}' template always render "/".
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-apiserver-slos
spec:
  groups:
    - name: kube-apiserver-slos
      params: {}
      rules:
        # 1h / 5m windows, burn factor 14.40.
        - alert: KubeAPIErrorBudgetBurn
          annotations:
            description: The API server is burning too much error budget.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
            summary: The API server is burning too much error budget.
          expr: |-
            sum by (cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate5m) > (14.40 * 0.01000)
          for: 2m
          labels:
            long: 1h
            severity: critical
            short: 5m
            exported_instance: '{{ $labels.cluster }}/kube-apiserver'
            service: kube-apiserver-slos
        # 6h / 30m windows, burn factor 6.00.
        - alert: KubeAPIErrorBudgetBurn
          annotations:
            description: The API server is burning too much error budget.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
            summary: The API server is burning too much error budget.
          expr: |-
            sum by (cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate30m) > (6.00 * 0.01000)
          for: 15m
          labels:
            long: 6h
            severity: critical
            short: 30m
            exported_instance: '{{ $labels.cluster }}/kube-apiserver'
            service: kube-apiserver-slos
        # 1d / 2h windows, burn factor 3.00.
        - alert: KubeAPIErrorBudgetBurn
          annotations:
            description: The API server is burning too much error budget.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
            summary: The API server is burning too much error budget.
          expr: |-
            sum by (cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate2h) > (3.00 * 0.01000)
          for: 1h
          labels:
            long: 1d
            severity: warning
            short: 2h
            exported_instance: '{{ $labels.cluster }}/kube-apiserver'
            service: kube-apiserver-slos
        # 3d / 6h windows, burn factor 1.00.
        - alert: KubeAPIErrorBudgetBurn
          annotations:
            description: The API server is burning too much error budget.
            runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn
            summary: The API server is burning too much error budget.
          expr: |-
            sum by (cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000)
            and on (cluster)
            sum by (cluster) (apiserver_request:burnrate6h) > (1.00 * 0.01000)
          for: 3h
          labels:
            long: 3d
            severity: warning
            short: 6h
            exported_instance: '{{ $labels.cluster }}/kube-apiserver'
            service: kube-apiserver-slos
|
||||
@@ -0,0 +1,17 @@
|
||||
# Counts of scrape targets by `up` value, aggregated over everything except
# the instance, pod and node labels.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-prometheus-general.rules
spec:
  groups:
    - name: kube-prometheus-general.rules
      params: {}
      rules:
        # Number of series with up == 1.
        - record: count:up1
          labels: {}
          annotations: {}
          expr: count without(instance, pod, node) (up == 1)
        # Number of series with up == 0.
        - record: count:up0
          labels: {}
          annotations: {}
          expr: count without(instance, pod, node) (up == 0)
|
||||
@@ -0,0 +1,37 @@
|
||||
# Per-instance and cluster-wide CPU and network recording rules built from
# node_cpu_seconds_total and node_network_*_bytes_total.
# CPU usage excludes the idle, iowait and steal modes.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-prometheus-node-recording.rules
spec:
  groups:
    - name: kube-prometheus-node-recording.rules
      params: {}
      rules:
        # Busy CPU rate per instance over 3m.
        - record: instance:node_cpu:rate:sum
          labels: {}
          annotations: {}
          expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
        # Network receive / transmit byte rate per instance over 3m.
        - record: instance:node_network_receive_bytes:rate:sum
          labels: {}
          annotations: {}
          expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
        - record: instance:node_network_transmit_bytes:rate:sum
          labels: {}
          annotations: {}
          expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
        # Busy CPU rate over 5m divided by the number of CPUs of the instance.
        - record: instance:node_cpu:ratio
          labels: {}
          annotations: {}
          expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
        # Busy CPU rate over 5m summed across all instances ...
        - record: cluster:node_cpu:sum_rate5m
          labels: {}
          annotations: {}
          expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        # ... and the same divided by the total number of (instance, cpu) pairs.
        - record: cluster:node_cpu:ratio
          labels: {}
          annotations: {}
          expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
63
packages/system/monitoring/alerts/kube-scheduler.rules.yaml
Normal file
63
packages/system/monitoring/alerts/kube-scheduler.rules.yaml
Normal file
@@ -0,0 +1,63 @@
|
||||
# kube-scheduler latency quantiles (0.99, 0.9, 0.5) for three histograms:
# e2e scheduling, scheduling algorithm and binding. Each is computed from the
# 5m bucket rate, aggregated without the instance and pod labels.
#
# NOTE(review): scheduler_e2e_scheduling_duration_seconds may no longer be
# exported by recent Kubernetes releases — confirm the metric still exists in
# the target clusters, otherwise the three e2e rules record nothing.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: alerts-kube-scheduler.rules
spec:
  groups:
    - name: kube-scheduler.rules
      params: {}
      rules:
        # ---- quantile 0.99 ----
        - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
          labels:
            quantile: "0.99"
          annotations: {}
          expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
          labels:
            quantile: "0.99"
          annotations: {}
          expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
          labels:
            quantile: "0.99"
          annotations: {}
          expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        # ---- quantile 0.9 ----
        - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
          labels:
            quantile: "0.9"
          annotations: {}
          expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
          labels:
            quantile: "0.9"
          annotations: {}
          expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
          labels:
            quantile: "0.9"
          annotations: {}
          expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        # ---- quantile 0.5 ----
        - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
          labels:
            quantile: "0.5"
          annotations: {}
          expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
          labels:
            quantile: "0.5"
          annotations: {}
          expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
          labels:
            quantile: "0.5"
          annotations: {}
          expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
73
packages/system/monitoring/alerts/kube-state-metrics.yaml
Normal file
73
packages/system/monitoring/alerts/kube-state-metrics.yaml
Normal file
@@ -0,0 +1,73 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kube-state-metrics
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-state-metrics
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate
|
||||
in list operations. This is likely causing it to not be able to expose metrics
|
||||
about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate
|
||||
in watch operations. This is likely causing it to not be able to expose
|
||||
metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
description: kube-state-metrics pods are running with different --total-shards
|
||||
configuration, some Kubernetes objects may be exposed multiple times or
|
||||
not exposed at all.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
|
||||
summary: kube-state-metrics sharding is misconfigured.
|
||||
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by
|
||||
(cluster) != 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
description: kube-state-metrics shards are missing, some Kubernetes objects
|
||||
are not being exposed.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
|
||||
summary: kube-state-metrics shards are missing.
|
||||
expr: |-
|
||||
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
|
||||
-
|
||||
sum( 2 ^ max by (shard_ordinal,cluster) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/kube-state-metrics'
|
||||
service: kube-state-metrics
|
||||
30
packages/system/monitoring/alerts/kubelet.rules.yaml
Normal file
30
packages/system/monitoring/alerts/kubelet.rules.yaml
Normal file
@@ -0,0 +1,30 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubelet.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: kubelet.rules
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
|
||||
metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
|
||||
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
|
||||
metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
|
||||
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- annotations: {}
|
||||
expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet",
|
||||
metrics_path="/metrics"}[5m])) by (instance,le,cluster) * on (instance,cluster)
|
||||
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
304
packages/system/monitoring/alerts/kubernetes-apps.yaml
Normal file
304
packages/system/monitoring/alerts/kubernetes-apps.yaml
Normal file
@@ -0,0 +1,304 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-apps
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-apps
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by (namespace,pod,cluster) (
|
||||
max by (namespace,pod,cluster) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
|
||||
) * on (namespace,pod,cluster) group_left(owner_kind) topk by (namespace,pod,cluster) (
|
||||
1, max by (namespace,pod,owner_kind,cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} does not match, this indicates that the Deployment has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
||||
summary: Deployment generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDeploymentRolloutStuck
|
||||
annotations:
|
||||
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} is not progressing for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
||||
summary: Deployment rollout is not progressing.
|
||||
expr: |-
|
||||
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.deployment }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
|
||||
has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
||||
summary: StatefulSet has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
}} does not match, this indicates that the StatefulSet has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
||||
summary: StatefulSet generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }}
|
||||
update has not been rolled out.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
expr: |-
|
||||
(
|
||||
max by (namespace,statefulset,cluster) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.statefulset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has
|
||||
not finished or progressed for at least 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on
|
||||
container {{ $labels.container }} has been in waiting state for longer than
|
||||
1 hour.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr: sum by (namespace,pod,container,cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||
namespace=~".*"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container
|
||||
}}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.daemonset }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking
|
||||
more than {{ "43200" | humanizeDuration }} to complete.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
||||
summary: Job did not complete in time
|
||||
expr: |-
|
||||
time() - max by (namespace,job_name,cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
||||
and
|
||||
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.job_name }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to
|
||||
complete. Removing failed job after investigation should clear this alert.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.job_name }}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
||||
summary: HPA has not matched desired number of replicas.
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler
|
||||
}}'
|
||||
service: kubernetes-apps
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler
|
||||
}}'
|
||||
service: kubernetes-apps
|
||||
138
packages/system/monitoring/alerts/kubernetes-resources.yaml
Normal file
138
packages/system/monitoring/alerts/kubernetes-resources.yaml
Normal file
@@ -0,0 +1,138 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-resources
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-resources
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource
|
||||
requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
||||
failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource
|
||||
requests for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.namespace }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.namespace }}'
|
||||
service: kubernetes-resources
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}'
|
||||
service: kubernetes-resources
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{
|
||||
$labels.pod }}.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container,pod,namespace,cluster)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,cluster)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}/{{ $labels.container
|
||||
}}'
|
||||
service: kubernetes-resources
|
||||
130
packages/system/monitoring/alerts/kubernetes-storage.yaml
Normal file
130
packages/system/monitoring/alerts/kubernetes-storage.yaml
Normal file
@@ -0,0 +1,130 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-storage
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-storage
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{
|
||||
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{
|
||||
with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to fill
|
||||
up within four days. Currently {{ $value | humanizePercentage }} is available.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{
|
||||
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{
|
||||
with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to run
|
||||
out of inodes within four days. Currently {{ $value | humanizePercentage
|
||||
}} of its inodes are free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,cluster)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim
|
||||
}}'
|
||||
service: kubernetes-storage
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
description: The persistent volume {{ $labels.persistentvolume }} {{ with
|
||||
$labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase
|
||||
}}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
||||
summary: PersistentVolume is having issues with provisioning.
|
||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.persistentvolume }}'
|
||||
service: kubernetes-storage
|
||||
@@ -0,0 +1,91 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-system-apiserver
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-apiserver
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: A client certificate used to authenticate to kubernetes apiserver
|
||||
is expiring in less than 7.0 days.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"}
|
||||
> 0 and on (job,cluster) histogram_quantile(0.01, sum by (job,le,cluster)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m])))
|
||||
< 604800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: A client certificate used to authenticate to kubernetes apiserver
|
||||
is expiring in less than 24.0 hours.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"}
|
||||
> 0 and on (job,cluster) histogram_quantile(0.01, sum by (job,le,cluster)
|
||||
(rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m])))
|
||||
< 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.namespace }}/{{ $labels.pod }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAggregatedAPIErrors
|
||||
annotations:
|
||||
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
|
||||
}} has reported errors. It has appeared unavailable {{ $value | humanize
|
||||
}} times averaged over the past 10m.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors
|
||||
summary: Kubernetes aggregated API has reported errors.
|
||||
expr: sum by (name,namespace,cluster)(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m]))
|
||||
> 4
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.name }}/{{ $labels.namespace }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAggregatedAPIDown
|
||||
annotations:
|
||||
description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
|
||||
}} has been only {{ $value | humanize }}% available over the last 10m.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown
|
||||
summary: Kubernetes aggregated API is down.
|
||||
expr: (1 - max by (name,namespace,cluster)(avg_over_time(aggregator_unavailable_apiservice{job="kube-apiserver"}[10m])))
|
||||
* 100 < 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.name }}/{{ $labels.namespace }}'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
description: KubeAPI has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapidown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/apiserver'
|
||||
service: kubernetes-system-apiserver
|
||||
- alert: KubeAPITerminatedRequests
|
||||
annotations:
|
||||
description: The kubernetes apiserver has terminated {{ $value | humanizePercentage
|
||||
}} of its incoming requests.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests
|
||||
summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage
|
||||
}} of its incoming requests.
|
||||
expr: sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) /
|
||||
( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m]))
|
||||
) > 0.20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}/apiserver'
|
||||
service: kubernetes-system-apiserver
|
||||
@@ -0,0 +1,21 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-system-controller-manager
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-controller-manager
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeControllerManagerDown
|
||||
annotations:
|
||||
description: KubeControllerManager has disappeared from Prometheus target
|
||||
discovery.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontrollermanagerdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-controller-manager"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/controller-manager'
|
||||
service: kubernetes-system-controller-manager
|
||||
175
packages/system/monitoring/alerts/kubernetes-system-kubelet.yaml
Normal file
175
packages/system/monitoring/alerts/kubernetes-system-kubelet.yaml
Normal file
@@ -0,0 +1,175 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-system-kubelet
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-kubelet
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeNodeNotReady
|
||||
annotations:
|
||||
description: '{{ $labels.node }} has been unready for more than 15 minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodenotready
|
||||
summary: Node is not ready.
|
||||
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"}
|
||||
== 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeNodeUnreachable
|
||||
annotations:
|
||||
description: '{{ $labels.node }} is unreachable and some workloads may be
|
||||
rescheduled.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodeunreachable
|
||||
summary: Node is unreachable.
|
||||
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"}
|
||||
unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"})
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
|
||||
}} of its Pod capacity.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods
|
||||
summary: Kubelet is running at capacity.
|
||||
expr: |-
|
||||
count by (node,cluster) (
|
||||
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster) group_left(node) topk by (instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
|
||||
)
|
||||
/
|
||||
max by (node,cluster) (
|
||||
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
|
||||
) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
description: The readiness status of node {{ $labels.node }} has changed {{
|
||||
$value }} times in the last 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping
|
||||
summary: Node readiness status is flapping.
|
||||
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m]))
|
||||
by (node,cluster) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
|
||||
duration of {{ $value }} seconds on node {{ $labels.node }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletplegdurationhigh
|
||||
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
|
||||
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"}
|
||||
>= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
|
||||
on node {{ $labels.node }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh
|
||||
summary: Kubelet Pod startup latency is too high.
|
||||
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet",
|
||||
metrics_path="/metrics"}[5m])) by (instance,le,cluster)) * on (instance,cluster)
|
||||
group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}
|
||||
> 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
description: Client certificate for Kubelet on node {{ $labels.node }} expires
|
||||
in {{ $value | humanizeDuration }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
description: Client certificate for Kubelet on node {{ $labels.node }} expires
|
||||
in {{ $value | humanizeDuration }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
description: Server certificate for Kubelet on node {{ $labels.node }} expires
|
||||
in {{ $value | humanizeDuration }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
description: Server certificate for Kubelet on node {{ $labels.node }} expires
|
||||
in {{ $value | humanizeDuration }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletClientCertificateRenewalErrors
|
||||
annotations:
|
||||
description: Kubelet on node {{ $labels.node }} has failed to renew its client
|
||||
certificate ({{ $value | humanize }} errors in the last 5 minutes).
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletclientcertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its client certificate.
|
||||
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m])
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletServerCertificateRenewalErrors
|
||||
annotations:
|
||||
description: Kubelet on node {{ $labels.node }} has failed to renew its server
|
||||
certificate ({{ $value | humanize }} errors in the last 5 minutes).
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletservercertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its server certificate.
|
||||
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
description: Kubelet has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.node }}'
|
||||
service: kubernetes-system-kubelet
|
||||
@@ -0,0 +1,20 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-system-scheduler
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-scheduler
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeSchedulerDown
|
||||
annotations:
|
||||
description: KubeScheduler has disappeared from Prometheus target discovery.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeschedulerdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="kube-scheduler"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.cluster }}/scheduler'
|
||||
service: kubernetes-system-scheduler
|
||||
37
packages/system/monitoring/alerts/kubernetes-system.yaml
Normal file
37
packages/system/monitoring/alerts/kubernetes-system.yaml
Normal file
@@ -0,0 +1,37 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-kubernetes-system
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system
|
||||
params: {}
|
||||
rules:
|
||||
- alert: KubeVersionMismatch
|
||||
annotations:
|
||||
description: There are {{ $value }} different semantic versions of Kubernetes
|
||||
components running.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeversionmismatch
|
||||
summary: Different semantic versions of Kubernetes components running.
|
||||
expr: count by (cluster) (count by (git_version,cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*")))
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.cluster }}'
|
||||
service: kubernetes-system
|
||||
- alert: KubeClientErrors
|
||||
annotations:
|
||||
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
|
||||
}}' is experiencing {{ $value | humanizePercentage }} errors.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclienterrors
|
||||
summary: Kubernetes API server client is experiencing errors.
|
||||
expr: |-
|
||||
(sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (instance,job,namespace,cluster)
|
||||
/
|
||||
sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (instance,job,namespace,cluster))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
service: kubernetes-system
|
||||
93
packages/system/monitoring/alerts/node-exporter.rules.yaml
Normal file
93
packages/system/monitoring/alerts/node-exporter.rules.yaml
Normal file
@@ -0,0 +1,93 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-node-exporter.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter.rules
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
count without (cpu, mode) (
|
||||
node_cpu_seconds_total{job="node-exporter",mode="idle"}
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_num_cpu:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
1 - avg without (cpu) (
|
||||
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_cpu_utilisation:rate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
(
|
||||
node_load1{job="node-exporter"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node-exporter"}
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
1 - (
|
||||
(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"}
|
||||
or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Cached_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_MemFree_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
)
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
- annotations: {}
|
||||
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||
labels: {}
|
||||
record: instance:node_vmstat_pgmajfault:rate5m
|
||||
- annotations: {}
|
||||
expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
labels: {}
|
||||
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||
- annotations: {}
|
||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
labels: {}
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
labels: {}
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||
396
packages/system/monitoring/alerts/node-exporter.yaml
Normal file
396
packages/system/monitoring/alerts/node-exporter.yaml
Normal file
@@ -0,0 +1,396 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-node-exporter
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter
|
||||
params: {}
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack entries is getting close to the limit.
|
||||
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||
> 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector on {{ $labels.instance }} failed
|
||||
to scrape.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
||||
Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |-
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description: Clock at {{ $labels.instance }} is not synchronising. Ensure
|
||||
NTP is configured on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |-
|
||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
||||
in degraded state due to one or more disk failures. Number of spare drives
|
||||
is insufficient to fix issue automatically.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||
summary: RAID Array is degraded.
|
||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description: At least one device in RAID array at {{ $labels.instance }} failed.
|
||||
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array.
|
||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
> 0
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently
|
||||
at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently
|
||||
at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeCPUHighUsage
|
||||
annotations:
|
||||
description: |
|
||||
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
||||
summary: High CPU usage.
|
||||
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||
mode!="idle"}[2m]))) * 100 > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeSystemSaturation
|
||||
annotations:
|
||||
description: |
|
||||
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
This might indicate resource saturation on this instance and can cause it to become unresponsive.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
|
||||
summary: System saturated, load per core is very high.
|
||||
expr: |-
|
||||
node_load1{job="node-exporter"}
|
||||
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeMemoryMajorPagesFaults
|
||||
annotations:
|
||||
description: |
|
||||
Memory major page faults are occurring at a very high rate at {{ $labels.instance }}: above 500 major page faults per second for the last 15 minutes, currently at {{ printf "%.2f" $value }}.
|
||||
Please check that there is enough memory available at this instance.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
|
||||
summary: Memory major page faults are occurring at very high rate.
|
||||
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeMemoryHighUtilization
|
||||
annotations:
|
||||
description: |
|
||||
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
||||
summary: Host is running out of memory.
|
||||
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
* 100) > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}'
|
||||
service: node-exporter
|
||||
- alert: NodeDiskIOSaturation
|
||||
annotations:
|
||||
description: |
|
||||
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
This symptom might indicate disk saturation.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
||||
summary: Disk IO queue is high.
|
||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
> 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-exporter
|
||||
- alert: NodeSystemdServiceFailed
|
||||
annotations:
|
||||
description: Systemd service {{ $labels.name }} has entered failed state at
|
||||
{{ $labels.instance }}
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
||||
summary: Systemd service has entered failed state.
|
||||
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.name }}'
|
||||
service: node-exporter
|
||||
- alert: NodeBondingDegraded
|
||||
annotations:
|
||||
description: Bonding interface {{ $labels.master }} on {{ $labels.instance
|
||||
}} is in degraded state due to one or more slave failures.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
||||
summary: Bonding interface is degraded
|
||||
expr: (node_bonding_slaves - node_bonding_active) != 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.master }}'
|
||||
service: node-exporter
|
||||
21
packages/system/monitoring/alerts/node-network.yaml
Normal file
21
packages/system/monitoring/alerts/node-network.yaml
Normal file
@@ -0,0 +1,21 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-node-network
|
||||
spec:
|
||||
groups:
|
||||
- name: node-network
|
||||
params: {}
|
||||
rules:
|
||||
- alert: NodeNetworkInterfaceFlapping
|
||||
annotations:
|
||||
description: Network interface "{{ $labels.device }}" changing its up status
|
||||
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
|
||||
summary: Network interface is often changing its status
|
||||
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
exported_instance: '{{ $labels.instance }}/{{ $labels.device }}'
|
||||
service: node-network
|
||||
55
packages/system/monitoring/alerts/node.rules.yaml
Normal file
55
packages/system/monitoring/alerts/node.rules.yaml
Normal file
@@ -0,0 +1,55 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: alerts-node.rules
|
||||
spec:
|
||||
groups:
|
||||
- name: node.rules
|
||||
params: {}
|
||||
rules:
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
topk by (namespace,pod,cluster) (1,
|
||||
max by (node,namespace,pod,cluster) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
labels: {}
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
count by (node,cluster) (
|
||||
node_cpu_seconds_total{mode="idle",job="node-exporter"}
|
||||
* on (namespace,pod,cluster) group_left(node)
|
||||
topk by (namespace,pod,cluster) (1, node_namespace_pod:kube_pod_info:)
|
||||
)
|
||||
labels: {}
|
||||
record: node:node_num_cpu:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
sum(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"} or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"} +
|
||||
node_memory_Cached_bytes{job="node-exporter"} +
|
||||
node_memory_MemFree_bytes{job="node-exporter"} +
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
) by (cluster)
|
||||
labels: {}
|
||||
record: :node_memory_MemAvailable_bytes:sum
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
avg by (node,cluster) (
|
||||
sum without (mode) (
|
||||
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
|
||||
)
|
||||
)
|
||||
labels: {}
|
||||
record: node:node_cpu_utilization:ratio_rate5m
|
||||
- annotations: {}
|
||||
expr: |-
|
||||
avg by (cluster) (
|
||||
node:node_cpu_utilization:ratio_rate5m
|
||||
)
|
||||
labels: {}
|
||||
record: cluster:node_cpu:ratio_rate5m
|
||||
@@ -1,26 +0,0 @@
|
||||
# Patterns to ignore when building packages.
|
||||
# This supports shell glob matching, relative path matching, and
|
||||
# negation (prefixed with !). Only one pattern per line.
|
||||
.DS_Store
|
||||
# Common VCS dirs
|
||||
.git/
|
||||
.gitignore
|
||||
.bzr/
|
||||
.bzrignore
|
||||
.hg/
|
||||
.hgignore
|
||||
.svn/
|
||||
# Common backup files
|
||||
*.swp
|
||||
*.bak
|
||||
*.tmp
|
||||
*.orig
|
||||
*~
|
||||
# Various IDEs
|
||||
.project
|
||||
.idea/
|
||||
*.tmproj
|
||||
.vscode/
|
||||
|
||||
# Ignore img folder used for documentation
|
||||
img/
|
||||
@@ -1,688 +0,0 @@
|
||||
## Next release
|
||||
|
||||
- TODO
|
||||
|
||||
## 0.25.17
|
||||
|
||||
**Release date:** 2024-09-20
|
||||
|
||||

|
||||

|
||||
|
||||
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
|
||||
- Fixed ETCD dashboard
|
||||
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
|
||||
- Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.
|
||||
|
||||
## 0.25.16
|
||||
|
||||
**Release date:** 2024-09-10
|
||||
|
||||

|
||||

|
||||
|
||||
- Do not truncate servicemonitor, datasources, rules, dashboard, alertmanager & vmalert templates names
|
||||
- Use service label for node-exporter instead of podLabel. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1458)
|
||||
- Added common chart to a k8s-stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1456)
|
||||
- Fixed value of custom alertmanager configSecret. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1461)
|
||||
|
||||
## 0.25.15
|
||||
|
||||
**Release date:** 2024-09-05
|
||||
|
||||

|
||||

|
||||
|
||||
- Drop empty endpoints param from scrape configuration
|
||||
- Fixed proto when TLS is enabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1449)
|
||||
|
||||
## 0.25.14
|
||||
|
||||
**Release date:** 2024-09-04
|
||||
|
||||

|
||||

|
||||
|
||||
- fixed alertmanager templates
|
||||
|
||||
## 0.25.13
|
||||
|
||||
**Release date:** 2024-09-04
|
||||
|
||||

|
||||

|
||||
|
||||
- Use operator's own service monitor
|
||||
|
||||
## 0.25.12
|
||||
|
||||
**Release date:** 2024-09-03
|
||||
|
||||

|
||||

|
||||
|
||||
- Fixed dashboards rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1414)
|
||||
- Fixed service monitor label name.
|
||||
|
||||
## 0.25.11
|
||||
|
||||
**Release date:** 2024-09-03
|
||||
|
||||

|
||||

|
||||
|
||||
- Merged ingress templates
|
||||
- Removed custom VMServiceScrape for operator
|
||||
- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
|
||||
- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
|
||||
- Migrated Node Exporter dashboard into chart
|
||||
- Deprecated `grafana.sidecar.jsonData`, `grafana.provisionDefaultDatasource` in favour of `grafana.sidecar.datasources.default` slice of datasources.
|
||||
- Fail if no notifiers are set, do not set `notifiers` to null if empty
|
||||
|
||||
## 0.25.10
|
||||
|
||||
**Release date:** 2024-08-31
|
||||
|
||||

|
||||

|
||||
|
||||
- fixed ingress extraPaths and externalVM urls rendering
|
||||
|
||||
## 0.25.9
|
||||
|
||||
**Release date:** 2024-08-31
|
||||
|
||||

|
||||

|
||||
|
||||
- fixed vmalert ingress name typo
|
||||
- Added ability to override default Prometheus-compatible datasources with all available parameters. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/860).
|
||||
- Do not use `grafana.dashboards` and `grafana.dashboardProviders`. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1312).
|
||||
|
||||
## 0.25.8
|
||||
|
||||
**Release date:** 2024-08-30
|
||||
|
||||

|
||||

|
||||
|
||||
- fixed external notifiers rendering, when alertmanager is disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1378)
|
||||
|
||||
## 0.25.7
|
||||
|
||||
**Release date:** 2024-08-30
|
||||
|
||||

|
||||

|
||||
|
||||
- fixed extra rules template context
|
||||
|
||||
## 0.25.6
|
||||
|
||||
**Release date:** 2024-08-29
|
||||
|
||||

|
||||

|
||||
|
||||
**Update note**: Update `kubeProxy.spec` to `kubeProxy.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kubeScheduler.spec` to `kubeScheduler.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kubeEtcd.spec` to `kubeEtcd.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `coreDns.spec` to `coreDns.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kubeDns.spec` to `kubeDns.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kubeProxy.spec` to `kubeProxy.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kubeControllerManager.spec` to `kubeControllerManager.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kubeApiServer.spec` to `kubeApiServer.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kubelet.spec` to `kubelet.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `kube-state-metrics.spec` to `kube-state-metrics.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `prometheus-node-exporter.spec` to `prometheus-node-exporter.vmScrape.spec`
|
||||
|
||||
**Update note**: Update `grafana.spec` to `grafana.vmScrape.spec`
|
||||
|
||||
- bump version of VM components to [v1.103.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.103.0)
|
||||
- Added `dashboards.<dashboardName>` bool flag to enable a dashboard even if the component it is for is not installed.
|
||||
- Allow extra `vmalert.notifiers` without dropping default notifier if `alertmanager.enabled: true`
|
||||
- Do not drop default notifier, when vmalert.additionalNotifierConfigs is set
|
||||
- Replaced static url proto with a template, which selects proto depending on a present tls configuration
|
||||
- Moved kubernetes components monitoring config from `spec` config to `vmScrape.spec`
|
||||
- Merged servicemonitor templates
|
||||
|
||||
## 0.25.5
|
||||
|
||||
**Release date:** 2024-08-26
|
||||
|
||||

|
||||

|
||||
|
||||
- TODO
|
||||
|
||||
## 0.25.4
|
||||
|
||||
**Release date:** 2024-08-26
|
||||
|
||||

|
||||

|
||||
|
||||
- updates operator to [v0.47.2](https://github.com/VictoriaMetrics/operator/releases/tag/v0.47.2)
|
||||
- kube-state-metrics - 5.16.4 -> 5.25.1
|
||||
- prometheus-node-exporter - 4.27.0 -> 4.29.0
|
||||
- grafana - 8.3.8 -> 8.4.7
|
||||
- added configurable `.Values.global.clusterLabel` to all alerting and recording rules `by` and `on` expressions
|
||||
|
||||
## 0.25.3
|
||||
|
||||
**Release date:** 2024-08-23
|
||||
|
||||

|
||||

|
||||
|
||||
- updated operator to v0.47.1 release
|
||||
- Build `app.kubernetes.io/instance` label consistently. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1282)
|
||||
|
||||
## 0.25.2
|
||||
|
||||
**Release date:** 2024-08-21
|
||||
|
||||

|
||||

|
||||
|
||||
- fixed vmalert ingress name. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1271)
|
||||
- fixed alertmanager ingress host template rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1270)
|
||||
|
||||
## 0.25.1
|
||||
|
||||
**Release date:** 2024-08-21
|
||||
|
||||

|
||||

|
||||
|
||||
- Added `.Values.global.license` configuration
|
||||
- Fixed extraLabels rendering. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1248)
|
||||
- Fixed vmalert url to alertmanager by including its path prefix
|
||||
- Removed `networking.k8s.io/v1beta1/Ingress` and `extensions/v1beta1/Ingress` support
|
||||
- Fixed kubedns servicemonitor template. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1255)
|
||||
|
||||
## 0.25.0
|
||||
|
||||
**Release date:** 2024-08-16
|
||||
|
||||

|
||||

|
||||
|
||||
**Update note**: the CRD dependency must be updated manually before upgrading
|
||||
|
||||
**Update note**: requires Helm 3.14+
|
||||
|
||||
- Moved dashboards templating logic out of sync script to Helm template
|
||||
- Allow to disable default grafana datasource
|
||||
- Synchronize Etcd dashboards and rules with mixin provided by Etcd
|
||||
- Add alerting rules for VictoriaMetrics operator.
|
||||
- Updated alerting rules for VictoriaMetrics components.
|
||||
- Fixed exact rule annotations propagation to other rules.
|
||||
- Set minimal kubernetes version to 1.25
|
||||
- updates operator to v0.47.0 version
|
||||
|
||||
## 0.24.5
|
||||
|
||||
**Release date:** 2024-08-01
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.102.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.1)
|
||||
|
||||
## 0.24.4
|
||||
|
||||
**Release date:** 2024-08-01
|
||||
|
||||

|
||||

|
||||
|
||||
- Update dependencies: grafana -> 8.3.6.
|
||||
- Added `.Values.defaultRules.alerting` and `.Values.defaultRules.recording` to setup common properties for all alerting and recording rules
|
||||
|
||||
## 0.24.3
|
||||
|
||||
**Release date:** 2024-07-23
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.102.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.102.0)
|
||||
|
||||
## 0.24.2
|
||||
|
||||
**Release date:** 2024-07-15
|
||||
|
||||

|
||||

|
||||
|
||||
- fix vmalertmanager configuration when using `.VMAlertmanagerSpec.ConfigRawYaml`. See [this pull request](https://github.com/VictoriaMetrics/helm-charts/pull/1136).
|
||||
|
||||
## 0.24.1
|
||||
|
||||
**Release date:** 2024-07-10
|
||||
|
||||

|
||||

|
||||
|
||||
- updates operator to v0.46.4
|
||||
|
||||
## 0.24.0
|
||||
|
||||
**Release date:** 2024-07-10
|
||||
|
||||

|
||||

|
||||
|
||||
- added ability to override alerting rules labels and annotations:
|
||||
- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
|
||||
- for all rules in a group - `.Values.defaultRules.groups.<groupName>.rules.spec.labels` and `.Values.defaultRules.groups.<groupName>.rules.spec.annotations`
|
||||
- for each rule individually - `.Values.defaultRules.rules.<ruleName>.spec.labels` and `.Values.defaultRules.rules.<ruleName>.spec.annotations`
|
||||
- changed `.Values.defaultRules.rules.<groupName>` to `.Values.defaultRules.groups.<groupName>.create`
|
||||
- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups.<groupName>.targetNamespace`
|
||||
- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups.<groupName>.spec.params`
|
||||
|
||||
## 0.23.6
|
||||
|
||||
**Release date:** 2024-07-08
|
||||
|
||||

|
||||

|
||||
|
||||
- added ability to override alerting rules labels and annotations:
|
||||
- globally - `.Values.defaultRules.rule.spec.labels` (before it was `.Values.defaultRules.additionalRuleLabels`) and `.Values.defaultRules.rule.spec.annotations`
|
||||
- for all rules in a group - `.Values.defaultRules.groups.<groupName>.rules.spec.labels` and `.Values.defaultRules.groups.<groupName>.rules.spec.annotations`
|
||||
- for each rule individually - `.Values.defaultRules.rules.<ruleName>.spec.labels` and `.Values.defaultRules.rules.<ruleName>.spec.annotations`
|
||||
- changed `.Values.defaultRules.rules.<groupName>` to `.Values.defaultRules.groups.<groupName>.create`
|
||||
- changed `.Values.defaultRules.appNamespacesTarget` to `.Values.defaultRules.groups.<groupName>.targetNamespace`
|
||||
- changed `.Values.defaultRules.params` to `.Values.defaultRules.group.spec.params` with ability to override it at `.Values.defaultRules.groups.<groupName>.spec.params`
|
||||
|
||||
## 0.23.5
|
||||
|
||||
**Release date:** 2024-07-04
|
||||
|
||||

|
||||

|
||||
|
||||
- Support configuring vmalert `-notifier.config` with `.Values.vmalert.additionalNotifierConfigs`.
|
||||
|
||||
## 0.23.4
|
||||
|
||||
**Release date:** 2024-07-02
|
||||
|
||||

|
||||

|
||||
|
||||
- Add `extraObjects` to allow deploying additional resources with the chart release.
|
||||
|
||||
## 0.23.3
|
||||
|
||||
**Release date:** 2024-06-26
|
||||
|
||||

|
||||

|
||||
|
||||
- Enable [conversion of Prometheus CRDs](https://docs.victoriametrics.com/operator/migration/#objects-conversion) by default. See [this](https://github.com/VictoriaMetrics/helm-charts/pull/1069) pull request for details.
|
||||
- use bitnami/kubectl image for cleanup instead of deprecated gcr.io/google_containers/hyperkube
|
||||
|
||||
## 0.23.2
|
||||
|
||||
**Release date:** 2024-06-14
|
||||
|
||||

|
||||

|
||||
|
||||
- Do not add `cluster` external label at VMAgent by default. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/774) for the details.
|
||||
|
||||
## 0.23.1
|
||||
|
||||
**Release date:** 2024-06-10
|
||||
|
||||

|
||||

|
||||
|
||||
- updates operator to v0.45.0 release
|
||||
- sync latest vm alerts and dashboards.
|
||||
|
||||
## 0.23.0
|
||||
|
||||
**Release date:** 2024-05-30
|
||||
|
||||

|
||||

|
||||
|
||||
- sync latest etcd v3.5.x rules from [upstream](https://github.com/etcd-io/etcd/blob/release-3.5/contrib/mixin/mixin.libsonnet).
|
||||
- add Prometheus operator CRDs as an optional dependency. See [this PR](https://github.com/VictoriaMetrics/helm-charts/pull/1022) and [related issue](https://github.com/VictoriaMetrics/helm-charts/issues/341) for the details.
|
||||
|
||||
## 0.22.1
|
||||
|
||||
**Release date:** 2024-05-14
|
||||
|
||||

|
||||

|
||||
|
||||
- fix missing serviceaccounts patch permission in VM operator, see [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/1012) for details.
|
||||
|
||||
## 0.22.0
|
||||
|
||||
**Release date:** 2024-05-10
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM operator to [0.44.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.44.0)
|
||||
|
||||
## 0.21.3
|
||||
|
||||
**Release date:** 2024-04-26
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.101.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.101.0)
|
||||
|
||||
## 0.21.2
|
||||
|
||||
**Release date:** 2024-04-23
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM operator to [0.43.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.3)
|
||||
|
||||
## 0.21.1
|
||||
|
||||
**Release date:** 2024-04-18
|
||||
|
||||

|
||||

|
||||
|
||||
## 0.21.0
|
||||
|
||||
**Release date:** 2024-04-18
|
||||
|
||||

|
||||

|
||||
|
||||
- TODO
|
||||
|
||||
- bump version of VM operator to [0.43.0](https://github.com/VictoriaMetrics/operator/releases/tag/v0.43.0)
|
||||
- updates CRDs definitions.
|
||||
|
||||
## 0.20.1
|
||||
|
||||
**Release date:** 2024-04-16
|
||||
|
||||

|
||||

|
||||
|
||||
- upgraded dashboards and alerting rules, added values file for local (Minikube) setup
|
||||
- bump version of VM components to [v1.100.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.100.1)
|
||||
|
||||
## 0.20.0
|
||||
|
||||
**Release date:** 2024-04-02
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM operator to [0.42.3](https://github.com/VictoriaMetrics/operator/releases/tag/v0.42.3)
|
||||
|
||||
## 0.19.4
|
||||
|
||||
**Release date:** 2024-03-05
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.99.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.99.0)
|
||||
|
||||
## 0.19.3
|
||||
|
||||
**Release date:** 2024-03-05
|
||||
|
||||

|
||||

|
||||
|
||||
- Commented default configuration for alertmanager. It simplifies configuration and makes it more explicit. See this [issue](https://github.com/VictoriaMetrics/helm-charts/issues/473) for details.
|
||||
- Allow enabling/disabling default k8s rules when installing. See [#904](https://github.com/VictoriaMetrics/helm-charts/pull/904) by @passie.
|
||||
|
||||
## 0.19.2
|
||||
|
||||
**Release date:** 2024-02-26
|
||||
|
||||

|
||||

|
||||
|
||||
- Fix templating of VMAgent `remoteWrite` in case both `VMSingle` and `VMCluster` are disabled. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/865) for details.
|
||||
|
||||
## 0.19.1
|
||||
|
||||
**Release date:** 2024-02-21
|
||||
|
||||

|
||||

|
||||
|
||||
- Update dependencies: victoria-metrics-operator -> 0.28.1, grafana -> 7.3.1.
|
||||
- Update victoriametrics CRD resources yaml.
|
||||
|
||||
## 0.19.0
|
||||
|
||||
**Release date:** 2024-02-09
|
||||
|
||||

|
||||

|
||||
|
||||
- Do not store original labels in `vmagent`'s memory by default. This reduces memory usage of `vmagent` but makes `vmagent`'s debugging UI less informative. See [this docs](https://docs.victoriametrics.com/vmagent/#relabel-debug) for details on relabeling debug.
|
||||
- Update dependencies: kube-state-metrics -> 5.16.0, prometheus-node-exporter -> 4.27.0, grafana -> 7.3.0.
|
||||
- Update victoriametrics CRD resources yaml.
|
||||
- Update builtin dashboards and rules.
|
||||
|
||||
## 0.18.12
|
||||
|
||||
**Release date:** 2024-02-01
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.97.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.97.1)
|
||||
- Fix helm lint when ingress resources enabled - split templates of resources per kind. See [#820](https://github.com/VictoriaMetrics/helm-charts/pull/820) by @MemberIT.
|
||||
|
||||
## 0.18.11
|
||||
|
||||
**Release date:** 2023-12-15
|
||||
|
||||

|
||||

|
||||
|
||||
- Fix missing `.Values.defaultRules.rules.vmcluster` value. See [#801](https://github.com/VictoriaMetrics/helm-charts/pull/801) by @MemberIT.
|
||||
|
||||
## 0.18.10
|
||||
|
||||
**Release date:** 2023-12-12
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.96.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.96.0)
|
||||
- Add optional allowCrossNamespaceImport to GrafanaDashboard(s) (#788)
|
||||
|
||||
## 0.18.9
|
||||
|
||||
**Release date:** 2023-12-08
|
||||
|
||||

|
||||

|
||||
|
||||
- Properly use variable from values file for Grafana datasource type. (#769)
|
||||
- Update dashboards from upstream sources. (#780)
|
||||
|
||||
## 0.18.8
|
||||
|
||||
**Release date:** 2023-11-16
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.95.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.1)
|
||||
|
||||
## 0.18.7
|
||||
|
||||
**Release date:** 2023-11-15
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.95.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.95.0)
|
||||
- Support adding extra group parameters for default vmrules. (#752)
|
||||
|
||||
## 0.18.6
|
||||
|
||||
**Release date:** 2023-11-01
|
||||
|
||||

|
||||

|
||||
|
||||
- Fix kube scheduler default scraping port from 10251 to 10259, Kubernetes changed it since 1.23.0. See [this pr](https://github.com/VictoriaMetrics/helm-charts/pull/736) for details.
|
||||
- Bump version of operator chart to [0.27.4](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.4)
|
||||
|
||||
## 0.18.5
|
||||
|
||||
**Release date:** 2023-10-08
|
||||
|
||||

|
||||

|
||||
|
||||
- Update operator chart to [v0.27.3](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.3) for fixing [#708](https://github.com/VictoriaMetrics/helm-charts/issues/708)
|
||||
|
||||
## 0.18.4
|
||||
|
||||
**Release date:** 2023-10-04
|
||||
|
||||

|
||||

|
||||
|
||||
- Update dependencies: [victoria-metrics-operator -> 0.27.2](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.2), prometheus-node-exporter -> 4.23.2, grafana -> 6.59.5.
|
||||
|
||||
## 0.18.3
|
||||
|
||||
**Release date:** 2023-10-04
|
||||
|
||||

|
||||

|
||||
|
||||
- bump version of VM components to [v1.94.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.94.0)
|
||||
|
||||
## 0.18.2
|
||||
|
||||
**Release date:** 2023-09-28
|
||||
|
||||

|
||||

|
||||
|
||||
- Fix behavior of `vmalert.remoteWriteVMAgent` - remoteWrite.url for VMAlert is correctly generated considering endpoint, name, port and http.pathPrefix of VMAgent
|
||||
|
||||
## 0.18.1
|
||||
|
||||
**Release date:** 2023-09-21
|
||||
|
||||

|
||||

|
||||
|
||||
- Bump version of VM components to [v1.93.5](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.5)
|
||||
|
||||
## 0.18.0
|
||||
|
||||
**Release date:** 2023-09-12
|
||||
|
||||

|
||||

|
||||
|
||||
- Bump version of `grafana` helm-chart to `6.59.*`
|
||||
- Bump version of `prometheus-node-exporter` helm-chart to `4.23.*`
|
||||
- Bump version of `kube-state-metrics` helm-chart to `0.59.*`
|
||||
- Update alerting rules
|
||||
- Update grafana dashboards
|
||||
- Add `make` commands `sync-rules` and `sync-dashboards`
|
||||
- Add support of VictoriaMetrics datasource
|
||||
|
||||
## 0.17.8
|
||||
|
||||
**Release date:** 2023-09-11
|
||||
|
||||

|
||||

|
||||
|
||||
- Bump version of VM components to [v1.93.4](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.4)
|
||||
- Bump version of operator chart to [0.27.0](https://github.com/VictoriaMetrics/helm-charts/releases/tag/victoria-metrics-operator-0.27.0)
|
||||
|
||||
## 0.17.7
|
||||
|
||||
**Release date:** 2023-09-07
|
||||
|
||||

|
||||

|
||||
|
||||
- Bump version of operator helm-chart to `0.26.2`
|
||||
|
||||
## 0.17.6
|
||||
|
||||
**Release date:** 2023-09-04
|
||||
|
||||

|
||||

|
||||
|
||||
- Move `cleanupCRD` option to victoria-metrics-operator chart (#593)
|
||||
- Disable `honorTimestamps` for cadvisor scrape job by default (#617)
|
||||
- For vmalert all replicas of alertmanager are added to notifiers (only if alertmanager is enabled) (#619)
|
||||
- Add `grafanaOperatorDashboardsFormat` option (#615)
|
||||
- Fix query expression for memory calculation in `k8s-views-global` dashboard (#636)
|
||||
- Bump version of Victoria Metrics components to `v1.93.3`
|
||||
- Bump version of operator helm-chart to `0.26.0`
|
||||
|
||||
## 0.17.5
|
||||
|
||||
**Release date:** 2023-08-23
|
||||
|
||||

|
||||

|
||||
|
||||
- Update VictoriaMetrics components from v1.93.0 to v1.93.1
|
||||
|
||||
## 0.17.4
|
||||
|
||||
**Release date:** 2023-08-12
|
||||
|
||||

|
||||

|
||||
|
||||
- Update VictoriaMetrics components from v1.92.1 to v1.93.0
|
||||
- delete an obsolete parameter remaining by mistake (see <https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-k8s-stack#upgrade-to-0130>) (#602)
|
||||
|
||||
## 0.17.3
|
||||
|
||||
**Release date:** 2023-07-28
|
||||
|
||||

|
||||

|
||||
|
||||
- Update VictoriaMetrics components from v1.92.0 to v1.92.1 (#599)
|
||||
|
||||
## 0.17.2
|
||||
|
||||
**Release date:** 2023-07-27
|
||||
|
||||

|
||||

|
||||
|
||||
- Update VictoriaMetrics components from v1.91.3 to v1.92.0
|
||||
@@ -1,24 +0,0 @@
|
||||
dependencies:
|
||||
- name: victoria-metrics-common
|
||||
repository: https://victoriametrics.github.io/helm-charts
|
||||
version: 0.0.11
|
||||
- name: victoria-metrics-operator
|
||||
repository: https://victoriametrics.github.io/helm-charts
|
||||
version: 0.34.8
|
||||
- name: kube-state-metrics
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 5.25.1
|
||||
- name: prometheus-node-exporter
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 4.39.0
|
||||
- name: grafana
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 8.4.9
|
||||
- name: crds
|
||||
repository: ""
|
||||
version: 0.0.0
|
||||
- name: prometheus-operator-crds
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 11.0.0
|
||||
digest: sha256:11b119ebabf4ff0ea2951e7c72f51d0223dc3f50fb061a43b01fe7856491b836
|
||||
generated: "2024-09-12T11:50:51.935071545Z"
|
||||
@@ -1,66 +0,0 @@
|
||||
annotations:
|
||||
artifacthub.io/category: monitoring-logging
|
||||
artifacthub.io/changes: |
|
||||
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
|
||||
- Fixed ETCD dashboard
|
||||
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
|
||||
- 'Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.'
|
||||
artifacthub.io/license: Apache-2.0
|
||||
artifacthub.io/links: |
|
||||
- name: Sources
|
||||
url: https://docs.victoriametrics.com/vmgateway
|
||||
- name: Charts repo
|
||||
url: https://victoriametrics.github.io/helm-charts/
|
||||
- name: Docs
|
||||
url: https://docs.victoriametrics.com
|
||||
artifacthub.io/operator: "true"
|
||||
apiVersion: v2
|
||||
appVersion: v1.102.1
|
||||
dependencies:
|
||||
- name: victoria-metrics-common
|
||||
repository: https://victoriametrics.github.io/helm-charts
|
||||
version: 0.0.*
|
||||
- condition: victoria-metrics-operator.enabled
|
||||
name: victoria-metrics-operator
|
||||
repository: https://victoriametrics.github.io/helm-charts
|
||||
version: 0.34.*
|
||||
- condition: kube-state-metrics.enabled
|
||||
name: kube-state-metrics
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 5.25.*
|
||||
- condition: prometheus-node-exporter.enabled
|
||||
name: prometheus-node-exporter
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 4.39.*
|
||||
- condition: grafana.enabled
|
||||
name: grafana
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 8.4.*
|
||||
- condition: crds.enabled
|
||||
name: crds
|
||||
repository: ""
|
||||
version: 0.0.0
|
||||
- condition: prometheus-operator-crds.enabled
|
||||
name: prometheus-operator-crds
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 11.0.*
|
||||
description: Kubernetes monitoring on VictoriaMetrics stack. Includes VictoriaMetrics
|
||||
Operator, Grafana dashboards, ServiceScrapes and VMRules
|
||||
home: https://github.com/VictoriaMetrics/helm-charts
|
||||
icon: https://avatars.githubusercontent.com/u/43720803?s=200&v=4
|
||||
keywords:
|
||||
- victoriametrics
|
||||
- operator
|
||||
- monitoring
|
||||
- kubernetes
|
||||
- observability
|
||||
- tsdb
|
||||
- metrics
|
||||
- metricsql
|
||||
- timeseries
|
||||
kubeVersion: '>=1.25.0-0'
|
||||
name: victoria-metrics-k8s-stack
|
||||
sources:
|
||||
- https://github.com/VictoriaMetrics/helm-charts
|
||||
type: application
|
||||
version: 0.25.17
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,300 +0,0 @@
|
||||
{{ template "chart.typeBadge" . }} {{ template "chart.versionBadge" . }}
|
||||
[](https://artifacthub.io/packages/helm/victoriametrics/victoria-metrics-k8s-stack)
|
||||
|
||||
{{ template "chart.description" . }}
|
||||
|
||||
* [Overview](#Overview)
|
||||
* [Configuration](#Configuration)
|
||||
* [Prerequisites](#Prerequisites)
|
||||
* [Dependencies](#Dependencies)
|
||||
* [Quick Start](#How-to-install)
|
||||
* [Uninstall](#How-to-uninstall)
|
||||
* [Version Upgrade](#Upgrade-guide)
|
||||
* [Troubleshooting](#Troubleshooting)
|
||||
* [Values](#Parameters)
|
||||
|
||||
|
||||
## Overview
|
||||
This chart is an all-in-one solution to start monitoring a Kubernetes cluster.
|
||||
It installs multiple dependency charts like [grafana](https://github.com/grafana/helm-charts/tree/main/charts/grafana), [node-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-node-exporter), [kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) and [victoria-metrics-operator](https://github.com/VictoriaMetrics/helm-charts/tree/master/charts/victoria-metrics-operator).
|
||||
Also it installs Custom Resources like [VMSingle](https://docs.victoriametrics.com/operator/quick-start#vmsingle), [VMCluster](https://docs.victoriametrics.com/operator/quick-start#vmcluster), [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent), [VMAlert](https://docs.victoriametrics.com/operator/quick-start#vmalert).
|
||||
|
||||
By default, the operator [converts all existing prometheus-operator API objects](https://docs.victoriametrics.com/operator/quick-start#migration-from-prometheus-operator-objects) into corresponding VictoriaMetrics Operator objects.
|
||||
|
||||
To enable metrics collection for kubernetes this chart installs multiple scrape configurations for kubernetes components like kubelet and kube-proxy, etc. Metrics collection is done by [VMAgent](https://docs.victoriametrics.com/operator/quick-start#vmagent). So if you want to ship metrics to an external VictoriaMetrics database you can disable VMSingle installation by setting `vmsingle.enabled` to `false` and setting `vmagent.vmagentSpec.remoteWrite.url` to your external VictoriaMetrics database.
|
||||
|
||||
This chart also installs a bunch of dashboards and recording rules from the [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus) project.
|
||||
|
||||

|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
Configuration of this chart is done through helm values.
|
||||
|
||||
### Dependencies
|
||||
|
||||
Dependencies can be enabled or disabled by setting `enabled` to `true` or `false` in `values.yaml` file.
|
||||
|
||||
**!Important:** for dependency charts anything that you can find in values.yaml of dependency chart can be configured in this chart under key for that dependency. For example if you want to configure `grafana` you can find all possible configuration options in [values.yaml](https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml) and you should set them in values for this chart under grafana: key. For example if you want to configure `grafana.persistence.enabled` you should set it in values.yaml like this:
|
||||
```yaml
|
||||
#################################################
|
||||
### dependencies #####
|
||||
#################################################
|
||||
# Grafana dependency chart configuration. For possible values refer to https://github.com/grafana/helm-charts/tree/main/charts/grafana#configuration
|
||||
grafana:
|
||||
enabled: true
|
||||
persistence:
|
||||
type: pvc
|
||||
enabled: false
|
||||
```
|
||||
|
||||
### VictoriaMetrics components
|
||||
|
||||
This chart installs multiple VictoriaMetrics components using Custom Resources that are managed by [victoria-metrics-operator](https://docs.victoriametrics.com/operator/design)
|
||||
Each resource can be configured using `spec` of that resource from API docs of [victoria-metrics-operator](https://docs.victoriametrics.com/operator/api). For example if you want to configure `VMAgent` you can find all possible configuration options in [API docs](https://docs.victoriametrics.com/operator/api#vmagent) and you should set them in values for this chart under `vmagent.spec` key. For example if you want to configure `remoteWrite.url` you should set it in values.yaml like this:
|
||||
```yaml
|
||||
vmagent:
|
||||
spec:
|
||||
remoteWrite:
|
||||
- url: "https://insert.vmcluster.domain.com/insert/0/prometheus/api/v1/write"
|
||||
```
|
||||
|
||||
### ArgoCD issues
|
||||
|
||||
#### Operator self signed certificates
|
||||
When deploying K8s stack using ArgoCD without Cert Manager (`.Values.victoria-metrics-operator.admissionWebhooks.certManager.enabled: false`)
|
||||
it will rerender operator's webhook certificates on each sync since Helm `lookup` function is not respected by ArgoCD.
|
||||
To prevent this please update your K8s stack Application `spec.syncPolicy` and `spec.ignoreDifferences` with the following:
|
||||
|
||||
```yaml
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
...
|
||||
spec:
|
||||
...
|
||||
syncPolicy:
|
||||
syncOptions:
|
||||
# https://argo-cd.readthedocs.io/en/stable/user-guide/sync-options/#respect-ignore-difference-configs
|
||||
# argocd must also ignore difference during apply stage
|
||||
# otherwise it'll silently override changes and cause a problem
|
||||
- RespectIgnoreDifferences=true
|
||||
ignoreDifferences:
|
||||
- group: ""
|
||||
kind: Secret
|
||||
name: <fullname>-validation
|
||||
namespace: kube-system
|
||||
jsonPointers:
|
||||
- /data
|
||||
- group: admissionregistration.k8s.io
|
||||
kind: ValidatingWebhookConfiguration
|
||||
name: <fullname>-admission
|
||||
jqPathExpressions:
|
||||
- '.webhooks[]?.clientConfig.caBundle'
|
||||
```
|
||||
where `<fullname>` is output of `{{"{{"}} include "vm-operator.fullname" {{"}}"}}` for your setup
|
||||
|
||||
#### `metadata.annotations: Too long: must have at most 262144 bytes` on dashboards
|
||||
|
||||
If one of the dashboard ConfigMaps is failing with the error `Too long: must have at most 262144 bytes`, please make sure you've added the `argocd.argoproj.io/sync-options: ServerSideApply=true` annotation to your dashboards:
|
||||
|
||||
```yaml
|
||||
grafana:
|
||||
sidecar:
|
||||
dashboards:
|
||||
additionalDashboardAnnotations:
|
||||
argocd.argoproj.io/sync-options: ServerSideApply=true
|
||||
```
|
||||
|
||||
argocd.argoproj.io/sync-options: ServerSideApply=true
|
||||
|
||||
### Rules and dashboards
|
||||
|
||||
This chart by default installs multiple dashboards and recording rules from [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus)
|
||||
you can disable dashboards with `defaultDashboardsEnabled: false` and `experimentalDashboardsEnabled: false`
|
||||
and rules can be configured under `defaultRules`
|
||||
|
||||
### Prometheus scrape configs
|
||||
This chart installs multiple scrape configurations for kubernetes monitoring. They are configured under `#ServiceMonitors` section in `values.yaml` file. For example if you want to configure scrape config for `kubelet` you should set it in values.yaml like this:
|
||||
```yaml
|
||||
kubelet:
|
||||
enabled: true
|
||||
# spec for VMNodeScrape crd
|
||||
# https://docs.victoriametrics.com/operator/api#vmnodescrapespec
|
||||
spec:
|
||||
interval: "30s"
|
||||
```
|
||||
|
||||
### Using externally managed Grafana
|
||||
|
||||
If you want to use an externally managed Grafana instance but still want to use the dashboards provided by this chart you can set
|
||||
`grafana.enabled` to `false` and set `defaultDashboardsEnabled` to `true`. This will install the dashboards
|
||||
but will not install Grafana.
|
||||
|
||||
For example:
|
||||
```yaml
|
||||
defaultDashboardsEnabled: true
|
||||
|
||||
grafana:
|
||||
enabled: false
|
||||
```
|
||||
|
||||
This will create ConfigMaps with dashboards to be imported into Grafana.
|
||||
|
||||
If additional configuration for labels or annotations is needed in order to import dashboard to an existing Grafana you can
|
||||
set `.grafana.sidecar.dashboards.additionalDashboardLabels` or `.grafana.sidecar.dashboards.additionalDashboardAnnotations` in `values.yaml`:
|
||||
|
||||
For example:
|
||||
```yaml
|
||||
defaultDashboardsEnabled: true
|
||||
|
||||
grafana:
|
||||
enabled: false
|
||||
sidecar:
|
||||
dashboards:
|
||||
additionalDashboardLabels:
|
||||
key: value
|
||||
additionalDashboardAnnotations:
|
||||
key: value
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
* Install the following packages: ``git``, ``kubectl``, ``helm``, ``helm-docs``. See this [tutorial](../../REQUIREMENTS.md).
|
||||
|
||||
* Add dependency chart repositories
|
||||
|
||||
```console
|
||||
helm repo add grafana https://grafana.github.io/helm-charts
|
||||
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
||||
helm repo update
|
||||
```
|
||||
|
||||
* PV support on underlying infrastructure.
|
||||
|
||||
{{ include "chart.installSection" . }}
|
||||
|
||||
### Install locally (Minikube)
|
||||
|
||||
To run VictoriaMetrics stack locally it's possible to use [Minikube](https://github.com/kubernetes/minikube). To avoid dashboards and alert rules issues please follow the steps below:
|
||||
|
||||
Run Minikube cluster
|
||||
|
||||
```
|
||||
minikube start --container-runtime=containerd --extra-config=scheduler.bind-address=0.0.0.0 --extra-config=controller-manager.bind-address=0.0.0.0
|
||||
```
|
||||
|
||||
Install helm chart
|
||||
|
||||
```
|
||||
helm install [RELEASE_NAME] vm/victoria-metrics-k8s-stack -f values.yaml -f values.minikube.yaml -n NAMESPACE --debug --dry-run
|
||||
```
|
||||
|
||||
{{ include "chart.uninstallSection" . }}
|
||||
|
||||
CRDs created by this chart are not removed by default and should be manually cleaned up:
|
||||
|
||||
```console
|
||||
kubectl get crd | grep victoriametrics.com | awk '{print $1 }' | xargs -i kubectl delete crd {}
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
- If you cannot install the helm chart and get the error `configmap already exist`, it could be caused by name collisions when the release name is too long.
|
||||
Kubernetes by default allows only 63 symbols in resource names, and all resource names are trimmed by helm to 63 symbols.
|
||||
To mitigate it, use a shorter name for the helm chart release, like:
|
||||
```bash
|
||||
# stack - is short enough
|
||||
helm upgrade -i stack vm/victoria-metrics-k8s-stack
|
||||
```
|
||||
Or use override for helm chart release name:
|
||||
```bash
|
||||
helm upgrade -i some-very-long-name vm/victoria-metrics-k8s-stack --set fullnameOverride=stack
|
||||
```
|
||||
|
||||
|
||||
## Upgrade guide
|
||||
|
||||
Usually, helm upgrade doesn't require manual actions. Just execute the command:
|
||||
|
||||
```console
|
||||
$ helm upgrade [RELEASE_NAME] vm/victoria-metrics-k8s-stack
|
||||
```
|
||||
|
||||
But release with CRD update can only be patched manually with kubectl.
|
||||
Since helm does not perform a CRD update, we recommend that you always perform this when updating the helm-charts version:
|
||||
|
||||
```console
|
||||
# 1. check the changes in CRD
|
||||
$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl diff -f -
|
||||
|
||||
# 2. apply the changes (update CRD)
|
||||
$ helm show crds vm/victoria-metrics-k8s-stack --version [YOUR_CHART_VERSION] | kubectl apply -f - --server-side
|
||||
```
|
||||
|
||||
All other upgrades that require manual actions are listed below:
|
||||
|
||||
|
||||
### Upgrade to 0.13.0
|
||||
|
||||
|
||||
- node-exporter starting from version 4.0.0 is using the Kubernetes recommended labels. Therefore you have to delete the daemonset before you upgrade.
|
||||
|
||||
```bash
|
||||
kubectl delete daemonset -l app=prometheus-node-exporter
|
||||
```
|
||||
- scrape configuration for kubernetes components was moved from `vmServiceScrape.spec` section to `spec` section. If you previously modified scrape configuration you need to update your `values.yaml`
|
||||
|
||||
- `grafana.defaultDashboardsEnabled` was renamed to `defaultDashboardsEnabled` (moved to top level). You may need to update it in your `values.yaml`
|
||||
|
||||
|
||||
### Upgrade to 0.6.0
|
||||
|
||||
|
||||
All `CRD`s must be updated to the latest version with the command:
|
||||
|
||||
```bash
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
|
||||
|
||||
```
|
||||
|
||||
### Upgrade to 0.4.0
|
||||
|
||||
All `CRD`s must be updated to the `v1` version with the command:
|
||||
|
||||
```bash
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/helm-charts/master/charts/victoria-metrics-k8s-stack/crds/crd.yaml
|
||||
|
||||
```
|
||||
|
||||
### Upgrade from 0.2.8 to 0.2.9
|
||||
|
||||
Update `VMAgent` crd
|
||||
|
||||
command:
|
||||
```bash
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.16.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
|
||||
```
|
||||
|
||||
### Upgrade from 0.2.5 to 0.2.6
|
||||
|
||||
New CRDs were added to the operator - `VMUser` and `VMAuth` - and new fields were added to existing CRDs.
|
||||
Manual commands:
|
||||
```bash
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmusers.yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmauths.yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmalerts.yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmagents.yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmsingles.yaml
|
||||
kubectl apply -f https://raw.githubusercontent.com/VictoriaMetrics/operator/v0.15.0/config/crd/bases/operator.victoriametrics.com_vmclusters.yaml
|
||||
```
|
||||
|
||||
{{ include "chart.helmDocs" . }}
|
||||
|
||||
## Parameters
|
||||
|
||||
The following tables list the configurable parameters of the chart and their default values.
|
||||
|
||||
Change the values according to the need of the environment in ``victoria-metrics-k8s-stack/values.yaml`` file.
|
||||
|
||||
{{ template "chart.valuesTableHtml" . }}
|
||||
@@ -1,40 +0,0 @@
|
||||
# Release process guidance
|
||||
|
||||
## Update version for VictoriaMetrics kubernetes monitoring stack
|
||||
|
||||
1. Update dependency requirements in [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml)
|
||||
2. Apply changes via `helm dependency update`
|
||||
3. Update image tag in chart values:
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```console
|
||||
make sync-rules
|
||||
make sync-dashboards
|
||||
```
|
||||
</div>
|
||||
4. Bump version of the victoria-metrics-k8s-stack [Chart.yaml](https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/Chart.yaml)
|
||||
5. Run linter:
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```console
|
||||
make lint
|
||||
```
|
||||
|
||||
</div>
|
||||
6. Render templates locally to check for errors:
|
||||
|
||||
<div class="with-copy" markdown="1">
|
||||
|
||||
```console
|
||||
helm template vm-k8s-stack ./charts/victoria-metrics-k8s-stack --output-dir out --values ./charts/victoria-metrics-k8s-stack/values.yaml --debug
|
||||
```
|
||||
|
||||
</div>
|
||||
7. Test updated chart by installing it to your kubernetes cluster.
|
||||
8. Update docs with
|
||||
```console
|
||||
helm-docs
|
||||
```
|
||||
9. Commit the changes and send a [PR](https://github.com/VictoriaMetrics/helm-charts/pulls)
|
||||
@@ -1,12 +0,0 @@
|
||||
# Release notes for version 0.25.17
|
||||
|
||||
**Release date:** 2024-09-20
|
||||
|
||||

|
||||

|
||||
|
||||
- Added VMAuth to k8s stack. See [this issue](https://github.com/VictoriaMetrics/helm-charts/issues/829)
|
||||
- Fixed ETCD dashboard
|
||||
- Use path prefix from args as a default path prefix for ingress. Related [issue](https://github.com/VictoriaMetrics/helm-charts/issues/1260)
|
||||
- Allow using vmalert without notifiers configuration. Note that it is required to use `.vmalert.spec.extraArgs["notifiers.blackhole"]: true` in order to start vmalert with a blackhole configuration.
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
---
|
||||
weight: 1
|
||||
title: CHANGELOG
|
||||
menu:
|
||||
docs:
|
||||
weight: 1
|
||||
identifier: helm-victoriametrics-k8s-stack-changelog
|
||||
parent: helm-victoriametrics-k8s-stack
|
||||
url: /helm/victoriametrics-k8s-stack/changelog
|
||||
aliases:
|
||||
- /helm/victoriametrics-k8s-stack/changelog/index.html
|
||||
---
|
||||
{{% content "CHANGELOG.md" %}}
|
||||
@@ -1,13 +0,0 @@
|
||||
---
|
||||
weight: 9
|
||||
title: VictoriaMetrics K8s Stack
|
||||
menu:
|
||||
docs:
|
||||
parent: helm
|
||||
weight: 9
|
||||
identifier: helm-victoriametrics-k8s-stack
|
||||
url: /helm/victoriametrics-k8s-stack
|
||||
aliases:
|
||||
- /helm/victoriametrics-k8s-stack/index.html
|
||||
---
|
||||
{{% content "README.md" %}}
|
||||
@@ -1,165 +0,0 @@
|
||||
condition: '{{ .Values.kubeEtcd.enabled }}'
|
||||
name: etcd
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: 'etcd cluster members are down.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
max without (endpoint) (
|
||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: 'etcd cluster has insufficient number of members.'
|
||||
condition: '{{ true }}'
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
|
||||
summary: 'etcd cluster has no leader.'
|
||||
condition: '{{ true }}'
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
summary: 'etcd cluster has high number of leader changes.'
|
||||
condition: '{{ true }}'
|
||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: 'etcd cluster has high number of failed grpc requests.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: 'etcd cluster has high number of failed grpc requests.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.'
|
||||
summary: 'etcd grpc requests are slow'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: 'etcd cluster member communication is slow.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: 'etcd cluster has high number of proposal failures.'
|
||||
condition: '{{ true }}'
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: 'etcd cluster 99th percentile fsync durations are too high.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: 'etcd cluster 99th percentile fsync durations are too high.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: 'etcd cluster 99th percentile commit durations are too high.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: 'etcd cluster database is running full.'
|
||||
condition: '{{ true }}'
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.'
|
||||
summary: 'etcd cluster database growing very fast.'
|
||||
condition: '{{ true }}'
|
||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
|
||||
runbook_url: 'https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation'
|
||||
summary: 'etcd database size in use is less than 50% of the actual allocated storage.'
|
||||
condition: '{{ true }}'
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1,53 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: general.rules
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description: '{{`{{`}} printf "%.4g" $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.service {{`}}`}} targets in {{`{{`}} $labels.namespace {{`}}`}} namespace are down.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/targetdown'
|
||||
summary: 'One or more targets are unreachable.'
|
||||
condition: '{{ true }}'
|
||||
expr: 100 * (count(up == 0) BY (job,namespace,service,{{ .Values.global.clusterLabel }}) / count(up) BY (job,namespace,service,{{ .Values.global.clusterLabel }})) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
|
||||
'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/watchdog'
|
||||
summary: 'An alert that should always be firing to certify that Alertmanager is working properly.'
|
||||
condition: '{{ true }}'
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: ok
|
||||
- alert: InfoInhibitor
|
||||
annotations:
|
||||
description: 'This is an alert that is used to inhibit info alerts.
|
||||
|
||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
||||
|
||||
other alerts.
|
||||
|
||||
This alert fires whenever there''s a severity="info" alert, and stops firing when another alert with a
|
||||
|
||||
severity of ''warning'' or ''critical'' starts firing on the same namespace.
|
||||
|
||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||
|
||||
'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/infoinhibitor'
|
||||
summary: 'Info-level alert inhibition.'
|
||||
condition: '{{ true }}'
|
||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace,{{ .Values.global.clusterLabel }}) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
||||
labels:
|
||||
severity: major
|
||||
@@ -1,11 +0,0 @@
|
||||
# Recording-rule group with a single rule: per-container CPU usage
# (irate over 5m of the kubelet cAdvisor counter), joined with kube_pod_info
# so that each series also carries the `node` label.
name: k8s.rules.container_cpu_usage_seconds_total
condition: "{{ true }}"
rules:
  - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
    condition: "{{ true }}"
    # topk(1, ...) keeps one kube_pod_info series per pod so the join stays many-to-one.
    expr: |-
      sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
        irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
      ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (
        1, max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
      )
|
||||
@@ -1,10 +0,0 @@
|
||||
# Recording-rule group with a single rule: container_memory_cache from the
# kubelet cAdvisor endpoint, joined with kube_pod_info to attach `node`.
name: k8s.rules.container_memory_cache
condition: "{{ true }}"
rules:
  - record: node_namespace_pod_container:container_memory_cache
    condition: "{{ true }}"
    # topk(1, ...) keeps one kube_pod_info series per pod for the group_left join.
    expr: |-
      container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
      * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
        max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
      )
|
||||
@@ -1,10 +0,0 @@
|
||||
# Recording-rule group with a single rule: container_memory_rss from the
# kubelet cAdvisor endpoint, joined with kube_pod_info to attach `node`.
name: k8s.rules.container_memory_rss
condition: "{{ true }}"
rules:
  - record: node_namespace_pod_container:container_memory_rss
    condition: "{{ true }}"
    # topk(1, ...) keeps one kube_pod_info series per pod for the group_left join.
    expr: |-
      container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
      * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
        max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
      )
|
||||
@@ -1,10 +0,0 @@
|
||||
# Recording-rule group with a single rule: container_memory_swap from the
# kubelet cAdvisor endpoint, joined with kube_pod_info to attach `node`.
name: k8s.rules.container_memory_swap
condition: "{{ true }}"
rules:
  - record: node_namespace_pod_container:container_memory_swap
    condition: "{{ true }}"
    # topk(1, ...) keeps one kube_pod_info series per pod for the group_left join.
    expr: |-
      container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
      * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
        max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
      )
|
||||
@@ -1,10 +0,0 @@
|
||||
# Recording-rule group with a single rule: container_memory_working_set_bytes
# from the kubelet cAdvisor endpoint, joined with kube_pod_info to attach `node`.
name: k8s.rules.container_memory_working_set_bytes
condition: "{{ true }}"
rules:
  - record: node_namespace_pod_container:container_memory_working_set_bytes
    condition: "{{ true }}"
    # topk(1, ...) keeps one kube_pod_info series per pod for the group_left join.
    expr: |-
      container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
      * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
        max by (namespace,pod,node,{{ .Values.global.clusterLabel }}) (kube_pod_info{node!=""})
      )
|
||||
@@ -1,79 +0,0 @@
|
||||
# Recording-rule group for container resource requests and limits.
# For each of {requests, limits} x {memory, cpu} there are two rules:
#   * a per-container series restricted to pods whose phase is Pending or Running
#   * the per-namespace sum of those values
name: k8s.rules.container_resource
condition: "{{ true }}"
rules:
  # --- memory requests ---
  - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
    condition: "{{ true }}"
    expr: |-
      kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}  * on (namespace,pod,{{ .Values.global.clusterLabel }})
      group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
        (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
      )
  - record: namespace_memory:kube_pod_container_resource_requests:sum
    condition: "{{ true }}"
    expr: |-
      sum by (namespace,{{ .Values.global.clusterLabel }}) (
          sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
              max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
                kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
              ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
          )
      )

  # --- cpu requests ---
  - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
    condition: "{{ true }}"
    expr: |-
      kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}  * on (namespace,pod,{{ .Values.global.clusterLabel }})
      group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
        (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
      )
  - record: namespace_cpu:kube_pod_container_resource_requests:sum
    condition: "{{ true }}"
    expr: |-
      sum by (namespace,{{ .Values.global.clusterLabel }}) (
          sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
              max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
                kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
              ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
          )
      )

  # --- memory limits ---
  - record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
    condition: "{{ true }}"
    expr: |-
      kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}  * on (namespace,pod,{{ .Values.global.clusterLabel }})
      group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
        (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
      )
  - record: namespace_memory:kube_pod_container_resource_limits:sum
    condition: "{{ true }}"
    expr: |-
      sum by (namespace,{{ .Values.global.clusterLabel }}) (
          sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
              max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
                kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
              ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
          )
      )

  # --- cpu limits ---
  - record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
    condition: "{{ true }}"
    expr: |-
      kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}  * on (namespace,pod,{{ .Values.global.clusterLabel }})
      group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
        (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
      )
  - record: namespace_cpu:kube_pod_container_resource_limits:sum
    condition: "{{ true }}"
    expr: |-
      sum by (namespace,{{ .Values.global.clusterLabel }}) (
          sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
              max by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (
                kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
              ) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left() max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
                kube_pod_status_phase{phase=~"Pending|Running"} == 1
              )
          )
      )
|
||||
@@ -1,54 +0,0 @@
|
||||
# Recording-rule group that maps pods to their owning workload.
# All four rules write the same series name and differ by the static
# `workload_type` label; the `workload` label is copied from `owner_name`.
name: k8s.rules.pod_owner
condition: "{{ true }}"
rules:
  # ReplicaSet-owned pods: the ReplicaSet name is first copied into `replicaset`,
  # then kube_replicaset_owner supplies the ReplicaSet's own owner_name, which
  # becomes `workload`.
  - record: namespace_workload_pod:kube_pod_owner:relabel
    labels:
      workload_type: deployment
    condition: "{{ true }}"
    expr: |-
      max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
        label_replace(
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
            "replicaset", "$1", "owner_name", "(.*)"
          ) * on (replicaset,namespace,{{ .Values.global.clusterLabel }}) group_left(owner_name) topk by (replicaset,namespace,{{ .Values.global.clusterLabel }}) (
            1, max by (replicaset,namespace,owner_name,{{ .Values.global.clusterLabel }}) (
              kube_replicaset_owner{job="kube-state-metrics"}
            )
          ),
          "workload", "$1", "owner_name", "(.*)"
        )
      )

  # DaemonSet-owned pods.
  - record: namespace_workload_pod:kube_pod_owner:relabel
    labels:
      workload_type: daemonset
    condition: "{{ true }}"
    expr: |-
      max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
        label_replace(
          kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
          "workload", "$1", "owner_name", "(.*)"
        )
      )

  # StatefulSet-owned pods.
  - record: namespace_workload_pod:kube_pod_owner:relabel
    labels:
      workload_type: statefulset
    condition: "{{ true }}"
    expr: |-
      max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
        label_replace(
          kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
          "workload", "$1", "owner_name", "(.*)"
        )
      )

  # Job-owned pods.
  - record: namespace_workload_pod:kube_pod_owner:relabel
    labels:
      workload_type: job
    condition: "{{ true }}"
    expr: |-
      max by (namespace,workload,pod,{{ .Values.global.clusterLabel }}) (
        label_replace(
          kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
          "workload", "$1", "owner_name", "(.*)"
        )
      )
|
||||
@@ -1,128 +0,0 @@
|
||||
# Recording-rule group for 30-day API server availability.
# Rendered only when kubeApiServer is enabled; evaluated every 3 minutes.
#
# Pipeline:
#   1. 1h increases of request totals and SLI-duration histogram series
#   2. 30d estimates = avg_over_time(1h increase over 30d) * 24 * 30
#   3. availability = 1 - (too-slow requests + 5xx errors) / all requests
name: kube-apiserver-availability.rules
condition: "{{ .Values.kubeApiServer.enabled }}"
interval: 3m
rules:
  # --- 30d request totals -------------------------------------------------
  - record: code_verb:apiserver_request_total:increase30d
    condition: "{{ true }}"
    expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
  - record: code:apiserver_request_total:increase30d
    labels:
      verb: read
    condition: "{{ true }}"
    expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
  - record: code:apiserver_request_total:increase30d
    labels:
      verb: write
    condition: "{{ true }}"
    expr: sum by (code,{{ .Values.global.clusterLabel }}) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})

  # --- SLI duration: request counts ---------------------------------------
  - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
    condition: "{{ true }}"
    expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_count{job="kube-apiserver"}[1h]))
  - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
    condition: "{{ true }}"
    expr: sum by (verb,scope,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30)

  # --- SLI duration: histogram buckets ------------------------------------
  # The job selector matches the one on the _count rule above. The availability
  # formulas subtract bucket counts from request counts, so both sides must be
  # drawn from the same set of targets.
  - record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
    condition: "{{ true }}"
    expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver"}[1h]))
  - record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
    condition: "{{ true }}"
    expr: sum by (verb,scope,le,{{ .Values.global.clusterLabel }}) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)

  # --- availability over 30d ----------------------------------------------
  # Latency thresholds used below: writes le="1"; reads le="1" for scope
  # "resource" or empty, le="5" for scope "namespace", le="30" for scope "cluster".
  - record: apiserver_request:availability30d
    labels:
      verb: all
    condition: "{{ true }}"
    expr: |-
      1 - (
        (
          # write too slow
          sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
          -
          sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
        ) +
        (
          # read too slow
          sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
            +
            sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
          )
        ) +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d)
  - record: apiserver_request:availability30d
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      1 - (
        sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
        -
        (
          # too slow
          (
            sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
            or
            vector(0)
          )
          +
          sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
          +
          sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="read"})
  - record: apiserver_request:availability30d
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      1 - (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
          -
          sum by ({{ .Values.global.clusterLabel }}) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (code:apiserver_request_total:increase30d{verb="write"})

  # --- 5m request rates by code and resource ------------------------------
  - record: code_resource:apiserver_request_total:rate5m
    labels:
      verb: read
    condition: "{{ true }}"
    expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))
  - record: code_resource:apiserver_request_total:rate5m
    labels:
      verb: write
    condition: "{{ true }}"
    expr: sum by (code,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))

  # --- 1h request increases, one rule per status-code class ---------------
  - record: code_verb:apiserver_request_total:increase1h
    condition: "{{ true }}"
    expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
  - record: code_verb:apiserver_request_total:increase1h
    condition: "{{ true }}"
    expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
  - record: code_verb:apiserver_request_total:increase1h
    condition: "{{ true }}"
    expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
  - record: code_verb:apiserver_request_total:increase1h
    condition: "{{ true }}"
    expr: sum by (code,verb,{{ .Values.global.clusterLabel }}) (increase(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
@@ -1,318 +0,0 @@
|
||||
# Recording-rule group for API server error-budget burn rates.
# Rendered only when kubeApiServer is enabled.
#
# Each rule computes (too-slow requests + 5xx responses) / all requests for one
# verb class (read = LIST|GET, write = POST|PUT|PATCH|DELETE) over one window.
# Windows, in file order: 1d, 1h, 2h, 30m, 3d, 5m, 6h.
# Subresources proxy/attach/log/exec/portforward are excluded from the latency part.
# Latency thresholds: writes le="1"; reads le="1" for scope "resource" or empty,
# le="5" for scope "namespace", le="30" for scope "cluster".
name: kube-apiserver-burnrate.rules
condition: "{{ .Values.kubeApiServer.enabled }}"
rules:
  # ======================= read burn rates =======================
  # read / 1d
  - record: apiserver_request:burnrate1d
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1d]))
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1d]))
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1d]))
          )
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1d]))

  # read / 1h
  - record: apiserver_request:burnrate1h
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[1h]))
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[1h]))
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[1h]))
          )
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[1h]))

  # read / 2h
  - record: apiserver_request:burnrate2h
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[2h]))
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[2h]))
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[2h]))
          )
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[2h]))

  # read / 30m
  - record: apiserver_request:burnrate30m
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[30m]))
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[30m]))
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[30m]))
          )
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[30m]))

  # read / 3d
  - record: apiserver_request:burnrate3d
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[3d]))
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[3d]))
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[3d]))
          )
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[3d]))

  # read / 5m
  - record: apiserver_request:burnrate5m
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[5m]))
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[5m]))
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[5m]))
          )
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[5m]))

  # read / 6h
  - record: apiserver_request:burnrate6h
    labels:
      verb: read
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
          -
          (
            (
              sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le="1"}[6h]))
              or
              vector(0)
            )
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le="5"}[6h]))
            +
            sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le="30"}[6h]))
          )
        )
        +
        # errors
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"LIST|GET"}[6h]))

  # ======================= write burn rates ======================
  # write / 1d
  - record: apiserver_request:burnrate1d
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
          -
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1d]))
        )
        +
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))

  # write / 1h
  - record: apiserver_request:burnrate1h
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
          -
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[1h]))
        )
        +
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))

  # write / 2h
  - record: apiserver_request:burnrate2h
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
          -
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[2h]))
        )
        +
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))

  # write / 30m
  - record: apiserver_request:burnrate30m
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
          -
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[30m]))
        )
        +
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))

  # write / 3d
  - record: apiserver_request:burnrate3d
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
          -
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[3d]))
        )
        +
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))

  # write / 5m
  - record: apiserver_request:burnrate5m
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
          -
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[5m]))
        )
        +
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))

  # write / 6h
  - record: apiserver_request:burnrate6h
    labels:
      verb: write
    condition: "{{ true }}"
    expr: |-
      (
        (
          # too slow
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_count{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
          -
          sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le="1"}[6h]))
        )
        +
        sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
      )
      /
      sum by ({{ .Values.global.clusterLabel }}) (rate(apiserver_request_total{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
|
||||
@@ -1,15 +0,0 @@
|
||||
condition: '{{ .Values.kubeApiServer.enabled }}'
|
||||
name: kube-apiserver-histogram.rules
|
||||
rules:
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: read
|
||||
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.99, sum by (le,resource,{{ .Values.global.clusterLabel }}) (rate(apiserver_request_sli_duration_seconds_bucket{job="kube-apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
verb: write
|
||||
record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
|
||||
@@ -1,63 +0,0 @@
|
||||
condition: '{{ .Values.kubeApiServer.enabled }}'
|
||||
name: kube-apiserver-slos
|
||||
rules:
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: 'The API server is burning too much error budget.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
|
||||
summary: 'The API server is burning too much error budget.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000)
|
||||
for: 2m
|
||||
labels:
|
||||
long: 1h
|
||||
severity: critical
|
||||
short: 5m
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: 'The API server is burning too much error budget.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
|
||||
summary: 'The API server is burning too much error budget.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000)
|
||||
for: 15m
|
||||
labels:
|
||||
long: 6h
|
||||
severity: critical
|
||||
short: 30m
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: 'The API server is burning too much error budget.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
|
||||
summary: 'The API server is burning too much error budget.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000)
|
||||
for: 1h
|
||||
labels:
|
||||
long: 1d
|
||||
severity: warning
|
||||
short: 2h
|
||||
- alert: KubeAPIErrorBudgetBurn
|
||||
annotations:
|
||||
description: 'The API server is burning too much error budget.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapierrorbudgetburn'
|
||||
summary: 'The API server is burning too much error budget.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000)
|
||||
and
|
||||
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000)
|
||||
for: 3h
|
||||
labels:
|
||||
long: 3d
|
||||
severity: warning
|
||||
short: 6h
|
||||
@@ -1,9 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kube-prometheus-general.rules
|
||||
rules:
|
||||
- condition: '{{ true }}'
|
||||
expr: count without(instance, pod, node) (up == 1)
|
||||
record: count:up1
|
||||
- condition: '{{ true }}'
|
||||
expr: count without(instance, pod, node) (up == 0)
|
||||
record: count:up0
|
||||
@@ -1,21 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- condition: '{{ true }}'
|
||||
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
|
||||
record: instance:node_cpu:rate:sum
|
||||
- condition: '{{ true }}'
|
||||
expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_receive_bytes:rate:sum
|
||||
- condition: '{{ true }}'
|
||||
expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_transmit_bytes:rate:sum
|
||||
- condition: '{{ true }}'
|
||||
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
||||
record: instance:node_cpu:ratio
|
||||
- condition: '{{ true }}'
|
||||
expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
|
||||
record: cluster:node_cpu:sum_rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
record: cluster:node_cpu:ratio
|
||||
@@ -1,48 +0,0 @@
|
||||
condition: '{{ .Values.kubeScheduler.enabled }}'
|
||||
name: kube-scheduler.rules
|
||||
rules:
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
|
||||
@@ -1,55 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kube-state-metrics
|
||||
rules:
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
description: 'kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors'
|
||||
summary: 'kube-state-metrics is experiencing errors in list operations.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }})
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }}))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
description: 'kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors'
|
||||
summary: 'kube-state-metrics is experiencing errors in watch operations.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by ({{ .Values.global.clusterLabel }})
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by ({{ .Values.global.clusterLabel }}))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
description: 'kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch'
|
||||
summary: 'kube-state-metrics sharding is misconfigured.'
|
||||
condition: '{{ true }}'
|
||||
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) != 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
description: 'kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing'
|
||||
summary: 'kube-state-metrics shards are missing.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - 1
|
||||
-
|
||||
sum( 2 ^ max by (shard_ordinal,{{ .Values.global.clusterLabel }}) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by ({{ .Values.global.clusterLabel }})
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1,18 +0,0 @@
|
||||
condition: '{{ .Values.kubelet.enabled }}'
|
||||
name: kubelet.rules
|
||||
rules:
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }}) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
@@ -1,257 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kubernetes-apps
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping'
|
||||
summary: 'Pod is crash looping.'
|
||||
condition: '{{ true }}'
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready'
|
||||
summary: 'Pod has been in a non-ready state for more than 15 minutes.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum by (namespace,pod,{{ .Values.global.clusterLabel }}) (
|
||||
max by (namespace,pod,{{ .Values.global.clusterLabel }}) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}", phase=~"Pending|Unknown|Failed"}
|
||||
) * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(owner_kind) topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (
|
||||
1, max by (namespace,pod,owner_kind,{{ .Values.global.clusterLabel }}) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
description: 'Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch'
|
||||
summary: 'Deployment generation mismatch due to possible roll-back'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
description: 'Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch'
|
||||
summary: 'Deployment has not matched the expected number of replicas.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentRolloutStuck
|
||||
annotations:
|
||||
description: 'Rollout of deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} is not progressing for longer than 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentrolloutstuck'
|
||||
summary: 'Deployment rollout is not progressing.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch'
|
||||
summary: 'StatefulSet has not matched the expected number of replicas.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
description: 'StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch'
|
||||
summary: 'StatefulSet generation mismatch due to possible roll-back'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
description: 'StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout'
|
||||
summary: 'StatefulSet update has not been rolled out.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
max by (namespace,statefulset,{{ .Values.global.clusterLabel }}) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
description: 'DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck'
|
||||
summary: 'DaemonSet rollout is stuck.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
description: 'pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting'
|
||||
summary: 'Pod container waiting longer than 1 hour'
|
||||
condition: '{{ true }}'
|
||||
expr: sum by (namespace,pod,container,{{ .Values.global.clusterLabel }}) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled'
|
||||
summary: 'DaemonSet pods are not scheduled.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled'
|
||||
summary: 'DaemonSet pods are misscheduled.'
|
||||
condition: '{{ true }}'
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted'
|
||||
summary: 'Job did not complete in time'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
time() - max by (namespace,job_name,{{ .Values.global.clusterLabel }}) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
and
|
||||
kube_job_status_active{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0) > 43200
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
description: 'Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed'
|
||||
summary: 'Job failed to complete.'
|
||||
condition: '{{ true }}'
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch'
|
||||
summary: 'HPA has not matched desired number of replicas.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
description: 'HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout'
|
||||
summary: 'HPA is running at max replicas'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ .targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1,113 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kubernetes-resources
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit'
|
||||
summary: 'Cluster has overcommitted CPU resource requests.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by ({{ .Values.global.clusterLabel }})) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit'
|
||||
summary: 'Cluster has overcommitted memory resource requests.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by ({{ .Values.global.clusterLabel }}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted CPU resource requests for Namespaces.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit'
|
||||
summary: 'Cluster has overcommitted CPU resource requests.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by ({{ .Values.global.clusterLabel }})
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
description: 'Cluster {{`{{`}} $labels.cluster {{`}}`}} has overcommitted memory resource requests for Namespaces.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit'
|
||||
summary: 'Cluster has overcommitted memory resource requests.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by ({{ .Values.global.clusterLabel }})
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by ({{ .Values.global.clusterLabel }})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull'
|
||||
summary: 'Namespace quota is going to be full.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused'
|
||||
summary: 'Namespace quota is fully used.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
description: 'Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded'
|
||||
summary: 'Namespace quota has exceeded the limits.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh'
|
||||
summary: 'Processes experience elevated CPU throttling.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }})
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container,pod,namespace,{{ .Values.global.clusterLabel }})
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
@@ -1,101 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is only {{`{{`}} $value | humanizePercentage {{`}}`}} free.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup'
|
||||
summary: 'PersistentVolume is filling up.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} is available.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumefillingup'
|
||||
summary: 'PersistentVolume is filling up.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: 'The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} only has {{`{{`}} $value | humanizePercentage {{`}}`}} free inodes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup'
|
||||
summary: 'PersistentVolumeInodes are filling up.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: 'Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} is expected to run out of inodes within four days. Currently {{`{{`}} $value | humanizePercentage {{`}}`}} of its inodes are free.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeinodesfillingup'
|
||||
summary: 'PersistentVolumeInodes are filling up.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~"{{ .targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (namespace,persistentvolumeclaim,{{ .Values.global.clusterLabel }})
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
description: 'The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} {{`{{`}} with $labels.cluster -{{`}}`}} on Cluster {{`{{`}} . {{`}}`}} {{`{{`}}- end {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepersistentvolumeerrors'
|
||||
summary: 'PersistentVolume is having issues with provisioning.'
|
||||
condition: '{{ true }}'
|
||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1,62 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kubernetes-system-apiserver
|
||||
rules:
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
|
||||
summary: 'Client certificate is about to expire.'
|
||||
condition: '{{ true }}'
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 604800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
description: 'A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration'
|
||||
summary: 'Client certificate is about to expire.'
|
||||
condition: '{{ true }}'
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="kube-apiserver"} > 0 and on (job,{{ .Values.global.clusterLabel }}) histogram_quantile(0.01, sum by (job,le,{{ .Values.global.clusterLabel }}) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kube-apiserver"}[5m]))) < 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAggregatedAPIErrors
|
||||
annotations:
|
||||
description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors'
|
||||
summary: 'Kubernetes aggregated API has reported errors.'
|
||||
condition: '{{ true }}'
|
||||
expr: sum by (name,namespace,{{ .Values.global.clusterLabel }})(increase(aggregator_unavailable_apiservice_total{job="kube-apiserver"}[10m])) > 4
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAggregatedAPIDown
|
||||
annotations:
|
||||
description: 'Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown'
|
||||
summary: 'Kubernetes aggregated API is down.'
|
||||
condition: '{{ true }}'
|
||||
expr: (1 - max by (name,namespace,{{ .Values.global.clusterLabel }})(avg_over_time(aggregator_unavailable_apiservice{job="kube-apiserver"}[10m]))) * 100 < 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
description: 'KubeAPI has disappeared from Prometheus target discovery.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown'
|
||||
summary: 'Target disappeared from Prometheus target discovery.'
|
||||
condition: '{{ .Values.kubeApiServer.enabled }}'
|
||||
expr: absent(up{job="kube-apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAPITerminatedRequests
|
||||
annotations:
|
||||
description: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests'
|
||||
summary: 'The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.'
|
||||
condition: '{{ true }}'
|
||||
expr: sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="kube-apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kube-apiserver"}[10m])) ) > 0.20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1,13 +0,0 @@
|
||||
condition: '{{ .Values.kubeControllerManager.enabled }}'
|
||||
name: kubernetes-system-controller-manager
|
||||
rules:
|
||||
- alert: KubeControllerManagerDown
|
||||
annotations:
|
||||
description: 'KubeControllerManager has disappeared from Prometheus target discovery.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontrollermanagerdown'
|
||||
summary: 'Target disappeared from Prometheus target discovery.'
|
||||
condition: '{{ .Values.kubeControllerManager.enabled }}'
|
||||
expr: absent(up{job="kube-controller-manager"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1,136 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kubernetes-system-kubelet
|
||||
rules:
|
||||
- alert: KubeNodeNotReady
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready'
|
||||
summary: 'Node is not ready.'
|
||||
condition: '{{ true }}'
|
||||
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeNodeUnreachable
|
||||
annotations:
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable'
|
||||
summary: 'Node is unreachable.'
|
||||
condition: '{{ true }}'
|
||||
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
description: 'Kubelet ''{{`{{`}} $labels.node {{`}}`}}'' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods'
|
||||
summary: 'Kubelet is running at capacity.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
count by (node,{{ .Values.global.clusterLabel }}) (
|
||||
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,{{ .Values.global.clusterLabel }}) group_left(node) topk by (instance,pod,namespace,{{ .Values.global.clusterLabel }}) (1, kube_pod_info{job="kube-state-metrics"})
|
||||
)
|
||||
/
|
||||
max by (node,{{ .Values.global.clusterLabel }}) (
|
||||
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
|
||||
) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: informational
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
description: 'The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping'
|
||||
summary: 'Node readiness status is flapping.'
|
||||
condition: '{{ true }}'
|
||||
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (node,{{ .Values.global.clusterLabel }}) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
description: 'The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh'
|
||||
summary: 'Kubelet Pod Lifecycle Event Generator is taking too long to relist.'
|
||||
condition: '{{ true }}'
|
||||
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
description: 'Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh'
|
||||
summary: 'Kubelet Pod startup latency is too high.'
|
||||
condition: '{{ true }}'
|
||||
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,{{ .Values.global.clusterLabel }})) * on (instance,{{ .Values.global.clusterLabel }}) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
|
||||
summary: 'Kubelet client certificate is about to expire.'
|
||||
condition: '{{ true }}'
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
description: 'Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration'
|
||||
summary: 'Kubelet client certificate is about to expire.'
|
||||
condition: '{{ true }}'
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
|
||||
summary: 'Kubelet server certificate is about to expire.'
|
||||
condition: '{{ true }}'
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
description: 'Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration'
|
||||
summary: 'Kubelet server certificate is about to expire.'
|
||||
condition: '{{ true }}'
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeletClientCertificateRenewalErrors
|
||||
annotations:
|
||||
description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors'
|
||||
summary: 'Kubelet has failed to renew its client certificate.'
|
||||
condition: '{{ true }}'
|
||||
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletServerCertificateRenewalErrors
|
||||
annotations:
|
||||
description: 'Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors'
|
||||
summary: 'Kubelet has failed to renew its server certificate.'
|
||||
condition: '{{ true }}'
|
||||
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
description: 'Kubelet has disappeared from Prometheus target discovery.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown'
|
||||
summary: 'Target disappeared from Prometheus target discovery.'
|
||||
condition: '{{ .Values.kubelet.enabled }}'
|
||||
expr: absent(up{job="kubelet", metrics_path="/metrics"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1,13 +0,0 @@
|
||||
condition: '{{ .Values.kubeScheduler.enabled }}'
|
||||
name: kubernetes-system-scheduler
|
||||
rules:
|
||||
- alert: KubeSchedulerDown
|
||||
annotations:
|
||||
description: 'KubeScheduler has disappeared from Prometheus target discovery.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeschedulerdown'
|
||||
summary: 'Target disappeared from Prometheus target discovery.'
|
||||
condition: '{{ .Values.kubeScheduler.enabled }}'
|
||||
expr: absent(up{job="kube-scheduler"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -1,27 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: kubernetes-system
|
||||
rules:
|
||||
- alert: KubeVersionMismatch
|
||||
annotations:
|
||||
description: 'There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeversionmismatch'
|
||||
summary: 'Different semantic versions of Kubernetes components running.'
|
||||
condition: '{{ true }}'
|
||||
expr: count by ({{ .Values.global.clusterLabel }}) (count by (git_version,{{ .Values.global.clusterLabel }}) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeClientErrors
|
||||
annotations:
|
||||
description: 'Kubernetes API server client ''{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}'' is experiencing {{`{{`}} $value | humanizePercentage {{`}}`}} errors.'
|
||||
runbook_url: '{{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclienterrors'
|
||||
summary: 'Kubernetes API server client is experiencing errors.'
|
||||
condition: '{{ true }}'
|
||||
expr: |-
|
||||
(sum(rate(rest_client_requests_total{job="kube-apiserver",code=~"5.."}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }})
|
||||
/
|
||||
sum(rate(rest_client_requests_total{job="kube-apiserver"}[5m])) by (instance,job,namespace,{{ .Values.global.clusterLabel }}))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -1,76 +0,0 @@
|
||||
condition: '{{ true }}'
|
||||
name: node-exporter.rules
|
||||
rules:
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
count without (cpu, mode) (
|
||||
node_cpu_seconds_total{job="node-exporter",mode="idle"}
|
||||
)
|
||||
record: instance:node_num_cpu:sum
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
1 - avg without (cpu) (
|
||||
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
|
||||
)
|
||||
record: instance:node_cpu_utilisation:rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
(
|
||||
node_load1{job="node-exporter"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
1 - (
|
||||
(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"}
|
||||
or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Cached_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_MemFree_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
)
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
- condition: '{{ true }}'
|
||||
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||
record: instance:node_vmstat_pgmajfault:rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||
- condition: '{{ true }}'
|
||||
expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||
@@ -1,336 +0,0 @@
|
||||
# Alerting rules evaluated against node-exporter metrics.
# Several alerts appear twice under the same name with different thresholds and severities.
# NOTE(review): the templated `condition` keys are presumably consumed by the chart's
# rule template to decide what gets rendered; confirm against that template.
name: node-exporter
condition: '{{ true }}'
rules:
# ---- Filesystem space ----
- alert: NodeFilesystemSpaceFillingUp
  expr: |-
    (
      node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
    and
      predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 1h
  labels:
    severity: warning
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup'
    summary: 'Filesystem is predicted to run out of space within the next 24 hours.'
  condition: '{{ true }}'
- alert: NodeFilesystemSpaceFillingUp
  expr: |-
    (
      node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
    and
      predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 1h
  labels:
    severity: critical
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup'
    summary: 'Filesystem is predicted to run out of space within the next 4 hours.'
  condition: '{{ true }}'
- alert: NodeFilesystemAlmostOutOfSpace
  expr: |-
    (
      node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 30m
  labels:
    severity: warning
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace'
    summary: 'Filesystem has less than 5% space left.'
  condition: '{{ true }}'
- alert: NodeFilesystemAlmostOutOfSpace
  expr: |-
    (
      node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 30m
  labels:
    severity: critical
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace'
    summary: 'Filesystem has less than 3% space left.'
  condition: '{{ true }}'
# ---- Filesystem inodes ----
- alert: NodeFilesystemFilesFillingUp
  expr: |-
    (
      node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
    and
      predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 1h
  labels:
    severity: warning
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup'
    summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.'
  condition: '{{ true }}'
- alert: NodeFilesystemFilesFillingUp
  expr: |-
    (
      node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
    and
      predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 1h
  labels:
    severity: critical
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup'
    summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.'
  condition: '{{ true }}'
- alert: NodeFilesystemAlmostOutOfFiles
  expr: |-
    (
      node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 1h
  labels:
    severity: warning
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles'
    summary: 'Filesystem has less than 5% inodes left.'
  condition: '{{ true }}'
- alert: NodeFilesystemAlmostOutOfFiles
  expr: |-
    (
      node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
    and
      node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
    )
  for: 1h
  labels:
    severity: critical
  annotations:
    description: 'Filesystem on {{`{{`}} $labels.device {{`}}`}}, mounted on {{`{{`}} $labels.mountpoint {{`}}`}}, at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles'
    summary: 'Filesystem has less than 3% inodes left.'
  condition: '{{ true }}'
# ---- Network and conntrack ----
- alert: NodeNetworkReceiveErrs
  expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
  for: 1h
  labels:
    severity: warning
  annotations:
    description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs'
    summary: 'Network interface is reporting many receive errors.'
  condition: '{{ true }}'
- alert: NodeNetworkTransmitErrs
  expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
  for: 1h
  labels:
    severity: warning
  annotations:
    description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs'
    summary: 'Network interface is reporting many transmit errors.'
  condition: '{{ true }}'
- alert: NodeHighNumberConntrackEntriesUsed
  expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
  labels:
    severity: warning
  annotations:
    description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused'
    summary: 'Number of conntrack are getting close to the limit.'
  condition: '{{ true }}'
- alert: NodeTextFileCollectorScrapeError
  expr: node_textfile_scrape_error{job="node-exporter"} == 1
  labels:
    severity: warning
  annotations:
    description: 'Node Exporter text file collector on {{`{{`}} $labels.instance {{`}}`}} failed to scrape.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror'
    summary: 'Node Exporter text file collector failed to scrape.'
  condition: '{{ true }}'
# ---- Clock ----
- alert: NodeClockSkewDetected
  expr: |-
    (
      node_timex_offset_seconds{job="node-exporter"} > 0.05
    and
      deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
    )
    or
    (
      node_timex_offset_seconds{job="node-exporter"} < -0.05
    and
      deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
    )
  for: 10m
  labels:
    severity: warning
  annotations:
    description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected'
    summary: 'Clock skew detected.'
  condition: '{{ true }}'
- alert: NodeClockNotSynchronising
  expr: |-
    min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
    and
    node_timex_maxerror_seconds{job="node-exporter"} >= 16
  for: 10m
  labels:
    severity: warning
  annotations:
    description: 'Clock at {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising'
    summary: 'Clock not synchronising.'
  condition: '{{ true }}'
# ---- RAID ----
- alert: NodeRAIDDegraded
  expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
  for: 15m
  labels:
    severity: critical
  annotations:
    description: 'RAID array ''{{`{{`}} $labels.device {{`}}`}}'' at {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded'
    summary: 'RAID Array is degraded.'
  condition: '{{ true }}'
- alert: NodeRAIDDiskFailure
  expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
  labels:
    severity: warning
  annotations:
    description: 'At least one device in RAID array at {{`{{`}} $labels.instance {{`}}`}} failed. Array ''{{`{{`}} $labels.device {{`}}`}}'' needs attention and possibly a disk swap.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure'
    summary: 'Failed device in RAID array.'
  condition: '{{ true }}'
# ---- File descriptors ----
- alert: NodeFileDescriptorLimit
  expr: |-
    (
      node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
    )
  for: 15m
  labels:
    severity: warning
  annotations:
    description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit'
    summary: 'Kernel is predicted to exhaust file descriptors limit soon.'
  condition: '{{ true }}'
- alert: NodeFileDescriptorLimit
  expr: |-
    (
      node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
    )
  for: 15m
  labels:
    severity: critical
  annotations:
    description: 'File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit'
    summary: 'Kernel is predicted to exhaust file descriptors limit soon.'
  condition: '{{ true }}'
# ---- CPU, load, memory, disk I/O ----
- alert: NodeCPUHighUsage
  expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!="idle"}[2m]))) * 100 > 90
  for: 15m
  labels:
    severity: informational
  annotations:
    description: 'CPU usage at {{`{{`}} $labels.instance {{`}}`}} has been above 90% for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.

      '
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodecpuhighusage'
    summary: 'High CPU usage.'
  condition: '{{ true }}'
- alert: NodeSystemSaturation
  expr: |-
    node_load1{job="node-exporter"}
    / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
  for: 15m
  labels:
    severity: warning
  annotations:
    description: 'System load per core at {{`{{`}} $labels.instance {{`}}`}} has been above 2 for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.

      This might indicate this instance resources saturation and can cause it becoming unresponsive.

      '
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemsaturation'
    summary: 'System saturated, load per core is very high.'
  condition: '{{ true }}'
- alert: NodeMemoryMajorPagesFaults
  expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
  for: 15m
  labels:
    severity: warning
  annotations:
    description: 'Memory major pages are occurring at very high rate at {{`{{`}} $labels.instance {{`}}`}}, 500 major page faults per second for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.

      Please check that there is enough memory available at this instance.

      '
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememorymajorpagesfaults'
    summary: 'Memory major page faults are occurring at very high rate.'
  condition: '{{ true }}'
- alert: NodeMemoryHighUtilization
  expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
  for: 15m
  labels:
    severity: warning
  annotations:
    description: 'Memory is filling up at {{`{{`}} $labels.instance {{`}}`}}, has been above 90% for the last 15 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.

      '
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodememoryhighutilization'
    summary: 'Host is running out of memory.'
  condition: '{{ true }}'
- alert: NodeDiskIOSaturation
  expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) > 10
  for: 30m
  labels:
    severity: warning
  annotations:
    description: 'Disk IO queue (aqu-sq) is high on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}}, has been above 10 for the last 30 minutes, is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}.

      This symptom might indicate disk saturation.

      '
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodediskiosaturation'
    summary: 'Disk IO queue is high.'
  condition: '{{ true }}'
# ---- systemd and bonding ----
- alert: NodeSystemdServiceFailed
  expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
  for: 5m
  labels:
    severity: warning
  annotations:
    description: 'Systemd service {{`{{`}} $labels.name {{`}}`}} has entered failed state at {{`{{`}} $labels.instance {{`}}`}}'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodesystemdservicefailed'
    summary: 'Systemd service has entered failed state.'
  condition: '{{ true }}'
- alert: NodeBondingDegraded
  expr: (node_bonding_slaves - node_bonding_active) != 0
  for: 5m
  labels:
    severity: warning
  annotations:
    description: 'Bonding interface {{`{{`}} $labels.master {{`}}`}} on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more slave failures.'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/node/nodebondingdegraded'
    summary: 'Bonding interface is degraded'
  condition: '{{ true }}'
|
||||
@@ -1,13 +0,0 @@
|
||||
# Rule group with a single alert on network interface up/down flapping.
# NOTE(review): the templated `condition` keys are presumably consumed by the chart's
# rule template; confirm against that template.
name: node-network
condition: '{{ true }}'
rules:
# Fires when node_network_up changed more than twice within 2m (veth* devices excluded)
# and the condition has held for 2m.
- alert: NodeNetworkInterfaceFlapping
  expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
  for: 2m
  labels:
    severity: warning
  annotations:
    description: 'Network interface "{{`{{`}} $labels.device {{`}}`}}" changing its up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}'
    runbook_url: '{{ .Values.defaultRules.runbookUrl }}/general/nodenetworkinterfaceflapping'
    summary: 'Network interface is often changing its status'
  condition: '{{ true }}'
|
||||
@@ -1,44 +0,0 @@
|
||||
# Recording rules aggregating node-level series; the grouping label for the cluster
# is injected from .Values.global.clusterLabel when the file is templated.
# NOTE(review): the templated `condition` keys are presumably consumed by the chart's
# rule template; confirm against that template.
name: node.rules
condition: '{{ true }}'
rules:
# One kube_pod_info series per (namespace, pod, cluster), keeping the node label.
- record: 'node_namespace_pod:kube_pod_info:'
  expr: |-
    topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1,
      max by (node,namespace,pod,{{ .Values.global.clusterLabel }}) (
        label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
    ))
  condition: '{{ true }}'
# Count of idle-mode CPU series per node, with the node label joined in from the rule above.
- record: node:node_num_cpu:sum
  expr: |-
    count by (node,{{ .Values.global.clusterLabel }}) (
      node_cpu_seconds_total{mode="idle",job="node-exporter"}
      * on (namespace,pod,{{ .Values.global.clusterLabel }}) group_left(node)
      topk by (namespace,pod,{{ .Values.global.clusterLabel }}) (1, node_namespace_pod:kube_pod_info:)
    )
  condition: '{{ true }}'
# Available memory summed per cluster; uses Buffers+Cached+MemFree+Slab where
# MemAvailable is not present.
- record: :node_memory_MemAvailable_bytes:sum
  expr: |-
    sum(
      node_memory_MemAvailable_bytes{job="node-exporter"} or
      (
        node_memory_Buffers_bytes{job="node-exporter"} +
        node_memory_Cached_bytes{job="node-exporter"} +
        node_memory_MemFree_bytes{job="node-exporter"} +
        node_memory_Slab_bytes{job="node-exporter"}
      )
    ) by ({{ .Values.global.clusterLabel }})
  condition: '{{ true }}'
# CPU time rate excluding idle, iowait and steal modes, averaged per node.
- record: node:node_cpu_utilization:ratio_rate5m
  expr: |-
    avg by (node,{{ .Values.global.clusterLabel }}) (
      sum without (mode) (
        rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
      )
    )
  condition: '{{ true }}'
# Average of the per-node utilisation above, per cluster.
- record: cluster:node_cpu:ratio_rate5m
  expr: |-
    avg by ({{ .Values.global.clusterLabel }}) (
      node:node_cpu_utilization:ratio_rate5m
    )
  condition: '{{ true }}'
|
||||
@@ -1,458 +0,0 @@
|
||||
{{- /*
Expand the name of the chart.
Chart and Values are taken from `.helm` when the context carries one, otherwise from
the context itself. `nameOverride` takes precedence over the chart name; the result is
cut to 63 characters and a trailing "-" is removed.
*/ -}}
{{- define "victoria-metrics-k8s-stack.name" -}}
  {{- $chartObj := (.helm).Chart | default .Chart -}}
  {{- $vals := (.helm).Values | default .Values -}}
  {{- $vals.nameOverride | default $chartObj.Name | trunc 63 | trimSuffix "-" -}}
{{- end }}
|
||||
|
||||
{{- /*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
If release name contains chart name it will be used as a full name.

Lookup order:
  1. when `.appKey` is given (a string or a list of nested keys), a `name` found while
     walking that path through Values, or through Values.global.<chart name>;
  2. Values.fullnameOverride;
  3. Values.global.<chart name>.fullnameOverride;
  4. the release name, suffixed with the (possibly overridden) chart name unless the
     release name already contains it.
*/ -}}
{{- define "victoria-metrics-k8s-stack.fullname" -}}
  {{- $vals := (.helm).Values | default .Values -}}
  {{- $chartObj := (.helm).Chart | default .Chart -}}
  {{- $rel := (.helm).Release | default .Release -}}
  {{- $result := "" -}}
  {{- if .appKey -}}
    {{- /* normalise a single key into a one-element path */ -}}
    {{- $path := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}}
    {{- $local := $vals -}}
    {{- $shared := (index $vals.global $chartObj.Name) | default dict -}}
    {{- range $key := $path -}}
      {{- $local = (index $local $key) | default dict -}}
      {{- $shared = (index $shared $key) | default dict -}}
      {{- /* a name at a deeper level replaces one found higher up; local wins over global */ -}}
      {{- if $local.name -}}
        {{- $result = $local.name -}}
      {{- else if $shared.name -}}
        {{- $result = $shared.name -}}
      {{- end -}}
    {{- end -}}
  {{- end -}}
  {{- if empty $result -}}
    {{- if $vals.fullnameOverride -}}
      {{- $result = $vals.fullnameOverride -}}
    {{- else if (dig $chartObj.Name "fullnameOverride" "" ($vals.global)) -}}
      {{- $result = (dig $chartObj.Name "fullnameOverride" "" ($vals.global)) -}}
    {{- else -}}
      {{- $base := $vals.nameOverride | default $chartObj.Name -}}
      {{- if contains $base $rel.Name -}}
        {{- $result = $rel.Name -}}
      {{- else -}}
        {{- $result = printf "%s-%s" $rel.Name $base -}}
      {{- end -}}
    {{- end -}}
  {{- end -}}
  {{- $result | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{- /*
Create chart name and version as used by the chart label.
"+" in the version is replaced by "_"; the result is cut to 63 characters and a
trailing "-" is removed.
*/ -}}
{{- define "victoria-metrics-k8s-stack.chart" -}}
  {{- $chartObj := (.helm).Chart | default .Chart -}}
  {{- $label := printf "%s-%s" $chartObj.Name $chartObj.Version -}}
  {{- $label | replace "+" "_" | trunc 63 | trimSuffix "-" -}}
{{- end }}
|
||||
|
||||
{{- /*
Create the name of the service account to use.
An explicit serviceAccount.name always wins. Without one, the chart fullname is used
when serviceAccount.create is set, and "default" otherwise.
*/ -}}
{{- define "victoria-metrics-k8s-stack.serviceAccountName" -}}
  {{- $sa := ((.helm).Values | default .Values).serviceAccount -}}
  {{- $fallback := "default" -}}
  {{- if $sa.create -}}
    {{- $fallback = include "victoria-metrics-k8s-stack.fullname" . -}}
  {{- end -}}
  {{- $sa.name | default $fallback -}}
{{- end }}
|
||||
|
||||
{{- /*
Common labels.
Starts from the selector labels and adds helm.sh/chart, app.kubernetes.io/managed-by
and, when the chart declares an AppVersion, app.kubernetes.io/version. Emitted as YAML.
*/ -}}
{{- define "victoria-metrics-k8s-stack.labels" -}}
  {{- $rel := (.helm).Release | default .Release -}}
  {{- $chartObj := (.helm).Chart | default .Chart -}}
  {{- $out := include "victoria-metrics-k8s-stack.selectorLabels" . | fromYaml -}}
  {{- $_ := set $out "helm.sh/chart" (include "victoria-metrics-k8s-stack.chart" .) -}}
  {{- $_ := set $out "app.kubernetes.io/managed-by" $rel.Service -}}
  {{- if $chartObj.AppVersion -}}
    {{- $_ := set $out "app.kubernetes.io/version" $chartObj.AppVersion -}}
  {{- end -}}
  {{- $out | toYaml -}}
{{- end }}
|
||||
|
||||
{{- /*
Release name used in labels: Values.argocdReleaseOverride when set, otherwise
Release.Name; cut to 63 characters with a trailing "-" removed.
*/ -}}
{{- define "vm.release" -}}
  {{- $rel := (.helm).Release | default .Release -}}
  {{- $vals := (.helm).Values | default .Values -}}
  {{- $vals.argocdReleaseOverride | default $rel.Name | trunc 63 | trimSuffix "-" -}}
{{- end -}}
|
||||
|
||||
{{- /*
Selector labels.
Emits, as YAML, the optional `.extraLabels` of the context plus
app.kubernetes.io/name and app.kubernetes.io/instance.
*/ -}}
{{- define "victoria-metrics-k8s-stack.selectorLabels" -}}
  {{- /* `set` mutates its dict argument; copy first so the caller's .extraLabels is left untouched */ -}}
  {{- $labels := deepCopy (.extraLabels | default dict) -}}
  {{- $_ := set $labels "app.kubernetes.io/name" (include "victoria-metrics-k8s-stack.name" .) -}}
  {{- $_ := set $labels "app.kubernetes.io/instance" (include "vm.release" .) -}}
  {{- toYaml $labels -}}
{{- end }}
|
||||
|
||||
{{- /*
Create the name for VM service.
With `.appKey` the chart fullname is prefixed by the component name (last element when
appKey is a list), itself prefixed with "vm" unless it already starts with it.
With `.appIdx` the result becomes "<name>-<idx>.<name>".
*/ -}}
{{- define "vm.service" -}}
  {{- $svc := include "victoria-metrics-k8s-stack.fullname" . -}}
  {{- if .appKey -}}
    {{- $component := .appKey -}}
    {{- if kindIs "slice" $component -}}
      {{- $component = last $component -}}
    {{- end -}}
    {{- if not (hasPrefix "vm" $component) -}}
      {{- $component = printf "vm%s" $component -}}
    {{- end -}}
    {{- $svc = printf "%s-%s" $component $svc -}}
  {{- end -}}
  {{- if hasKey . "appIdx" -}}
    {{- $svc = printf "%s-%d.%s" $svc .appIdx $svc -}}
  {{- end -}}
  {{- $svc -}}
{{- end }}
|
||||
|
||||
{{- /*
Build "<scheme>://<service>.<namespace>.svc:<port><path>" for a component.
Context keys read here: appKey (string or list of nested keys into Values), appSecure,
appRoute. When appKey is given, the component spec found by walking Values (descending
into a `spec` key whenever one exists) can override TLS (extraArgs.tls == "true"),
the port (spec.port) and the path (extraArgs."http.pathPrefix").
*/ -}}
{{- define "vm.url" -}}
  {{- $host := include "vm.service" . -}}
  {{- $vals := (.helm).Values | default .Values -}}
  {{- $namespace := include "vm.namespace" . -}}
  {{- $scheme := "http" -}}
  {{- $port := 80 -}}
  {{- $route := .appRoute | default "/" -}}
  {{- $tls := .appSecure | default false -}}
  {{- if .appKey -}}
    {{- $keys := ternary (list .appKey) .appKey (kindIs "string" .appKey) -}}
    {{- $node := $vals -}}
    {{- range $k := $keys -}}
      {{- if hasKey $node $k -}}
        {{- $node = index $node $k -}}
      {{- end -}}
      {{- if hasKey $node "spec" -}}
        {{- $node = $node.spec -}}
      {{- end -}}
    {{- end -}}
    {{- $tls = (eq ($node.extraArgs).tls "true") | default $tls -}}
    {{- $scheme = ternary "https" "http" $tls -}}
    {{- /* an explicit spec.port wins over the scheme's default port */ -}}
    {{- $port = $node.port | default (ternary 443 80 $tls) -}}
    {{- $route = dig "http.pathPrefix" $route ($node.extraArgs | default dict) -}}
  {{- end -}}
  {{- printf "%s://%s.%s.svc:%d%s" $scheme $host $namespace (int $port) $route -}}
{{- end -}}
|
||||
|
||||
{{- /*
Read endpoint as a YAML dict: the vmsingle URL, else the vmcluster vmselect URL with
the tenant path, else Values.externalVM.read as is. Empty dict when none applies.
Note: writes "appKey" into the context it is given.
*/ -}}
{{- define "vm.read.endpoint" -}}
  {{- $scope := . -}}
  {{- $vals := (.helm).Values | default .Values -}}
  {{- $result := dict -}}
  {{- if $vals.vmsingle.enabled -}}
    {{- $_ := set $scope "appKey" "vmsingle" -}}
    {{- $_ := set $result "url" (include "vm.url" $scope) -}}
  {{- else if $vals.vmcluster.enabled -}}
    {{- $_ := set $scope "appKey" (list "vmcluster" "vmselect") -}}
    {{- $base := include "vm.url" $scope | trimSuffix "/" -}}
    {{- $tenantID := $vals.tenant | default 0 | int -}}
    {{- $_ := set $result "url" (printf "%s/select/%d/prometheus" $base $tenantID) -}}
  {{- else if $vals.externalVM.read.url -}}
    {{- $result = $vals.externalVM.read -}}
  {{- end -}}
  {{- $result | toYaml -}}
{{- end }}
|
||||
|
||||
{{- /*
Write endpoint as a YAML dict: the vmsingle URL plus /api/v1/write, else the vmcluster
vminsert URL with the tenant path, else Values.externalVM.write as is. Empty dict when
none applies.
Note: writes "appKey" into the context it is given.
*/ -}}
{{- define "vm.write.endpoint" -}}
  {{- $scope := . -}}
  {{- $vals := (.helm).Values | default .Values -}}
  {{- $result := dict -}}
  {{- if $vals.vmsingle.enabled -}}
    {{- $_ := set $scope "appKey" "vmsingle" -}}
    {{- $base := include "vm.url" $scope | trimSuffix "/" -}}
    {{- $_ := set $result "url" (printf "%s/api/v1/write" $base) -}}
  {{- else if $vals.vmcluster.enabled -}}
    {{- $_ := set $scope "appKey" (list "vmcluster" "vminsert") -}}
    {{- $base := include "vm.url" $scope | trimSuffix "/" -}}
    {{- $tenantID := $vals.tenant | default 0 | int -}}
    {{- $_ := set $result "url" (printf "%s/insert/%d/prometheus/api/v1/write" $base $tenantID) -}}
  {{- else if $vals.externalVM.write.url -}}
    {{- $result = $vals.externalVM.write -}}
  {{- end -}}
  {{- $result | toYaml -}}
{{- end -}}
|
||||
|
||||
{{- /*
VMAlert remotes.
Emits a YAML dict with:
  remoteWrite        - the write endpoint, or the vmagent URL when vmalert.remoteWriteVMAgent is set
  remoteRead         - the read endpoint
  datasource         - same value as remoteRead
  notifierConfigRef  - when vmalert.additionalNotifierConfigs is set
  notifiers          - otherwise, one URL per alertmanager replica when alertmanager is enabled
*/ -}}
{{- define "vm.alert.remotes" -}}
  {{- $Values := (.helm).Values | default .Values -}}
  {{- $remotes := dict -}}
  {{- $fullname := include "victoria-metrics-k8s-stack.fullname" . -}}
  {{- /* each endpoint helper writes appKey into its context, so give every call a fresh one */ -}}
  {{- $writeCtx := dict "helm" . -}}
  {{- $remoteWrite := include "vm.write.endpoint" $writeCtx | fromYaml -}}
  {{- if $Values.vmalert.remoteWriteVMAgent -}}
    {{- $agentCtx := dict "helm" . "appKey" "vmagent" -}}
    {{- $remoteWrite = dict "url" (printf "%s/api/v1/write" (include "vm.url" $agentCtx)) -}}
  {{- end -}}
  {{- $readCtx := dict "helm" . -}}
  {{- $remoteRead := include "vm.read.endpoint" $readCtx | fromYaml -}}
  {{- $_ := set $remotes "remoteWrite" $remoteWrite -}}
  {{- $_ := set $remotes "remoteRead" $remoteRead -}}
  {{- $_ := set $remotes "datasource" $remoteRead -}}
  {{- if $Values.vmalert.additionalNotifierConfigs -}}
    {{- $configName := printf "%s-vmalert-additional-notifier" $fullname -}}
    {{- $notifierConfigRef := dict "name" $configName "key" "notifier-configs.yaml" -}}
    {{- $_ := set $remotes "notifierConfigRef" $notifierConfigRef -}}
  {{- else if $Values.alertmanager.enabled -}}
    {{- $notifiers := list -}}
    {{- /* read through $Values like the rest of this helper, nil-safe when alertmanager.spec is absent */ -}}
    {{- $amSpec := ($Values.alertmanager).spec | default dict -}}
    {{- $appSecure := not (empty ($amSpec.webConfig).tls_server_config) -}}
    {{- $amCtx := dict "helm" . "appKey" "alertmanager" "appSecure" $appSecure "appRoute" $amSpec.routePrefix -}}
    {{- $alertManagerReplicas := $amSpec.replicaCount | default 1 | int -}}
    {{- range until $alertManagerReplicas -}}
      {{- $_ := set $amCtx "appIdx" . -}}
      {{- $notifiers = append $notifiers (dict "url" (include "vm.url" $amCtx)) -}}
    {{- end -}}
    {{- $_ := set $remotes "notifiers" $notifiers -}}
  {{- end -}}
  {{- toYaml $remotes -}}
{{- end -}}
|
||||
|
||||
{{- /*
VMAlert templates.
Emits a YAML dict with a single `configMaps` list: vmalert.spec.configMaps, plus
"<fullname>-vmalert-extra-tpl" when vmalert.templateFiles is set; empty entries removed.
*/ -}}
{{- define "vm.alert.templates" -}}
  {{- $vals := (.helm).Values | default .Values -}}
  {{- $maps := $vals.vmalert.spec.configMaps | default list -}}
  {{- if $vals.vmalert.templateFiles -}}
    {{- $maps = append $maps (printf "%s-vmalert-extra-tpl" (include "victoria-metrics-k8s-stack.fullname" .)) -}}
  {{- end -}}
  {{- dict "configMaps" (compact $maps) | toYaml -}}
{{- end -}}
|
||||
|
||||
{{- /*
Global license settings as a YAML dict.
Taken from Values.global.license: with `key` set, `keyRef` is dropped; otherwise with
`keyRef.name` set, `key` is dropped; otherwise an empty dict is emitted.
*/ -}}
{{- define "vm.license.global" -}}
  {{- $Values := (.helm).Values | default .Values -}}
  {{- /* default before copying so a missing global/license section yields an empty dict */ -}}
  {{- $license := deepCopy (($Values.global).license | default dict) -}}
  {{- if $license.key -}}
    {{- $_ := unset $license "keyRef" -}}
  {{- else if ($license.keyRef).name -}}
    {{- /* parenthesised access keeps this nil-safe when keyRef is absent */ -}}
    {{- $_ := unset $license "key" -}}
  {{- else -}}
    {{- $license = dict -}}
  {{- end -}}
  {{- toYaml $license -}}
{{- end -}}
|
||||
|
||||
{{- /*
VMAlert spec.
Merges, lowest to highest precedence:
  - defaults built here (extraArgs, global license),
  - computed configMaps ("vm.alert.templates"),
  - computed remotes/notifiers ("vm.alert.remotes"),
  - the user's vmalert.spec.
`notifiers` and `configMaps` are combined as lists instead of being overwritten.
The result is rendered through `tpl`. Fails when no notifier of any kind is configured
and extraArgs."notifier.blackhole" is not set.
*/ -}}
{{- define "vm.alert.spec" -}}
  {{- $Values := (.helm).Values | default .Values -}}
  {{- $extraArgs := dict "remoteWrite.disablePathAppend" "true" -}}
  {{- if $Values.vmalert.templateFiles -}}
    {{- $ruleTmpl := printf "/etc/vm/configs/%s-vmalert-extra-tpl/*.tmpl" (include "victoria-metrics-k8s-stack.fullname" .) -}}
    {{- $_ := set $extraArgs "rule.templates" $ruleTmpl -}}
  {{- end -}}
  {{- $vmAlertRemotes := include "vm.alert.remotes" . | fromYaml -}}
  {{- $vmAlertTemplates := include "vm.alert.templates" . | fromYaml -}}
  {{- $spec := dict "extraArgs" $extraArgs -}}
  {{- with (include "vm.license.global" .) -}}
    {{- $_ := set $spec "license" (fromYaml .) -}}
  {{- end -}}
  {{- /* generated notifiers first, then the user's */ -}}
  {{- with concat ($vmAlertRemotes.notifiers | default list) ($Values.vmalert.spec.notifiers | default list) -}}
    {{- $_ := set $vmAlertRemotes "notifiers" . -}}
  {{- end -}}
  {{- /*
  "notifiers" and "configMaps" are left out of the user spec: both computed lists already
  contain the user's entries, and mergeOverwrite would otherwise replace them with the
  user's list alone (losing the extra-templates ConfigMap).
  */ -}}
  {{- $userSpec := deepCopy (omit $Values.vmalert.spec "notifiers" "configMaps") -}}
  {{- $spec = $userSpec | mergeOverwrite $vmAlertRemotes | mergeOverwrite $vmAlertTemplates | mergeOverwrite $spec -}}
  {{- if not (or (hasKey $spec "notifier") (hasKey $spec "notifiers") (hasKey $spec "notifierConfigRef") (hasKey $spec.extraArgs "notifier.blackhole")) -}}
    {{- fail "Neither `notifier`, `notifiers` nor `notifierConfigRef` is set for vmalert. If it's intentionally please consider setting `.vmalert.spec.extraArgs.['notifier.blackhole']` to `'true'`" -}}
  {{- end -}}
  {{- /* $spec already holds the merged result; no need to run the merge a second time */ -}}
  {{- tpl (toYaml $spec) . -}}
{{- end }}
|
||||
|
||||
{{- /* VM Agent remoteWrites */ -}}
|
||||
{{- define "vm.agent.remote.write" -}}
  {{- /*
  Render, as YAML, a dict with one `remoteWrite` list for VMAgent.
  The list starts from `vmagent.additionalRemoteWrites` (or empty). When
  vmsingle or vmcluster is enabled, or `externalVM.write.url` is set, the
  result of "vm.write.endpoint" is appended.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $targets := $values.vmagent.additionalRemoteWrites | default list -}}
  {{- if or $values.vmsingle.enabled $values.vmcluster.enabled $values.externalVM.write.url -}}
    {{- $endpoint := include "vm.write.endpoint" (dict "helm" .) | fromYaml -}}
    {{- $targets = append $targets $endpoint -}}
  {{- end -}}
  {{- dict "remoteWrite" $targets | toYaml -}}
{{- end -}}
|
||||
|
||||
{{- /* VMAgent spec */ -}}
|
||||
{{- define "vm.agent.spec" -}}
  {{- /*
  Build the VMAgent spec and render it as YAML (passed through `tpl`).
  Starts from the output of "vm.agent.remote.write", adds `license` when
  "vm.license.global" renders anything, then lets a deep copy of
  `vmagent.spec` from values override the generated keys.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $generated := include "vm.agent.remote.write" . | fromYaml -}}
  {{- $licenseYaml := include "vm.license.global" . -}}
  {{- if $licenseYaml -}}
    {{- $_ := set $generated "license" (fromYaml $licenseYaml) -}}
  {{- end -}}
  {{- $merged := mergeOverwrite $generated (deepCopy $values.vmagent.spec) -}}
  {{- tpl (toYaml $merged) . -}}
{{- end }}
|
||||
|
||||
{{- /* VMAuth spec */ -}}
|
||||
{{- define "vm.auth.spec" -}}
  {{- /*
  Build the VMAuth spec and render it as YAML (passed through `tpl`).

  One `unauthorizedAccessConfig` entry (src_paths + url_prefix) is generated
  per backend:
  - vmsingle enabled: one entry for the vmsingle URL;
  - else vmcluster enabled: one entry for vminsert ("/insert/.*") and one
    for vmselect ("/select/.*");
  - else external read/write URLs: one entry per URL that is set, with the
    "/api/..." tail stripped from the source path.
  Generated entries come first, followed by any entries already present in
  `vmauth.spec.unauthorizedAccessConfig`. `license` is added when
  "vm.license.global" renders anything.

  The values spec is deep-copied before being modified, so including this
  template more than once does not accumulate duplicate entries in values.

  NOTE(review): `appKey` is still written into the caller's context via
  `set $ctx`; confirm callers do not depend on that key being unset.
  */ -}}
  {{- $ctx := . -}}
  {{- $Values := (.helm).Values | default .Values -}}
  {{- $unauthorizedAccessConfig := list -}}
  {{- if $Values.vmsingle.enabled -}}
    {{- $_ := set $ctx "appKey" (list "vmsingle") -}}
    {{- $url := include "vm.url" $ctx -}}
    {{- $srcPath := clean (printf "%s/.*" (urlParse $url).path) -}}
    {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list $url)) -}}
  {{- else if $Values.vmcluster.enabled -}}
    {{- $_ := set $ctx "appKey" (list "vmcluster" "vminsert") -}}
    {{- $writeUrl := include "vm.url" $ctx -}}
    {{- $writeSrcPath := clean (printf "%s/insert/.*" (urlParse $writeUrl).path) -}}
    {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $writeSrcPath) "url_prefix" (list $writeUrl)) -}}
    {{- $_ := set $ctx "appKey" (list "vmcluster" "vmselect") -}}
    {{- $readUrl := include "vm.url" $ctx -}}
    {{- $readSrcPath := clean (printf "%s/select/.*" (urlParse $readUrl).path) -}}
    {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $readSrcPath) "url_prefix" (list $readUrl)) -}}
  {{- else if or $Values.externalVM.read.url $Values.externalVM.write.url -}}
    {{- with $Values.externalVM.read.url -}}
      {{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" -}}
      {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) -}}
    {{- end -}}
    {{- with $Values.externalVM.write.url -}}
      {{- $srcPath := regexReplaceAll "(.*)/api/.*" (clean (printf "%s/.*" (urlParse .).path)) "${1}" -}}
      {{- $unauthorizedAccessConfig = append $unauthorizedAccessConfig (dict "src_paths" (list $srcPath) "url_prefix" (list .)) -}}
    {{- end -}}
  {{- end -}}
  {{- /* Work on a copy so the chart values stay untouched. */ -}}
  {{- $spec := deepCopy $Values.vmauth.spec -}}
  {{- $_ := set $spec "unauthorizedAccessConfig" (concat $unauthorizedAccessConfig ($spec.unauthorizedAccessConfig | default list)) -}}
  {{- with (include "vm.license.global" .) -}}
    {{- $_ := set $spec "license" (fromYaml .) -}}
  {{- end -}}
  {{- tpl (toYaml $spec) . -}}
{{- end -}}
|
||||
|
||||
{{- /* Alertmanager spec */ -}}
|
||||
{{- define "vm.alertmanager.spec" -}}
  {{- /*
  Build the VMAlertmanager spec and render it as YAML.

  - `configSecret` defaults to "<fullname>-alertmanager" when neither
    `configRawYaml` nor `configSecret` is set in `alertmanager.spec`.
  - `templates` is rebuilt from scratch: the monzo template entry when
    `alertmanager.monzoTemplate.enabled` is set, plus one entry per key of
    `alertmanager.templateFiles`, all pointing at
    "<fullname>-alertmanager-extra-tpl".

  Values are read from `.helm.Values` when present, otherwise from `.Values`.

  NOTE(review): `$spec` is the values map itself, so `configSecret` and
  `templates` are written into the chart values in place. Confirm whether
  other templates rely on that before switching to a deep copy.
  */ -}}
  {{- $Values := (.helm).Values | default .Values -}}
  {{- $fullname := include "victoria-metrics-k8s-stack.fullname" . -}}
  {{- $spec := $Values.alertmanager.spec -}}
  {{- if and (not $Values.alertmanager.spec.configRawYaml) (not $Values.alertmanager.spec.configSecret) -}}
    {{- $_ := set $spec "configSecret" (printf "%s-alertmanager" $fullname) -}}
  {{- end -}}
  {{- $templates := list -}}
  {{- if $Values.alertmanager.monzoTemplate.enabled -}}
    {{- $monzoConfigMap := printf "%s-alertmanager-monzo-tpl" $fullname -}}
    {{- $templates = append $templates (dict "name" $monzoConfigMap "key" "monzo.tmpl") -}}
  {{- end -}}
  {{- $configMap := printf "%s-alertmanager-extra-tpl" $fullname -}}
  {{- /* Use $Values (not .Values) so a wrapper context carrying `helm` works too. */ -}}
  {{- range $key, $value := ($Values.alertmanager.templateFiles | default dict) -}}
    {{- $templates = append $templates (dict "name" $configMap "key" $key) -}}
  {{- end -}}
  {{- $_ := set $spec "templates" $templates -}}
  {{- toYaml $spec -}}
{{- end -}}
|
||||
|
||||
{{- /* Single spec */ -}}
|
||||
{{- define "vm.single.spec" -}}
  {{- /*
  Build the VMSingle spec and render it as YAML (passed through `tpl`).
  Generated keys: `extraArgs` (with `vmalert.proxyURL` from "vm.url" when
  vmalert is enabled) and `license` when "vm.license.global" renders
  anything. A deep copy of `vmsingle.spec` from values overrides them.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $args := dict -}}
  {{- if $values.vmalert.enabled -}}
    {{- $proxyURL := include "vm.url" (dict "helm" . "appKey" "vmalert") -}}
    {{- $_ := set $args "vmalert.proxyURL" $proxyURL -}}
  {{- end -}}
  {{- $generated := dict "extraArgs" $args -}}
  {{- $licenseYaml := include "vm.license.global" . -}}
  {{- if $licenseYaml -}}
    {{- $_ := set $generated "license" (fromYaml $licenseYaml) -}}
  {{- end -}}
  {{- $merged := mergeOverwrite $generated (deepCopy $values.vmsingle.spec) -}}
  {{- tpl (toYaml $merged) . -}}
{{- end }}
|
||||
|
||||
{{- /* Cluster spec */ -}}
|
||||
{{- define "vm.select.spec" -}}
  {{- /*
  Render, as YAML, a dict with one `extraArgs` map for vmselect.
  When vmalert is enabled the map carries `vmalert.proxyURL`, taken from
  "vm.url" for the vmalert app; otherwise the map is empty.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $args := dict -}}
  {{- if $values.vmalert.enabled -}}
    {{- $proxyURL := include "vm.url" (dict "helm" . "appKey" "vmalert") -}}
    {{- $_ := set $args "vmalert.proxyURL" $proxyURL -}}
  {{- end -}}
  {{- dict "extraArgs" $args | toYaml -}}
{{- end -}}
|
||||
|
||||
{{- define "vm.cluster.spec" -}}
  {{- /*
  Build the VMCluster spec and render it as YAML (passed through `tpl`).
  The generated part is `vmselect`, taken from "vm.select.spec". A deep copy
  of `vmcluster.spec` from values, with `license` added when
  "vm.license.global" renders anything, is merged over it.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $generated := dict "vmselect" (include "vm.select.spec" . | fromYaml) -}}
  {{- $overrides := deepCopy $values.vmcluster.spec -}}
  {{- $licenseYaml := include "vm.license.global" . -}}
  {{- if $licenseYaml -}}
    {{- $_ := set $overrides "license" (fromYaml $licenseYaml) -}}
  {{- end -}}
  {{- tpl (mergeOverwrite $generated $overrides | toYaml) . -}}
{{- end -}}
|
||||
|
||||
{{- define "vm.data.source.enabled" -}}
  {{- /*
  Render "true" when the "victoriametrics-datasource" plugin is usable in
  Grafana, and nothing otherwise. Both conditions must hold:
  - some entry of `grafana.plugins` contains the plugin name;
  - `grafana."grafana.ini".plugins.allow_loading_unsigned_plugins` contains
    the plugin name.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $grafanaValues := $values.grafana -}}
  {{- $pluginListed := false -}}
  {{- with $grafanaValues.plugins -}}
    {{- range $plugin := . -}}
      {{- if contains "victoriametrics-datasource" $plugin -}}
        {{- $pluginListed = true -}}
      {{- end -}}
    {{- end -}}
  {{- end -}}
  {{- $unsignedList := ((index $grafanaValues "grafana.ini").plugins).allow_loading_unsigned_plugins | default "" -}}
  {{- $unsignedAllowed := contains "victoriametrics-datasource" $unsignedList -}}
  {{- if and $pluginListed $unsignedAllowed -}}
    {{- "true" -}}
  {{- end -}}
{{- end -}}
|
||||
|
||||
{{- /* Datasources */ -}}
|
||||
{{- define "vm.data.sources" -}}
  {{- /*
  Render the list of Grafana datasources as YAML.

  The list starts from `grafana.additionalDataSources`. When vmsingle or
  vmcluster is enabled, each entry of `grafana.sidecar.datasources.default`
  is given a `type` (falling back to `grafana.defaultDatasourceType`) and the
  read endpoint URL. Entries of type "victoriametrics-datasource" are kept
  only when "vm.data.source.enabled" renders a non-empty value.

  With `createVMReplicasDatasources` set, every kept entry is additionally
  cloned once per vmsingle replica, with a per-replica URL, a "-<index>"
  name suffix and `isDefault: false`.

  NOTE(review): the default datasource entries are modified in place
  (`type`, `url`), so the chart values are changed by this template.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $grafanaValues := $values.grafana -}}
  {{- $result := $values.grafana.additionalDataSources | default list -}}
  {{- $vmType := "victoriametrics-datasource" -}}
  {{- $vmTypeAllowed := not (empty (include "vm.data.source.enabled" .)) -}}
  {{- if or $values.vmsingle.enabled $values.vmcluster.enabled -}}
    {{- $endpointCtx := dict "helm" . -}}
    {{- $mainEndpoint := include "vm.read.endpoint" $endpointCtx | fromYaml -}}
    {{- $defaults := list -}}
    {{- range $entry := $grafanaValues.sidecar.datasources.default -}}
      {{- if not $entry.type -}}
        {{- $_ := set $entry "type" $values.grafana.defaultDatasourceType -}}
      {{- end -}}
      {{- if or (ne $entry.type $vmType) $vmTypeAllowed -}}
        {{- $_ := set $entry "url" $mainEndpoint.url -}}
        {{- $defaults = append $defaults $entry -}}
      {{- end -}}
    {{- end -}}
    {{- $result = concat $result $defaults -}}
    {{- if and $grafanaValues.sidecar.datasources.createVMReplicasDatasources $defaults -}}
      {{- range $replica := until (int $values.vmsingle.spec.replicaCount) -}}
        {{- $_ := set $endpointCtx "appIdx" $replica -}}
        {{- $replicaEndpoint := include "vm.read.endpoint" $endpointCtx | fromYaml -}}
        {{- range $entry := $defaults -}}
          {{- $clone := deepCopy $entry -}}
          {{- $_ := set $clone "url" $replicaEndpoint.url -}}
          {{- $_ := set $clone "name" (printf "%s-%d" $clone.name $replica) -}}
          {{- $_ := set $clone "isDefault" false -}}
          {{- $result = append $result $clone -}}
        {{- end -}}
      {{- end -}}
    {{- end -}}
  {{- end -}}
  {{- toYaml $result -}}
{{- end }}
|
||||
|
||||
{{- /* VMRule name */ -}}
|
||||
{{- define "victoria-metrics-k8s-stack.rulegroup.name" -}}
  {{- /*
  Render the VMRule object name: the chart fullname, a dash, and the
  context's `name` with every underscore removed.
  */ -}}
  {{- $prefix := include "victoria-metrics-k8s-stack.fullname" . -}}
  {{- $suffix := replace "_" "" .name -}}
  {{- printf "%s-%s" $prefix $suffix -}}
{{- end -}}
|
||||
|
||||
{{- /* VMRule labels */ -}}
|
||||
{{- define "victoria-metrics-k8s-stack.rulegroup.labels" -}}
  {{- /*
  Render the labels for a VMRule object as YAML: the chart's common labels,
  an `app` label set to the chart name, and then `defaultRules.labels` from
  values merged on top (values win on conflicts).
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $base := include "victoria-metrics-k8s-stack.labels" . | fromYaml -}}
  {{- $_ := set $base "app" (include "victoria-metrics-k8s-stack.name" .) -}}
  {{- mergeOverwrite $base (deepCopy $values.defaultRules.labels) | toYaml -}}
{{- end }}
|
||||
|
||||
{{- /* VMRule key */ -}}
|
||||
{{- define "victoria-metrics-k8s-stack.rulegroup.key" -}}
  {{- /*
  Turn the context's `name` into a lower-camel-case key: split it on "-",
  "_" and ".", drop the parts "exporter" and "rules", re-join with "-",
  then apply `camelcase` and `untitle`.
  */ -}}
  {{- $parts := regexSplit "[-_.]" .name -1 -}}
  {{- $kept := without $parts "exporter" "rules" -}}
  {{- join "-" $kept | camelcase | untitle -}}
{{- end -}}
|
||||
|
||||
{{- /* VMAlertmanager name */ -}}
|
||||
{{- define "victoria-metrics-k8s-stack.alertmanager.name" -}}
  {{- /*
  Render the VMAlertmanager name: `alertmanager.name` from values when set,
  otherwise "vmalertmanager-<fullname>" cut to 63 characters with any
  trailing dash removed.
  */ -}}
  {{- $values := (.helm).Values | default .Values -}}
  {{- $generated := printf "%s-%s" "vmalertmanager" (include "victoria-metrics-k8s-stack.fullname" .) | trunc 63 | trimSuffix "-" -}}
  {{- default $generated $values.alertmanager.name -}}
{{- end -}}
|
||||
@@ -1,4 +0,0 @@
|
||||
{{- /*
Render every entry of `.Values.extraObjects` as its own YAML document.
Each entry is serialised with toYaml and then evaluated with `tpl` against
the root context ($), so template expressions inside an entry are expanded.
*/ -}}
{{ range .Values.extraObjects }}
|
||||
---
|
||||
{{ tpl (toYaml .) $ }}
|
||||
{{ end }}
|
||||
@@ -1,121 +0,0 @@
|
||||
{{- if .Values.defaultRules.create }}
|
||||
|
||||
{{- /*
|
||||
Default rules alias
|
||||
*/}}
|
||||
{{- $defaultRules := .Values.defaultRules -}}
|
||||
|
||||
{{- /*
|
||||
Exact rules overrides
|
||||
*/}}
|
||||
{{- $exactRules := index $defaultRules "rules" | default dict }}
|
||||
|
||||
{{- /*
|
||||
Create custom template context
|
||||
*/}}
|
||||
{{- $ctx := (dict "Values" (deepCopy .Values) "Release" (deepCopy .Release) "Chart" (deepCopy .Chart) "Template" (deepCopy .Template)) -}}
|
||||
|
||||
{{- /*
|
||||
Loop through all rule files that were crafted with hack/sync_rules.py
|
||||
*/}}
|
||||
{{- range $groupFile, $_ := .Files.Glob "files/rules/**.yaml" -}}
|
||||
|
||||
{{- /*
|
||||
Get group name from file
|
||||
*/}}
|
||||
{{- $groupBase := base $groupFile -}}
|
||||
{{- $groupNameRaw := trimSuffix (ext $groupBase) $groupBase -}}
|
||||
|
||||
{{- /*
|
||||
Create context for templating
|
||||
*/}}
|
||||
{{- $_ := set $ctx "name" $groupNameRaw -}}
|
||||
|
||||
{{- /*
|
||||
Create sanitized group name retrieved from file
|
||||
*/}}
|
||||
{{- $groupName := include "victoria-metrics-k8s-stack.rulegroup.key" $ctx -}}
|
||||
|
||||
{{- /*
|
||||
Merge common group with a group data of a current iteration
|
||||
*/}}
|
||||
|
||||
{{- $group := mergeOverwrite (deepCopy (dig "group" (default dict) $defaultRules)) (dig "groups" $groupName (default dict) $defaultRules) -}}
|
||||
|
||||
{{- /*
|
||||
Get group data from file
|
||||
*/}}
|
||||
{{- $groupCtx := mergeOverwrite (deepCopy $ctx) $group }}
|
||||
{{- $groupData := fromYaml (tpl ($.Files.Get $groupFile) $groupCtx) -}}
|
||||
|
||||
{{- /*
|
||||
Save rules spec from file
|
||||
*/}}
|
||||
{{- $rulesSpec := $groupData.rules -}}
|
||||
|
||||
{{- /*
|
||||
Delete rules from group
|
||||
*/}}
|
||||
{{- $_ := unset $groupData "rules" -}}
|
||||
|
||||
{{- /*
|
||||
Save condition for group from file
|
||||
*/}}
|
||||
{{- $groupCondition := (eq $groupData.condition "true") -}}
|
||||
|
||||
{{- /*
|
||||
Delete condition from group
|
||||
*/}}
|
||||
{{- $_ := unset $groupData "condition" -}}
|
||||
|
||||
{{- /*
|
||||
Merge group spec
|
||||
*/}}
|
||||
{{- $groupSpec := mergeOverwrite (deepCopy $groupData) (dig "spec" (default dict) $group) -}}
|
||||
|
||||
{{- /*
|
||||
Filter out ignored rules
|
||||
*/}}
|
||||
{{- $commonRule := dig "rule" (default dict) $defaultRules }}
|
||||
{{- $commonInGroupRule := dig "rules" (default dict) $group }}
|
||||
{{- $filteredRulesSpec := default list }}
|
||||
{{- range $_, $ruleSpec := $rulesSpec }}
|
||||
{{- $ruleName := $ruleSpec.alert | default "" }}
|
||||
{{- $ruleKey := (hasKey $ruleSpec "record" | ternary "recording" "alerting") -}}
|
||||
{{- $ruleCondition := (eq $ruleSpec.condition "true") }}
|
||||
{{- $_ := unset $ruleSpec "condition" }}
|
||||
{{- $exactRule := index $exactRules $ruleName | default dict }}
|
||||
{{- $defaultRule := deepCopy (index $defaultRules $ruleKey) }}
|
||||
{{- $resultRule := mergeOverwrite (deepCopy $commonRule) $defaultRule $commonInGroupRule $exactRule }}
|
||||
{{- if (and (dig "create" true $resultRule) $ruleCondition) }}
|
||||
{{- $ruleSpec := mergeOverwrite (deepCopy $ruleSpec) (dig "spec" (default dict) $resultRule) }}
|
||||
{{- $filteredRulesSpec = append $filteredRulesSpec $ruleSpec }}
|
||||
{{- end }}
|
||||
{{- end -}}
|
||||
{{- $rulesSpec = $filteredRulesSpec }}
|
||||
|
||||
{{- /*
|
||||
Check if group is enabled
|
||||
*/}}
|
||||
{{- if (and $rulesSpec (dig "create" true $group) $groupCondition) }}
|
||||
---
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMRule
|
||||
metadata:
|
||||
name: {{ include "victoria-metrics-k8s-stack.rulegroup.name" $ctx }}
|
||||
{{- with (include "victoria-metrics-k8s-stack.rulegroup.labels" $ctx) }}
|
||||
labels: {{ . | nindent 4 }}
|
||||
{{- end }}
|
||||
{{- with $.Values.defaultRules.annotations }}
|
||||
annotations: {{ toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- {{ toYaml $groupSpec | indent 4 | trim }}
|
||||
rules:
|
||||
{{- range $ruleSpec := $rulesSpec }}
|
||||
- {{ toYaml $ruleSpec | indent 6 | trim }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -1,26 +0,0 @@
|
||||
### plans
|
||||
|
||||
* [x] VMCluster
|
||||
* [x] VMSingle
|
||||
* [x] VMAgent
|
||||
* [x] VMAlert
|
||||
* [x] AlertManager
|
||||
* [x] Annotations
|
||||
* [x] ServiceScrapes
|
||||
* [x] Nodeexporter
|
||||
* [x] Grafana
|
||||
* [x] kube-state-metrics
|
||||
* [x] kube-mixin
|
||||
* [x] core-dns
|
||||
* [x] Grafana DS
|
||||
* [x] Dashboards
|
||||
* [x] Nodeexporter
|
||||
* [x] kube-state-metrics
|
||||
* [x] kube-mixin
|
||||
* [x] Rules
|
||||
* [x] kube-mixin
|
||||
* [x] kube-prometheus
|
||||
* [x] victoria-metrics
|
||||
* [ ] ServiceAccounts stuff
|
||||
* [ ] SelectorOverride for ServiceScrapes
|
||||
* [ ] helm hook for uninstall crd objects before chart remove
|
||||
@@ -1,38 +0,0 @@
|
||||
vmagent:
|
||||
spec:
|
||||
volumes:
|
||||
- hostPath:
|
||||
path: /var/lib/minikube/certs/etcd
|
||||
type: DirectoryOrCreate
|
||||
name: etcd-certs
|
||||
volumeMounts:
|
||||
- mountPath: /var/lib/minikube/certs/etcd
|
||||
name: etcd-certs
|
||||
kubeScheduler:
|
||||
spec:
|
||||
endpoints:
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
port: http-metrics
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecureSkipVerify: true
|
||||
kubeControllerManager:
|
||||
spec:
|
||||
endpoints:
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
port: http-metrics
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
|
||||
insecureSkipVerify: true
|
||||
kubeEtcd:
|
||||
spec:
|
||||
endpoints:
|
||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||
port: http-metrics
|
||||
scheme: https
|
||||
tlsConfig:
|
||||
caFile: /var/lib/minikube/certs/etcd/ca.crt
|
||||
certFile: /var/lib/minikube/certs/etcd/peer.crt
|
||||
keyFile: /var/lib/minikube/certs/etcd/peer.key
|
||||
File diff suppressed because it is too large
Load Diff
7
packages/system/monitoring/templates/alerts.yaml
Normal file
7
packages/system/monitoring/templates/alerts.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
{{- /*
Emit every chart file matching "alerts/*.yaml" as a separate YAML document.
Each document is preceded by a "# from: <path>" comment naming the source
file; the file content is written out as-is (converted with toString).
*/ -}}
{{- $files := .Files.Glob "alerts/*.yaml" -}}
|
||||
{{- range $path, $file := $files }}
|
||||
---
|
||||
# from: {{ $path }}
|
||||
{{ toString $file }}
|
||||
|
||||
{{- end -}}
|
||||
@@ -82,3 +82,275 @@ victoria-metrics-k8s-stack:
|
||||
enabled: false
|
||||
alertmanager:
|
||||
name: vmalertmanager-alertmanager
|
||||
|
||||
kube-state-metrics:
|
||||
rbac:
|
||||
extraRules:
|
||||
- apiGroups:
|
||||
- source.toolkit.fluxcd.io
|
||||
- kustomize.toolkit.fluxcd.io
|
||||
- helm.toolkit.fluxcd.io
|
||||
- notification.toolkit.fluxcd.io
|
||||
- image.toolkit.fluxcd.io
|
||||
resources:
|
||||
- gitrepositories
|
||||
- buckets
|
||||
- helmrepositories
|
||||
- helmcharts
|
||||
- ocirepositories
|
||||
- kustomizations
|
||||
- helmreleases
|
||||
- alerts
|
||||
- providers
|
||||
- receivers
|
||||
- imagerepositories
|
||||
- imagepolicies
|
||||
- imageupdateautomations
|
||||
verbs: [ "list", "watch" ]
|
||||
customResourceState:
|
||||
enabled: true
|
||||
config:
|
||||
spec:
|
||||
resources:
|
||||
- groupVersionKind:
|
||||
group: kustomize.toolkit.fluxcd.io
|
||||
version: v1
|
||||
kind: Kustomization
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux Kustomization resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
revision: [ status, lastAppliedRevision ]
|
||||
source_name: [ spec, sourceRef, name ]
|
||||
- groupVersionKind:
|
||||
group: helm.toolkit.fluxcd.io
|
||||
version: v2
|
||||
kind: HelmRelease
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux HelmRelease resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
revision: [ status, history, "0", chartVersion ]
|
||||
chart_name: [ status, history, "0", chartName ]
|
||||
chart_app_version: [ status, history, "0", appVersion ]
|
||||
chart_ref_name: [ spec, chartRef, name ]
|
||||
chart_source_name: [ spec, chart, spec, sourceRef, name ]
|
||||
- groupVersionKind:
|
||||
group: source.toolkit.fluxcd.io
|
||||
version: v1
|
||||
kind: GitRepository
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux GitRepository resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
revision: [ status, artifact, revision ]
|
||||
url: [ spec, url ]
|
||||
- groupVersionKind:
|
||||
group: source.toolkit.fluxcd.io
|
||||
version: v1beta2
|
||||
kind: Bucket
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux Bucket resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
revision: [ status, artifact, revision ]
|
||||
endpoint: [ spec, endpoint ]
|
||||
bucket_name: [ spec, bucketName ]
|
||||
- groupVersionKind:
|
||||
group: source.toolkit.fluxcd.io
|
||||
version: v1
|
||||
kind: HelmRepository
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux HelmRepository resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
revision: [ status, artifact, revision ]
|
||||
url: [ spec, url ]
|
||||
- groupVersionKind:
|
||||
group: source.toolkit.fluxcd.io
|
||||
version: v1
|
||||
kind: HelmChart
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux HelmChart resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
revision: [ status, artifact, revision ]
|
||||
chart_name: [ spec, chart ]
|
||||
chart_version: [ spec, version ]
|
||||
- groupVersionKind:
|
||||
group: source.toolkit.fluxcd.io
|
||||
version: v1beta2
|
||||
kind: OCIRepository
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux OCIRepository resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
revision: [ status, artifact, revision ]
|
||||
url: [ spec, url ]
|
||||
- groupVersionKind:
|
||||
group: notification.toolkit.fluxcd.io
|
||||
version: v1beta3
|
||||
kind: Alert
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux Alert resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
suspended: [ spec, suspend ]
|
||||
- groupVersionKind:
|
||||
group: notification.toolkit.fluxcd.io
|
||||
version: v1beta3
|
||||
kind: Provider
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux Provider resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
suspended: [ spec, suspend ]
|
||||
- groupVersionKind:
|
||||
group: notification.toolkit.fluxcd.io
|
||||
version: v1
|
||||
kind: Receiver
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux Receiver resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
webhook_path: [ status, webhookPath ]
|
||||
- groupVersionKind:
|
||||
group: image.toolkit.fluxcd.io
|
||||
version: v1beta2
|
||||
kind: ImageRepository
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux ImageRepository resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
image: [ spec, image ]
|
||||
- groupVersionKind:
|
||||
group: image.toolkit.fluxcd.io
|
||||
version: v1beta2
|
||||
kind: ImagePolicy
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux ImagePolicy resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
source_name: [ spec, imageRepositoryRef, name ]
|
||||
- groupVersionKind:
|
||||
group: image.toolkit.fluxcd.io
|
||||
version: v1beta2
|
||||
kind: ImageUpdateAutomation
|
||||
metricNamePrefix: gotk
|
||||
metrics:
|
||||
- name: "resource_info"
|
||||
help: "The current state of a Flux ImageUpdateAutomation resource."
|
||||
each:
|
||||
type: Info
|
||||
info:
|
||||
labelsFromPath:
|
||||
name: [ metadata, name ]
|
||||
labelsFromPath:
|
||||
exported_namespace: [ metadata, namespace ]
|
||||
ready: [ status, conditions, "[type=Ready]", status ]
|
||||
suspended: [ spec, suspend ]
|
||||
source_name: [ spec, sourceRef, name ]
|
||||
|
||||
Reference in New Issue
Block a user