This commit is contained in:
Floppy Disk
2025-02-12 11:45:16 +03:00
parent d234d58a16
commit 84a23947b0
2 changed files with 11 additions and 11 deletions

View File

@@ -1,7 +1,7 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
-name: {{ .Release.Name }}-pod-scrape
+name: {{ .Release.Name }}
spec:
podMetricsEndpoints:
- port: tcp-prometheus
@@ -20,13 +20,13 @@ spec:
- sourceLabels: [__meta_kubernetes_pod_name]
separator: ;
regex: (.*)
-targetLabel: kubernetes_pod_name
+targetLabel: pod
replacement: $1
action: replace
- sourceLabels: [__meta_kubernetes_pod_node_name]
separator: ;
regex: (.*)
-targetLabel: node_name
+targetLabel: node
replacement: $1
action: replace
- sourceLabels: [__meta_kubernetes_pod_host_ip]

View File

@@ -24,7 +24,7 @@ spec:
severity: warning
annotations:
summary: 'Kafka under replicated partitions'
-description: 'There are {{ $value }} under replicated partitions on {{ $labels.kubernetes_pod_name }}'
+description: 'There are {{ $value }} under replicated partitions on {{ $labels.pod }}'
- alert: AbnormalControllerState
expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1
for: 10s
@@ -48,7 +48,7 @@ spec:
severity: warning
annotations:
summary: 'Kafka under min ISR partitions'
-description: 'There are {{ $value }} partitions under the min ISR on {{ $labels.kubernetes_pod_name }}'
+description: 'There are {{ $value }} partitions under the min ISR on {{ $labels.pod }}'
- alert: OfflineLogDirectoryCount
expr: kafka_log_logmanager_offlinelogdirectorycount > 0
for: 10s
@@ -56,15 +56,15 @@ spec:
severity: warning
annotations:
summary: 'Kafka offline log directories'
-description: 'There are {{ $value }} offline log directories on {{ $labels.kubernetes_pod_name }}'
+description: 'There are {{ $value }} offline log directories on {{ $labels.pod }}'
- alert: ScrapeProblem
-expr: up{kubernetes_namespace!~"openshift-.+",kubernetes_pod_name=~".+-kafka-[0-9]+"} == 0
+expr: up{kubernetes_namespace!~"openshift-.+",pod=~".+-kafka-[0-9]+"} == 0
for: 3m
labels:
severity: major
annotations:
-summary: 'Prometheus unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }}'
-description: 'Prometheus was unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }} for more than 3 minutes'
+summary: 'Prometheus unable to scrape metrics from {{ $labels.pod }}/{{ $labels.instance }}'
+description: 'Prometheus was unable to scrape metrics from {{ $labels.pod }}/{{ $labels.instance }} for more than 3 minutes'
- alert: ClusterOperatorContainerDown
expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"})
for: 1m
@@ -98,7 +98,7 @@ spec:
severity: warning
annotations:
summary: 'Zookeeper average request latency'
-description: 'The average request latency is {{ $value }} on {{ $labels.kubernetes_pod_name }}'
+description: 'The average request latency is {{ $value }} on {{ $labels.pod }}'
- alert: OutstandingRequests
expr: zookeeper_outstandingrequests > 10
for: 10s
@@ -106,7 +106,7 @@ spec:
severity: warning
annotations:
summary: 'Zookeeper outstanding requests'
-description: 'There are {{ $value }} outstanding requests on {{ $labels.kubernetes_pod_name }}'
+description: 'There are {{ $value }} outstanding requests on {{ $labels.pod }}'
- alert: ZookeeperRunningOutOfSpace
expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data-(.+)-zookeeper-[0-9]+"} < 5368709120
for: 10s