mirror of
https://github.com/outbackdingo/cozystack.git
synced 2026-01-27 18:18:41 +00:00
add kafka monitoring
This commit is contained in:
2940
dashboards/kafka/strimzi-kafka.json
Normal file
2940
dashboards/kafka/strimzi-kafka.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -80,6 +80,7 @@ modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//main/namespace/
|
||||
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//main/capacity-planning/capacity-planning.json
|
||||
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//flux/flux-control-plane.json
|
||||
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//flux/flux-stats.json
|
||||
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kafka/strimzi-kafka.json
|
||||
EOT
|
||||
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ type: application
|
||||
# This is the chart version. This version number should be incremented each time you make changes
|
||||
# to the chart and its templates, including the app version.
|
||||
# Versions are expected to follow Semantic Versioning (https://semver.org/)
|
||||
version: 0.3.1
|
||||
version: 0.3.2
|
||||
|
||||
# This is the version number of the application being deployed. This version number should be
|
||||
# incremented each time you make changes to the application. Versions are not expected to
|
||||
|
||||
@@ -57,6 +57,12 @@ spec:
|
||||
class: {{ . }}
|
||||
{{- end }}
|
||||
deleteClaim: true
|
||||
# Enables the Prometheus JMX exporter for this component (presumably the
# Kafka brokers, since this stanza precedes the zookeeper section - confirm).
# The rule set is read from the kafka-metrics-config.yml entry of the
# release-scoped "-metrics" ConfigMap.
metricsConfig:
  type: "jmxPrometheusExporter"
  valueFrom:
    configMapKeyRef:
      key: "kafka-metrics-config.yml"
      name: "{{ .Release.Name }}-metrics"
|
||||
zookeeper:
|
||||
replicas: {{ .Values.zookeeper.replicas }}
|
||||
storage:
|
||||
@@ -68,6 +74,12 @@ spec:
|
||||
class: {{ . }}
|
||||
{{- end }}
|
||||
deleteClaim: false
|
||||
# ZooKeeper must load its own exporter rules. The release-scoped "-metrics"
# ConfigMap stores them under zookeeper-metrics-config.yml, separately from
# the broker rules in kafka-metrics-config.yml. Pointing ZooKeeper at the
# broker rules would produce none of the zookeeper_* series used by the
# ZooKeeper alerts.
metricsConfig:
  type: jmxPrometheusExporter
  valueFrom:
    configMapKeyRef:
      name: {{ .Release.Name }}-metrics
      key: zookeeper-metrics-config.yml
|
||||
entityOperator:
|
||||
topicOperator: {}
|
||||
userOperator: {}
|
||||
|
||||
198
packages/apps/kafka/templates/metrics-configmap.yaml
Normal file
198
packages/apps/kafka/templates/metrics-configmap.yaml
Normal file
@@ -0,0 +1,198 @@
|
||||
# ConfigMap carrying two Prometheus JMX exporter rule sets, one per data key:
#   kafka-metrics-config.yml     - rules for kafka.* MBeans
#   zookeeper-metrics-config.yml - rules for org.apache.ZooKeeperService MBeans
# The embedded documents are consumed as opaque text, so their content is
# kept exactly as authored.
apiVersion: v1
kind: ConfigMap
metadata:
  name: "{{ .Release.Name }}-metrics"
data:
  kafka-metrics-config.yml: |
    # See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics
    lowercaseOutputName: true
    rules:
    # Special cases and very specific rules
    - pattern: kafka.server<type=(.+), name=(.+), clientId=(.+), topic=(.+), partition=(.*)><>Value
      name: kafka_server_$1_$2
      type: GAUGE
      labels:
        clientId: "$3"
        topic: "$4"
        partition: "$5"
    - pattern: kafka.server<type=(.+), name=(.+), clientId=(.+), brokerHost=(.+), brokerPort=(.+)><>Value
      name: kafka_server_$1_$2
      type: GAUGE
      labels:
        clientId: "$3"
        broker: "$4:$5"
    - pattern: kafka.server<type=(.+), cipher=(.+), protocol=(.+), listener=(.+), networkProcessor=(.+)><>connections
      name: kafka_server_$1_connections_tls_info
      type: GAUGE
      labels:
        cipher: "$2"
        protocol: "$3"
        listener: "$4"
        networkProcessor: "$5"
    - pattern: kafka.server<type=(.+), clientSoftwareName=(.+), clientSoftwareVersion=(.+), listener=(.+), networkProcessor=(.+)><>connections
      name: kafka_server_$1_connections_software
      type: GAUGE
      labels:
        clientSoftwareName: "$2"
        clientSoftwareVersion: "$3"
        listener: "$4"
        networkProcessor: "$5"
    - pattern: "kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+-total):"
      name: kafka_server_$1_$4
      type: COUNTER
      labels:
        listener: "$2"
        networkProcessor: "$3"
    - pattern: "kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+):"
      name: kafka_server_$1_$4
      type: GAUGE
      labels:
        listener: "$2"
        networkProcessor: "$3"
    - pattern: kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+-total)
      name: kafka_server_$1_$4
      type: COUNTER
      labels:
        listener: "$2"
        networkProcessor: "$3"
    - pattern: kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+)
      name: kafka_server_$1_$4
      type: GAUGE
      labels:
        listener: "$2"
        networkProcessor: "$3"
    # Some percent metrics use MeanRate attribute
    # Ex) kafka.server<type=(KafkaRequestHandlerPool), name=(RequestHandlerAvgIdlePercent)><>MeanRate
    - pattern: kafka.(\w+)<type=(.+), name=(.+)Percent\w*><>MeanRate
      name: kafka_$1_$2_$3_percent
      type: GAUGE
    # Generic gauges for percents
    - pattern: kafka.(\w+)<type=(.+), name=(.+)Percent\w*><>Value
      name: kafka_$1_$2_$3_percent
      type: GAUGE
    - pattern: kafka.(\w+)<type=(.+), name=(.+)Percent\w*, (.+)=(.+)><>Value
      name: kafka_$1_$2_$3_percent
      type: GAUGE
      labels:
        "$4": "$5"
    # Generic per-second counters with 0-2 key/value pairs
    - pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*, (.+)=(.+), (.+)=(.+)><>Count
      name: kafka_$1_$2_$3_total
      type: COUNTER
      labels:
        "$4": "$5"
        "$6": "$7"
    - pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*, (.+)=(.+)><>Count
      name: kafka_$1_$2_$3_total
      type: COUNTER
      labels:
        "$4": "$5"
    - pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*><>Count
      name: kafka_$1_$2_$3_total
      type: COUNTER
    # Generic gauges with 0-2 key/value pairs
    - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+), (.+)=(.+)><>Value
      name: kafka_$1_$2_$3
      type: GAUGE
      labels:
        "$4": "$5"
        "$6": "$7"
    - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+)><>Value
      name: kafka_$1_$2_$3
      type: GAUGE
      labels:
        "$4": "$5"
    - pattern: kafka.(\w+)<type=(.+), name=(.+)><>Value
      name: kafka_$1_$2_$3
      type: GAUGE
    # Emulate Prometheus 'Summary' metrics for the exported 'Histogram's.
    # Note that these are missing the '_sum' metric!
    - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+), (.+)=(.+)><>Count
      name: kafka_$1_$2_$3_count
      type: COUNTER
      labels:
        "$4": "$5"
        "$6": "$7"
    - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*), (.+)=(.+)><>(\d+)thPercentile
      name: kafka_$1_$2_$3
      type: GAUGE
      labels:
        "$4": "$5"
        "$6": "$7"
        quantile: "0.$8"
    - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+)><>Count
      name: kafka_$1_$2_$3_count
      type: COUNTER
      labels:
        "$4": "$5"
    - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*)><>(\d+)thPercentile
      name: kafka_$1_$2_$3
      type: GAUGE
      labels:
        "$4": "$5"
        quantile: "0.$6"
    - pattern: kafka.(\w+)<type=(.+), name=(.+)><>Count
      name: kafka_$1_$2_$3_count
      type: COUNTER
    - pattern: kafka.(\w+)<type=(.+), name=(.+)><>(\d+)thPercentile
      name: kafka_$1_$2_$3
      type: GAUGE
      labels:
        quantile: "0.$4"
    # KRaft overall related metrics
    # distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics
    - pattern: "kafka.server<type=raft-metrics><>(.+-total|.+-max):"
      name: kafka_server_raftmetrics_$1
      type: COUNTER
    - pattern: "kafka.server<type=raft-metrics><>(current-state): (.+)"
      name: kafka_server_raftmetrics_$1
      value: 1
      type: UNTYPED
      labels:
        $1: "$2"
    - pattern: "kafka.server<type=raft-metrics><>(.+):"
      name: kafka_server_raftmetrics_$1
      type: GAUGE
    # KRaft "low level" channels related metrics
    # distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics
    - pattern: "kafka.server<type=raft-channel-metrics><>(.+-total|.+-max):"
      name: kafka_server_raftchannelmetrics_$1
      type: COUNTER
    - pattern: "kafka.server<type=raft-channel-metrics><>(.+):"
      name: kafka_server_raftchannelmetrics_$1
      type: GAUGE
    # Broker metrics related to fetching metadata topic records in KRaft mode
    - pattern: "kafka.server<type=broker-metadata-metrics><>(.+):"
      name: kafka_server_brokermetadatametrics_$1
      type: GAUGE
  zookeeper-metrics-config.yml: |
    # See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics
    lowercaseOutputName: true
    rules:
    # replicated Zookeeper
    - pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+)><>(\\w+)"
      name: "zookeeper_$2"
      type: GAUGE
    - pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+)><>(\\w+)"
      name: "zookeeper_$3"
      type: GAUGE
      labels:
        replicaId: "$2"
    - pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+), name2=(\\w+)><>(Packets\\w+)"
      name: "zookeeper_$4"
      type: COUNTER
      labels:
        replicaId: "$2"
        memberType: "$3"
    - pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+), name2=(\\w+)><>(\\w+)"
      name: "zookeeper_$4"
      type: GAUGE
      labels:
        replicaId: "$2"
        memberType: "$3"
    - pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+), name2=(\\w+), name3=(\\w+)><>(\\w+)"
      name: "zookeeper_$4_$5"
      type: GAUGE
      labels:
        replicaId: "$2"
        memberType: "$3"
|
||||
40
packages/apps/kafka/templates/podscrape.yaml
Normal file
40
packages/apps/kafka/templates/podscrape.yaml
Normal file
@@ -0,0 +1,40 @@
|
||||
# VMPodScrape selecting pods labelled with this release's
# app.kubernetes.io/instance value and scraping their tcp-prometheus port
# over plain HTTP. The relabel rules copy strimzi_io_* pod labels onto the
# series and attach namespace, pod name, node name and host IP labels.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: "{{ .Release.Name }}-pod-scrape"
spec:
  selector:
    matchLabels:
      app.kubernetes.io/instance: "{{ .Release.Name }}"
  podMetricsEndpoints:
    - port: "tcp-prometheus"
      scheme: "http"
      relabelConfigs:
        # Promote every strimzi_io_* pod label to a metric label.
        - action: "labelmap"
          regex: "__meta_kubernetes_pod_label_(strimzi_io_.+)"
          replacement: "$1"
          separator: ";"
        - action: "replace"
          sourceLabels:
            - "__meta_kubernetes_namespace"
          regex: "(.*)"
          replacement: "$1"
          separator: ";"
          targetLabel: "namespace"
        - action: "replace"
          sourceLabels:
            - "__meta_kubernetes_pod_name"
          regex: "(.*)"
          replacement: "$1"
          separator: ";"
          targetLabel: "kubernetes_pod_name"
        - action: "replace"
          sourceLabels:
            - "__meta_kubernetes_pod_node_name"
          regex: "(.*)"
          replacement: "$1"
          separator: ";"
          targetLabel: "node_name"
        - action: "replace"
          sourceLabels:
            - "__meta_kubernetes_pod_host_ip"
          regex: "(.*)"
          replacement: "$1"
          separator: ";"
          targetLabel: "node_ip"
|
||||
@@ -23,7 +23,8 @@ kafka 0.2.1 3ac17018
|
||||
kafka 0.2.2 d0758692
|
||||
kafka 0.2.3 5ca8823
|
||||
kafka 0.3.0 c07c4bbd
|
||||
kafka 0.3.1 HEAD
|
||||
kafka 0.3.1 b7375f73
|
||||
kafka 0.3.2 HEAD
|
||||
kubernetes 0.1.0 f642698
|
||||
kubernetes 0.2.0 7cd7de73
|
||||
kubernetes 0.3.0 7caccec1
|
||||
|
||||
@@ -34,3 +34,4 @@ control-plane/kube-etcd
|
||||
kubevirt/kubevirt-control-plane
|
||||
flux/flux-control-plane
|
||||
flux/flux-stats
|
||||
kafka/strimzi-kafka
|
||||
|
||||
275
packages/system/kafka-operator/templates/prometheus-rules.yaml
Normal file
275
packages/system/kafka-operator/templates/prometheus-rules.yaml
Normal file
@@ -0,0 +1,275 @@
|
||||
# Alerting rules for Strimzi-managed Kafka, ZooKeeper, Entity Operator,
# Connect, Bridge, Mirror Maker, Kafka Exporter and certificates.
#
# This file is rendered by Helm before it reaches the cluster, so the
# Prometheus annotation placeholders (the value and labels variables) are
# wrapped in Go raw-string actions. Helm then prints them literally instead
# of trying to evaluate them as its own, undefined, template variables.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    role: alert-rules
    app: strimzi
  name: prometheus-kafka-rules
spec:
  groups:
    - name: kafka
      rules:
        - alert: KafkaRunningOutOfSpace
          expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} * 100 / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} < 15
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka is running out of free disk space'
            description: 'There are only {{ `{{ $value }}` }} percent available at {{ `{{ $labels.persistentvolumeclaim }}` }} PVC'
        - alert: UnderReplicatedPartitions
          expr: kafka_server_replicamanager_underreplicatedpartitions > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka under replicated partitions'
            description: 'There are {{ `{{ $value }}` }} under replicated partitions on {{ `{{ $labels.kubernetes_pod_name }}` }}'
        - alert: AbnormalControllerState
          expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka abnormal controller state'
            description: 'There are {{ `{{ $value }}` }} active controllers in the cluster'
        - alert: OfflinePartitions
          expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka offline partitions'
            description: 'One or more partitions have no leader'
        - alert: UnderMinIsrPartitionCount
          expr: kafka_server_replicamanager_underminisrpartitioncount > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka under min ISR partitions'
            description: 'There are {{ `{{ $value }}` }} partitions under the min ISR on {{ `{{ $labels.kubernetes_pod_name }}` }}'
        - alert: OfflineLogDirectoryCount
          expr: kafka_log_logmanager_offlinelogdirectorycount > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka offline log directories'
            description: 'There are {{ `{{ $value }}` }} offline log directories on {{ `{{ $labels.kubernetes_pod_name }}` }}'
        - alert: ScrapeProblem
          expr: up{kubernetes_namespace!~"openshift-.+",kubernetes_pod_name=~".+-kafka-[0-9]+"} == 0
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'Prometheus unable to scrape metrics from {{ `{{ $labels.kubernetes_pod_name }}` }}/{{ `{{ $labels.instance }}` }}'
            description: 'Prometheus was unable to scrape metrics from {{ `{{ $labels.kubernetes_pod_name }}` }}/{{ `{{ $labels.instance }}` }} for more than 3 minutes'
        - alert: ClusterOperatorContainerDown
          expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"})
          for: 1m
          labels:
            severity: major
          annotations:
            summary: 'Cluster Operator down'
            description: 'The Cluster Operator has been down for longer than 90 seconds'
        - alert: KafkaBrokerContainersDown
          expr: absent(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'All `kafka` containers down or in CrashLoopBackOff status'
            description: 'All `kafka` containers have been down or in CrashLoopBackOff status for 3 minutes'
        - alert: KafkaContainerRestartedInTheLast5Minutes
          expr: count(count_over_time(container_last_seen{container="kafka"}[5m])) > 2 * count(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"})
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'One or more Kafka containers restarted too often'
            description: 'One or more Kafka containers were restarted too often within the last 5 minutes'
    - name: zookeeper
      rules:
        - alert: AvgRequestLatency
          expr: zookeeper_avgrequestlatency > 10
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Zookeeper average request latency'
            description: 'The average request latency is {{ `{{ $value }}` }} on {{ `{{ $labels.kubernetes_pod_name }}` }}'
        - alert: OutstandingRequests
          expr: zookeeper_outstandingrequests > 10
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Zookeeper outstanding requests'
            description: 'There are {{ `{{ $value }}` }} outstanding requests on {{ `{{ $labels.kubernetes_pod_name }}` }}'
        - alert: ZookeeperRunningOutOfSpace
          expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data-(.+)-zookeeper-[0-9]+"} < 5368709120
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Zookeeper is running out of free disk space'
            description: 'There are only {{ `{{ $value }}` }} bytes available at {{ `{{ $labels.persistentvolumeclaim }}` }} PVC'
        - alert: ZookeeperContainerRestartedInTheLast5Minutes
          expr: count(count_over_time(container_last_seen{container="zookeeper"}[5m])) > 2 * count(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"})
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'One or more Zookeeper containers were restarted too often'
            description: 'One or more Zookeeper containers were restarted too often within the last 5 minutes. This alert can be ignored when the Zookeeper cluster is scaling up'
        - alert: ZookeeperContainersDown
          expr: absent(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'All `zookeeper` containers in the Zookeeper pods down or in CrashLoopBackOff status'
            description: 'All `zookeeper` containers in the Zookeeper pods have been down or in CrashLoopBackOff status for 3 minutes'
    - name: entityOperator
      rules:
        - alert: TopicOperatorContainerDown
          expr: absent(container_last_seen{container="topic-operator",pod=~".+-entity-operator-.+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'Container topic-operator in Entity Operator pod down or in CrashLoopBackOff status'
            description: 'Container topic-operator in Entity Operator pod has been down or in CrashLoopBackOff status for 3 minutes'
        - alert: UserOperatorContainerDown
          expr: absent(container_last_seen{container="user-operator",pod=~".+-entity-operator-.+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'Container user-operator in Entity Operator pod down or in CrashLoopBackOff status'
            description: 'Container user-operator in Entity Operator pod has been down or in CrashLoopBackOff status for 3 minutes'
    - name: connect
      rules:
        - alert: ConnectContainersDown
          expr: absent(container_last_seen{container=~".+-connect",pod=~".+-connect-.+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'All Kafka Connect containers down or in CrashLoopBackOff status'
            description: 'All Kafka Connect containers have been down or in CrashLoopBackOff status for 3 minutes'
        - alert: ConnectFailedConnector
          expr: sum(kafka_connect_connector_status{status="failed"}) > 0
          for: 5m
          labels:
            severity: major
          annotations:
            summary: 'Kafka Connect Connector Failure'
            description: 'One or more connectors have been in failed state for 5 minutes.'
        - alert: ConnectFailedTask
          expr: sum(kafka_connect_worker_connector_failed_task_count) > 0
          for: 5m
          labels:
            severity: major
          annotations:
            summary: 'Kafka Connect Task Failure'
            description: 'One or more tasks have been in failed state for 5 minutes.'
    - name: bridge
      rules:
        - alert: BridgeContainersDown
          expr: absent(container_last_seen{container=~".+-bridge",pod=~".+-bridge-.+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'All Kafka Bridge containers down or in CrashLoopBackOff status'
            description: 'All Kafka Bridge containers have been down or in CrashLoopBackOff status for 3 minutes'
        - alert: AvgProducerLatency
          expr: strimzi_bridge_kafka_producer_request_latency_avg > 10
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka Bridge producer average request latency'
            description: 'The average producer request latency is {{ `{{ $value }}` }} on {{ `{{ $labels.clientId }}` }}'
        - alert: AvgConsumerFetchLatency
          expr: strimzi_bridge_kafka_consumer_fetch_latency_avg > 500
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka Bridge consumer average fetch latency'
            description: 'The average consumer fetch latency is {{ `{{ $value }}` }} on {{ `{{ $labels.clientId }}` }}'
        - alert: AvgConsumerCommitLatency
          expr: strimzi_bridge_kafka_consumer_commit_latency_avg > 200
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Kafka Bridge consumer average commit latency'
            description: 'The average consumer commit latency is {{ `{{ $value }}` }} on {{ `{{ $labels.clientId }}` }}'
        - alert: Http4xxErrorRate
          expr: strimzi_bridge_http_server_requestCount_total{code=~"^4..$", container=~"^.+-bridge", path !="/favicon.ico"} > 10
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: 'Kafka Bridge returns code 4xx too often'
            description: 'Kafka Bridge returns code 4xx too much ({{ `{{ $value }}` }}) for the path {{ `{{ $labels.path }}` }}'
        - alert: Http5xxErrorRate
          expr: strimzi_bridge_http_server_requestCount_total{code=~"^5..$", container=~"^.+-bridge"} > 10
          for: 1m
          labels:
            severity: warning
          annotations:
            summary: 'Kafka Bridge returns code 5xx too often'
            description: 'Kafka Bridge returns code 5xx too much ({{ `{{ $value }}` }}) for the path {{ `{{ $labels.path }}` }}'
    - name: mirrorMaker
      rules:
        - alert: MirrorMakerContainerDown
          expr: absent(container_last_seen{container=~".+-mirror-maker",pod=~".+-mirror-maker-.+"})
          for: 3m
          labels:
            severity: major
          annotations:
            summary: 'All Kafka Mirror Maker containers down or in CrashLoopBackOff status'
            description: 'All Kafka Mirror Maker containers have been down or in CrashLoopBackOff status for 3 minutes'
    - name: kafkaExporter
      rules:
        - alert: UnderReplicatedPartition
          expr: kafka_topic_partition_under_replicated_partition > 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Topic has under-replicated partitions'
            description: 'Topic {{ `{{ $labels.topic }}` }} has {{ `{{ $value }}` }} under-replicated partition {{ `{{ $labels.partition }}` }}'
        - alert: TooLargeConsumerGroupLag
          expr: kafka_consumergroup_lag > 1000
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'Consumer group lag is too big'
            description: 'Consumer group {{ `{{ $labels.consumergroup }}` }} lag is too big ({{ `{{ $value }}` }}) on topic {{ `{{ $labels.topic }}` }}/partition {{ `{{ $labels.partition }}` }}'
        - alert: NoMessageForTooLong
          expr: changes(kafka_topic_partition_current_offset[10m]) == 0
          for: 10s
          labels:
            severity: warning
          annotations:
            summary: 'No message for 10 minutes'
            description: 'There are no messages in topic {{ `{{ $labels.topic }}` }}/partition {{ `{{ $labels.partition }}` }} for 10 minutes'
    - name: certificates
      interval: 1m0s
      rules:
        - alert: CertificateExpiration
          # The metric is in milliseconds; the threshold is 30 days in seconds.
          expr: |
            strimzi_certificate_expiration_timestamp_ms/1000 - time() < 30 * 24 * 60 * 60
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Certificate will expire in less than 30 days'
            description: 'Certificate of type {{ `{{ $labels.type }}` }} in cluster {{ `{{ $labels.cluster }}` }} in namespace {{ `{{ $labels.resource_namespace }}` }} will expire in less than 30 days'
|
||||
Reference in New Issue
Block a user