add kafka monitoring

Floppy Disk
2025-02-10 13:29:17 +03:00
parent 87328a6ff3
commit b75aaf177b
9 changed files with 3470 additions and 2 deletions

File diff suppressed because it is too large

View File

@@ -80,6 +80,7 @@ modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//main/namespace/
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//main/capacity-planning/capacity-planning.json
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//flux/flux-control-plane.json
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//flux/flux-stats.json
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kafka/strimzi-kafka.json
EOT

View File

@@ -16,7 +16,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.3.1
version: 0.3.2
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

View File

@@ -57,6 +57,12 @@ spec:
class: {{ . }}
{{- end }}
deleteClaim: true
metricsConfig:
type: jmxPrometheusExporter
valueFrom:
configMapKeyRef:
name: {{ .Release.Name }}-metrics
key: kafka-metrics-config.yml
zookeeper:
replicas: {{ .Values.zookeeper.replicas }}
storage:
@@ -68,6 +74,12 @@ spec:
class: {{ . }}
{{- end }}
deleteClaim: false
metricsConfig:
type: jmxPrometheusExporter
valueFrom:
configMapKeyRef:
name: {{ .Release.Name }}-metrics
key: kafka-metrics-config.yml
entityOperator:
topicOperator: {}
userOperator: {}
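With metricsConfig set, Strimzi injects the JMX Prometheus Exporter agent into the Kafka and Zookeeper containers and serves the collected MBean metrics over HTTP on the pod port named tcp-prometheus (9404 in Strimzi's defaults), which is the port the VMPodScrape later in this commit targets. A minimal sketch of what this block renders to, assuming a hypothetical release name my-kafka:

metricsConfig:
  type: jmxPrometheusExporter
  valueFrom:
    configMapKeyRef:
      name: my-kafka-metrics
      key: kafka-metrics-config.yml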

View File

@@ -0,0 +1,198 @@
kind: ConfigMap
apiVersion: v1
metadata:
name: {{ .Release.Name }}-metrics
data:
kafka-metrics-config.yml: |
# See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics
lowercaseOutputName: true
rules:
# Special cases and very specific rules
- pattern: kafka.server<type=(.+), name=(.+), clientId=(.+), topic=(.+), partition=(.*)><>Value
name: kafka_server_$1_$2
type: GAUGE
labels:
clientId: "$3"
topic: "$4"
partition: "$5"
- pattern: kafka.server<type=(.+), name=(.+), clientId=(.+), brokerHost=(.+), brokerPort=(.+)><>Value
name: kafka_server_$1_$2
type: GAUGE
labels:
clientId: "$3"
broker: "$4:$5"
- pattern: kafka.server<type=(.+), cipher=(.+), protocol=(.+), listener=(.+), networkProcessor=(.+)><>connections
name: kafka_server_$1_connections_tls_info
type: GAUGE
labels:
cipher: "$2"
protocol: "$3"
listener: "$4"
networkProcessor: "$5"
- pattern: kafka.server<type=(.+), clientSoftwareName=(.+), clientSoftwareVersion=(.+), listener=(.+), networkProcessor=(.+)><>connections
name: kafka_server_$1_connections_software
type: GAUGE
labels:
clientSoftwareName: "$2"
clientSoftwareVersion: "$3"
listener: "$4"
networkProcessor: "$5"
- pattern: "kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+-total):"
name: kafka_server_$1_$4
type: COUNTER
labels:
listener: "$2"
networkProcessor: "$3"
- pattern: "kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+):"
name: kafka_server_$1_$4
type: GAUGE
labels:
listener: "$2"
networkProcessor: "$3"
- pattern: kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+-total)
name: kafka_server_$1_$4
type: COUNTER
labels:
listener: "$2"
networkProcessor: "$3"
- pattern: kafka.server<type=(.+), listener=(.+), networkProcessor=(.+)><>(.+)
name: kafka_server_$1_$4
type: GAUGE
labels:
listener: "$2"
networkProcessor: "$3"
# Some percent metrics use MeanRate attribute
# Ex) kafka.server<type=(KafkaRequestHandlerPool), name=(RequestHandlerAvgIdlePercent)><>MeanRate
- pattern: kafka.(\w+)<type=(.+), name=(.+)Percent\w*><>MeanRate
name: kafka_$1_$2_$3_percent
type: GAUGE
# Generic gauges for percents
- pattern: kafka.(\w+)<type=(.+), name=(.+)Percent\w*><>Value
name: kafka_$1_$2_$3_percent
type: GAUGE
- pattern: kafka.(\w+)<type=(.+), name=(.+)Percent\w*, (.+)=(.+)><>Value
name: kafka_$1_$2_$3_percent
type: GAUGE
labels:
"$4": "$5"
# Generic per-second counters with 0-2 key/value pairs
- pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*, (.+)=(.+), (.+)=(.+)><>Count
name: kafka_$1_$2_$3_total
type: COUNTER
labels:
"$4": "$5"
"$6": "$7"
- pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*, (.+)=(.+)><>Count
name: kafka_$1_$2_$3_total
type: COUNTER
labels:
"$4": "$5"
- pattern: kafka.(\w+)<type=(.+), name=(.+)PerSec\w*><>Count
name: kafka_$1_$2_$3_total
type: COUNTER
# Generic gauges with 0-2 key/value pairs
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+), (.+)=(.+)><>Value
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
"$6": "$7"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+)><>Value
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
- pattern: kafka.(\w+)<type=(.+), name=(.+)><>Value
name: kafka_$1_$2_$3
type: GAUGE
# Emulate Prometheus 'Summary' metrics for the exported 'Histogram's.
# Note that these are missing the '_sum' metric!
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+), (.+)=(.+)><>Count
name: kafka_$1_$2_$3_count
type: COUNTER
labels:
"$4": "$5"
"$6": "$7"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*), (.+)=(.+)><>(\d+)thPercentile
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
"$6": "$7"
quantile: "0.$8"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+)><>Count
name: kafka_$1_$2_$3_count
type: COUNTER
labels:
"$4": "$5"
- pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*)><>(\d+)thPercentile
name: kafka_$1_$2_$3
type: GAUGE
labels:
"$4": "$5"
quantile: "0.$6"
- pattern: kafka.(\w+)<type=(.+), name=(.+)><>Count
name: kafka_$1_$2_$3_count
type: COUNTER
- pattern: kafka.(\w+)<type=(.+), name=(.+)><>(\d+)thPercentile
name: kafka_$1_$2_$3
type: GAUGE
labels:
quantile: "0.$4"
# KRaft overall related metrics
# distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics
- pattern: "kafka.server<type=raft-metrics><>(.+-total|.+-max):"
name: kafka_server_raftmetrics_$1
type: COUNTER
- pattern: "kafka.server<type=raft-metrics><>(current-state): (.+)"
name: kafka_server_raftmetrics_$1
value: 1
type: UNTYPED
labels:
$1: "$2"
- pattern: "kafka.server<type=raft-metrics><>(.+):"
name: kafka_server_raftmetrics_$1
type: GAUGE
# KRaft "low level" channels related metrics
# distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics
- pattern: "kafka.server<type=raft-channel-metrics><>(.+-total|.+-max):"
name: kafka_server_raftchannelmetrics_$1
type: COUNTER
- pattern: "kafka.server<type=raft-channel-metrics><>(.+):"
name: kafka_server_raftchannelmetrics_$1
type: GAUGE
# Broker metrics related to fetching metadata topic records in KRaft mode
- pattern: "kafka.server<type=broker-metadata-metrics><>(.+):"
name: kafka_server_brokermetadatametrics_$1
type: GAUGE
zookeeper-metrics-config.yml: |
# See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics
lowercaseOutputName: true
rules:
# replicated Zookeeper
- pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+)><>(\\w+)"
name: "zookeeper_$2"
type: GAUGE
- pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+)><>(\\w+)"
name: "zookeeper_$3"
type: GAUGE
labels:
replicaId: "$2"
- pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+), name2=(\\w+)><>(Packets\\w+)"
name: "zookeeper_$4"
type: COUNTER
labels:
replicaId: "$2"
memberType: "$3"
- pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+), name2=(\\w+)><>(\\w+)"
name: "zookeeper_$4"
type: GAUGE
labels:
replicaId: "$2"
memberType: "$3"
- pattern: "org.apache.ZooKeeperService<name0=ReplicatedServer_id(\\d+), name1=replica.(\\d+), name2=(\\w+), name3=(\\w+)><>(\\w+)"
name: "zookeeper_$4_$5"
type: GAUGE
labels:
replicaId: "$2"
memberType: "$3"
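As a worked example of how these rules rewrite MBean names (an illustration, not part of the diff): the broker's under-replicated-partitions gauge falls through the more specific rules above and is caught by the generic name=(.+)><>Value rule; with lowercaseOutputName: true it is exported as the metric the alert rules below query:

# MBean attribute exposed over JMX by each broker:
#   kafka.server<type=ReplicaManager, name=UnderReplicatedPartitions><>Value
# matched by: kafka.(\w+)<type=(.+), name=(.+)><>Value
#   ($1=server, $2=ReplicaManager, $3=UnderReplicatedPartitions)
# exported, after lowercasing, as the gauge:
#   kafka_server_replicamanager_underreplicatedpartitions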

View File

@@ -0,0 +1,40 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
name: {{ .Release.Name }}-pod-scrape
spec:
podMetricsEndpoints:
- port: tcp-prometheus
scheme: http
relabelConfigs:
- separator: ;
regex: __meta_kubernetes_pod_label_(strimzi_io_.+)
replacement: $1
action: labelmap
- sourceLabels: [__meta_kubernetes_namespace]
separator: ;
regex: (.*)
targetLabel: namespace
replacement: $1
action: replace
- sourceLabels: [__meta_kubernetes_pod_name]
separator: ;
regex: (.*)
targetLabel: kubernetes_pod_name
replacement: $1
action: replace
- sourceLabels: [__meta_kubernetes_pod_node_name]
separator: ;
regex: (.*)
targetLabel: node_name
replacement: $1
action: replace
- sourceLabels: [__meta_kubernetes_pod_host_ip]
separator: ;
regex: (.*)
targetLabel: node_ip
replacement: $1
action: replace
selector:
matchLabels:
app.kubernetes.io/instance: {{ .Release.Name }}
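The labelmap rule is what ties this scrape config to the alert rules: Kubernetes presents the pod label strimzi.io/name to the scraper as __meta_kubernetes_pod_label_strimzi_io_name, and labelmap copies it onto every scraped series as strimzi_io_name, which is exactly the label the AbnormalControllerState alert aggregates by. A hypothetical resulting series (all names illustrative):

kafka_controller_kafkacontroller_activecontrollercount{namespace="kafka", kubernetes_pod_name="my-kafka-kafka-0", node_name="worker-1", strimzi_io_name="my-kafka"} 1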

View File

@@ -23,7 +23,8 @@ kafka 0.2.1 3ac17018
kafka 0.2.2 d0758692
kafka 0.2.3 5ca8823
kafka 0.3.0 c07c4bbd
kafka 0.3.1 HEAD
kafka 0.3.1 b7375f73
kafka 0.3.2 HEAD
kubernetes 0.1.0 f642698
kubernetes 0.2.0 7cd7de73
kubernetes 0.3.0 7caccec1

View File

@@ -34,3 +34,4 @@ control-plane/kube-etcd
kubevirt/kubevirt-control-plane
flux/flux-control-plane
flux/flux-stats
kafka/strimzi-kafka

View File

@@ -0,0 +1,275 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
role: alert-rules
app: strimzi
name: prometheus-kafka-rules
spec:
groups:
- name: kafka
rules:
- alert: KafkaRunningOutOfSpace
expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} * 100 / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} < 15
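# free space on the Kafka data PVCs as a percentage of capacity; fires when less than 15% remains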
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka is running out of free disk space'
description: 'There is only {{ $value }} percent of disk space available on the {{ $labels.persistentvolumeclaim }} PVC'
- alert: UnderReplicatedPartitions
expr: kafka_server_replicamanager_underreplicatedpartitions > 0
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka under-replicated partitions'
description: 'There are {{ $value }} under-replicated partitions on {{ $labels.kubernetes_pod_name }}'
- alert: AbnormalControllerState
expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka abnormal controller state'
description: 'There are {{ $value }} active controllers in the cluster'
- alert: OfflinePartitions
expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka offline partitions'
description: 'One or more partitions have no leader'
- alert: UnderMinIsrPartitionCount
expr: kafka_server_replicamanager_underminisrpartitioncount > 0
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka under min ISR partitions'
description: 'There are {{ $value }} partitions under the min ISR on {{ $labels.kubernetes_pod_name }}'
- alert: OfflineLogDirectoryCount
expr: kafka_log_logmanager_offlinelogdirectorycount > 0
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka offline log directories'
description: 'There are {{ $value }} offline log directories on {{ $labels.kubernetes_pod_name }}'
- alert: ScrapeProblem
expr: up{kubernetes_namespace!~"openshift-.+",kubernetes_pod_name=~".+-kafka-[0-9]+"} == 0
for: 3m
labels:
severity: major
annotations:
summary: 'Prometheus unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }}'
description: 'Prometheus was unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }} for more than 3 minutes'
- alert: ClusterOperatorContainerDown
expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"})
for: 1m
labels:
severity: major
annotations:
summary: 'Cluster Operator down'
description: 'The Cluster Operator has been down for longer than 90 seconds'
- alert: KafkaBrokerContainersDown
expr: absent(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All `kafka` containers down or in CrashLoopBackOff status'
description: 'All `kafka` containers have been down or in CrashLoopBackOff status for 3 minutes'
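# absent() returns a series only when nothing matches the selector, so this fires once no kafka container has reported to cAdvisor for 3 minutes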
- alert: KafkaContainerRestartedInTheLast5Minutes
expr: count(count_over_time(container_last_seen{container="kafka"}[5m])) > 2 * count(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"})
for: 5m
labels:
severity: warning
annotations:
summary: 'One or more Kafka containers restarted too often'
description: 'One or more Kafka containers were restarted too often within the last 5 minutes'
- name: zookeeper
rules:
- alert: AvgRequestLatency
expr: zookeeper_avgrequestlatency > 10
for: 10s
labels:
severity: warning
annotations:
summary: 'Zookeeper average request latency'
description: 'The average request latency is {{ $value }} on {{ $labels.kubernetes_pod_name }}'
- alert: OutstandingRequests
expr: zookeeper_outstandingrequests > 10
for: 10s
labels:
severity: warning
annotations:
summary: 'Zookeeper outstanding requests'
description: 'There are {{ $value }} outstanding requests on {{ $labels.kubernetes_pod_name }}'
- alert: ZookeeperRunningOutOfSpace
expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data-(.+)-zookeeper-[0-9]+"} < 5368709120
for: 10s
labels:
severity: warning
annotations:
summary: 'Zookeeper is running out of free disk space'
description: 'There are only {{ $value }} bytes available on the {{ $labels.persistentvolumeclaim }} PVC'
- alert: ZookeeperContainerRestartedInTheLast5Minutes
expr: count(count_over_time(container_last_seen{container="zookeeper"}[5m])) > 2 * count(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"})
for: 5m
labels:
severity: warning
annotations:
summary: 'One or more Zookeeper containers were restarted too often'
description: 'One or more Zookeeper containers were restarted too often within the last 5 minutes. This alert can be ignored when the Zookeeper cluster is scaling up'
- alert: ZookeeperContainersDown
expr: absent(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All `zookeeper` containers in the Zookeeper pods down or in CrashLoopBackOff status'
description: 'All `zookeeper` containers in the Zookeeper pods have been down or in CrashLoopBackOff status for 3 minutes'
- name: entityOperator
rules:
- alert: TopicOperatorContainerDown
expr: absent(container_last_seen{container="topic-operator",pod=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'Container topic-operator in Entity Operator pod down or in CrashLoopBackOff status'
description: 'Container topic-operator in Entity Operator pod has been down or in CrashLoopBackOff status for 3 minutes'
- alert: UserOperatorContainerDown
expr: absent(container_last_seen{container="user-operator",pod=~".+-entity-operator-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'Container user-operator in Entity Operator pod down or in CrashLoopBackOff status'
description: 'Container user-operator in Entity Operator pod has been down or in CrashLoopBackOff status for 3 minutes'
- name: connect
rules:
- alert: ConnectContainersDown
expr: absent(container_last_seen{container=~".+-connect",pod=~".+-connect-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All Kafka Connect containers down or in CrashLoopBackOff status'
description: 'All Kafka Connect containers have been down or in CrashLoopBackOff status for 3 minutes'
- alert: ConnectFailedConnector
expr: sum(kafka_connect_connector_status{status="failed"}) > 0
for: 5m
labels:
severity: major
annotations:
summary: 'Kafka Connect Connector Failure'
description: 'One or more connectors have been in failed state for 5 minutes.'
- alert: ConnectFailedTask
expr: sum(kafka_connect_worker_connector_failed_task_count) > 0
for: 5m
labels:
severity: major
annotations:
summary: 'Kafka Connect Task Failure'
description: 'One or more tasks have been in failed state for 5 minutes.'
- name: bridge
rules:
- alert: BridgeContainersDown
expr: absent(container_last_seen{container=~".+-bridge",pod=~".+-bridge-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All Kafka Bridge containers down or in CrashLoopBackOff status'
description: 'All Kafka Bridge containers have been down or in CrashLoopBackOff status for 3 minutes'
- alert: AvgProducerLatency
expr: strimzi_bridge_kafka_producer_request_latency_avg > 10
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka Bridge producer average request latency'
description: 'The average producer request latency is {{ $value }} on {{ $labels.clientId }}'
- alert: AvgConsumerFetchLatency
expr: strimzi_bridge_kafka_consumer_fetch_latency_avg > 500
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka Bridge consumer average fetch latency'
description: 'The average consumer fetch latency is {{ $value }} on {{ $labels.clientId }}'
- alert: AvgConsumerCommitLatency
expr: strimzi_bridge_kafka_consumer_commit_latency_avg > 200
for: 10s
labels:
severity: warning
annotations:
summary: 'Kafka Bridge consumer average commit latency'
description: 'The average consumer commit latency is {{ $value }} on {{ $labels.clientId }}'
- alert: Http4xxErrorRate
expr: strimzi_bridge_http_server_requestCount_total{code=~"^4..$", container=~"^.+-bridge", path !="/favicon.ico"} > 10
for: 1m
labels:
severity: warning
annotations:
summary: 'Kafka Bridge returns code 4xx too often'
description: 'Kafka Bridge returns code 4xx too often ({{ $value }}) for the path {{ $labels.path }}'
- alert: Http5xxErrorRate
expr: strimzi_bridge_http_server_requestCount_total{code=~"^5..$", container=~"^.+-bridge"} > 10
for: 1m
labels:
severity: warning
annotations:
summary: 'Kafka Bridge returns code 5xx too often'
description: 'Kafka Bridge returns code 5xx too often ({{ $value }}) for the path {{ $labels.path }}'
- name: mirrorMaker
rules:
- alert: MirrorMakerContainerDown
expr: absent(container_last_seen{container=~".+-mirror-maker",pod=~".+-mirror-maker-.+"})
for: 3m
labels:
severity: major
annotations:
summary: 'All Kafka Mirror Maker containers down or in CrashLoopBackOff status'
description: 'All Kafka Mirror Maker containers have been down or in CrashLoopBackOff status for 3 minutes'
- name: kafkaExporter
rules:
- alert: UnderReplicatedPartition
expr: kafka_topic_partition_under_replicated_partition > 0
for: 10s
labels:
severity: warning
annotations:
summary: 'Topic has under-replicated partitions'
description: 'Topic {{ $labels.topic }} partition {{ $labels.partition }} is under-replicated ({{ $value }})'
- alert: TooLargeConsumerGroupLag
expr: kafka_consumergroup_lag > 1000
for: 10s
labels:
severity: warning
annotations:
summary: 'Consumer group lag is too big'
description: 'Consumer group {{ $labels.consumergroup }} lag is too big ({{ $value }}) on topic {{ $labels.topic }}/partition {{ $labels.partition }}'
- alert: NoMessageForTooLong
expr: changes(kafka_topic_partition_current_offset[10m]) == 0
for: 10s
labels:
severity: warning
annotations:
summary: 'No messages for 10 minutes'
description: 'There have been no messages in topic {{ $labels.topic }}/partition {{ $labels.partition }} for 10 minutes'
- name: certificates
interval: 1m0s
rules:
- alert: CertificateExpiration
expr: |
strimzi_certificate_expiration_timestamp_ms/1000 - time() < 30 * 24 * 60 * 60
for: 5m
labels:
severity: warning
annotations:
summary: 'Certificate will expire in less than 30 days'
description: 'Certificate of type {{ $labels.type }} in cluster {{ $labels.cluster }} in namespace {{ $labels.resource_namespace }} will expire in less than 30 days'
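One way to sanity-check the rule expressions before rolling this out (a sketch, assuming yq and promtool are available; neither is part of this commit):

# extract the rule groups from the PrometheusRule manifest and lint them
yq '.spec' prometheus-kafka-rules.yaml > /tmp/kafka-rules.yaml
promtool check rules /tmp/kafka-rules.yaml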