From b75aaf177b0e02dd92bc588ad08df70c1fc29be9 Mon Sep 17 00:00:00 2001 From: Floppy Disk Date: Mon, 10 Feb 2025 13:29:17 +0300 Subject: [PATCH] add kafka monitoring --- dashboards/kafka/strimzi-kafka.json | 2940 +++++++++++++++++ hack/download-dashboards.sh | 1 + packages/apps/kafka/Chart.yaml | 2 +- packages/apps/kafka/templates/kafka.yaml | 12 + .../kafka/templates/metrics-configmap.yaml | 198 ++ packages/apps/kafka/templates/podscrape.yaml | 40 + packages/apps/versions_map | 3 +- packages/extra/monitoring/dashboards.list | 1 + .../templates/prometheus-rules.yaml | 275 ++ 9 files changed, 3470 insertions(+), 2 deletions(-) create mode 100644 dashboards/kafka/strimzi-kafka.json create mode 100644 packages/apps/kafka/templates/metrics-configmap.yaml create mode 100644 packages/apps/kafka/templates/podscrape.yaml create mode 100644 packages/system/kafka-operator/templates/prometheus-rules.yaml diff --git a/dashboards/kafka/strimzi-kafka.json b/dashboards/kafka/strimzi-kafka.json new file mode 100644 index 00000000..e7d5a5ed --- /dev/null +++ b/dashboards/kafka/strimzi-kafka.json @@ -0,0 +1,2940 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 133, + "links": [], + "panels": [ + { + "datasource": "${DS_PROMETHEUS}", + "description": "Number of brokers online", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 2 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 0 + }, + "id": 46, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "count(kafka_server_replicamanager_leadercount{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Brokers Online", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Number of active controllers in the cluster", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "#e5ac0e", + "value": 2 + }, + { + "color": "#bf1b00" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 36, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_controller_kafkacontroller_activecontrollercount{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Active Controllers", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Unclean leader election rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 2 + }, + { + "color": "#d44a3a" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 38, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_controller_controllerstats_uncleanleaderelections_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Unclean Leader Election Rate", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Replicas that are online", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 0 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 40, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_server_replicamanager_partitioncount{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Online Replicas", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Number of under-replicated partitions (| ISR | < | all replicas |).", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#508642", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 1 + }, + { + "color": "#bf1b00", + "value": 5 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 30, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_server_replicamanager_underreplicatedpartitions{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "Under Replicated Partitions", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Number of partitions which are at their minimum in sync replica count (| ISR | == | min.insync.replicas |)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "#508642", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#508642", + "value": null + }, + { + "color": "#ef843c", + "value": 1 + }, + { + "color": "#bf1b00", + "value": 5 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 102, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_cluster_partition_atminisr{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "Partitions at minimum ISR", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Number of partitions which are under their minimum in sync replica count (| ISR | < | min.insync.replicas |)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "#508642", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#508642", + "value": null + }, + { + "color": "#ef843c", + "value": 1 + }, + { + "color": "#bf1b00", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 18, + "y": 0 + }, + "id": 103, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_cluster_partition_underminisr{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "Partitions under minimum ISR", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Number of partitions that don’t have an active leader and are hence not writable or readable", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#508642", + "value": null + }, + { + "color": "#ef843c", + "value": 1 + }, + { + "color": "#bf1b00", + "value": 1 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 32, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_controller_kafkacontroller_offlinepartitionscount{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Offline Partitions Count", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 4 + }, + "id": 28, + "panels": [], + "title": "Kafka", + "type": "row" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Kafka broker pods memory usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 5 + }, + "id": 82, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(container_memory_usage_bytes{namespace=\"$kubernetes_namespace\",pod=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}) by (pod)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Aggregated Kafka broker pods CPU usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 5 + }, + "id": 81, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$kubernetes_namespace\",pod=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}[5m])) by (pod)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Kafka broker pods disk usage", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 5 + }, + "id": 83, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kubelet_volume_stats_available_bytes{namespace=\"$kubernetes_namespace\",persistentvolumeclaim=~\"data(-[0-9]+)?-$strimzi_cluster_name-$kafka_broker\", persistentvolumeclaim!~\".*zookeeper(-[0-9]+)?$\"}) by (persistentvolumeclaim)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{persistentvolumeclaim}}", + "refId": "A" + } + ], + "title": "Available Disk Space", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Open File Descriptors", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 5 + }, + "id": 107, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(process_open_fds{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",container=\"kafka\"}) by (kubernetes_pod_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "Open File Descriptors", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 12 + }, + "id": 93, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(jvm_memory_used_bytes{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",strimzi_io_name=\"$strimzi_cluster_name-kafka\"}) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "JVM Memory Used", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 12 + }, + "id": 95, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(rate(jvm_gc_collection_seconds_sum{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",strimzi_io_name=\"$strimzi_cluster_name-kafka\"}[5m])) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "JVM GC Time", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 12 + }, + "id": 97, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(rate(jvm_gc_collection_seconds_count{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",strimzi_io_name=\"$strimzi_cluster_name-kafka\"}[5m])) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "JVM GC Count", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "JVM thread count", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 12 + }, + "id": 108, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(jvm_threads_current{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",strimzi_io_name=\"$strimzi_cluster_name-kafka\"}) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "JVM Thread Count", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Total incoming byte rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgba(237, 129, 40, 0.89)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 2 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 19 + }, + "id": 98, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_bytesin_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Total Incoming Byte Rate", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Total outgoing byte rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgba(237, 129, 40, 0.89)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 2 + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 19 + }, + "id": 99, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_bytesout_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Total Outgoing Byte Rate", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Incoming messages rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgba(237, 129, 40, 0.89)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 2 + } + ] + }, + "unit": "wps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 19 + }, + "id": 100, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_messagesin_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Incoming Messages Rate", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Total produce request rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "color": "rgba(237, 129, 40, 0.89)", + "text": "0" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 0 + }, + { + "color": "#299c46", + "value": 2 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 19 + }, + "id": 101, + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_totalproducerequests_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "title": "Total Produce Request Rate", + "type": "stat" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Byte rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_bytesin_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "Total Incoming Byte Rate", + "refId": "A" + }, + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_bytesout_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Total Outgoing Byte Rate", + "refId": "B" + } + ], + "title": "Byte Rate", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 58, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_messagesin_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total Incoming Messages Rate", + "refId": "D" + } + ], + "title": "Messages In Per Second", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Produce request rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 50, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_totalproducerequests_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Total Produce Request Rate", + "refId": "A" + }, + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_failedproducerequests_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Failed Produce Request Rate", + "refId": "B" + } + ], + "title": "Produce Request Rate", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Fetch request rate", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 56, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_brokertopicmetrics_totalfetchrequests_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Fetch Request Rate", + "refId": "A" + }, + { + "datasource": "${DS_PROMETHEUS}", + "expr": " sum(irate(kafka_server_brokertopicmetrics_failedfetchrequests_total{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",topic=~\"$kafka_topic\",topic!=\"\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Failed Fetch Request Rate", + "refId": "B" + } + ], + "title": "Fetch Request Rate", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Average percentage of time network processor is idle", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 60, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_network_socketserver_networkprocessoravgidle_percent{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}*100) by (kubernetes_pod_name)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "Network Processor Avg Idle Percent", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Average percentage of time request handler threads are idle", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 62, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_server_kafkarequesthandlerpool_requesthandleravgidle_percent{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}*100) by (kubernetes_pod_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "Request Handler Avg Idle Percent", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Disk writes", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 47 + }, + "id": 104, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_kafkaserver_linux_disk_write_bytes{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m])) by (kubernetes_pod_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "Disk Writes", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Disk reads", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 47 + }, + "id": 105, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(irate(kafka_server_kafkaserver_linux_disk_read_bytes{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}[5m])) by (kubernetes_pod_name)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "Disk Reads", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "description": "Disk reads", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 47 + }, + "id": 106, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_server_socket_server_metrics_connection_count{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"}) by (kubernetes_pod_name, listener)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{listener}}-{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "Connection Count per Listener", + "type": "timeseries" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": { + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Log Size" + }, + "properties": [ + { + "id": "unit", + "value": "decbytes" + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 55 + }, + "id": 91, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "expr": "kafka_log_log_size{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",topic=~\"$kafka_topic\",partition=~\"$kafka_partition\"}", + "format": "table", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{topic}}:{{partition}}", + "refId": "A" + } + ], + "title": "Log Size", + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": { + "Time": true, + "__name__": true, + "container": true, + "endpoint": true, + "instance": true, + "job": true, + "kubernetes_pod_name": true, + "namespace": true, + "node_ip": true, + "node_name": true, + "pod": true, + "strimzi_io_broker_role": true, + "strimzi_io_cluster": true, + "strimzi_io_component_type": true, + "strimzi_io_controller": true, + "strimzi_io_controller_name": true, + "strimzi_io_controller_role": true, + "strimzi_io_kind": true, + "strimzi_io_name": true, + "strimzi_io_pod_name": true, + "strimzi_io_pool_name": true + }, + "indexByName": { + "Time": 0, + "Value": 23, + "__name__": 1, + "container": 3, + "endpoint": 4, + "instance": 5, + "job": 6, + "kubernetes_pod_name": 7, + "namespace": 8, + "node_ip": 9, + "node_name": 10, + "partition": 11, + "pod": 12, + "strimzi_io_broker_role": 13, + "strimzi_io_cluster": 14, + "strimzi_io_component_type": 15, + "strimzi_io_controller": 16, + "strimzi_io_controller_name": 17, + "strimzi_io_controller_role": 18, + "strimzi_io_kind": 19, + "strimzi_io_name": 20, + "strimzi_io_pod_name": 21, + "strimzi_io_pool_name": 22, + "topic": 2 + }, + "renameByName": { + "Value": "Log Size", + "partition": "Partition", + "strimzi_io_name": "", + "topic": "Topic" + } + } + } + ], + "type": "table" + }, + { + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 55 + }, + "id": 110, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": "${DS_PROMETHEUS}", + "expr": "sum(kafka_server_zookeeperclientmetrics_zookeeperrequestlatencyms{namespace=\"$kubernetes_namespace\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",strimzi_io_name=\"$strimzi_cluster_name-kafka\"}) by (kubernetes_pod_name)", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{kubernetes_pod_name}}", + "refId": "A" + } + ], + "title": "ZK request latecy", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "5s", + "schemaVersion": 40, + "tags": [ + "Strimzi", + "Kafka" + ], + "templating": { + "list": [ + { + "current": { + "text": "vm-shortterm", + "value": "24b0d575-0cea-474c-a4eb-c225d250858d" + }, + "includeAll": false, + "label": "datasource", + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { + "text": "tenant-kafkamonitors", + "value": "tenant-kafkamonitors" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "", + "includeAll": false, + "label": "Namespace", + "name": "kubernetes_namespace", + "options": [], + "query": "query_result(kafka_server_replicamanager_leadercount)", + "refresh": 1, + "regex": "/.*namespace=\"([^\"]*).*/", + "type": "query" + }, + { + "current": { + "text": "kafka-asdf", + "value": "kafka-asdf" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "", + "includeAll": false, + "label": "Cluster Name", + "name": "strimzi_cluster_name", + "options": [], + "query": "query_result(kafka_server_replicamanager_leadercount{namespace=\"$kubernetes_namespace\"})", + "refresh": 1, + "regex": "/.*strimzi_io_cluster=\"([^\"]*).*/", + "type": "query" + }, + { + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "", + "includeAll": true, + "label": "Broker", + "name": "kafka_broker", + "options": [], + "query": "query_result(kafka_server_replicamanager_leadercount{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\"})", + "refresh": 1, + "regex": "/.*pod_name=\"$strimzi_cluster_name-([^\"]*).*/", + "type": "query" + }, + { + "allValue": ".+", + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "", + "includeAll": true, + "label": "Topic", + "name": "kafka_topic", + "options": [], + "query": "query_result(kafka_cluster_partition_replicascount{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\"})", + "refresh": 1, + "regex": "/.*topic=\"([^\"]*).*/", + "type": "query" + }, + { + "allValue": ".*", + "current": { + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${DS_PROMETHEUS}", + "definition": "", + "includeAll": true, + "label": "Partition", + "multi": true, + "name": "kafka_partition", + "options": [], + "query": "query_result(kafka_log_log_size{namespace=\"$kubernetes_namespace\",strimzi_io_cluster=\"$strimzi_cluster_name\",kubernetes_pod_name=~\"$strimzi_cluster_name-$kafka_broker\",topic=~\"$kafka_topic\"})", + "refresh": 1, + "regex": "/.*partition=\"([^\"]*).*/", + "sort": 3, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Strimzi Kafka", + "uid": "66bebec5-5861-4e4a-b4ad-b0634928e946", + "version": 5, + "weekStart": "" +} diff --git a/hack/download-dashboards.sh b/hack/download-dashboards.sh index e3a2ed53..61d2ad5d 100755 --- a/hack/download-dashboards.sh +++ b/hack/download-dashboards.sh @@ -80,6 +80,7 @@ modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//main/namespace/ modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//main/capacity-planning/capacity-planning.json modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//flux/flux-control-plane.json modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//flux/flux-stats.json +modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kafka/strimzi-kafka.json EOT diff --git a/packages/apps/kafka/Chart.yaml b/packages/apps/kafka/Chart.yaml index ef5ef5fa..ec425b02 100644 --- a/packages/apps/kafka/Chart.yaml +++ b/packages/apps/kafka/Chart.yaml @@ -16,7 +16,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.3.1 +version: 0.3.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/packages/apps/kafka/templates/kafka.yaml b/packages/apps/kafka/templates/kafka.yaml index eed36f63..0134ca5a 100644 --- a/packages/apps/kafka/templates/kafka.yaml +++ b/packages/apps/kafka/templates/kafka.yaml @@ -57,6 +57,12 @@ spec: class: {{ . }} {{- end }} deleteClaim: true + metricsConfig: + type: jmxPrometheusExporter + valueFrom: + configMapKeyRef: + name: {{ .Release.Name }}-metrics + key: kafka-metrics-config.yml zookeeper: replicas: {{ .Values.zookeeper.replicas }} storage: @@ -68,6 +74,12 @@ spec: class: {{ . }} {{- end }} deleteClaim: false + metricsConfig: + type: jmxPrometheusExporter + valueFrom: + configMapKeyRef: + name: {{ .Release.Name }}-metrics + key: kafka-metrics-config.yml entityOperator: topicOperator: {} userOperator: {} diff --git a/packages/apps/kafka/templates/metrics-configmap.yaml b/packages/apps/kafka/templates/metrics-configmap.yaml new file mode 100644 index 00000000..da2a15da --- /dev/null +++ b/packages/apps/kafka/templates/metrics-configmap.yaml @@ -0,0 +1,198 @@ +kind: ConfigMap +apiVersion: v1 +metadata: + name: {{ .Release.Name }}-metrics +data: + kafka-metrics-config.yml: | + # See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics + lowercaseOutputName: true + rules: + # Special cases and very specific rules + - pattern: kafka.server<>Value + name: kafka_server_$1_$2 + type: GAUGE + labels: + clientId: "$3" + topic: "$4" + partition: "$5" + - pattern: kafka.server<>Value + name: kafka_server_$1_$2 + type: GAUGE + labels: + clientId: "$3" + broker: "$4:$5" + - pattern: kafka.server<>connections + name: kafka_server_$1_connections_tls_info + type: GAUGE + labels: + cipher: "$2" + protocol: "$3" + listener: "$4" + networkProcessor: "$5" + - pattern: kafka.server<>connections + name: kafka_server_$1_connections_software + type: GAUGE + labels: + clientSoftwareName: "$2" + clientSoftwareVersion: "$3" + listener: "$4" + networkProcessor: "$5" + - pattern: "kafka.server<>(.+-total):" + name: kafka_server_$1_$4 + type: COUNTER + labels: + listener: "$2" + networkProcessor: "$3" + - pattern: "kafka.server<>(.+):" + name: kafka_server_$1_$4 + type: GAUGE + labels: + listener: "$2" + networkProcessor: "$3" + - pattern: kafka.server<>(.+-total) + name: kafka_server_$1_$4 + type: COUNTER + labels: + listener: "$2" + networkProcessor: "$3" + - pattern: kafka.server<>(.+) + name: kafka_server_$1_$4 + type: GAUGE + labels: + listener: "$2" + networkProcessor: "$3" + # Some percent metrics use MeanRate attribute + # Ex) kafka.server<>MeanRate + - pattern: kafka.(\w+)<>MeanRate + name: kafka_$1_$2_$3_percent + type: GAUGE + # Generic gauges for percents + - pattern: kafka.(\w+)<>Value + name: kafka_$1_$2_$3_percent + type: GAUGE + - pattern: kafka.(\w+)<>Value + name: kafka_$1_$2_$3_percent + type: GAUGE + labels: + "$4": "$5" + # Generic per-second counters with 0-2 key/value pairs + - pattern: kafka.(\w+)<>Count + name: kafka_$1_$2_$3_total + type: COUNTER + labels: + "$4": "$5" + "$6": "$7" + - pattern: kafka.(\w+)<>Count + name: kafka_$1_$2_$3_total + type: COUNTER + labels: + "$4": "$5" + - pattern: kafka.(\w+)<>Count + name: kafka_$1_$2_$3_total + type: COUNTER + # Generic gauges with 0-2 key/value pairs + - pattern: kafka.(\w+)<>Value + name: kafka_$1_$2_$3 + type: GAUGE + labels: + "$4": "$5" + "$6": "$7" + - pattern: kafka.(\w+)<>Value + name: kafka_$1_$2_$3 + type: GAUGE + labels: + "$4": "$5" + - pattern: kafka.(\w+)<>Value + name: kafka_$1_$2_$3 + type: GAUGE + # Emulate Prometheus 'Summary' metrics for the exported 'Histogram's. + # Note that these are missing the '_sum' metric! + - pattern: kafka.(\w+)<>Count + name: kafka_$1_$2_$3_count + type: COUNTER + labels: + "$4": "$5" + "$6": "$7" + - pattern: kafka.(\w+)<>(\d+)thPercentile + name: kafka_$1_$2_$3 + type: GAUGE + labels: + "$4": "$5" + "$6": "$7" + quantile: "0.$8" + - pattern: kafka.(\w+)<>Count + name: kafka_$1_$2_$3_count + type: COUNTER + labels: + "$4": "$5" + - pattern: kafka.(\w+)<>(\d+)thPercentile + name: kafka_$1_$2_$3 + type: GAUGE + labels: + "$4": "$5" + quantile: "0.$6" + - pattern: kafka.(\w+)<>Count + name: kafka_$1_$2_$3_count + type: COUNTER + - pattern: kafka.(\w+)<>(\d+)thPercentile + name: kafka_$1_$2_$3 + type: GAUGE + labels: + quantile: "0.$4" + # KRaft overall related metrics + # distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics + - pattern: "kafka.server<>(.+-total|.+-max):" + name: kafka_server_raftmetrics_$1 + type: COUNTER + - pattern: "kafka.server<>(current-state): (.+)" + name: kafka_server_raftmetrics_$1 + value: 1 + type: UNTYPED + labels: + $1: "$2" + - pattern: "kafka.server<>(.+):" + name: kafka_server_raftmetrics_$1 + type: GAUGE + # KRaft "low level" channels related metrics + # distinguish between always increasing COUNTER (total and max) and variable GAUGE (all others) metrics + - pattern: "kafka.server<>(.+-total|.+-max):" + name: kafka_server_raftchannelmetrics_$1 + type: COUNTER + - pattern: "kafka.server<>(.+):" + name: kafka_server_raftchannelmetrics_$1 + type: GAUGE + # Broker metrics related to fetching metadata topic records in KRaft mode + - pattern: "kafka.server<>(.+):" + name: kafka_server_brokermetadatametrics_$1 + type: GAUGE + zookeeper-metrics-config.yml: | + # See https://github.com/prometheus/jmx_exporter for more info about JMX Prometheus Exporter metrics + lowercaseOutputName: true + rules: + # replicated Zookeeper + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$2" + type: GAUGE + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$3" + type: GAUGE + labels: + replicaId: "$2" + - pattern: "org.apache.ZooKeeperService<>(Packets\\w+)" + name: "zookeeper_$4" + type: COUNTER + labels: + replicaId: "$2" + memberType: "$3" + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$4" + type: GAUGE + labels: + replicaId: "$2" + memberType: "$3" + - pattern: "org.apache.ZooKeeperService<>(\\w+)" + name: "zookeeper_$4_$5" + type: GAUGE + labels: + replicaId: "$2" + memberType: "$3" diff --git a/packages/apps/kafka/templates/podscrape.yaml b/packages/apps/kafka/templates/podscrape.yaml new file mode 100644 index 00000000..bbbf2916 --- /dev/null +++ b/packages/apps/kafka/templates/podscrape.yaml @@ -0,0 +1,40 @@ +apiVersion: operator.victoriametrics.com/v1beta1 +kind: VMPodScrape +metadata: + name: {{ .Release.Name }}-pod-scrape +spec: + podMetricsEndpoints: + - port: tcp-prometheus + scheme: http + relabelConfigs: + - separator: ; + regex: __meta_kubernetes_pod_label_(strimzi_io_.+) + replacement: $1 + action: labelmap + - sourceLabels: [__meta_kubernetes_namespace] + separator: ; + regex: (.*) + targetLabel: namespace + replacement: $1 + action: replace + - sourceLabels: [__meta_kubernetes_pod_name] + separator: ; + regex: (.*) + targetLabel: kubernetes_pod_name + replacement: $1 + action: replace + - sourceLabels: [__meta_kubernetes_pod_node_name] + separator: ; + regex: (.*) + targetLabel: node_name + replacement: $1 + action: replace + - sourceLabels: [__meta_kubernetes_pod_host_ip] + separator: ; + regex: (.*) + targetLabel: node_ip + replacement: $1 + action: replace + selector: + matchLabels: + app.kubernetes.io/instance: {{ .Release.Name }} diff --git a/packages/apps/versions_map b/packages/apps/versions_map index e9e597cc..85c0ded7 100644 --- a/packages/apps/versions_map +++ b/packages/apps/versions_map @@ -23,7 +23,8 @@ kafka 0.2.1 3ac17018 kafka 0.2.2 d0758692 kafka 0.2.3 5ca8823 kafka 0.3.0 c07c4bbd -kafka 0.3.1 HEAD +kafka 0.3.1 b7375f73 +kafka 0.3.2 HEAD kubernetes 0.1.0 f642698 kubernetes 0.2.0 7cd7de73 kubernetes 0.3.0 7caccec1 diff --git a/packages/extra/monitoring/dashboards.list b/packages/extra/monitoring/dashboards.list index 8c1c8ab5..0de7bfd6 100644 --- a/packages/extra/monitoring/dashboards.list +++ b/packages/extra/monitoring/dashboards.list @@ -34,3 +34,4 @@ control-plane/kube-etcd kubevirt/kubevirt-control-plane flux/flux-control-plane flux/flux-stats +kafka/strimzi-kafka diff --git a/packages/system/kafka-operator/templates/prometheus-rules.yaml b/packages/system/kafka-operator/templates/prometheus-rules.yaml new file mode 100644 index 00000000..bc4d6f46 --- /dev/null +++ b/packages/system/kafka-operator/templates/prometheus-rules.yaml @@ -0,0 +1,275 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + role: alert-rules + app: strimzi + name: prometheus-kafka-rules +spec: + groups: + - name: kafka + rules: + - alert: KafkaRunningOutOfSpace + expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} * 100 / kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~"data(-[0-9]+)?-(.+)-kafka-[0-9]+"} < 15 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka is running out of free disk space' + description: 'There are only {{ $value }} percent available at {{ $labels.persistentvolumeclaim }} PVC' + - alert: UnderReplicatedPartitions + expr: kafka_server_replicamanager_underreplicatedpartitions > 0 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka under replicated partitions' + description: 'There are {{ $value }} under replicated partitions on {{ $labels.kubernetes_pod_name }}' + - alert: AbnormalControllerState + expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka abnormal controller state' + description: 'There are {{ $value }} active controllers in the cluster' + - alert: OfflinePartitions + expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount) > 0 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka offline partitions' + description: 'One or more partitions have no leader' + - alert: UnderMinIsrPartitionCount + expr: kafka_server_replicamanager_underminisrpartitioncount > 0 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka under min ISR partitions' + description: 'There are {{ $value }} partitions under the min ISR on {{ $labels.kubernetes_pod_name }}' + - alert: OfflineLogDirectoryCount + expr: kafka_log_logmanager_offlinelogdirectorycount > 0 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka offline log directories' + description: 'There are {{ $value }} offline log directories on {{ $labels.kubernetes_pod_name }}' + - alert: ScrapeProblem + expr: up{kubernetes_namespace!~"openshift-.+",kubernetes_pod_name=~".+-kafka-[0-9]+"} == 0 + for: 3m + labels: + severity: major + annotations: + summary: 'Prometheus unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }}' + description: 'Prometheus was unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }} for more than 3 minutes' + - alert: ClusterOperatorContainerDown + expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"}) + for: 1m + labels: + severity: major + annotations: + summary: 'Cluster Operator down' + description: 'The Cluster Operator has been down for longer than 90 seconds' + - alert: KafkaBrokerContainersDown + expr: absent(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"}) + for: 3m + labels: + severity: major + annotations: + summary: 'All `kafka` containers down or in CrashLookBackOff status' + description: 'All `kafka` containers have been down or in CrashLookBackOff status for 3 minutes' + - alert: KafkaContainerRestartedInTheLast5Minutes + expr: count(count_over_time(container_last_seen{container="kafka"}[5m])) > 2 * count(container_last_seen{container="kafka",pod=~".+-kafka-[0-9]+"}) + for: 5m + labels: + severity: warning + annotations: + summary: 'One or more Kafka containers restarted too often' + description: 'One or more Kafka containers were restarted too often within the last 5 minutes' + - name: zookeeper + rules: + - alert: AvgRequestLatency + expr: zookeeper_avgrequestlatency > 10 + for: 10s + labels: + severity: warning + annotations: + summary: 'Zookeeper average request latency' + description: 'The average request latency is {{ $value }} on {{ $labels.kubernetes_pod_name }}' + - alert: OutstandingRequests + expr: zookeeper_outstandingrequests > 10 + for: 10s + labels: + severity: warning + annotations: + summary: 'Zookeeper outstanding requests' + description: 'There are {{ $value }} outstanding requests on {{ $labels.kubernetes_pod_name }}' + - alert: ZookeeperRunningOutOfSpace + expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data-(.+)-zookeeper-[0-9]+"} < 5368709120 + for: 10s + labels: + severity: warning + annotations: + summary: 'Zookeeper is running out of free disk space' + description: 'There are only {{ $value }} bytes available at {{ $labels.persistentvolumeclaim }} PVC' + - alert: ZookeeperContainerRestartedInTheLast5Minutes + expr: count(count_over_time(container_last_seen{container="zookeeper"}[5m])) > 2 * count(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"}) + for: 5m + labels: + severity: warning + annotations: + summary: 'One or more Zookeeper containers were restarted too often' + description: 'One or more Zookeeper containers were restarted too often within the last 5 minutes. This alert can be ignored when the Zookeeper cluster is scaling up' + - alert: ZookeeperContainersDown + expr: absent(container_last_seen{container="zookeeper",pod=~".+-zookeeper-[0-9]+"}) + for: 3m + labels: + severity: major + annotations: + summary: 'All `zookeeper` containers in the Zookeeper pods down or in CrashLookBackOff status' + description: 'All `zookeeper` containers in the Zookeeper pods have been down or in CrashLookBackOff status for 3 minutes' + - name: entityOperator + rules: + - alert: TopicOperatorContainerDown + expr: absent(container_last_seen{container="topic-operator",pod=~".+-entity-operator-.+"}) + for: 3m + labels: + severity: major + annotations: + summary: 'Container topic-operator in Entity Operator pod down or in CrashLookBackOff status' + description: 'Container topic-operator in Entity Operator pod has been or in CrashLookBackOff status for 3 minutes' + - alert: UserOperatorContainerDown + expr: absent(container_last_seen{container="user-operator",pod=~".+-entity-operator-.+"}) + for: 3m + labels: + severity: major + annotations: + summary: 'Container user-operator in Entity Operator pod down or in CrashLookBackOff status' + description: 'Container user-operator in Entity Operator pod have been down or in CrashLookBackOff status for 3 minutes' + - name: connect + rules: + - alert: ConnectContainersDown + expr: absent(container_last_seen{container=~".+-connect",pod=~".+-connect-.+"}) + for: 3m + labels: + severity: major + annotations: + summary: 'All Kafka Connect containers down or in CrashLookBackOff status' + description: 'All Kafka Connect containers have been down or in CrashLookBackOff status for 3 minutes' + - alert: ConnectFailedConnector + expr: sum(kafka_connect_connector_status{status="failed"}) > 0 + for: 5m + labels: + severity: major + annotations: + summary: 'Kafka Connect Connector Failure' + description: 'One or more connectors have been in failed state for 5 minutes,' + - alert: ConnectFailedTask + expr: sum(kafka_connect_worker_connector_failed_task_count) > 0 + for: 5m + labels: + severity: major + annotations: + summary: 'Kafka Connect Task Failure' + description: 'One or more tasks have been in failed state for 5 minutes.' + - name: bridge + rules: + - alert: BridgeContainersDown + expr: absent(container_last_seen{container=~".+-bridge",pod=~".+-bridge-.+"}) + for: 3m + labels: + severity: major + annotations: + summary: 'All Kafka Bridge containers down or in CrashLookBackOff status' + description: 'All Kafka Bridge containers have been down or in CrashLookBackOff status for 3 minutes' + - alert: AvgProducerLatency + expr: strimzi_bridge_kafka_producer_request_latency_avg > 10 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka Bridge producer average request latency' + description: 'The average producer request latency is {{ $value }} on {{ $labels.clientId }}' + - alert: AvgConsumerFetchLatency + expr: strimzi_bridge_kafka_consumer_fetch_latency_avg > 500 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka Bridge consumer average fetch latency' + description: 'The average consumer fetch latency is {{ $value }} on {{ $labels.clientId }}' + - alert: AvgConsumerCommitLatency + expr: strimzi_bridge_kafka_consumer_commit_latency_avg > 200 + for: 10s + labels: + severity: warning + annotations: + summary: 'Kafka Bridge consumer average commit latency' + description: 'The average consumer commit latency is {{ $value }} on {{ $labels.clientId }}' + - alert: Http4xxErrorRate + expr: strimzi_bridge_http_server_requestCount_total{code=~"^4..$", container=~"^.+-bridge", path !="/favicon.ico"} > 10 + for: 1m + labels: + severity: warning + annotations: + summary: 'Kafka Bridge returns code 4xx too often' + description: 'Kafka Bridge returns code 4xx too much ({{ $value }}) for the path {{ $labels.path }}' + - alert: Http5xxErrorRate + expr: strimzi_bridge_http_server_requestCount_total{code=~"^5..$", container=~"^.+-bridge"} > 10 + for: 1m + labels: + severity: warning + annotations: + summary: 'Kafka Bridge returns code 5xx too often' + description: 'Kafka Bridge returns code 5xx too much ({{ $value }}) for the path {{ $labels.path }}' + - name: mirrorMaker + rules: + - alert: MirrorMakerContainerDown + expr: absent(container_last_seen{container=~".+-mirror-maker",pod=~".+-mirror-maker-.+"}) + for: 3m + labels: + severity: major + annotations: + summary: 'All Kafka Mirror Maker containers down or in CrashLookBackOff status' + description: 'All Kafka Mirror Maker containers have been down or in CrashLookBackOff status for 3 minutes' + - name: kafkaExporter + rules: + - alert: UnderReplicatedPartition + expr: kafka_topic_partition_under_replicated_partition > 0 + for: 10s + labels: + severity: warning + annotations: + summary: 'Topic has under-replicated partitions' + description: 'Topic {{ $labels.topic }} has {{ $value }} under-replicated partition {{ $labels.partition }}' + - alert: TooLargeConsumerGroupLag + expr: kafka_consumergroup_lag > 1000 + for: 10s + labels: + severity: warning + annotations: + summary: 'Consumer group lag is too big' + description: 'Consumer group {{ $labels.consumergroup}} lag is too big ({{ $value }}) on topic {{ $labels.topic }}/partition {{ $labels.partition }}' + - alert: NoMessageForTooLong + expr: changes(kafka_topic_partition_current_offset[10m]) == 0 + for: 10s + labels: + severity: warning + annotations: + summary: 'No message for 10 minutes' + description: 'There is no messages in topic {{ $labels.topic}}/partition {{ $labels.partition }} for 10 minutes' + - name: certificates + interval: 1m0s + rules: + - alert: CertificateExpiration + expr: | + strimzi_certificate_expiration_timestamp_ms/1000 - time() < 30 * 24 * 60 * 60 + for: 5m + labels: + severity: warning + annotations: + summary: 'Certificate will expire in less than 30 days' + description: 'Certificate of type {{ $labels.type }} in cluster {{ $labels.cluster }} in namespace {{ $labels.resource_namespace }} will expire in less than 30 days'