# Patch summary: unified diff touching two Helm templates.
#
# NOTE(review): the patch text below appears to have had its line breaks and
# indentation collapsed into single spaces. Unified diff and YAML both depend
# on exact line structure and leading whitespace, so regenerate this patch
# from the original commit rather than re-indenting it by hand.
#
# 1. packages/apps/kafka/templates/podscrape.yaml (VMPodScrape)
#    - metadata.name changes from "{{ .Release.Name }}-pod-scrape" to
#      "{{ .Release.Name }}".
#    - Relabeling target labels are renamed:
#        __meta_kubernetes_pod_name      -> "pod"  (was "kubernetes_pod_name")
#        __meta_kubernetes_pod_node_name -> "node" (was "node_name")
#
# 2. packages/system/kafka-operator/templates/prometheus-rules.yaml
#    - Alert annotations that printed {{ $labels.kubernetes_pod_name }} now
#      print {{ $labels.pod }}.
#    - The ScrapeProblem expression selects on pod=~".+-kafka-[0-9]+" instead
#      of kubernetes_pod_name=~".+-kafka-[0-9]+".
#
# NOTE(review): ScrapeProblem still filters on the `kubernetes_namespace`
# label. No relabel rule for that label is visible in this diff -- confirm
# the label still exists after the rename, otherwise the `!~` matcher will
# match every series and the namespace exclusion becomes a no-op.
# NOTE(review): only the hunks shown here were updated; verify that no other
# rule or dashboard still references `kubernetes_pod_name` or `node_name`.
diff --git a/packages/apps/kafka/templates/podscrape.yaml b/packages/apps/kafka/templates/podscrape.yaml index bbbf2916..fcf91553 100644 --- a/packages/apps/kafka/templates/podscrape.yaml +++ b/packages/apps/kafka/templates/podscrape.yaml @@ -1,7 +1,7 @@ apiVersion: operator.victoriametrics.com/v1beta1 kind: VMPodScrape metadata: - name: {{ .Release.Name }}-pod-scrape + name: {{ .Release.Name }} spec: podMetricsEndpoints: - port: tcp-prometheus @@ -20,13 +20,13 @@ spec: - sourceLabels: [__meta_kubernetes_pod_name] separator: ; regex: (.*) - targetLabel: kubernetes_pod_name + targetLabel: pod replacement: $1 action: replace - sourceLabels: [__meta_kubernetes_pod_node_name] separator: ; regex: (.*) - targetLabel: node_name + targetLabel: node replacement: $1 action: replace - sourceLabels: [__meta_kubernetes_pod_host_ip] diff --git a/packages/system/kafka-operator/templates/prometheus-rules.yaml b/packages/system/kafka-operator/templates/prometheus-rules.yaml index bc4d6f46..760ca5da 100644 --- a/packages/system/kafka-operator/templates/prometheus-rules.yaml +++ b/packages/system/kafka-operator/templates/prometheus-rules.yaml @@ -24,7 +24,7 @@ spec: severity: warning annotations: summary: 'Kafka under replicated partitions' - description: 'There are {{ $value }} under replicated partitions on {{ $labels.kubernetes_pod_name }}' + description: 'There are {{ $value }} under replicated partitions on {{ $labels.pod }}' - alert: AbnormalControllerState expr: sum(kafka_controller_kafkacontroller_activecontrollercount) by (strimzi_io_name) != 1 for: 10s @@ -48,7 +48,7 @@ spec: severity: warning annotations: summary: 'Kafka under min ISR partitions' - description: 'There are {{ $value }} partitions under the min ISR on {{ $labels.kubernetes_pod_name }}' + description: 'There are {{ $value }} partitions under the min ISR on {{ $labels.pod }}' - alert: OfflineLogDirectoryCount expr: kafka_log_logmanager_offlinelogdirectorycount > 0 for: 10s @@ -56,15 +56,15 @@ spec: severity: 
warning annotations: summary: 'Kafka offline log directories' - description: 'There are {{ $value }} offline log directories on {{ $labels.kubernetes_pod_name }}' + description: 'There are {{ $value }} offline log directories on {{ $labels.pod }}' - alert: ScrapeProblem - expr: up{kubernetes_namespace!~"openshift-.+",kubernetes_pod_name=~".+-kafka-[0-9]+"} == 0 + expr: up{kubernetes_namespace!~"openshift-.+",pod=~".+-kafka-[0-9]+"} == 0 for: 3m labels: severity: major annotations: - summary: 'Prometheus unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }}' - description: 'Prometheus was unable to scrape metrics from {{ $labels.kubernetes_pod_name }}/{{ $labels.instance }} for more than 3 minutes' + summary: 'Prometheus unable to scrape metrics from {{ $labels.pod }}/{{ $labels.instance }}' + description: 'Prometheus was unable to scrape metrics from {{ $labels.pod }}/{{ $labels.instance }} for more than 3 minutes' - alert: ClusterOperatorContainerDown expr: count((container_last_seen{container="strimzi-cluster-operator"} > (time() - 90))) < 1 or absent(container_last_seen{container="strimzi-cluster-operator"}) for: 1m @@ -98,7 +98,7 @@ spec: severity: warning annotations: summary: 'Zookeeper average request latency' - description: 'The average request latency is {{ $value }} on {{ $labels.kubernetes_pod_name }}' + description: 'The average request latency is {{ $value }} on {{ $labels.pod }}' - alert: OutstandingRequests expr: zookeeper_outstandingrequests > 10 for: 10s @@ -106,7 +106,7 @@ spec: severity: warning annotations: summary: 'Zookeeper outstanding requests' - description: 'There are {{ $value }} outstanding requests on {{ $labels.kubernetes_pod_name }}' + description: 'There are {{ $value }} outstanding requests on {{ $labels.pod }}' - alert: ZookeeperRunningOutOfSpace expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim=~"data-(.+)-zookeeper-[0-9]+"} < 5368709120 for: 10s