Merge branch 'aenix-io:main' into main

This commit is contained in:
klinch0
2025-02-05 15:38:21 +03:00
committed by GitHub
10 changed files with 3767 additions and 1609 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -21,7 +21,7 @@ fix_d8() {
}
swap_pvc_overview() {
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
}
deprectaed_remove_faq() {
@@ -68,7 +68,7 @@ modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/namespace/
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhost_detail.json
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhosts.json
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/control-plane-status.json
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd3.json #TODO
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd.json #TODO
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/deprecated-resources.json
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/ntp.json #TODO
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/nodes.json
@@ -109,4 +109,3 @@ done <<\EOT
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json
EOT

View File

@@ -1 +1 @@
ghcr.io/aenix-io/cozystack/matchbox:v0.24.1@sha256:26ce2eaae90c82e49e866ae5b18e38d6e3ac1a4b0a3b494ebe2c480a4685f143
ghcr.io/aenix-io/cozystack/matchbox:v0.24.1@sha256:002d540c3a5583bfadbcbd436c2e582e51fc854068a2a7d2dba41072e921ca96

View File

@@ -3,4 +3,4 @@ name: etcd
description: Storage for Kubernetes clusters
icon: /logos/etcd.svg
type: application
version: 2.4.0
version: 2.5.0

View File

@@ -40,6 +40,12 @@ spec:
labels:
cozystack.io/service: etcd
spec:
containers:
- name: etcd
ports:
- name: metrics
containerPort: 2381
protocol: TCP
topologySpreadConstraints:
- maxSkew: 1
topologyKey: "kubernetes.io/hostname"

View File

@@ -0,0 +1,11 @@
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
name: etcd-pod-scrape
spec:
podMetricsEndpoints:
- port: metrics
scheme: http
selector:
matchLabels:
app.kubernetes.io/name: etcd

View File

@@ -0,0 +1,132 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: etcd-rules
spec:
groups:
- name: etcd
rules:
- alert: etcdInsufficientMembers
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': insufficient members '{{`{{ $value }}`}}'."
expr: |
sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': member '{{`{{ $labels.instance }}`}}' has no leader."
expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': instance '{{`{{ $labels.instance }}`}}' has seen '{{`{{ $value }}`}}' leader changes within the last hour."
expr: |
rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
for: 15m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 1
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 5
for: 5m
labels:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': gRPC requests to '{{`{{ $labels.grpc_method }}`}}' are taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': member communication with '{{`{{ $labels.To }}`}}' is taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' proposal failures within the last hour on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.01
for: 10m
labels:
severity: warning
- alert: etcdHighNumberOfFailedHTTPRequests
annotations:
summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
expr: |
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.05
for: 10m
labels:
severity: critical
- alert: etcdHTTPRequestsSlow
annotations:
summary: "etcd instance '{{`{{ $labels.instance }}`}}' HTTP requests to '{{`{{ $labels.method }}`}}' are slow."
expr: |
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
- alert: etcdMembersDown
annotations:
summary: "etcd cluster '{{`{{ $labels.job }}`}}' members are down."
description: 'etcd cluster "{{`{{ $labels.job }}`}}": members are down {{`{{ $value }}`}}.'
expr: |
max without (endpoint) (
sum without (instance, pod) (up{job=~".*etcd.*"} == bool 0)
or
count without (To) (
sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
)
)
> 0
for: 10m
labels:
severity: critical

View File

@@ -30,5 +30,5 @@ main/nodes
control-plane/control-plane-status
control-plane/deprecated-resources
control-plane/dns-coredns
control-plane/kube-etcd3
control-plane/kube-etcd
kubevirt/kubevirt-control-plane

View File

@@ -5,7 +5,8 @@ etcd 2.0.1 6fc1cc7d
etcd 2.1.0 2b00fcf8
etcd 2.2.0 5ca8823
etcd 2.3.0 b908400d
etcd 2.4.0 HEAD
etcd 2.4.0 cb7b8158
etcd 2.5.0 HEAD
ingress 1.0.0 f642698
ingress 1.1.0 838bee5d
ingress 1.2.0 ced8e5b