mirror of
https://github.com/outbackdingo/cozystack.git
synced 2026-01-27 18:18:41 +00:00
Merge branch 'aenix-io:main' into main
This commit is contained in:
3611
dashboards/control-plane/kube-etcd.json
Normal file
3611
dashboards/control-plane/kube-etcd.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -21,7 +21,7 @@ fix_d8() {
|
||||
}
|
||||
|
||||
swap_pvc_overview() {
|
||||
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
|
||||
jq '(.panels[] | select(.title=="PVC Detailed") | .panels[] | select(.title=="Overview")) as $a | del(.panels[] | select(.title=="PVC Detailed").panels[] | select(.title=="Overview")) | ( (.panels[] | select(.title=="PVC Detailed"))) as $b | del( .panels[] | select(.title=="PVC Detailed")) | (.panels[.panels|length]=($a|.gridPos.y=$b.gridPos.y)) | (.panels[.panels|length]=($b|.gridPos.y=$a.gridPos.y))'
|
||||
}
|
||||
|
||||
deprectaed_remove_faq() {
|
||||
@@ -68,7 +68,7 @@ modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/namespace/
|
||||
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhost_detail.json
|
||||
modules/402-ingress-nginx/monitoring/grafana-dashboards/ingress-nginx/vhost/vhosts.json
|
||||
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/control-plane-status.json
|
||||
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd3.json #TODO
|
||||
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/kube-etcd.json #TODO
|
||||
modules/340-monitoring-kubernetes-control-plane/monitoring/grafana-dashboards/kubernetes-cluster/deprecated-resources.json
|
||||
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/ntp.json #TODO
|
||||
modules/340-monitoring-kubernetes/monitoring/grafana-dashboards//kubernetes-cluster/nodes/nodes.json
|
||||
@@ -109,4 +109,3 @@ done <<\EOT
|
||||
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-namespaces.json
|
||||
https://raw.githubusercontent.com/dotdc/grafana-dashboards-kubernetes/master/dashboards/k8s-views-pods.json
|
||||
EOT
|
||||
|
||||
|
||||
@@ -1 +1 @@
|
||||
ghcr.io/aenix-io/cozystack/matchbox:v0.24.1@sha256:26ce2eaae90c82e49e866ae5b18e38d6e3ac1a4b0a3b494ebe2c480a4685f143
|
||||
ghcr.io/aenix-io/cozystack/matchbox:v0.24.1@sha256:002d540c3a5583bfadbcbd436c2e582e51fc854068a2a7d2dba41072e921ca96
|
||||
@@ -3,4 +3,4 @@ name: etcd
|
||||
description: Storage for Kubernetes clusters
|
||||
icon: /logos/etcd.svg
|
||||
type: application
|
||||
version: 2.4.0
|
||||
version: 2.5.0
|
||||
|
||||
@@ -40,6 +40,12 @@ spec:
|
||||
labels:
|
||||
cozystack.io/service: etcd
|
||||
spec:
|
||||
containers:
|
||||
- name: etcd
|
||||
ports:
|
||||
- name: metrics
|
||||
containerPort: 2381
|
||||
protocol: TCP
|
||||
topologySpreadConstraints:
|
||||
- maxSkew: 1
|
||||
topologyKey: "kubernetes.io/hostname"
|
||||
|
||||
11
packages/extra/etcd/templates/podscrape.yaml
Normal file
11
packages/extra/etcd/templates/podscrape.yaml
Normal file
@@ -0,0 +1,11 @@
|
||||
apiVersion: operator.victoriametrics.com/v1beta1
|
||||
kind: VMPodScrape
|
||||
metadata:
|
||||
name: etcd-pod-scrape
|
||||
spec:
|
||||
podMetricsEndpoints:
|
||||
- port: metrics
|
||||
scheme: http
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: etcd
|
||||
132
packages/extra/etcd/templates/prometheus-rules.yaml
Normal file
132
packages/extra/etcd/templates/prometheus-rules.yaml
Normal file
@@ -0,0 +1,132 @@
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: etcd-rules
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': insufficient members '{{`{{ $value }}`}}'."
|
||||
expr: |
|
||||
sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': member '{{`{{ $labels.instance }}`}}' has no leader."
|
||||
expr: |
|
||||
etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': instance '{{`{{ $labels.instance }}`}}' has seen '{{`{{ $value }}`}}' leader changes within the last hour."
|
||||
expr: |
|
||||
rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' of requests for '{{`{{ $labels.grpc_method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
|
||||
expr: |
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': gRPC requests to '{{`{{ $labels.grpc_method }}`}}' are taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."
|
||||
expr: |
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': member communication with '{{`{{ $labels.To }}`}}' is taking '{{`{{ $value }}`}}' on etcd instance '{{`{{ $labels.instance }}`}}'."
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}': '{{`{{ $value }}`}}' proposal failures within the last hour on etcd instance '{{`{{ $labels.instance }}`}}'."
|
||||
expr: |
|
||||
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
summary: "'{{`{{ $value }}`}}' of requests for '{{`{{ $labels.method }}`}}' failed on etcd instance '{{`{{ $labels.instance }}`}}'."
|
||||
expr: |
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) BY (method) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
- alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
summary: "etcd instance '{{`{{ $labels.instance }}`}}' HTTP requests to '{{`{{ $labels.method }}`}}' are slow."
|
||||
expr: |
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
summary: "etcd cluster '{{`{{ $labels.job }}`}}' members are down."
|
||||
description: 'etcd cluster "{{`{{ $labels.job }}`}}": members are down {{`{{ $value }}`}}.'
|
||||
expr: |
|
||||
max without (endpoint) (
|
||||
sum without (instance, pod) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance, pod) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -30,5 +30,5 @@ main/nodes
|
||||
control-plane/control-plane-status
|
||||
control-plane/deprecated-resources
|
||||
control-plane/dns-coredns
|
||||
control-plane/kube-etcd3
|
||||
control-plane/kube-etcd
|
||||
kubevirt/kubevirt-control-plane
|
||||
|
||||
@@ -5,7 +5,8 @@ etcd 2.0.1 6fc1cc7d
|
||||
etcd 2.1.0 2b00fcf8
|
||||
etcd 2.2.0 5ca8823
|
||||
etcd 2.3.0 b908400d
|
||||
etcd 2.4.0 HEAD
|
||||
etcd 2.4.0 cb7b8158
|
||||
etcd 2.5.0 HEAD
|
||||
ingress 1.0.0 f642698
|
||||
ingress 1.1.0 838bee5d
|
||||
ingress 1.2.0 ced8e5b
|
||||
|
||||
Reference in New Issue
Block a user