mirror of
https://github.com/outbackdingo/cozystack.git
synced 2026-01-27 10:18:39 +00:00
Import Piraeus dashboard and alerts. (#658)
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Expanded the monitored dashboards with a new storage dashboard entry.
  - Introduced proactive alert configurations that cover key storage components.
  - Added templated alert management to streamline dynamic configuration.
  - Enhanced metric collection by integrating monitoring endpoints for storage components.
  - Delivered a comprehensive dashboard offering real-time insights into storage performance.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
Co-authored-by: Andrei Kvapil <kvapss@gmail.com>
This commit is contained in:
2193
dashboards/storage/linstor.json
Normal file
2193
dashboards/storage/linstor.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -37,3 +37,4 @@ flux/flux-stats
|
||||
kafka/strimzi-kafka
|
||||
goldpinger/goldpinger
|
||||
clickhouse/altinity-clickhouse-operator-dashboard
|
||||
storage/linstor
|
||||
44
packages/system/linstor/templates/podscrape.yaml
Normal file
44
packages/system/linstor/templates/podscrape.yaml
Normal file
@@ -0,0 +1,44 @@
|
||||
# Scrapes the `prometheus` port of every pod labelled
# app.kubernetes.io/component=linstor-satellite in the cozy-linstor namespace.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: linstor-satellite
  namespace: cozy-linstor
spec:
  podMetricsEndpoints:
    - port: prometheus
      scheme: http
      relabelConfigs:
        # Drop the per-target discovery labels from the scraped series.
        - action: labeldrop
          regex: (endpoint|namespace|pod|container)
        # Pin `job` to a fixed value. It must name the satellite, not the
        # controller: alert rules select targets by this label.
        - replacement: linstor-satellite
          targetLabel: job
        # Expose the Kubernetes node the pod runs on as `node`.
        - sourceLabels: [__meta_kubernetes_pod_node_name]
          targetLabel: node
        - targetLabel: tier
          replacement: cluster
  selector:
    matchLabels:
      app.kubernetes.io/component: linstor-satellite
|
||||
---
|
||||
# Scrapes `/metrics` on the `api` port of every pod labelled
# app.kubernetes.io/component=linstor-controller in the cozy-linstor namespace.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: linstor-controller
  namespace: cozy-linstor
spec:
  podMetricsEndpoints:
    - path: /metrics
      port: api
      scheme: http
      relabelConfigs:
        # Drop the per-target discovery labels from the scraped series.
        - action: labeldrop
          regex: (endpoint|namespace|pod|container)
        # Pin `job` to a fixed value. It must name the controller, not the
        # satellite: alert rules select targets by this label.
        - replacement: linstor-controller
          targetLabel: job
        # Expose the Kubernetes node the pod runs on as `node`.
        - sourceLabels: [__meta_kubernetes_pod_node_name]
          targetLabel: node
        - targetLabel: tier
          replacement: cluster
  selector:
    matchLabels:
      app.kubernetes.io/component: linstor-controller
|
||||
116
packages/system/piraeus-operator/alerts/piraeus-datastore.yaml
Normal file
116
packages/system/piraeus-operator/alerts/piraeus-datastore.yaml
Normal file
@@ -0,0 +1,116 @@
|
||||
---
|
||||
# Alerting rules for the Piraeus datastore: LINSTOR controller/satellite health
# and DRBD resource state.
#
# The `{{ ... }}` expressions in the descriptions are alert-time templates
# ($labels / $value), not Helm templates: the chart embeds this file verbatim.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: piraeus-datastore
spec:
  groups:
    - name: linstor.rules
      rules:
        - alert: linstorControllerOffline
          annotations:
            description: |
              LINSTOR Controller is not reachable.
          expr: up{job="linstor-controller"} == 0
          labels:
            severity: critical
        - alert: linstorSatelliteErrorRate
          annotations:
            description: |
              LINSTOR Satellite "{{ $labels.name }}" reports {{ $value }} errors in the last 15 minutes.
              Use "linstor error-reports list --nodes {{ $labels.name }} --since 15minutes" to see them.
          expr: increase(linstor_error_reports_count{module="SATELLITE"}[15m]) > 0
          labels:
            severity: warning
        - alert: linstorControllerErrorRate
          annotations:
            description: |
              LINSTOR Controller reports {{ $value }} errors in the last 15 minutes.
              Use "linstor error-reports list --since 15minutes" to see them.
          expr: increase(linstor_error_reports_count{module="CONTROLLER"}[15m]) > 0
          labels:
            severity: warning
        - alert: linstorSatelliteNotOnline
          annotations:
            description: |
              LINSTOR Satellite "{{ $labels.name }}" is not ONLINE.
              Check that the Satellite is running and reachable from the LINSTOR Controller.
          expr: linstor_node_state{nodetype="SATELLITE"} != 2
          labels:
            severity: critical
        - alert: linstorStoragePoolErrors
          annotations:
            description: |
              Storage pool "{{ $labels.storage_pool }}" on node "{{ $labels.node }}" ({{ $labels.driver }}={{ $labels.backing_pool }}) is reporting errors.
          expr: linstor_storage_pool_error_count > 0
          labels:
            severity: critical
        - alert: linstorStoragePoolAtCapacity
          annotations:
            description: |
              Storage pool "{{ $labels.storage_pool }}" on node "{{ $labels.node }}" ({{ $labels.driver }}={{ $labels.backing_pool }}) has less than 20% free space available.
          expr: ( linstor_storage_pool_capacity_free_bytes / linstor_storage_pool_capacity_total_bytes ) < 0.20
          labels:
            severity: warning
    - name: drbd.rules
      rules:
        - alert: drbdReactorOffline
          annotations:
            description: |
              DRBD Reactor on "{{ $labels.node }}" is not reachable.
          # The scrape configs in packages/system/linstor/templates/podscrape.yaml
          # overwrite `job` with a fixed plain name and never emit a
          # "<namespace>/<name>" value, so select on the plain satellite name.
          expr: up{job="linstor-satellite"} == 0
          labels:
            severity: critical
        - alert: drbdConnectionNotConnected
          annotations:
            description: |
              DRBD Resource "{{ $labels.name }}" on "{{ $labels.node }}" is not connected to "{{ $labels.conn_name }}": {{ $labels.drbd_connection_state }}.
          expr: drbd_connection_state{drbd_connection_state!="Connected"} > 0
          labels:
            severity: warning
        - alert: drbdDeviceNotUpToDate
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.node }}" has unexpected device state "{{ $labels.drbd_device_state }}".
          expr: drbd_device_state{drbd_device_state!~"UpToDate|Diskless"} > 0
          labels:
            severity: warning
        - alert: drbdDeviceUnintentionalDiskless
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.node }}" is unintentionally diskless.
              This usually indicates IO errors reported on the backing device. Check the kernel log.
          expr: drbd_device_unintentionaldiskless > 0
          labels:
            severity: warning
        - alert: drbdDeviceWithoutQuorum
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.node }}" has no quorum.
              This usually indicates connectivity issues.
          expr: drbd_device_quorum == 0
          labels:
            severity: warning
        - alert: drbdResourceSuspended
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" on "{{ $labels.node }}" has been suspended for 1m.
          # Only fire once the condition has held for the full minute the
          # description mentions.
          for: 1m
          expr: drbd_resource_suspended > 0
          labels:
            severity: warning
        - alert: drbdResourceResyncWithoutProgress
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" on "{{ $labels.node }}" has been in Inconsistent without resync progress for 5 minutes.
              This may indicate there is no connection to UpToDate data, or a stuck resync.
          # Inconsistent device whose out-of-sync byte count has not decreased
          # over the last 5 minutes.
          expr: drbd_device_state{drbd_device_state="Inconsistent"} and delta(drbd_peerdevice_outofsync_bytes[5m]) >= 0
          labels:
            severity: warning
        - alert: drbdResourceWithNoUpToDateReplicas
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" has no UpToDate replicas.
          expr: sum by (name) (drbd_device_state{drbd_device_state="UpToDate"}) == 0
          labels:
            severity: critical
|
||||
7
packages/system/piraeus-operator/templates/alerts.yaml
Normal file
7
packages/system/piraeus-operator/templates/alerts.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
{{- /*
Emit every rule file under alerts/ verbatim as its own YAML document,
preceded by a comment naming the file it came from.
*/ -}}
{{- range $path, $_ := .Files.Glob "alerts/*.yaml" }}
---
# from: {{ $path }}
{{ $.Files.Get $path }}

{{- end -}}
|
||||
Reference in New Issue
Block a user