Import Piraeus dashboard and alerts. (#658)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- **New Features**
  - Expanded the monitored dashboards with a new storage dashboard entry.
  - Introduced proactive alert configurations that cover key storage
    components.
  - Added templated alert management to streamline dynamic configuration.
  - Enhanced metric collection by integrating monitoring endpoints for
    storage components.
  - Delivered a comprehensive dashboard offering real-time insights into
    storage performance.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->

---------

Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
Co-authored-by: Andrei Kvapil <kvapss@gmail.com>
This commit is contained in:
xy2
2025-03-05 16:51:23 +03:00
committed by GitHub
parent 266d097cab
commit c60b7c0730
5 changed files with 2361 additions and 0 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -37,3 +37,4 @@ flux/flux-stats
kafka/strimzi-kafka
goldpinger/goldpinger
clickhouse/altinity-clickhouse-operator-dashboard
storage/linstor

View File

@@ -0,0 +1,44 @@
# Scrapes the "prometheus" port of pods labelled as LINSTOR satellites.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: linstor-satellite
  namespace: cozy-linstor
spec:
  podMetricsEndpoints:
    - port: prometheus
      scheme: http
      relabelConfigs:
        # Remove per-target discovery labels from the scraped series.
        - action: labeldrop
          regex: (endpoint|namespace|pod|container)
        # The job label must name the component scraped by THIS resource;
        # alert expressions select targets by it (e.g. up{job="..."}).
        - replacement: linstor-satellite
          targetLabel: job
        # Expose the Kubernetes node the pod runs on as "node".
        - sourceLabels: [__meta_kubernetes_pod_node_name]
          targetLabel: node
        - targetLabel: tier
          replacement: cluster
  selector:
    matchLabels:
      app.kubernetes.io/component: linstor-satellite
---
# Scrapes /metrics on the "api" port of pods labelled as the LINSTOR controller.
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMPodScrape
metadata:
  name: linstor-controller
  namespace: cozy-linstor
spec:
  podMetricsEndpoints:
    - path: /metrics
      port: api
      scheme: http
      relabelConfigs:
        # Remove per-target discovery labels from the scraped series.
        - action: labeldrop
          regex: (endpoint|namespace|pod|container)
        # The job label must name the component scraped by THIS resource;
        # alert expressions select targets by it (e.g. up{job="linstor-controller"}).
        - replacement: linstor-controller
          targetLabel: job
        # Expose the Kubernetes node the pod runs on as "node".
        - sourceLabels: [__meta_kubernetes_pod_node_name]
          targetLabel: node
        - targetLabel: tier
          replacement: cluster
  selector:
    matchLabels:
      app.kubernetes.io/component: linstor-controller

View File

@@ -0,0 +1,116 @@
---
# Alerting rules for LINSTOR (controller/satellites/storage pools) and DRBD.
# Severity values are limited to "critical" and "warning" so that routing
# only has to match one spelling per level.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: piraeus-datastore
spec:
  groups:
    - name: linstor.rules
      rules:
        - alert: linstorControllerOffline
          annotations:
            description: |
              LINSTOR Controller is not reachable.
          expr: up{job="linstor-controller"} == 0
          labels:
            severity: critical
        - alert: linstorSatelliteErrorRate
          annotations:
            description: |
              LINSTOR Satellite "{{ $labels.name }}" reports {{ $value }} errors in the last 15 minutes.
              Use "linstor error-reports list --nodes {{ $labels.name }} --since 15minutes" to see them.
          expr: increase(linstor_error_reports_count{module="SATELLITE"}[15m]) > 0
          labels:
            severity: warning
        - alert: linstorControllerErrorRate
          annotations:
            description: |
              LINSTOR Controller reports {{ $value }} errors in the last 15 minutes.
              Use "linstor error-reports list --since 15minutes" to see them.
          expr: increase(linstor_error_reports_count{module="CONTROLLER"}[15m]) > 0
          labels:
            severity: warning
        - alert: linstorSatelliteNotOnline
          annotations:
            description: |
              LINSTOR Satellite "{{ $labels.name }}" is not ONLINE.
              Check that the Satellite is running and reachable from the LINSTOR Controller.
          # NOTE(review): 2 is presumably the numeric code for ONLINE - confirm against the exporter.
          expr: linstor_node_state{nodetype="SATELLITE"} != 2
          labels:
            severity: critical
        - alert: linstorStoragePoolErrors
          annotations:
            description: |
              Storage pool "{{ $labels.storage_pool }}" on node "{{ $labels.node }}" ({{ $labels.driver }}={{ $labels.backing_pool }}) is reporting errors.
          expr: linstor_storage_pool_error_count > 0
          labels:
            severity: critical
        - alert: linstorStoragePoolAtCapacity
          annotations:
            description: |
              Storage pool "{{ $labels.storage_pool }}" on node "{{ $labels.node }}" ({{ $labels.driver }}={{ $labels.backing_pool }}) has less than 20% free space available.
          # Fires when the free/total ratio drops below 20%.
          expr: ( linstor_storage_pool_capacity_free_bytes / linstor_storage_pool_capacity_total_bytes ) < 0.20
          labels:
            severity: warning
    - name: drbd.rules
      rules:
        - alert: drbdReactorOffline
          annotations:
            description: |
              DRBD Reactor on "{{ $labels.node }}" is not reachable.
          # The scrape relabeling assigns bare job names (no "<namespace>/" prefix),
          # so the selector uses the bare satellite job name.
          expr: up{job="linstor-satellite"} == 0
          labels:
            severity: critical
        - alert: drbdConnectionNotConnected
          annotations:
            description: |
              DRBD Resource "{{ $labels.name }}" on "{{ $labels.node }}" is not connected to "{{ $labels.conn_name }}": {{ $labels.drbd_connection_state }}.
          expr: drbd_connection_state{drbd_connection_state!="Connected"} > 0
          labels:
            severity: warning
        - alert: drbdDeviceNotUpToDate
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.node }}" has unexpected device state "{{ $labels.drbd_device_state }}".
          expr: drbd_device_state{drbd_device_state!~"UpToDate|Diskless"} > 0
          labels:
            severity: warning
        - alert: drbdDeviceUnintentionalDiskless
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.node }}" is unintentionally diskless.
              This usually indicates IO errors reported on the backing device. Check the kernel log.
          expr: drbd_device_unintentionaldiskless > 0
          labels:
            severity: warning
        - alert: drbdDeviceWithoutQuorum
          annotations:
            description: |
              DRBD device "{{ $labels.name }}" on "{{ $labels.node }}" has no quorum.
              This usually indicates connectivity issues.
          expr: drbd_device_quorum == 0
          labels:
            severity: warning
        - alert: drbdResourceSuspended
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" on "{{ $labels.node }}" has been suspended for 1m.
          # Only fire once the condition has held for a full minute.
          for: 1m
          expr: drbd_resource_suspended > 0
          labels:
            severity: warning
        - alert: drbdResourceResyncWithoutProgress
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" on "{{ $labels.node }}" has been in Inconsistent without resync progress for 5 minutes.
              This may indicate there is no connection to UpToDate data, or a stuck resync.
          # NOTE(review): "and" matches series on their full label sets; confirm both
          # metrics carry identical labels, otherwise an on(...) clause is needed.
          expr: drbd_device_state{drbd_device_state="Inconsistent"} and delta(drbd_peerdevice_outofsync_bytes[5m]) >= 0
          labels:
            severity: warning
        - alert: drbdResourceWithNoUpToDateReplicas
          annotations:
            description: |
              DRBD resource "{{ $labels.name }}" has no UpToDate replicas.
          expr: sum by (name) (drbd_device_state{drbd_device_state="UpToDate"}) == 0
          labels:
            severity: critical

View File

@@ -0,0 +1,7 @@
{{- /*
Emit each chart file matching alerts/*.yaml as a separate YAML document,
prefixed with a comment that records the file's path. The file bytes are
converted with toString and written out as-is.
*/ -}}
{{- range $name, $content := .Files.Glob "alerts/*.yaml" }}
---
# from: {{ $name }}
{{ toString $content }}
{{- end -}}