fix(pg): replication alerts add stuck receive & cleanup

This commit is contained in:
JJGadgets
2025-01-18 06:10:54 +08:00
parent d7f3fd1414
commit 04f0dee7af

View File

@@ -9,15 +9,15 @@ spec:
groups:
- name: crunchypgo.rules
rules:
- alert: PostgresWALsStopped
annotations:
summary: >-
{{ $labels.pg_cluster }}'s master has stopped WALs generation.
expr: |
(sum by(pg_cluster, role) (increase(pg_wal_size_bytes{role="master"}[1h]))) <bool 100000000
for: 1h
labels:
severity: critical
# - alert: PostgresWALsStopped
# annotations:
# summary: >-
# {{ $labels.pg_cluster }}'s master has stopped WALs generation.
# expr: |
# (sum by(pg_cluster, role) (increase(pg_wal_size_bytes{role="master"}[1h]))) <bool 100000000
# for: 1h
# labels:
# severity: critical
- alert: PostgresReplicasReplayStuck
annotations:
summary: >-
@@ -27,6 +27,15 @@ spec:
for: 1h
labels:
severity: critical
- alert: PostgresReplicasReceiveStuck
annotations:
summary: >-
{{ $labels.pg_cluster }}'s replicas are stuck receiving.
expr: |
(sum by(pg_cluster, role) (ccp_replication_lag_received_time{role="replica"})) >bool 99999
for: 1h
labels:
severity: critical
- alert: PostgresReplicasArchiveError
annotations:
summary: >-
@@ -36,14 +45,14 @@ spec:
for: 1h
labels:
severity: critical
- alert: PostgresReplicationBad
annotations:
summary: >-
{{ $labels.pg_cluster }} is in a bad replication state.
expr: |
( (sum by(pg_cluster) (increase(pg_wal_size_bytes{role="master"}[1h]))) <bool 100000000 ) +
( (sum by(pg_cluster) (ccp_replication_lag_replay_time{role="replica"})) >bool 0 ) +
( (sum by(pg_cluster) (ccp_archive_command_status_seconds_since_last_fail{role="replica"})) >bool 0 )
for: 1h
labels:
severity: critical
# - alert: PostgresReplicationBad
# annotations:
# summary: >-
# {{ $labels.pg_cluster }} is in a bad replication state.
# expr: |
# ( (sum by(pg_cluster) (ccp_replication_lag_replay_time{role="replica"})) >bool 0 ) +
# ( (sum by(pg_cluster) (ccp_replication_lag_received_time{role="replica"})) >bool 99999 ) +
# ( (sum by(pg_cluster) (ccp_archive_command_status_seconds_since_last_fail{role="replica"})) >bool 0 )
# for: 1h
# labels:
# severity: critical