mirror of
https://github.com/outbackdingo/Biohazard.git
synced 2026-03-21 15:39:44 +00:00
fix(pg): replication alerts add stuck receive & cleanup
This commit is contained in:
@@ -9,15 +9,15 @@ spec:
|
||||
groups:
|
||||
- name: crunchypgo.rules
|
||||
rules:
|
||||
# Fires when the summed 1h increase of pg_wal_size_bytes on a cluster's master
# stays below 100000000 for a full hour.
- alert: PostgresWALsStopped
  annotations:
    summary: >-
      {{ $labels.pg_cluster }}'s master has stopped WALs generation.
  # Plain `<` (no `bool` modifier): the comparison must FILTER series. With
  # `bool` every series is returned with value 0 or 1, and an alerting rule is
  # active for any returned series, so the alert would fire unconditionally.
  # NOTE(review): threshold is presumably bytes (~100 MB) - confirm against the exporter.
  expr: |
    (sum by(pg_cluster, role) (increase(pg_wal_size_bytes{role="master"}[1h]))) < 100000000
  for: 1h
  labels:
    severity: critical
|
||||
# - alert: PostgresWALsStopped
|
||||
# annotations:
|
||||
# summary: >-
|
||||
# {{ $labels.pg_cluster }}'s master has stopped WALs generation.
|
||||
# expr: |
|
||||
# (sum by(pg_cluster, role) (increase(pg_wal_size_bytes{role="master"}[1h]))) <bool 100000000
|
||||
# for: 1h
|
||||
# labels:
|
||||
# severity: critical
|
||||
- alert: PostgresReplicasReplayStuck
|
||||
annotations:
|
||||
summary: >-
|
||||
@@ -27,6 +27,15 @@ spec:
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
# Fires when the summed ccp_replication_lag_received_time across a cluster's
# replicas stays above 99999 for a full hour.
- alert: PostgresReplicasReceiveStuck
  annotations:
    summary: >-
      {{ $labels.pg_cluster }}'s replicas are stuck receiving.
  # Plain `>` (no `bool` modifier): the comparison must FILTER series. With
  # `bool` every series is returned with value 0 or 1, and an alerting rule is
  # active for any returned series, so the alert would fire unconditionally.
  # NOTE(review): the unit of 99999 is not visible here (presumably seconds) - confirm.
  expr: |
    (sum by(pg_cluster, role) (ccp_replication_lag_received_time{role="replica"})) > 99999
  for: 1h
  labels:
    severity: critical
|
||||
- alert: PostgresReplicasArchiveError
|
||||
annotations:
|
||||
summary: >-
|
||||
@@ -36,14 +45,14 @@ spec:
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
# Fires when at least one of three replication health checks has tripped for a
# cluster continuously for one hour.
- alert: PostgresReplicationBad
  annotations:
    summary: >-
      {{ $labels.pg_cluster }} is in a bad replication state.
  # Each inner term keeps `bool` so it evaluates to 0 or 1 and the three can be
  # added into a per-cluster count of tripped checks. The outer `> 0` then
  # filters out clusters whose count is 0; without it the sum is always
  # returned (even as 0) and the alert would fire unconditionally.
  # NOTE(review): `+` only yields a result for pg_cluster values present in all
  # three operands - confirm every cluster exports all three metrics.
  expr: |
    (
      ( (sum by(pg_cluster) (increase(pg_wal_size_bytes{role="master"}[1h]))) <bool 100000000 ) +
      ( (sum by(pg_cluster) (ccp_replication_lag_replay_time{role="replica"})) >bool 0 ) +
      ( (sum by(pg_cluster) (ccp_archive_command_status_seconds_since_last_fail{role="replica"})) >bool 0 )
    ) > 0
  for: 1h
  labels:
    severity: critical
|
||||
# - alert: PostgresReplicationBad
|
||||
# annotations:
|
||||
# summary: >-
|
||||
# {{ $labels.pg_cluster }} is in a bad replication state.
|
||||
# expr: |
|
||||
# ( (sum by(pg_cluster) (ccp_replication_lag_replay_time{role="replica"})) >bool 0 ) +
|
||||
# ( (sum by(pg_cluster) (ccp_replication_lag_received_time{role="replica"})) > 99999 ) +
|
||||
# ( (sum by(pg_cluster) (ccp_archive_command_status_seconds_since_last_fail{role="replica"})) >bool 0 )
|
||||
# for: 1h
|
||||
# labels:
|
||||
# severity: critical
|
||||
|
||||
Reference in New Issue
Block a user