diff --git a/kube/deploy/core/db/pg/app/prometheusrule-alerts.yaml b/kube/deploy/core/db/pg/app/prometheusrule-alerts.yaml index 0645548e..913a41d3 100644 --- a/kube/deploy/core/db/pg/app/prometheusrule-alerts.yaml +++ b/kube/deploy/core/db/pg/app/prometheusrule-alerts.yaml @@ -9,15 +9,15 @@ spec: groups: - name: crunchypgo.rules rules: - - alert: PostgresWALsStopped - annotations: - summary: >- - {{ $labels.pg_cluster }}'s master has stopped WALs generation. - expr: | - (sum by(pg_cluster, role) (increase(pg_wal_size_bytes{role="master"}[1h]))) - + # {{ $labels.pg_cluster }}'s master has stopped WALs generation. + # expr: | + # (sum by(pg_cluster, role) (increase(pg_wal_size_bytes{role="master"}[1h]))) - @@ -27,6 +27,15 @@ spec: for: 1h labels: severity: critical + - alert: PostgresReplicasReceiveStuck + annotations: + summary: >- + {{ $labels.pg_cluster }}'s replicas are stuck receiving. + expr: | + (sum by(pg_cluster, role) (ccp_replication_lag_received_time{role="replica"})) >bool 99999 + for: 1h + labels: + severity: critical - alert: PostgresReplicasArchiveError annotations: summary: >- @@ -36,14 +45,14 @@ spec: for: 1h labels: severity: critical - - alert: PostgresReplicationBad - annotations: - summary: >- - {{ $labels.pg_cluster }} is in a bad replication state. - expr: | - ( (sum by(pg_cluster) (increase(pg_wal_size_bytes{role="master"}[1h]))) bool 0 ) + - ( (sum by(pg_cluster) (ccp_archive_command_status_seconds_since_last_fail{role="replica"})) >bool 0 ) - for: 1h - labels: - severity: critical + # - alert: PostgresReplicationBad + # annotations: + # summary: >- + # {{ $labels.pg_cluster }} is in a bad replication state. + # expr: | + # ( (sum by(pg_cluster) (ccp_replication_lag_replay_time{role="replica"})) >bool 0 ) + + # ( (sum by(pg_cluster) (ccp_replication_lag_received_time{role="replica"})) > 99999 ) + + # ( (sum by(pg_cluster) (ccp_archive_command_status_seconds_since_last_fail{role="replica"})) >bool 0 ) + # for: 1h + # labels: + # severity: critical