Add an alert when relays are down

This commit is contained in:
Andrew Dryga
2024-04-11 18:04:00 -06:00
parent 13d7312738
commit 54def90b12

View File

@@ -154,3 +154,40 @@ resource "google_compute_firewall" "relays-ssh-ipv4" {
source_ranges = local.iap_ipv4_ranges
target_tags = module.relays[0].target_tags
}
# Trigger an alert when more than 20% of relays are down
resource "google_monitoring_alert_policy" "instances_high_cpu_policy" {
project = module.google-cloud-project.project.project_id
display_name = "Relays are down"
combiner = "OR"
notification_channels = module.ops.notification_channels
conditions {
display_name = "Relay Instances"
condition_threshold {
filter = "resource.type = \"gce_instance\" AND metric.type = \"custom.googleapis.com/elixir/domain/relays/online_relays_count/last_value\""
comparison = "COMPARISON_GT"
# at least one relay per region must be always online
threshold_value = module.relays[0].instances
duration = "5s"
trigger {
count = 1
}
aggregations {
alignment_period = "60s"
cross_series_reducer = "REDUCE_NONE"
per_series_aligner = "ALIGN_MEAN"
}
}
}
alert_strategy {
auto_close = "172800s"
}
}