diff --git a/terraform/environments/production/main.tf b/terraform/environments/production/main.tf
index 8b2c65fd7..3fae42e0d 100644
--- a/terraform/environments/production/main.tf
+++ b/terraform/environments/production/main.tf
@@ -814,6 +814,8 @@ module "ops" {
   slack_alerts_auth_token = var.slack_alerts_auth_token
   slack_alerts_channel    = var.slack_alerts_channel
 
+  pagerduty_auth_token = var.pagerduty_auth_token
+
   api_host = module.api.host
   web_host = module.web.host
 }
diff --git a/terraform/environments/production/variables.tf b/terraform/environments/production/variables.tf
index 18363cec4..dc42e40a9 100644
--- a/terraform/environments/production/variables.tf
+++ b/terraform/environments/production/variables.tf
@@ -32,3 +32,8 @@ variable "slack_alerts_auth_token" {
 variable "postmark_server_api_token" {
   type = string
 }
+
+variable "pagerduty_auth_token" {
+  type      = string
+  sensitive = true
+}
diff --git a/terraform/environments/production/website.tf b/terraform/environments/production/website.tf
new file mode 100644
index 000000000..68e7912e6
--- /dev/null
+++ b/terraform/environments/production/website.tf
@@ -0,0 +1,84 @@
+# Uptime monitoring for the public website served at local.tld.
+
+# HTTPS uptime check: GET / on port 443 must return a 2xx response whose body
+# contains the string "firezone"; the TLS certificate is validated as well.
+resource "google_monitoring_uptime_check_config" "website-https" {
+  project = module.google-cloud-project.project.project_id
+
+  display_name = "website-https"
+  timeout      = "60s"
+
+  http_check {
+    port         = "443"
+    use_ssl      = true
+    validate_ssl = true
+
+    request_method = "GET"
+    path           = "/"
+
+    accepted_response_status_codes {
+      status_class = "STATUS_CLASS_2XX"
+    }
+  }
+
+  monitored_resource {
+    type = "uptime_url"
+
+    labels = {
+      project_id = module.google-cloud-project.project.project_id
+      host       = local.tld
+    }
+  }
+
+  content_matchers {
+    matcher = "CONTAINS_STRING"
+    content = "firezone"
+  }
+
+  checker_type = "STATIC_IP_CHECKERS"
+}
+
+# Notifies the ops module's channels when more than one time series of the
+# website-https check (grouped by project and host) reports a failed result
+# within a 60s alignment period.
+resource "google_monitoring_alert_policy" "website-downtime" {
+  project = module.google-cloud-project.project.project_id
+
+  display_name = "Website is DOWN!"
+  combiner     = "OR"
+
+  notification_channels = module.ops.notification_channels
+
+  conditions {
+    display_name = "Uptime Health Check on website-https"
+
+    condition_threshold {
+      # The check_id label is the last path segment of the uptime check's resource ID.
+      filter = "resource.type = \"uptime_url\" AND metric.type = \"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id = \"${reverse(split("/", google_monitoring_uptime_check_config.website-https.id))[0]}\""
+
+      comparison = "COMPARISON_GT"
+
+      threshold_value = 1
+      duration        = "0s"
+
+      trigger {
+        count = 1
+      }
+
+      aggregations {
+        alignment_period     = "60s"
+        cross_series_reducer = "REDUCE_COUNT_FALSE"
+        per_series_aligner   = "ALIGN_NEXT_OLDER"
+
+        group_by_fields = [
+          "resource.label.project_id",
+          "resource.label.host"
+        ]
+      }
+    }
+  }
+
+  alert_strategy {
+    auto_close = "28800s"
+  }
+}
diff --git a/terraform/modules/google-cloud-ops/main.tf b/terraform/modules/google-cloud-ops/main.tf
index 5f599765e..15a6ca707 100644
--- a/terraform/modules/google-cloud-ops/main.tf
+++ b/terraform/modules/google-cloud-ops/main.tf
@@ -13,6 +13,30 @@ resource "google_monitoring_notification_channel" "slack" {
   }
 }
 
+# Optional PagerDuty channel; created only when a PagerDuty key is supplied.
+resource "google_monitoring_notification_channel" "pagerduty" {
+  count = var.pagerduty_auth_token != null ? 1 : 0
+
+  project = var.project_id
+
+  display_name = "PagerDuty"
+  type         = "pagerduty"
+
+  sensitive_labels {
+    service_key = var.pagerduty_auth_token
+  }
+}
+
+locals {
+  # Channels shared by every alert policy: Slack always, plus PagerDuty when
+  # enabled. The splat yields an empty list when count is 0, so the enablement
+  # condition lives only on the resource above.
+  notification_channels = concat(
+    [google_monitoring_notification_channel.slack.name],
+    google_monitoring_notification_channel.pagerduty[*].name
+  )
+}
+
 resource "google_monitoring_uptime_check_config" "api-https" {
   project = var.project_id
 
@@ -102,9 +126,7 @@ resource "google_monitoring_alert_policy" "api-downtime" {
   display_name = "API service is DOWN!"
   combiner     = "OR"
 
-  notification_channels = [
-    google_monitoring_notification_channel.slack.name
-  ]
+  notification_channels = local.notification_channels
 
   conditions {
     display_name = "Uptime Health Check on api-https"
@@ -144,9 +166,7 @@ resource "google_monitoring_alert_policy" "web-downtime" {
   display_name = "Portal service is DOWN!"
   combiner     = "OR"
 
-  notification_channels = [
-    google_monitoring_notification_channel.slack.name
-  ]
+  notification_channels = local.notification_channels
 
   conditions {
     display_name = "Uptime Health Check on web-https"
@@ -186,9 +206,7 @@ resource "google_monitoring_alert_policy" "instances_high_cpu_policy" {
   display_name = "High Instance CPU utilization"
   combiner     = "OR"
 
-  notification_channels = [
-    google_monitoring_notification_channel.slack.name
-  ]
+  notification_channels = local.notification_channels
 
   conditions {
     display_name = "VM Instance - CPU utilization"
@@ -223,9 +241,7 @@ resource "google_monitoring_alert_policy" "sql_high_cpu_policy" {
   display_name = "High Cloud SQL CPU utilization"
   combiner     = "OR"
 
-  notification_channels = [
-    google_monitoring_notification_channel.slack.name
-  ]
+  notification_channels = local.notification_channels
 
   conditions {
     display_name = "Cloud SQL Database - CPU utilization"
@@ -260,9 +276,7 @@ resource "google_monitoring_alert_policy" "sql_disk_utiliziation_policy" {
   display_name = "High Cloud SQL Disk utilization"
   combiner     = "OR"
 
-  notification_channels = [
-    google_monitoring_notification_channel.slack.name
-  ]
+  notification_channels = local.notification_channels
 
   conditions {
     display_name = "Cloud SQL Database - Disk utilization"
@@ -290,3 +304,79 @@ resource "google_monitoring_alert_policy" "sql_disk_utiliziation_policy" {
     auto_close = "28800s"
   }
 }
+
+# Log-based alert for errors emitted on Compute Engine instances.
+# NOTE(review): despite the "GenServer Crashes" name, the filter below matches
+# every log entry with severity >= ERROR from any gce_instance resource, not only
+# GenServer crash reports - confirm whether the filter should be narrowed.
+resource "google_monitoring_alert_policy" "genservers_crash_policy" {
+  project = var.project_id
+
+  display_name = "GenServer Crashes"
+  combiner     = "OR"
+
+  notification_channels = local.notification_channels
+
+  conditions {
+    display_name = "Log match condition"
+
+    condition_matched_log {
+      filter = <<-EOT
+      resource.type="gce_instance"
+      severity>=ERROR
+      EOT
+    }
+  }
+
+  alert_strategy {
+    auto_close = "28800s"
+
+    # Rate-limit notifications for this log-match policy to one per 3600s period.
+    notification_rate_limit {
+      period = "3600s"
+    }
+  }
+}
+
+# Alerts when the time_until_ssl_cert_expires metric of any uptime check drops
+# below the threshold (15; presumably days - confirm the metric's unit).
+resource "google_monitoring_alert_policy" "ssl_certs_expiring_policy" {
+  project = var.project_id
+
+  display_name = "SSL certificate expiring soon"
+  combiner     = "OR"
+
+  notification_channels = local.notification_channels
+
+  user_labels = {
+    version = "1"
+    uptime  = "ssl_cert_expiration"
+  }
+
+  conditions {
+    display_name = "SSL certificate expiring soon"
+
+    condition_threshold {
+      comparison = "COMPARISON_LT"
+      filter     = "metric.type=\"monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires\" AND resource.type=\"uptime_url\""
+
+      aggregations {
+        alignment_period     = "1200s"
+        cross_series_reducer = "REDUCE_MEAN"
+        group_by_fields      = ["resource.label.*"]
+        per_series_aligner   = "ALIGN_NEXT_OLDER"
+      }
+
+      duration        = "600s"
+      threshold_value = 15
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  alert_strategy {
+    auto_close = "28800s"
+  }
+}
diff --git a/terraform/modules/google-cloud-ops/outputs.tf b/terraform/modules/google-cloud-ops/outputs.tf
new file mode 100644
index 000000000..1ee4e96c2
--- /dev/null
+++ b/terraform/modules/google-cloud-ops/outputs.tf
@@ -0,0 +1,4 @@
+output "notification_channels" {
+  description = "Names of the notification channels (Slack, plus PagerDuty when enabled) used by this module's alert policies"
+  value       = local.notification_channels
+}
diff --git a/terraform/modules/google-cloud-ops/variables.tf b/terraform/modules/google-cloud-ops/variables.tf
index ec9d42b8c..4a105ed6c 100644
--- a/terraform/modules/google-cloud-ops/variables.tf
+++ b/terraform/modules/google-cloud-ops/variables.tf
@@ -12,6 +12,13 @@ variable "slack_alerts_auth_token" {
   description = "Slack auth token for the infra alerts channel"
 }
 
+variable "pagerduty_auth_token" {
+  type        = string
+  description = "PagerDuty service key for the alerts notification channel; leave null to disable PagerDuty"
+  default     = null
+  sensitive   = true
+}
+
 variable "api_host" {
   type = string
 }