Report incidents to PagerDuty; monitor website uptime and errors in logs

This commit is contained in:
Andrew Dryga
2023-12-07 19:06:34 -05:00
parent 8312779813
commit 8fe09d2859
6 changed files with 186 additions and 15 deletions

View File

@@ -814,6 +814,8 @@ module "ops" {
slack_alerts_auth_token = var.slack_alerts_auth_token
slack_alerts_channel = var.slack_alerts_channel
pagerduty_auth_token = var.pagerduty_auth_token
api_host = module.api.host
web_host = module.web.host
}

View File

@@ -32,3 +32,7 @@ variable "slack_alerts_auth_token" {
# Postmark server API token (string, required: no default is declared).
variable "postmark_server_api_token" {
  type = string
}
# PagerDuty auth token (string). No default is declared at this level, so a
# value must be supplied by the caller.
variable "pagerduty_auth_token" {
  type = string
}

View File

@@ -0,0 +1,77 @@
# Uptime check that issues "GET /" over HTTPS (port 443) against the host in
# local.tld. The check is configured to accept 2xx responses and to look for
# the string "firezone" in the response content.
resource "google_monitoring_uptime_check_config" "website-https" {
  project      = module.google-cloud-project.project.project_id
  display_name = "website-https"

  timeout      = "60s"
  checker_type = "STATIC_IP_CHECKERS"

  # What is being probed: a plain URL identified by project and host.
  monitored_resource {
    type = "uptime_url"

    labels = {
      project_id = module.google-cloud-project.project.project_id
      host       = local.tld
    }
  }

  # How it is probed.
  http_check {
    request_method = "GET"
    path           = "/"
    port           = "443"

    # TLS is required and the certificate is validated.
    use_ssl      = true
    validate_ssl = true

    accepted_response_status_codes {
      status_class = "STATUS_CLASS_2XX"
    }
  }

  # Response content must contain this marker string.
  content_matchers {
    matcher = "CONTAINS_STRING"
    content = "firezone"
  }
}
# Alert policy that fires when the "website-https" uptime check is failing.
# Notifications go to the channels exported by the ops module.
resource "google_monitoring_alert_policy" "website-downtime" {
  project      = module.google-cloud-project.project.project_id
  display_name = "Website is DOWN!"
  combiner     = "OR"

  notification_channels = module.ops.notification_channels

  conditions {
    display_name = "Uptime Health Check on website-https"

    condition_threshold {
      # Select the check_passed series of this specific uptime check. The check
      # id comes from the provider-exported `uptime_check_id` attribute rather
      # than being parsed out of the resource id by hand.
      filter = "resource.type = \"uptime_url\" AND metric.type = \"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id = \"${google_monitoring_uptime_check_config.website-https.uptime_check_id}\""

      # Condition is met when the reduced value (a count of "false" values,
      # per REDUCE_COUNT_FALSE below) is greater than 1.
      comparison      = "COMPARISON_GT"
      threshold_value = 1
      duration        = "0s"

      trigger {
        count = 1
      }

      aggregations {
        alignment_period     = "60s"
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        cross_series_reducer = "REDUCE_COUNT_FALSE"

        group_by_fields = [
          "resource.label.project_id",
          "resource.label.host"
        ]
      }
    }
  }

  alert_strategy {
    # 28800s = 8 hours.
    auto_close = "28800s"
  }
}

View File

@@ -13,6 +13,26 @@ resource "google_monitoring_notification_channel" "slack" {
}
}
# Optional PagerDuty notification channel. It is instantiated only when
# var.pagerduty_auth_token is non-null, so references must index it ([0]) or
# use a splat.
resource "google_monitoring_notification_channel" "pagerduty" {
  count = var.pagerduty_auth_token == null ? 0 : 1

  project      = var.project_id
  display_name = "PagerDuty"
  type         = "pagerduty"

  # The token is passed as the channel's service key.
  sensitive_labels {
    service_key = var.pagerduty_auth_token
  }
}
locals {
  # Channels every alert policy notifies: Slack always, plus PagerDuty when that
  # channel exists. The splat yields an empty list when the PagerDuty resource
  # has count = 0 and a one-element list otherwise.
  notification_channels = concat(
    [google_monitoring_notification_channel.slack.name],
    google_monitoring_notification_channel.pagerduty[*].name
  )
}
resource "google_monitoring_uptime_check_config" "api-https" {
project = var.project_id
@@ -102,9 +122,7 @@ resource "google_monitoring_alert_policy" "api-downtime" {
display_name = "API service is DOWN!"
combiner = "OR"
notification_channels = [
google_monitoring_notification_channel.slack.name
]
notification_channels = local.notification_channels
conditions {
display_name = "Uptime Health Check on api-https"
@@ -144,9 +162,7 @@ resource "google_monitoring_alert_policy" "web-downtime" {
display_name = "Portal service is DOWN!"
combiner = "OR"
notification_channels = [
google_monitoring_notification_channel.slack.name
]
notification_channels = local.notification_channels
conditions {
display_name = "Uptime Health Check on web-https"
@@ -186,9 +202,7 @@ resource "google_monitoring_alert_policy" "instances_high_cpu_policy" {
display_name = "High Instance CPU utilization"
combiner = "OR"
notification_channels = [
google_monitoring_notification_channel.slack.name
]
notification_channels = local.notification_channels
conditions {
display_name = "VM Instance - CPU utilization"
@@ -223,9 +237,7 @@ resource "google_monitoring_alert_policy" "sql_high_cpu_policy" {
display_name = "High Cloud SQL CPU utilization"
combiner = "OR"
notification_channels = [
google_monitoring_notification_channel.slack.name
]
notification_channels = local.notification_channels
conditions {
display_name = "Cloud SQL Database - CPU utilization"
@@ -260,9 +272,7 @@ resource "google_monitoring_alert_policy" "sql_disk_utiliziation_policy" {
display_name = "High Cloud SQL Disk utilization"
combiner = "OR"
notification_channels = [
google_monitoring_notification_channel.slack.name
]
notification_channels = local.notification_channels
conditions {
display_name = "Cloud SQL Database - Disk utilization"
@@ -290,3 +300,72 @@ resource "google_monitoring_alert_policy" "sql_disk_utiliziation_policy" {
auto_close = "28800s"
}
}
# Log-based alert policy: notifies when a log entry matches the filter below.
# NOTE(review): the filter matches every log entry of severity ERROR or higher
# from gce_instance resources, which is broader than the "GenServer Crashes"
# display name suggests — confirm this is intended.
resource "google_monitoring_alert_policy" "genservers_crash_policy" {
  project      = var.project_id
  display_name = "GenServer Crashes"
  combiner     = "OR"

  notification_channels = local.notification_channels

  conditions {
    display_name = "Log match condition"

    condition_matched_log {
      filter = <<-EOT
        resource.type="gce_instance"
        severity>=ERROR
      EOT
    }
  }

  alert_strategy {
    # 28800s = 8 hours.
    auto_close = "28800s"

    # Rate-limit notifications to one period of 3600s (1 hour).
    notification_rate_limit {
      period = "3600s"
    }
  }
}
# Alert policy on the time_until_ssl_cert_expires uptime-check metric. The
# filter is not restricted to one check id, so it covers every uptime_url
# series in the project.
resource "google_monitoring_alert_policy" "ssl_certs_expiring_policy" {
  project      = var.project_id
  display_name = "SSL certificate expiring soon"
  combiner     = "OR"

  notification_channels = local.notification_channels

  user_labels = {
    version = "1"
    uptime  = "ssl_cert_expiration"
  }

  conditions {
    display_name = "SSL certificate expiring soon"

    condition_threshold {
      filter = "metric.type=\"monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires\" AND resource.type=\"uptime_url\""

      # Condition is met when the value stays below 15 for 600s.
      # NOTE(review): the metric is presumably expressed in days — confirm.
      comparison      = "COMPARISON_LT"
      threshold_value = 15
      duration        = "600s"

      trigger {
        count = 1
      }

      aggregations {
        alignment_period     = "1200s"
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        cross_series_reducer = "REDUCE_MEAN"
        group_by_fields      = ["resource.label.*"]
      }
    }
  }

  alert_strategy {
    # 28800s = 8 hours.
    auto_close = "28800s"
  }
}

View File

@@ -0,0 +1,3 @@
# Exposes local.notification_channels so callers of this module can attach the
# same channels to their own alert policies.
output "notification_channels" {
  value = local.notification_channels
}

View File

@@ -12,6 +12,12 @@ variable "slack_alerts_auth_token" {
description = "Slack auth token for the infra alerts channel"
}
# Optional: defaults to null when the caller does not set it.
variable "pagerduty_auth_token" {
  description = "Pagerduty auth token for the infra alerts channel"
  type        = string
  default     = null
}
# Host name of the API service (string, required: no default is declared).
# NOTE(review): presumably the target of the api-https uptime check — confirm.
variable "api_host" {
  type = string
}