mirror of
https://github.com/outbackdingo/firezone.git
synced 2026-01-27 18:18:55 +00:00
Report incidents to PagerDuty and monitor website/errors in logs
This commit is contained in:
@@ -814,6 +814,8 @@ module "ops" {
|
||||
slack_alerts_auth_token = var.slack_alerts_auth_token
|
||||
slack_alerts_channel = var.slack_alerts_channel
|
||||
|
||||
pagerduty_auth_token = var.pagerduty_auth_token
|
||||
|
||||
api_host = module.api.host
|
||||
web_host = module.web.host
|
||||
}
|
||||
|
||||
@@ -32,3 +32,7 @@ variable "slack_alerts_auth_token" {
|
||||
variable "postmark_server_api_token" {
|
||||
type = string
|
||||
}
|
||||
|
||||
# PagerDuty credential for this environment. It is forwarded to the "ops"
# module through that module's input of the same name.
variable "pagerduty_auth_token" {
  type        = string
  description = "PagerDuty auth token for the infra alerts channel"
}
|
||||
|
||||
77
terraform/environments/production/website.tf
Normal file
77
terraform/environments/production/website.tf
Normal file
@@ -0,0 +1,77 @@
|
||||
|
||||
# Uptime check that requests the website's front page over HTTPS.
resource "google_monitoring_uptime_check_config" "website-https" {
  project      = module.google-cloud-project.project.project_id
  display_name = "website-https"

  checker_type = "STATIC_IP_CHECKERS"
  timeout      = "60s"

  # Target of the probe: the host named by local.tld, reported under this
  # project as an "uptime_url" monitored resource.
  monitored_resource {
    type = "uptime_url"

    labels = {
      host       = local.tld
      project_id = module.google-cloud-project.project.project_id
    }
  }

  # Request shape: GET / on port 443 with TLS enabled and the certificate
  # validated; only 2xx responses are accepted.
  http_check {
    request_method = "GET"
    path           = "/"
    port           = "443"

    use_ssl      = true
    validate_ssl = true

    accepted_response_status_codes {
      status_class = "STATUS_CLASS_2XX"
    }
  }

  # The response body must additionally contain this string.
  content_matchers {
    matcher = "CONTAINS_STRING"
    content = "firezone"
  }
}
|
||||
|
||||
# Alert policy tied to the "website-https" uptime check. Notifications go to
# the channels exported by the "ops" module.
resource "google_monitoring_alert_policy" "website-downtime" {
  project = module.google-cloud-project.project.project_id

  display_name = "Website is DOWN!"
  combiner     = "OR"

  notification_channels = module.ops.notification_channels

  conditions {
    display_name = "Uptime Health Check on website-https"

    condition_threshold {
      # Select the check_passed series of this specific uptime check.
      # `uptime_check_id` is the provider-exported short id (the last path
      # segment of the resource id), so the id does not need to be split by hand.
      filter = "resource.type = \"uptime_url\" AND metric.type = \"monitoring.googleapis.com/uptime_check/check_passed\" AND metric.labels.check_id = \"${google_monitoring_uptime_check_config.website-https.uptime_check_id}\""

      comparison = "COMPARISON_GT"

      # Fires as soon as the per-window count of failed checks exceeds 1.
      threshold_value = 1
      duration        = "0s"

      trigger {
        count = 1
      }

      # Within each 60s window, count the series whose latest check_passed
      # value is false, grouped by project and host.
      aggregations {
        alignment_period     = "60s"
        cross_series_reducer = "REDUCE_COUNT_FALSE"
        per_series_aligner   = "ALIGN_NEXT_OLDER"

        group_by_fields = [
          "resource.label.project_id",
          "resource.label.host"
        ]
      }
    }
  }

  alert_strategy {
    # 28800s = 8 hours.
    auto_close = "28800s"
  }
}
|
||||
@@ -13,6 +13,26 @@ resource "google_monitoring_notification_channel" "slack" {
|
||||
}
|
||||
}
|
||||
|
||||
# Optional PagerDuty notification channel.
resource "google_monitoring_notification_channel" "pagerduty" {
  # No instance when the token is unset (null); exactly one otherwise.
  count = var.pagerduty_auth_token == null ? 0 : 1

  project      = var.project_id
  type         = "pagerduty"
  display_name = "PagerDuty"

  # The token is handed to the channel as its PagerDuty service key.
  sensitive_labels {
    service_key = var.pagerduty_auth_token
  }
}
|
||||
|
||||
locals {
  # Notification channel names shared by the alert policies: the Slack
  # channel always, followed by the PagerDuty channel when it exists. The
  # splat yields an empty list when the PagerDuty resource has count = 0.
  notification_channels = concat(
    [google_monitoring_notification_channel.slack.name],
    google_monitoring_notification_channel.pagerduty[*].name
  )
}
|
||||
|
||||
resource "google_monitoring_uptime_check_config" "api-https" {
|
||||
project = var.project_id
|
||||
|
||||
@@ -102,9 +122,7 @@ resource "google_monitoring_alert_policy" "api-downtime" {
|
||||
display_name = "API service is DOWN!"
|
||||
combiner = "OR"
|
||||
|
||||
notification_channels = [
|
||||
google_monitoring_notification_channel.slack.name
|
||||
]
|
||||
notification_channels = local.notification_channels
|
||||
|
||||
conditions {
|
||||
display_name = "Uptime Health Check on api-https"
|
||||
@@ -144,9 +162,7 @@ resource "google_monitoring_alert_policy" "web-downtime" {
|
||||
display_name = "Portal service is DOWN!"
|
||||
combiner = "OR"
|
||||
|
||||
notification_channels = [
|
||||
google_monitoring_notification_channel.slack.name
|
||||
]
|
||||
notification_channels = local.notification_channels
|
||||
|
||||
conditions {
|
||||
display_name = "Uptime Health Check on web-https"
|
||||
@@ -186,9 +202,7 @@ resource "google_monitoring_alert_policy" "instances_high_cpu_policy" {
|
||||
display_name = "High Instance CPU utilization"
|
||||
combiner = "OR"
|
||||
|
||||
notification_channels = [
|
||||
google_monitoring_notification_channel.slack.name
|
||||
]
|
||||
notification_channels = local.notification_channels
|
||||
|
||||
conditions {
|
||||
display_name = "VM Instance - CPU utilization"
|
||||
@@ -223,9 +237,7 @@ resource "google_monitoring_alert_policy" "sql_high_cpu_policy" {
|
||||
display_name = "High Cloud SQL CPU utilization"
|
||||
combiner = "OR"
|
||||
|
||||
notification_channels = [
|
||||
google_monitoring_notification_channel.slack.name
|
||||
]
|
||||
notification_channels = local.notification_channels
|
||||
|
||||
conditions {
|
||||
display_name = "Cloud SQL Database - CPU utilization"
|
||||
@@ -260,9 +272,7 @@ resource "google_monitoring_alert_policy" "sql_disk_utiliziation_policy" {
|
||||
display_name = "High Cloud SQL Disk utilization"
|
||||
combiner = "OR"
|
||||
|
||||
notification_channels = [
|
||||
google_monitoring_notification_channel.slack.name
|
||||
]
|
||||
notification_channels = local.notification_channels
|
||||
|
||||
conditions {
|
||||
display_name = "Cloud SQL Database - Disk utilization"
|
||||
@@ -290,3 +300,72 @@ resource "google_monitoring_alert_policy" "sql_disk_utiliziation_policy" {
|
||||
auto_close = "28800s"
|
||||
}
|
||||
}
|
||||
|
||||
# Log-based alert policy: matches log entries from GCE instances whose
# severity is ERROR or higher.
resource "google_monitoring_alert_policy" "genservers_crash_policy" {
  project      = var.project_id
  display_name = "GenServer Crashes"
  combiner     = "OR"

  notification_channels = local.notification_channels

  alert_strategy {
    # 28800s = 8 hours.
    auto_close = "28800s"

    # Rate-limit notifications to one period of 3600s (1 hour).
    notification_rate_limit {
      period = "3600s"
    }
  }

  conditions {
    display_name = "Log match condition"

    condition_matched_log {
      # NOTE(review): the filter matches every ERROR-or-above entry on any
      # gce_instance, which is broader than the "GenServer Crashes" display
      # name suggests. Confirm whether that breadth is intended.
      filter = <<-EOT
      resource.type="gce_instance"
      severity>=ERROR
      EOT
    }
  }
}
|
||||
|
||||
# Alert policy on the uptime checks' time_until_ssl_cert_expires metric.
resource "google_monitoring_alert_policy" "ssl_certs_expiring_policy" {
  project      = var.project_id
  display_name = "SSL certificate expiring soon"
  combiner     = "OR"

  notification_channels = local.notification_channels

  user_labels = {
    uptime  = "ssl_cert_expiration"
    version = "1"
  }

  alert_strategy {
    # 28800s = 8 hours.
    auto_close = "28800s"
  }

  conditions {
    display_name = "SSL certificate expiring soon"

    condition_threshold {
      # Applies to every uptime_url resource; there is no per-check filter.
      filter = "metric.type=\"monitoring.googleapis.com/uptime_check/time_until_ssl_cert_expires\" AND resource.type=\"uptime_url\""

      # Triggers when the aggregated value stays below 15 for 600s.
      # NOTE(review): the metric is presumably expressed in days. Confirm
      # against the Cloud Monitoring metric list.
      comparison      = "COMPARISON_LT"
      threshold_value = 15
      duration        = "600s"

      trigger {
        count = 1
      }

      # Mean over 1200s windows, grouped by all resource labels.
      aggregations {
        per_series_aligner   = "ALIGN_NEXT_OLDER"
        alignment_period     = "1200s"
        cross_series_reducer = "REDUCE_MEAN"
        group_by_fields      = ["resource.label.*"]
      }
    }
  }
}
|
||||
|
||||
3
terraform/modules/google-cloud-ops/outputs.tf
Normal file
3
terraform/modules/google-cloud-ops/outputs.tf
Normal file
@@ -0,0 +1,3 @@
|
||||
# Exports the module's shared channel list so that alert policies defined
# outside the module can notify the same channels.
output "notification_channels" {
  description = "Names of the notification channels (Slack, plus PagerDuty when configured) used by this module's alert policies"
  value       = local.notification_channels
}
|
||||
@@ -12,6 +12,12 @@ variable "slack_alerts_auth_token" {
|
||||
description = "Slack auth token for the infra alerts channel"
|
||||
}
|
||||
|
||||
# Optional: when left at null, the PagerDuty notification channel is not
# created and alerts go to Slack only.
variable "pagerduty_auth_token" {
  type        = string
  description = "PagerDuty service (integration) key for the PagerDuty notification channel; leave null to skip creating the channel"
  default     = null
}
|
||||
|
||||
variable "api_host" {
|
||||
type = string
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user