From 762f16bfea183d168a8ac17dcf1d93a69edcdeca Mon Sep 17 00:00:00 2001 From: Jamil Date: Wed, 19 Feb 2025 07:10:12 -0800 Subject: [PATCH] fix(infra): `create_before_destroy` for all Relay resources (#8198) When making any modification that taints any Relay infrastructure, some Relay components are destroyed before their replacements are created, while others are created before the old ones are destroyed. This results in failures that can lead to downtime, even if we bump subnet numbering to trigger a rollover of the `naming_suffix`. See https://app.terraform.io/app/firezone/workspaces/staging/runs To fix this, we ensure `create_before_destroy` is applied to all Relay module resources, and we ensure that the `naming_suffix` is properly used in all resources that require unique names or IDs within the project. Thus, we must remember to bump the subnet numbering whenever changing any Relay infrastructure, both (1) so that the subnet numbering doesn't collide, and (2) to trigger the `naming_suffix` change, which prevents other resource names from colliding. Unfortunately, there doesn't seem to be a better alternative here. The only other alternative I have found so far is to derive the subnet numbering dynamically, incrementing it on each deploy; that would taint all Relay resources on every deploy, which is wasteful and prone to random timeouts or failures. 
--- terraform/environments | 2 +- .../modules/google-cloud/apps/relay/main.tf | 43 ++++++++++++++++--- 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/terraform/environments b/terraform/environments index 13aa93779..2d2d7c8c4 160000 --- a/terraform/environments +++ b/terraform/environments @@ -1 +1 @@ -Subproject commit 13aa93779c3cae8e364a4a4c7fd454411f083c19 +Subproject commit 2d2d7c8c4471d02f20c03be998e6f257aebbabc5 diff --git a/terraform/modules/google-cloud/apps/relay/main.tf b/terraform/modules/google-cloud/apps/relay/main.tf index e20f135e1..14a50fca5 100644 --- a/terraform/modules/google-cloud/apps/relay/main.tf +++ b/terraform/modules/google-cloud/apps/relay/main.tf @@ -62,9 +62,13 @@ data "google_compute_image" "coreos" { resource "google_service_account" "application" { project = var.project_id - account_id = "app-${local.application_name}" + account_id = "app-${local.application_name}-${var.naming_suffix}" display_name = "${local.application_name} app" description = "Service account for ${local.application_name} application instances." 
+ + lifecycle { + create_before_destroy = true + } } ## Allow application service account to pull images from the container registry @@ -121,7 +125,6 @@ resource "google_project_iam_member" "cloudtrace" { member = "serviceAccount:${google_service_account.application.email}" } - resource "google_compute_reservation" "relay_reservation" { for_each = var.instances @@ -141,6 +144,10 @@ resource "google_compute_reservation" "relay_reservation" { machine_type = each.value.type } } + + lifecycle { + create_before_destroy = true + } } # Deploy app @@ -357,6 +364,10 @@ resource "google_compute_region_instance_group_manager" "application" { depends_on = [ google_compute_instance_template.application ] + + lifecycle { + create_before_destroy = true + } } # TODO: Rate limit requests to the relays by source IP address @@ -375,12 +386,16 @@ resource "google_compute_firewall" "stun-turn-ipv4" { protocol = "udp" ports = ["3478", "49152-65535"] } + + lifecycle { + create_before_destroy = true + } } resource "google_compute_firewall" "stun-turn-ipv6" { project = var.project_id - name = "${local.application_name}-firewall-lb-to-instances-ipv6" + name = "${local.application_name}-firewall-lb-to-instances-ipv6-${var.naming_suffix}" network = var.network source_ranges = ["::/0"] @@ -390,13 +405,17 @@ resource "google_compute_firewall" "stun-turn-ipv6" { protocol = "udp" ports = ["3478", "49152-65535"] } + + lifecycle { + create_before_destroy = true + } } ## Open metrics port for the health checks resource "google_compute_firewall" "http-health-checks" { project = var.project_id - name = "${local.application_name}-healthcheck" + name = "${local.application_name}-healthcheck-${var.naming_suffix}" network = var.network source_ranges = local.google_health_check_ip_ranges @@ -406,13 +425,17 @@ resource "google_compute_firewall" "http-health-checks" { protocol = var.health_check.protocol ports = [var.health_check.port] } + + lifecycle { + create_before_destroy = true + } } # Allow outbound 
traffic resource "google_compute_firewall" "egress-ipv4" { project = var.project_id - name = "${local.application_name}-egress-ipv4" + name = "${local.application_name}-egress-ipv4-${var.naming_suffix}" network = var.network direction = "EGRESS" @@ -422,12 +445,16 @@ resource "google_compute_firewall" "egress-ipv4" { allow { protocol = "all" } + + lifecycle { + create_before_destroy = true + } } resource "google_compute_firewall" "egress-ipv6" { project = var.project_id - name = "${local.application_name}-egress-ipv6" + name = "${local.application_name}-egress-ipv6-${var.naming_suffix}" network = var.network direction = "EGRESS" @@ -437,4 +464,8 @@ resource "google_compute_firewall" "egress-ipv6" { allow { protocol = "all" } + + lifecycle { + create_before_destroy = true + } }