diff --git a/CHANGES.md b/CHANGES.md index b34abfb4..b3530ab5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -8,6 +8,9 @@ Notable changes between versions. * Add `enable_http_load_balancing` variable to reduce load balancer rules count * Azure charges by load balancer rules (5 included) +* Change Azure VMSS instance update policy (i.e. upgrade policy) from Manual to Rolling + * Set a rolling upgrade policy so that changes to the worker node pool are rolled out gradually. Previously, the VMSS model could change, but instances would not receive the new model until manually replaced. +* Define Azure automatic instance repair using Application Health Extension probes to port 10256 (kube-proxy or Cilium equivalent) to match the strategy used on Google Cloud ### Google Cloud diff --git a/azure/fedora-coreos/kubernetes/lb.tf b/azure/fedora-coreos/kubernetes/lb.tf index 2fd04ca1..7befbf5e 100644 --- a/azure/fedora-coreos/kubernetes/lb.tf +++ b/azure/fedora-coreos/kubernetes/lb.tf @@ -89,7 +89,7 @@ resource "azurerm_lb_rule" "apiserver-ipv6" { } resource "azurerm_lb_rule" "ingress-http-ipv4" { -count = var.enable_http_load_balancing ? 1 : 0 + count = var.enable_http_load_balancing ? 
1 : 0 name = "ingress-http-ipv4" loadbalancer_id = azurerm_lb.cluster.id diff --git a/azure/fedora-coreos/kubernetes/variables.tf b/azure/fedora-coreos/kubernetes/variables.tf index 6ba1bc54..8afc9059 100644 --- a/azure/fedora-coreos/kubernetes/variables.tf +++ b/azure/fedora-coreos/kubernetes/variables.tf @@ -146,7 +146,7 @@ EOD variable "enable_http_load_balancing" { description = "Enable HTTP (port 80) LB rules" - default = false + default = false } variable "enable_ipv6_load_balancing" { diff --git a/azure/fedora-coreos/kubernetes/workers/workers.tf b/azure/fedora-coreos/kubernetes/workers/workers.tf index 86f4a8bf..d9cb7301 100644 --- a/azure/fedora-coreos/kubernetes/workers/workers.tf +++ b/azure/fedora-coreos/kubernetes/workers/workers.tf @@ -69,6 +69,42 @@ resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" { } } + # Roll out VMSS changes to instances gradually + upgrade_mode = "Rolling" + rolling_upgrade_policy { + max_batch_instance_percent = 20 + pause_time_between_batches = "PT2M" + + maximum_surge_instances_enabled = true + # Upgrade unhealthy instances first + prioritize_unhealthy_instances_enabled = true + + # Safety gate to stop bad rollouts + max_unhealthy_instance_percent = 20 + max_unhealthy_upgraded_instance_percent = 25 + } + + # Azure instance repair replaces instances that fail probes from the + # ApplicationHealthExtension + automatic_instance_repair { + enabled = true + grace_period = "PT15M" + action = "Replace" + } + + extension { + name = "ApplicationHealthExtension" + publisher = "Microsoft.ManagedServices" + type = "ApplicationHealthLinux" + type_handler_version = "1.0" + settings = jsonencode({ + protocol = "http" + port = 10256 + requestPath = "/healthz" + }) + } + + # lifecycle # eviction policy may only be set when priority is Spot priority = var.priority diff --git a/azure/flatcar-linux/kubernetes/lb.tf b/azure/flatcar-linux/kubernetes/lb.tf index 2fd04ca1..7befbf5e 100644 --- a/azure/flatcar-linux/kubernetes/lb.tf 
+++ b/azure/flatcar-linux/kubernetes/lb.tf @@ -89,7 +89,7 @@ resource "azurerm_lb_rule" "apiserver-ipv6" { } resource "azurerm_lb_rule" "ingress-http-ipv4" { -count = var.enable_http_load_balancing ? 1 : 0 + count = var.enable_http_load_balancing ? 1 : 0 name = "ingress-http-ipv4" loadbalancer_id = azurerm_lb.cluster.id diff --git a/azure/flatcar-linux/kubernetes/variables.tf b/azure/flatcar-linux/kubernetes/variables.tf index 53c454db..21f83a62 100644 --- a/azure/flatcar-linux/kubernetes/variables.tf +++ b/azure/flatcar-linux/kubernetes/variables.tf @@ -152,7 +152,7 @@ EOD variable "enable_http_load_balancing" { description = "Enable HTTP (port 80) LB rules" - default = false + default = false } variable "enable_ipv6_load_balancing" { diff --git a/azure/flatcar-linux/kubernetes/workers/workers.tf b/azure/flatcar-linux/kubernetes/workers/workers.tf index 6fe406d4..41027541 100644 --- a/azure/flatcar-linux/kubernetes/workers/workers.tf +++ b/azure/flatcar-linux/kubernetes/workers/workers.tf @@ -90,6 +90,42 @@ resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" { } } + # Roll out VMSS changes to instances gradually + upgrade_mode = "Rolling" + rolling_upgrade_policy { + max_batch_instance_percent = 20 + pause_time_between_batches = "PT2M" + + maximum_surge_instances_enabled = true + # Upgrade unhealthy instances first + prioritize_unhealthy_instances_enabled = true + + # Safety gate to stop bad rollouts + max_unhealthy_instance_percent = 20 + max_unhealthy_upgraded_instance_percent = 25 + } + + # Azure instance repair replaces instances that fail probes from the + # ApplicationHealthExtension + automatic_instance_repair { + enabled = true + grace_period = "PT15M" + action = "Replace" + } + + extension { + name = "ApplicationHealthExtension" + publisher = "Microsoft.ManagedServices" + type = "ApplicationHealthLinux" + type_handler_version = "1.0" + settings = jsonencode({ + protocol = "http" + port = 10256 + requestPath = "/healthz" + }) + } + + # 
lifecycle # eviction policy may only be set when priority is Spot priority = var.priority