Set Azure VMSS upgrade policy to Rolling

* Set a rolling upgrade policy so that changes to the worker node pool are rolled out gradually. Previously, the VMSS model could change, but instances would not receive it until manually replaced.
* Align Azure node pool behaviors more closely with AWS and GCP:
  * On AWS, worker instance template changes trigger an instance refresh
  * On GCP, worker instance template changes roll out via a proactive update policy
* Define Azure automatic instance repair using Application Health Extension probes to port 10256 (kube-proxy or Cilium equivalent) to match the strategy used on Google Cloud
2026-01-27 10:20:48 +00:00 · 2025-06-19 16:17:38 -07:00
parent b4399d1267
commit 528ed63a7e
7 changed files with 79 additions and 4 deletions
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -8,6 +8,9 @@ Notable changes between versions.

 * Add `enable_http_load_balancing` variable to reduce load balancer rules count
  * Azure charges by load balancer rules (5 included)
+* Change Azure VMSS instance update policy (i.e. upgrade policy) from Manual to Rolling
+  * Set a rolling upgrade policy so that changes to the worker node pool are rolled out gradually. Previously, the VMSS model could change, but instances would not receive it until manually replaced.
+* Define Azure automatic instance repair using Application Health Extension probes to port 10256 (the kube-proxy or Cilium equivalent health endpoint) to match the strategy used on Google Cloud

 ### Google Cloud

--- a/azure/fedora-coreos/kubernetes/lb.tf
+++ b/azure/fedora-coreos/kubernetes/lb.tf
@@ -89,7 +89,7 @@ resource "azurerm_lb_rule" "apiserver-ipv6" {
 }

 resource "azurerm_lb_rule" "ingress-http-ipv4" {
-count = var.enable_http_load_balancing ? 1 : 0
+  count = var.enable_http_load_balancing ? 1 : 0

  name                           = "ingress-http-ipv4"
  loadbalancer_id                = azurerm_lb.cluster.id
--- a/azure/fedora-coreos/kubernetes/variables.tf
+++ b/azure/fedora-coreos/kubernetes/variables.tf
@@ -146,7 +146,7 @@ EOD

 variable "enable_http_load_balancing" {
  description = "Enable HTTP (port 80) LB rules"
-  default = false
+  default     = false
 }

 variable "enable_ipv6_load_balancing" {
--- a/azure/fedora-coreos/kubernetes/workers/workers.tf
+++ b/azure/fedora-coreos/kubernetes/workers/workers.tf
@@ -69,6 +69,42 @@ resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" {
    }
  }

+  # Roll out VMSS changes to instances gradually
+  upgrade_mode = "Rolling"
+  rolling_upgrade_policy {
+    max_batch_instance_percent = 20
+    pause_time_between_batches = "PT2M"
+
+    maximum_surge_instances_enabled = true
+    # Upgrade unhealthy instances first
+    prioritize_unhealthy_instances_enabled = true
+
+    # Safety gate to stop bad rollouts
+    max_unhealthy_instance_percent          = 20
+    max_unhealthy_upgraded_instance_percent = 25
+  }
+
+  # Azure instance repair replaces instances that fail probes from the
+  # ApplicationHealthExtension
+  automatic_instance_repair {
+    enabled      = true
+    grace_period = "PT15M"
+    action       = "Replace"
+  }
+
+  extension {
+    name                 = "ApplicationHealthExtension"
+    publisher            = "Microsoft.ManagedServices"
+    type                 = "ApplicationHealthLinux"
+    type_handler_version = "1.0"
+    settings = jsonencode({
+      protocol    = "http"
+      port        = 10256
+      requestPath = "/healthz"
+    })
+  }
+
+
  # lifecycle
  # eviction policy may only be set when priority is Spot
  priority        = var.priority
--- a/azure/flatcar-linux/kubernetes/lb.tf
+++ b/azure/flatcar-linux/kubernetes/lb.tf
@@ -89,7 +89,7 @@ resource "azurerm_lb_rule" "apiserver-ipv6" {
 }

 resource "azurerm_lb_rule" "ingress-http-ipv4" {
-count = var.enable_http_load_balancing ? 1 : 0
+  count = var.enable_http_load_balancing ? 1 : 0

  name                           = "ingress-http-ipv4"
  loadbalancer_id                = azurerm_lb.cluster.id
--- a/azure/flatcar-linux/kubernetes/variables.tf
+++ b/azure/flatcar-linux/kubernetes/variables.tf
@@ -152,7 +152,7 @@ EOD

 variable "enable_http_load_balancing" {
  description = "Enable HTTP (port 80) LB rules"
-  default = false
+  default     = false
 }

 variable "enable_ipv6_load_balancing" {
--- a/azure/flatcar-linux/kubernetes/workers/workers.tf
+++ b/azure/flatcar-linux/kubernetes/workers/workers.tf
@@ -90,6 +90,42 @@ resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" {
    }
  }

+  # Roll out VMSS changes to instances gradually
+  upgrade_mode = "Rolling"
+  rolling_upgrade_policy {
+    max_batch_instance_percent = 20
+    pause_time_between_batches = "PT2M"
+
+    maximum_surge_instances_enabled = true
+    # Upgrade unhealthy instances first
+    prioritize_unhealthy_instances_enabled = true
+
+    # Safety gate to stop bad rollouts
+    max_unhealthy_instance_percent          = 20
+    max_unhealthy_upgraded_instance_percent = 25
+  }
+
+  # Azure instance repair replaces instances that fail probes from the
+  # ApplicationHealthExtension
+  automatic_instance_repair {
+    enabled      = true
+    grace_period = "PT15M"
+    action       = "Replace"
+  }
+
+  extension {
+    name                 = "ApplicationHealthExtension"
+    publisher            = "Microsoft.ManagedServices"
+    type                 = "ApplicationHealthLinux"
+    type_handler_version = "1.0"
+    settings = jsonencode({
+      protocol    = "http"
+      port        = 10256
+      requestPath = "/healthz"
+    })
+  }
+
+
  # lifecycle
  # eviction policy may only be set when priority is Spot
  priority        = var.priority