Set Azure VMSS upgrade policy to Rolling

* Set a rolling upgrade policy so that changes to the worker node
pool are rolled out gradually. Previously, the VMSS model could
change, but instances would not receive it until manually replaced.
* Align Azure node pool behaviors more closely with AWS and GCP:
  * On AWS, worker instance template changes trigger an instance refresh
  * On GCP, worker instance template changes roll out via a proactive update policy
* Define Azure automatic instance repair using Application Health
Extension probes to port 10256 (kube-proxy or Cilium equivalent) to match
the strategy used on Google Cloud
This commit is contained in:
Dalton Hubble
2025-06-19 16:17:38 -07:00
parent b4399d1267
commit 528ed63a7e
7 changed files with 79 additions and 4 deletions

View File

@@ -8,6 +8,9 @@ Notable changes between versions.
* Add `enable_http_load_balancing` variable to reduce load balancer rules count
* Azure charges by load balancer rules (5 included)
* Change Azure VMSS instance update policy (i.e. upgrade policy) from Manual to Rolling
* Set a rolling upgrade policy so that changes to the worker node pool are rolled out gradually. Previously, the VMSS model could change, but instances would not receive it until manually replaced.
* Define Azure automatic instance repair using Application Health Extension probes to port 10256 (kube-proxy or Cilium equivalent) to match the strategy used on Google Cloud
### Google Cloud

View File

@@ -89,7 +89,7 @@ resource "azurerm_lb_rule" "apiserver-ipv6" {
}
resource "azurerm_lb_rule" "ingress-http-ipv4" {
count = var.enable_http_load_balancing ? 1 : 0
count = var.enable_http_load_balancing ? 1 : 0
name = "ingress-http-ipv4"
loadbalancer_id = azurerm_lb.cluster.id

View File

@@ -146,7 +146,7 @@ EOD
variable "enable_http_load_balancing" {
description = "Enable HTTP (port 80) LB rules"
default = false
default = false
}
variable "enable_ipv6_load_balancing" {

View File

@@ -69,6 +69,42 @@ resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" {
}
}
# Roll out VMSS changes to instances gradually
upgrade_mode = "Rolling"
rolling_upgrade_policy {
max_batch_instance_percent = 20
pause_time_between_batches = "PT2M"
maximum_surge_instances_enabled = true
# Upgrade unhealthy instances first
prioritize_unhealthy_instances_enabled = true
# Safety gate to stop bad rollouts
max_unhealthy_instance_percent = 20
max_unhealthy_upgraded_instance_percent = 25
}
# Azure instance repair replaces instances that fail probes from the
# ApplicationHealthExtension
automatic_instance_repair {
enabled = true
grace_period = "PT15M"
action = "Replace"
}
extension {
name = "ApplicationHealthExtension"
publisher = "Microsoft.ManagedServices"
type = "ApplicationHealthLinux"
type_handler_version = "1.0"
settings = jsonencode({
protocol = "http"
port = 10256
requestPath = "/healthz"
})
}
# lifecycle
# eviction policy may only be set when priority is Spot
priority = var.priority

View File

@@ -89,7 +89,7 @@ resource "azurerm_lb_rule" "apiserver-ipv6" {
}
resource "azurerm_lb_rule" "ingress-http-ipv4" {
count = var.enable_http_load_balancing ? 1 : 0
count = var.enable_http_load_balancing ? 1 : 0
name = "ingress-http-ipv4"
loadbalancer_id = azurerm_lb.cluster.id

View File

@@ -152,7 +152,7 @@ EOD
variable "enable_http_load_balancing" {
description = "Enable HTTP (port 80) LB rules"
default = false
default = false
}
variable "enable_ipv6_load_balancing" {

View File

@@ -90,6 +90,42 @@ resource "azurerm_orchestrated_virtual_machine_scale_set" "workers" {
}
}
# Roll out VMSS changes to instances gradually
upgrade_mode = "Rolling"
rolling_upgrade_policy {
max_batch_instance_percent = 20
pause_time_between_batches = "PT2M"
maximum_surge_instances_enabled = true
# Upgrade unhealthy instances first
prioritize_unhealthy_instances_enabled = true
# Safety gate to stop bad rollouts
max_unhealthy_instance_percent = 20
max_unhealthy_upgraded_instance_percent = 25
}
# Azure instance repair replaces instances that fail probes from the
# ApplicationHealthExtension
automatic_instance_repair {
enabled = true
grace_period = "PT15M"
action = "Replace"
}
extension {
name = "ApplicationHealthExtension"
publisher = "Microsoft.ManagedServices"
type = "ApplicationHealthLinux"
type_handler_version = "1.0"
settings = jsonencode({
protocol = "http"
port = 10256
requestPath = "/healthz"
})
}
# lifecycle
# eviction policy may only be set when priority is Spot
priority = var.priority