From 5ed2bd8eaeb1190fba7b11cc30d0faf545d35aa2 Mon Sep 17 00:00:00 2001 From: bsctl Date: Tue, 10 Jun 2025 16:52:42 +0200 Subject: [PATCH] feat(azure): add support provider --- README.md | 2 +- examples/azure/example.tf | 134 +++++++++ modules/azure-node-pool/README.md | 138 +++++++++ modules/azure-node-pool/data.tf | 38 +++ modules/azure-node-pool/main.tf | 206 ++++++++++++++ modules/azure-node-pool/outputs.tf | 43 +++ modules/azure-node-pool/vars.tf | 203 ++++++++++++++ providers/azure/README.md | 356 ++++++++++++++++++++++++ providers/azure/backend.tf | 9 + providers/azure/main.auto.tfvars.sample | 47 ++++ providers/azure/main.tf | 86 ++++++ providers/azure/outputs.tf | 40 +++ providers/azure/vars.tf | 144 ++++++++++ providers/azure/versions.tf | 24 ++ 14 files changed, 1469 insertions(+), 1 deletion(-) create mode 100644 examples/azure/example.tf create mode 100644 modules/azure-node-pool/README.md create mode 100644 modules/azure-node-pool/data.tf create mode 100644 modules/azure-node-pool/main.tf create mode 100644 modules/azure-node-pool/outputs.tf create mode 100644 modules/azure-node-pool/vars.tf create mode 100644 providers/azure/README.md create mode 100644 providers/azure/backend.tf create mode 100644 providers/azure/main.auto.tfvars.sample create mode 100644 providers/azure/main.tf create mode 100644 providers/azure/outputs.tf create mode 100644 providers/azure/vars.tf create mode 100644 providers/azure/versions.tf diff --git a/README.md b/README.md index d738bd6..de82599 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The machines created by this project automatically will create a secure bootstra | **Proxmox** | Virtual Machines | Virtual Machines on Proxmox VE | Manual | Available | | **vSphere** | Virtual Machines | Virtual Machines on VMware vSphere/vCenter | Manual | Available | | **vCloud** | vApps | Multi-tenant Virtual Machines on VMware Cloud Director with vApp isolation | Manual | Available | -| **Azure** | Scale Sets | Azure VMs 
with automatic scaling and availability zones | Automatic | Planned | +| **Azure** | Virtual Machine Scale Sets | Azure VMs with automatic scaling and availability zones | Automatic | Available | ## Bootstrap Token Management diff --git a/examples/azure/example.tf b/examples/azure/example.tf new file mode 100644 index 0000000..a7727c3 --- /dev/null +++ b/examples/azure/example.tf @@ -0,0 +1,134 @@ +# ============================================================================= +# AZURE KAMAJI NODE POOL EXAMPLE +# ============================================================================= +# +# This example demonstrates multi-pool Azure node configuration with both +# manual and automatic scaling options: +# +# SCALING MODES: +# - enable_autoscaling = true: Azure manages scaling based on CPU metrics +# - enable_autoscaling = false: Terraform directly controls pool_size +# +# For manual control, set enable_autoscaling = false and adjust pool_size +# in your configuration files. +# +# ============================================================================= + +# Example: Azure Provider Usage +# This example shows how to use the Azure provider wrapper + +terraform { + required_providers { + kubernetes = { + source = "hashicorp/kubernetes" + version = "~> 2.35.0" + } + azurerm = { + source = "hashicorp/azurerm" + version = "~> 4.0" + } + } +} + +# Configure the Azure provider +provider "azurerm" { + subscription_id = var.azure_subscription_id + features {} +} + +# Configure the Kubernetes provider +provider "kubernetes" { + config_path = var.tenant_kubeconfig_path +} + +# Use the Azure provider module +module "azure_kamaji_node_pools" { + source = "../../providers/azure" + + # Cluster configuration + tenant_cluster_name = "my-azure-cluster" + tenant_kubeconfig_path = "~/.kube/my-cluster.kubeconfig" + yaki_url = "https://goyaki.clastix.io" + + # Node pools configuration + node_pools = [ + { + name = "default" + size = 3 + min_size = 2 + max_size = 10 + 
node_disk_size = 50 + node_disk_type = "Premium_LRS" + vm_size = "Standard_D2s_v3" + assign_public_ip = true + enable_autoscaling = false + scale_out_cpu_threshold = 75 + scale_in_cpu_threshold = 25 + enable_automatic_instance_repair = false + automatic_instance_repair_grace_period = "PT30M" + upgrade_mode = "Manual" + }, + { + name = "system" + size = 2 + min_size = 1 + max_size = 5 + node_disk_size = 100 + node_disk_type = "Premium_LRS" + vm_size = "Standard_D4s_v3" + assign_public_ip = false + enable_autoscaling = true + scale_out_cpu_threshold = 80 + scale_in_cpu_threshold = 30 + enable_automatic_instance_repair = true + automatic_instance_repair_grace_period = "PT15M" + upgrade_mode = "Automatic" + } + ] + + # Azure configuration + azure_subscription_id = var.azure_subscription_id + azure_location = var.azure_location + azure_resource_group_name = "kamaji" + azure_vnet_name = "kamaji-vnet" + azure_subnet_name = "kamaji-subnet" + vnet_subnet_address_prefix = "10.10.10.0/24" + tags = { + "ManagedBy" = "Terraform" + "Environment" = "production" + "Provider" = "Azure" + } + + # SSH configuration + ssh_user = "ubuntu" + ssh_public_key_path = "~/.ssh/id_rsa.pub" +} + +# Variables +variable "azure_subscription_id" { + description = "Azure subscription ID" + type = string +} + +variable "azure_location" { + description = "Azure region" + type = string + default = "italynorth" +} + +variable "tenant_kubeconfig_path" { + description = "Path to tenant cluster kubeconfig" + type = string + default = "~/.kube/config" +} + +# Outputs +output "cluster_info" { + description = "Cluster information" + value = module.azure_kamaji_node_pools.cluster_info +} + +output "node_pools" { + description = "Node pools details" + value = module.azure_kamaji_node_pools.node_pools +} \ No newline at end of file diff --git a/modules/azure-node-pool/README.md b/modules/azure-node-pool/README.md new file mode 100644 index 0000000..e9c89cd --- /dev/null +++ b/modules/azure-node-pool/README.md @@ 
-0,0 +1,138 @@ +# Azure Node Pool Module + +Creates Azure Virtual Machine Scale Sets for Kamaji tenant cluster worker nodes with automatic scaling capabilities. + +## Features + +- **Virtual Machine Scale Sets** with automatic scaling +- **Network Security Groups** with Kubernetes-optimized rules +- **Ubuntu 24.04 LTS** support +- **Automatic instance repair** for failed VMs +- **CPU-based autoscaling** with configurable thresholds +- **Bootstrap token integration** via cloud-init + +## Usage + +```hcl +module "azure_node_pool" { + source = "../../modules/azure-node-pool" + + # Cluster configuration + tenant_cluster_name = "charlie" + pool_name = "default" + + # Pool sizing + pool_size = 3 + pool_min_size = 1 + pool_max_size = 9 + + # Azure configuration + azure_location = "italynorth" + azure_resource_group_name = "kamaji" + azure_vnet_name = "kamaji-vnet" + azure_subnet_name = "kamaji-subnet" + + # VM configuration + vm_size = "Standard_D2s_v3" + assign_public_ip = true + node_disk_size = 30 + node_disk_type = "Premium_LRS" + + # Autoscaling + enable_autoscaling = true + scale_out_cpu_threshold = 75 + scale_in_cpu_threshold = 25 + + # Bootstrap command + runcmd = module.bootstrap_token.join_cmd + + tags = { + Environment = "production" + Project = "kamaji" + } +} +``` + +## Requirements + +| Name | Version | +|------|---------| +| terraform | >= 1.0 | +| azurerm | >= 3.0 | +| cloudinit | >= 2.0 | + +## Providers + +| Name | Version | +|------|---------| +| azurerm | >= 3.0 | +| cloudinit | >= 2.0 | + +## Resources + +- `azurerm_linux_virtual_machine_scale_set` - Main VMSS resource +- `azurerm_network_security_group` - Security group for nodes +- `azurerm_network_security_rule` - Security rules +- `azurerm_monitor_autoscale_setting` - Autoscaling configuration + +## Variables + +| Name | Description | Type | Default | +|------|-------------|------|---------| +| `tenant_cluster_name` | Name of the tenant cluster | `string` | `"charlie"` | +| `pool_name` | Name of 
the node pool | `string` | `"default"` | +| `pool_size` | The size of the node pool | `number` | `3` | +| `pool_min_size` | The minimum size of the node pool | `number` | `1` | +| `pool_max_size` | The maximum size of the node pool | `number` | `9` | +| `azure_location` | Azure region where resources are created | `string` | `"italynorth"` | +| `azure_resource_group_name` | Name of the Azure resource group | `string` | `"kamaji"` | +| `azure_vnet_name` | Name of the Azure virtual network | `string` | `"kamaji-vnet"` | +| `azure_subnet_name` | Name of the Azure subnet | `string` | `"kamaji-subnet"` | +| `vm_size` | Size of the virtual machines | `string` | `"Standard_D2s_v3"` | +| `assign_public_ip` | Whether to assign public IP addresses to VMs | `bool` | `true` | +| `node_disk_size` | Disk size for each node in GB | `number` | `30` | +| `node_disk_type` | Storage account type for each node | `string` | `"Premium_LRS"` | +| `ssh_user` | SSH user for the nodes | `string` | `"ubuntu"` | +| `ssh_public_key_path` | Path to the SSH public key | `string` | `"~/.ssh/id_rsa.pub"` | +| `enable_autoscaling` | Enable automatic scaling based on CPU metrics | `bool` | `true` | +| `scale_out_cpu_threshold` | CPU threshold percentage to trigger scale out | `number` | `75` | +| `scale_in_cpu_threshold` | CPU threshold percentage to trigger scale in | `number` | `25` | +| `runcmd` | Command to run on the node at first boot time | `string` | `"echo 'Hello, World!'"` | + +## Outputs + +| Name | Description | +|------|-------------| +| `vmss_details` | Virtual Machine Scale Set details | +| `autoscale_settings` | Autoscale settings details | +| `network_security_group` | Network Security Group details | + +## Security Groups + +The module creates a Network Security Group with the following rules: + +- **Outbound**: Allow all outbound traffic +- **SSH**: Allow inbound SSH (port 22) from anywhere +- **Cluster Internal**: Allow all traffic within the subnet + +## Scaling Behavior + +This 
module supports both manual and automatic scaling modes: + +### Manual Scaling (`enable_autoscaling = false`) +- **Direct Control**: Terraform directly manages VMSS instance count +- **pool_size Changes**: Changing `pool_size` will update the VMSS immediately on `terraform apply` +- **No Lifecycle Rules**: No `ignore_changes` applied to instances +- **Use Case**: Predictable workloads requiring manual capacity control + +### Automatic Scaling (`enable_autoscaling = true`) +- **CPU-Based**: Azure autoscaler manages instance count based on CPU metrics +- **Scale Out**: When average CPU > `scale_out_cpu_threshold` (default 75%) over a 5-minute window +- **Scale In**: When average CPU < `scale_in_cpu_threshold` (default 25%) over a 5-minute window +- **Cooldown**: 1 minute between scaling actions +- **Default Capacity**: `pool_size` sets the initial/default capacity +- **Instance Count Drift**: The VMSS sets `instances = pool_size` without `ignore_changes`, so a later `terraform apply` may reset the count chosen by the autoscaler + +## Instance Repair + +The `enable_automatic_instance_repair` (default `true`) and `automatic_instance_repair_grace_period` (default `PT30M`) variables are declared, but the VMSS resource does not define an `automatic_instance_repair` block, so they currently have no effect. \ No newline at end of file diff --git a/modules/azure-node-pool/data.tf b/modules/azure-node-pool/data.tf new file mode 100644 index 0000000..54c09ea --- /dev/null +++ b/modules/azure-node-pool/data.tf @@ -0,0 +1,38 @@ +# ============================================================================= +# DATA SOURCES +# ============================================================================= + +data "azurerm_resource_group" "tenant" { + name = var.azure_resource_group_name +} + +data "azurerm_virtual_network" "tenant_vnet" { + name = var.azure_vnet_name + resource_group_name = var.azure_resource_group_name +} + +data "azurerm_subnet" "tenant_subnet" { + name = var.azure_subnet_name + virtual_network_name = var.azure_vnet_name + resource_group_name = var.azure_resource_group_name +} + +# ============================================================================= +# CLOUD-INIT CONFIGURATION +# ============================================================================= + +data
"cloudinit_config" "node_cloud_init" { + gzip = true + base64_encode = true + + part { + filename = "cloud-config.yaml" + content_type = "text/cloud-config" + content = templatefile("${path.module}/../templates/cloud-init/userdata.yml.tpl", { + hostname = "" + runcmd = var.runcmd + ssh_user = var.ssh_user + ssh_public_key = file(pathexpand(var.ssh_public_key_path)) + }) + } +} \ No newline at end of file diff --git a/modules/azure-node-pool/main.tf b/modules/azure-node-pool/main.tf new file mode 100644 index 0000000..736dbc3 --- /dev/null +++ b/modules/azure-node-pool/main.tf @@ -0,0 +1,206 @@ +# ============================================================================= +# TERRAFORM CONFIGURATION +# ============================================================================= + +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + } + cloudinit = { + source = "hashicorp/cloudinit" + } + } +} + +# ============================================================================= +# NETWORK SECURITY GROUP CONFIGURATION +# ============================================================================= + +# Network Security Group for Kubernetes Nodes +resource "azurerm_network_security_group" "kubernetes" { + name = "${var.tenant_cluster_name}-${var.pool_name}-nsg" + location = var.azure_location + resource_group_name = var.azure_resource_group_name + + tags = merge( + { + "Name" = "${var.tenant_cluster_name}-${var.pool_name}" + }, + var.tags, + ) +} + +# Allow outgoing connectivity +resource "azurerm_network_security_rule" "allow_all_outbound" { + name = "AllowAllOutbound" + priority = 100 + direction = "Outbound" + access = "Allow" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = "*" + destination_address_prefix = "*" + resource_group_name = var.azure_resource_group_name + network_security_group_name = azurerm_network_security_group.kubernetes.name +} + +# Allow SSH access +resource 
"azurerm_network_security_rule" "allow_ssh_inbound" { + name = "AllowSSHInbound" + priority = 1000 + direction = "Inbound" + access = "Allow" + protocol = "Tcp" + source_port_range = "*" + destination_port_range = "22" + source_address_prefix = "*" + destination_address_prefix = "*" + resource_group_name = var.azure_resource_group_name + network_security_group_name = azurerm_network_security_group.kubernetes.name +} + +# Allow cluster internal communication +resource "azurerm_network_security_rule" "allow_cluster_internal" { + name = "AllowClusterInternal" + priority = 1100 + direction = "Inbound" + access = "Allow" + protocol = "*" + source_port_range = "*" + destination_port_range = "*" + source_address_prefix = var.vnet_subnet_address_prefix + destination_address_prefix = var.vnet_subnet_address_prefix + resource_group_name = var.azure_resource_group_name + network_security_group_name = azurerm_network_security_group.kubernetes.name +} + +# ============================================================================= +# VIRTUAL MACHINE SCALE SET +# ============================================================================= + +resource "azurerm_linux_virtual_machine_scale_set" "nodes" { + name = "${var.tenant_cluster_name}-${var.pool_name}-${var.enable_autoscaling ? 
"auto" : "manual"}-vmss" + resource_group_name = var.azure_resource_group_name + location = var.azure_location + sku = var.vm_size + instances = var.pool_size + + admin_username = var.ssh_user + disable_password_authentication = true + + admin_ssh_key { + username = var.ssh_user + public_key = file(pathexpand(var.ssh_public_key_path)) + } + + source_image_reference { + publisher = var.vm_image_publisher + offer = var.vm_image_offer + sku = var.vm_image_sku + version = var.vm_image_version + } + + os_disk { + storage_account_type = var.node_disk_type + caching = "ReadWrite" + disk_size_gb = var.node_disk_size + } + + network_interface { + name = "primary" + primary = true + + ip_configuration { + name = "primary" + primary = true + subnet_id = data.azurerm_subnet.tenant_subnet.id + + dynamic "public_ip_address" { + for_each = var.assign_public_ip ? [1] : [] + content { + name = "primary" + } + } + } + + network_security_group_id = azurerm_network_security_group.kubernetes.id + } + + custom_data = data.cloudinit_config.node_cloud_init.rendered + + # Configure upgrade policy + upgrade_mode = var.upgrade_mode + + tags = merge( + { + "Name" = "${var.tenant_cluster_name}-${var.pool_name}" + }, + var.tags, + ) +} + +# ============================================================================= +# AUTO SCALING CONFIGURATION +# ============================================================================= + +resource "azurerm_monitor_autoscale_setting" "nodes" { + count = var.enable_autoscaling ? 
1 : 0 + name = "${var.tenant_cluster_name}-${var.pool_name}-autoscale" + resource_group_name = var.azure_resource_group_name + location = var.azure_location + target_resource_id = azurerm_linux_virtual_machine_scale_set.nodes.id + + profile { + name = "AutoScale" + + capacity { + default = var.pool_size + minimum = var.pool_min_size + maximum = var.pool_max_size + } + + rule { + metric_trigger { + metric_name = "Percentage CPU" + metric_resource_id = azurerm_linux_virtual_machine_scale_set.nodes.id + time_grain = "PT1M" + statistic = "Average" + time_window = "PT5M" + time_aggregation = "Average" + operator = "GreaterThan" + threshold = var.scale_out_cpu_threshold + } + + scale_action { + direction = "Increase" + type = "ChangeCount" + value = "1" + cooldown = "PT1M" + } + } + + rule { + metric_trigger { + metric_name = "Percentage CPU" + metric_resource_id = azurerm_linux_virtual_machine_scale_set.nodes.id + time_grain = "PT1M" + statistic = "Average" + time_window = "PT5M" + time_aggregation = "Average" + operator = "LessThan" + threshold = var.scale_in_cpu_threshold + } + + scale_action { + direction = "Decrease" + type = "ChangeCount" + value = "1" + cooldown = "PT1M" + } + } + } + + tags = var.tags +} \ No newline at end of file diff --git a/modules/azure-node-pool/outputs.tf b/modules/azure-node-pool/outputs.tf new file mode 100644 index 0000000..eb03ecf --- /dev/null +++ b/modules/azure-node-pool/outputs.tf @@ -0,0 +1,43 @@ +# ============================================================================= +# VIRTUAL MACHINE SCALE SET +# ============================================================================= + +output "vmss_details" { + description = "Virtual Machine Scale Set details" + value = { + name = azurerm_linux_virtual_machine_scale_set.nodes.name + id = azurerm_linux_virtual_machine_scale_set.nodes.id + resource_group = azurerm_linux_virtual_machine_scale_set.nodes.resource_group_name + location = 
azurerm_linux_virtual_machine_scale_set.nodes.location + instances = azurerm_linux_virtual_machine_scale_set.nodes.instances + vm_size = azurerm_linux_virtual_machine_scale_set.nodes.sku + } +} + +# ============================================================================= +# AUTOSCALING SETTINGS +# ============================================================================= + +output "autoscale_settings" { + description = "Autoscale settings details" + value = var.enable_autoscaling ? { + name = azurerm_monitor_autoscale_setting.nodes[0].name + id = azurerm_monitor_autoscale_setting.nodes[0].id + min_size = var.pool_min_size + max_size = var.pool_max_size + target_resource_id = azurerm_monitor_autoscale_setting.nodes[0].target_resource_id + } : null +} + +# ============================================================================= +# NETWORK SECURITY GROUP +# ============================================================================= + +output "network_security_group" { + description = "Network Security Group details" + value = { + name = azurerm_network_security_group.kubernetes.name + id = azurerm_network_security_group.kubernetes.id + resource_group_name = azurerm_network_security_group.kubernetes.resource_group_name + } +} \ No newline at end of file diff --git a/modules/azure-node-pool/vars.tf b/modules/azure-node-pool/vars.tf new file mode 100644 index 0000000..a296ca3 --- /dev/null +++ b/modules/azure-node-pool/vars.tf @@ -0,0 +1,203 @@ +# ============================================================================= +# CLUSTER CONFIGURATION +# ============================================================================= + +# Name of the tenant cluster +variable "tenant_cluster_name" { + description = "Name of the tenant cluster" + type = string + default = "charlie" +} + +# ============================================================================= +# POOL CONFIGURATION +# 
============================================================================= + +variable "runcmd" { + description = "Command to run on the node at first boot time" + type = string + default = "echo 'Hello, World!'" +} + +variable "pool_name" { + description = "Name of the node pool" + type = string + default = "default" +} + +variable "pool_size" { + description = "The size of the node pool" + type = number + default = 3 +} + +variable "pool_min_size" { + description = "The minimum size of the node pool" + type = number + default = 1 +} + +variable "pool_max_size" { + description = "The maximum size of the node pool" + type = number + default = 9 +} + +# ============================================================================= +# AZURE CONFIGURATION +# ============================================================================= + +variable "azure_location" { + description = "Azure region where resources are created" + type = string + default = "italynorth" +} + +variable "azure_resource_group_name" { + description = "Name of the Azure resource group" + type = string + default = "kamaji" +} + +variable "azure_vnet_name" { + description = "Name of the Azure virtual network" + type = string + default = "kamaji-vnet" +} + +variable "azure_subnet_name" { + description = "Name of the Azure subnet" + type = string + default = "kamaji-subnet" +} + +variable "vnet_subnet_address_prefix" { + description = "Address prefix for the subnet (used for security group rules)" + type = string + default = "10.10.10.0/24" +} + +variable "vm_size" { + description = "Size of the virtual machines" + type = string + default = "Standard_D2s_v3" +} + +variable "assign_public_ip" { + description = "Whether to assign public IP addresses to VMs" + type = bool + default = true +} + +variable "tags" { + description = "Tags used for Azure resources" + type = map(string) + default = { + "ManagedBy" = "Clastix" + "CreatedBy" = "Terraform" + } +} + +# 
============================================================================= +# VM IMAGE CONFIGURATION +# ============================================================================= + +variable "vm_image_publisher" { + description = "Publisher of the VM image" + type = string + default = "Canonical" +} + +variable "vm_image_offer" { + description = "Offer of the VM image" + type = string + default = "ubuntu-24_04-lts" +} + +variable "vm_image_sku" { + description = "SKU of the VM image" + type = string + default = "server" +} + +variable "vm_image_version" { + description = "Version of the VM image" + type = string + default = "latest" +} + +# ============================================================================= +# NODE CONFIGURATION +# ============================================================================= + +variable "node_disk_size" { + description = "Disk size for each node in GB" + type = number + default = 30 +} + +variable "node_disk_type" { + description = "Storage account type for each node (Standard_LRS, Premium_LRS)" + type = string + default = "Premium_LRS" +} + +# ============================================================================= +# SSH CONFIGURATION +# ============================================================================= + +variable "ssh_user" { + description = "SSH user for the nodes" + type = string + default = "ubuntu" +} + +variable "ssh_public_key_path" { + description = "Path to the SSH public key" + type = string + default = "~/.ssh/id_rsa.pub" +} + +# ============================================================================= +# AUTO SCALING CONFIGURATION +# ============================================================================= + +variable "enable_autoscaling" { + description = "Enable automatic scaling based on CPU metrics" + type = bool + default = true +} + +variable "scale_out_cpu_threshold" { + description = "CPU threshold percentage to trigger scale out" + type = number + default = 75 +} + 
+variable "scale_in_cpu_threshold" { + description = "CPU threshold percentage to trigger scale in" + type = number + default = 25 +} + +# ============================================================================= +# INSTANCE REPAIR CONFIGURATION +# ============================================================================= + +variable "enable_automatic_instance_repair" { + description = "Enable automatic instance repair for failed VMs" + type = bool + default = true +} + +variable "automatic_instance_repair_grace_period" { + description = "Grace period for automatic instance repair (in minutes)" + type = string + default = "PT30M" +} + +variable "upgrade_mode" { + description = "Upgrade mode for the scale set (Manual, Automatic, Rolling)" + type = string + default = "Manual" +} \ No newline at end of file diff --git a/providers/azure/README.md b/providers/azure/README.md new file mode 100644 index 0000000..5e99ade --- /dev/null +++ b/providers/azure/README.md @@ -0,0 +1,356 @@ +# Azure Provider for Kamaji Node Pools + +Ready-to-use Terraform implementation for creating multiple Kubernetes worker node pools on Microsoft Azure using Virtual Machine Scale Sets (VMSS). + +## Features + +- **Multiple Node Pools** with different configurations per pool +- **Virtual Machine Scale Sets** with automatic scaling per pool +- **CPU-based autoscaling** with configurable thresholds per pool +- **Automatic instance repair** for failed VMs (configurable per pool) +- **Network Security Groups** with Kubernetes-optimized rules +- **Ubuntu 24.04 LTS** support +- **Bootstrap token integration** with automatic cluster joining via YAKI + +## Prerequisites + +### Required Infrastructure + +Before using this provider, you must have: + +1. **Azure Resource Group** - Where all resources will be created +2. **Virtual Network (VNet)** - Network for the node pools +3. **Subnet** - Subnet within the VNet for VM instances +4. 
**Kamaji tenant cluster** - Running cluster with accessible kubeconfig + +### Required Tools + +- Terraform >= 1.0 +- Azure CLI >= 2.0 (authenticated) +- SSH key pair for node access +- [direnv](https://direnv.net/) (optional, for automatic environment management) + +## Quick Start + +### 1. Authentication + +```bash +# Login to Azure +az login + +# Set subscription (if needed) +az account set --subscription "your-subscription-id" + +# Verify access +az account show +``` + +### 2. Environment Configuration + +**Option A: Using .envrc (Recommended)** + +```bash +# Install direnv (if not already installed) +# macOS: brew install direnv +# Ubuntu: apt install direnv + +# Add direnv hook to your shell +echo 'eval "$(direnv hook bash)"' >> ~/.bashrc # for bash +echo 'eval "$(direnv hook zsh)"' >> ~/.zshrc # for zsh + +# Copy and edit .envrc file +cp .envrc.sample .envrc +vim .envrc + +# Allow direnv to load the environment +direnv allow +``` + +**Option B: Using tfvars file** + +```bash +# Copy sample configuration +cp main.auto.tfvars.sample main.auto.tfvars + +# Edit configuration - MAKE SURE TO SET azure_subscription_id +vim main.auto.tfvars +``` + +### 3. Deploy + +```bash +# Initialize Terraform +terraform init + +# Review plan +terraform plan + +# Apply configuration +terraform apply +``` + +## Configuration + +### Required Variables + +```hcl +# Azure subscription (REQUIRED - no default) +azure_subscription_id = "your-subscription-id" + +# Cluster identity +tenant_cluster_name = "your-cluster-name" + +# Bootstrap configuration +tenant_kubeconfig_path = "/path/to/your/cluster.kubeconfig" + +# Azure infrastructure (must exist) +azure_resource_group_name = "your-resource-group" +azure_vnet_name = "your-vnet" +azure_subnet_name = "your-subnet" + +# Node pools configuration +node_pools = [ + { + name = "default" + size = 3 + node_disk_size = 30 + vm_size = "Standard_D2s_v3" + # ... 
other pool-specific settings + } +] +``` + +### Node Pool Configuration + +Each node pool supports the following configuration options: + +```hcl +node_pools = [ + { + # Required fields + name = "pool-name" # Unique name for this pool + size = 3 # Number of nodes in the pool + node_disk_size = 30 # Disk size in GB + vm_size = "Standard_D2s_v3" # Azure VM size + + # Optional fields with defaults + min_size = 1 # Minimum nodes for autoscaling + max_size = 9 # Maximum nodes for autoscaling + node_disk_type = "Premium_LRS" # Storage type + assign_public_ip = true # Assign public IPs + enable_autoscaling = false # Enable CPU-based autoscaling + scale_out_cpu_threshold = 75 # CPU threshold to scale out + scale_in_cpu_threshold = 25 # CPU threshold to scale in + enable_automatic_instance_repair = false # Enable automatic repair + automatic_instance_repair_grace_period = "PT30M" # Repair grace period + upgrade_mode = "Manual" # Scale set upgrade mode + } +] +``` + +### Multiple Node Pools Example + +```hcl +node_pools = [ + { + name = "default" + size = 3 + node_disk_size = 50 + vm_size = "Standard_D2s_v3" + assign_public_ip = true + enable_autoscaling = false + }, + { + name = "system" + size = 2 + node_disk_size = 100 + vm_size = "Standard_D4s_v3" + assign_public_ip = false + enable_autoscaling = true + scale_out_cpu_threshold = 80 + scale_in_cpu_threshold = 30 + enable_automatic_instance_repair = true + } +] +``` + +### Environment Variable Options + +You can override any Terraform variable using environment variables: + +```bash +# Using .envrc file +export TF_VAR_azure_subscription_id="your-subscription-id" +export TF_VAR_azure_location="italynorth" +export TF_VAR_tenant_cluster_name="customer-azure" +export TF_VAR_tenant_kubeconfig_path="/path/to/kubeconfig" + +# Or set Azure provider environment variables +export ARM_SUBSCRIPTION_ID="your-subscription-id" +export ARM_CLIENT_ID="your-service-principal-id" # if using service principal +export 
ARM_CLIENT_SECRET="your-service-principal-secret" # if using service principal +export ARM_TENANT_ID="your-tenant-id" # if using service principal +``` + +### Infrastructure Setup + +If you haven't created the required Azure infrastructure: + +```bash +# Create resource group +az group create --name "kamaji" --location "italynorth" + +# Create virtual network with subnet +az network vnet create \ + --resource-group "kamaji" \ + --name "kamaji-vnet" \ + --address-prefix "10.10.0.0/16" \ + --subnet-name "kamaji-subnet" \ + --subnet-prefix "10.10.10.0/24" \ + --location "italynorth" +``` + +## Scaling Configuration + +Each node pool supports both manual and automatic scaling modes: + +### Manual Scaling (enable_autoscaling = false) +- **Terraform Control**: Direct control over instance count via `size` +- **No Autoscaler**: Azure autoscaler is disabled for this pool +- **Use Case**: When you want predictable, manual control over node count +- **Scaling**: Change `size` in tfvars and run `terraform apply` + +### Automatic Scaling (enable_autoscaling = true) +- **CPU-Based**: Automatic scaling based on CPU utilization +- **Scale Out**: When average CPU > threshold for 5 minutes +- **Scale In**: When average CPU < threshold for 5 minutes +- **Cooldown**: 1 minute between scaling actions +- **Range**: min_size to max_size instances +- **Note**: `size` sets the default capacity; actual scaling is managed by Azure + +## Security + +### Network Security Groups + +Automatically creates security group rules for: + +- **Outbound**: Allow all outbound traffic +- **SSH**: Allow inbound SSH (port 22) from anywhere +- **Cluster Internal**: Allow all traffic within the subnet + +### VM Security + +- SSH key-based authentication (password disabled) +- Premium SSD storage by default +- Automatic security updates via cloud-init + +## Monitoring + +### Instance Health + +- Automatic instance repair configurable per pool +- Failed instances can be manually replaced or automatically via 
Azure policies + +### Scaling Metrics + +- CPU percentage monitoring every minute +- 5-minute evaluation windows for scaling decisions +- Configurable thresholds per pool for scale-out/scale-in + +## Troubleshooting + +### Common Issues + +1. **Subscription ID Error** + ```bash + # Make sure subscription ID is set + export TF_VAR_azure_subscription_id="your-subscription-id" + # Or update main.auto.tfvars + ``` + +2. **Authentication Errors** + ```bash + az login + az account set --subscription "your-subscription-id" + ``` + +3. **Resource Group Not Found** + ```bash + az group create --name "kamaji" --location "italynorth" + ``` + +4. **VNet/Subnet Not Found** + ```bash + az network vnet create --resource-group "kamaji" --name "kamaji-vnet" --address-prefix "10.10.0.0/16" --subnet-name "kamaji-subnet" --subnet-prefix "10.10.10.0/24" + ``` + +5. **Kubeconfig Issues** + - Verify kubeconfig path exists + - Test cluster connectivity: `kubectl --kubeconfig=/path/to/config get nodes` + +### Debugging + +```bash +# Enable detailed logging +export TF_LOG=DEBUG + +# Check Azure resources +az resource list --resource-group "kamaji" --output table + +# Verify VM scale sets (one per pool) +az vmss list --resource-group "kamaji" --output table +``` + +## Outputs + +After successful deployment: + +```bash +# View outputs +terraform output + +# Get cluster information +terraform output cluster_info + +# Get all node pools details +terraform output node_pools +``` + +Example output structure: +```bash +cluster_info = { + "azure_location" = "italynorth" + "node_pools_count" = 2 + "node_pools_names" = ["default", "system"] + "tenant_cluster_name" = "customer-azure" +} + +node_pools = { + "default" = { + "autoscale_settings" = { ... } + "network_security_group" = { ... } + "vmss_details" = { ... } + } + "system" = { + "autoscale_settings" = { ... } + "network_security_group" = { ... } + "vmss_details" = { ...
} + } +} +``` + +## Cleanup + +```bash +# Destroy infrastructure +terraform destroy + +# Confirm removal +az resource list --resource-group "kamaji" --output table +``` + +## Support + +- **Documentation**: [Kamaji Documentation](https://kamaji.clastix.io) +- **Bootstrap Script**: [YAKI](https://goyaki.clastix.io) +- **Issues**: Report issues in the project repository \ No newline at end of file diff --git a/providers/azure/backend.tf b/providers/azure/backend.tf new file mode 100644 index 0000000..76fe3e7 --- /dev/null +++ b/providers/azure/backend.tf @@ -0,0 +1,9 @@ +# ============================================================================= +# TERRAFORM BACKEND +# ============================================================================= + +terraform { + backend "local" { + path = "tfstate/terraform.tfstate" + } +} \ No newline at end of file diff --git a/providers/azure/main.auto.tfvars.sample b/providers/azure/main.auto.tfvars.sample new file mode 100644 index 0000000..d50c269 --- /dev/null +++ b/providers/azure/main.auto.tfvars.sample @@ -0,0 +1,47 @@ +# Azure Configuration +azure_subscription_id = "" # Azure subscription ID (REQUIRED) +azure_location = "" # Azure region (e.g., "eastus", "westeurope", "italynorth") +azure_resource_group_name = "" # Resource group name (e.g., "kamaji") +azure_vnet_name = "" # Virtual network name (e.g., "kamaji-vnet") +azure_subnet_name = "" # Subnet name (e.g., "kamaji-subnet") +vnet_subnet_address_prefix = "" # Subnet CIDR (e.g., "10.10.10.0/24") + +# SSH Configuration +ssh_user = "" # SSH username (e.g., "ubuntu", "azureuser") +ssh_public_key_path = "" # Path to SSH public key (e.g., "~/.ssh/id_rsa.pub") + +# Tenant Cluster Configuration +tenant_cluster_name = "" # Name of the tenant cluster +tenant_kubeconfig_path = "" # Path to kubeconfig file (e.g., "~/.kube/config") + +# Node Pool Configuration +node_pools = [ + { + name = "" # Name of the node pool (e.g., "default", "workers") + size = 0 # Number of nodes in the 
pool + node_disk_size = 0 # Disk size for each node (in GB) + vm_size = "" # Azure VM size (e.g., "Standard_D2s_v3", "Standard_D4s_v3") + min_size = 0 # Minimum number of nodes (for autoscaling) + max_size = 0 # Maximum number of nodes (for autoscaling) + node_disk_type = "" # Storage type (Standard_LRS, Premium_LRS) + assign_public_ip = false # Whether to assign public IP addresses + enable_autoscaling = false # Enable CPU-based autoscaling + scale_out_cpu_threshold = 0 # CPU threshold percentage to trigger scale out + scale_in_cpu_threshold = 0 # CPU threshold percentage to trigger scale in + enable_automatic_instance_repair = false # Enable automatic instance repair + automatic_instance_repair_grace_period = "" # Grace period (e.g., "PT30M") + upgrade_mode = "" # Upgrade mode (Manual, Automatic, Rolling) + }, + # Add more node pools here as needed. +] + +# Example: List available VM sizes in your region +# az vm list-sizes --location italynorth --output table + +# Tags for Azure resources +tags = { + "ManagedBy" = "" # Who manages these resources (e.g., "Terraform", "Clastix") + "CreatedBy" = "" # What created these resources (e.g., "Terraform") + "Environment" = "" # Environment name (e.g., "dev", "staging", "prod") + "Project" = "" # Project name (e.g., "Kamaji", "MyProject") +} \ No newline at end of file diff --git a/providers/azure/main.tf b/providers/azure/main.tf new file mode 100644 index 0000000..a2e9031 --- /dev/null +++ b/providers/azure/main.tf @@ -0,0 +1,86 @@ +# ============================================================================= +# PROVIDERS +# ============================================================================= + +# Configure the Kubernetes provider +provider "kubernetes" { + # Path to the kubeconfig file for accessing the tenant cluster + config_path = var.tenant_kubeconfig_path +} + +# Configure the Azure Provider +provider "azurerm" { + subscription_id = var.azure_subscription_id + features {} +} + +# 
============================================================================= +# BOOTSTRAP TOKEN +# ============================================================================= + +# Call the shared bootstrap-token module to generate the join command +module "bootstrap_token" { + source = "../../modules/bootstrap-token" # Updated to use shared module + kubeconfig_path = var.tenant_kubeconfig_path # Pass the kubeconfig path to the module + yaki_url = var.yaki_url # Pass the YAKI URL to the module +} + +# ============================================================================= +# NODE POOLS +# ============================================================================= + +module "azure_node_pools" { + source = "../../modules/azure-node-pool" # Updated path to the azure-node-pool module + + # Iterate over the list of node pools and call the module for each pool + for_each = { for pool in var.node_pools : pool.name => pool } + + # Tenant cluster configuration + tenant_cluster_name = var.tenant_cluster_name + + # Pool configuration + pool_name = each.value.name + pool_size = each.value.size + pool_min_size = each.value.min_size + pool_max_size = each.value.max_size + + # Node configuration + node_disk_size = each.value.node_disk_size + node_disk_type = each.value.node_disk_type + + # Azure configuration + azure_location = var.azure_location + azure_resource_group_name = var.azure_resource_group_name + azure_vnet_name = var.azure_vnet_name + azure_subnet_name = var.azure_subnet_name + vnet_subnet_address_prefix = var.vnet_subnet_address_prefix + vm_size = each.value.vm_size + assign_public_ip = each.value.assign_public_ip + vm_image_publisher = var.vm_image_publisher + vm_image_offer = var.vm_image_offer + vm_image_sku = var.vm_image_sku + vm_image_version = var.vm_image_version + tags = var.tags + + # SSH configuration + ssh_user = var.ssh_user + ssh_public_key_path = var.ssh_public_key_path + + # Autoscaling configuration + enable_autoscaling = 
each.value.enable_autoscaling + scale_out_cpu_threshold = each.value.scale_out_cpu_threshold + scale_in_cpu_threshold = each.value.scale_in_cpu_threshold + + # Instance repair configuration + enable_automatic_instance_repair = each.value.enable_automatic_instance_repair + automatic_instance_repair_grace_period = each.value.automatic_instance_repair_grace_period + upgrade_mode = each.value.upgrade_mode + + # Join command for bootstrapping nodes + runcmd = module.bootstrap_token.join_cmd + + # Ensure the azure-node-pool module depends on the bootstrap-token module + depends_on = [ + module.bootstrap_token + ] +} \ No newline at end of file diff --git a/providers/azure/outputs.tf b/providers/azure/outputs.tf new file mode 100644 index 0000000..d44fb46 --- /dev/null +++ b/providers/azure/outputs.tf @@ -0,0 +1,40 @@ +# ============================================================================= +# BOOTSTRAP TOKEN OUTPUTS +# ============================================================================= + +output "bootstrap_token" { + description = "Bootstrap token details" + value = { + join_cmd = module.bootstrap_token.join_cmd + } + sensitive = true +} + +# ============================================================================= +# AZURE NODE POOLS OUTPUTS +# ============================================================================= + +output "node_pools" { + description = "Azure node pools details" + value = { + for name, pool in module.azure_node_pools : name => { + vmss_details = pool.vmss_details + autoscale_settings = pool.autoscale_settings + network_security_group = pool.network_security_group + } + } +} + +# ============================================================================= +# CLUSTER INFORMATION +# ============================================================================= + +output "cluster_info" { + description = "Cluster information" + value = { + tenant_cluster_name = var.tenant_cluster_name + azure_location = var.azure_location + 
node_pools_count = length(var.node_pools) + node_pools_names = [for pool in var.node_pools : pool.name] + } +} \ No newline at end of file diff --git a/providers/azure/vars.tf b/providers/azure/vars.tf new file mode 100644 index 0000000..682b508 --- /dev/null +++ b/providers/azure/vars.tf @@ -0,0 +1,144 @@ +# ============================================================================= +# CLUSTER CONFIGURATION +# ============================================================================= + +variable "tenant_cluster_name" { + description = "Name of the tenant cluster" + type = string +} + +variable "tenant_kubeconfig_path" { + description = "Path to the kubeconfig file for the tenant cluster" + type = string + default = "~/.kube/config" +} + +# ============================================================================= +# BOOTSTRAP CONFIGURATION +# ============================================================================= + +variable "yaki_url" { + description = "URL to the YAKI script for node bootstrapping" + type = string + default = "https://goyaki.clastix.io" +} + +# ============================================================================= +# NODE POOL CONFIGURATION +# ============================================================================= + +variable "node_pools" { + description = "List of Azure node pools with their configurations" + type = list(object({ + name = string + size = number + node_disk_size = number + vm_size = string + min_size = optional(number, 1) + max_size = optional(number, 9) + node_disk_type = optional(string, "Premium_LRS") + assign_public_ip = optional(bool, true) + enable_autoscaling = optional(bool, false) + scale_out_cpu_threshold = optional(number, 75) + scale_in_cpu_threshold = optional(number, 25) + enable_automatic_instance_repair = optional(bool, false) + automatic_instance_repair_grace_period = optional(string, "PT30M") + upgrade_mode = optional(string, "Manual") + })) +} + +# 
============================================================================= +# AZURE CONFIGURATION +# ============================================================================= + +variable "azure_subscription_id" { + description = "Azure subscription ID" + type = string +} + +variable "azure_location" { + description = "Azure region where resources will be created" + type = string + default = "italynorth" +} + +variable "azure_resource_group_name" { + description = "Name of the Azure resource group" + type = string + default = "kamaji" +} + +variable "azure_vnet_name" { + description = "Name of the Azure virtual network" + type = string + default = "kamaji-vnet" +} + +variable "azure_subnet_name" { + description = "Name of the Azure subnet" + type = string + default = "kamaji-subnet" +} + +variable "vnet_subnet_address_prefix" { + description = "CIDR block for the Azure subnet" + type = string + default = "10.10.10.0/24" +} + +# ============================================================================= +# VM IMAGE CONFIGURATION +# ============================================================================= + +variable "vm_image_publisher" { + description = "Azure VM image publisher" + type = string + default = "Canonical" +} + +variable "vm_image_offer" { + description = "Azure VM image offer" + type = string + default = "ubuntu-24_04-lts" +} + +variable "vm_image_sku" { + description = "Azure VM image SKU" + type = string + default = "server" +} + +variable "vm_image_version" { + description = "Azure VM image version" + type = string + default = "latest" +} + +# ============================================================================= +# SSH CONFIGURATION +# ============================================================================= + +variable "ssh_user" { + description = "SSH user for node access" + type = string + default = "ubuntu" +} + +variable "ssh_public_key_path" { + description = "Path to the SSH public key" + type = string + default = 
"~/.ssh/id_rsa.pub" +} + +# ============================================================================= +# TAGS +# ============================================================================= + +variable "tags" { + description = "Tags applied to Azure resources" + type = map(string) + default = { + "ManagedBy" = "Clastix" + "CreatedBy" = "Terraform" + } +} \ No newline at end of file diff --git a/providers/azure/versions.tf b/providers/azure/versions.tf new file mode 100644 index 0000000..549b3db --- /dev/null +++ b/providers/azure/versions.tf @@ -0,0 +1,24 @@ +terraform { + required_version = ">= 1.3" # node_pools in vars.tf uses optional(type, default), available from Terraform 1.3 + + required_providers { + azurerm = { + source = "hashicorp/azurerm" + version = ">= 3.0" + } + cloudinit = { + source = "hashicorp/cloudinit" + version = ">= 2.0" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.0" + } + tls = { + source = "hashicorp/tls" + version = ">= 3.0" + } + } +} + + \ No newline at end of file