feat(azure): add support provider

This commit is contained in:
bsctl
2025-06-10 16:52:42 +02:00
parent 9eadb35497
commit 5ed2bd8eae
14 changed files with 1469 additions and 1 deletions

View File

@@ -12,7 +12,7 @@ The machines created by this project automatically will create a secure bootstra
| **Proxmox** | Virtual Machines | Virtual Machines on Proxmox VE | Manual | Available |
| **vSphere** | Virtual Machines | Virtual Machines on VMware vSphere/vCenter | Manual | Available |
| **vCloud** | vApps | Multi-tenant Virtual Machines on VMware Cloud Director with vApp isolation | Manual | Available |
| **Azure** | Scale Sets | Azure VMs with automatic scaling and availability zones | Automatic | Planned |
| **Azure** | Virtual Machine Scale Sets | Azure VMs with automatic scaling and availability zones | Automatic | Available |
## Bootstrap Token Management

134
examples/azure/example.tf Normal file
View File

@@ -0,0 +1,134 @@
# =============================================================================
# AZURE KAMAJI NODE POOL EXAMPLE
# =============================================================================
#
# This example demonstrates multi-pool Azure node configuration with both
# manual and automatic scaling options:
#
# SCALING MODES:
# - enable_autoscaling = true: Azure manages scaling based on CPU metrics
# - enable_autoscaling = false: Terraform directly controls the pool's `size`
#
# For manual control, set enable_autoscaling = false and adjust the pool's
# `size` in your configuration files.
#
# =============================================================================
# Example: Azure Provider Usage
# This example shows how to use the Azure provider wrapper
# Provider requirements for this example.
terraform {
  required_providers {
    # Azure Resource Manager provider, constrained to "~> 4.0".
    azurerm = {
      version = "~> 4.0"
      source  = "hashicorp/azurerm"
    }

    # Kubernetes provider, constrained to "~> 2.35.0".
    kubernetes = {
      version = "~> 2.35.0"
      source  = "hashicorp/kubernetes"
    }
  }
}
# Kubernetes provider: reads the kubeconfig found at
# var.tenant_kubeconfig_path.
provider "kubernetes" {
  config_path = var.tenant_kubeconfig_path
}

# Azure provider: operates on the subscription named by
# var.azure_subscription_id; `features` is left empty (no overrides).
provider "azurerm" {
  features {}

  subscription_id = var.azure_subscription_id
}
# Instantiate the Azure provider wrapper with two node pools:
# a manually sized "default" pool and an autoscaled "system" pool.
module "azure_kamaji_node_pools" {
  source = "../../providers/azure"

  # Cluster configuration
  tenant_cluster_name = "my-azure-cluster"
  # Use the same kubeconfig variable as the kubernetes provider in this file,
  # so the module and the provider always point at the same tenant cluster.
  tenant_kubeconfig_path = var.tenant_kubeconfig_path
  yaki_url               = "https://goyaki.clastix.io"

  # Node pools configuration
  node_pools = [
    # Manually scaled pool: Terraform controls the node count through `size`.
    {
      name                                   = "default"
      size                                   = 3
      min_size                               = 2
      max_size                               = 10
      node_disk_size                         = 50
      node_disk_type                         = "Premium_LRS"
      vm_size                                = "Standard_D2s_v3"
      assign_public_ip                       = true
      enable_autoscaling                     = false
      scale_out_cpu_threshold                = 75
      scale_in_cpu_threshold                 = 25
      enable_automatic_instance_repair       = false
      automatic_instance_repair_grace_period = "PT30M"
      upgrade_mode                           = "Manual"
    },
    # Autoscaled pool: `size` is the default capacity, bounded by
    # min_size/max_size.
    {
      name                                   = "system"
      size                                   = 2
      min_size                               = 1
      max_size                               = 5
      node_disk_size                         = 100
      node_disk_type                         = "Premium_LRS"
      vm_size                                = "Standard_D4s_v3"
      assign_public_ip                       = false
      enable_autoscaling                     = true
      scale_out_cpu_threshold                = 80
      scale_in_cpu_threshold                 = 30
      enable_automatic_instance_repair       = true
      automatic_instance_repair_grace_period = "PT15M"
      upgrade_mode                           = "Automatic"
    }
  ]

  # Azure configuration
  azure_subscription_id      = var.azure_subscription_id
  azure_location             = var.azure_location
  azure_resource_group_name  = "kamaji"
  azure_vnet_name            = "kamaji-vnet"
  azure_subnet_name          = "kamaji-subnet"
  vnet_subnet_address_prefix = "10.10.10.0/24"

  tags = {
    "ManagedBy"   = "Terraform"
    "Environment" = "production"
    "Provider"    = "Azure"
  }

  # SSH configuration
  ssh_user            = "ubuntu"
  ssh_public_key_path = "~/.ssh/id_rsa.pub"
}
# -----------------------------------------------------------------------------
# Input variables of the example
# -----------------------------------------------------------------------------

# No default: must be supplied by the caller.
variable "azure_subscription_id" {
  type        = string
  description = "Azure subscription ID"
}

variable "azure_location" {
  type        = string
  default     = "italynorth"
  description = "Azure region"
}

# Consumed by the kubernetes provider configuration in this file.
variable "tenant_kubeconfig_path" {
  type        = string
  default     = "~/.kube/config"
  description = "Path to tenant cluster kubeconfig"
}
# -----------------------------------------------------------------------------
# Outputs re-exported from the wrapper module
# -----------------------------------------------------------------------------

output "node_pools" {
  value       = module.azure_kamaji_node_pools.node_pools
  description = "Node pools details"
}

output "cluster_info" {
  value       = module.azure_kamaji_node_pools.cluster_info
  description = "Cluster information"
}

View File

@@ -0,0 +1,138 @@
# Azure Node Pool Module
Creates Azure Virtual Machine Scale Sets for Kamaji tenant cluster worker nodes with automatic scaling capabilities.
## Features
- **Virtual Machine Scale Sets** with automatic scaling
- **Network Security Groups** with Kubernetes-optimized rules
- **Ubuntu 24.04 LTS** support
- **Automatic instance repair** for failed VMs
- **CPU-based autoscaling** with configurable thresholds
- **Bootstrap token integration** via cloud-init
## Usage
```hcl
module "azure_node_pool" {
source = "../../modules/azure-node-pool"
# Cluster configuration
tenant_cluster_name = "charlie"
pool_name = "default"
# Pool sizing
pool_size = 3
pool_min_size = 1
pool_max_size = 9
# Azure configuration
azure_location = "italynorth"
azure_resource_group_name = "kamaji"
azure_vnet_name = "kamaji-vnet"
azure_subnet_name = "kamaji-subnet"
# VM configuration
vm_size = "Standard_D2s_v3"
assign_public_ip = true
node_disk_size = 30
node_disk_type = "Premium_LRS"
# Autoscaling
enable_autoscaling = true
scale_out_cpu_threshold = 75
scale_in_cpu_threshold = 25
# Bootstrap command
runcmd = module.bootstrap_token.join_cmd
tags = {
Environment = "production"
Project = "kamaji"
}
}
```
## Requirements
| Name | Version |
|------|---------|
| terraform | >= 1.0 |
| azurerm | >= 3.0 |
| cloudinit | >= 2.0 |
## Providers
| Name | Version |
|------|---------|
| azurerm | >= 3.0 |
| cloudinit | >= 2.0 |
## Resources
- `azurerm_linux_virtual_machine_scale_set` - Main VMSS resource
- `azurerm_network_security_group` - Security group for nodes
- `azurerm_network_security_rule` - Security rules
- `azurerm_monitor_autoscale_setting` - Autoscaling configuration
## Variables
| Name | Description | Type | Default |
|------|-------------|------|---------|
| `tenant_cluster_name` | Name of the tenant cluster | `string` | `"charlie"` |
| `pool_name` | Name of the node pool | `string` | `"default"` |
| `pool_size` | The size of the node pool | `number` | `3` |
| `pool_min_size` | The minimum size of the node pool | `number` | `1` |
| `pool_max_size` | The maximum size of the node pool | `number` | `9` |
| `azure_location` | Azure region where resources are created | `string` | `"italynorth"` |
| `azure_resource_group_name` | Name of the Azure resource group | `string` | `"kamaji"` |
| `azure_vnet_name` | Name of the Azure virtual network | `string` | `"kamaji-vnet"` |
| `azure_subnet_name` | Name of the Azure subnet | `string` | `"kamaji-subnet"` |
| `vm_size` | Size of the virtual machines | `string` | `"Standard_D2s_v3"` |
| `assign_public_ip` | Whether to assign public IP addresses to VMs | `bool` | `true` |
| `node_disk_size` | Disk size for each node in GB | `number` | `30` |
| `node_disk_type` | Storage account type for each node | `string` | `"Premium_LRS"` |
| `ssh_user` | SSH user for the nodes | `string` | `"ubuntu"` |
| `ssh_public_key_path` | Path to the SSH public key | `string` | `"~/.ssh/id_rsa.pub"` |
| `enable_autoscaling` | Enable automatic scaling based on CPU metrics | `bool` | `true` |
| `scale_out_cpu_threshold` | CPU threshold percentage to trigger scale out | `number` | `75` |
| `scale_in_cpu_threshold` | CPU threshold percentage to trigger scale in | `number` | `25` |
| `runcmd` | Command to run on the node at first boot time | `string` | `"echo 'Hello, World!'"` |
## Outputs
| Name | Description |
|------|-------------|
| `vmss_details` | Virtual Machine Scale Set details |
| `autoscale_settings` | Autoscale settings details |
| `network_security_group` | Network Security Group details |
## Security Groups
The module creates a Network Security Group with the following rules:
- **Outbound**: Allow all outbound traffic
- **SSH**: Allow inbound SSH (port 22) from anywhere
- **Cluster Internal**: Allow all traffic within the subnet
## Scaling Behavior
This module supports both manual and automatic scaling modes:
### Manual Scaling (`enable_autoscaling = false`)
- **Direct Control**: Terraform directly manages VMSS instance count
- **pool_size Changes**: Changing `pool_size` will update the VMSS immediately on `terraform apply`
- **No Lifecycle Rules**: No `ignore_changes` applied to instances
- **Use Case**: Predictable workloads requiring manual capacity control
### Automatic Scaling (`enable_autoscaling = true`)
- **CPU-Based**: Azure autoscaler manages instance count based on CPU metrics
- **Scale Out**: When average CPU > 75% for 5 minutes
- **Scale In**: When average CPU < 25% for 5 minutes
- **Cooldown**: 1 minute between scaling actions
- **Default Capacity**: `pool_size` sets the initial/default capacity
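- **Lifecycle Protection**: Not applied by this module yet — the scale set resource sets `instances = var.pool_size` without a `lifecycle { ignore_changes = [instances] }` rule, so a later `terraform apply` can reset the instance count chosen by the autoscaler back to `pool_size`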
## Instance Repair
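The `enable_automatic_instance_repair` (default `true`) and `automatic_instance_repair_grace_period` (default `PT30M`, a 30-minute ISO 8601 duration) variables are declared for repairing failed VMs, but the scale set resource does not currently configure an `automatic_instance_repair` block, so these variables have no effect yet.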

View File

@@ -0,0 +1,38 @@
# =============================================================================
# DATA SOURCES
# =============================================================================
# Lookups of infrastructure that is referenced by name, not created here.

# Subnet whose id is used by the scale set's primary IP configuration.
data "azurerm_subnet" "tenant_subnet" {
  resource_group_name  = var.azure_resource_group_name
  virtual_network_name = var.azure_vnet_name
  name                 = var.azure_subnet_name
}

# Virtual network that contains the subnet above.
data "azurerm_virtual_network" "tenant_vnet" {
  resource_group_name = var.azure_resource_group_name
  name                = var.azure_vnet_name
}

# Resource group that holds the network and the node pool resources.
data "azurerm_resource_group" "tenant" {
  name = var.azure_resource_group_name
}
# =============================================================================
# CLOUD-INIT CONFIGURATION
# =============================================================================
# Renders the node user data as a single cloud-config part; the result is
# gzip-compressed and base64-encoded, and used as the scale set `custom_data`.
data "cloudinit_config" "node_cloud_init" {
  base64_encode = true
  gzip          = true

  part {
    content_type = "text/cloud-config"
    filename     = "cloud-config.yaml"

    # The template receives an empty hostname, the first-boot command and
    # the SSH user together with the contents of its public key file.
    content = templatefile(
      "${path.module}/../templates/cloud-init/userdata.yml.tpl",
      {
        ssh_public_key = file(pathexpand(var.ssh_public_key_path))
        ssh_user       = var.ssh_user
        runcmd         = var.runcmd
        hostname       = ""
      }
    )
  }
}

View File

@@ -0,0 +1,206 @@
# =============================================================================
# TERRAFORM CONFIGURATION
# =============================================================================
# Providers this module needs. No version constraints are declared here.
terraform {
  required_providers {
    cloudinit = {
      source = "hashicorp/cloudinit"
    }

    azurerm = {
      source = "hashicorp/azurerm"
    }
  }
}
# =============================================================================
# NETWORK SECURITY GROUP CONFIGURATION
# =============================================================================
# Per-pool network security group, named "<cluster>-<pool>-nsg".
# It is attached to the scale set's network interface.
resource "azurerm_network_security_group" "kubernetes" {
  name                = "${var.tenant_cluster_name}-${var.pool_name}-nsg"
  resource_group_name = var.azure_resource_group_name
  location            = var.azure_location

  # var.tags is merged last, so a caller-supplied "Name" tag wins.
  tags = merge(
    { Name = "${var.tenant_cluster_name}-${var.pool_name}" },
    var.tags,
  )
}
# -----------------------------------------------------------------------------
# Rules attached to the pool's network security group
# -----------------------------------------------------------------------------

# Outbound: any protocol, any port, any destination.
resource "azurerm_network_security_rule" "allow_all_outbound" {
  network_security_group_name = azurerm_network_security_group.kubernetes.name
  resource_group_name         = var.azure_resource_group_name

  name      = "AllowAllOutbound"
  priority  = 100
  direction = "Outbound"
  access    = "Allow"
  protocol  = "*"

  source_address_prefix      = "*"
  source_port_range          = "*"
  destination_address_prefix = "*"
  destination_port_range     = "*"
}

# Inbound: TCP port 22 from any source address.
# NOTE(review): SSH is open to every source ("*") — confirm this is intended
# for pools that assign public IPs.
resource "azurerm_network_security_rule" "allow_ssh_inbound" {
  network_security_group_name = azurerm_network_security_group.kubernetes.name
  resource_group_name         = var.azure_resource_group_name

  name      = "AllowSSHInbound"
  priority  = 1000
  direction = "Inbound"
  access    = "Allow"
  protocol  = "Tcp"

  source_address_prefix      = "*"
  source_port_range          = "*"
  destination_address_prefix = "*"
  destination_port_range     = "22"
}

# Inbound: all traffic whose source and destination are both inside
# var.vnet_subnet_address_prefix.
resource "azurerm_network_security_rule" "allow_cluster_internal" {
  network_security_group_name = azurerm_network_security_group.kubernetes.name
  resource_group_name         = var.azure_resource_group_name

  name      = "AllowClusterInternal"
  priority  = 1100
  direction = "Inbound"
  access    = "Allow"
  protocol  = "*"

  source_address_prefix      = var.vnet_subnet_address_prefix
  source_port_range          = "*"
  destination_address_prefix = var.vnet_subnet_address_prefix
  destination_port_range     = "*"
}
# =============================================================================
# VIRTUAL MACHINE SCALE SET
# =============================================================================
# Linux VM scale set backing one node pool. The name carries an "auto" or
# "manual" segment depending on var.enable_autoscaling.
#
# NOTE(review): `instances` is always var.pool_size and no
# `lifecycle { ignore_changes = [instances] }` is declared, so with
# autoscaling enabled an apply presumably resets the autoscaler's count —
# confirm whether that is intended.
# NOTE(review): var.enable_automatic_instance_repair and
# var.automatic_instance_repair_grace_period are not referenced here.
resource "azurerm_linux_virtual_machine_scale_set" "nodes" {
  name                = "${var.tenant_cluster_name}-${var.pool_name}-${var.enable_autoscaling ? "auto" : "manual"}-vmss"
  location            = var.azure_location
  resource_group_name = var.azure_resource_group_name

  # Capacity and upgrade policy
  sku          = var.vm_size
  instances    = var.pool_size
  upgrade_mode = var.upgrade_mode

  # Access: SSH key only, password login disabled
  admin_username                  = var.ssh_user
  disable_password_authentication = true

  # First-boot configuration rendered by the cloud-init data source
  custom_data = data.cloudinit_config.node_cloud_init.rendered

  tags = merge(
    { Name = "${var.tenant_cluster_name}-${var.pool_name}" },
    var.tags,
  )

  admin_ssh_key {
    public_key = file(pathexpand(var.ssh_public_key_path))
    username   = var.ssh_user
  }

  source_image_reference {
    publisher = var.vm_image_publisher
    offer     = var.vm_image_offer
    sku       = var.vm_image_sku
    version   = var.vm_image_version
  }

  os_disk {
    caching              = "ReadWrite"
    disk_size_gb         = var.node_disk_size
    storage_account_type = var.node_disk_type
  }

  network_interface {
    name                      = "primary"
    primary                   = true
    network_security_group_id = azurerm_network_security_group.kubernetes.id

    ip_configuration {
      name      = "primary"
      primary   = true
      subnet_id = data.azurerm_subnet.tenant_subnet.id

      # Emit a public IP block only when the pool asks for one.
      dynamic "public_ip_address" {
        for_each = var.assign_public_ip ? [1] : []

        content {
          name = "primary"
        }
      }
    }
  }
}
# =============================================================================
# AUTO SCALING CONFIGURATION
# =============================================================================
# CPU-driven autoscale setting for the pool's scale set.
resource "azurerm_monitor_autoscale_setting" "nodes" {
  # Created only when autoscaling is requested for this pool.
  count = var.enable_autoscaling ? 1 : 0

  name                = "${var.tenant_cluster_name}-${var.pool_name}-autoscale"
  location            = var.azure_location
  resource_group_name = var.azure_resource_group_name
  target_resource_id  = azurerm_linux_virtual_machine_scale_set.nodes.id
  tags                = var.tags

  profile {
    name = "AutoScale"

    capacity {
      minimum = var.pool_min_size
      default = var.pool_size
      maximum = var.pool_max_size
    }

    # The scale-out and scale-in rules share metric, window and step size.
    # They differ only in comparison operator, threshold and direction.
    # Order is preserved: scale-out first, scale-in second.
    dynamic "rule" {
      for_each = [
        {
          operator  = "GreaterThan"
          threshold = var.scale_out_cpu_threshold
          direction = "Increase"
        },
        {
          operator  = "LessThan"
          threshold = var.scale_in_cpu_threshold
          direction = "Decrease"
        },
      ]

      content {
        metric_trigger {
          metric_name        = "Percentage CPU"
          metric_resource_id = azurerm_linux_virtual_machine_scale_set.nodes.id
          statistic          = "Average"
          time_aggregation   = "Average"
          time_grain         = "PT1M"
          time_window        = "PT5M"
          operator           = rule.value.operator
          threshold          = rule.value.threshold
        }

        scale_action {
          type      = "ChangeCount"
          direction = rule.value.direction
          value     = "1"
          cooldown  = "PT1M"
        }
      }
    }
  }
}

View File

@@ -0,0 +1,43 @@
# =============================================================================
# VIRTUAL MACHINE SCALE SET
# =============================================================================
# Identifying attributes of the pool's scale set, read back from the resource.
output "vmss_details" {
  value = {
    id             = azurerm_linux_virtual_machine_scale_set.nodes.id
    name           = azurerm_linux_virtual_machine_scale_set.nodes.name
    location       = azurerm_linux_virtual_machine_scale_set.nodes.location
    resource_group = azurerm_linux_virtual_machine_scale_set.nodes.resource_group_name
    vm_size        = azurerm_linux_virtual_machine_scale_set.nodes.sku
    instances      = azurerm_linux_virtual_machine_scale_set.nodes.instances
  }
  description = "Virtual Machine Scale Set details"
}
# =============================================================================
# AUTOSCALING SETTINGS
# =============================================================================
# Summary of the autoscale setting, or null when autoscaling is disabled.
# The resource is counted (0 or 1 instance), so `one()` yields either the
# single summary object or null for the empty case.
output "autoscale_settings" {
  description = "Autoscale settings details"
  value = one([
    for setting in azurerm_monitor_autoscale_setting.nodes : {
      name               = setting.name
      id                 = setting.id
      min_size           = var.pool_min_size
      max_size           = var.pool_max_size
      target_resource_id = setting.target_resource_id
    }
  ])
}
# =============================================================================
# NETWORK SECURITY GROUP
# =============================================================================
# Identifying attributes of the pool's network security group.
output "network_security_group" {
  value = {
    id                  = azurerm_network_security_group.kubernetes.id
    name                = azurerm_network_security_group.kubernetes.name
    resource_group_name = azurerm_network_security_group.kubernetes.resource_group_name
  }
  description = "Network Security Group details"
}

View File

@@ -0,0 +1,203 @@
# =============================================================================
# CLUSTER CONFIGURATION
# =============================================================================
# Tenant cluster name; used as the prefix of every resource name and of the
# "Name" tag created by this module.
variable "tenant_cluster_name" {
  type        = string
  default     = "charlie"
  description = "Name of the tenant cluster"
}
# =============================================================================
# POOL CONFIGURATION
# =============================================================================
# Pool name; second segment of the resource names created by this module.
variable "pool_name" {
  type        = string
  default     = "default"
  description = "Name of the node pool"
}

# Passed to the cloud-init template as `runcmd`.
variable "runcmd" {
  type        = string
  default     = "echo 'Hello, World!'"
  description = "Command to run on the node at first boot time"
}

# Used as the scale set `instances` value and as the autoscale profile's
# default capacity.
variable "pool_size" {
  type        = number
  default     = 3
  description = "The size of the node pool"
}

# Lower bound of the autoscale profile capacity.
variable "pool_min_size" {
  type        = number
  default     = 1
  description = "The minimum size of the node pool"
}

# Upper bound of the autoscale profile capacity.
variable "pool_max_size" {
  type        = number
  default     = 9
  description = "The maximum size of the node pool"
}
# =============================================================================
# AZURE CONFIGURATION
# =============================================================================
# Region used for the security group, scale set and autoscale setting.
variable "azure_location" {
  type        = string
  default     = "italynorth"
  description = "Azure region where resources are created"
}

# Existing resource group; looked up by name and used for every resource.
variable "azure_resource_group_name" {
  type        = string
  default     = "kamaji"
  description = "Name of the Azure resource group"
}

# Existing virtual network; looked up by name.
variable "azure_vnet_name" {
  type        = string
  default     = "kamaji-vnet"
  description = "Name of the Azure virtual network"
}

# Existing subnet; its id is attached to the scale set network interface.
variable "azure_subnet_name" {
  type        = string
  default     = "kamaji-subnet"
  description = "Name of the Azure subnet"
}

# Source and destination prefix of the "AllowClusterInternal" rule.
variable "vnet_subnet_address_prefix" {
  type        = string
  default     = "10.10.10.0/24"
  description = "Address prefix for the subnet (used for security group rules)"
}

# Scale set SKU.
variable "vm_size" {
  type        = string
  default     = "Standard_D2s_v3"
  description = "Size of the virtual machines"
}

# When true, a public IP block is added to the primary IP configuration.
variable "assign_public_ip" {
  type        = bool
  default     = true
  description = "Whether to assign public IP addresses to VMs"
}

# Merged over the module-generated "Name" tag on the security group and the
# scale set; applied as-is to the autoscale setting.
variable "tags" {
  type = map(string)
  default = {
    "ManagedBy" = "Clastix"
    "CreatedBy" = "Terraform"
  }
  description = "Tags used for Azure resources"
}
# =============================================================================
# VM IMAGE CONFIGURATION
# =============================================================================
# The four variables below feed the scale set's `source_image_reference`.

variable "vm_image_publisher" {
  type        = string
  default     = "Canonical"
  description = "Publisher of the VM image"
}

variable "vm_image_offer" {
  type        = string
  default     = "ubuntu-24_04-lts"
  description = "Offer of the VM image"
}

variable "vm_image_sku" {
  type        = string
  default     = "server"
  description = "SKU of the VM image"
}

variable "vm_image_version" {
  type        = string
  default     = "latest"
  description = "Version of the VM image"
}
# =============================================================================
# NODE CONFIGURATION
# =============================================================================
# OS disk settings of every scale set instance.

# Maps to `os_disk.disk_size_gb`.
variable "node_disk_size" {
  type        = number
  default     = 30
  description = "Disk size for each node in GB"
}

# Maps to `os_disk.storage_account_type`.
variable "node_disk_type" {
  type        = string
  default     = "Premium_LRS"
  description = "Storage account type for each node (Standard_LRS, Premium_LRS)"
}
# =============================================================================
# SSH CONFIGURATION
# =============================================================================
# Admin account of the nodes; also passed to the cloud-init template.
variable "ssh_user" {
  type        = string
  default     = "ubuntu"
  description = "SSH user for the nodes"
}

# Public key file; the path is expanded with pathexpand() and the file is
# read at plan time.
variable "ssh_public_key_path" {
  type        = string
  default     = "~/.ssh/id_rsa.pub"
  description = "Path to the SSH public key"
}
# =============================================================================
# AUTO SCALING CONFIGURATION
# =============================================================================
# Toggles creation of the autoscale setting. It also selects the
# "auto"/"manual" segment of the scale set name.
variable "enable_autoscaling" {
  description = "Enable automatic scaling based on CPU metrics"
  type        = bool
  default     = true
}

# Threshold of the "GreaterThan" rule on the "Percentage CPU" metric.
variable "scale_out_cpu_threshold" {
  description = "CPU threshold percentage to trigger scale out"
  type        = number
  default     = 75

  # A percentage outside 0-100 can never be crossed in a meaningful way;
  # reject it at plan time. coalesce() keeps the check null-tolerant so an
  # explicit null input is handled exactly as before this check existed.
  validation {
    condition     = coalesce(var.scale_out_cpu_threshold, 75) >= 0 && coalesce(var.scale_out_cpu_threshold, 75) <= 100
    error_message = "The scale_out_cpu_threshold value must be a percentage between 0 and 100."
  }
}

# Threshold of the "LessThan" rule on the "Percentage CPU" metric.
variable "scale_in_cpu_threshold" {
  description = "CPU threshold percentage to trigger scale in"
  type        = number
  default     = 25

  # Same range check as for the scale-out threshold.
  validation {
    condition     = coalesce(var.scale_in_cpu_threshold, 25) >= 0 && coalesce(var.scale_in_cpu_threshold, 25) <= 100
    error_message = "The scale_in_cpu_threshold value must be a percentage between 0 and 100."
  }
}
# =============================================================================
# INSTANCE REPAIR CONFIGURATION
# =============================================================================
# NOTE(review): the two instance-repair variables below are declared but not
# referenced by the scale set resource in this module — confirm whether the
# `automatic_instance_repair` wiring is still pending.
variable "enable_automatic_instance_repair" {
  description = "Enable automatic instance repair for failed VMs"
  type        = bool
  default     = true
}

# The value is an ISO 8601 duration string (the default "PT30M" is
# 30 minutes), not a bare number of minutes.
variable "automatic_instance_repair_grace_period" {
  description = "Grace period for automatic instance repair, as an ISO 8601 duration (e.g. PT30M for 30 minutes)"
  type        = string
  default     = "PT30M"
}

# Passed straight to the scale set's `upgrade_mode` argument.
variable "upgrade_mode" {
  description = "Upgrade mode for the scale set (Manual, Automatic, Rolling)"
  type        = string
  default     = "Manual"
}

356
providers/azure/README.md Normal file
View File

@@ -0,0 +1,356 @@
# Azure Provider for Kamaji Node Pools
Ready-to-use Terraform implementation for creating multiple Kubernetes worker node pools on Microsoft Azure using Virtual Machine Scale Sets (VMSS).
## Features
- **Multiple Node Pools** with different configurations per pool
- **Virtual Machine Scale Sets** with automatic scaling per pool
- **CPU-based autoscaling** with configurable thresholds per pool
- **Automatic instance repair** for failed VMs (configurable per pool)
- **Network Security Groups** with Kubernetes-optimized rules
- **Ubuntu 24.04 LTS** support
- **Bootstrap token integration** with automatic cluster joining via YAKI
## Prerequisites
### Required Infrastructure
Before using this provider, you must have:
1. **Azure Resource Group** - Where all resources will be created
2. **Virtual Network (VNet)** - Network for the node pools
3. **Subnet** - Subnet within the VNet for VM instances
4. **Kamaji tenant cluster** - Running cluster with accessible kubeconfig
### Required Tools
- Terraform >= 1.0
- Azure CLI >= 2.0 (authenticated)
- SSH key pair for node access
- [direnv](https://direnv.net/) (optional, for automatic environment management)
## Quick Start
### 1. Authentication
```bash
# Login to Azure
az login
# Set subscription (if needed)
az account set --subscription "your-subscription-id"
# Verify access
az account show
```
### 2. Environment Configuration
**Option A: Using .envrc (Recommended)**
```bash
# Install direnv (if not already installed)
# macOS: brew install direnv
# Ubuntu: apt install direnv
# Add direnv hook to your shell
echo 'eval "$(direnv hook bash)"' >> ~/.bashrc # for bash
echo 'eval "$(direnv hook zsh)"' >> ~/.zshrc # for zsh
# Copy and edit .envrc file
cp .envrc.sample .envrc
vim .envrc
# Allow direnv to load the environment
direnv allow
```
**Option B: Using tfvars file**
```bash
# Copy sample configuration
cp main.auto.tfvars.sample main.auto.tfvars
# Edit configuration - MAKE SURE TO SET azure_subscription_id
vim main.auto.tfvars
```
### 3. Deploy
```bash
# Initialize Terraform
terraform init
# Review plan
terraform plan
# Apply configuration
terraform apply
```
## Configuration
### Required Variables
```hcl
# Azure subscription (REQUIRED - no default)
azure_subscription_id = "your-subscription-id"
# Cluster identity
tenant_cluster_name = "your-cluster-name"
# Bootstrap configuration
tenant_kubeconfig_path = "/path/to/your/cluster.kubeconfig"
# Azure infrastructure (must exist)
azure_resource_group_name = "your-resource-group"
azure_vnet_name = "your-vnet"
azure_subnet_name = "your-subnet"
# Node pools configuration
node_pools = [
{
name = "default"
size = 3
node_disk_size = 30
vm_size = "Standard_D2s_v3"
# ... other pool-specific settings
}
]
```
### Node Pool Configuration
Each node pool supports the following configuration options:
```hcl
node_pools = [
{
# Required fields
name = "pool-name" # Unique name for this pool
size = 3 # Number of nodes in the pool
node_disk_size = 30 # Disk size in GB
vm_size = "Standard_D2s_v3" # Azure VM size
# Optional fields with defaults
min_size = 1 # Minimum nodes for autoscaling
max_size = 9 # Maximum nodes for autoscaling
node_disk_type = "Premium_LRS" # Storage type
assign_public_ip = true # Assign public IPs
enable_autoscaling = false # Enable CPU-based autoscaling
scale_out_cpu_threshold = 75 # CPU threshold to scale out
scale_in_cpu_threshold = 25 # CPU threshold to scale in
enable_automatic_instance_repair = false # Enable automatic repair
automatic_instance_repair_grace_period = "PT30M" # Repair grace period
upgrade_mode = "Manual" # Scale set upgrade mode
}
]
```
### Multiple Node Pools Example
```hcl
node_pools = [
{
name = "default"
size = 3
node_disk_size = 50
vm_size = "Standard_D2s_v3"
assign_public_ip = true
enable_autoscaling = false
},
{
name = "system"
size = 2
node_disk_size = 100
vm_size = "Standard_D4s_v3"
assign_public_ip = false
enable_autoscaling = true
scale_out_cpu_threshold = 80
scale_in_cpu_threshold = 30
enable_automatic_instance_repair = true
}
]
```
### Environment Variable Options
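You can set any Terraform variable through a `TF_VAR_<name>` environment variable. Note that environment variables have the lowest precedence: values in `*.tfvars` / `*.auto.tfvars` files and `-var` flags take priority over them: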
```bash
# Using .envrc file
export TF_VAR_azure_subscription_id="your-subscription-id"
export TF_VAR_azure_location="italynorth"
export TF_VAR_tenant_cluster_name="customer-azure"
export TF_VAR_tenant_kubeconfig_path="/path/to/kubeconfig"
# Or set Azure provider environment variables
export ARM_SUBSCRIPTION_ID="your-subscription-id"
export ARM_CLIENT_ID="your-service-principal-id" # if using service principal
export ARM_CLIENT_SECRET="your-service-principal-secret" # if using service principal
export ARM_TENANT_ID="your-tenant-id" # if using service principal
```
### Infrastructure Setup
If you haven't created the required Azure infrastructure:
```bash
# Create resource group
az group create --name "kamaji" --location "italynorth"
# Create virtual network with subnet
az network vnet create \
--resource-group "kamaji" \
--name "kamaji-vnet" \
--address-prefix "10.10.0.0/16" \
--subnet-name "kamaji-subnet" \
--subnet-prefix "10.10.10.0/24" \
--location "italynorth"
```
## Scaling Configuration
Each node pool supports both manual and automatic scaling modes:
### Manual Scaling (enable_autoscaling = false)
- **Terraform Control**: Direct control over instance count via `size`
- **No Autoscaler**: Azure autoscaler is disabled for this pool
- **Use Case**: When you want predictable, manual control over node count
- **Scaling**: Change `size` in tfvars and run `terraform apply`
### Automatic Scaling (enable_autoscaling = true)
- **CPU-Based**: Automatic scaling based on CPU utilization
- **Scale Out**: When average CPU > threshold for 5 minutes
- **Scale In**: When average CPU < threshold for 5 minutes
- **Cooldown**: 1 minute between scaling actions
- **Range**: min_size to max_size instances
- **Note**: `size` sets the default capacity; actual scaling is managed by Azure
## Security
### Network Security Groups
Automatically creates security group rules for:
- **Outbound**: Allow all outbound traffic
- **SSH**: Allow inbound SSH (port 22) from anywhere
- **Cluster Internal**: Allow all traffic within the subnet
### VM Security
- SSH key-based authentication (password disabled)
- Premium SSD storage by default
- Automatic security updates via cloud-init
## Monitoring
### Instance Health
- Automatic instance repair configurable per pool
- Failed instances can be manually replaced or automatically via Azure policies
### Scaling Metrics
- CPU percentage monitoring every minute
- 5-minute evaluation windows for scaling decisions
- Configurable thresholds per pool for scale-out/scale-in
## Troubleshooting
### Common Issues
1. **Subscription ID Error**
```bash
# Make sure subscription ID is set
export TF_VAR_azure_subscription_id="your-subscription-id"
# Or update main.auto.tfvars
```
2. **Authentication Errors**
```bash
az login
az account set --subscription "your-subscription-id"
```
3. **Resource Group Not Found**
```bash
az group create --name "kamaji" --location "italynorth"
```
4. **VNet/Subnet Not Found**
```bash
az network vnet create --resource-group "kamaji" --name "kamaji-vnet" --address-prefix "10.10.0.0/16" --subnet-name "kamaji-subnet" --subnet-prefix "10.10.10.0/24"
```
5. **Kubeconfig Issues**
- Verify kubeconfig path exists
- Test cluster connectivity: `kubectl --kubeconfig=/path/to/config get nodes`
### Debugging
```bash
# Enable detailed logging
export TF_LOG=DEBUG
# Check Azure resources
az resource list --resource-group "kamaji" --output table
# Verify VM scale sets (one per pool)
az vmss list --resource-group "kamaji" --output table
```
## Outputs
After successful deployment:
```bash
# View outputs
terraform output
# Get cluster information
terraform output cluster_info
# Get all node pools details
terraform output node_pools
```
Example output structure:
```bash
cluster_info = {
"azure_location" = "italynorth"
"node_pools_count" = 2
"node_pools_names" = ["default", "system"]
"tenant_cluster_name" = "customer-azure"
}
node_pools = {
"default" = {
"autoscale_settings" = { ... }
"network_security_group" = { ... }
"vmss_details" = { ... }
}
"system" = {
"autoscale_settings" = { ... }
"network_security_group" = { ... }
"vmss_details" = { ... }
}
}
```
## Cleanup
```bash
# Destroy infrastructure
terraform destroy
# Confirm removal
az resource list --resource-group "kamaji" --output table
```
## Support
- **Documentation**: [Kamaji Documentation](https://kamaji.clastix.io)
- **Bootstrap Script**: [YAKI](https://goyaki.clastix.io)
- **Issues**: Report issues in the project repository

View File

@@ -0,0 +1,9 @@
# =============================================================================
# TERRAFORM BACKEND
# =============================================================================
# State is stored with the "local" backend in tfstate/terraform.tfstate.
terraform {
  backend "local" {
    path = "tfstate/terraform.tfstate"
  }
}

View File

@@ -0,0 +1,47 @@
# -----------------------------------------------------------------------------
# Azure
# -----------------------------------------------------------------------------
azure_subscription_id      = "" # REQUIRED: Azure subscription ID
azure_location             = "" # Region, e.g. "eastus", "westeurope", "italynorth"
azure_resource_group_name  = "" # Resource group, e.g. "kamaji"
azure_vnet_name            = "" # Virtual network, e.g. "kamaji-vnet"
azure_subnet_name          = "" # Subnet, e.g. "kamaji-subnet"
vnet_subnet_address_prefix = "" # Subnet CIDR, e.g. "10.10.10.0/24"

# -----------------------------------------------------------------------------
# SSH
# -----------------------------------------------------------------------------
ssh_user            = "" # Login user, e.g. "ubuntu", "azureuser"
ssh_public_key_path = "" # Public key file, e.g. "~/.ssh/id_rsa.pub"

# -----------------------------------------------------------------------------
# Tenant cluster
# -----------------------------------------------------------------------------
tenant_cluster_name    = "" # Name of the tenant cluster
tenant_kubeconfig_path = "" # Kubeconfig file, e.g. "~/.kube/config"

# -----------------------------------------------------------------------------
# Node pools
# -----------------------------------------------------------------------------
# Tip: list the VM sizes available in a region with
#   az vm list-sizes --location italynorth --output table
node_pools = [
  {
    name                                   = ""    # Pool name, e.g. "default", "workers"
    size                                   = 0     # Number of nodes in the pool
    node_disk_size                         = 0     # Disk size of each node, in GB
    vm_size                                = ""    # VM size, e.g. "Standard_D2s_v3", "Standard_D4s_v3"
    min_size                               = 0     # Minimum node count (autoscaling)
    max_size                               = 0     # Maximum node count (autoscaling)
    node_disk_type                         = ""    # Storage type (Standard_LRS, Premium_LRS)
    assign_public_ip                       = false # Assign public IP addresses to the nodes
    enable_autoscaling                     = false # Turn on CPU-based autoscaling
    scale_out_cpu_threshold                = 0     # CPU percentage that triggers scale out
    scale_in_cpu_threshold                 = 0     # CPU percentage that triggers scale in
    enable_automatic_instance_repair       = false # Turn on automatic instance repair
    automatic_instance_repair_grace_period = ""    # Grace period, e.g. "PT30M"
    upgrade_mode                           = ""    # Manual, Automatic or Rolling
  },
  # Add more node pools here as needed.
]

# -----------------------------------------------------------------------------
# Tags applied to the Azure resources
# -----------------------------------------------------------------------------
tags = {
  "ManagedBy"   = "" # Who manages the resources, e.g. "Terraform", "Clastix"
  "CreatedBy"   = "" # What created the resources, e.g. "Terraform"
  "Environment" = "" # Environment, e.g. "dev", "staging", "prod"
  "Project"     = "" # Project, e.g. "Kamaji", "MyProject"
}

86
providers/azure/main.tf Normal file
View File

@@ -0,0 +1,86 @@
# =============================================================================
# PROVIDERS
# =============================================================================
# Kubernetes provider: reaches the tenant cluster through the kubeconfig file
# whose path is supplied in var.tenant_kubeconfig_path.
provider "kubernetes" {
  config_path = var.tenant_kubeconfig_path
}
# Azure Resource Manager provider, bound to the subscription given in
# var.azure_subscription_id. The empty features block carries no overrides.
provider "azurerm" {
  features {}

  subscription_id = var.azure_subscription_id
}
# =============================================================================
# BOOTSTRAP TOKEN
# =============================================================================
# Shared bootstrap-token module. Its join_cmd output is handed to the node
# pools as their runcmd and is also exposed through the bootstrap_token output.
module "bootstrap_token" {
  source = "../../modules/bootstrap-token"

  # Kubeconfig of the tenant cluster
  kubeconfig_path = var.tenant_kubeconfig_path

  # URL of the YAKI bootstrap script
  yaki_url = var.yaki_url
}
# =============================================================================
# NODE POOLS
# =============================================================================
module "azure_node_pools" {
  source = "../../modules/azure-node-pool"

  # One module instance per entry of var.node_pools, keyed by the pool name
  for_each = { for np in var.node_pools : np.name => np }

  # ---------------------------------------------------------------------------
  # Settings shared by every pool
  # ---------------------------------------------------------------------------
  tenant_cluster_name = var.tenant_cluster_name
  tags                = var.tags

  # Azure location and networking
  azure_location             = var.azure_location
  azure_resource_group_name  = var.azure_resource_group_name
  azure_vnet_name            = var.azure_vnet_name
  azure_subnet_name          = var.azure_subnet_name
  vnet_subnet_address_prefix = var.vnet_subnet_address_prefix

  # VM image reference
  vm_image_publisher = var.vm_image_publisher
  vm_image_offer     = var.vm_image_offer
  vm_image_sku       = var.vm_image_sku
  vm_image_version   = var.vm_image_version

  # SSH access
  ssh_user            = var.ssh_user
  ssh_public_key_path = var.ssh_public_key_path

  # ---------------------------------------------------------------------------
  # Settings taken from the current pool object
  # ---------------------------------------------------------------------------
  # Pool identity and sizing
  pool_name     = each.value.name
  pool_size     = each.value.size
  pool_min_size = each.value.min_size
  pool_max_size = each.value.max_size

  # Virtual machine shape
  vm_size          = each.value.vm_size
  node_disk_size   = each.value.node_disk_size
  node_disk_type   = each.value.node_disk_type
  assign_public_ip = each.value.assign_public_ip

  # Autoscaling
  enable_autoscaling      = each.value.enable_autoscaling
  scale_out_cpu_threshold = each.value.scale_out_cpu_threshold
  scale_in_cpu_threshold  = each.value.scale_in_cpu_threshold

  # Instance repair and upgrade policy
  enable_automatic_instance_repair       = each.value.enable_automatic_instance_repair
  automatic_instance_repair_grace_period = each.value.automatic_instance_repair_grace_period
  upgrade_mode                           = each.value.upgrade_mode

  # Join command produced by the bootstrap-token module, used to bootstrap nodes
  runcmd = module.bootstrap_token.join_cmd

  # Explicit ordering after the bootstrap-token module.
  # NOTE(review): runcmd above already references module.bootstrap_token, which
  # gives Terraform an implicit dependency - confirm whether this explicit
  # depends_on is still required.
  depends_on = [module.bootstrap_token]
}

View File

@@ -0,0 +1,40 @@
# =============================================================================
# BOOTSTRAP TOKEN OUTPUTS
# =============================================================================
# Join command generated by the bootstrap-token module; flagged sensitive so
# Terraform redacts it in CLI output.
output "bootstrap_token" {
  sensitive   = true
  description = "Bootstrap token details"

  value = {
    join_cmd = module.bootstrap_token.join_cmd
  }
}
# =============================================================================
# AZURE NODE POOLS OUTPUTS
# =============================================================================
# Map keyed by pool name that re-exports three outputs of each
# azure-node-pool module instance.
output "node_pools" {
  description = "Azure node pools details"

  value = {
    for pool_key, pool_module in module.azure_node_pools :
    pool_key => {
      vmss_details           = pool_module.vmss_details
      autoscale_settings     = pool_module.autoscale_settings
      network_security_group = pool_module.network_security_group
    }
  }
}
# =============================================================================
# CLUSTER INFORMATION
# =============================================================================
# Summary built purely from input variables: cluster name, Azure location,
# and the count and names of the configured node pools.
output "cluster_info" {
  description = "Cluster information"

  value = {
    tenant_cluster_name = var.tenant_cluster_name
    azure_location      = var.azure_location

    # Derived from the node_pools input list
    node_pools_count = length(var.node_pools)
    node_pools_names = [for np in var.node_pools : np.name]
  }
}

144
providers/azure/vars.tf Normal file
View File

@@ -0,0 +1,144 @@
# =============================================================================
# CLUSTER CONFIGURATION
# =============================================================================
# Required: no default is provided.
variable "tenant_cluster_name" {
  type        = string
  description = "Name of the tenant cluster"
}
# Consumed by the kubernetes provider and by the bootstrap-token module.
variable "tenant_kubeconfig_path" {
  type        = string
  default     = "~/.kube/config"
  description = "Path to the kubeconfig file for the tenant cluster"
}
# =============================================================================
# BOOTSTRAP CONFIGURATION
# =============================================================================
# Forwarded unchanged to the bootstrap-token module.
variable "yaki_url" {
  type        = string
  default     = "https://goyaki.clastix.io"
  description = "URL to the YAKI script for node bootstrapping"
}
# =============================================================================
# NODE POOL CONFIGURATION
# =============================================================================
# Node pool definitions. One azure-node-pool module instance is created per
# element, keyed by the pool name, so names must be unique within the list.
variable "node_pools" {
  description = "List of Azure node pools with their configurations"

  type = list(object({
    # Required attributes
    name           = string # Name of the node pool; used as the for_each key
    size           = number # Number of nodes in the pool
    node_disk_size = number # Disk size for each node (in GB)
    vm_size        = string # Azure VM size

    # Optional attributes (the second argument of optional() is the default)
    min_size                               = optional(number, 1)             # Minimum number of nodes (for autoscaling)
    max_size                               = optional(number, 9)             # Maximum number of nodes (for autoscaling)
    node_disk_type                         = optional(string, "Premium_LRS") # Storage type
    assign_public_ip                       = optional(bool, true)            # Whether to assign public IP addresses
    enable_autoscaling                     = optional(bool, false)           # Enable CPU-based autoscaling
    scale_out_cpu_threshold                = optional(number, 75)            # CPU percentage that triggers scale out
    scale_in_cpu_threshold                 = optional(number, 25)            # CPU percentage that triggers scale in
    enable_automatic_instance_repair       = optional(bool, false)           # Enable automatic instance repair
    automatic_instance_repair_grace_period = optional(string, "PT30M")       # Repair grace period
    upgrade_mode                           = optional(string, "Manual")      # Upgrade mode (Manual, Automatic, Rolling)
  }))

  # Duplicate names would collide when the list is turned into a map keyed by
  # pool name; reject them here so the error points at the offending input.
  validation {
    condition     = length(distinct([for pool in var.node_pools : pool.name])) == length(var.node_pools)
    error_message = "Each node pool must have a unique name because the name is used as the key of its module instance."
  }
}
# =============================================================================
# AZURE CONFIGURATION
# =============================================================================
# Required: no default is provided. Passed to the azurerm provider block.
variable "azure_subscription_id" {
  type        = string
  description = "Azure subscription ID"
}
# Forwarded to every node pool and reported in the cluster_info output.
variable "azure_location" {
  type        = string
  default     = "italynorth"
  description = "Azure region where resources will be created"
}
# Forwarded to every node pool module instance.
variable "azure_resource_group_name" {
  type        = string
  default     = "kamaji"
  description = "Name of the Azure resource group"
}
# Forwarded to every node pool module instance.
variable "azure_vnet_name" {
  type        = string
  default     = "kamaji-vnet"
  description = "Name of the Azure virtual network"
}
# Forwarded to every node pool module instance.
variable "azure_subnet_name" {
  type        = string
  default     = "kamaji-subnet"
  description = "Name of the Azure subnet"
}
# Forwarded to every node pool module instance.
variable "vnet_subnet_address_prefix" {
  type        = string
  default     = "10.10.10.0/24"
  description = "CIDR block for the Azure subnet"
}
# =============================================================================
# VM IMAGE CONFIGURATION
# =============================================================================
# Image reference, part 1 of 4 (publisher / offer / SKU / version).
variable "vm_image_publisher" {
  type        = string
  default     = "Canonical"
  description = "Azure VM image publisher"
}
# Image reference, part 2 of 4 (publisher / offer / SKU / version).
variable "vm_image_offer" {
  type        = string
  default     = "ubuntu-24_04-lts"
  description = "Azure VM image offer"
}
# Image reference, part 3 of 4 (publisher / offer / SKU / version).
variable "vm_image_sku" {
  type        = string
  default     = "server"
  description = "Azure VM image SKU"
}
# Image reference, part 4 of 4 (publisher / offer / SKU / version).
variable "vm_image_version" {
  type        = string
  default     = "latest"
  description = "Azure VM image version"
}
# =============================================================================
# SSH CONFIGURATION
# =============================================================================
# Forwarded to every node pool module instance.
variable "ssh_user" {
  type        = string
  default     = "ubuntu"
  description = "SSH user for node access"
}
# Forwarded to every node pool module instance.
variable "ssh_public_key_path" {
  type        = string
  default     = "~/.ssh/id_rsa.pub"
  description = "Path to the SSH public key"
}
# =============================================================================
# TAGS
# =============================================================================
# String-to-string map forwarded to every node pool module instance.
variable "tags" {
  type        = map(string)
  description = "Tags applied to Azure resources"

  default = {
    ManagedBy = "Clastix"
    CreatedBy = "Terraform"
  }
}

View File

@@ -0,0 +1,24 @@
# Terraform core and provider version requirements.
terraform {
  # optional() object attributes with default values (as used by the
  # node_pools variable type) are only supported from Terraform 1.3 onwards,
  # so older releases are excluded here instead of failing on a type error.
  # NOTE(review): assumes this file sits in the same module as that variable
  # declaration - confirm.
  required_version = ">= 1.3"

  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = ">= 3.0"
    }

    cloudinit = {
      source  = "hashicorp/cloudinit"
      version = ">= 2.0"
    }

    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = ">= 2.0"
    }

    tls = {
      source  = "hashicorp/tls"
      version = ">= 3.0"
    }
  }
}