autoscaler fixes

This commit is contained in:
Serge Logvinov
2022-12-26 20:19:00 +02:00
parent c37b1135ac
commit f8327cb3cf
12 changed files with 119 additions and 95 deletions

View File

@@ -9,7 +9,7 @@ The goal is to create all cloud services from scratch.
|---|---|---|---|---|
| [Azure](azure) | 1.3.0 | CCM,CSI,Autoscaler | many regions, many zones | ✓ |
| [Exoscale](exoscale) | 1.3.0 | CCM,Autoscaler | many regions | ✗ |
| [GCP](gcp-zonal) | 0.14.0 | CCM,CSI,Autoscaler | one region, many zones | ✓ |
| [GCP](gcp-zonal) | 1.3.0 | CCM,CSI,Autoscaler | one region, many zones | ✓ |
| [Hetzner](hetzner) | 1.3.0 | CCM,CSI,Autoscaler | many regions | ✗ |
| [Openstack](openstack) | 1.3.0 | CCM,CSI | many regions, many zones | ✓ |
| [Oracle](oracle) | 1.3.0 | CCM,~~CSI~~,Autoscaler | one region, many zones | ✓ |

5
gcp-zonal/README.md Normal file
View File

@@ -0,0 +1,5 @@
* https://console.cloud.google.com/apis/library/iam.googleapis.com
* https://console.cloud.google.com/apis/library/cloudresourcemanager.googleapis.com
* https://console.cloud.google.com/apis/library/compute.googleapis.com
* https://console.cloud.google.com/apis/library/servicenetworking.googleapis.com

View File

@@ -1,5 +1,5 @@
data "google_client_openid_userinfo" "terraform" {}
# data "google_client_openid_userinfo" "terraform" {}
data "google_compute_image" "talos" {
project = local.project

View File

@@ -60,7 +60,7 @@ data:
# Enable IPv6 addressing. If enabled, all endpoints are allocated an IPv6
# address.
enable-ipv6: "false"
enable-ipv6: "true"
# Users who wish to specify their own custom CNI configuration file must set
# custom-cni-conf to "true", otherwise Cilium may overwrite the configuration.
custom-cni-conf: "false"
@@ -156,6 +156,7 @@ data:
enable-l2-neigh-discovery: "true"
arping-refresh-period: "30s"
k8s-require-ipv4-pod-cidr: "true"
k8s-require-ipv6-pod-cidr: "true"
enable-endpoint-health-checking: "true"
enable-health-checking: "true"
enable-well-known-identities: "false"
@@ -813,7 +814,7 @@ spec:
metadata:
annotations:
# ensure pods roll when configmap updates
cilium.io/cilium-configmap-checksum: "16ad070d5b63bf306024e40a412abddb22eac12ab78ce8b3f821045c84af75f5"
cilium.io/cilium-configmap-checksum: "1fba38b457dca54f2e3f842507d9f35143a396529e557ad5ad7fe557e72be995"
labels:
io.cilium/app: operator
name: cilium-operator

View File

@@ -35,14 +35,14 @@ ipam:
mode: "kubernetes"
k8s:
requireIPv4PodCIDR: true
# requireIPv6PodCIDR: true
requireIPv6PodCIDR: true
bpf:
masquerade: false
ipv4:
enabled: true
ipv6:
enabled: false
enabled: true
hostServices:
enabled: true
hostPort:

View File

@@ -291,21 +291,19 @@ spec:
dnsPolicy: "ClusterFirst"
containers:
- name: gce-cluster-autoscaler
image: "k8s.gcr.io/autoscaling/cluster-autoscaler:v1.23.0"
image: "k8s.gcr.io/autoscaling/cluster-autoscaler:v1.26.0"
imagePullPolicy: "IfNotPresent"
command:
- ./cluster-autoscaler
- --cloud-provider=gce
- --namespace=kube-system
- --nodes=0:2:https://www.googleapis.com/compute/v1/projects/metal-dimension/zones/
- --nodes=0:2:https://www.googleapis.com/compute/v1/projects/metal-dimension/zones/
- --nodes=0:2:https://www.googleapis.com/compute/v1/projects//zones/
- --nodes=0:2:https://www.googleapis.com/compute/v1/projects//zones/
- --cloud-config=/etc/gce/gce.conf
- --logtostderr=true
- --regional=true
- --stderrthreshold=info
- --v=4
env:
- name: GOOGLE_APPLICATION_CREDENTIALS
value: "/etc/gce/cloud-sa.json"
livenessProbe:
httpGet:
path: /health-check
@@ -313,10 +311,15 @@ spec:
ports:
- containerPort: 8085
resources:
{}
limits:
cpu: 100m
memory: 300Mi
requests:
cpu: 100m
memory: 300Mi
volumeMounts:
- name: google-cloud-key
mountPath: /etc/gce
- mountPath: /etc/gce
name: config
readOnly: true
nodeSelector:
node-role.kubernetes.io/control-plane: ""
@@ -326,9 +329,7 @@ spec:
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
volumes:
- name: google-cloud-key
secret:
secretName: cloud-autoscaler-gcp
items:
- key: cloud-sa.json
path: cloud-sa.json
- configMap:
defaultMode: 420
name: gcp-cloud-controller-manager
name: config

View File

@@ -3,14 +3,17 @@
#
fullnameOverride: cluster-autoscaler-gcp
image:
tag: v1.26.0
cloudProvider: gce
cloudConfigPath: /etc/gce/gce.conf
autoscalingGroups:
- name: https://www.googleapis.com/compute/v1/projects/metal-dimension/zones/
- name: https://www.googleapis.com/compute/v1/projects//zones/
minSize: 0
maxSize: 2
- name: https://www.googleapis.com/compute/v1/projects/metal-dimension/zones/
- name: https://www.googleapis.com/compute/v1/projects//zones/
minSize: 0
maxSize: 2
@@ -20,19 +23,38 @@ extraArgs:
stderrthreshold: info
v: 4
extraEnv:
GOOGLE_APPLICATION_CREDENTIALS: /etc/gce/cloud-sa.json
# extraEnv:
# GOOGLE_APPLICATION_CREDENTIALS: /etc/gce/cloud-sa.json
# extraVolumeSecrets:
# google-cloud-key:
# name: cloud-autoscaler-gcp
# mountPath: /etc/gce
# items:
# - key: cloud-sa.json
# path: cloud-sa.json
extraVolumeSecrets:
google-cloud-key:
name: cloud-autoscaler-gcp
mountPath: /etc/gce
items:
- key: cloud-sa.json
path: cloud-sa.json
extraArgs:
cloud-config: /etc/gce/gce.conf
extraVolumeMounts:
- mountPath: /etc/gce
name: config
readOnly: true
extraVolumes:
- configMap:
defaultMode: 420
name: gcp-cloud-controller-manager
name: config
priorityClassName: system-cluster-critical
resources:
limits:
cpu: 100m
memory: 300Mi
requests:
cpu: 100m
memory: 300Mi
nodeSelector:
node-role.kubernetes.io/control-plane: ""
node.cloudprovider.kubernetes.io/platform: gcp

View File

@@ -11,22 +11,16 @@ resource "google_service_account_iam_member" "terraform" {
member = "serviceAccount:${google_service_account.terraform.email}"
}
resource "google_project_iam_binding" "terraform" {
resource "google_project_iam_member" "terraform" {
project = var.project
role = "roles/editor"
members = [
"serviceAccount:${google_service_account.terraform.email}",
]
member = "serviceAccount:${google_service_account.terraform.email}"
}
resource "google_project_iam_binding" "terraform_networksAdmin" {
resource "google_project_iam_member" "terraform_networksAdmin" {
project = var.project
role = "roles/servicenetworking.networksAdmin"
members = [
"serviceAccount:${google_service_account.terraform.email}",
]
member = "serviceAccount:${google_service_account.terraform.email}"
# condition {
# title = "ExpiresAfter_2023_12_31"
@@ -35,20 +29,14 @@ resource "google_project_iam_binding" "terraform_networksAdmin" {
# }
}
resource "google_project_iam_binding" "terraform_saAdmin" {
resource "google_project_iam_member" "terraform_saAdmin" {
project = var.project
role = "roles/iam.serviceAccountAdmin"
members = [
"serviceAccount:${google_service_account.terraform.email}",
]
member = "serviceAccount:${google_service_account.terraform.email}"
}
resource "google_project_iam_binding" "terraform_iamAdmin" {
resource "google_project_iam_member" "terraform_iamAdmin" {
project = var.project
role = "roles/iam.securityAdmin"
members = [
"serviceAccount:${google_service_account.terraform.email}",
]
member = "serviceAccount:${google_service_account.terraform.email}"
}

View File

@@ -36,20 +36,12 @@ resource "google_project_iam_custom_role" "autoscaler" {
"compute.instanceGroupManagers.get",
"compute.instanceGroupManagers.list",
"compute.instanceGroupManagers.update",
"compute.instanceGroups.update",
"compute.instanceTemplates.get",
"compute.instanceTemplates.list",
"compute.machineTypes.get",
"compute.machineTypes.list",
"compute.instances.setLabels",
"compute.instances.setMetadata",
"compute.instances.setTags",
"compute.instances.create",
"compute.disks.create",
"compute.disks.setLabels",
"compute.images.useReadOnly",
"compute.subnetworks.use",
# "compute.instances.*",
"compute.healthChecks.get",
"compute.httpsHealthChecks.get",
"servicemanagement.services.get",
"servicemanagement.services.list",
]

View File

@@ -4,30 +4,17 @@ resource "google_service_account" "controlplane" {
display_name = "A service account for controlplane instances"
}
# resource "google_project_iam_member" "ccm_sa" {
# project = local.project
# role = "roles/compute.serviceAgent"
# member = "serviceAccount:${google_service_account.controlplane.email}"
# }
resource "google_project_iam_member" "ccm" {
resource "google_project_iam_member" "controlplane_ccm" {
project = local.project
role = "projects/${local.project}/roles/KubeCCM"
member = "serviceAccount:${google_service_account.controlplane.email}"
}
# resource "google_project_iam_member" "ccm_autoscaler" {
# project = local.project
# role = "projects/${local.project}/roles/KubeClusterAutoscaler"
# member = "serviceAccount:${google_service_account.controlplane.email}"
# }
# resource "google_project_iam_member" "ccm_autoscaler_roles" {
# project = local.project
# role = "roles/viewer"
# member = "serviceAccount:${google_service_account.controlplane.email}"
# }
# Grants the controlplane service account the project-level custom role
# "KubeClusterAutoscaler" (role path is built from local.project).
# NOTE(review): presumably this lets the cluster autoscaler running on
# control-plane nodes use the instance's own service account instead of a
# mounted key file — confirm against the autoscaler deployment.
resource "google_project_iam_member" "controlplane_autoscaler" {
project = local.project
role = "projects/${local.project}/roles/KubeClusterAutoscaler"
member = "serviceAccount:${google_service_account.controlplane.email}"
}
# resource "google_service_account" "csi" {
# account_id = "csi-driver"
@@ -63,14 +50,14 @@ resource "google_project_iam_member" "autoscaler" {
member = "serviceAccount:${google_service_account.autoscaler.email}"
}
resource "google_project_iam_member" "autoscaler_admin" {
project = local.project
role = "roles/compute.admin"
member = "serviceAccount:${google_service_account.autoscaler.email}"
}
# resource "google_project_iam_member" "autoscaler_admin" {
# project = local.project
# role = "roles/compute.admin"
# member = "serviceAccount:${google_service_account.autoscaler.email}"
# }
resource "google_project_iam_member" "autoscaler_roles" {
project = local.project
role = "roles/viewer"
member = "serviceAccount:${google_service_account.autoscaler.email}"
}
# resource "google_project_iam_member" "autoscaler_roles" {
# project = local.project
# role = "roles/viewer"
# member = "serviceAccount:${google_service_account.autoscaler.email}"
# }

View File

@@ -55,10 +55,10 @@ resource "google_compute_instance_group_manager" "worker_spot" {
max_unavailable_fixed = 1
replacement_method = "SUBSTITUTE"
}
# auto_healing_policies {
# health_check = google_compute_health_check.instance.id
# initial_delay_sec = 300
# }
auto_healing_policies {
health_check = google_compute_region_health_check.instance.id
initial_delay_sec = 300
}
target_pools = []
target_size = 0

View File

@@ -23,6 +23,26 @@ resource "google_compute_firewall" "common" {
protocol = "udp"
ports = ["8472"]
}
depends_on = [google_compute_network.network]
}
# Ingress firewall rule named "<cluster_name>-dhcp-v6": admits UDP traffic whose
# source is in fe80::/10 (the IPv6 link-local range) to instances tagged
# "<cluster_name>-common".
# NOTE(review): the allow block lists no `ports`, which presumably admits every
# UDP port — broader than the "dhcp" in the description. Confirm whether it
# should be narrowed to the DHCPv6 ports (546/547).
resource "google_compute_firewall" "dhcp" {
project = var.project
name = "${var.cluster_name}-dhcp-v6"
network = var.network_name
description = "Managed by terraform: Allow dhcp traffic"
priority = 910
direction = "INGRESS"
source_ranges = ["fe80::/10"]
target_tags = ["${var.cluster_name}-common"]
allow {
protocol = "udp"
}
# The network is referenced by name (var.network_name), not by resource
# attribute, so the ordering on the network resource is declared explicitly.
depends_on = [google_compute_network.network]
}
resource "google_compute_firewall" "common_health_check" {
@@ -39,6 +59,8 @@ resource "google_compute_firewall" "common_health_check" {
protocol = "tcp"
ports = ["50000"]
}
depends_on = [google_compute_network.network]
}
resource "google_compute_firewall" "controlplane" {
@@ -55,6 +77,8 @@ resource "google_compute_firewall" "controlplane" {
protocol = "tcp"
ports = ["2379", "2380", "6443", ]
}
depends_on = [google_compute_network.network]
}
resource "google_compute_firewall" "controlplane_admin" {
@@ -75,6 +99,8 @@ resource "google_compute_firewall" "controlplane_admin" {
protocol = "tcp"
ports = ["6443", "50000"]
}
depends_on = [google_compute_network.network]
}
resource "google_compute_firewall" "controlplane_health_check" {
@@ -91,6 +117,8 @@ resource "google_compute_firewall" "controlplane_health_check" {
protocol = "tcp"
ports = ["6443"]
}
depends_on = [google_compute_network.network]
}
# resource "google_compute_firewall" "web" {