[kubernetes] Add gpu-operator and introduce GPU support for tenant Kubernetes clusters

Signed-off-by: Andrei Kvapil <kvapss@gmail.com>
This commit is contained in:
Andrei Kvapil
2025-04-22 17:40:15 +02:00
parent a6b02bf381
commit 7537235f43
18 changed files with 237 additions and 29 deletions

View File

@@ -16,7 +16,7 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.18.1
version: 0.19.0
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to

View File

@@ -27,20 +27,46 @@ How to access to deployed cluster:
kubectl get secret -n <namespace> kubernetes-<clusterName>-admin-kubeconfig -o go-template='{{ printf "%s\n" (index .data "super-admin.conf" | base64decode) }}' > test
```
# Series
## Parameters
<!-- source: https://github.com/kubevirt/common-instancetypes/blob/main/README.md -->
### Common parameters
. | U | O | CX | M | RT
----------------------------|-----|-----|------|-----|------
*Has GPUs* | | | | |
*Hugepages* | | | | ✓ | ✓
*Overcommitted Memory* | | | | |
*Dedicated CPU* | | | | | ✓
*Burstable CPU performance* | ✓ | ✓ | | ✓ |
*Isolated emulator threads* | | | ✓ | | ✓
*vNUMA* | | | ✓ | | ✓
*vCPU-To-Memory Ratio* | 1:4 | 1:4 | 1:2 | 1:8 | 1:4
| Name | Description | Value |
| ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | ------------ |
| `host` | The hostname used to access the Kubernetes cluster externally (defaults to using the cluster name as a subdomain for the tenant host). | `""` |
| `controlPlane.replicas` | Number of replicas for Kubernetes control-plane components | `2` |
| `storageClass` | StorageClass used to store user data | `replicated` |
| `nodeGroups` | nodeGroups configuration | `{}` |
### Cluster Addons
| Name | Description | Value |
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `addons.certManager.enabled` | Enables the cert-manager | `false` |
| `addons.certManager.valuesOverride` | Custom values to override | `{}` |
| `addons.ingressNginx.enabled` | Enable Ingress-NGINX controller (expect nodes with 'ingress-nginx' role) | `false` |
| `addons.ingressNginx.valuesOverride` | Custom values to override | `{}` |
| `addons.ingressNginx.hosts` | List of domain names that should be passed through to the cluster by upper cluster | `[]` |
| `addons.gpuOperator.enabled` | Enables the gpu-operator | `false` |
| `addons.gpuOperator.valuesOverride` | Custom values to override | `{}` |
| `addons.fluxcd.enabled` | Enables Flux CD | `false` |
| `addons.fluxcd.valuesOverride` | Custom values to override | `{}` |
| `addons.monitoringAgents.enabled` | Enables MonitoringAgents (fluentbit, vmagents for sending logs and metrics to storage) if tenant monitoring enabled, send to tenant storage, else to root storage | `false` |
| `addons.monitoringAgents.valuesOverride` | Custom values to override | `{}` |
| `addons.verticalPodAutoscaler.valuesOverride` | Custom values to override | `{}` |
### Kamaji control plane
| Name | Description | Value |
| --------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `kamajiControlPlane.apiServer.resources` | Resources | `{}` |
| `kamajiControlPlane.apiServer.resourcesPreset` | Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production). | `small` |
| `kamajiControlPlane.controllerManager.resources` | Resources | `{}` |
| `kamajiControlPlane.controllerManager.resourcesPreset` | Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production). | `micro` |
| `kamajiControlPlane.scheduler.resources` | Resources | `{}` |
| `kamajiControlPlane.scheduler.resourcesPreset` | Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production). | `micro` |
| `kamajiControlPlane.addons.konnectivity.server.resources` | Resources | `{}` |
| `kamajiControlPlane.addons.konnectivity.server.resourcesPreset` | Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production). | `micro` |
## U Series

View File

@@ -39,6 +39,13 @@ spec:
sockets: 1
{{- end }}
devices:
{{- if .group.gpus }}
gpus:
{{- range $i, $gpu := .group.gpus }}
- name: gpu{{ add $i 1 }}
deviceName: {{ $gpu.name }}
{{- end }}
{{- end }}
disks:
- name: system
disk:

View File

@@ -4,7 +4,7 @@ metadata:
name: {{ .Release.Name }}-cert-manager-crds
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: cert-manager-crds

View File

@@ -5,7 +5,7 @@ metadata:
name: {{ .Release.Name }}-cert-manager
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: cert-manager

View File

@@ -4,7 +4,7 @@ metadata:
name: {{ .Release.Name }}-cilium
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: cilium

View File

@@ -4,7 +4,7 @@ metadata:
name: {{ .Release.Name }}-csi
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: csi

View File

@@ -38,6 +38,7 @@ spec:
{{ .Release.Name }}-ingress-nginx
{{ .Release.Name }}-fluxcd-operator
{{ .Release.Name }}-fluxcd
{{ .Release.Name }}-gpu-operator
-p '{"spec": {"suspend": true}}'
--type=merge --field-manager=flux-client-side-apply || true
---
@@ -76,6 +77,7 @@ rules:
- {{ .Release.Name }}-ingress-nginx
- {{ .Release.Name }}-fluxcd-operator
- {{ .Release.Name }}-fluxcd
- {{ .Release.Name }}-gpu-operator
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding

View File

@@ -5,7 +5,7 @@ metadata:
name: {{ .Release.Name }}-fluxcd-operator
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: fluxcd-operator
@@ -49,7 +49,7 @@ metadata:
name: {{ .Release.Name }}-fluxcd
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: fluxcd

View File

@@ -0,0 +1,57 @@
{{- if .Values.addons.gpuOperator.enabled }}
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: {{ .Release.Name }}-gpu-operator
labels:
cozystack.io/repository: system
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: gpu-operator
chart:
spec:
chart: cozy-gpu-operator
reconcileStrategy: Revision
sourceRef:
kind: HelmRepository
name: cozystack-system
namespace: cozy-system
kubeConfig:
secretRef:
name: {{ .Release.Name }}-admin-kubeconfig
key: super-admin.svc
targetNamespace: cozy-gpu-operator
storageNamespace: cozy-gpu-operator
install:
createNamespace: true
remediation:
retries: -1
upgrade:
remediation:
retries: -1
{{- if .Values.addons.gpuOperator.valuesOverride }}
valuesFrom:
- kind: Secret
name: {{ .Release.Name }}-gpu-operator-values-override
valuesKey: values
{{- end }}
dependsOn:
{{- if lookup "helm.toolkit.fluxcd.io/v2" "HelmRelease" .Release.Namespace .Release.Name }}
- name: {{ .Release.Name }}
namespace: {{ .Release.Namespace }}
{{- end }}
- name: {{ .Release.Name }}-cilium
namespace: {{ .Release.Namespace }}
{{- end }}
{{- if .Values.addons.gpuOperator.valuesOverride }}
---
apiVersion: v1
kind: Secret
metadata:
name: {{ .Release.Name }}-gpu-operator-values-override
stringData:
values: |
{{- toYaml .Values.addons.gpuOperator.valuesOverride | nindent 4 }}
{{- end }}

View File

@@ -5,7 +5,7 @@ metadata:
name: {{ .Release.Name }}-ingress-nginx
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: ingress-nginx

View File

@@ -7,7 +7,7 @@ metadata:
name: {{ .Release.Name }}-monitoring-agents
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: cozy-monitoring-agents

View File

@@ -5,7 +5,7 @@ metadata:
name: {{ .Release.Name }}-vertical-pod-autoscaler-crds
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: vertical-pod-autoscaler-crds

View File

@@ -7,7 +7,7 @@ metadata:
name: {{ .Release.Name }}-vertical-pod-autoscaler
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: vertical-pod-autoscaler

View File

@@ -5,7 +5,7 @@ metadata:
name: {{ .Release.Name }}-cozy-victoria-metrics-operator
labels:
cozystack.io/repository: system
coztstack.io/target-cluster-name: {{ .Release.Name }}
cozystack.io/target-cluster-name: {{ .Release.Name }}
spec:
interval: 5m
releaseName: cozy-victoria-metrics-operator

View File

@@ -12,7 +12,7 @@
"properties": {
"replicas": {
"type": "number",
"description": "Number of replicas for Kubernetes contorl-plane components",
"description": "Number of replicas for Kubernetes control-plane components",
"default": 2
}
}
@@ -61,6 +61,21 @@
}
}
},
"gpuOperator": {
"type": "object",
"properties": {
"enabled": {
"type": "boolean",
"description": "Enables the gpu-operator",
"default": false
},
"valuesOverride": {
"type": "object",
"description": "Custom values to override",
"default": {}
}
}
},
"fluxcd": {
"type": "object",
"properties": {
@@ -90,8 +105,93 @@
"default": {}
}
}
},
"verticalPodAutoscaler": {
"type": "object",
"properties": {
"valuesOverride": {
"type": "object",
"description": "Custom values to override",
"default": {}
}
}
}
}
},
"kamajiControlPlane": {
"type": "object",
"properties": {
"apiServer": {
"type": "object",
"properties": {
"resources": {
"type": "object",
"description": "Resources",
"default": {}
},
"resourcesPreset": {
"type": "string",
"description": "Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production).",
"default": "small"
}
}
},
"controllerManager": {
"type": "object",
"properties": {
"resources": {
"type": "object",
"description": "Resources",
"default": {}
},
"resourcesPreset": {
"type": "string",
"description": "Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production).",
"default": "micro"
}
}
},
"scheduler": {
"type": "object",
"properties": {
"resources": {
"type": "object",
"description": "Resources",
"default": {}
},
"resourcesPreset": {
"type": "string",
"description": "Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production).",
"default": "micro"
}
}
},
"addons": {
"type": "object",
"properties": {
"konnectivity": {
"type": "object",
"properties": {
"server": {
"type": "object",
"properties": {
"resources": {
"type": "object",
"description": "Resources",
"default": {}
},
"resourcesPreset": {
"type": "string",
"description": "Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production).",
"default": "micro"
}
}
}
}
}
}
}
}
}
}
}
}

View File

@@ -1,7 +1,7 @@
## @section Common parameters
## @param host The hostname used to access the Kubernetes cluster externally (defaults to using the cluster name as a subdomain for the tenant host).
## @param controlPlane.replicas Number of replicas for Kubernetes contorl-plane components
## @param controlPlane.replicas Number of replicas for Kubernetes control-plane components
## @param storageClass StorageClass used to store user data
##
host: ""
@@ -24,6 +24,14 @@ nodeGroups:
cpu: ""
memory: ""
## List of GPUs to attach (WARN: NVIDIA driver requires at least 4 GiB of RAM)
## e.g:
## instanceType: "u1.xlarge"
## gpus:
## - name: nvidia.com/AD102GL_L40S
gpus: []
## @section Cluster Addons
##
addons:
@@ -52,6 +60,14 @@ addons:
hosts: []
valuesOverride: {}
## GPU-operator: NVIDIA GPU Operator
##
gpuOperator:
## @param addons.gpuOperator.enabled Enables the gpu-operator
## @param addons.gpuOperator.valuesOverride Custom values to override
enabled: false
valuesOverride: {}
## Flux CD
##
fluxcd:
@@ -135,4 +151,4 @@ kamajiControlPlane:
## @param kamajiControlPlane.addons.konnectivity.server.resourcesPreset Set container resources according to one common preset (allowed values: none, nano, micro, small, medium, large, xlarge, 2xlarge). This is ignored if resources is set (resources is recommended for production).
resourcesPreset: "micro"

View File

@@ -59,7 +59,7 @@ kubernetes 0.16.0 077045b0
kubernetes 0.17.0 1fbbfcd0
kubernetes 0.17.1 fd240701
kubernetes 0.18.0 721c12a7
kubernetes 0.18.1 HEAD
kubernetes 0.19.0 HEAD
mysql 0.1.0 263e47be
mysql 0.2.0 c24a103f
mysql 0.3.0 53f2365e