From 5c97eeea4ace8cf236f8d9c7c516fbeeb9ffb5e7 Mon Sep 17 00:00:00 2001
From: Toboshii Nakama
Date: Mon, 17 May 2021 03:17:38 -0500
Subject: [PATCH] Nvidia madness

---
Review notes (free-form text in this area is not part of the commit):
* roles/nvidia/tasks/driver.yml: the download URL takes its version
  directory from nvidia_driver.version instead of a hard-coded 465.27, so
  the directory and the file name can no longer drift apart.
* roles/nvidia/tasks/container-runtime.yml: the file mode is quoted and
  update_cache uses a canonical boolean.
* roles/nvidia/tasks/main.yml, driver.yml, k3s-agent.yml: fully qualified
  module names; the deprecated bare "include" becomes import_tasks.
* host_vars/dfw-k8s-cuda1.yml still carries ansible_become_pass in plain
  text; consider moving it into an encrypted vault file.
* The blob ids on the "index" lines of the edited role files were not
  recomputed.

 .gitignore | 1 +
 cluster/apps/kube-system/kustomization.yaml | 1 +
 .../nvidia-device-plugin/helm-release.yaml | 25 +++++
 .../nvidia-device-plugin/kustomization.yaml | 4 +
 cluster/apps/kustomization.yaml | 1 +
 cluster/apps/media/_pvc/kustomization.yaml | 4 +
 cluster/apps/media/_pvc/nfs-media-franxx.yaml | 34 ++++++
 cluster/apps/media/plex/config-pvc.yaml | 13 +++
 cluster/apps/media/plex/helm-release.yaml | 105 ++++++++++++++++++
 cluster/apps/media/plex/kustomization.yaml | 5 +
 .../apps/media/plex/uptimerobot-monitor.yaml | 11 ++
 cluster/base/cluster-settings.yaml | 1 +
 .../charts/helm/kustomization.yaml | 1 +
 .../charts/helm/nvidia-charts.yaml | 10 ++
 .../group_vars/all/calico-settings.yml | 4 +-
 .../group_vars/all/rsyslog-settings.yml | 2 +-
 .../group_vars/gpu-nodes/nvidia-settings.yml | 9 ++
 .../home-cluster/host_vars/dfw-k8s-cuda1.yml | 16 +++
 .../ansible/inventory/home-cluster/hosts.yml | 3 +
 server/ansible/playbooks/k3s/install.yml | 13 +++
 server/ansible/playbooks/k3s/nuke.yml | 1 +
 server/ansible/playbooks/k3s/upgrade.yml | 1 +
 .../playbooks/power-outage/shutdown.yml | 1 +
 server/ansible/playbooks/ubuntu/prepare.yml | 1 +
 server/ansible/playbooks/ubuntu/upgrade.yml | 1 +
 server/ansible/requirements.yml | 2 +
 .../k3s/templates/calico-installation.yaml.j2 | 4 +-
 .../roles/nvidia/files/blacklist-nouveau.conf | 2 +
 .../roles/nvidia/files/config.toml.tmpl | 53 +++++++++
 .../roles/nvidia/tasks/container-runtime.yml | 22 ++++
 server/ansible/roles/nvidia/tasks/driver.yml | 40 +++++++
 .../ansible/roles/nvidia/tasks/k3s-agent.yml | 13 +++
 server/ansible/roles/nvidia/tasks/main.yml | 17 +++
 server/ansible/roles/nvidia/tasks/patch.yml | 18 +++
 34 files changed, 436 insertions(+), 3 deletions(-)
 create mode 100644 cluster/apps/kube-system/nvidia-device-plugin/helm-release.yaml
 create mode 100644 cluster/apps/kube-system/nvidia-device-plugin/kustomization.yaml
 create mode 100644 cluster/apps/media/_pvc/kustomization.yaml
 create mode 100644 cluster/apps/media/_pvc/nfs-media-franxx.yaml
 create mode 100644 cluster/apps/media/plex/config-pvc.yaml
 create mode 100644 cluster/apps/media/plex/helm-release.yaml
 create mode 100644 cluster/apps/media/plex/kustomization.yaml
 create mode 100644 cluster/apps/media/plex/uptimerobot-monitor.yaml
 create mode 100644 cluster/base/flux-system/charts/helm/nvidia-charts.yaml
 create mode 100644 server/ansible/inventory/home-cluster/group_vars/gpu-nodes/nvidia-settings.yml
 create mode 100644 server/ansible/inventory/home-cluster/host_vars/dfw-k8s-cuda1.yml
 create mode 100644 server/ansible/roles/nvidia/files/blacklist-nouveau.conf
 create mode 100644 server/ansible/roles/nvidia/files/config.toml.tmpl
 create mode 100644 server/ansible/roles/nvidia/tasks/container-runtime.yml
 create mode 100644 server/ansible/roles/nvidia/tasks/driver.yml
 create mode 100644 server/ansible/roles/nvidia/tasks/k3s-agent.yml
 create mode 100644 server/ansible/roles/nvidia/tasks/main.yml
 create mode 100644 server/ansible/roles/nvidia/tasks/patch.yml

diff --git a/.gitignore b/.gitignore
index ca867d58..4d558b5e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,4 +22,5 @@ terraform.tfstate*
 # Ansible
 server/ansible/inventory/e2e/hosts.yml
 xanmanning.k3s*
+nvidia.nvidia_driver*
 node_modules
\ No newline at end of file
diff --git a/cluster/apps/kube-system/kustomization.yaml b/cluster/apps/kube-system/kustomization.yaml
index 3fd74521..383136a0 100644
--- a/cluster/apps/kube-system/kustomization.yaml
+++ b/cluster/apps/kube-system/kustomization.yaml
@@ -6,4 +6,5 @@ resources:
 - metrics-server
 - node-feature-discovery
 - node-problem-detector
+- nvidia-device-plugin
 - reloader
diff --git a/cluster/apps/kube-system/nvidia-device-plugin/helm-release.yaml b/cluster/apps/kube-system/nvidia-device-plugin/helm-release.yaml
new file mode 100644
index 00000000..3c8aada9
--- /dev/null
+++ b/cluster/apps/kube-system/nvidia-device-plugin/helm-release.yaml
@@ -0,0 +1,25 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2beta1
+kind: HelmRelease
+metadata:
+  name: nvidia-device-plugin
+  namespace: kube-system
+spec:
+  interval: 5m
+  chart:
+    spec:
+      chart: nvidia-device-plugin
+      version: 0.9.0
+      sourceRef:
+        kind: HelmRepository
+        name: nvidia-charts
+        namespace: flux-system
+      interval: 5m
+  values:
+    image:
+      repository: nvcr.io/nvidia/k8s-device-plugin
+      tag: v0.9.0
+    tolerations:
+    - key: nvidia.com/gpu
+      operator: Exists
+      effect: PreferNoSchedule
\ No newline at end of file
diff --git a/cluster/apps/kube-system/nvidia-device-plugin/kustomization.yaml b/cluster/apps/kube-system/nvidia-device-plugin/kustomization.yaml
new file mode 100644
index 00000000..761d2252
--- /dev/null
+++ b/cluster/apps/kube-system/nvidia-device-plugin/kustomization.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+- helm-release.yaml
diff --git a/cluster/apps/kustomization.yaml b/cluster/apps/kustomization.yaml
index 90ee3277..03ee13e0 100644
--- a/cluster/apps/kustomization.yaml
+++ b/cluster/apps/kustomization.yaml
@@ -6,6 +6,7 @@ resources:
 - flux-system
 - kasten-io
 - kube-system
+- media
 - monitoring
 - networking
 - security
diff --git a/cluster/apps/media/_pvc/kustomization.yaml b/cluster/apps/media/_pvc/kustomization.yaml
new file mode 100644
index 00000000..ffd3520c
--- /dev/null
+++ b/cluster/apps/media/_pvc/kustomization.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+- nfs-media-franxx.yaml
diff --git a/cluster/apps/media/_pvc/nfs-media-franxx.yaml b/cluster/apps/media/_pvc/nfs-media-franxx.yaml
new file mode 100644
index 00000000..80e56837
--- /dev/null
+++ b/cluster/apps/media/_pvc/nfs-media-franxx.yaml
@@ -0,0 +1,34 @@
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: nfs-media-franxx-pv
+spec:
+  storageClassName: media-franxx
+  capacity:
+    storage: 1Mi
+  accessModes:
+  - ReadWriteMany
+  persistentVolumeReclaimPolicy: Retain
+  nfs:
+    server: 10.75.30.15
+    path: /tank/share
+  mountOptions:
+  - tcp
+  - intr
+  - hard
+  - noatime
+  - nodiratime
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: nfs-media-franxx-pvc
+  namespace: media
+spec:
+  accessModes:
+  - ReadWriteMany
+  storageClassName: media-franxx
+  resources:
+    requests:
+      storage: 1Mi
diff --git a/cluster/apps/media/plex/config-pvc.yaml b/cluster/apps/media/plex/config-pvc.yaml
new file mode 100644
index 00000000..9f7da7d1
--- /dev/null
+++ b/cluster/apps/media/plex/config-pvc.yaml
@@ -0,0 +1,13 @@
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: plex-config-v1
+  namespace: media
+spec:
+  accessModes:
+  - ReadWriteOnce
+  resources:
+    requests:
+      storage: 100Gi
+  storageClassName: rook-ceph-block
diff --git a/cluster/apps/media/plex/helm-release.yaml b/cluster/apps/media/plex/helm-release.yaml
new file mode 100644
index 00000000..320591cb
--- /dev/null
+++ b/cluster/apps/media/plex/helm-release.yaml
@@ -0,0 +1,105 @@
+---
+apiVersion: helm.toolkit.fluxcd.io/v2beta1
+kind: HelmRelease
+metadata:
+  name: plex
+  namespace: media
+spec:
+  interval: 5m
+  chart:
+    spec:
+      # renovate: registryUrl=https://k8s-at-home.com/charts/
+      chart: plex
+      version: 4.4.0
+      sourceRef:
+        kind: HelmRepository
+        name: k8s-at-home-charts
+        namespace: flux-system
+      interval: 5m
+  values:
+    image:
+      repository: ghcr.io/k8s-at-home/plex
+      tag: v1.22.3.4392-d7c624def
+    env:
+      TZ: "America/Chicago"
+    podSecurityContext:
+      supplementalGroups:
+      - 5600
+    service:
+      enabled: true
+      type: LoadBalancer
+      externalIPs:
+      - "${LB_PLEX_IP}"
+      externalTrafficPolicy: Local
+      port:
+        port: 32400
+      annotations:
+        traefik.ingress.kubernetes.io/service.serversscheme: https
+    ingress:
+      enabled: true
+      ingressClassName: "traefik"
+      annotations:
+        traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
+        external-dns/is-public: "true"
+        external-dns.alpha.kubernetes.io/target: "ipv4.${SECRET_DOMAIN}"
+      hosts:
+      - host: "plex.${SECRET_DOMAIN}"
+        paths:
+        - path: /
+          pathType: Prefix
+      tls:
+      - hosts:
+        - "plex.${SECRET_DOMAIN}"
+    persistence:
+      config:
+        enabled: true
+        existingClaim: plex-config-v1
+      tmp:
+        enabled: true
+        emptyDir:
+          enabled: true
+          medium: "Memory"
+    additionalVolumeMounts:
+    - name: media-franxx
+      mountPath: "/media-franxx"
+      readOnly: true
+    additionalVolumes:
+    - name: media-franxx
+      persistentVolumeClaim:
+        claimName: nfs-media-franxx-pvc
+    affinity:
+      nodeAffinity:
+        requiredDuringSchedulingIgnoredDuringExecution:
+          nodeSelectorTerms:
+          - matchExpressions:
+            - key: kubernetes.io/hostname
+              operator: In
+              values:
+              - dfw-k8s-cuda1
+    resources:
+      limits:
+        nvidia.com/gpu: 4
+    tolerations:
+    - key: "nvidia.com/gpu"
+      operator: "Exists"
+      effect: "PreferNoSchedule"
+    podAnnotations:
+      configmap.reloader.stakater.com/reload: "plex-promtail"
+    addons:
+      promtail:
+        enabled: true
+        image:
+          repository: grafana/promtail
+          tag: 2.2.1
+        loki: http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
+        logs:
+        - name: promtail/plex
+          path: "/config/Library/Application Support/Plex Media Server/Logs/*.log"
+        - name: promtail/plex/plugins
+          path: "/config/Library/Application Support/Plex Media Server/Logs/PMS Plugin Logs/*.log"
+        volumeMounts:
+        - name: config
+          mountPath: /config
+          readOnly: true
+        securityContext:
+          runAsUser: 0
diff --git a/cluster/apps/media/plex/kustomization.yaml b/cluster/apps/media/plex/kustomization.yaml
new file mode 100644
index 00000000..97b3a7ee
--- /dev/null
+++ b/cluster/apps/media/plex/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+- config-pvc.yaml
+- helm-release.yaml
diff --git a/cluster/apps/media/plex/uptimerobot-monitor.yaml b/cluster/apps/media/plex/uptimerobot-monitor.yaml
new file mode 100644
index 00000000..4e9492ba
--- /dev/null
+++ b/cluster/apps/media/plex/uptimerobot-monitor.yaml
@@ -0,0 +1,11 @@
+---
+apiVersion: uroperator.brennerm.github.io/v1beta1
+kind: UptimeRobotMonitor
+metadata:
+  name: plex-monitor
+spec:
+  friendlyName: "Plex"
+  url: "https://plex.${SECRET_DOMAIN}"
+  type: HTTPS
+  httpMethod: GET
+  interval: 120
diff --git a/cluster/base/cluster-settings.yaml b/cluster/base/cluster-settings.yaml
index ad31b99c..1cae0854 100644
--- a/cluster/base/cluster-settings.yaml
+++ b/cluster/base/cluster-settings.yaml
@@ -10,4 +10,5 @@ data:
   LB_COREDNS_IP: 10.75.45.100
   LB_TRAEFIK_IP: 10.75.45.101
   LB_LOKI_IP: 10.75.45.102
+  LB_PLEX_IP: 10.75.45.103
   LB_BLOCKY_IP: 10.75.45.105
\ No newline at end of file
diff --git a/cluster/base/flux-system/charts/helm/kustomization.yaml b/cluster/base/flux-system/charts/helm/kustomization.yaml
index 7f26eea7..8e758be2 100644
--- a/cluster/base/flux-system/charts/helm/kustomization.yaml
+++ b/cluster/base/flux-system/charts/helm/kustomization.yaml
@@ -18,6 +18,7 @@ resources:
 - kubernetes-sigs-descheduler-charts.yaml
 - nfs-subdir-external-provisioner-charts.yaml
 - node-feature-discovery-charts.yaml
+- nvidia-charts.yaml
 - prometheus-community-charts.yaml
 - rook-ceph-charts.yaml
 - stakater-charts.yaml
diff --git a/cluster/base/flux-system/charts/helm/nvidia-charts.yaml b/cluster/base/flux-system/charts/helm/nvidia-charts.yaml
new file mode 100644
index 00000000..876d685b
--- /dev/null
+++ b/cluster/base/flux-system/charts/helm/nvidia-charts.yaml
@@ -0,0 +1,10 @@
+---
+apiVersion: source.toolkit.fluxcd.io/v1beta1
+kind: HelmRepository
+metadata:
+  name: nvidia-charts
+  namespace: flux-system
+spec:
+  interval: 10m
+  url: https://nvidia.github.io/k8s-device-plugin
+  timeout: 3m
diff --git a/server/ansible/inventory/home-cluster/group_vars/all/calico-settings.yml b/server/ansible/inventory/home-cluster/group_vars/all/calico-settings.yml
index 55e1f4e1..5df9f3b6 100644
--- a/server/ansible/inventory/home-cluster/group_vars/all/calico-settings.yml
+++ b/server/ansible/inventory/home-cluster/group_vars/all/calico-settings.yml
@@ -15,4 +15,6 @@ calico:
     # e.g. 192.168.169.0/24
     externalIPs: 10.75.45.0/24
   node:
-    cidr: 10.75.40.0/24
\ No newline at end of file
+    cidrs:
+    - 10.75.40.0/24
+    - 10.75.30.0/24
\ No newline at end of file
diff --git a/server/ansible/inventory/home-cluster/group_vars/all/rsyslog-settings.yml b/server/ansible/inventory/home-cluster/group_vars/all/rsyslog-settings.yml
index c0a8c68d..e96a21ed 100644
--- a/server/ansible/inventory/home-cluster/group_vars/all/rsyslog-settings.yml
+++ b/server/ansible/inventory/home-cluster/group_vars/all/rsyslog-settings.yml
@@ -4,5 +4,5 @@
 # ...requires a rsyslog server already set up
 rsyslog:
   enabled: false
-  ip: 192.168.69.155
+  ip: 10.75.45.102
   port: 1514
diff --git a/server/ansible/inventory/home-cluster/group_vars/gpu-nodes/nvidia-settings.yml b/server/ansible/inventory/home-cluster/group_vars/gpu-nodes/nvidia-settings.yml
new file mode 100644
index 00000000..906ed492
--- /dev/null
+++ b/server/ansible/inventory/home-cluster/group_vars/gpu-nodes/nvidia-settings.yml
@@ -0,0 +1,9 @@
+---
+
+nvidia_driver:
+  version: "465.27"
+  checksum: "sha256:7e69ffa85bdee6aaaa6b6ea7e1db283b0199f9ab21e41a27dc9048f249dc3171"
+
+nvidia_patch:
+  version: "d5d564b888aaef99fdd45e23f2fc3eae8e337a39"
+  checksum: "sha256:d80928c381d141734c13463d69bfaecff77ac66ee6f9036b2f0348b8602989d8"
\ No newline at end of file
diff --git a/server/ansible/inventory/home-cluster/host_vars/dfw-k8s-cuda1.yml b/server/ansible/inventory/home-cluster/host_vars/dfw-k8s-cuda1.yml
new file mode 100644
index 00000000..a67a5bc3
--- /dev/null
+++ b/server/ansible/inventory/home-cluster/host_vars/dfw-k8s-cuda1.yml
@@ -0,0 +1,16 @@
+---
+
+# IP address of node
+ansible_host: "10.75.30.11"
+
+# Ansible user to ssh into servers with
+ansible_user: "ubuntu"
+# ansible_ssh_pass: "ubuntu"
+# ansible_ssh_common_args: "-o UserKnownHostsFile=/dev/null"
+ansible_become_pass: "ubuntu"
+
+# Set enabled to true to mark this host as running a distributed storage rook-ceph
+rook_ceph:
+  enabled: false
+  devices:
+  - /dev/nvme0n1
diff --git a/server/ansible/inventory/home-cluster/hosts.yml b/server/ansible/inventory/home-cluster/hosts.yml
index 730b207e..a0a36d03 100644
--- a/server/ansible/inventory/home-cluster/hosts.yml
+++ b/server/ansible/inventory/home-cluster/hosts.yml
@@ -17,6 +17,9 @@ all:
         dfw-k8s-worker2:
         dfw-k8s-worker3:
         dfw-k8s-worker4:
+    gpu-nodes:
+      hosts:
+        dfw-k8s-cuda1:
     # Storage group, these are my NAS devices
     # hosts should match the filenames in 'host_vars'
     storage:
diff --git a/server/ansible/playbooks/k3s/install.yml b/server/ansible/playbooks/k3s/install.yml
index 18a91008..175a489c 100644
--- a/server/ansible/playbooks/k3s/install.yml
+++ b/server/ansible/playbooks/k3s/install.yml
@@ -3,6 +3,7 @@
 - hosts:
   - master-nodes
   - worker-nodes
+  - gpu-nodes
   become: true
   gather_facts: true
   any_errors_fatal: true
@@ -13,3 +14,15 @@
   roles:
   - xanmanning.k3s
   - k3s
+
+- hosts:
+  - gpu-nodes
+  become: true
+  gather_facts: true
+  any_errors_fatal: true
+  pre_tasks:
+  - name: Pausing for 5 seconds...
+    pause:
+      seconds: 5
+  roles:
+  - nvidia
diff --git a/server/ansible/playbooks/k3s/nuke.yml b/server/ansible/playbooks/k3s/nuke.yml
index 8728f70d..6c12f9cb 100644
--- a/server/ansible/playbooks/k3s/nuke.yml
+++ b/server/ansible/playbooks/k3s/nuke.yml
@@ -3,6 +3,7 @@
 - hosts:
   - master-nodes
   - worker-nodes
+  - gpu-nodes
   become: true
   gather_facts: true
   any_errors_fatal: true
diff --git a/server/ansible/playbooks/k3s/upgrade.yml b/server/ansible/playbooks/k3s/upgrade.yml
index 1b9fca68..f801b2f8 100644
--- a/server/ansible/playbooks/k3s/upgrade.yml
+++ b/server/ansible/playbooks/k3s/upgrade.yml
@@ -3,6 +3,7 @@
 - hosts:
   - master-nodes
   - worker-nodes
+  - gpu-nodes
   become: true
   gather_facts: true
   any_errors_fatal: true
diff --git a/server/ansible/playbooks/power-outage/shutdown.yml b/server/ansible/playbooks/power-outage/shutdown.yml
index 897137d3..a8317165 100644
--- a/server/ansible/playbooks/power-outage/shutdown.yml
+++ b/server/ansible/playbooks/power-outage/shutdown.yml
@@ -3,6 +3,7 @@
 - hosts:
   - master-nodes
   - worker-nodes
+  - gpu-nodes
   - storage
   become: true
   gather_facts: true
diff --git a/server/ansible/playbooks/ubuntu/prepare.yml b/server/ansible/playbooks/ubuntu/prepare.yml
index 01a8644e..ec2db678 100644
--- a/server/ansible/playbooks/ubuntu/prepare.yml
+++ b/server/ansible/playbooks/ubuntu/prepare.yml
@@ -3,6 +3,7 @@
 - hosts:
   - master-nodes
   - worker-nodes
+  - gpu-nodes
   become: true
   gather_facts: true
   any_errors_fatal: true
diff --git a/server/ansible/playbooks/ubuntu/upgrade.yml b/server/ansible/playbooks/ubuntu/upgrade.yml
index 65b370db..fe2d214f 100644
--- a/server/ansible/playbooks/ubuntu/upgrade.yml
+++ b/server/ansible/playbooks/ubuntu/upgrade.yml
@@ -3,6 +3,7 @@
 - hosts:
   - master-nodes
   - worker-nodes
+  - gpu-nodes
   become: true
   gather_facts: true
   any_errors_fatal: true
diff --git a/server/ansible/requirements.yml b/server/ansible/requirements.yml
index 162aad87..ac76eaf8 100644
--- a/server/ansible/requirements.yml
+++ b/server/ansible/requirements.yml
@@ -2,5 +2,7 @@
 roles:
 - src: xanmanning.k3s
   version: v2.9.1
+- src: nvidia.nvidia_driver
+  version: v2.0.0
 collections:
 - name: community.general
diff --git a/server/ansible/roles/k3s/templates/calico-installation.yaml.j2 b/server/ansible/roles/k3s/templates/calico-installation.yaml.j2
index c2558b18..ddd41855 100644
--- a/server/ansible/roles/k3s/templates/calico-installation.yaml.j2
+++ b/server/ansible/roles/k3s/templates/calico-installation.yaml.j2
@@ -19,4 +19,6 @@ spec:
       nodeSelector: all()
     nodeAddressAutodetectionV4:
       cidrs:
-      - {{ calico.node.cidr }}
\ No newline at end of file
+      {% for cidr in calico.node.cidrs %}
+      - {{ cidr }}
+      {% endfor %}
diff --git a/server/ansible/roles/nvidia/files/blacklist-nouveau.conf b/server/ansible/roles/nvidia/files/blacklist-nouveau.conf
new file mode 100644
index 00000000..976fcc25
--- /dev/null
+++ b/server/ansible/roles/nvidia/files/blacklist-nouveau.conf
@@ -0,0 +1,2 @@
+blacklist nouveau
+options nouveau modeset=0
\ No newline at end of file
diff --git a/server/ansible/roles/nvidia/files/config.toml.tmpl b/server/ansible/roles/nvidia/files/config.toml.tmpl
new file mode 100644
index 00000000..aa2b3eb3
--- /dev/null
+++ b/server/ansible/roles/nvidia/files/config.toml.tmpl
@@ -0,0 +1,53 @@
+[plugins.opt]
+  path = "{{ .NodeConfig.Containerd.Opt }}"
+[plugins.cri]
+  stream_server_address = "127.0.0.1"
+  stream_server_port = "10010"
+  enable_selinux = {{ .NodeConfig.SELinux }}
+{{- if .IsRunningInUserNS }}
+  disable_cgroup = true
+  disable_apparmor = true
+  restrict_oom_score_adj = true
+{{end}}
+{{- if .NodeConfig.AgentConfig.PauseImage }}
+  sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
+{{end}}
+{{- if .NodeConfig.AgentConfig.Snapshotter }}
+[plugins.cri.containerd]
+  disable_snapshot_annotations = true
+  snapshotter = "{{ .NodeConfig.AgentConfig.Snapshotter }}"
+{{end}}
+{{- if not .NodeConfig.NoFlannel }}
+[plugins.cri.cni]
+  bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
+  conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
+{{end}}
+[plugins.cri.containerd.runtimes.runc]
+  runtime_type = "io.containerd.runtime.v1.linux"
+
+[plugins.linux]
+  runtime = "nvidia-container-runtime"
+{{ if .PrivateRegistryConfig }}
+{{ if .PrivateRegistryConfig.Mirrors }}
+[plugins.cri.registry.mirrors]{{end}}
+{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
+[plugins.cri.registry.mirrors."{{$k}}"]
+  endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
+{{end}}
+{{range $k, $v := .PrivateRegistryConfig.Configs }}
+{{ if $v.Auth }}
+[plugins.cri.registry.configs."{{$k}}".auth]
+  {{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
+  {{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
+  {{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
+  {{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
+{{end}}
+{{ if $v.TLS }}
+[plugins.cri.registry.configs."{{$k}}".tls]
+  {{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
+  {{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
+  {{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
+  {{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
+{{end}}
+{{end}}
+{{end}}
\ No newline at end of file
diff --git a/server/ansible/roles/nvidia/tasks/container-runtime.yml b/server/ansible/roles/nvidia/tasks/container-runtime.yml
new file mode 100644
index 00000000..7e392468
--- /dev/null
+++ b/server/ansible/roles/nvidia/tasks/container-runtime.yml
@@ -0,0 +1,22 @@
+---
+
+- name: container-runtime | add apt key
+  ansible.builtin.apt_key:
+    url: https://nvidia.github.io/nvidia-container-runtime/gpgkey
+    state: present
+
+- name: container-runtime | add apt repos
+  ansible.builtin.apt_repository:
+    repo: "{{ item }}"
+    state: present
+    mode: "0644"
+    update_cache: true
+    filename: nvidia-container-runtime
+  with_items:
+  - "deb https://nvidia.github.io/libnvidia-container/stable/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /"
+  - "deb https://nvidia.github.io/nvidia-container-runtime/stable/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /"
+
+- name: container-runtime | install nvidia-container-runtime
+  ansible.builtin.apt:
+    name: "nvidia-container-runtime"
+    state: present
\ No newline at end of file
diff --git a/server/ansible/roles/nvidia/tasks/driver.yml b/server/ansible/roles/nvidia/tasks/driver.yml
new file mode 100644
index 00000000..d3db836c
--- /dev/null
+++ b/server/ansible/roles/nvidia/tasks/driver.yml
@@ -0,0 +1,40 @@
+---
+
+- name: driver | blacklist nouveau driver
+  ansible.builtin.copy:
+    src: files/blacklist-nouveau.conf
+    dest: /etc/modprobe.d/blacklist-nouveau.conf
+  register: blacklist
+
+- name: driver | update initramfs
+  ansible.builtin.command: "update-initramfs -u"
+  when: blacklist.changed
+
+- name: driver | reboot to unload nouveau
+  ansible.builtin.reboot:
+  when: blacklist.changed
+
+- name: driver | install dkms build tools
+  ansible.builtin.apt:
+    name: "{{ item }}"
+    state: present
+  with_items:
+  - "dkms"
+  - "build-essential"
+
+- name: driver | download nvidia driver
+  ansible.builtin.get_url:
+    url: "https://international.download.nvidia.com/XFree86/Linux-x86_64/{{ nvidia_driver.version }}/NVIDIA-Linux-x86_64-{{ nvidia_driver.version }}.run"
+    dest: /tmp/NVIDIA-Linux-x86_64-{{ nvidia_driver.version }}.run
+    checksum: "{{ nvidia_driver.checksum }}"
+    mode: '0755'
+
+- name: driver | install nvidia driver
+  ansible.builtin.command:
+    cmd: "/tmp/NVIDIA-Linux-x86_64-{{ nvidia_driver.version }}.run -s --no-opengl-files"
+    creates: "/proc/driver/nvidia/version"
+
+- name: driver | load nvidia driver
+  community.general.modprobe:
+    name: nvidia
+    state: present
diff --git a/server/ansible/roles/nvidia/tasks/k3s-agent.yml b/server/ansible/roles/nvidia/tasks/k3s-agent.yml
new file mode 100644
index 00000000..5625a7d6
--- /dev/null
+++ b/server/ansible/roles/nvidia/tasks/k3s-agent.yml
@@ -0,0 +1,13 @@
+---
+
+- name: k3s-agent | enable nvidia-container-runtime
+  ansible.builtin.copy:
+    src: files/config.toml.tmpl
+    dest: /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
+  register: containerd_config
+
+- name: k3s-agent | restart agent
+  ansible.builtin.service:
+    name: k3s
+    state: restarted
+  when: containerd_config.changed
diff --git a/server/ansible/roles/nvidia/tasks/main.yml b/server/ansible/roles/nvidia/tasks/main.yml
new file mode 100644
index 00000000..21806beb
--- /dev/null
+++ b/server/ansible/roles/nvidia/tasks/main.yml
@@ -0,0 +1,17 @@
+---
+
+- ansible.builtin.import_tasks: driver.yml
+  tags:
+  - driver
+
+- ansible.builtin.import_tasks: patch.yml
+  tags:
+  - patch
+
+- ansible.builtin.import_tasks: container-runtime.yml
+  tags:
+  - container-runtime
+
+- ansible.builtin.import_tasks: k3s-agent.yml
+  tags:
+  - k3s-agent
\ No newline at end of file
diff --git a/server/ansible/roles/nvidia/tasks/patch.yml b/server/ansible/roles/nvidia/tasks/patch.yml
new file mode 100644
index 00000000..80ff2b72
--- /dev/null
+++ b/server/ansible/roles/nvidia/tasks/patch.yml
@@ -0,0 +1,18 @@
+---
+
+- name: patch | create patch directory
+  ansible.builtin.file:
+    path: /opt/nvidia-patch
+    state: directory
+    mode: '0755'
+
+- name: patch | download nvidia-patch
+  ansible.builtin.get_url:
+    url: https://raw.githubusercontent.com/keylase/nvidia-patch/{{ nvidia_patch.version }}/patch.sh
+    dest: /opt/nvidia-patch/patch.sh
+    checksum: "{{ nvidia_patch.checksum }}"
+    mode: '0755'
+
+- name: patch | patch current nvidia driver
+  ansible.builtin.command:
+    cmd: /opt/nvidia-patch/patch.sh