Nvidia madness

This commit is contained in:
Toboshii Nakama
2021-05-17 03:17:38 -05:00
parent 84dab065e7
commit 5c97eeea4a
34 changed files with 436 additions and 3 deletions

1
.gitignore vendored
View File

@@ -22,4 +22,5 @@ terraform.tfstate*
# Ansible
server/ansible/inventory/e2e/hosts.yml
xanmanning.k3s*
nvidia.nvidia_driver*
node_modules

View File

@@ -6,4 +6,5 @@ resources:
- metrics-server
- node-feature-discovery
- node-problem-detector
- nvidia-device-plugin
- reloader

View File

@@ -0,0 +1,25 @@
---
# Deploys the NVIDIA k8s-device-plugin so GPUs on cluster nodes are
# advertised to the scheduler as the `nvidia.com/gpu` resource.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: nvidia-device-plugin
  namespace: kube-system
spec:
  interval: 5m
  chart:
    spec:
      chart: nvidia-device-plugin
      version: 0.9.0
      sourceRef:
        kind: HelmRepository
        name: nvidia-charts
        namespace: flux-system
      interval: 5m
  values:
    # Image tag is kept in lockstep with the chart version above.
    image:
      repository: nvcr.io/nvidia/k8s-device-plugin
      tag: v0.9.0
    tolerations:
      # Allow the plugin DaemonSet onto nodes tainted for GPU workloads.
      - key: nvidia.com/gpu
        operator: Exists
        effect: PreferNoSchedule

View File

@@ -0,0 +1,4 @@
# Kustomization for the kube-system/nvidia-device-plugin app directory.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helm-release.yaml

View File

@@ -6,6 +6,7 @@ resources:
- flux-system
- kasten-io
- kube-system
- media
- monitoring
- networking
- security

View File

@@ -0,0 +1,4 @@
# Media namespace storage: static NFS PV/PVC for the franxx share.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- nfs-media-franxx.yaml

View File

@@ -0,0 +1,34 @@
---
# Static NFS volume exported by the "franxx" NAS (10.75.30.15:/tank/share).
# NOTE(review): the 1Mi capacity is a placeholder — NFS ignores capacity,
# but the PVC request below must not exceed it, so both are pinned to 1Mi.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: nfs-media-franxx-pv
spec:
  storageClassName: media-franxx
  capacity:
    storage: 1Mi
  accessModes:
    - ReadWriteMany
  # Retain: never scrub the NAS export if the claim is deleted.
  persistentVolumeReclaimPolicy: Retain
  nfs:
    server: 10.75.30.15
    path: /tank/share
  mountOptions:
    - tcp
    - intr
    - hard
    - noatime
    - nodiratime
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nfs-media-franxx-pvc
  namespace: media
spec:
  accessModes:
    - ReadWriteMany
  storageClassName: media-franxx
  # Pin the claim to the static PV above so binding is deterministic even
  # if another PV with the same storage class ever appears.
  volumeName: nfs-media-franxx-pv
  resources:
    requests:
      storage: 1Mi

View File

@@ -0,0 +1,13 @@
---
# Persistent config volume for Plex (library database, metadata, prefs),
# backed by Rook-Ceph block storage.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: plex-config-v1
  namespace: media
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi
  storageClassName: rook-ceph-block

View File

@@ -0,0 +1,105 @@
---
# Plex Media Server, pinned to the GPU node (dfw-k8s-cuda1) so hardware
# transcoding can use the NVIDIA devices exposed by the device plugin.
apiVersion: helm.toolkit.fluxcd.io/v2beta1
kind: HelmRelease
metadata:
  name: plex
  namespace: media
spec:
  interval: 5m
  chart:
    spec:
      # renovate: registryUrl=https://k8s-at-home.com/charts/
      chart: plex
      version: 4.4.0
      sourceRef:
        kind: HelmRepository
        name: k8s-at-home-charts
        namespace: flux-system
      interval: 5m
  values:
    image:
      repository: ghcr.io/k8s-at-home/plex
      tag: v1.22.3.4392-d7c624def
    env:
      TZ: "America/Chicago"
    podSecurityContext:
      # Supplementary group granting read access to the NFS media share.
      supplementalGroups:
        - 5600
    service:
      enabled: true
      type: LoadBalancer
      externalIPs:
        - "${LB_PLEX_IP}"
      # Local policy preserves client source IPs for Plex's own access rules.
      externalTrafficPolicy: Local
      port:
        port: 32400
      annotations:
        traefik.ingress.kubernetes.io/service.serversscheme: https
    ingress:
      enabled: true
      ingressClassName: "traefik"
      annotations:
        traefik.ingress.kubernetes.io/router.entrypoints: "websecure"
        external-dns/is-public: "true"
        external-dns.alpha.kubernetes.io/target: "ipv4.${SECRET_DOMAIN}"
      hosts:
        - host: "plex.${SECRET_DOMAIN}"
          paths:
            - path: /
              pathType: Prefix
      tls:
        - hosts:
            - "plex.${SECRET_DOMAIN}"
    persistence:
      config:
        enabled: true
        existingClaim: plex-config-v1
      tmp:
        enabled: true
        emptyDir:
          enabled: true
          # RAM-backed scratch space (used for transcode temp files).
          medium: "Memory"
    additionalVolumeMounts:
      - name: media-franxx
        mountPath: "/media-franxx"
        readOnly: true
    additionalVolumes:
      - name: media-franxx
        persistentVolumeClaim:
          claimName: nfs-media-franxx-pvc
    affinity:
      # Hard-pin to the single GPU node; the pod will not schedule elsewhere.
      nodeAffinity:
        requiredDuringSchedulingIgnoredDuringExecution:
          nodeSelectorTerms:
            - matchExpressions:
                - key: kubernetes.io/hostname
                  operator: In
                  values:
                    - dfw-k8s-cuda1
    resources:
      limits:
        nvidia.com/gpu: 4
    tolerations:
      - key: "nvidia.com/gpu"
        operator: "Exists"
        effect: "PreferNoSchedule"
    podAnnotations:
      # Reloader restarts the pod when the plex-promtail ConfigMap changes.
      configmap.reloader.stakater.com/reload: "plex-promtail"
    addons:
      # Promtail sidecar shipping Plex server and plugin logs to Loki.
      promtail:
        enabled: true
        image:
          repository: grafana/promtail
          tag: 2.2.1
        loki: http://loki.monitoring.svc.cluster.local:3100/loki/api/v1/push
        logs:
          - name: promtail/plex
            path: "/config/Library/Application Support/Plex Media Server/Logs/*.log"
          - name: promtail/plex/plugins
            path: "/config/Library/Application Support/Plex Media Server/Logs/PMS Plugin Logs/*.log"
        volumeMounts:
          - name: config
            mountPath: /config
            readOnly: true
        securityContext:
          # NOTE(review): promtail runs as root, presumably to read
          # Plex-owned log files — confirm whether a matching fsGroup
          # could replace this.
          runAsUser: 0

View File

@@ -0,0 +1,5 @@
# Kustomization for the media/plex app: config PVC plus the HelmRelease.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- config-pvc.yaml
- helm-release.yaml

View File

@@ -0,0 +1,11 @@
---
# External uptime check for the public Plex ingress via the
# uptimerobot-operator (uroperator) CRD.
apiVersion: uroperator.brennerm.github.io/v1beta1
kind: UptimeRobotMonitor
metadata:
  name: plex-monitor
spec:
  friendlyName: "Plex"
  url: "https://plex.${SECRET_DOMAIN}"
  type: HTTPS
  httpMethod: GET
  # Probe interval in seconds.
  interval: 120

View File

@@ -10,4 +10,5 @@ data:
LB_COREDNS_IP: 10.75.45.100
LB_TRAEFIK_IP: 10.75.45.101
LB_LOKI_IP: 10.75.45.102
LB_PLEX_IP: 10.75.45.103
LB_BLOCKY_IP: 10.75.45.105

View File

@@ -18,6 +18,7 @@ resources:
- kubernetes-sigs-descheduler-charts.yaml
- nfs-subdir-external-provisioner-charts.yaml
- node-feature-discovery-charts.yaml
- nvidia-charts.yaml
- prometheus-community-charts.yaml
- rook-ceph-charts.yaml
- stakater-charts.yaml

View File

@@ -0,0 +1,10 @@
---
# Flux source: Helm repository hosting the NVIDIA k8s-device-plugin chart.
apiVersion: source.toolkit.fluxcd.io/v1beta1
kind: HelmRepository
metadata:
  name: nvidia-charts
  namespace: flux-system
spec:
  interval: 10m
  url: https://nvidia.github.io/k8s-device-plugin
  timeout: 3m

View File

@@ -15,4 +15,6 @@ calico:
# e.g. 192.168.169.0/24
externalIPs: 10.75.45.0/24
node:
cidr: 10.75.40.0/24
cidrs:
- 10.75.40.0/24
- 10.75.30.0/24

View File

@@ -4,5 +4,5 @@
# ...requires a rsyslog server already set up
rsyslog:
enabled: false
ip: 192.168.69.155
ip: 10.75.45.102
port: 1514

View File

@@ -0,0 +1,9 @@
---
# Pinned NVIDIA driver release and the keylase/nvidia-patch commit used to
# lift the NVENC session limit. Bump these together: the patch must
# support the driver version.
nvidia_driver:
  version: "465.27"
  checksum: "sha256:7e69ffa85bdee6aaaa6b6ea7e1db283b0199f9ab21e41a27dc9048f249dc3171"
nvidia_patch:
  # Git commit SHA of keylase/nvidia-patch to fetch patch.sh from.
  version: "d5d564b888aaef99fdd45e23f2fc3eae8e337a39"
  checksum: "sha256:d80928c381d141734c13463d69bfaecff77ac66ee6f9036b2f0348b8602989d8"

View File

@@ -0,0 +1,16 @@
---
# IP address of node
ansible_host: "10.75.30.11"
# Ansible user to ssh into servers with
ansible_user: "ubuntu"
# ansible_ssh_pass: "ubuntu"
# ansible_ssh_common_args: "-o UserKnownHostsFile=/dev/null"
# NOTE(review): plaintext become password committed to VCS — move this to
# Ansible Vault or use --ask-become-pass instead.
ansible_become_pass: "ubuntu"
# Set enabled to true to mark this host as running a distributed storage rook-ceph
rook_ceph:
  enabled: false
  devices:
    - /dev/nvme0n1

View File

@@ -17,6 +17,9 @@ all:
dfw-k8s-worker2:
dfw-k8s-worker3:
dfw-k8s-worker4:
gpu-nodes:
hosts:
dfw-k8s-cuda1:
# Storage group, these are my NAS devices
# hosts should match the filenames in 'host_vars'
storage:

View File

@@ -3,6 +3,7 @@
- hosts:
- master-nodes
- worker-nodes
- gpu-nodes
become: true
gather_facts: true
any_errors_fatal: true
@@ -13,3 +14,15 @@
roles:
- xanmanning.k3s
- k3s
- hosts:
- gpu-nodes
become: true
gather_facts: true
any_errors_fatal: true
pre_tasks:
- name: Pausing for 5 seconds...
pause:
seconds: 5
roles:
- nvidia

View File

@@ -3,6 +3,7 @@
- hosts:
- master-nodes
- worker-nodes
- gpu-nodes
become: true
gather_facts: true
any_errors_fatal: true

View File

@@ -3,6 +3,7 @@
- hosts:
- master-nodes
- worker-nodes
- gpu-nodes
become: true
gather_facts: true
any_errors_fatal: true

View File

@@ -3,6 +3,7 @@
- hosts:
- master-nodes
- worker-nodes
- gpu-nodes
- storage
become: true
gather_facts: true

View File

@@ -3,6 +3,7 @@
- hosts:
- master-nodes
- worker-nodes
- gpu-nodes
become: true
gather_facts: true
any_errors_fatal: true

View File

@@ -3,6 +3,7 @@
- hosts:
- master-nodes
- worker-nodes
- gpu-nodes
become: true
gather_facts: true
any_errors_fatal: true

View File

@@ -2,5 +2,7 @@
roles:
- src: xanmanning.k3s
version: v2.9.1
- src: nvidia.nvidia_driver
version: v2.0.0
collections:
- name: community.general

View File

@@ -19,4 +19,6 @@ spec:
nodeSelector: all()
nodeAddressAutodetectionV4:
cidrs:
- {{ calico.node.cidr }}
{% for cidr in calico.node.cidrs %}
- {{ cidr }}
{% endfor %}

View File

@@ -0,0 +1,2 @@
# Prevent the in-tree nouveau driver from claiming the GPU at boot so the
# proprietary NVIDIA driver can bind to it instead.
blacklist nouveau
# Belt-and-braces: also disable kernel mode setting if nouveau still loads.
options nouveau modeset=0

View File

@@ -0,0 +1,53 @@
# k3s containerd configuration template (Go text/template, rendered by the
# k3s agent). This mirrors the k3s default template with one change: the
# [plugins.linux] runtime below is set to nvidia-container-runtime so pods
# can access the GPU. NOTE(review): keep in sync with upstream k3s when
# upgrading — template fields come from k3s's NodeConfig.
[plugins.opt]
path = "{{ .NodeConfig.Containerd.Opt }}"
[plugins.cri]
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
enable_selinux = {{ .NodeConfig.SELinux }}
{{- if .IsRunningInUserNS }}
disable_cgroup = true
disable_apparmor = true
restrict_oom_score_adj = true
{{end}}
{{- if .NodeConfig.AgentConfig.PauseImage }}
sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}
{{- if .NodeConfig.AgentConfig.Snapshotter }}
[plugins.cri.containerd]
disable_snapshot_annotations = true
snapshotter = "{{ .NodeConfig.AgentConfig.Snapshotter }}"
{{end}}
{{- if not .NodeConfig.NoFlannel }}
[plugins.cri.cni]
bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}
[plugins.cri.containerd.runtimes.runc]
runtime_type = "io.containerd.runtime.v1.linux"
# The one deviation from the stock k3s template: use the NVIDIA runtime.
[plugins.linux]
runtime = "nvidia-container-runtime"
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{end}}
{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.cri.registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.cri.registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
{{end}}
{{end}}
{{end}}

View File

@@ -0,0 +1,22 @@
---
# Install NVIDIA's container runtime from the official apt repositories so
# containerd can launch GPU-enabled containers.
- name: container-runtime | add apt key
  # NOTE(review): apt_key is deprecated in newer Ansible releases —
  # migrate to a signed-by keyring file when the control node upgrades.
  ansible.builtin.apt_key:
    url: https://nvidia.github.io/nvidia-container-runtime/gpgkey
    state: present

- name: container-runtime | add apt repos
  ansible.builtin.apt_repository:
    repo: "{{ item }}"
    state: present
    # Quoted so YAML keeps the octal file mode as a string.
    mode: "0644"
    update_cache: true
    filename: nvidia-container-runtime
  # $(ARCH) is expanded by apt itself, not by Ansible — leave it literal.
  loop:
    - "deb https://nvidia.github.io/libnvidia-container/stable/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /"
    - "deb https://nvidia.github.io/nvidia-container-runtime/stable/{{ ansible_distribution | lower }}{{ ansible_distribution_version }}/$(ARCH) /"

- name: container-runtime | install nvidia-container-runtime
  ansible.builtin.apt:
    name: nvidia-container-runtime
    state: present

View File

@@ -0,0 +1,40 @@
---
# Install the proprietary NVIDIA driver: blacklist nouveau, reboot to
# unload it if needed, then run the .run installer and load the module.
- name: driver | blacklist nouveau driver
  ansible.builtin.copy:
    src: files/blacklist-nouveau.conf
    dest: /etc/modprobe.d/blacklist-nouveau.conf
  register: blacklist

- name: driver | update initramfs
  ansible.builtin.command: "update-initramfs -u"
  when: blacklist.changed

- name: driver | reboot to unload nouveau
  ansible.builtin.reboot:
  when: blacklist.changed

- name: driver | install dkms build tools
  ansible.builtin.apt:
    name:
      - dkms
      - build-essential
    state: present

- name: driver | download nvidia driver
  ansible.builtin.get_url:
    # The version appears twice (directory and filename); both are now
    # templated — previously the directory segment was hard-coded to
    # 465.27, so bumping nvidia_driver.version would have broken the URL.
    url: "https://international.download.nvidia.com/XFree86/Linux-x86_64/{{ nvidia_driver.version }}/NVIDIA-Linux-x86_64-{{ nvidia_driver.version }}.run"
    dest: "/tmp/NVIDIA-Linux-x86_64-{{ nvidia_driver.version }}.run"
    checksum: "{{ nvidia_driver.checksum }}"
    mode: "0755"

- name: driver | install nvidia driver
  ansible.builtin.command:
    cmd: "/tmp/NVIDIA-Linux-x86_64-{{ nvidia_driver.version }}.run -s --no-opengl-files"
    # Idempotence: skip if a driver already registered with the kernel.
    creates: /proc/driver/nvidia/version

- name: driver | load nvidia driver
  # FQCN for consistency with the other tasks (modprobe lives in
  # community.general, already pinned in requirements.yml).
  community.general.modprobe:
    name: nvidia
    state: present

View File

@@ -0,0 +1,13 @@
---
# Drop in a containerd config template that enables the NVIDIA runtime,
# then bounce the k3s agent so containerd re-renders its config.
- name: k3s-agent | enable nvidia-container-runtime
  ansible.builtin.copy:
    src: files/config.toml.tmpl
    dest: /var/lib/rancher/k3s/agent/etc/containerd/config.toml.tmpl
  register: containerd_config

- name: k3s-agent | restart agent
  # FQCN for consistency with the other tasks in this role.
  ansible.builtin.service:
    # NOTE(review): assumes the agent unit is named "k3s" — confirm, as
    # some k3s installs name agent nodes "k3s-node" or "k3s-agent".
    name: k3s
    state: restarted
  when: containerd_config.changed

View File

@@ -0,0 +1,17 @@
---
# Role entry point. Static imports (not dynamic includes) replace the
# deprecated bare `include`; with import_tasks the tags below are
# inherited by every task in the imported file, matching the old
# `include` + tags behavior, so each stage stays runnable via its tag.
- ansible.builtin.import_tasks: driver.yml
  tags:
    - driver
- ansible.builtin.import_tasks: patch.yml
  tags:
    - patch
- ansible.builtin.import_tasks: container-runtime.yml
  tags:
    - container-runtime
- ansible.builtin.import_tasks: k3s-agent.yml
  tags:
    - k3s-agent

View File

@@ -0,0 +1,18 @@
---
# Apply keylase/nvidia-patch to lift the driver's NVENC session limit.
- name: patch | create patch directory
  ansible.builtin.file:
    path: /opt/nvidia-patch
    state: directory
    mode: "0755"

- name: patch | download nvidia-patch
  ansible.builtin.get_url:
    # Pinned to a specific upstream commit and verified by checksum
    # (both set in the gpu-nodes group vars).
    url: https://raw.githubusercontent.com/keylase/nvidia-patch/{{ nvidia_patch.version }}/patch.sh
    dest: /opt/nvidia-patch/patch.sh
    checksum: "{{ nvidia_patch.checksum }}"
    mode: "0755"

- name: patch | patch current nvidia driver
  ansible.builtin.command:
    cmd: /opt/nvidia-patch/patch.sh