diff --git a/kube/deploy/apps/mlc-llm/app/hr.yaml b/kube/deploy/apps/mlc-llm/app/hr.yaml
index 6fcc0b7a..a0496556 100644
--- a/kube/deploy/apps/mlc-llm/app/hr.yaml
+++ b/kube/deploy/apps/mlc-llm/app/hr.yaml
@@ -17,7 +17,7 @@ spec:
         namespace: flux-system
   values:
     controllers:
-      mlc-llm:
+      llama3: &deploy
         type: deployment
         replicas: 1
         strategy: RollingUpdate
@@ -25,33 +25,42 @@ spec:
           labels:
             ingress.home.arpa/nginx-internal: allow
         containers:
-          main:
+          main: &mlc
             image: &img
               repository: jank.ing/jjgadgets/mlc-llm
               tag: rolling@sha256:07faffd10763be433d4c3f3aadfbc4711d4257b62aabfcfda8aa5e896239129a
-            args: ["HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"]
-            env: &env
+            args: ["HF://mlc-ai/$(MODEL)"]
+            env: &envMain
               TZ: "${CONFIG_TZ}"
               MLC_JIT_POLICY: "READONLY"
               MLC_DOWNLOAD_CACHE_POLICY: "READONLY"
+              MODEL: "Llama-3-8B-Instruct-q4f16_1-MLC"
             securityContext: &sc
               readOnlyRootFilesystem: true
               allowPrivilegeEscalation: false
               capabilities:
                 drop: ["ALL"]
-            resources:
+            resources: &resources
              requests:
                cpu: "10m"
              limits:
                cpu: "1000m"
-                memory: "10Gi"
+                memory: "12Gi"
                gpu.intel.com/i915: "1"
            probes:
              liveness:
                enabled: true
              readiness:
                enabled: true
-      ml-model-pull:
+              startup:
+                enabled: true
+                custom: true
+                spec:
+                  periodSeconds: 2
+                  failureThreshold: 300
+                  tcpSocket:
+                    port: 8080
+      llama3-pull: &job
         type: cronjob
         cronjob:
           schedule: "@daily"
@@ -60,40 +69,72 @@ spec:
           labels:
             egress.home.arpa/internet: allow
         containers:
-          main:
+          main: &pull
             image: *img
             command: ["tini", "-g", "--", "/bin/bash", "-c"]
-            args: ["echo '/exit' | mlc_llm chat HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"] # run for 10 minutes to pull models via preload, then kill process, hopefully it doesn't crash
-            env:
+            args:
+              - |
+                if [ -d "/app/.cache/mlc_llm/model_weights/hf/mlc_ai/$(MODEL)" ] && [ -z "$(ls -A "/app/.cache/mlc_llm/model_weights/hf/mlc_ai/$(MODEL)")" ]; then
+                  true;
+                else
+                  echo '/exit' | mlc_llm chat HF://mlc-ai/$(MODEL)
+                fi
+            env: &envPull
               TZ: "${CONFIG_TZ}"
               MLC_JIT_POLICY: "ON"
               MLC_DOWNLOAD_CACHE_POLICY: "ON"
+              MODEL: "Llama-3-8B-Instruct-q4f16_1-MLC"
             securityContext: *sc
-            resources:
-              requests:
-                cpu: "10m"
-              limits:
-                cpu: "1000m"
-                memory: "2Gi"
-                gpu.intel.com/i915: "1"
+            resources: *resources
+      codellama:
+        <<: *deploy
+        containers:
+          main:
+            <<: *mlc
+            env:
+              <<: *envMain
+              MODEL: "CodeLlama-7b-hf-q4f32_1-MLC"
+      codellama-pull:
+        <<: *job
+        containers:
+          main:
+            <<: *pull
+            env:
+              <<: *envPull
+              MODEL: "CodeLlama-7b-hf-q4f32_1-MLC"
     service:
-      mlc-llm:
-        controller: mlc-llm
+      llama3: &svc
+        controller: llama3
         ports:
           http:
             port: 8080
             protocol: HTTP
             appProtocol: http
+      codellama:
+        <<: *svc
+        controller: codellama
     ingress:
-      main:
+      llama3:
         className: nginx-internal
         hosts:
-          - host: &host "${APP_DNS_MLC_LLM:=APPNAME}"
+          - host: &host "llama3.${DNS_SHORT}"
             paths: &paths
               - path: /
                 pathType: Prefix
                 service:
-                  identifier: mlc-llm
+                  identifier: llama3
+                  port: http
+        tls:
+          - hosts: [*host]
+      codellama:
+        className: nginx-internal
+        hosts:
+          - host: &host "codellama.${DNS_SHORT}"
+            paths: &paths
+              - path: /
+                pathType: Prefix
+                service:
+                  identifier: codellama
                   port: http
         tls:
           - hosts: [*host]
@@ -124,12 +165,12 @@ spec:
         runAsGroup: *uid
         fsGroup: *uid
         fsGroupChangePolicy: Always
-        supplementalGroups: [44] # iGPU
-        seccompProfile: { type: "RuntimeDefault" }
+        supplementalGroups: [44, 226] # iGPU
+        seccompProfile: { type: "Unconfined" } # GPU hangs with RuntimeDefault
       topologySpreadConstraints:
         - maxSkew: 1
           topologyKey: kubernetes.io/hostname
-          whenUnsatisfiable: DoNotSchedule
+          whenUnsatisfiable: ScheduleAnyway
           labelSelector:
             matchLabels:
               app.kubernetes.io/name: *app