From d4a323ecc35b96896cbccd1bbb55f55bc88da52d Mon Sep 17 00:00:00 2001
From: JJGadgets
Date: Sat, 5 Apr 2025 15:08:54 +0800
Subject: [PATCH] feat(mlc-llm): QwQ-32B, max seq 16384, enable debug

---
# Reviewer notes (this section, between "---" and the diffstat, is not part of
# the commit message or of the diff hunks):
# - max_total_seq_length in the args line now comes from $(CONTEXT_SIZE) instead
#   of the literal 16384. CONTEXT_SIZE is "32768" in the &envMain mapping and is
#   overridden to "16384" in the env block that sets &phi3-model.
# - Memory limits change from 6Gi to 7.5Gi (second hunk) and from 16Gi to 9Gi
#   (phi3 hunk).
# - The &qwq-model value drops its "mlc-ai/" prefix; the args string already
#   prepends "HF://mlc-ai/" to $(MODEL). Presumably the qwq container inherits
#   those args through *mlc - confirm, since the &mlc anchor is outside this diff.
# - NOTE(review): the qwq env block adds no CONTEXT_SIZE, so it takes "32768"
#   from *envMain, while the subject says "max seq 16384" - confirm which is
#   intended.
# - NOTE(review): "--enable-debug" is on both the removed and the added args
#   line, so this diff does not introduce it; the subject line may be stale.
# - NOTE(review): this patch was recovered from a single collapsed line. Line
#   breaks follow the hunk headers' line counts; the indentation of the YAML
#   lines is a best guess and must be checked against hr.yaml before applying.
 kube/deploy/apps/mlc-llm/app/hr.yaml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kube/deploy/apps/mlc-llm/app/hr.yaml b/kube/deploy/apps/mlc-llm/app/hr.yaml
index d53b2426..1eeea146 100644
--- a/kube/deploy/apps/mlc-llm/app/hr.yaml
+++ b/kube/deploy/apps/mlc-llm/app/hr.yaml
@@ -29,12 +29,13 @@ spec:
             image: &img
               repository: jank.ing/jjgadgets/mlc-llm
               tag: 0.19.0@sha256:acbe4da65245cdc424eb16de4dd09b6c77fc1dc48f871f04faca4c0365341420
-            args: ["HF://mlc-ai/$(MODEL)", "--enable-debug", "--overrides", "max_num_sequence=1;max_total_seq_length=16384"]
+            args: ["HF://mlc-ai/$(MODEL)", "--enable-debug", "--overrides", "max_num_sequence=1;max_total_seq_length=$(CONTEXT_SIZE)"]
             env: &envMain
               TZ: "${CONFIG_TZ}"
               MLC_JIT_POLICY: "ON"
               MLC_DOWNLOAD_CACHE_POLICY: "READONLY"
               MODEL: &llama3-model "Llama-3.2-3B-Instruct-q4f16_1-MLC"
+              CONTEXT_SIZE: "32768" # smaller KV cache needed thus larger size
             securityContext: &sc
               readOnlyRootFilesystem: true
               allowPrivilegeEscalation: false
@@ -45,7 +46,7 @@ spec:
                 cpu: "10m"
               limits:
                 cpu: "1000m"
-                memory: "6Gi"
+                memory: "7.5Gi"
                 gpu.intel.com/i915: "1"
             probes:
               liveness:
@@ -113,13 +114,14 @@ spec:
             env:
               <<: *envMain
               MODEL: &phi3-model "Phi-3.5-mini-instruct-q4f16_1-MLC"
+              CONTEXT_SIZE: "16384"
             resources:
               requests:
                 cpu: "10m"
                 memory: "3Gi"
               limits:
                 cpu: "1000m"
-                memory: "16Gi"
+                memory: "9Gi" # 2GB params, 0.4GB tmp, rest KV cache
                 gpu.intel.com/i915: "1"
       phi3-pull:
         <<: *job
@@ -136,7 +138,7 @@ spec:
             <<: *mlc
             env:
               <<: *envMain
-              MODEL: &qwq-model "mlc-ai/QwQ-32B-q4f16_1-MLC"
+              MODEL: &qwq-model "QwQ-32B-q4f16_1-MLC"
             resources:
               requests:
                 cpu: "10m"