feat(mlc-llm): QwQ-32B, max seq 16384, enable debug

This commit is contained in:
JJGadgets
2025-04-05 15:08:54 +08:00
parent 5a6107dca5
commit d4a323ecc3

View File

@@ -29,12 +29,13 @@ spec:
image: &img
repository: jank.ing/jjgadgets/mlc-llm
tag: 0.19.0@sha256:acbe4da65245cdc424eb16de4dd09b6c77fc1dc48f871f04faca4c0365341420
-args: ["HF://mlc-ai/$(MODEL)", "--enable-debug", "--overrides", "max_num_sequence=1;max_total_seq_length=16384"]
+args: ["HF://mlc-ai/$(MODEL)", "--enable-debug", "--overrides", "max_num_sequence=1;max_total_seq_length=$(CONTEXT_SIZE)"]
env: &envMain
TZ: "${CONFIG_TZ}"
MLC_JIT_POLICY: "ON"
MLC_DOWNLOAD_CACHE_POLICY: "READONLY"
MODEL: &llama3-model "Llama-3.2-3B-Instruct-q4f16_1-MLC"
+CONTEXT_SIZE: "32768" # smaller KV cache needed thus larger size
securityContext: &sc
readOnlyRootFilesystem: true
allowPrivilegeEscalation: false
@@ -45,7 +46,7 @@ spec:
cpu: "10m"
limits:
cpu: "1000m"
-memory: "6Gi"
+memory: "7.5Gi"
gpu.intel.com/i915: "1"
probes:
liveness:
@@ -113,13 +114,14 @@ spec:
env:
<<: *envMain
MODEL: &phi3-model "Phi-3.5-mini-instruct-q4f16_1-MLC"
+CONTEXT_SIZE: "16384"
resources:
requests:
cpu: "10m"
memory: "3Gi"
limits:
cpu: "1000m"
-memory: "16Gi"
+memory: "9Gi" # 2GB params, 0.4GB tmp, rest KV cache
gpu.intel.com/i915: "1"
phi3-pull:
<<: *job
@@ -136,7 +138,7 @@ spec:
<<: *mlc
env:
<<: *envMain
-MODEL: &qwq-model "mlc-ai/QwQ-32B-q4f16_1-MLC"
+MODEL: &qwq-model "QwQ-32B-q4f16_1-MLC"
resources:
requests:
cpu: "10m"