From ada60a236105f8879c48df4e4d3714e40732e175 Mon Sep 17 00:00:00 2001 From: JJGadgets Date: Sat, 5 Apr 2025 22:31:10 +0800 Subject: [PATCH] feat(mlc-llm): QwQ-32B speculative eagle decoding --- kube/deploy/apps/mlc-llm/app/hr.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kube/deploy/apps/mlc-llm/app/hr.yaml b/kube/deploy/apps/mlc-llm/app/hr.yaml index fc01f91b..efec63ed 100644 --- a/kube/deploy/apps/mlc-llm/app/hr.yaml +++ b/kube/deploy/apps/mlc-llm/app/hr.yaml @@ -140,14 +140,14 @@ spec: env: <<: *envMain MODEL: &qwq-model "QwQ-32B-q4f16_1-MLC" - SPECULATIVE_DECODING: "eagle" + SPECULATIVE_DECODING: "small_draft" SPECULATIVE_DECODING_MODEL: "Qwen2.5-0.5B-Instruct-q0f16-MLC" resources: requests: cpu: "10m" limits: cpu: "1000m" - memory: "40Gi" + memory: "30Gi" gpu.intel.com/i915: "1" qwq-pull: <<: *job