feat(mlc-llm): switch QwQ-32B speculative decoding from eagle to small_draft

2026-03-22 06:39:47 +00:00 · 2025-04-05 22:31:10 +08:00
parent 832e319038
commit ada60a2361
1 changed files with 2 additions and 2 deletions
--- a/kube/deploy/apps/mlc-llm/app/hr.yaml
+++ b/kube/deploy/apps/mlc-llm/app/hr.yaml
@@ -140,14 +140,14 @@ spec:
            env:
              <<: *envMain
              MODEL: &qwq-model "QwQ-32B-q4f16_1-MLC"
-              SPECULATIVE_DECODING: "eagle"
+              SPECULATIVE_DECODING: "small_draft"
              SPECULATIVE_DECODING_MODEL: "Qwen2.5-0.5B-Instruct-q0f16-MLC"
            resources:
              requests:
                cpu: "10m"
              limits:
                cpu: "1000m"
-                memory: "40Gi"
+                memory: "30Gi"
                gpu.intel.com/i915: "1"
      qwq-pull:
        <<: *job