lightseekorg · dongjiyingdjy · Jun 16, 2026 · Jun 17, 2026 · chatgpt-codex-connector · Jun 17, 2026
diff --git a/test/ci/eval/deepseek-v4-flash-mtp-evalscope-aime25.yaml b/test/ci/eval/deepseek-v4-flash-mtp-evalscope-aime25.yaml
@@ -0,0 +1,60 @@
+api_version: ci.tokenspeed.io/v1
+name: eval-deepseek-v4-flash-mtp-aime25
+type: eval
+triggers:
+  - per-commit
+  - manual
+runner:
+  labels:
+    - b200-4gpu  # label suggests a 4-GPU host; server.command uses --tensor-parallel-size 4
+env:
+  CI: "true"  # quoted so the value stays the string "true" rather than a YAML boolean
+install:
+  - bash test/ci_system/install_deps.sh
+server:
+  command: >-  # folded scalar: the lines below are joined with spaces into one command string
+    ts serve
+    --load-format instanttensor
+    --model deepseek-ai/DeepSeek-V4-Flash
+    --tensor-parallel-size 4
+    --enable-expert-parallel
+    --kv-cache-dtype fp8_e4m3
+    --moe-backend mega_moe
+    --attention-use-fp4-indexer-cache
+    --max-model-len 80000
+    --max-total-tokens 163840
+    --chunked-prefill-size 8192
+    --gpu-memory-utilization 0.9
+    --disable-kvstore
+    --speculative-algorithm MTP
+    --speculative-num-steps 3
+    --disable-overlap-schedule
+    --no-enable-prefix-caching
+    --trust-remote-code
+    --host 127.0.0.1
+    --port 8000
+  ready:
+    url: http://127.0.0.1:8000/readiness  # same host and port as --host/--port in command
+    timeout: 1800  # presumably seconds to wait for readiness; confirm against the CI schema
+    interval: 10  # presumably seconds between readiness polls; confirm against the CI schema
+eval:
+  install:
+    - python3 -m uv venv --seed --clear /tmp/evalscope-perf && python3 -m uv pip install --python /tmp/evalscope-perf/bin/python 'evalscope[perf]'
+  # AIME25 needs thinking mode ("thinking": true, "reasoning_effort": "high")
+  # for V4-Flash to reach ~0.96; reasoning chains can run up to max_tokens=65536.
+  # MTP speculative decoding keeps tokens streaming fast (as EAGLE3 does for Kimi),
+  # so --eval-batch-size 16 is viable without the long-stream read-timeout stalls
+  # the non-MTP config hits at 16. "timeout": 3600 guards the longest streams.
+  command: >-  # one small /v1/completions request (300-word prompt, max_tokens=2) runs first, then evalscope eval
+    python3 -c "import urllib.request,json;urllib.request.urlopen(urllib.request.Request('http://127.0.0.1:8000/v1/completions',json.dumps({'model':'deepseek-ai/DeepSeek-V4-Flash','prompt':' '.join(['hello']*300),'max_tokens':2}).encode(),{'Content-Type':'application/json'}),timeout=120).read()" &&
+    /tmp/evalscope-perf/bin/evalscope eval
+    --model deepseek-ai/DeepSeek-V4-Flash
+    --api-url http://127.0.0.1:8000/v1
+    --api-key EMPTY_TOKEN
+    --datasets aime25
+    --eval-batch-size 16
+    --stream
+    --generation-config '{"do_sample":false,"temperature":0.0,"max_tokens":65536,"timeout":3600,"extra_body":{"chat_template_kwargs":{"reasoning_effort":"high","thinking":true}}}'
+report:
+  github_step_summary: true
+score_threshold: 0.93  # presumably the minimum passing score (confirm); the eval comment expects ~0.96