Skip to content

Commit 53066fc

Browse files
committed
support nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 -tp1
Signed-off-by: Netanel Haber <58652339+netanel-haber@users.noreply.github.com>
1 parent fbb018f commit 53066fc

45 files changed

Lines changed: 4733 additions & 210 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

docs/recipes/models.md

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,70 @@ tokenspeed serve openai/gpt-oss-120b \
8080
--port 8000
8181
```
8282

83+
## Nemotron-H NVFP4
84+
85+
Nemotron-H is a hybrid Mamba2, attention, MLP, and non-gated MoE model. Use the
86+
hybrid attention backend, the in-tree Triton Mamba2 kernels from
87+
`tokenspeed-kernel`, and FlashInfer TRT-LLM NVFP4 MoE for the `relu2` experts.
88+
89+
```bash
90+
tokenspeed serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
91+
--served-model-name nemotron-3-super-120b-a12b \
92+
--quantization modelopt_mixed \
93+
--tensor-parallel-size 1 \
94+
--attention-backend flashinfer \
95+
--moe-backend flashinfer_trtllm \
96+
--mamba-ssm-dtype float32 \
97+
--max-model-len 2048 \
98+
--max-num-seqs 1 \
99+
--chunked-prefill-size 512 \
100+
--block-size 1 \
101+
--skip-server-warmup \
102+
--enforce-eager \
103+
--disable-overlap-schedule \
104+
--host 127.0.0.1 \
105+
--port 7999
106+
```
107+
108+
The smg gateway binds to `--port`; TokenSpeed's OpenAI-compatible control
109+
server binds to the next port (`--port` + 1). With the command above (`--port 7999`), use
110+
`http://127.0.0.1:8000/v1/chat/completions`:
111+
112+
```bash
113+
curl -sS http://127.0.0.1:8000/v1/chat/completions \
114+
-H 'Content-Type: application/json' \
115+
-d '{
116+
"model": "nemotron-3-super-120b-a12b",
117+
"messages": [
118+
{
119+
"role": "user",
120+
"content": "what is the capital of Chad? Answer with only the city name."
121+
}
122+
],
123+
"max_tokens": 16,
124+
"temperature": 0,
125+
"reasoning_effort": "none"
126+
}'
127+
```
128+
129+
The `modelopt_mixed` path keeps the checkpoint's ModelOpt mixed-precision
130+
layout: FP8 dense projections are loaded through the FP8 path, NVFP4 experts
131+
through the FlashInfer TRT-LLM MoE path, and CUTLASS-capable CUDA targets use
132+
an SGLang-compatible channelwise FP8 scale layout while preserving the checkpoint
133+
FP8 shard scales.
134+
135+
The Mamba2 kernel code was vendored from `sgl-project/sglang` commit
136+
`03c77dc33d0a051aa15c1235407440d9d107b98f`, which carries vLLM and
137+
state-spaces/mamba lineage in the copied files.
138+
139+
`ts serve` detects Nemotron-H / Nemotron-3 checkpoints and applies the SGLang
140+
reasoning behavior automatically: the engine uses the `nemotron_3` reasoning
141+
parser for grammar deferral, while the smg gateway receives its equivalent
142+
`qwen3` reasoning parser and `qwen_coder` tool parser. Chat-completion requests
143+
with `reasoning_effort: "none"` are normalized to the model template's
144+
`enable_thinking: false` switch before they reach the gateway, matching SGLang's
145+
OpenAI request handling.
146+
83147
## Tuning Order
84148

85149
1. Set model ID, trust policy, tokenizer mode, and served model name.

python/tokenspeed/cli/_argsplit.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def split_argv(argv: list[str]) -> SplitResult:
172172

173173
items = _normalize(argv)
174174
result = SplitResult()
175-
engine_flags = _engine_recognized_flags()
175+
engine_flags: set[str] | None = None
176176

177177
for name, value in items:
178178
if name in _ORCH_FLAGS:
@@ -213,6 +213,8 @@ def split_argv(argv: list[str]) -> SplitResult:
213213
result.engine.extend([name, value])
214214
continue
215215

216+
if engine_flags is None:
217+
engine_flags = _engine_recognized_flags()
216218
if name in engine_flags:
217219
if value is not None:
218220
result.engine.extend([name, value])

python/tokenspeed/cli/serve_smg.py

Lines changed: 101 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,13 @@
2929
import logging
3030
import os
3131
import signal
32+
import socket
3233
import sys
34+
import threading
3335
from pathlib import Path
3436

37+
import psutil
38+
3539
from tokenspeed.cli._argsplit import OrchestratorOpts, split_argv
3640
from tokenspeed.cli._logo import print_logo
3741
from tokenspeed.cli._logprefix import ENGINE_TAG, GATEWAY_TAG, tag_stream
@@ -42,8 +46,6 @@
4246
wait_grpc_serving,
4347
wait_http_ready,
4448
)
45-
from tokenspeed.runtime.utils.network import get_free_port
46-
from tokenspeed.runtime.utils.process import kill_process_tree
4749

4850
logger = logging.getLogger(__name__)
4951

@@ -52,6 +54,9 @@
5254
DEFAULT_REASONING_PARSER = "passthrough"
5355
DEEPSEEK_V4_REASONING_PARSER = "deepseek_v31"
5456
DEEPSEEK_V4_TOOL_CALL_PARSER = "deepseek_v4"
57+
NEMOTRON_3_REASONING_PARSER = "nemotron_3"
58+
NEMOTRON_3_GATEWAY_REASONING_PARSER = "qwen3"
59+
NEMOTRON_3_TOOL_CALL_PARSER = "qwen_coder"
5560
DEFAULT_SMG_LOG_LEVEL = "warn"
5661
DEFAULT_SMG_PROMETHEUS_PORT = 8413
5762
# smg reliability knobs we always want disabled when launched under
@@ -63,6 +68,52 @@
6368
)
6469

6570

71+
def get_free_port():
    """Return a TCP port number that the OS reported as free at call time.

    A throwaway socket is bound to port 0 so the kernel picks an unused
    port. IPv4 is tried first; if any step of that attempt raises
    ``OSError`` the same probe is repeated with an IPv6 socket. The probe
    socket is closed before returning, so another process may claim the
    port between this call and the caller's own bind.

    Returns:
        The port number chosen by the OS.

    Raises:
        OSError: If the IPv6 fallback probe fails as well.
    """

    def _probe(family):
        # Port 0 asks the OS to assign any currently unused port.
        with socket.socket(family, socket.SOCK_STREAM) as sock:
            sock.bind(("", 0))
            _, port, *_ = sock.getsockname()
            return port

    try:
        return _probe(socket.AF_INET)
    except OSError:
        return _probe(socket.AF_INET6)
80+
81+
82+
def kill_process_tree(
    parent_pid: int | None, include_parent: bool = True, skip_pid: int | None = None
) -> None:
    """Kill every descendant of a process and, optionally, the process itself.

    Args:
        parent_pid: Root of the tree to kill. ``None`` means the current
            process; in that case ``include_parent`` is forced to False so
            only the descendants are killed.
        include_parent: Whether the root process is killed after its
            descendants.
        skip_pid: PID of one descendant to leave alive, if any.

    Processes that have already exited (``psutil.NoSuchProcess``) are
    ignored. When the root is the current process and ``include_parent``
    is True, the function kills the interpreter and does not return.
    """
    # Signal handlers can only be installed from the main thread, hence
    # the guard. NOTE(review): resetting SIGCHLD to its default presumably
    # stops a custom child-exit handler from firing during the teardown
    # below -- confirm against the callers.
    on_main_thread = threading.current_thread() is threading.main_thread()
    if on_main_thread:
        signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    if parent_pid is None:
        parent_pid, include_parent = os.getpid(), False

    try:
        root = psutil.Process(parent_pid)
    except psutil.NoSuchProcess:
        return

    descendants = root.children(recursive=True)
    for proc in descendants:
        if proc.pid != skip_pid:
            try:
                proc.kill()
            except psutil.NoSuchProcess:
                # Already gone; nothing left to do for this one.
                pass

    if not include_parent:
        return

    try:
        if parent_pid == os.getpid():
            root.kill()
            sys.exit(0)

        root.kill()
        # NOTE(review): the extra SIGQUIT after kill() presumably targets
        # processes that survive the first signal -- confirm.
        root.send_signal(signal.SIGQUIT)
    except psutil.NoSuchProcess:
        pass
115+
116+
66117
def _check_serve_extra_installed() -> None:
67118
import importlib.util
68119

@@ -216,6 +267,37 @@ def _is_deepseek_v4_model(model_id: str | None) -> bool:
216267
)
217268

218269

270+
def _is_nemotron_h_model(model_id: str | None) -> bool:
271+
if not model_id:
272+
return False
273+
274+
normalized = model_id.lower().replace("_", "-")
275+
compact = normalized.replace("-", "")
276+
if (
277+
"nemotron-3" in normalized
278+
or "nemotron-h" in normalized
279+
or "nemotron3" in compact
280+
or "nemotronh" in compact
281+
):
282+
return True
283+
284+
config_path = Path(model_id) / "config.json"
285+
if not config_path.is_file():
286+
return False
287+
try:
288+
with config_path.open() as f:
289+
config = json.load(f)
290+
except (OSError, json.JSONDecodeError):
291+
return False
292+
if not isinstance(config, dict):
293+
return False
294+
architectures = config.get("architectures") or []
295+
return (
296+
config.get("model_type") == "nemotron_h"
297+
or "NemotronHForCausalLM" in architectures
298+
)
299+
300+
219301
def _args_with_default_model_parsers(
220302
engine_args: list[str], gateway_args: list[str]
221303
) -> tuple[list[str], list[str]]:
@@ -226,7 +308,20 @@ def _args_with_default_model_parsers(
226308
name to defer json_schema grammars past the reasoning channel.
227309
"""
228310
model_id = _user_model_id(gateway_args) or _user_model_id(engine_args)
229-
if not _is_deepseek_v4_model(model_id):
311+
if _is_deepseek_v4_model(model_id):
312+
engine_reasoning_parser = DEEPSEEK_V4_REASONING_PARSER
313+
gateway_reasoning_parser = DEEPSEEK_V4_REASONING_PARSER
314+
tool_call_parser = DEEPSEEK_V4_TOOL_CALL_PARSER
315+
elif _is_nemotron_h_model(model_id):
316+
# SGLang detects the Nemotron-3/Super template as ``nemotron_3``:
317+
# Qwen-style ``<think>...</think>`` reasoning with Qwen3-Coder XML
318+
# tool calls. The current smg gateway exposes that reasoning behavior
319+
# under its ``qwen3`` parser name, while the engine keeps the SGLang
320+
# ``nemotron_3`` name for grammar deferral and internal semantics.
321+
engine_reasoning_parser = NEMOTRON_3_REASONING_PARSER
322+
gateway_reasoning_parser = NEMOTRON_3_GATEWAY_REASONING_PARSER
323+
tool_call_parser = NEMOTRON_3_TOOL_CALL_PARSER
324+
else:
230325
return engine_args, gateway_args
231326

232327
engine_result = list(engine_args)
@@ -235,10 +330,10 @@ def _args_with_default_model_parsers(
235330
"--reasoning-parser" not in engine_result
236331
and "--reasoning-parser" not in gateway_result
237332
):
238-
engine_result.extend(["--reasoning-parser", DEEPSEEK_V4_REASONING_PARSER])
239-
gateway_result.extend(["--reasoning-parser", DEEPSEEK_V4_REASONING_PARSER])
333+
engine_result.extend(["--reasoning-parser", engine_reasoning_parser])
334+
gateway_result.extend(["--reasoning-parser", gateway_reasoning_parser])
240335
if "--tool-call-parser" not in gateway_result:
241-
gateway_result.extend(["--tool-call-parser", DEEPSEEK_V4_TOOL_CALL_PARSER])
336+
gateway_result.extend(["--tool-call-parser", tool_call_parser])
242337
return engine_result, gateway_result
243338

244339

python/tokenspeed/runtime/configs/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,19 @@
2424
from tokenspeed.runtime.configs.kimi_k2_config import KimiK2Config
2525
from tokenspeed.runtime.configs.kimi_k25_config import KimiK25Config
2626
from tokenspeed.runtime.configs.minimax_m2_config import MiniMaxM2Config
27+
from tokenspeed.runtime.configs.nemotron_h_config import (
28+
NemotronHConfig,
29+
NemotronHPuzzleConfig,
30+
)
2731
from tokenspeed.runtime.configs.qwen2_config import Qwen2Config
2832
from tokenspeed.runtime.configs.qwen3_5_config import Qwen3_5Config, Qwen3_5MoeConfig
2933
from tokenspeed.runtime.configs.qwen3_config import Qwen3Config
3034
from tokenspeed.runtime.configs.qwen3_moe_config import Qwen3MoeConfig
3135

3236
__all__ = [
3337
"DeepseekV4Config",
38+
"NemotronHConfig",
39+
"NemotronHPuzzleConfig",
3440
"Qwen2Config",
3541
"Qwen3Config",
3642
"Qwen3MoeConfig",

0 commit comments

Comments
 (0)