lightseekorg
diff --git a/‎docs/recipes/models.md‎
Lines changed: 64 additions & 0 deletions b/‎docs/recipes/models.md‎
Lines changed: 64 additions & 0 deletions
diff --git a/‎python/tokenspeed/cli/_argsplit.py‎
Lines changed: 3 additions & 1 deletion b/‎python/tokenspeed/cli/_argsplit.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎python/tokenspeed/cli/serve_smg.py‎
Lines changed: 101 additions & 6 deletions b/‎python/tokenspeed/cli/serve_smg.py‎
Lines changed: 101 additions & 6 deletions
diff --git a/‎python/tokenspeed/runtime/configs/__init__.py‎
Lines changed: 6 additions & 0 deletions b/‎python/tokenspeed/runtime/configs/__init__.py‎
Lines changed: 6 additions & 0 deletions
@@ -80,6 +80,70 @@ tokenspeed serve openai/gpt-oss-120b \
   --port 8000
 ```
 
+## Nemotron-H NVFP4
+
+Nemotron-H is a hybrid Mamba2, attention, MLP, and non-gated MoE model. Use the
+hybrid attention backend, the in-tree Triton Mamba2 kernels from
+`tokenspeed-kernel`, and FlashInfer TRT-LLM NVFP4 MoE for the `relu2` experts.
+
+```bash
+tokenspeed serve nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-NVFP4 \
+  --served-model-name nemotron-3-super-120b-a12b \
+  --quantization modelopt_mixed \
+  --tensor-parallel-size 1 \
+  --attention-backend flashinfer \
+  --moe-backend flashinfer_trtllm \
+  --mamba-ssm-dtype float32 \
+  --max-model-len 2048 \
+  --max-num-seqs 1 \
+  --chunked-prefill-size 512 \
+  --block-size 1 \
+  --skip-server-warmup \
+  --enforce-eager \
+  --disable-overlap-schedule \
+  --host 127.0.0.1 \
+  --port 7999
+```
+
+The smg gateway binds to `--port` (7999 above); TokenSpeed's OpenAI-compatible
+control server binds to the next port (8000). With the command above, send
+requests to `http://127.0.0.1:8000/v1/chat/completions`:
+
+```bash
+curl -sS http://127.0.0.1:8000/v1/chat/completions \
+  -H 'Content-Type: application/json' \
+  -d '{
+    "model": "nemotron-3-super-120b-a12b",
+    "messages": [
+      {
+        "role": "user",
+        "content": "what is the capital of Chad? Answer with only the city name."
+      }
+    ],
+    "max_tokens": 16,
+    "temperature": 0,
+    "reasoning_effort": "none"
+  }'
+```
+
+The `modelopt_mixed` path keeps the checkpoint's ModelOpt mixed-precision
+layout: FP8 dense projections are loaded through the FP8 path, NVFP4 experts
+are loaded through the FlashInfer TRT-LLM MoE path, and CUTLASS-capable CUDA
+targets use the SGLang-compatible channelwise FP8 scale layout while
+preserving the checkpoint's FP8 shard scales.
+
+The Mamba2 kernel code was vendored from `sgl-project/sglang` commit
+`03c77dc33d0a051aa15c1235407440d9d107b98f`, which carries vLLM and
+state-spaces/mamba lineage in the copied files.
+
+`ts serve` detects Nemotron-H / Nemotron-3 checkpoints and applies the SGLang
+reasoning behavior automatically: the engine uses the `nemotron_3` reasoning
+parser for grammar deferral, while the smg gateway receives its equivalent
+`qwen3` reasoning parser and `qwen_coder` tool parser. Chat-completion requests
+with `reasoning_effort: "none"` are normalized to the model template's
+`enable_thinking: false` switch before they reach the gateway, matching SGLang's
+OpenAI request handling.
+
 ## Tuning Order
 
 1. Set model ID, trust policy, tokenizer mode, and served model name.
 
@@ -172,7 +172,7 @@ def split_argv(argv: list[str]) -> SplitResult:
 
     items = _normalize(argv)
     result = SplitResult()
-    engine_flags = _engine_recognized_flags()
+    engine_flags: set[str] | None = None
 
     for name, value in items:
         if name in _ORCH_FLAGS:
@@ -213,6 +213,8 @@ def split_argv(argv: list[str]) -> SplitResult:
             result.engine.extend([name, value])
             continue
 
+        if engine_flags is None:
+            engine_flags = _engine_recognized_flags()
         if name in engine_flags:
             if value is not None:
                 result.engine.extend([name, value])
 
@@ -29,9 +29,13 @@
 import logging
 import os
 import signal
+import socket
 import sys
+import threading
 from pathlib import Path
 
+import psutil
+
 from tokenspeed.cli._argsplit import OrchestratorOpts, split_argv
 from tokenspeed.cli._logo import print_logo
 from tokenspeed.cli._logprefix import ENGINE_TAG, GATEWAY_TAG, tag_stream
@@ -42,8 +46,6 @@
     wait_grpc_serving,
     wait_http_ready,
 )
-from tokenspeed.runtime.utils.network import get_free_port
-from tokenspeed.runtime.utils.process import kill_process_tree
 
 logger = logging.getLogger(__name__)
 
@@ -52,6 +54,9 @@
 DEFAULT_REASONING_PARSER = "passthrough"
 DEEPSEEK_V4_REASONING_PARSER = "deepseek_v31"
 DEEPSEEK_V4_TOOL_CALL_PARSER = "deepseek_v4"
+NEMOTRON_3_REASONING_PARSER = "nemotron_3"
+NEMOTRON_3_GATEWAY_REASONING_PARSER = "qwen3"
+NEMOTRON_3_TOOL_CALL_PARSER = "qwen_coder"
 DEFAULT_SMG_LOG_LEVEL = "warn"
 DEFAULT_SMG_PROMETHEUS_PORT = 8413
 # smg reliability knobs we always want disabled when launched under
@@ -63,6 +68,52 @@
 )
 
 
+def get_free_port():
+    """Return a TCP port number that the OS just handed out as free.
+
+    Binds a throwaway socket to port 0 so the OS picks the port, reads the
+    assigned number back, and closes the socket. IPv4 is tried first; if that
+    raises ``OSError`` the same probe is repeated with an IPv6 socket.
+
+    The probe socket is closed before returning, so the port is not reserved:
+    another process may claim it before the caller binds to it.
+    """
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+            # Port 0 asks the OS to choose any unused port.
+            s.bind(("", 0))
+            return s.getsockname()[1]
+    except OSError:
+        # IPv4 probe failed; retry with an IPv6 socket. An OSError raised
+        # here is not caught and propagates to the caller.
+        with socket.socket(socket.AF_INET6, socket.SOCK_STREAM) as s:
+            s.bind(("", 0))
+            return s.getsockname()[1]
+
+
+def kill_process_tree(
+    parent_pid: int | None, include_parent: bool = True, skip_pid: int | None = None
+) -> None:
+    """Kill every descendant of a process and, optionally, the process itself.
+
+    Args:
+        parent_pid: PID at the root of the tree. ``None`` means the current
+            process, in which case only its descendants are killed.
+        include_parent: Also kill the root process once its descendants have
+            been killed. Forced to ``False`` when ``parent_pid`` is ``None``.
+        skip_pid: PID of a descendant to leave running, if any.
+
+    If the root process no longer exists, returns without killing anything.
+    When the root is the current process and ``include_parent`` is true, the
+    function kills the current process and calls ``sys.exit(0)``, so it does
+    not return normally.
+    """
+    # signal.signal() may only be called from the main thread, hence the guard.
+    # NOTE(review): presumably SIGCHLD is reset so that a custom child-exit
+    # handler does not fire while the children are being killed; confirm.
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+    if parent_pid is None:
+        parent_pid = os.getpid()
+        include_parent = False
+
+    try:
+        itself = psutil.Process(parent_pid)
+    except psutil.NoSuchProcess:
+        # Root process is already gone; nothing to clean up.
+        return
+
+    # Kill all descendants (children, grandchildren, ...) before the root.
+    for child in itself.children(recursive=True):
+        if child.pid == skip_pid:
+            continue
+        try:
+            child.kill()
+        except psutil.NoSuchProcess:
+            # The child exited between enumeration and kill(); ignore.
+            pass
+
+    if include_parent:
+        try:
+            if parent_pid == os.getpid():
+                # Killing ourselves: sys.exit(0) is only reached if the
+                # process is still running after kill().
+                itself.kill()
+                sys.exit(0)
+
+            itself.kill()
+            # NOTE(review): the extra SIGQUIT looks like a fallback for a
+            # process that survives kill(); confirm the intent.
+            itself.send_signal(signal.SIGQUIT)
+        except psutil.NoSuchProcess:
+            pass
+
+
 def _check_serve_extra_installed() -> None:
     import importlib.util
 
@@ -216,6 +267,37 @@ def _is_deepseek_v4_model(model_id: str | None) -> bool:
     )
 
 
+def _is_nemotron_h_model(model_id: str | None) -> bool:
+    """Return whether ``model_id`` looks like a Nemotron-H / Nemotron-3 model.
+
+    Detection runs in two stages:
+
+    1. Name match: the id is lowercased with ``_`` mapped to ``-``; the model
+       matches if it contains ``nemotron-3`` or ``nemotron-h``, or if the
+       hyphen-free form contains ``nemotron3`` or ``nemotronh``.
+    2. Local config: otherwise ``model_id`` is treated as a directory path and
+       ``<model_id>/config.json`` is inspected. The model matches if its
+       ``model_type`` is ``nemotron_h`` or its ``architectures`` contains
+       ``NemotronHForCausalLM``.
+
+    Returns ``False`` for an empty or ``None`` id, and whenever the config
+    file is missing, unreadable, not valid JSON, or not a JSON object.
+    """
+    if not model_id:
+        return False
+
+    # Normalize separators so "Nemotron_H", "nemotron-h" and "NemotronH"
+    # style spellings are all caught by the substring checks below.
+    normalized = model_id.lower().replace("_", "-")
+    compact = normalized.replace("-", "")
+    if (
+        "nemotron-3" in normalized
+        or "nemotron-h" in normalized
+        or "nemotron3" in compact
+        or "nemotronh" in compact
+    ):
+        return True
+
+    # The name did not match; fall back to a config.json under the given path.
+    config_path = Path(model_id) / "config.json"
+    if not config_path.is_file():
+        return False
+    try:
+        with config_path.open() as f:
+            config = json.load(f)
+    except (OSError, json.JSONDecodeError):
+        # Best-effort detection: an unreadable or malformed config is
+        # treated as "not Nemotron-H" rather than an error.
+        return False
+    if not isinstance(config, dict):
+        return False
+    # "architectures" may be absent or null; treat both as an empty list.
+    architectures = config.get("architectures") or []
+    return (
+        config.get("model_type") == "nemotron_h"
+        or "NemotronHForCausalLM" in architectures
+    )
+
+
 def _args_with_default_model_parsers(
     engine_args: list[str], gateway_args: list[str]
 ) -> tuple[list[str], list[str]]:
@@ -226,7 +308,20 @@ def _args_with_default_model_parsers(
     name to defer json_schema grammars past the reasoning channel.
     """
     model_id = _user_model_id(gateway_args) or _user_model_id(engine_args)
-    if not _is_deepseek_v4_model(model_id):
+    if _is_deepseek_v4_model(model_id):
+        engine_reasoning_parser = DEEPSEEK_V4_REASONING_PARSER
+        gateway_reasoning_parser = DEEPSEEK_V4_REASONING_PARSER
+        tool_call_parser = DEEPSEEK_V4_TOOL_CALL_PARSER
+    elif _is_nemotron_h_model(model_id):
+        # SGLang detects the Nemotron-3/Super template as ``nemotron_3``:
+        # Qwen-style ``<think>...</think>`` reasoning with Qwen3-Coder XML
+        # tool calls. The current smg gateway exposes that reasoning behavior
+        # under its ``qwen3`` parser name, while the engine keeps the SGLang
+        # ``nemotron_3`` name for grammar deferral and internal semantics.
+        engine_reasoning_parser = NEMOTRON_3_REASONING_PARSER
+        gateway_reasoning_parser = NEMOTRON_3_GATEWAY_REASONING_PARSER
+        tool_call_parser = NEMOTRON_3_TOOL_CALL_PARSER
+    else:
         return engine_args, gateway_args
 
     engine_result = list(engine_args)
@@ -235,10 +330,10 @@ def _args_with_default_model_parsers(
         "--reasoning-parser" not in engine_result
         and "--reasoning-parser" not in gateway_result
     ):
-        engine_result.extend(["--reasoning-parser", DEEPSEEK_V4_REASONING_PARSER])
-        gateway_result.extend(["--reasoning-parser", DEEPSEEK_V4_REASONING_PARSER])
+        engine_result.extend(["--reasoning-parser", engine_reasoning_parser])
+        gateway_result.extend(["--reasoning-parser", gateway_reasoning_parser])
     if "--tool-call-parser" not in gateway_result:
-        gateway_result.extend(["--tool-call-parser", DEEPSEEK_V4_TOOL_CALL_PARSER])
+        gateway_result.extend(["--tool-call-parser", tool_call_parser])
     return engine_result, gateway_result
 
 
 
@@ -24,13 +24,19 @@
 from tokenspeed.runtime.configs.kimi_k2_config import KimiK2Config
 from tokenspeed.runtime.configs.kimi_k25_config import KimiK25Config
 from tokenspeed.runtime.configs.minimax_m2_config import MiniMaxM2Config
+from tokenspeed.runtime.configs.nemotron_h_config import (
+    NemotronHConfig,
+    NemotronHPuzzleConfig,
+)
 from tokenspeed.runtime.configs.qwen2_config import Qwen2Config
 from tokenspeed.runtime.configs.qwen3_5_config import Qwen3_5Config, Qwen3_5MoeConfig
 from tokenspeed.runtime.configs.qwen3_config import Qwen3Config
 from tokenspeed.runtime.configs.qwen3_moe_config import Qwen3MoeConfig
 
 __all__ = [
     "DeepseekV4Config",
+    "NemotronHConfig",
+    "NemotronHPuzzleConfig",
     "Qwen2Config",
     "Qwen3Config",
     "Qwen3MoeConfig",