2929import logging
3030import os
3131import signal
32+ import socket
3233import sys
34+ import threading
3335from pathlib import Path
3436
37+ import psutil
38+
3539from tokenspeed .cli ._argsplit import OrchestratorOpts , split_argv
3640from tokenspeed .cli ._logo import print_logo
3741from tokenspeed .cli ._logprefix import ENGINE_TAG , GATEWAY_TAG , tag_stream
4246 wait_grpc_serving ,
4347 wait_http_ready ,
4448)
45- from tokenspeed .runtime .utils .network import get_free_port
46- from tokenspeed .runtime .utils .process import kill_process_tree
4749
4850logger = logging .getLogger (__name__ )
4951
5254DEFAULT_REASONING_PARSER = "passthrough"
5355DEEPSEEK_V4_REASONING_PARSER = "deepseek_v31"
5456DEEPSEEK_V4_TOOL_CALL_PARSER = "deepseek_v4"
57+ NEMOTRON_3_REASONING_PARSER = "nemotron_3"
58+ NEMOTRON_3_GATEWAY_REASONING_PARSER = "qwen3"
59+ NEMOTRON_3_TOOL_CALL_PARSER = "qwen_coder"
5560DEFAULT_SMG_LOG_LEVEL = "warn"
5661DEFAULT_SMG_PROMETHEUS_PORT = 8413
5762# smg reliability knobs we always want disabled when launched under
6368)
6469
6570
def get_free_port():
    """Return a TCP port number that was free at the time of the call.

    A throwaway socket is bound to port 0 so the OS picks the port. IPv4
    is tried first; if that raises ``OSError`` the same probe is repeated
    over IPv6. The socket is closed before returning, so the port is not
    reserved and another process may claim it before the caller binds it.
    """

    def _bind_and_read_port(family):
        # Bind to port 0 on the wildcard address and report what we got.
        with socket.socket(family, socket.SOCK_STREAM) as probe:
            probe.bind(("", 0))
            return probe.getsockname()[1]

    try:
        return _bind_and_read_port(socket.AF_INET)
    except OSError:
        return _bind_and_read_port(socket.AF_INET6)
80+
81+
def kill_process_tree(
    parent_pid: int | None, include_parent: bool = True, skip_pid: int | None = None
) -> None:
    """Kill every descendant of a process and, optionally, the process itself.

    Args:
        parent_pid: PID at the root of the tree. ``None`` selects the
            current process, in which case only its descendants are killed
            (``include_parent`` is forced to ``False``).
        include_parent: Whether the root process is killed after its
            descendants.
        skip_pid: PID of one descendant to leave alone. Only that exact
            process is skipped; its own children are still part of the
            recursive listing and are killed.

    Processes that have already exited (``psutil.NoSuchProcess``) are
    ignored silently.
    """
    # signal.signal() may only be called from the main thread.
    # NOTE(review): presumably this drops any custom SIGCHLD handler so it
    # does not fire while the children below are dying - confirm.
    if threading.current_thread() is threading.main_thread():
        signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    if parent_pid is None:
        parent_pid = os.getpid()
        include_parent = False

    try:
        root = psutil.Process(parent_pid)
    except psutil.NoSuchProcess:
        return

    descendants = [
        proc for proc in root.children(recursive=True) if proc.pid != skip_pid
    ]
    for proc in descendants:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            pass

    if not include_parent:
        return

    try:
        root.kill()
        if parent_pid == os.getpid():
            # Only reached if killing ourselves returned; exit cleanly.
            sys.exit(0)
        # NOTE(review): SIGQUIT is sent as a follow-up to kill(), presumably
        # for processes that survive it - confirm.
        root.send_signal(signal.SIGQUIT)
    except psutil.NoSuchProcess:
        pass
115+
116+
66117def _check_serve_extra_installed () -> None :
67118 import importlib .util
68119
@@ -216,6 +267,37 @@ def _is_deepseek_v4_model(model_id: str | None) -> bool:
216267 )
217268
218269
270+ def _is_nemotron_h_model (model_id : str | None ) -> bool :
271+ if not model_id :
272+ return False
273+
274+ normalized = model_id .lower ().replace ("_" , "-" )
275+ compact = normalized .replace ("-" , "" )
276+ if (
277+ "nemotron-3" in normalized
278+ or "nemotron-h" in normalized
279+ or "nemotron3" in compact
280+ or "nemotronh" in compact
281+ ):
282+ return True
283+
284+ config_path = Path (model_id ) / "config.json"
285+ if not config_path .is_file ():
286+ return False
287+ try :
288+ with config_path .open () as f :
289+ config = json .load (f )
290+ except (OSError , json .JSONDecodeError ):
291+ return False
292+ if not isinstance (config , dict ):
293+ return False
294+ architectures = config .get ("architectures" ) or []
295+ return (
296+ config .get ("model_type" ) == "nemotron_h"
297+ or "NemotronHForCausalLM" in architectures
298+ )
299+
300+
219301def _args_with_default_model_parsers (
220302 engine_args : list [str ], gateway_args : list [str ]
221303) -> tuple [list [str ], list [str ]]:
@@ -226,7 +308,20 @@ def _args_with_default_model_parsers(
226308 name to defer json_schema grammars past the reasoning channel.
227309 """
228310 model_id = _user_model_id (gateway_args ) or _user_model_id (engine_args )
229- if not _is_deepseek_v4_model (model_id ):
311+ if _is_deepseek_v4_model (model_id ):
312+ engine_reasoning_parser = DEEPSEEK_V4_REASONING_PARSER
313+ gateway_reasoning_parser = DEEPSEEK_V4_REASONING_PARSER
314+ tool_call_parser = DEEPSEEK_V4_TOOL_CALL_PARSER
315+ elif _is_nemotron_h_model (model_id ):
316+ # SGLang detects the Nemotron-3/Super template as ``nemotron_3``:
317+ # Qwen-style ``<think>...</think>`` reasoning with Qwen3-Coder XML
318+ # tool calls. The current smg gateway exposes that reasoning behavior
319+ # under its ``qwen3`` parser name, while the engine keeps the SGLang
320+ # ``nemotron_3`` name for grammar deferral and internal semantics.
321+ engine_reasoning_parser = NEMOTRON_3_REASONING_PARSER
322+ gateway_reasoning_parser = NEMOTRON_3_GATEWAY_REASONING_PARSER
323+ tool_call_parser = NEMOTRON_3_TOOL_CALL_PARSER
324+ else :
230325 return engine_args , gateway_args
231326
232327 engine_result = list (engine_args )
@@ -235,10 +330,10 @@ def _args_with_default_model_parsers(
235330 "--reasoning-parser" not in engine_result
236331 and "--reasoning-parser" not in gateway_result
237332 ):
238- engine_result .extend (["--reasoning-parser" , DEEPSEEK_V4_REASONING_PARSER ])
239- gateway_result .extend (["--reasoning-parser" , DEEPSEEK_V4_REASONING_PARSER ])
333+ engine_result .extend (["--reasoning-parser" , engine_reasoning_parser ])
334+ gateway_result .extend (["--reasoning-parser" , gateway_reasoning_parser ])
240335 if "--tool-call-parser" not in gateway_result :
241- gateway_result .extend (["--tool-call-parser" , DEEPSEEK_V4_TOOL_CALL_PARSER ])
336+ gateway_result .extend (["--tool-call-parser" , tool_call_parser ])
242337 return engine_result , gateway_result
243338
244339
0 commit comments