Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docs/.vitepress/config.mts
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ export default defineConfig({
text: "Guides",
items: [
{ text: "Getting Started", link: "/guides/getting-started" },
{ text: "Launching a Server", link: "/guides/launching" },
{ text: "InstantTensor Loading", link: "/guides/instanttensor" }
{ text: "Launching a Server", link: "/guides/launching" }
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion docs/configuration/server.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ For a compact compatibility table, see
| `--tokenizer` | Tokenizer path when it differs from the model path. |
| `--tokenizer-mode` | Select tokenizer behavior. `auto` uses fast tokenizers and model-specific hooks when available. |
| `--skip-tokenizer-init` | Skip tokenizer initialization for input-ID-only serving paths. |
| `--load-format` | Weight loading format: `auto`, `pt`, `safetensors`, `instanttensor`, `npcache`, `dummy`, or `extensible`. See [InstantTensor](/guides/instanttensor) for the accelerated NVIDIA loader. |
| `--load-format` | Weight loading format: `auto`, `pt`, `safetensors`, `npcache`, `dummy`, or `extensible`. |
| `--trust-remote-code` | Allow custom model code from the model repository. |
| `--revision` | Model branch, tag, or commit. |
| `--download-dir` | Hugging Face download/cache directory. |
Expand Down
70 changes: 0 additions & 70 deletions docs/guides/instanttensor.md

This file was deleted.

1 change: 0 additions & 1 deletion python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ dependencies = [
"fastapi",
"hf_transfer",
"huggingface_hub",
"instanttensor>=0.1.9",
"modelscope",
"msgspec",
"ninja",
Expand Down
5 changes: 0 additions & 5 deletions python/tokenspeed/runtime/configs/load_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ class LoadFormat(str, enum.Enum):
AUTO = "auto"
PT = "pt"
SAFETENSORS = "safetensors"
INSTANTTENSOR = "instanttensor"
NPCACHE = "npcache"
DUMMY = "dummy"
SHARDED_STATE = "sharded_state"
Expand All @@ -52,10 +51,6 @@ class LoadConfig:
not available.
"pt" will load the weights in the pytorch bin format.
"safetensors" will load the weights in the safetensors format.
"instanttensor" will load the safetensors weights on NVIDIA GPUs
using InstantTensor, which accelerates loading via distributed
loading, pipelined prefetching, and direct I/O (with optional
GPUDirect Storage support).
"npcache" will load the weights in pytorch format and store
a numpy cache to speed up the loading.
"dummy" will initialize the weights with random values, which is
Expand Down
8 changes: 1 addition & 7 deletions python/tokenspeed/runtime/model_loader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@
filter_files_not_needed_for_inference,
get_quant_config,
initialize_dummy_weights,
instanttensor_weights_iterator,
np_cache_weights_iterator,
pt_weights_iterator,
safetensors_weights_iterator,
Expand Down Expand Up @@ -254,10 +253,7 @@ def _prepare_weights(
# Some quantized models use .pt files for storing the weights.
if load_format == LoadFormat.AUTO:
allow_patterns = ["*.safetensors", "*.bin"]
elif (
load_format == LoadFormat.SAFETENSORS
or load_format == LoadFormat.INSTANTTENSOR
):
elif load_format == LoadFormat.SAFETENSORS:
use_safetensors = True
allow_patterns = ["*.safetensors"]
elif load_format == LoadFormat.MISTRAL:
Expand Down Expand Up @@ -335,8 +331,6 @@ def _get_weights_iterator(
hf_folder,
hf_weights_files,
)
elif self.load_config.load_format == LoadFormat.INSTANTTENSOR:
weights_iterator = instanttensor_weights_iterator(hf_weights_files)
elif use_safetensors:
weights_iterator = safetensors_weights_iterator(
hf_weights_files,
Expand Down
57 changes: 0 additions & 57 deletions python/tokenspeed/runtime/model_loader/weight_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
import torch
from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
from tokenspeed_kernel.platform import current_platform
from tqdm.auto import tqdm

from tokenspeed.runtime.configs.load_config import LoadConfig
Expand Down Expand Up @@ -481,62 +480,6 @@ def safetensors_weights_iterator(
yield from result.items()


def instanttensor_weights_iterator(
hf_weights_files: list[str],
) -> Generator[tuple[str, torch.Tensor], None, None]:
"""Iterate over the weights in the model safetensor files using the
InstantTensor library.

InstantTensor accelerates loading safetensors weights on NVIDIA GPUs
through distributed loading, pipelined prefetching, and direct I/O. When
the job spans multiple ranks, the world process group is passed to
InstantTensor so reads are sharded across ranks.

Args:
hf_weights_files: Local paths to the ``*.safetensors`` shards to load.

Yields:
``(name, tensor)`` pairs for every tensor in the checkpoint, with the
tensors materialized on the current CUDA device.
"""
try:
import instanttensor
except ImportError as e:
raise ImportError(
"Please install instanttensor via `pip install instanttensor`"
) from e

if not current_platform().is_nvidia:
raise ValueError("InstantTensor requires NVIDIA GPUs")

process_group = None
if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1:
# The default (world) group spans every rank in the job, matching the
# semantics InstantTensor expects for distributed loading.
process_group = torch.distributed.group.WORLD

device = torch.cuda.current_device()

enable_tqdm = (
not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
)

with instanttensor.safe_open(
hf_weights_files, framework="pt", device=device, process_group=process_group
) as f:
# Since InstantTensor 0.1.9, tensors are cloned internally by default,
# so no extra clone is needed here.
yield from tqdm(
f.tensors(),
desc="Loading safetensors using InstantTensor loader",
disable=not enable_tqdm,
bar_format=_BAR_FORMAT,
position=tqdm._get_free_pos(),
total=len(f.keys()),
mininterval=1.0,
)


def pt_weights_iterator(
hf_weights_files: list[str],
) -> Generator[tuple[str, torch.Tensor], None, None]:
Expand Down
70 changes: 27 additions & 43 deletions python/tokenspeed/runtime/models/gpt_oss.py
Original file line number Diff line number Diff line change
Expand Up @@ -716,27 +716,17 @@ def _load_normal_weights(
}

def _load_mxfp4_weights(self, weights, weight_name_mapping: dict):
# The MoE expert tensors dominate the checkpoint. Stream them straight
# into their (pre-allocated) parameter slots as they arrive instead of
# buffering the whole iterator into a list. Buffering is invisible to
# CPU-staging loaders, but a GPU-direct loader (e.g.
# ``--load-format instanttensor``) yields tensors already on the GPU,
# so collecting every expert tensor would keep the entire checkpoint
# resident on the device at once and OOM mid-load. The remaining
# non-expert weights (attention, embeddings, norms, router) are small
# and are collected for the generic ``_load_normal_weights`` pass.

mxfp4_weights = []
normal_weights = []

def expert_weights():
for name, weight in weights:
if ".experts" in name:
yield name, weight
else:
normal_weights.append((name, weight))
for name, weight in weights:
if ".experts" in name:
mxfp4_weights.append((name, weight))

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Keep GPT-OSS MXFP4 expert loading streamed

Here weights is the loader generator, but this loop now appends every .experts tensor before any copy into model parameters occurs. For MXFP4 checkpoints such as openai/gpt-oss-120b, those expert tensors dominate the checkpoint; with the default safetensors iterator this pins all loaded shards in host memory until the list is handed to _load_mxfp4_experts_weights, so large-model loads can OOM instead of streaming each expert into its preallocated slot.

Useful? React with 👍 / 👎.

else:
normal_weights.append((name, weight))

# ``_load_mxfp4_experts_weights`` drains this generator fully, so by the
# time it returns ``normal_weights`` holds every non-expert tensor.
mxfp4_loaded_params = self._load_mxfp4_experts_weights(expert_weights())
mxfp4_loaded_params = self._load_mxfp4_experts_weights(mxfp4_weights)
self._load_normal_weights(
normal_weights,
weight_name_mapping=weight_name_mapping,
Expand Down Expand Up @@ -782,33 +772,27 @@ def _copy_into_param(param, narrow_weight):
)
param.data[slices].copy_(narrow_weight[slices])

# The two MXFP4 expert checkpoint layouts are mutually exclusive and
# are detected from the first expert tensor (a checkpoint is uniformly
# one layout), reproducing the original whole-iterator ``any(...)``
# probe for AMD-Quark per-expert checkpoints (e.g.
# ``amd/gpt-oss-120b-w-mxfp4-a-fp8``: one tensor set per expert plus a
# scalar ``input_scale`` for static FP8 activation quantization)
# without buffering the iterator. Each expert tensor is streamed
# straight into its slot as it arrives.
per_expert_re = re.compile(r"\.experts\.\d+\.(gate_up_proj|down_proj)\.")
per_expert_format = None
# Detect AMD-Quark per-expert checkpoints (e.g.
# ``amd/gpt-oss-120b-w-mxfp4-a-fp8``). These store one set of tensors
# per expert (``...experts.{e}.gate_up_proj.{weight,...}``) plus a
# scalar ``input_scale`` for static FP8 activation quantization.
if any(
re.search(r"\.experts\.\d+\.(gate_up_proj|down_proj)\.", n)
for n, _ in weights
):
return self._load_mxfp4_per_expert_weights(
weights,
params_dict=params_dict,
moe_tp_rank_start=moe_tp_rank_start,
moe_tp_rank_end=moe_tp_rank_end,
moe_ep_rank_start=moe_ep_rank_start,
moe_ep_rank_end=moe_ep_rank_end,
moe_tp_rank=moe_tp_rank,
copy_into_param=_copy_into_param,
mxfp4_block=mxfp4_block,
)

for name, weight in weights:
if per_expert_format is None:
per_expert_format = per_expert_re.search(name) is not None
if per_expert_format:
loaded_params |= self._load_mxfp4_per_expert_weights(
[(name, weight)],
params_dict=params_dict,
moe_tp_rank_start=moe_tp_rank_start,
moe_tp_rank_end=moe_tp_rank_end,
moe_ep_rank_start=moe_ep_rank_start,
moe_ep_rank_end=moe_ep_rank_end,
moe_tp_rank=moe_tp_rank,
copy_into_param=_copy_into_param,
mxfp4_block=mxfp4_block,
)
continue
weight = _WeightCreator.maybe_materialize(weight)

if "gate_up_proj_blocks" in name:
Expand Down
76 changes: 30 additions & 46 deletions python/tokenspeed/runtime/models/kimi_k25.py
Original file line number Diff line number Diff line change
Expand Up @@ -900,60 +900,44 @@ def forward(
)

def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
"""Load weights, streaming language weights to the language model.

The language weights are forwarded to ``language_model.load_weights``
lazily (as a generator) instead of being collected into a list first.
Materializing the whole iterator would keep every loaded tensor alive
at once; that is harmless for CPU-staged loaders but OOMs GPU-direct
loaders (e.g. ``--load-format instanttensor``), which would then hold
the entire model on the device during loading. Vision weights are
small and are still collected, then loaded after the language model.
"""
vision_weights: list[Tuple[str, torch.Tensor]] = []
encoder_only = getattr(self.config, "encoder_only", False)
load_vision = self.is_multimodal_active and not getattr(
self.config, "language_only", False
)
"""Load weights for the model, separating vision and language weights"""
vision_weights = []
language_weights = []

for name, loaded_weight in weights:
# nvidia/Kimi-K2.5-NVFP4 stores decoder layers under
# language_model.layers.*, while TokenSpeed's DeepSeek module
# expects model.layers.* after stripping language_model.
if name.startswith("language_model.layers."):
name = name.replace(
"language_model.layers.", "language_model.model.layers.", 1
)

def language_weights() -> Iterable[Tuple[str, torch.Tensor]]:
for name, loaded_weight in weights:
# nvidia/Kimi-K2.5-NVFP4 stores decoder layers under
# language_model.layers.*, while TokenSpeed's DeepSeek module
# expects model.layers.* after stripping language_model.
if name.startswith("language_model.layers."):
name = name.replace(
"language_model.layers.", "language_model.model.layers.", 1
)

if "vision_tower" in name or "mm_projector" in name:
name = name.replace(r"wqkv.", r"attn.qkv_proj.")
name = name.replace(r"wo.", r"attn.proj.")
name = name.replace("mm_projector.proj.0", "mm_projector.linear_1")
name = name.replace("mm_projector.proj.2", "mm_projector.linear_2")
if load_vision:
vision_weights.append((name, loaded_weight))
else:
yield name.replace("language_model.", ""), loaded_weight

if not encoder_only:
# Consumes the iterator lazily; fills vision_weights as a side
# effect for the multimodal branch below.
self.language_model.load_weights(language_weights())
elif load_vision:
# Encoder-only: still drain the iterator to collect vision weights.
for _ in language_weights():
pass

if load_vision:
if "vision_tower" in name or "mm_projector" in name:
name = name.replace(r"wqkv.", r"attn.qkv_proj.")
name = name.replace(r"wo.", r"attn.proj.")
name = name.replace("mm_projector.proj.0", "mm_projector.linear_1")
name = name.replace("mm_projector.proj.2", "mm_projector.linear_2")
vision_weights.append((name, loaded_weight))
else:
name = name.replace("language_model.", "")
language_weights.append((name, loaded_weight))

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Stream Kimi language weights into the LM

Appending every non-vision tensor means Kimi language weights are all materialized before DeepseekV3ForCausalLM.load_weights is invoked. For large checkpoints such as nvidia/Kimi-K2.5-NVFP4 (still used by the CI configs changed here), the safetensors iterator will keep the whole language checkpoint alive in host RAM rather than freeing shards as DeepSeek consumes them, which can OOM large Kimi loads; pass a generator to language_model.load_weights and only collect vision weights.

Useful? React with 👍 / 👎.


if self.is_multimodal_active and not getattr(
self.config, "language_only", False
):
vision_state_dict = dict(vision_weights)
params_dict = dict(self.named_parameters(remove_duplicate=False))
for name, loaded_weight in vision_weights:
for name, loaded_weight in vision_state_dict.items():
if name not in params_dict:
raise ValueError(f"Weight {name} not found in params_dict")
param = params_dict[name]
weight_loader = getattr(param, "weight_loader", default_weight_loader)
weight_loader(param, loaded_weight)

if not getattr(self.config, "encoder_only", False) and language_weights:
self.language_model.load_weights(language_weights)

@classmethod
def get_model_config_for_expert_location(cls, config: KimiK25Config):
text_config = config.text_config
Expand Down
Loading
Loading