Skip to content

Commit ec60f40

Browse files
revert: remove instanttensor loader (#462)
Signed-off-by: lightseek-bot <243258330+lightseek-bot@users.noreply.github.com>
1 parent 1248c0a commit ec60f40

25 files changed

Lines changed: 60 additions & 468 deletions

docs/.vitepress/config.mts

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,8 +37,7 @@ export default defineConfig({
3737
text: "Guides",
3838
items: [
3939
{ text: "Getting Started", link: "/guides/getting-started" },
40-
{ text: "Launching a Server", link: "/guides/launching" },
41-
{ text: "InstantTensor Loading", link: "/guides/instanttensor" }
40+
{ text: "Launching a Server", link: "/guides/launching" }
4241
]
4342
},
4443
{

docs/configuration/server.md

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ For a compact compatibility table, see
1616
| `--tokenizer` | Tokenizer path when it differs from the model path. |
1717
| `--tokenizer-mode` | Select tokenizer behavior. `auto` uses fast tokenizers and model-specific hooks when available. |
1818
| `--skip-tokenizer-init` | Skip tokenizer initialization for input-ID-only serving paths. |
19-
| `--load-format` | Weight loading format: `auto`, `pt`, `safetensors`, `instanttensor`, `npcache`, `dummy`, or `extensible`. See [InstantTensor](/guides/instanttensor) for the accelerated NVIDIA loader. |
19+
| `--load-format` | Weight loading format: `auto`, `pt`, `safetensors`, `npcache`, `dummy`, or `extensible`. |
2020
| `--trust-remote-code` | Allow custom model code from the model repository. |
2121
| `--revision` | Model branch, tag, or commit. |
2222
| `--download-dir` | Hugging Face download/cache directory. |

docs/guides/instanttensor.md

Lines changed: 0 additions & 70 deletions
This file was deleted.

python/pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,6 @@ dependencies = [
4141
"fastapi",
4242
"hf_transfer",
4343
"huggingface_hub",
44-
"instanttensor>=0.1.9",
4544
"modelscope",
4645
"msgspec",
4746
"ninja",

python/tokenspeed/runtime/configs/load_config.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,6 @@ class LoadFormat(str, enum.Enum):
3333
AUTO = "auto"
3434
PT = "pt"
3535
SAFETENSORS = "safetensors"
36-
INSTANTTENSOR = "instanttensor"
3736
NPCACHE = "npcache"
3837
DUMMY = "dummy"
3938
SHARDED_STATE = "sharded_state"
@@ -52,10 +51,6 @@ class LoadConfig:
5251
not available.
5352
"pt" will load the weights in the pytorch bin format.
5453
"safetensors" will load the weights in the safetensors format.
55-
"instanttensor" will load the safetensors weights on NVIDIA GPUs
56-
using InstantTensor, which accelerates loading via distributed
57-
loading, pipelined prefetching, and direct I/O (with optional
58-
GPUDirect Storage support).
5954
"npcache" will load the weights in pytorch format and store
6055
a numpy cache to speed up the loading.
6156
"dummy" will initialize the weights with random values, which is

python/tokenspeed/runtime/model_loader/loader.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,6 @@
5757
filter_files_not_needed_for_inference,
5858
get_quant_config,
5959
initialize_dummy_weights,
60-
instanttensor_weights_iterator,
6160
np_cache_weights_iterator,
6261
pt_weights_iterator,
6362
safetensors_weights_iterator,
@@ -254,10 +253,7 @@ def _prepare_weights(
254253
# Some quantized models use .pt files for storing the weights.
255254
if load_format == LoadFormat.AUTO:
256255
allow_patterns = ["*.safetensors", "*.bin"]
257-
elif (
258-
load_format == LoadFormat.SAFETENSORS
259-
or load_format == LoadFormat.INSTANTTENSOR
260-
):
256+
elif load_format == LoadFormat.SAFETENSORS:
261257
use_safetensors = True
262258
allow_patterns = ["*.safetensors"]
263259
elif load_format == LoadFormat.MISTRAL:
@@ -335,8 +331,6 @@ def _get_weights_iterator(
335331
hf_folder,
336332
hf_weights_files,
337333
)
338-
elif self.load_config.load_format == LoadFormat.INSTANTTENSOR:
339-
weights_iterator = instanttensor_weights_iterator(hf_weights_files)
340334
elif use_safetensors:
341335
weights_iterator = safetensors_weights_iterator(
342336
hf_weights_files,

python/tokenspeed/runtime/model_loader/weight_utils.py

Lines changed: 0 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -42,7 +42,6 @@
4242
import torch
4343
from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
4444
from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
45-
from tokenspeed_kernel.platform import current_platform
4645
from tqdm.auto import tqdm
4746

4847
from tokenspeed.runtime.configs.load_config import LoadConfig
@@ -481,62 +480,6 @@ def safetensors_weights_iterator(
481480
yield from result.items()
482481

483482

484-
def instanttensor_weights_iterator(
485-
hf_weights_files: list[str],
486-
) -> Generator[tuple[str, torch.Tensor], None, None]:
487-
"""Iterate over the weights in the model safetensor files using the
488-
InstantTensor library.
489-
490-
InstantTensor accelerates loading safetensors weights on NVIDIA GPUs
491-
through distributed loading, pipelined prefetching, and direct I/O. When
492-
the job spans multiple ranks, the world process group is passed to
493-
InstantTensor so reads are sharded across ranks.
494-
495-
Args:
496-
hf_weights_files: Local paths to the ``*.safetensors`` shards to load.
497-
498-
Yields:
499-
``(name, tensor)`` pairs for every tensor in the checkpoint, with the
500-
tensors materialized on the current CUDA device.
501-
"""
502-
try:
503-
import instanttensor
504-
except ImportError as e:
505-
raise ImportError(
506-
"Please install instanttensor via `pip install instanttensor`"
507-
) from e
508-
509-
if not current_platform().is_nvidia:
510-
raise ValueError("InstantTensor requires NVIDIA GPUs")
511-
512-
process_group = None
513-
if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1:
514-
# The default (world) group spans every rank in the job, matching the
515-
# semantics InstantTensor expects for distributed loading.
516-
process_group = torch.distributed.group.WORLD
517-
518-
device = torch.cuda.current_device()
519-
520-
enable_tqdm = (
521-
not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
522-
)
523-
524-
with instanttensor.safe_open(
525-
hf_weights_files, framework="pt", device=device, process_group=process_group
526-
) as f:
527-
# Since InstantTensor 0.1.9, tensors are cloned internally by default,
528-
# so no extra clone is needed here.
529-
yield from tqdm(
530-
f.tensors(),
531-
desc="Loading safetensors using InstantTensor loader",
532-
disable=not enable_tqdm,
533-
bar_format=_BAR_FORMAT,
534-
position=tqdm._get_free_pos(),
535-
total=len(f.keys()),
536-
mininterval=1.0,
537-
)
538-
539-
540483
def pt_weights_iterator(
541484
hf_weights_files: list[str],
542485
) -> Generator[tuple[str, torch.Tensor], None, None]:

python/tokenspeed/runtime/models/gpt_oss.py

Lines changed: 27 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -716,27 +716,17 @@ def _load_normal_weights(
716716
}
717717

718718
def _load_mxfp4_weights(self, weights, weight_name_mapping: dict):
719-
# The MoE expert tensors dominate the checkpoint. Stream them straight
720-
# into their (pre-allocated) parameter slots as they arrive instead of
721-
# buffering the whole iterator into a list. Buffering is invisible to
722-
# CPU-staging loaders, but a GPU-direct loader (e.g.
723-
# ``--load-format instanttensor``) yields tensors already on the GPU,
724-
# so collecting every expert tensor would keep the entire checkpoint
725-
# resident on the device at once and OOM mid-load. The remaining
726-
# non-expert weights (attention, embeddings, norms, router) are small
727-
# and are collected for the generic ``_load_normal_weights`` pass.
719+
720+
mxfp4_weights = []
728721
normal_weights = []
729722

730-
def expert_weights():
731-
for name, weight in weights:
732-
if ".experts" in name:
733-
yield name, weight
734-
else:
735-
normal_weights.append((name, weight))
723+
for name, weight in weights:
724+
if ".experts" in name:
725+
mxfp4_weights.append((name, weight))
726+
else:
727+
normal_weights.append((name, weight))
736728

737-
# ``_load_mxfp4_experts_weights`` drains this generator fully, so by the
738-
# time it returns ``normal_weights`` holds every non-expert tensor.
739-
mxfp4_loaded_params = self._load_mxfp4_experts_weights(expert_weights())
729+
mxfp4_loaded_params = self._load_mxfp4_experts_weights(mxfp4_weights)
740730
self._load_normal_weights(
741731
normal_weights,
742732
weight_name_mapping=weight_name_mapping,
@@ -782,33 +772,27 @@ def _copy_into_param(param, narrow_weight):
782772
)
783773
param.data[slices].copy_(narrow_weight[slices])
784774

785-
# The two MXFP4 expert checkpoint layouts are mutually exclusive and
786-
# are detected from the first expert tensor (a checkpoint is uniformly
787-
# one layout), reproducing the original whole-iterator ``any(...)``
788-
# probe for AMD-Quark per-expert checkpoints (e.g.
789-
# ``amd/gpt-oss-120b-w-mxfp4-a-fp8``: one tensor set per expert plus a
790-
# scalar ``input_scale`` for static FP8 activation quantization)
791-
# without buffering the iterator. Each expert tensor is streamed
792-
# straight into its slot as it arrives.
793-
per_expert_re = re.compile(r"\.experts\.\d+\.(gate_up_proj|down_proj)\.")
794-
per_expert_format = None
775+
# Detect AMD-Quark per-expert checkpoints (e.g.
776+
# ``amd/gpt-oss-120b-w-mxfp4-a-fp8``). These store one set of tensors
777+
# per expert (``...experts.{e}.gate_up_proj.{weight,...}``) plus a
778+
# scalar ``input_scale`` for static FP8 activation quantization.
779+
if any(
780+
re.search(r"\.experts\.\d+\.(gate_up_proj|down_proj)\.", n)
781+
for n, _ in weights
782+
):
783+
return self._load_mxfp4_per_expert_weights(
784+
weights,
785+
params_dict=params_dict,
786+
moe_tp_rank_start=moe_tp_rank_start,
787+
moe_tp_rank_end=moe_tp_rank_end,
788+
moe_ep_rank_start=moe_ep_rank_start,
789+
moe_ep_rank_end=moe_ep_rank_end,
790+
moe_tp_rank=moe_tp_rank,
791+
copy_into_param=_copy_into_param,
792+
mxfp4_block=mxfp4_block,
793+
)
795794

796795
for name, weight in weights:
797-
if per_expert_format is None:
798-
per_expert_format = per_expert_re.search(name) is not None
799-
if per_expert_format:
800-
loaded_params |= self._load_mxfp4_per_expert_weights(
801-
[(name, weight)],
802-
params_dict=params_dict,
803-
moe_tp_rank_start=moe_tp_rank_start,
804-
moe_tp_rank_end=moe_tp_rank_end,
805-
moe_ep_rank_start=moe_ep_rank_start,
806-
moe_ep_rank_end=moe_ep_rank_end,
807-
moe_tp_rank=moe_tp_rank,
808-
copy_into_param=_copy_into_param,
809-
mxfp4_block=mxfp4_block,
810-
)
811-
continue
812796
weight = _WeightCreator.maybe_materialize(weight)
813797

814798
if "gate_up_proj_blocks" in name:

python/tokenspeed/runtime/models/kimi_k25.py

Lines changed: 30 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -900,60 +900,44 @@ def forward(
900900
)
901901

902902
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
903-
"""Load weights, streaming language weights to the language model.
904-
905-
The language weights are forwarded to ``language_model.load_weights``
906-
lazily (as a generator) instead of being collected into a list first.
907-
Materializing the whole iterator would keep every loaded tensor alive
908-
at once; that is harmless for CPU-staged loaders but OOMs GPU-direct
909-
loaders (e.g. ``--load-format instanttensor``), which would then hold
910-
the entire model on the device during loading. Vision weights are
911-
small and are still collected, then loaded after the language model.
912-
"""
913-
vision_weights: list[Tuple[str, torch.Tensor]] = []
914-
encoder_only = getattr(self.config, "encoder_only", False)
915-
load_vision = self.is_multimodal_active and not getattr(
916-
self.config, "language_only", False
917-
)
903+
"""Load weights for the model, separating vision and language weights"""
904+
vision_weights = []
905+
language_weights = []
906+
907+
for name, loaded_weight in weights:
908+
# nvidia/Kimi-K2.5-NVFP4 stores decoder layers under
909+
# language_model.layers.*, while TokenSpeed's DeepSeek module
910+
# expects model.layers.* after stripping language_model.
911+
if name.startswith("language_model.layers."):
912+
name = name.replace(
913+
"language_model.layers.", "language_model.model.layers.", 1
914+
)
918915

919-
def language_weights() -> Iterable[Tuple[str, torch.Tensor]]:
920-
for name, loaded_weight in weights:
921-
# nvidia/Kimi-K2.5-NVFP4 stores decoder layers under
922-
# language_model.layers.*, while TokenSpeed's DeepSeek module
923-
# expects model.layers.* after stripping language_model.
924-
if name.startswith("language_model.layers."):
925-
name = name.replace(
926-
"language_model.layers.", "language_model.model.layers.", 1
927-
)
928-
929-
if "vision_tower" in name or "mm_projector" in name:
930-
name = name.replace(r"wqkv.", r"attn.qkv_proj.")
931-
name = name.replace(r"wo.", r"attn.proj.")
932-
name = name.replace("mm_projector.proj.0", "mm_projector.linear_1")
933-
name = name.replace("mm_projector.proj.2", "mm_projector.linear_2")
934-
if load_vision:
935-
vision_weights.append((name, loaded_weight))
936-
else:
937-
yield name.replace("language_model.", ""), loaded_weight
938-
939-
if not encoder_only:
940-
# Consumes the iterator lazily; fills vision_weights as a side
941-
# effect for the multimodal branch below.
942-
self.language_model.load_weights(language_weights())
943-
elif load_vision:
944-
# Encoder-only: still drain the iterator to collect vision weights.
945-
for _ in language_weights():
946-
pass
947-
948-
if load_vision:
916+
if "vision_tower" in name or "mm_projector" in name:
917+
name = name.replace(r"wqkv.", r"attn.qkv_proj.")
918+
name = name.replace(r"wo.", r"attn.proj.")
919+
name = name.replace("mm_projector.proj.0", "mm_projector.linear_1")
920+
name = name.replace("mm_projector.proj.2", "mm_projector.linear_2")
921+
vision_weights.append((name, loaded_weight))
922+
else:
923+
name = name.replace("language_model.", "")
924+
language_weights.append((name, loaded_weight))
925+
926+
if self.is_multimodal_active and not getattr(
927+
self.config, "language_only", False
928+
):
929+
vision_state_dict = dict(vision_weights)
949930
params_dict = dict(self.named_parameters(remove_duplicate=False))
950-
for name, loaded_weight in vision_weights:
931+
for name, loaded_weight in vision_state_dict.items():
951932
if name not in params_dict:
952933
raise ValueError(f"Weight {name} not found in params_dict")
953934
param = params_dict[name]
954935
weight_loader = getattr(param, "weight_loader", default_weight_loader)
955936
weight_loader(param, loaded_weight)
956937

938+
if not getattr(self.config, "encoder_only", False) and language_weights:
939+
self.language_model.load_weights(language_weights)
940+
957941
@classmethod
958942
def get_model_config_for_expert_location(cls, config: KimiK25Config):
959943
text_config = config.text_config

0 commit comments

Comments
 (0)