lightseekorg · torchspec-bot · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · chatgpt-codex-connector
diff --git a/docs/.vitepress/config.mts b/docs/.vitepress/config.mts
@@ -37,8 +37,7 @@ export default defineConfig({
         text: "Guides",
         items: [
           { text: "Getting Started", link: "/guides/getting-started" },
-          { text: "Launching a Server", link: "/guides/launching" },
-          { text: "InstantTensor Loading", link: "/guides/instanttensor" }
+          { text: "Launching a Server", link: "/guides/launching" }
         ]
       },
       {

diff --git a/docs/configuration/server.md b/docs/configuration/server.md
@@ -16,7 +16,7 @@ For a compact compatibility table, see
 | `--tokenizer` | Tokenizer path when it differs from the model path. |
 | `--tokenizer-mode` | Select tokenizer behavior. `auto` uses fast tokenizers and model-specific hooks when available. |
 | `--skip-tokenizer-init` | Skip tokenizer initialization for input-ID-only serving paths. |
-| `--load-format` | Weight loading format: `auto`, `pt`, `safetensors`, `instanttensor`, `npcache`, `dummy`, or `extensible`. See [InstantTensor](/guides/instanttensor) for the accelerated NVIDIA loader. |
+| `--load-format` | Weight loading format: `auto`, `pt`, `safetensors`, `npcache`, `dummy`, or `extensible`. |
 | `--trust-remote-code` | Allow custom model code from the model repository. |
 | `--revision` | Model branch, tag, or commit. |
 | `--download-dir` | Hugging Face download/cache directory. |

diff --git a/docs/guides/instanttensor.md b/docs/guides/instanttensor.md
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -41,7 +41,6 @@ dependencies = [
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
-    "instanttensor>=0.1.9",
     "modelscope",
     "msgspec",
     "ninja",

@@ -33,7 +33,6 @@ class LoadFormat(str, enum.Enum):
     AUTO = "auto"
     PT = "pt"
     SAFETENSORS = "safetensors"
-    INSTANTTENSOR = "instanttensor"
     NPCACHE = "npcache"
     DUMMY = "dummy"
     SHARDED_STATE = "sharded_state"
@@ -52,10 +51,6 @@ class LoadConfig:
             not available.
         "pt" will load the weights in the pytorch bin format.
         "safetensors" will load the weights in the safetensors format.
-        "instanttensor" will load the safetensors weights on NVIDIA GPUs
-            using InstantTensor, which accelerates loading via distributed
-            loading, pipelined prefetching, and direct I/O (with optional
-            GPUDirect Storage support).
         "npcache" will load the weights in pytorch format and store
             a numpy cache to speed up the loading.
         "dummy" will initialize the weights with random values, which is

@@ -57,7 +57,6 @@
     filter_files_not_needed_for_inference,
     get_quant_config,
     initialize_dummy_weights,
-    instanttensor_weights_iterator,
     np_cache_weights_iterator,
     pt_weights_iterator,
     safetensors_weights_iterator,
@@ -254,10 +253,7 @@ def _prepare_weights(
         # Some quantized models use .pt files for storing the weights.
         if load_format == LoadFormat.AUTO:
             allow_patterns = ["*.safetensors", "*.bin"]
-        elif (
-            load_format == LoadFormat.SAFETENSORS
-            or load_format == LoadFormat.INSTANTTENSOR
-        ):
+        elif load_format == LoadFormat.SAFETENSORS:
             use_safetensors = True
             allow_patterns = ["*.safetensors"]
         elif load_format == LoadFormat.MISTRAL:
@@ -335,8 +331,6 @@ def _get_weights_iterator(
                 hf_folder,
                 hf_weights_files,
             )
-        elif self.load_config.load_format == LoadFormat.INSTANTTENSOR:
-            weights_iterator = instanttensor_weights_iterator(hf_weights_files)
         elif use_safetensors:
             weights_iterator = safetensors_weights_iterator(
                 hf_weights_files,

@@ -42,7 +42,6 @@
 import torch
 from huggingface_hub import HfFileSystem, hf_hub_download, snapshot_download
 from pydantic import BaseModel, ConfigDict, ValidationInfo, model_validator
-from tokenspeed_kernel.platform import current_platform
 from tqdm.auto import tqdm
 
 from tokenspeed.runtime.configs.load_config import LoadConfig
@@ -481,62 +480,6 @@ def safetensors_weights_iterator(
         yield from result.items()
 
 
-def instanttensor_weights_iterator(
-    hf_weights_files: list[str],
-) -> Generator[tuple[str, torch.Tensor], None, None]:
-    """Iterate over the weights in the model safetensor files using the
-    InstantTensor library.
-
-    InstantTensor accelerates loading safetensors weights on NVIDIA GPUs
-    through distributed loading, pipelined prefetching, and direct I/O. When
-    the job spans multiple ranks, the world process group is passed to
-    InstantTensor so reads are sharded across ranks.
-
-    Args:
-        hf_weights_files: Local paths to the ``*.safetensors`` shards to load.
-
-    Yields:
-        ``(name, tensor)`` pairs for every tensor in the checkpoint, with the
-        tensors materialized on the current CUDA device.
-    """
-    try:
-        import instanttensor
-    except ImportError as e:
-        raise ImportError(
-            "Please install instanttensor via `pip install instanttensor`"
-        ) from e
-
-    if not current_platform().is_nvidia:
-        raise ValueError("InstantTensor requires NVIDIA GPUs")
-
-    process_group = None
-    if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1:
-        # The default (world) group spans every rank in the job, matching the
-        # semantics InstantTensor expects for distributed loading.
-        process_group = torch.distributed.group.WORLD
-
-    device = torch.cuda.current_device()
-
-    enable_tqdm = (
-        not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0
-    )
-
-    with instanttensor.safe_open(
-        hf_weights_files, framework="pt", device=device, process_group=process_group
-    ) as f:
-        # Since InstantTensor 0.1.9, tensors are cloned internally by default,
-        # so no extra clone is needed here.
-        yield from tqdm(
-            f.tensors(),
-            desc="Loading safetensors using InstantTensor loader",
-            disable=not enable_tqdm,
-            bar_format=_BAR_FORMAT,
-            position=tqdm._get_free_pos(),
-            total=len(f.keys()),
-            mininterval=1.0,
-        )
-
-
 def pt_weights_iterator(
     hf_weights_files: list[str],
 ) -> Generator[tuple[str, torch.Tensor], None, None]:

@@ -716,27 +716,17 @@ def _load_normal_weights(
         }
 
     def _load_mxfp4_weights(self, weights, weight_name_mapping: dict):
-        # The MoE expert tensors dominate the checkpoint. Stream them straight
-        # into their (pre-allocated) parameter slots as they arrive instead of
-        # buffering the whole iterator into a list. Buffering is invisible to
-        # CPU-staging loaders, but a GPU-direct loader (e.g.
-        # ``--load-format instanttensor``) yields tensors already on the GPU,
-        # so collecting every expert tensor would keep the entire checkpoint
-        # resident on the device at once and OOM mid-load. The remaining
-        # non-expert weights (attention, embeddings, norms, router) are small
-        # and are collected for the generic ``_load_normal_weights`` pass.
+
+        mxfp4_weights = []
         normal_weights = []
 
-        def expert_weights():
-            for name, weight in weights:
-                if ".experts" in name:
-                    yield name, weight
-                else:
-                    normal_weights.append((name, weight))
+        for name, weight in weights:
+            if ".experts" in name:
+                mxfp4_weights.append((name, weight))
+            else:
+                normal_weights.append((name, weight))
 
-        # ``_load_mxfp4_experts_weights`` drains this generator fully, so by the
-        # time it returns ``normal_weights`` holds every non-expert tensor.
-        mxfp4_loaded_params = self._load_mxfp4_experts_weights(expert_weights())
+        mxfp4_loaded_params = self._load_mxfp4_experts_weights(mxfp4_weights)
         self._load_normal_weights(
             normal_weights,
             weight_name_mapping=weight_name_mapping,
@@ -782,33 +772,27 @@ def _copy_into_param(param, narrow_weight):
                 )
                 param.data[slices].copy_(narrow_weight[slices])
 
-        # The two MXFP4 expert checkpoint layouts are mutually exclusive and
-        # are detected from the first expert tensor (a checkpoint is uniformly
-        # one layout), reproducing the original whole-iterator ``any(...)``
-        # probe for AMD-Quark per-expert checkpoints (e.g.
-        # ``amd/gpt-oss-120b-w-mxfp4-a-fp8``: one tensor set per expert plus a
-        # scalar ``input_scale`` for static FP8 activation quantization)
-        # without buffering the iterator. Each expert tensor is streamed
-        # straight into its slot as it arrives.
-        per_expert_re = re.compile(r"\.experts\.\d+\.(gate_up_proj|down_proj)\.")
-        per_expert_format = None
+        # Detect AMD-Quark per-expert checkpoints (e.g.
+        # ``amd/gpt-oss-120b-w-mxfp4-a-fp8``). These store one set of tensors
+        # per expert (``...experts.{e}.gate_up_proj.{weight,...}``) plus a
+        # scalar ``input_scale`` for static FP8 activation quantization.
+        if any(
+            re.search(r"\.experts\.\d+\.(gate_up_proj|down_proj)\.", n)
+            for n, _ in weights
+        ):
+            return self._load_mxfp4_per_expert_weights(
+                weights,
+                params_dict=params_dict,
+                moe_tp_rank_start=moe_tp_rank_start,
+                moe_tp_rank_end=moe_tp_rank_end,
+                moe_ep_rank_start=moe_ep_rank_start,
+                moe_ep_rank_end=moe_ep_rank_end,
+                moe_tp_rank=moe_tp_rank,
+                copy_into_param=_copy_into_param,
+                mxfp4_block=mxfp4_block,
+            )
 
         for name, weight in weights:
-            if per_expert_format is None:
-                per_expert_format = per_expert_re.search(name) is not None
-            if per_expert_format:
-                loaded_params |= self._load_mxfp4_per_expert_weights(
-                    [(name, weight)],
-                    params_dict=params_dict,
-                    moe_tp_rank_start=moe_tp_rank_start,
-                    moe_tp_rank_end=moe_tp_rank_end,
-                    moe_ep_rank_start=moe_ep_rank_start,
-                    moe_ep_rank_end=moe_ep_rank_end,
-                    moe_tp_rank=moe_tp_rank,
-                    copy_into_param=_copy_into_param,
-                    mxfp4_block=mxfp4_block,
-                )
-                continue
             weight = _WeightCreator.maybe_materialize(weight)
 
             if "gate_up_proj_blocks" in name:

@@ -900,60 +900,44 @@ def forward(
         )
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        """Load weights, streaming language weights to the language model.
-
-        The language weights are forwarded to ``language_model.load_weights``
-        lazily (as a generator) instead of being collected into a list first.
-        Materializing the whole iterator would keep every loaded tensor alive
-        at once; that is harmless for CPU-staged loaders but OOMs GPU-direct
-        loaders (e.g. ``--load-format instanttensor``), which would then hold
-        the entire model on the device during loading. Vision weights are
-        small and are still collected, then loaded after the language model.
-        """
-        vision_weights: list[Tuple[str, torch.Tensor]] = []
-        encoder_only = getattr(self.config, "encoder_only", False)
-        load_vision = self.is_multimodal_active and not getattr(
-            self.config, "language_only", False
-        )
+        """Load weights for the model, separating vision and language weights"""
+        vision_weights = []
+        language_weights = []
+
+        for name, loaded_weight in weights:
+            # nvidia/Kimi-K2.5-NVFP4 stores decoder layers under
+            # language_model.layers.*, while TokenSpeed's DeepSeek module
+            # expects model.layers.* after stripping language_model.
+            if name.startswith("language_model.layers."):
+                name = name.replace(
+                    "language_model.layers.", "language_model.model.layers.", 1
+                )
 
-        def language_weights() -> Iterable[Tuple[str, torch.Tensor]]:
-            for name, loaded_weight in weights:
-                # nvidia/Kimi-K2.5-NVFP4 stores decoder layers under
-                # language_model.layers.*, while TokenSpeed's DeepSeek module
-                # expects model.layers.* after stripping language_model.
-                if name.startswith("language_model.layers."):
-                    name = name.replace(
-                        "language_model.layers.", "language_model.model.layers.", 1
-                    )
-
-                if "vision_tower" in name or "mm_projector" in name:
-                    name = name.replace(r"wqkv.", r"attn.qkv_proj.")
-                    name = name.replace(r"wo.", r"attn.proj.")
-                    name = name.replace("mm_projector.proj.0", "mm_projector.linear_1")
-                    name = name.replace("mm_projector.proj.2", "mm_projector.linear_2")
-                    if load_vision:
-                        vision_weights.append((name, loaded_weight))
-                else:
-                    yield name.replace("language_model.", ""), loaded_weight
-
-        if not encoder_only:
-            # Consumes the iterator lazily; fills vision_weights as a side
-            # effect for the multimodal branch below.
-            self.language_model.load_weights(language_weights())
-        elif load_vision:
-            # Encoder-only: still drain the iterator to collect vision weights.
-            for _ in language_weights():
-                pass
-
-        if load_vision:
+            if "vision_tower" in name or "mm_projector" in name:
+                name = name.replace(r"wqkv.", r"attn.qkv_proj.")
+                name = name.replace(r"wo.", r"attn.proj.")
+                name = name.replace("mm_projector.proj.0", "mm_projector.linear_1")
+                name = name.replace("mm_projector.proj.2", "mm_projector.linear_2")
+                vision_weights.append((name, loaded_weight))
+            else:
+                name = name.replace("language_model.", "")
+                language_weights.append((name, loaded_weight))
+
+        if self.is_multimodal_active and not getattr(
+            self.config, "language_only", False
+        ):
+            vision_state_dict = dict(vision_weights)
             params_dict = dict(self.named_parameters(remove_duplicate=False))
-            for name, loaded_weight in vision_weights:
+            for name, loaded_weight in vision_state_dict.items():
                 if name not in params_dict:
                     raise ValueError(f"Weight {name} not found in params_dict")
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
+        if not getattr(self.config, "encoder_only", False) and language_weights:
+            self.language_model.load_weights(language_weights)
+
     @classmethod
     def get_model_config_for_expert_location(cls, config: KimiK25Config):
         text_config = config.text_config