-
Notifications
You must be signed in to change notification settings - Fork 160
revert: remove instanttensor loader #462
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -900,60 +900,44 @@ def forward( | |
| ) | ||
|
|
||
| def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): | ||
| """Load weights, streaming language weights to the language model. | ||
|
|
||
| The language weights are forwarded to ``language_model.load_weights`` | ||
| lazily (as a generator) instead of being collected into a list first. | ||
| Materializing the whole iterator would keep every loaded tensor alive | ||
| at once; that is harmless for CPU-staged loaders but OOMs GPU-direct | ||
| loaders (e.g. ``--load-format instanttensor``), which would then hold | ||
| the entire model on the device during loading. Vision weights are | ||
| small and are still collected, then loaded after the language model. | ||
| """ | ||
| vision_weights: list[Tuple[str, torch.Tensor]] = [] | ||
| encoder_only = getattr(self.config, "encoder_only", False) | ||
| load_vision = self.is_multimodal_active and not getattr( | ||
| self.config, "language_only", False | ||
| ) | ||
| """Load weights for the model, separating vision and language weights""" | ||
| vision_weights = [] | ||
| language_weights = [] | ||
|
|
||
| for name, loaded_weight in weights: | ||
| # nvidia/Kimi-K2.5-NVFP4 stores decoder layers under | ||
| # language_model.layers.*, while TokenSpeed's DeepSeek module | ||
| # expects model.layers.* after stripping language_model. | ||
| if name.startswith("language_model.layers."): | ||
| name = name.replace( | ||
| "language_model.layers.", "language_model.model.layers.", 1 | ||
| ) | ||
|
|
||
| def language_weights() -> Iterable[Tuple[str, torch.Tensor]]: | ||
| for name, loaded_weight in weights: | ||
| # nvidia/Kimi-K2.5-NVFP4 stores decoder layers under | ||
| # language_model.layers.*, while TokenSpeed's DeepSeek module | ||
| # expects model.layers.* after stripping language_model. | ||
| if name.startswith("language_model.layers."): | ||
| name = name.replace( | ||
| "language_model.layers.", "language_model.model.layers.", 1 | ||
| ) | ||
|
|
||
| if "vision_tower" in name or "mm_projector" in name: | ||
| name = name.replace(r"wqkv.", r"attn.qkv_proj.") | ||
| name = name.replace(r"wo.", r"attn.proj.") | ||
| name = name.replace("mm_projector.proj.0", "mm_projector.linear_1") | ||
| name = name.replace("mm_projector.proj.2", "mm_projector.linear_2") | ||
| if load_vision: | ||
| vision_weights.append((name, loaded_weight)) | ||
| else: | ||
| yield name.replace("language_model.", ""), loaded_weight | ||
|
|
||
| if not encoder_only: | ||
| # Consumes the iterator lazily; fills vision_weights as a side | ||
| # effect for the multimodal branch below. | ||
| self.language_model.load_weights(language_weights()) | ||
| elif load_vision: | ||
| # Encoder-only: still drain the iterator to collect vision weights. | ||
| for _ in language_weights(): | ||
| pass | ||
|
|
||
| if load_vision: | ||
| if "vision_tower" in name or "mm_projector" in name: | ||
| name = name.replace(r"wqkv.", r"attn.qkv_proj.") | ||
| name = name.replace(r"wo.", r"attn.proj.") | ||
| name = name.replace("mm_projector.proj.0", "mm_projector.linear_1") | ||
| name = name.replace("mm_projector.proj.2", "mm_projector.linear_2") | ||
| vision_weights.append((name, loaded_weight)) | ||
| else: | ||
| name = name.replace("language_model.", "") | ||
| language_weights.append((name, loaded_weight)) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Appending every non-vision tensor means Kimi language weights are all materialized before `language_model.load_weights` is called.
Useful? React with 👍 / 👎. |
||
|
|
||
| if self.is_multimodal_active and not getattr( | ||
| self.config, "language_only", False | ||
| ): | ||
| vision_state_dict = dict(vision_weights) | ||
| params_dict = dict(self.named_parameters(remove_duplicate=False)) | ||
| for name, loaded_weight in vision_weights: | ||
| for name, loaded_weight in vision_state_dict.items(): | ||
| if name not in params_dict: | ||
| raise ValueError(f"Weight {name} not found in params_dict") | ||
| param = params_dict[name] | ||
| weight_loader = getattr(param, "weight_loader", default_weight_loader) | ||
| weight_loader(param, loaded_weight) | ||
|
|
||
| if not getattr(self.config, "encoder_only", False) and language_weights: | ||
| self.language_model.load_weights(language_weights) | ||
|
|
||
| @classmethod | ||
| def get_model_config_for_expert_location(cls, config: KimiK25Config): | ||
| text_config = config.text_config | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Here `weights` is the loader generator, but this loop now appends every `.experts` tensor before any copy into model parameters occurs. For MXFP4 checkpoints such as `openai/gpt-oss-120b`, those expert tensors dominate the checkpoint; with the default safetensors iterator this pins all loaded shards in host memory until the list is handed to `_load_mxfp4_experts_weights`, so large-model loads can OOM instead of streaming each expert into its preallocated slot.
Useful? React with 👍 / 👎.