vllm-project
diff --git a/‎docs/user_guide/examples/online_serving/diffusers_pipeline_adapter.md‎
Lines changed: 73 additions & 3 deletions b/‎docs/user_guide/examples/online_serving/diffusers_pipeline_adapter.md‎
Lines changed: 73 additions & 3 deletions
diff --git a/‎tests/diffusion/test_diffusion_single_file.py‎
Lines changed: 182 additions & 0 deletions b/‎tests/diffusion/test_diffusion_single_file.py‎
Lines changed: 182 additions & 0 deletions
diff --git a/‎vllm_omni/diffusion/data.py‎
Lines changed: 79 additions & 3 deletions b/‎vllm_omni/diffusion/data.py‎
Lines changed: 79 additions & 3 deletions
diff --git a/‎vllm_omni/diffusion/diffusion_engine.py‎
Lines changed: 4 additions & 1 deletion b/‎vllm_omni/diffusion/diffusion_engine.py‎
Lines changed: 4 additions & 1 deletion
@@ -39,9 +39,79 @@ vllm serve "stable-diffusion-v1-5/stable-diffusion-v1-5" \
     --diffusion-load-format diffusers
 ```
 
-Users turn on the diffusers backend primarily through `--diffusion-load-format diffusers` argument.
-There are two more optional arguments, `--diffusers-load-kwargs` and `--diffusers-call-kwargs`,
-which are only valid together with `--diffusion-load-format diffusers`.
+Users turn on the diffusers backend primarily through the `--diffusion-load-format diffusers` argument.
+
+### Single-File Checkpoints (e.g. Anima)
+
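+Single-file checkpoints (such as `.safetensors` or `.ckpt`) can be loaded with the `--diffusion-load-format diffusers_single_file` argument. The flag may be omitted when `--model` points at a local checkpoint file: the single-file format is then selected automatically, unless `--model-class-name` names a native single-file pipeline such as `AnimaPipeline`.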
+
+If a custom pipeline class (such as `AnimaModularPipeline`) is needed, specify it using `--model-class-name`:
+
+```bash
+vllm serve "/path/to/anima.safetensors" \
+    --omni \
+    --diffusion-load-format diffusers_single_file \
+    --model-class-name AnimaModularPipeline
+```
+
+For the native Anima path, use `--model-class-name AnimaPipeline` without `--diffusion-load-format diffusers_single_file`. The native path reads the Anima transformer single-file checkpoint directly, converts original Cosmos transformer keys when needed, and loads the Cosmos transformer and text conditioner into vLLM-Omni native modules.
+
+The native path also needs the non-denoiser components (`text_encoder`, `tokenizer`, `t5_tokenizer`, `vae`, and optionally `scheduler`). These must be in Diffusers `from_pretrained()` layout. Raw Anima auxiliary files such as `qwen_3_06b_base.safetensors` and `qwen_image_vae.safetensors` are converter inputs; they cannot be passed directly as the `components_path` entry of `--diffusers-load-kwargs`.
+
+Use the Anima converter from the Diffusers reference implementation to prepare the component directory:
+
+```bash
+python /path/to/convert_anima_to_diffusers.py \
+    --transformer_ckpt_path /path/to/anima-preview3-base.safetensors \
+    --text_encoder_ckpt_path /path/to/qwen_3_06b_base.safetensors \
+    --vae_ckpt_path /path/to/qwen_image_vae.safetensors \
+    --qwen_tokenizer_path /path/to/qwen-tokenizer \
+    --t5_tokenizer_path /path/to/t5-tokenizer \
+    --output_path /path/to/anima-components \
+    --save_pipeline
+```
+
+Then point `--model` at the raw Anima transformer checkpoint and `components_path` at the converted directory:
+
+```bash
+vllm serve "/path/to/anima.safetensors" \
+    --omni \
+    --model-class-name AnimaPipeline \
+    --diffusers-load-kwargs '{
+      "components_path": "/path/to/anima-components"
+    }'
+```
+
+Advanced execution features such as step execution, TP/SP, CFG parallel, HSDP, and Cache-DiT must stay disabled until CUDA parity is validated.
+
+Before enabling the native Anima path in a PR, record CUDA validation evidence on the target GPU:
+
+```bash
+# Native vLLM-Omni path.
+vllm serve "/path/to/anima-preview3-base.safetensors" \
+    --omni \
+    --model-class-name AnimaPipeline \
+    --enable-diffusion-pipeline-profiler \
+    --diffusers-load-kwargs '{"components_path": "/path/to/anima-components"}'
+
+# Diffusers adapter baseline using the same checkpoint, prompt, size, seed, and step count.
+vllm serve "/path/to/anima-preview3-base.safetensors" \
+    --omni \
+    --diffusion-load-format diffusers_single_file \
+    --model-class-name AnimaModularPipeline \
+    --enable-diffusion-pipeline-profiler
+```
+
+The validation report should include:
+
+| Item | Required evidence |
+|------|-------------------|
+| Output parity | Native and Diffusers-adapter samples from the same prompt, resolution, step count, and seed |
+| Latency | End-to-end time for native and Diffusers-adapter runs on the same GPU |
+| Memory | Peak VRAM for native and Diffusers-adapter runs |
+| Profiler | `torch.profiler` or `nsys` summary showing no unexpected CPU fallback, host/device copy storm, or synchronization hotspot in the native transformer/text-conditioner path |
+
+The diffusers backend accepts two further optional arguments, `--diffusers-load-kwargs` and `--diffusers-call-kwargs`, which are only valid together with `--diffusion-load-format diffusers` or `diffusers_single_file`. The one exception is native Anima, which also accepts `--diffusers-load-kwargs` for component paths such as `components_path`, but does not delegate denoising to Diffusers.
 
 After launching the model, users send a request as usual. Refer to other documentation pages on how to request a particular input/output modality, such as `examples/online_serving/text_to_image/openai_chat_client.py`.
 
 
@@ -0,0 +1,182 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from types import SimpleNamespace
+
+import diffusers
+import torch
+from safetensors.torch import save_file
+
+from vllm_omni.diffusion.data import OmniDiffusionConfig
+
+
+def test_anima_registration():
+    """Importing the Anima package exposes its classes via diffusers and the registry."""
+    # Imported only for its side effect: the assertions below expect the Anima
+    # classes to be reachable through the diffusers namespaces afterwards.
+    import vllm_omni.diffusion.models.anima  # noqa: F401
+    from vllm_omni.diffusion.registry import DiffusionModelRegistry
+
+    expected_attributes = (
+        (diffusers, "AnimaModularPipeline"),
+        (diffusers.modular_pipelines, "AnimaModularPipeline"),
+        (diffusers.models, "AnimaTextConditioner"),
+        (diffusers.models.condition_embedders, "AnimaTextConditioner"),
+    )
+    for namespace, attribute in expected_attributes:
+        assert hasattr(namespace, attribute)
+
+    native_cls = DiffusionModelRegistry._try_load_model_cls("AnimaPipeline")
+    assert native_cls is not None
+
+
+def test_enrich_config_single_file(tmp_path):
+    """An explicit single-file load format routes a custom class through the adapter."""
+    checkpoint_file = tmp_path / "model.safetensors"
+    checkpoint_file.write_text("dummy")
+    od_config = OmniDiffusionConfig(
+        model=str(checkpoint_file),
+        diffusion_load_format="diffusers_single_file",
+        model_class_name="AnimaModularPipeline",
+    )
+
+    od_config.enrich_config()
+
+    # The requested class is kept as the wrapped diffusers pipeline, while the
+    # model class itself becomes the generic adapter.
+    assert od_config.model_class_name == "DiffusersAdapterPipeline"
+    assert od_config.diffusers_pipeline_cls is diffusers.AnimaModularPipeline
+
+
+def test_enrich_config_single_file_autodetects_local_file(tmp_path):
+    """A local checkpoint file selects the single-file format without an explicit flag."""
+    checkpoint_file = tmp_path / "model.safetensors"
+    checkpoint_file.write_text("dummy")
+    # No diffusion_load_format is passed on purpose.
+    od_config = OmniDiffusionConfig(
+        model=str(checkpoint_file),
+        model_class_name="AnimaModularPipeline",
+    )
+
+    od_config.enrich_config()
+
+    assert od_config.diffusion_load_format == "diffusers_single_file"
+    assert od_config.model_class_name == "DiffusersAdapterPipeline"
+    assert od_config.diffusers_pipeline_cls is diffusers.AnimaModularPipeline
+
+
+def test_enrich_config_native_anima_single_file_stays_native(tmp_path):
+    """The native Anima class keeps the default load format for a single-file model."""
+    checkpoint_file = tmp_path / "model.safetensors"
+    checkpoint_file.write_text("dummy")
+    od_config = OmniDiffusionConfig(
+        model=str(checkpoint_file),
+        model_class_name="AnimaPipeline",
+    )
+
+    od_config.enrich_config()
+
+    # Nothing should have been rerouted through the diffusers adapter.
+    assert od_config.diffusion_load_format == "default"
+    assert od_config.model_class_name == "AnimaPipeline"
+    assert od_config.diffusers_pipeline_cls is None
+
+
+def test_native_anima_single_file_allows_load_kwargs(tmp_path):
+    """Native Anima accepts diffusers_load_kwargs without the diffusers load format."""
+    checkpoint_file = tmp_path / "model.safetensors"
+    checkpoint_file.write_text("dummy")
+    load_kwargs = {"local_files_only": True}
+    od_config = OmniDiffusionConfig(
+        model=str(checkpoint_file),
+        model_class_name="AnimaPipeline",
+        diffusers_load_kwargs=load_kwargs,
+    )
+
+    od_config.enrich_config()
+
+    assert od_config.diffusion_load_format == "default"
+    # Compared against a fresh literal so the check does not depend on identity.
+    assert od_config.diffusers_load_kwargs == {"local_files_only": True}
+
+
+def test_native_anima_converts_original_cosmos_transformer_keys():
+    """Original Cosmos checkpoint keys are renamed and the bookkeeping key is dropped."""
+    from vllm_omni.diffusion.models.anima.pipeline_anima import AnimaPipeline
+
+    # Plain strings stand in for tensors; only the key mapping is under test.
+    original_state = {
+        "net.x_embedder.proj.1.weight": "patch",
+        "net.blocks.0.self_attn.q_proj.weight": "q",
+        "net.blocks.0.self_attn.q_norm.weight": "q_norm",
+        "net.blocks.0.mlp.layer1.weight": "mlp",
+        "net.final_layer.linear.weight": "out",
+        "net.accum_iteration": "drop",
+    }
+    expected_state = {
+        "patch_embed.proj.weight": "patch",
+        "transformer_blocks.0.attn1.to_q.weight": "q",
+        "transformer_blocks.0.attn1.norm_q.weight": "q_norm",
+        "transformer_blocks.0.ff.net.0.proj.weight": "mlp",
+        "proj_out.weight": "out",
+    }
+
+    renamed_state = AnimaPipeline._convert_cosmos_2_transformer_state_dict(original_state)
+
+    assert renamed_state == expected_state
+
+
+def test_native_anima_loads_synthetic_single_file(tmp_path, monkeypatch):
+    """Native denoiser loading reproduces synthetic weights with and without an explicit state dict."""
+    import vllm_omni.diffusion.models.anima.pipeline_anima as pipeline_anima
+    from vllm_omni.diffusion.models.anima.native_cosmos_transformer import NativeCosmosTransformer3DModel
+    from vllm_omni.diffusion.models.anima.native_text_conditioner import NativeAnimaTextConditioner
+
+    def snapshot(module):
+        # Detached clones, so the reference weights are independent copies.
+        return {key: value.detach().clone() for key, value in module.state_dict().items()}
+
+    # Minimal dimensions keep the synthetic checkpoint small.
+    tiny_transformer_config = dict(
+        in_channels=1,
+        out_channels=1,
+        num_attention_heads=1,
+        attention_head_dim=12,
+        num_layers=1,
+        mlp_ratio=1.0,
+        text_embed_dim=4,
+        adaln_lora_dim=3,
+        max_size=(1, 2, 2),
+        patch_size=(1, 1, 1),
+        rope_scale=(1.0, 1.0, 1.0),
+        concat_padding_mask=True,
+        extra_pos_embed_type=None,
+    )
+    monkeypatch.setattr(pipeline_anima, "_ANIMA_TRANSFORMER_CONFIG", tiny_transformer_config)
+
+    reference_transformer = NativeCosmosTransformer3DModel(**tiny_transformer_config)
+    reference_conditioner = NativeAnimaTextConditioner(
+        source_dim=4,
+        target_dim=4,
+        model_dim=4,
+        num_layers=1,
+        num_attention_heads=1,
+        target_vocab_size=8,
+        min_sequence_length=4,
+    )
+    transformer_state = snapshot(reference_transformer)
+    text_conditioner_state = snapshot(reference_conditioner)
+
+    # Both sub-models share one checkpoint, separated by a key prefix.
+    checkpoint_state = {f"transformer.{name}": tensor for name, tensor in transformer_state.items()}
+    checkpoint_state.update(
+        {f"text_conditioner.{name}": tensor for name, tensor in text_conditioner_state.items()}
+    )
+
+    checkpoint_path = tmp_path / "anima.safetensors"
+    save_file(checkpoint_state, str(checkpoint_path))
+
+    # __new__ skips __init__; only the attributes set here are provided.
+    pipeline = pipeline_anima.AnimaPipeline.__new__(pipeline_anima.AnimaPipeline)
+    pipeline.od_config = SimpleNamespace(model=str(checkpoint_path), dtype=torch.float32)
+    pipeline.device = torch.device("cpu")
+
+    def assert_loaded(loaded_transformer, loaded_text_conditioner):
+        comparisons = (
+            (loaded_transformer.state_dict(), transformer_state),
+            (loaded_text_conditioner.state_dict(), text_conditioner_state),
+        )
+        for actual_state, expected_state in comparisons:
+            for name, tensor in expected_state.items():
+                assert torch.equal(actual_state[name], tensor)
+
+    # First with an explicit in-memory state dict, then with no argument
+    # (presumably falling back to the file at od_config.model -- confirm).
+    for load_args in ((dict(checkpoint_state),), ()):
+        assert_loaded(*pipeline._load_native_denoiser_components(*load_args))
+
+
+def test_enrich_config_single_file_rejects_unknown_pipeline(tmp_path):
+    """An unresolvable pipeline class name makes enrich_config raise ValueError."""
+    checkpoint_file = tmp_path / "model.safetensors"
+    checkpoint_file.write_text("dummy")
+    od_config = OmniDiffusionConfig(
+        model=str(checkpoint_file),
+        diffusion_load_format="diffusers_single_file",
+        model_class_name="MissingPipeline",
+    )
+
+    error = None
+    try:
+        od_config.enrich_config()
+    except ValueError as exc:
+        error = exc
+
+    # Reaching this point without an error means the lookup wrongly succeeded.
+    if error is None:
+        raise AssertionError("Expected unknown single-file pipeline class to fail.")
+    assert "Could not find diffusers pipeline class MissingPipeline" in str(error)
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
+import importlib
 import os
 import random
 from collections.abc import Callable, Mapping
@@ -31,6 +32,43 @@
 
 logger = init_logger(__name__)
 
+_NATIVE_SINGLE_FILE_DIFFUSION_MODELS = {"AnimaPipeline"}
+
+
+def _diffusers_pipeline_module_name(model_class_name: str) -> str | None:
+    """Guess the vllm_omni model module that may define ``model_class_name``.
+
+    A trailing ``ModularPipeline`` or ``Pipeline`` suffix is stripped and the
+    remaining CamelCase stem is converted to snake_case, e.g.
+    ``AnimaModularPipeline`` -> ``vllm_omni.diffusion.models.anima``.
+
+    Args:
+        model_class_name: Pipeline class name to map to a module path.
+
+    Returns:
+        The dotted module path to try importing, or ``None`` when no usable
+        stem remains (the name is only a suffix, or the stem is not a valid
+        Python identifier and therefore cannot name a class).
+    """
+    base_name = model_class_name
+    # Only the first matching suffix is removed; the longer one is tried first
+    # so "FooModularPipeline" yields "Foo" rather than "FooModular".
+    for suffix in ("ModularPipeline", "Pipeline"):
+        if base_name.endswith(suffix):
+            base_name = base_name[: -len(suffix)]
+            break
+    # Reject empty stems and stems containing dots, whitespace, etc.: they can
+    # never be a class name and would otherwise be spliced into an import path.
+    if not base_name.isidentifier():
+        return None
+
+    snake_case = "".join(
+        f"_{char.lower()}" if char.isupper() and index > 0 else char.lower()
+        for index, char in enumerate(base_name)
+    )
+    return "vllm_omni.diffusion.models." + snake_case
+
+
+def _resolve_diffusers_pipeline_cls(model_class_name):
+    """Return the attribute named ``model_class_name`` from the ``diffusers`` namespace.
+
+    If the name is not present, the module suggested by
+    ``_diffusers_pipeline_module_name`` is imported and the lookup is retried
+    once.
+
+    Raises:
+        ValueError: If the name is still absent from ``diffusers``.
+        ModuleNotFoundError: If the import fails because some module other
+            than the candidate module itself is missing.
+    """
+    missing = object()
+
+    pipeline_cls = getattr(diffusers, model_class_name, missing)
+    if pipeline_cls is not missing:
+        return pipeline_cls
+
+    imported = False
+    module_name = _diffusers_pipeline_module_name(model_class_name)
+    if module_name is not None:
+        try:
+            importlib.import_module(module_name)
+        except ModuleNotFoundError as exc:
+            # A missing candidate module just means "not found"; a missing
+            # dependency inside it is a real error and must surface.
+            if exc.name != module_name:
+                raise
+        else:
+            imported = True
+
+    if imported:
+        pipeline_cls = getattr(diffusers, model_class_name, missing)
+        if pipeline_cls is not missing:
+            return pipeline_cls
+
+    raise ValueError(f"Could not find diffusers pipeline class {model_class_name} in diffusers namespace.")
+
 
 def parse_kv_cache_skip_selector(
     selector: str | list[int] | tuple[int, ...] | set[int] | None,
@@ -525,7 +563,8 @@ class OmniDiffusionConfig:
     custom_pipeline_args: dict[str, Any] | None = None
 
     # Diffusion model loading format
-    # "default", "custom_pipeline", "dummy", "diffusers" (HF diffusers adapter)
+    # "default", "custom_pipeline", "dummy", "diffusers" (HF diffusers adapter),
+    # or "diffusers_single_file" (HF diffusers adapter via from_single_file).
     diffusion_load_format: str = "default"
 
     # Diffusers adapter kwargs
@@ -777,10 +816,24 @@ def __post_init__(self):
         elif self.max_cpu_loras < 1:
             raise ValueError("max_cpu_loras must be >= 1 for diffusion LoRA")
 
-        if self.diffusion_load_format != "diffusers" and (self.diffusers_load_kwargs or self.diffusers_call_kwargs):
+        is_single_file = self.diffusion_load_format == "diffusers_single_file" or (
+            isinstance(self.model, str) and os.path.isfile(self.model)
+        )
+        if (
+            is_single_file
+            and self.model_class_name not in _NATIVE_SINGLE_FILE_DIFFUSION_MODELS
+            and self.diffusion_load_format in (None, "default", "diffusers")
+        ):
+            self.diffusion_load_format = "diffusers_single_file"
+
+        if (
+            self.diffusion_load_format not in ("diffusers", "diffusers_single_file")
+            and self.model_class_name not in _NATIVE_SINGLE_FILE_DIFFUSION_MODELS
+            and (self.diffusers_load_kwargs or self.diffusers_call_kwargs)
+        ):
             raise ValueError(
                 "diffusers_load_kwargs and diffusers_call_kwargs are only "
-                "valid together with diffusion_load_format=diffusers"
+                "valid together with diffusion_load_format=diffusers or diffusers_single_file"
             )
 
     def _propagate_quantization_from_tf_config(self, tf_config: "TransformerConfig") -> None:
@@ -875,6 +928,29 @@ def enrich_config(self) -> None:
         if self.model_class_name is None and self.diffusion_load_format == "diffusers":
             self.model_class_name = "DiffusersAdapterPipeline"
 
+        if (
+            self.diffusion_load_format == "diffusers_single_file"
+            or (isinstance(self.model, str) and os.path.isfile(self.model))
+        ) and self.diffusion_load_format in (
+            None,
+            "default",
+            "diffusers",
+            "diffusers_single_file",
+        ):
+            if self.model_class_name in _NATIVE_SINGLE_FILE_DIFFUSION_MODELS:
+                self.diffusion_load_format = "default"
+                self.set_tf_model_config(TransformerConfig())
+                return
+
+            if self.diffusion_load_format in (None, "default"):
+                self.diffusion_load_format = "diffusers_single_file"
+
+            if self.model_class_name is not None and self.model_class_name != "DiffusersAdapterPipeline":
+                self.diffusers_pipeline_cls = _resolve_diffusers_pipeline_cls(self.model_class_name)
+            self.model_class_name = "DiffusersAdapterPipeline"
+            self.set_tf_model_config(TransformerConfig())
+            return
+
         try:
             config_dict = get_hf_file_to_dict("model_index.json", self.model)
             if config_dict is not None:
 
@@ -53,7 +53,10 @@ class _RpcTask:
 
 
 def supports_multimodal_input(od_config: OmniDiffusionConfig) -> tuple[bool, bool]:
-    if od_config.diffusion_load_format == "diffusers" and (pipe_cls := od_config.diffusers_pipeline_cls) is not None:
+    if (
+        od_config.diffusion_load_format in ("diffusers", "diffusers_single_file")
+        and (pipe_cls := od_config.diffusers_pipeline_cls) is not None
+    ):
         signature = inspect.signature(pipe_cls.__call__)
         support_image_input = "image" in signature.parameters
         support_audio_input = (