fix: register SongGen pipeline and wire server-side voice conditioning

arnavnagzirkar · arnavnagzirkar · commit 4a8731c3a620 · 2026-06-08T15:01:16.000-07:00
Addresses the review on PR #4117 (issue #3388): - Register the 'songgen' model_type in the central pipeline_registry so the model can be resolved and served (it raised KeyError before). - Resolve request.ref_audio to a waveform in _build_songgen_params and pass it as ref_voice_array, the key the model reads; the old ref_voice_url was silently dropped, so server-side voice conditioning never worked. - Load the architecture named in the checkpoint config (Mixed vs DualTrack) and select bf16/fp16 on CUDA (fp32 on CPU), casting float inputs to the model dtype to avoid a generate() dtype mismatch. - Add tests: registry resolution, _build_songgen_params voice-conditioning unit tests, and offline/online e2e smoke tests. Signed-off-by: Arnav Nagzirkar <113314200+arnavnagzirkar@users.noreply.github.com>
diff --git a/tests/e2e/offline_inference/test_songgen_expansion.py b/tests/e2e/offline_inference/test_songgen_expansion.py
@@ -0,0 +1,111 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""E2E offline inference tests for the SongGen single-stage pipeline.
+
+SongGen turns lyrics plus a music-style description into a 16 kHz mono song in
+one auto-regressive pass (the 1.3B AR LM and the X-Codec decoder both run
+inside ``SongGenForGeneration``). These mirror the offline example in
+``examples/offline_inference/text_to_speech/songgen/end2end.py``.
+
+The model and its ``songgen`` package dependency are large, so these tests are
+gated behind the ``full_model`` / ``tts`` markers and only run in the model CI
+lane (the deploy config targets a single 80 GB GPU).
+"""
+
+from __future__ import annotations
+
+import pytest
+import torch
+from vllm import SamplingParams
+
+from tests.helpers.mark import hardware_test
+from tests.helpers.runtime import OmniRunner
+from tests.helpers.stage_config import get_deploy_config_path
+from vllm_omni import Omni
+
+MODEL_NAME = "LiuZH-19/SongGen_mixed_pro"  # checkpoint exercised by every test in this module
+STAGE_CONFIG = get_deploy_config_path("songgen.yaml")
+
+# (model, stage_configs_path) for the ``omni_runner`` indirect parametrize.
+_OMNI_RUNNER_PARAM = (
+    MODEL_NAME,
+    STAGE_CONFIG,
+)
+
+pytestmark = [  # module-level marks: applied to every test in this file
+    pytest.mark.full_model,
+    pytest.mark.tts,
+    pytest.mark.parametrize("omni_runner", [_OMNI_RUNNER_PARAM], indirect=True),
+]
+
+SAMPLE_RATE = 16000  # Hz; the 16 kHz output rate described in the module docstring
+
+DEFAULT_SAMPLING = SamplingParams(  # shared by both tests below
+    temperature=1.0,
+    top_p=1.0,
+    top_k=50,
+    max_tokens=4096,
+    seed=42,
+    detokenize=False,
+)
+
+
+def _build_request(lyrics: str, description: str = "a pop song", seed: int = 42) -> dict:
+    """Assemble one offline SongGen request (lyrics, style description, seed).
+
+    Every per-request value is wrapped in a one-element list under
+    ``additional_information``; the prompt is always the fixed assistant header.
+    """
+    extras = {"lyrics": [lyrics], "text_description": [description]}
+    extras["seed"] = [seed]
+    prompt = "<|im_start|>assistant\n"
+    return {"prompt": prompt, "additional_information": extras}
+
+
+def _collect_audio(omni: Omni, request: dict) -> tuple[torch.Tensor, int]:
+    """Generate once; return the first output's CPU waveform and sample rate."""
+    for stage_outputs in omni.generate(request, DEFAULT_SAMPLING):
+        if (req_output := stage_outputs.request_output) is None:
+            continue  # skip stage outputs that carry no request output
+        mm = req_output.outputs[0].multimodal_output
+        assert mm is not None, "Expected multimodal_output to be non-None"
+        audio, sr = mm.get("audio"), mm.get("sr")
+        assert audio is not None, "Expected 'audio' key in multimodal_output"
+        assert isinstance(audio, torch.Tensor), f"audio should be Tensor, got {type(audio)}"
+        # Fall back to SAMPLE_RATE when the output carries no "sr" entry.
+        return audio.cpu(), (SAMPLE_RATE if sr is None else int(sr.item()))
+    raise AssertionError("No stage outputs received")
+
+
+@pytest.mark.advanced_model
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+def test_songgen_text_to_song(omni_runner: OmniRunner) -> None:
+    """One lyrics + description request yields non-empty, non-silent audio at SAMPLE_RATE."""
+    lyrics = "Under the moonlight, we dance through the night."
+    audio, sr = _collect_audio(omni_runner.omni, _build_request(lyrics))
+
+    assert sr == SAMPLE_RATE, f"Expected sample_rate={SAMPLE_RATE}, got {sr}"
+    assert audio.numel() > 0, "Audio tensor should not be empty"
+    assert torch.any(audio != 0), "Audio should not be all-zeros (silence)"
+
+
+@pytest.mark.advanced_model
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+def test_songgen_batch(omni_runner: OmniRunner) -> None:
+    """Two requests submitted together each come back with non-empty audio."""
+    verses = ("First verse under a quiet sky.", "Second verse as the morning breaks.")
+    requests = [_build_request(verse) for verse in verses]
+
+    waveforms = []
+    # Single-stage model (num_stages=1), so one sampling param covers all requests.
+    for stage_outputs in omni_runner.omni.generate(requests, [DEFAULT_SAMPLING]):
+        req_output = stage_outputs.request_output
+        if req_output is None:
+            continue
+        mm = req_output.outputs[0].multimodal_output
+        assert mm is not None
+        waveforms.append(mm["audio"].cpu())
+
+    assert len(waveforms) == 2, f"Expected 2 outputs, got {len(waveforms)}"
+    for i, audio in enumerate(waveforms):
+        assert audio.numel() > 0, f"Audio {i} is empty"
diff --git a/tests/e2e/online_serving/test_songgen.py b/tests/e2e/online_serving/test_songgen.py
@@ -0,0 +1,69 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E online tests for SongGen via the /v1/audio/speech endpoint.
+
+SongGen maps the OpenAI speech contract onto text-to-song generation:
+  - ``input``        -> song lyrics (required)
+  - ``instructions`` -> music style / genre description (optional)
+  - ``ref_audio``    -> reference voice for timbre conditioning (optional)
+
+The server resolves ``ref_audio`` to a waveform and forwards it to the model as
+``ref_voice_array`` (the key the model reads); a minimal non-streaming WAV case
+is enough to exercise the full serving path end to end. These tests are gated
+behind ``full_model`` / ``tts`` and run only in the model CI lane.
+"""
+
+import os
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+import pytest
+
+from tests.helpers.mark import hardware_test
+from tests.helpers.runtime import OmniServerParams
+from tests.helpers.stage_config import get_deploy_config_path
+
+pytestmark = [pytest.mark.full_model, pytest.mark.tts]
+
+MODEL = "LiuZH-19/SongGen_mixed_pro"
+LYRICS = "Under the moonlight, we dance through the night, stars above shining bright."  # sent as ``input``
+DESCRIPTION = "dreamy pop ballad with piano and strings"  # sent as ``instructions``
+
+# A 16 kHz song clip is far larger than this floor; the check only guards
+# against an empty or truncated response, not audio quality.
+_MIN_AUDIO_BYTES = 20_000
+
+songgen_server_params = [  # fed to the indirect ``omni_server`` parametrize below
+    pytest.param(
+        OmniServerParams(
+            model=MODEL,
+            stage_config_path=get_deploy_config_path("songgen.yaml"),
+            server_args=["--disable-log-stats"],
+        ),
+        id="songgen",
+    )
+]
+
+
+@hardware_test(res={"cuda": "H100"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", songgen_server_params, indirect=True)
+def test_text_to_song_001(omni_server, openai_client) -> None:
+    """
+    Text-to-song via /v1/audio/speech (lyrics + style description).
+    Deploy Setting: default yaml
+    Input Modal: text (lyrics) + instructions (style description)
+    Output Modal: audio (16 kHz, WAV)
+    Input Setting: stream=False
+    Datasets: single request
+    """
+    # Lyrics go in "input" and the style description in "instructions".
+    song_fields = {
+        "model": omni_server.model,
+        "input": LYRICS,
+        "instructions": DESCRIPTION,
+    }
+    # Non-streaming WAV response, with the module's minimum-size floor.
+    audio_opts = {"stream": False, "response_format": "wav", "min_audio_bytes": _MIN_AUDIO_BYTES}
+
+    openai_client.send_audio_speech_request({**song_fields, **audio_opts})
diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py
@@ -2926,3 +2926,58 @@ def test_diffusion_instance_shutdown_safe(self, mocker: MockerFixture):
         server = OmniOpenAIServingSpeech.for_diffusion(diffusion_engine=mocker.MagicMock(), model_name="test-model")
         assert server._tts_executor is None
         server.shutdown()  # Should not raise
+
+
+class TestSongGenParams:
+    """Unit tests for ``_build_songgen_params`` (issue #3388 review).
+
+    These guard two regressions in the SongGen serving path:
+      * the model must receive the resolved reference waveform under the
+        ``ref_voice_array`` key it reads, not an ignored ``ref_voice_url``; and
+      * the no-reference path must not emit any voice-conditioning key.
+
+    A lightweight stub ``self`` (only ``_resolve_ref_audio`` is touched) keeps
+    the test on the CPU/core CI lane with no GPU, weights, or network.
+    """
+
+    @staticmethod
+    def _build(request, resolved=None):
+        calls: list[str] = []
+
+        async def _fake_resolve(ref_audio_str):
+            calls.append(ref_audio_str)
+            # Build the default here: a list literal in the signature is shared by all calls.
+            return ([0.0, 0.25, -0.25, 0.5], 24000) if resolved is None else resolved
+
+        stub = SimpleNamespace(_resolve_ref_audio=_fake_resolve)
+        params = asyncio.run(OmniOpenAIServingSpeech._build_songgen_params(stub, request))
+        return params, calls
+
+    def test_without_ref_audio_emits_no_voice_key(self):
+        request = OpenAICreateSpeechRequest(
+            input="la la la under the moonlight",
+            instructions="dreamy pop ballad with piano",
+        )
+        params, calls = self._build(request)
+        assert params["lyrics"] == ["la la la under the moonlight"]
+        assert params["text_description"] == ["dreamy pop ballad with piano"]
+        assert "ref_voice_array" not in params
+        assert "ref_voice_url" not in params
+        assert calls == []
+
+    def test_ref_audio_resolved_to_array_not_url(self):
+        request = OpenAICreateSpeechRequest(
+            input="sing me a song",
+            instructions="a pop song",
+            ref_audio="data:audio/wav;base64,AAAA",
+        )
+        params, calls = self._build(request)
+        # The model consumes ref_voice_array=[[wav, sr]]; ref_voice_url is dead.
+        assert "ref_voice_url" not in params
+        assert params["ref_voice_array"] == [[[0.0, 0.25, -0.25, 0.5], 24000]]
+        assert calls == ["data:audio/wav;base64,AAAA"]
+
+    def test_missing_instructions_default_empty_description(self):
+        request = OpenAICreateSpeechRequest(input="just lyrics")
+        params, _ = self._build(request)
+        assert params["text_description"] == [""]
diff --git a/tests/helpers/runtime.py b/tests/helpers/runtime.py
@@ -1767,7 +1767,7 @@ def send_audio_speech_request(self, request_config: dict[str, Any], request_num:
         # Qwen3-TTS custom fields, forwarded via extra_body.
         extra_body: dict[str, Any] = {}
         # Keep this list aligned with vllm_omni.entrypoints.openai.protocol.audio params.
-        for key in ("task_type", "ref_text", "ref_audio", "language", "max_new_tokens", "seed"):
+        for key in ("task_type", "ref_text", "ref_audio", "instructions", "language", "max_new_tokens", "seed"):
             if key in request_config:
                 extra_body[key] = request_config[key]
 
diff --git a/tests/test_config_factory.py b/tests/test_config_factory.py
@@ -913,6 +913,18 @@ def test_registry_loads_pipeline_on_getitem(self):
         assert pipeline.model_type == "qwen3_omni_moe"
         assert len(pipeline.stages) == 3  # thinker + talker + code2wav
 
+    def test_registry_has_songgen(self):
+        """The ``songgen`` model_type resolves to a one-stage audio pipeline.
+
+        Without the registry entry the lookup raises KeyError (issue #3388 review).
+        """
+        model_type = "songgen"
+        assert model_type in _PIPELINE_REGISTRY
+        songgen_pipeline = _PIPELINE_REGISTRY[model_type]
+        assert songgen_pipeline.model_type == model_type
+        assert len(songgen_pipeline.stages) == 1  # single-stage AR generator
+        assert songgen_pipeline.stages[0].final_output_type == "audio"
+
     def test_registry_returns_none_for_unknown(self):
         """Unknown model_types aren't found; ``get()`` returns None."""
         assert "definitely_not_a_real_model" not in _PIPELINE_REGISTRY
diff --git a/vllm_omni/config/pipeline_registry.py b/vllm_omni/config/pipeline_registry.py
@@ -141,6 +141,11 @@
         "vllm_omni.model_executor.models.moss_tts.pipeline",
         "MOSS_TTS_REALTIME_PIPELINE",
     ),
+    # SongGen (text-to-song): single-stage AR generator, MOSS-TTS-Nano lineage.
+    "songgen": (
+        "vllm_omni.model_executor.models.songgen.pipeline",
+        "SONGGEN_PIPELINE",
+    ),
     "minicpmo_4_5": (
         "vllm_omni.model_executor.models.minicpmo_4_5.pipeline",
         "MINICPMO_4_5_PIPELINE",
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -1890,14 +1890,24 @@ async def _build_songgen_params(self, request: OpenAICreateSpeechRequest) -> dic
         Returns a dict with keys expected by SongGenForGeneration._create_stream_gen():
           lyrics            : list[str]  - song lyrics (from request.input)
           text_description  : list[str]  - style / genre description
-          ref_voice_url     : list[str]  - reference voice audio URL (if provided)
+          ref_voice_array   : list[[list[float], int]]  - resolved reference
+                              voice waveform + sample rate (only when ref_audio
+                              is provided)
+
+        The model reads ``ref_voice_array`` (a resolved ``[wav_samples, sr]``
+        pair that it stages to a temp WAV), not a URL. We resolve
+        ``request.ref_audio`` to a waveform here so the server path matches the
+        offline example (``ref_voice_array=[[wav, sr]]``) and the MOSS-TTS
+        ``prompt_audio_array`` convention; emitting ``ref_voice_url`` instead
+        would be silently dropped by the model.
         """
         params: dict = {
             "lyrics": [request.input],
             "text_description": [request.instructions or ""],
         }
         if request.ref_audio is not None:
-            params["ref_voice_url"] = [request.ref_audio]
+            wav_list, sr = await self._resolve_ref_audio(request.ref_audio)
+            params["ref_voice_array"] = [[wav_list, sr]]
         return params
 
     async def _build_higgs_audio_v2_params(self, request: OpenAICreateSpeechRequest):
diff --git a/vllm_omni/model_executor/models/songgen/modeling_songgen.py b/vllm_omni/model_executor/models/songgen/modeling_songgen.py
@@ -113,20 +113,47 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
             self._device = device
 
-            logger.info("Loading SongGen from %s on %s", self.model_path, device)
-
+            # Match the MOSS-TTS convention: bf16 on bf16-capable CUDA, fp16 on
+            # older CUDA, fp32 on CPU. The deploy config targets H100/A100, so
+            # the bf16 path is the common one in practice.
+            if device.type == "cuda" and torch.cuda.is_bf16_supported():
+                model_dtype = torch.bfloat16
+            elif device.type == "cuda":
+                model_dtype = torch.float16
+            else:
+                model_dtype = torch.float32
+
+            # SongGen ships two architectures (Mixed and DualTrack). Both are
+            # served by this wrapper, but they are distinct upstream classes, so
+            # load the one named in the checkpoint config instead of always
+            # loading Mixed (which would load the wrong weights for a DualTrack
+            # checkpoint).
+            architectures = list(getattr(self.config, "architectures", None) or [])
+            want_dualtrack = any("DualTrack" in arch for arch in architectures)
             try:
-                from songgen import SongGenMixedForConditionalGeneration, SongGenProcessor
+                from songgen import SongGenProcessor
+
+                if want_dualtrack:
+                    from songgen import SongGenDualTrackForConditionalGeneration as _SongGenModelClass
+                else:
+                    from songgen import SongGenMixedForConditionalGeneration as _SongGenModelClass
             except ImportError as exc:
                 raise ImportError(
                     "SongGen requires the 'songgen' package. "
                     "Install it from: pip install git+https://github.com/LiuZH-19/SongGen.git"
                 ) from exc
 
-            model = SongGenMixedForConditionalGeneration.from_pretrained(
+            logger.info(
+                "Loading SongGen (%s) from %s on %s (dtype=%s)",
+                _SongGenModelClass.__name__,
+                self.model_path,
+                device,
+                model_dtype,
+            )
+            model = _SongGenModelClass.from_pretrained(
                 self.model_path,
                 attn_implementation="sdpa",
-                torch_dtype=torch.float32,
+                torch_dtype=model_dtype,
             )
             model.to(device=device)
             model.eval()
@@ -200,8 +227,16 @@ def _create_stream_gen(self, info: dict[str, Any]):
                 separate=False,
                 return_tensors="pt",
             )
+            # Move every tensor to the model device. Floating-point inputs (e.g.
+            # reference-voice features) are also cast to the model dtype so the
+            # bf16/fp16 weight path does not hit a dtype mismatch inside
+            # generate(); integer token ids keep their dtype.
+            model_dtype = next(self._model.parameters()).dtype
             model_inputs = {
-                k: v.to(self._device) if isinstance(v, torch.Tensor) else v for k, v in model_inputs.items()
+                k: (v.to(device=self._device, dtype=model_dtype) if v.is_floating_point() else v.to(self._device))
+                if isinstance(v, torch.Tensor)
+                else v
+                for k, v in model_inputs.items()
             }
 
             output = self._model.generate(**model_inputs, do_sample=_DEFAULT_DO_SAMPLE)