|
| 1 | +# SPDX-License-Identifier: Apache-2.0 |
| 2 | +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project |
| 3 | +"""E2E offline inference tests for the SongGen single-stage pipeline. |
| 4 | +
|
| 5 | +SongGen turns lyrics plus a music-style description into a 16 kHz mono song in |
| 6 | +one auto-regressive pass (the 1.3B AR LM and the X-Codec decoder both run |
| 7 | +inside ``SongGenForGeneration``). These mirror the offline example in |
| 8 | +``examples/offline_inference/text_to_speech/songgen/end2end.py``. |
| 9 | +
|
| 10 | +The model and its ``songgen`` package dependency are large, so these tests are |
| 11 | +gated behind the ``full_model`` / ``tts`` markers and only run in the model CI |
| 12 | +lane (the deploy config targets a single 80 GB GPU). |
| 13 | +""" |
| 14 | + |
| 15 | +from __future__ import annotations |
| 16 | + |
| 17 | +import pytest |
| 18 | +import torch |
| 19 | +from vllm import SamplingParams |
| 20 | + |
| 21 | +from tests.helpers.mark import hardware_test |
| 22 | +from tests.helpers.runtime import OmniRunner |
| 23 | +from tests.helpers.stage_config import get_deploy_config_path |
| 24 | +from vllm_omni import Omni |
| 25 | + |
# Model identifier and deploy config shared by every test in this module.
MODEL_NAME = "LiuZH-19/SongGen_mixed_pro"
STAGE_CONFIG = get_deploy_config_path("songgen.yaml")

# Sample rate (Hz) the tests expect back; also the fallback used when an
# output carries no "sr" entry.
SAMPLE_RATE = 16000

# (model, stage_configs_path) for the ``omni_runner`` indirect parametrize.
_OMNI_RUNNER_PARAM = (MODEL_NAME, STAGE_CONFIG)

# Applied to every test in the module: gating markers plus the indirect
# parametrization that feeds ``_OMNI_RUNNER_PARAM`` to the ``omni_runner`` fixture.
pytestmark = [
    pytest.mark.full_model,
    pytest.mark.tts,
    pytest.mark.parametrize("omni_runner", [_OMNI_RUNNER_PARAM], indirect=True),
]

# Sampling settings reused by all requests; the seed is fixed at 42 and
# detokenization is turned off.
DEFAULT_SAMPLING = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    top_k=50,
    max_tokens=4096,
    seed=42,
    detokenize=False,
)
| 51 | + |
| 52 | + |
| 53 | +def _build_request(lyrics: str, description: str = "a pop song", seed: int = 42) -> dict: |
| 54 | + """Build a SongGen offline request (lyrics + style description).""" |
| 55 | + return { |
| 56 | + "prompt": "<|im_start|>assistant\n", |
| 57 | + "additional_information": { |
| 58 | + "lyrics": [lyrics], |
| 59 | + "text_description": [description], |
| 60 | + "seed": [seed], |
| 61 | + }, |
| 62 | + } |
| 63 | + |
| 64 | + |
def _collect_audio(omni: Omni, request: dict) -> tuple[torch.Tensor, int]:
    """Run a single request and return ``(waveform, sample_rate)``.

    Iterates ``omni.generate`` and returns as soon as an item with a non-None
    ``request_output`` is seen; later items are not consumed.

    Args:
        omni: Object whose ``generate`` method is invoked with the request.
        request: Request dict to submit (e.g. built by ``_build_request``).

    Returns:
        The ``"audio"`` tensor moved to CPU, and the sample rate as an ``int``.
        When the output has no ``"sr"`` entry the rate falls back to
        ``SAMPLE_RATE``.

    Raises:
        AssertionError: If the multimodal output or its ``"audio"`` entry is
            missing, if the audio is not a ``torch.Tensor``, or if no item
            with a ``request_output`` was produced.
    """
    # Pass the sampling params as a one-entry list (one per stage), the same
    # call form ``test_songgen_batch`` uses.
    # NOTE(review): assumes ``generate`` takes the list form for a single
    # request as it does for a batch; confirm against the Omni API.
    for stage_outputs in omni.generate(request, [DEFAULT_SAMPLING]):
        req_output = stage_outputs.request_output
        if req_output is None:
            continue

        mm = req_output.outputs[0].multimodal_output
        assert mm is not None, "Expected multimodal_output to be non-None"

        audio = mm.get("audio")
        assert audio is not None, "Expected 'audio' key in multimodal_output"
        assert isinstance(audio, torch.Tensor), f"audio should be Tensor, got {type(audio)}"

        sr = mm.get("sr")
        if sr is None:
            sample_rate = SAMPLE_RATE
        elif hasattr(sr, "item"):
            # Tensor-like scalar: unwrap to a Python number first.
            sample_rate = int(sr.item())
        else:
            # Plain Python number; previously this raised AttributeError.
            sample_rate = int(sr)
        return audio.cpu(), sample_rate

    raise AssertionError("No stage outputs received")
| 78 | + |
| 79 | + |
@pytest.mark.advanced_model
@hardware_test(res={"cuda": "H100"}, num_cards=1)
def test_songgen_text_to_song(omni_runner: OmniRunner) -> None:
    """A single lyrics request yields non-empty, non-silent audio at ``SAMPLE_RATE``."""
    request = _build_request("Under the moonlight, we dance through the night.")
    waveform, sample_rate = _collect_audio(omni_runner.omni, request)

    assert sample_rate == SAMPLE_RATE, f"Expected sample_rate={SAMPLE_RATE}, got {sample_rate}"
    assert waveform.numel() > 0, "Audio tensor should not be empty"

    # A waveform made up entirely of zeros means the model produced silence.
    is_silent = bool(torch.all(waveform == 0))
    assert not is_silent, "Audio should not be all-zeros (silence)"
| 90 | + |
| 91 | + |
@pytest.mark.advanced_model
@hardware_test(res={"cuda": "H100"}, num_cards=1)
def test_songgen_batch(omni_runner: OmniRunner) -> None:
    """Two requests submitted in one ``generate`` call each yield non-empty audio."""
    lyrics_batch = (
        "First verse under a quiet sky.",
        "Second verse as the morning breaks.",
    )
    requests = [_build_request(text) for text in lyrics_batch]

    waveforms = []
    # Single-stage model (num_stages=1): one sampling param for all requests.
    for stage_outputs in omni_runner.omni.generate(requests, [DEFAULT_SAMPLING]):
        req_output = stage_outputs.request_output
        if req_output is None:
            continue
        mm = req_output.outputs[0].multimodal_output
        assert mm is not None
        waveforms.append(mm["audio"].cpu())

    assert len(waveforms) == 2, f"Expected 2 outputs, got {len(waveforms)}"
    for index, waveform in enumerate(waveforms):
        assert waveform.numel() > 0, f"Audio {index} is empty"
0 commit comments