PR Test ROCm 7.2 (AMD) #117
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Display name of this workflow.
name: PR Test ROCm 7.2 (AMD)

# Dynamic run-name so /rerun-stage commands can look the run up by URL.
#   - stage targeted and pr_head_sha given (fork PRs): "[stage-name] sha"
#   - stage targeted, no pr_head_sha (non-fork):       "[stage-name]"
#   - no stage targeted (normal runs):                 '' (default run name)
run-name: ${{ (inputs.target_stage || inputs.target_stage_select) && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage || inputs.target_stage_select)) || '' }}
# Triggers: two crons, path-filtered pull requests, manual dispatch, and
# reuse from other workflows (workflow_call).
on:
  schedule:
    # Daily run.
    - cron: '30 17 * * *'
    # Every 6 hours (UTC): target dsv4 accuracy jobs only
    - cron: '0 */6 * * *'

  # Push trigger is intentionally disabled; kept for reference.
  # push:
  #   branches: [ main ]
  #   paths:
  #     - "python/**"
  #     - "scripts/ci/**"
  #     - "test/**"
  #     - "sgl-kernel/**"
  #     - ".github/workflows/pr-test-amd-rocm720.yml"
  #     - "docker/rocm.Dockerfile"

  pull_request:
    paths:
      - 'python/**'
      - 'scripts/ci/**'
      - 'test/**'
      - 'sgl-kernel/**'
      - '.github/workflows/pr-test-amd-rocm720.yml'
      - 'docker/rocm.Dockerfile'

  workflow_dispatch:
    inputs:
      # Dropdown of known stage names; the empty entry means "no stage selected".
      target_stage_select:
        description: 'Select a stage to run from dropdown (leave empty for auto-detect)'
        type: choice
        required: false
        default: ''
        options:
          - ''
          - sgl-kernel-unit-test-amd-rocm720
          - sgl-kernel-unit-test-2-gpu-amd-rocm720
          - stage-a-test-1-gpu-small-amd-rocm720
          - jit-kernel-unit-test-amd-rocm720
          - stage-b-test-1-gpu-small-amd-rocm720
          - stage-b-test-1-gpu-small-amd-nondeterministic-rocm720
          - stage-b-test-1-gpu-small-amd-mi35x-rocm720
          - stage-b-test-1-gpu-large-amd-rocm720
          - stage-b-test-2-gpu-large-amd-rocm720
          - multimodal-gen-test-1-gpu-amd-rocm720
          - multimodal-gen-test-2-gpu-amd-rocm720
          - stage-c-test-large-8-gpu-amd-rocm720
          - stage-c-test-large-8-gpu-amd-mi35x-rocm720
          - stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720
          - stage-c-test-4-gpu-amd-rocm720
          - dsv4-flash-fp4-fp8-amd-rocm720
          - dsv4-pro-fp4-amd-rocm720
      # Free-text alternative to the dropdown; job conditions read
      # `inputs.target_stage || inputs.target_stage_select`, so this one wins.
      target_stage:
        description: 'Or type comma-separated stage names (overrides dropdown if non-empty)'
        type: string
        required: false
        default: ''
      pr_head_sha:
        description: 'PR head SHA to checkout (for /rerun-stage on fork PRs)'
        type: string
        required: false
        default: ''
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        type: string
        required: false
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        type: boolean
        required: false
        default: true
      run_all_tests:
        description: 'Run all tests (skip change detection). Ignored when target_stage / target_stage_select is set.'
        type: boolean
        required: false
        default: false

  workflow_call:
    inputs:
      ref:
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        type: string
        required: false
        default: ''
      run_all_tests:
        description: 'Run all tests (for releasing or testing purpose)'
        type: boolean
        required: false
        default: false
      aiter_ref:
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        type: string
        required: false
        default: ''
      continue_on_error:
        description: 'Continue on error (do not fail the workflow on test failures)'
        type: boolean
        required: false
        default: true
# Same scopes as pr-test.yml. The chained extra suite
# (call-pr-test-amd-extra-rocm720 -> pr-test-amd-extra.yml) declares
# actions: write / issues: read / pull-requests: read, and a called reusable
# workflow may only use scopes its caller already holds. Without this block
# the call fails workflow validation
# ("requesting actions: write... but only allowed ...none").
permissions:
  actions: write
  contents: read
  issues: read
  pull-requests: read
# Workflow-wide environment: the optional AITER commit override taken from the
# aiter_ref input, plus the AMD Docker Hub credentials read from secrets.
env:
  AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}
  DOCKERHUB_AMD_USERNAME: ${{ secrets.DOCKERHUB_AMD_USERNAME }}
  DOCKERHUB_AMD_TOKEN: ${{ secrets.DOCKERHUB_AMD_TOKEN }}
concurrency:
  # Group key, first non-empty operand wins:
  #   1. 'full-<run_id>' (unique per run) for run_all_tests, manual dispatch and
  #      scheduled runs. run_all_tests (not github.event_name) is used to detect
  #      workflow_call, because github.event_name inherits from the caller.
  #      Scheduled runs are included because both crons fire on the same ref:
  #      with a shared github.ref group, the '0 */6 * * *' run starting at 18:00
  #      would cancel the '30 17 * * *' daily run that is still in progress.
  #   2. inputs.pr_head_sha, then inputs.ref, then github.ref for everything else
  #      (e.g. pull_request), so newer runs supersede older ones on the same ref.
  group: pr-test-amd-rocm720-${{ (inputs.run_all_tests || github.event_name == 'workflow_dispatch' || github.event_name == 'schedule') && format('full-{0}', github.run_id) || inputs.pr_head_sha || inputs.ref || github.ref }}
  # Only ref-keyed runs cancel their predecessors; runs that received a unique
  # group above never cancel anything.
  cancel-in-progress: ${{ !inputs.run_all_tests && github.event_name != 'workflow_call' && github.event_name != 'workflow_dispatch' && github.event_name != 'schedule' }}
| jobs: | |
# Gate job delegating to pr-gate.yml.
#   - Runs on PRs (pr-gate.yml enforces the run-ci label / blocks drafts), on
#     the daily '30 17 * * *' cron, on workflow_dispatch and on workflow_call.
#   - Skipped only on the 6h ('0 */6 * * *') cron. That skip cascades to
#     check-changes and every existing test job, so the 6h cron runs the two
#     dsv4 jobs only.
#   - On PRs the existing jobs are additionally kept off by the
#     change-detection guard in check-changes, so only the dsv4 jobs run
#     there -- and only when this gate passes (they `needs` it).
call-gate:
  if: ${{ github.event.schedule != '0 */6 * * *' }}
  uses: ./.github/workflows/pr-gate.yml
  secrets: inherit
# Decides which test areas are active for this run and whether test suites
# should be invoked with --continue-on-error.
check-changes:
  needs: call-gate
  runs-on: ubuntu-latest
  outputs:
    # Each area flag is the paths-filter result when the filter step ran,
    # otherwise it falls back to the run_all_tests value from run-mode.
    main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
    jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
    continue_on_error: ${{ steps.set-continue-on-error.outputs.continue_on_error }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Determine run mode
      id: run-mode
      run: |
        # Full-suite mode is selected by the run_all_tests input or by a scheduled run.
        # github.event_name is inherited from the caller, so it cannot be used to
        # detect workflow_call; callers request the full suite via run_all_tests.
        # Any scheduled run that reaches check-changes is the daily '30 17 * * *' cron
        # (the 6h '0 */6 * * *' cron skips call-gate -> check-changes), so the daily run
        # executes the full suite (existing jobs + dsv4), like pr-test-amd.yml.
        if [[ "${{ inputs.run_all_tests }}" == "true" || "${{ github.event_name }}" == "schedule" ]]; then
          echo "run_all_tests=true" >> $GITHUB_OUTPUT
          echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }}, event=${{ github.event_name }})"
        else
          echo "run_all_tests=false" >> $GITHUB_OUTPUT
          echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
        fi

    - name: Set continue-on-error for schedule/full runs
      id: set-continue-on-error
      run: |
        # Same policy as pr-test-amd.yml. continue-on-error is enabled when any of:
        #   - run-mode resolved to run_all_tests (dispatch checkbox, workflow_call
        #     input, or a scheduled run),
        #   - inputs.continue_on_error is true,
        #   - the event is a schedule (explicit belt-and-suspenders guard).
        if [[ "${{ steps.run-mode.outputs.run_all_tests }}" == "true" || "${{ inputs.continue_on_error }}" == "true" || "${{ github.event_name }}" == "schedule" ]]; then
          echo "continue_on_error=true" >> $GITHUB_OUTPUT
          echo "Continue-on-error: ENABLED (run_all_tests=${{ steps.run-mode.outputs.run_all_tests }}, input=${{ inputs.continue_on_error }}, event=${{ github.event_name }})"
        else
          echo "continue_on_error=false" >> $GITHUB_OUTPUT
          echo "Continue-on-error: DISABLED"
        fi

    - name: Detect file changes
      id: filter
      # Skipped in full-suite mode (everything runs anyway) and on pull_request:
      # on PRs only the dsv4 jobs run, so change detection is skipped there to keep
      # the per-stage outputs 'false' and the existing test jobs gated off.
      if: steps.run-mode.outputs.run_all_tests != 'true' && github.event_name != 'pull_request'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          main_package:
            - "python/sglang/!(multimodal_gen)/**/!(*.md)"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
            - "scripts/ci/amd/*"
            - "scripts/ci/utils/*"
            - "test/**/!(*.md)"
            - ".github/workflows/pr-test-amd-rocm720.yml"
          sgl_kernel:
            - "sgl-kernel/**/!(*.md|THIRDPARTYNOTICES.txt|LICENSE)"
            - ".github/workflows/pr-test-amd-rocm720.yml"
          jit_kernel:
            - "python/sglang/jit_kernel/**"
            - "test/registered/jit/**"
            - ".github/workflows/pr-test-amd-rocm720.yml"
          multimodal_gen:
            - "python/sglang/multimodal_gen/**/!(*.md|*.ipynb)"
            - "python/sglang/cli/**"
            - "python/sglang/srt/observability/**"
            - "python/sglang/jit_kernel/diffusion/**"
            - "test/registered/jit/diffusion/**"
            - "test/registered/jit/benchmark/diffusion/**"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
# =============================================== extra (scheduled) ====================================================
# ROCm 7.2 mirror of pr-test-amd.yml's `call-pr-test-amd-extra`: chain the
# label-gated AMD extra tier into this workflow's daily schedule, but in a
# ROCm 7.2 container (`rocm_version: rocm720`). On the daily `schedule` (and
# run_all_tests dispatch) the extra suite runs on `main` without the
# `run-ci-extra` label (pr-gate.yml only enforces labels on pull_request
# events). Targeted dispatches (target_stage set) are excluded. Not wired
# into any finish aggregator so the base rocm720 run never depends on it.
call-pr-test-amd-extra-rocm720:
  # This job has no `needs`, so the call-gate skip on the 6h cron does not
  # cascade to it. The '0 */6 * * *' cron is therefore excluded explicitly:
  # that cron is reserved for the dsv4 jobs and must not start the extra suite.
  if: |
    (
      (github.event_name == 'schedule' && github.event.schedule != '0 */6 * * *') ||
      inputs.run_all_tests == true
    ) &&
    !(inputs.target_stage || inputs.target_stage_select)
  uses: ./.github/workflows/pr-test-amd-extra.yml
  with:
    # Empty ref when neither input is set.
    ref: ${{ inputs.pr_head_sha || inputs.ref || '' }}
    rocm_version: rocm720
    aiter_ref: ${{ inputs.aiter_ref }}
    continue_on_error: true
  secrets: inherit
# =============================================== sgl-kernel ====================================================
# Single-GPU sgl-kernel pytest files, run inside the ROCm 7.2 CI container.
sgl-kernel-unit-test-amd-rocm720:
  needs: check-changes
  # Runs when this stage is named in target_stage / target_stage_select, or --
  # with no stage targeted -- when check-changes reports sgl_kernel == 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      # One pytest invocation per file, executed in the ci_sglang container.
      # test_eagle_utils.py lives under tests/speculative.
      run: |
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
        docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py
        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py
# Two-GPU sgl-kernel suite, driven through run_suite.py in the ROCm 7.2 container.
sgl-kernel-unit-test-2-gpu-amd-rocm720:
  needs: check-changes
  # Runs when this stage is named in target_stage / target_stage_select, or --
  # with no stage targeted -- when check-changes reports sgl_kernel == 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-2gpu-sglang
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite sgl-kernel-unit-test-2-gpu-amd
# =============================================== primary ====================================================
# Stage A: small single-GPU suite in the ROCm 7.2 container.
stage-a-test-1-gpu-small-amd-rocm720:
  needs: check-changes
  # Runs when this stage is explicitly targeted, or -- with no stage targeted
  # and nothing failed/cancelled -- when check-changes flags main_package or
  # sgl_kernel.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-gpu-small-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      # --continue-on-error is appended only when check-changes enabled it.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-gpu-small-amd ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# JIT kernel unit tests on a single GPU in the ROCm 7.2 container.
jit-kernel-unit-test-amd-rocm720:
  needs: [check-changes]
  # Runs when this stage is explicitly targeted, or -- with no stage targeted --
  # when check-changes reports jit_kernel == 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        needs.check-changes.outputs.jit_kernel == 'true'
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run JIT kernel unit tests
      timeout-minutes: 30
      # Use the continue_on_error value computed by check-changes, like the
      # other suites in this workflow. The raw inputs.continue_on_error is
      # empty on schedule events, which would drop the flag on the daily run
      # even though check-changes enables it there.
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite jit-kernel-unit-test-amd ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: small single-GPU suite, split across 14 matrix partitions.
stage-b-test-1-gpu-small-amd-rocm720:
  needs: check-changes
  # Runs when this stage is explicitly targeted, or -- with no stage targeted
  # and nothing failed/cancelled -- when check-changes flags main_package or
  # sgl_kernel.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
      # One matrix entry per partition id; must stay in step with
      # --auto-partition-size 14 in the run command below.
      part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: the "nondeterministic" small single-GPU suite (unpartitioned).
stage-b-test-1-gpu-small-amd-nondeterministic-rocm720:
  needs: check-changes
  # Runs when this stage is explicitly targeted, or -- with no stage targeted
  # and nothing failed/cancelled -- when check-changes flags main_package or
  # sgl_kernel.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-nondeterministic-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 45
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-nondeterministic --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: small single-GPU suite on the linux-mi35x-gpu-1 runner.
stage-b-test-1-gpu-small-amd-mi35x-rocm720:
  needs: check-changes
  # Runs when this stage is explicitly targeted, or -- with no stage targeted
  # and nothing failed/cancelled -- when check-changes flags main_package or
  # sgl_kernel.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-mi35x-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi35x-gpu-1
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-mi35x ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: large single-GPU suite, split across 2 matrix partitions.
stage-b-test-1-gpu-large-amd-rocm720:
  needs: check-changes
  # Runs when this stage is explicitly targeted, or -- with no stage targeted
  # and nothing failed/cancelled -- when check-changes flags main_package or
  # sgl_kernel.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-large-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-1gpu-sglang
      # Partition ids; must stay in step with --auto-partition-size 2 below.
      part: [0, 1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 45
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: large two-GPU suite, split across 2 matrix partitions.
stage-b-test-2-gpu-large-amd-rocm720:
  needs: check-changes
  # Runs when this stage is explicitly targeted, or -- with no stage targeted
  # and nothing failed/cancelled -- when check-changes flags main_package or
  # sgl_kernel.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-2-gpu-large-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-2gpu-sglang
      # Partition ids; must stay in step with --auto-partition-size 2 below.
      part: [0, 1]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: |
        bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 45
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-2-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Single-GPU diffusion (multimodal_gen) server tests in the ROCm 7.2 container,
# split across 4 matrix partitions.
multimodal-gen-test-1-gpu-amd-rocm720:
  needs: [check-changes]
  # Runs when this stage is explicitly targeted, or -- with no stage targeted
  # and nothing failed/cancelled -- when check-changes flags main_package or
  # sgl_kernel.
  # NOTE(review): check-changes also exposes a `multimodal_gen` output that this
  # condition does not read -- confirm whether gating on it was intended.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-1gpu-sglang]
      # Partition ids; must stay in step with --total-partitions 4 below.
      part: [0, 1, 2, 3]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # NOTE(review): the artifact pattern names a CUDA wheel although this is a
    # ROCm job -- confirm that a matching artifact is ever produced for this run.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
        docker exec ci_sglang pip install amdsmi

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        # This directory persists across container restarts on the self-hosted runner
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub
        # Clear pre-built AITER kernels from Docker image to avoid segfaults
        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
        # The rm runs through `sh -c` so the *.so glob is expanded INSIDE the
        # container. `docker exec` starts no shell by itself, so an unquoted glob
        # would be handled by the runner's shell against host paths and rm would
        # receive a literal '*.so' that matches nothing.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang sh -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang sh -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"
        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
        # This tells the test cleanup code to NOT delete downloaded models
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi
        # Check MIOpen cache (VAE convolution kernels)
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (1-GPU tests) ==="
        # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
        for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (1-GPU)
      timeout-minutes: 60
      run: |
        # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_SKIP_CONSISTENCY=1 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e SGLANG_DIFFUSION_ARTIFACT_DIR=/sglang-checkout/diffusion-failures \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 1-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 4 \
          -k "not flux_2"
        # Post-test diagnostics
        echo "=== Post-test System Memory Status ==="
        free -h

    # Uploaded regardless of the test outcome; an empty/missing directory is ignored.
    - name: Upload diffusion failure artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: diffusion-failures-amd-rocm720-1gpu-${{ matrix.part }}-${{ github.run_attempt }}
        path: diffusion-failures/
        if-no-files-found: ignore
        retention-days: 7
multimodal-gen-test-2-gpu-amd-rocm720:
  # 2-GPU diffusion-server test suite on ROCm 7.2, split across three matrix partitions.
  needs: [check-changes]
  # Runs when this stage is named in target_stage / target_stage_select; otherwise,
  # with no stage targeted, only if nothing upstream failed or was cancelled and
  # check-changes reports main_package or sgl_kernel as 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi325-2gpu-sglang]
      part: [0, 1, 2]  # 3 partitions: 2 parametrized + 1 standalone (test_disagg_server.py)
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm
    # NOTE(review): the pattern below names a CUDA wheel artifact although this is a
    # ROCm job - confirm this is the artifact the AMD pipeline is meant to consume.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9
    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
        docker exec ci_sglang pip install amdsmi
    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub
        # Clear pre-built AITER kernels from Docker image to avoid segfaults
        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
        #
        # The *.so globs are wrapped in `bash -c '...'` so they are expanded by a shell
        # INSIDE the container. Written bare, the runner's shell would try to expand
        # them against the host filesystem (where these paths do not exist) and hand
        # the literal pattern to rm, which with -f silently removes nothing.
        echo "Clearing pre-built AITER kernels from Docker image..."
        docker exec ci_sglang bash -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang bash -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"
        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
        # This tells the test cleanup code to NOT delete downloaded models
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi
        # Check MIOpen cache (VAE convolution kernels)
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"
    # Purely informational: prints memory, disk, HF cache contents and GPU memory.
    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (2-GPU tests) ==="
        # Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
        for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"
    - name: Run diffusion server tests (2-GPU)
      timeout-minutes: 150
      run: |
        # AMD CI: All 2-GPU tests including LoRA
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_SKIP_CONSISTENCY=1 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e SGLANG_DIFFUSION_ARTIFACT_DIR=/sglang-checkout/diffusion-failures \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 2-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 3
        # Post-test diagnostics
        echo "=== Post-test System Memory Status ==="
        free -h
    # Uploads whatever the tests left in diffusion-failures/; runs even when the
    # test step failed, and a missing directory is ignored.
    - name: Upload diffusion failure artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: diffusion-failures-amd-rocm720-2gpu-${{ matrix.part }}-${{ github.run_attempt }}
        path: diffusion-failures/
        if-no-files-found: ignore
        retention-days: 7
stage-c-test-4-gpu-amd-rocm720:
  # Stage C 4-GPU suite on ROCm 7.2; ordered after the two stage-b jobs in `needs`.
  needs:
    - check-changes
    - stage-b-test-1-gpu-small-amd-rocm720
    - stage-b-test-2-gpu-large-amd-rocm720
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-4gpu-sglang
      part:
        - 0
  # Gate: run when this stage is explicitly targeted; otherwise, with no stage
  # targeted, only when no needed job failed or was cancelled and check-changes
  # reports main_package or sgl_kernel as 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-4-gpu-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Runs the suite with one retry (2 attempts, 120 s wait) and a 1800 s per-file
    # timeout; --continue-on-error is appended only when check-changes asks for it.
    # NOTE(review): SGLANG_USE_ROCM700A=1 is set although the container is started
    # with rocm720 - confirm this flag is intended for the ROCm 7.2 image.
    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e NCCL_CUMEM_ENABLE=0 \
          -e NCCL_NVLS_ENABLE=0 \
          -e RCCL_MSCCL_ENABLE=0 \
          -e SGLANG_USE_ROCM700A=1 \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-4-gpu-amd \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 1 \
          --timeout-per-file 1800 \
          --enable-retry \
          --max-attempts 2 \
          --retry-wait-seconds 120 \
          --retry-timeout-increase 0 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
stage-c-test-large-8-gpu-amd-rocm720:
  # Stage C large 8-GPU suite on MI325 / ROCm 7.2, split into four partitions.
  needs:
    - check-changes
  runs-on: ${{ matrix.runner }}
  env:
    RUNNER_LABELS: linux-mi325-8gpu-sglang
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-8gpu-sglang
      part:
        - 0
        - 1
        - 2
        - 3
  # Gate: run when this stage is explicitly targeted; otherwise, with no stage
  # targeted, only when no needed job failed or was cancelled and check-changes
  # reports main_package or sgl_kernel as 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Pre-flight check: launches the RCCL test script across 8 processes with
    # NCCL/RCCL debug logging enabled before the real suite starts.
    - name: Test RCCL multi-GPU communication
      timeout-minutes: 5
      run: |
        echo "Testing RCCL multi-GPU communication with debug info..."
        docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"

    # --continue-on-error is appended on scheduled runs or when
    # inputs.continue_on_error is set.
    - name: Run test
      timeout-minutes: 120
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-large-8-gpu-amd \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 4 \
          --timeout-per-file 5400 \
          ${{ (github.event_name == 'schedule' || inputs.continue_on_error) && '--continue-on-error' || '' }}
stage-c-test-large-8-gpu-amd-mi35x-rocm720:
  # Stage C large 8-GPU suite on MI35x / ROCm 7.2, split into two partitions.
  needs:
    - check-changes
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi35x-gpu-8
      part:
        - 0
        - 1
  # Gate: run when this stage is explicitly targeted; otherwise, with no stage
  # targeted, only when no needed job failed or was cancelled and check-changes
  # reports main_package or sgl_kernel as 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # --continue-on-error is appended only when check-changes reports
    # continue_on_error == 'true'.
    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-large-8-gpu-amd-mi35x \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 2 \
          --timeout-per-file 3600 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# =============================================== Disaggregation ====================================================
stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720:
  # Disaggregation suite on an 8-GPU MI35x fabric runner (ROCm 7.2). Probes the
  # host and container RDMA setup before running the tests.
  needs:
    - check-changes
  runs-on: ${{ matrix.runner }}
  strategy:
    fail-fast: false
    matrix:
      runner:
        - linux-mi35x-gpu-8.fabric
  # Gate: run when this stage is explicitly targeted; otherwise, with no stage
  # targeted, only when no needed job failed or was cancelled and check-changes
  # reports main_package or sgl_kernel as 'true'.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # Best-effort host probe (`set +e`): prints RDMA diagnostics and exports
    # SGLANG_TEST_RDMA_DEVICE (comma-joined names from /sys/class/infiniband, or
    # empty when that directory is absent) to later steps via GITHUB_ENV.
    - name: Check Host RDMA Environment
      id: rdma_detect
      run: |
        set +e
        echo "=== Checking Host RDMA Environment ==="
        echo ""
        echo "=== 1. Ionic driver library check ==="
        ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path"
        echo ""
        echo "=== 2. Infiniband devices ==="
        ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found"
        ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found"
        echo ""
        echo "=== 3. ibv_devinfo ==="
        which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available"
        echo ""
        echo "=== 4. Kernel modules ==="
        lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded"
        echo ""
        echo "=== 5. Detect RDMA Devices for test environment ==="
        if [ -d "/sys/class/infiniband" ]; then
          RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -)
          echo "Detected RDMA Devices: $RDMA_DEVS"
          echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> $GITHUB_ENV
        else
          echo "No RDMA devices found in /sys/class/infiniband"
          echo "SGLANG_TEST_RDMA_DEVICE=" >> $GITHUB_ENV
        fi
        echo ""
        echo "=== Host RDMA Check Complete ==="

    # Uses the disaggregation-specific container start script rather than the
    # one the other jobs in this workflow call.
    - name: Start Special Container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh --rocm-version rocm720

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    # Informational check run as root inside the container; a zero HCA count only
    # prints a warning and does not fail the step.
    - name: Verify RDMA in Container
      run: |
        docker exec -u root ci_sglang bash -c '
          echo "=== Container RDMA Verification ==="
          echo "Device nodes:"
          ls -la /dev/infiniband/
          echo ""
          echo "Provider libraries:"
          ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers"
          echo ""
          echo "HCA devices:"
          HCA_COUNT=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0")
          ibv_devinfo -list
          if [ "$HCA_COUNT" -gt 0 ]; then
            echo ""
            echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ==="
          else
            echo ""
            echo "=== WARNING: No HCAs detected. RDMA tests may fail ==="
          fi
        '

    - name: Run Aiter Op Test (RMSNorm)
      timeout-minutes: 10
      run: |
        echo "Running pre-check: test_rmsnorm2d.py"
        docker exec \
          -e MAX_JOBS=192 \
          ci_sglang \
          python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py

    # Forwards the RDMA device list detected on the host into the test process.
    - name: Run test_disaggregation
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-b-test-large-8-gpu-mi35x-disaggregation-amd \
          --timeout-per-file 1800 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# =============================================== DeepSeek-V4 (MI35x, 8-GPU) ====================================================
# On `pull_request` and the 6h ('0 */6 * * *') cron these are the ONLY jobs that run;
# they also run on the daily cron alongside the full suite. On PRs they run only when
# call-gate succeeds, i.e. the PR carries the run-ci label (and is not a draft), so
# they are gated just like every other PR test job. On PR they hard-fail (a failure
# blocks merge); on any scheduled run they pass --continue-on-error. They stay
# selectable via workflow_dispatch / run on workflow_call full runs.
dsv4-flash-fp4-fp8-amd-rocm720:
  # DeepSeek-V4-Flash FP4 + FP8 accuracy run on an 8-GPU MI35x runner (ROCm 7.2).
  needs:
    - call-gate
  runs-on: linux-mi35x-gpu-8
  # Gate: never when cancelled. Runs when this stage is explicitly targeted;
  # otherwise, with no stage targeted, on pull_request if call-gate succeeded,
  # on any schedule event, or when inputs.run_all_tests is set.
  if: |
    always() && !cancelled() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',dsv4-flash-fp4-fp8-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (
          (github.event_name == 'pull_request' && needs.call-gate.result == 'success') ||
          (github.event_name == 'schedule') ||
          inputs.run_all_tests
        )
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # Creates github_summary.md in the workspace before the container starts.
    - name: Setup docker (ROCm 7.2)
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        touch github_summary.md
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: |
        # --skip-test-time-deps: GSM8K + bench_one_batch_server don't need lmms-eval / human-eval.
        bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps
        bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

    # The suite's exit code is captured rather than failing immediately, so the
    # summary file is always appended to the step summary before the step exits
    # with that code.
    - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V4-Flash FP4 + FP8)
      timeout-minutes: 300
      run: |
        > github_summary.md # Clear summary file
        # SGLANG_DSV4_ACCURACY_ONLY=1 makes the dsv4 test files skip their perf test
        # (test_b_perf_8k_1k); only the GSM8K accuracy test runs in this workflow.
        bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
          -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
          -e SGLANG_DSV4_ACCURACY_ONLY=1 \
          python3 run_suite.py \
          --hw amd \
          --suite nightly-amd-8-gpu-mi35x-deepseek-v4-flash \
          --nightly \
          --timeout-per-file 7200 \
          ${{ (github.event_name == 'schedule' || inputs.continue_on_error) && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?
        echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
        exit ${TEST_EXIT_CODE:-0}
dsv4-pro-fp4-amd-rocm720:
  # DeepSeek-V4-Pro FP4 accuracy run on an 8-GPU MI35x runner (ROCm 7.2).
  needs:
    - call-gate
  runs-on: linux-mi35x-gpu-8
  # Gate: never when cancelled. Runs when this stage is explicitly targeted;
  # otherwise, with no stage targeted, on pull_request if call-gate succeeded,
  # on any schedule event, or when inputs.run_all_tests is set.
  if: |
    always() && !cancelled() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',dsv4-pro-fp4-amd-rocm720,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (
          (github.event_name == 'pull_request' && needs.call-gate.result == 'success') ||
          (github.event_name == 'schedule') ||
          inputs.run_all_tests
        )
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # Creates github_summary.md in the workspace before the container starts.
    - name: Setup docker (ROCm 7.2)
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: |
        touch github_summary.md
        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

    - name: Install dependencies
      run: |
        # --skip-test-time-deps: GSM8K + bench_one_batch_server don't need lmms-eval / human-eval.
        bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps
        bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

    # Runs the test file directly (not through run_suite.py). The exit code is
    # captured so the summary is always published; on scheduled runs or when
    # inputs.continue_on_error is set the step exits 0 regardless of the result.
    - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V4-Pro FP4)
      timeout-minutes: 480
      run: |
        > github_summary.md # Clear summary file
        # SGLANG_DSV4_ACCURACY_ONLY=1 makes the dsv4 test files skip their perf test
        # (test_b_perf_8k_1k); only the GSM8K accuracy test runs in this workflow.
        bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
          -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
          -e SGLANG_DSV4_ACCURACY_ONLY=1 \
          python3 registered/amd/test_deepseek_v4_pro_fp4.py || TEST_EXIT_CODE=$?
        echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
        if [[ "${{ github.event_name == 'schedule' || inputs.continue_on_error }}" == "true" ]]; then
          exit 0
        fi
        exit ${TEST_EXIT_CODE:-0}
pr-test-amd-rocm720-finish:
  # Aggregation job: always runs, and fails if any job listed in `needs` ended in
  # `failure` or `cancelled`. Any other result (e.g. success, skipped) passes.
  needs:
    [
      call-gate,
      check-changes,
      sgl-kernel-unit-test-amd-rocm720,
      sgl-kernel-unit-test-2-gpu-amd-rocm720,
      multimodal-gen-test-1-gpu-amd-rocm720,
      multimodal-gen-test-2-gpu-amd-rocm720,
      stage-a-test-1-gpu-small-amd-rocm720,
      jit-kernel-unit-test-amd-rocm720,
      stage-b-test-1-gpu-small-amd-rocm720,
      stage-b-test-1-gpu-small-amd-nondeterministic-rocm720,
      stage-b-test-1-gpu-small-amd-mi35x-rocm720,
      stage-b-test-1-gpu-large-amd-rocm720,
      stage-b-test-2-gpu-large-amd-rocm720,
      stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720,
      stage-c-test-4-gpu-amd-rocm720,
      stage-c-test-large-8-gpu-amd-rocm720,
      stage-c-test-large-8-gpu-amd-mi35x-rocm720,
      dsv4-flash-fp4-fp8-amd-rocm720,
      dsv4-pro-fp4-amd-rocm720,
    ]
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Check all dependent job statuses
      env:
        # The `needs` context is handed to the script through the environment
        # instead of being interpolated into the script text: pasted inside
        # single quotes, any quote character in a job output would terminate the
        # shell string and break (or alter) the script.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # Get a list of all job names from the JSON keys
        job_names=$(printf '%s\n' "$NEEDS_JSON" | jq -r 'keys_unsorted[]')
        for job in $job_names; do
          # For each job, extract its result
          result=$(printf '%s\n' "$NEEDS_JSON" | jq -r --arg j "$job" '.[$j].result')
          # Print the job name and its result
          echo "$job: $result"
          # Check for failure or cancellation and exit if found
          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
            echo "The above jobs failed."
            exit 1
          fi
        done
        # If the loop completes, no job failed or was cancelled
        echo "All jobs completed successfully"
        exit 0