Skip to content

PR Test (AMD)

PR Test (AMD) #310

Workflow file for this run

# Display name of the workflow in the Actions UI.
name: PR Test (AMD)
# Dynamic run name so /rerun-stage commands can locate a run by URL:
#   "[stage-name] sha"  when a stage is targeted and pr_head_sha is given (fork PRs)
#   "[stage-name]"      when a stage is targeted without pr_head_sha (non-fork)
#   ""                  otherwise (normal runs use the default run name)
run-name: >-
  ${{
  (inputs.target_stage || inputs.target_stage_select)
  && (inputs.pr_head_sha
  && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha)
  || format('[{0}]', inputs.target_stage || inputs.target_stage_select))
  || ''
  }}
on:
  # Periodic full run: every 6 hours (UTC).
  schedule:
    - cron: '0 */6 * * *'

  # Pull requests that touch any of these paths.
  pull_request:
    paths:
      - 'python/**'
      - 'scripts/ci/**'
      - 'test/**'
      - 'sgl-kernel/**'
      - '.github/workflows/pr-test-amd.yml'
      - 'docker/rocm.Dockerfile'

  # Manual runs, including targeted stage reruns (/rerun-stage).
  workflow_dispatch:
    inputs:
      target_stage_select:
        type: choice
        description: 'Select a stage to run from dropdown (leave empty for auto-detect)'
        required: false
        default: ''
        options:
          - ''
          - sgl-kernel-unit-test-amd
          - sgl-kernel-unit-test-2-gpu-amd
          - stage-a-test-1-gpu-small-amd
          - jit-kernel-unit-test-amd
          - stage-b-test-1-gpu-small-amd
          - stage-b-test-1-gpu-small-amd-nondeterministic
          - stage-b-test-1-gpu-small-amd-mi35x
          - stage-b-test-1-gpu-large-amd
          - stage-b-test-2-gpu-large-amd
          - multimodal-gen-test-1-gpu-amd
          - multimodal-gen-test-2-gpu-amd
          - stage-c-test-4-gpu-amd
          - stage-c-test-large-8-gpu-amd
          - stage-c-test-large-8-gpu-amd-mi35x
          - stage-b-test-large-8-gpu-mi35x-disaggregation-amd
      target_stage:
        type: string
        description: 'Or type comma-separated stage names (overrides dropdown if non-empty)'
        required: false
        default: ''
      pr_head_sha:
        type: string
        description: 'PR head SHA to checkout (for /rerun-stage on fork PRs)'
        required: false
        default: ''
      aiter_ref:
        type: string
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        required: false
        default: ''
      continue_on_error:
        type: boolean
        description: 'Continue on error (do not fail the workflow on test failures)'
        required: false
        default: false
      runner_arch:
        type: choice
        description: 'AMD runner pool to dispatch GPU jobs to'
        required: false
        default: mi325
        options:
          - mi300
          - mi325
      run_all_tests:
        type: boolean
        description: 'Run all tests (skip change detection). Ignored when target_stage / target_stage_select is set.'
        required: false
        default: false

  # Invocation as a reusable workflow from another workflow.
  workflow_call:
    inputs:
      ref:
        type: string
        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
        required: false
        default: ''
      run_all_tests:
        type: boolean
        description: 'Run all tests (for releasing or testing purpose)'
        required: false
        default: false
      aiter_ref:
        type: string
        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
        required: false
        default: ''
      continue_on_error:
        type: boolean
        description: 'Continue on error (do not fail the workflow on test failures)'
        required: false
        default: false
# Same grant as pr-test.yml. The chained extra suite
# (call-pr-test-amd-extra -> pr-test-amd-extra.yml) declares
# actions: write / issues: read / pull-requests: read, and a called reusable
# workflow can only use scopes its caller already holds. Without this block
# the call fails workflow validation
# ("requesting actions: write... but only allowed ...none").
permissions:
  actions: write
  contents: read
  issues: read
  pull-requests: read
# Workflow-wide environment for every job.
env:
  # Optional AITER commit override taken from the aiter_ref input.
  AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}
  # Docker Hub credentials, read from repository secrets.
  DOCKERHUB_AMD_USERNAME: ${{ secrets.DOCKERHUB_AMD_USERNAME }}
  DOCKERHUB_AMD_TOKEN: ${{ secrets.DOCKERHUB_AMD_TOKEN }}
concurrency:
  # run_all_tests, scheduled and manually dispatched runs each get a unique
  # "full-<run_id>" group, so they never cancel one another. Every other run
  # shares a group keyed by pr_head_sha / ref / github.ref, so a new push
  # replaces the stale run for the same branch.
  group: >-
    pr-test-amd-${{
    (inputs.run_all_tests || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
    && format('full-{0}', github.run_id)
    || inputs.pr_head_sha || inputs.ref || github.ref
    }}
  # Cancel in-progress runs only for the shared (per-branch) groups.
  # NOTE(review): inside a reusable workflow the github context presumably
  # reflects the caller, so the 'workflow_call' comparison may never match -
  # confirm before relying on it.
  cancel-in-progress: >-
    ${{
    !(inputs.run_all_tests
    || github.event_name == 'workflow_call'
    || github.event_name == 'schedule'
    || github.event_name == 'workflow_dispatch')
    }}
jobs:
# Delegates to the reusable pr-gate.yml workflow; skipped on scheduled runs.
call-gate:
  uses: ./.github/workflows/pr-gate.yml
  secrets: inherit
  if: github.event_name != 'schedule'
# Decides what the run should test: everything (run_all_tests input or a
# scheduled run) or only the groups whose paths changed. Also derives the
# continue_on_error flag consumed by the test jobs.
check-changes:
  runs-on: ubuntu-latest
  needs:
    - call-gate
  # Evaluate even when call-gate was skipped or failed; the test jobs check
  # the gate result themselves.
  if: always()
  outputs:
    # When the filter step is skipped its outputs are empty, so each group
    # falls back to the run_all_tests value.
    main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
    jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
    continue_on_error: ${{ steps.set-continue-on-error.outputs.continue_on_error }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    # run_all_tests=true for the explicit input and for scheduled runs.
    - name: Determine run mode
      id: run-mode
      run: |
        if [[ "${{ inputs.run_all_tests }}" == "true" || "${{ github.event_name }}" == "schedule" ]]; then
          echo "run_all_tests=true" >> $GITHUB_OUTPUT
          echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }}, event=${{ github.event_name }})"
        else
          echo "run_all_tests=false" >> $GITHUB_OUTPUT
          echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
        fi

    # Enabled by a full run, the continue_on_error input, or the PR label.
    - name: Set continue-on-error for schedule/full runs
      id: set-continue-on-error
      env:
        # `bypass-fastfail` PR label: also disable within-suite fast-fail
        # here. The shared actions/wait-for-jobs already honors the same
        # label to skip cross-stage waits.
        BYPASS_FASTFAIL_LABEL: ${{ contains(github.event.pull_request.labels.*.name, 'bypass-fastfail') }}
      run: |
        if [[ "${{ steps.run-mode.outputs.run_all_tests }}" == "true" \
              || "${{ inputs.continue_on_error }}" == "true" \
              || "$BYPASS_FASTFAIL_LABEL" == "true" ]]; then
          echo "continue_on_error=true" >> $GITHUB_OUTPUT
          echo "Continue-on-error: ENABLED (run_all_tests=${{ steps.run-mode.outputs.run_all_tests }}, input=${{ inputs.continue_on_error }}, bypass-fastfail=$BYPASS_FASTFAIL_LABEL)"
        else
          echo "continue_on_error=false" >> $GITHUB_OUTPUT
          echo "Continue-on-error: DISABLED"
        fi

    # Path-based change detection; skipped entirely in run-all mode.
    - name: Detect file changes
      id: filter
      if: steps.run-mode.outputs.run_all_tests != 'true'
      uses: dorny/paths-filter@v3
      with:
        filters: |
          main_package:
            - "python/sglang/!(multimodal_gen)/**/!(*.md)"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
            - "scripts/ci/amd/*"
            - "scripts/ci/utils/*"
            - "test/**/!(*.md)"
            - ".github/workflows/pr-test-amd.yml"
          sgl_kernel:
            - "sgl-kernel/**/!(*.md|THIRDPARTYNOTICES.txt|LICENSE)"
            - ".github/workflows/pr-test-amd.yml"
          jit_kernel:
            - "python/sglang/jit_kernel/**"
            - "test/registered/jit/**"
            - ".github/workflows/pr-test-amd.yml"
          multimodal_gen:
            - "python/sglang/multimodal_gen/**/!(*.md|*.ipynb)"
            - "python/sglang/cli/**"
            - "python/sglang/srt/observability/**"
            - "python/sglang/jit_kernel/diffusion/**"
            - "test/registered/jit/diffusion/**"
            - "test/registered/jit/benchmark/diffusion/**"
            - "python/pyproject_rocm.toml"
            - "python/pyproject_other.toml"
# =============================================== extra (scheduled) ====================================================
# Chains the label-gated AMD extra tier into the scheduled run, the same way
# pr-test.yml does with `call-pr-test-extra`. On `schedule` (and on a
# run_all_tests dispatch) the extra suite runs on `main` without the
# `run-ci-extra` label, because pr-gate.yml only enforces labels on
# pull_request events. Targeted /rerun-stage dispatches (target_stage set)
# are excluded. The job is deliberately left out of `pr-test-amd-finish`, so
# the base AMD gate never depends on the opt-in extra suite.
call-pr-test-amd-extra:
  uses: ./.github/workflows/pr-test-amd-extra.yml
  secrets: inherit
  if: |
    (github.event_name == 'schedule' || inputs.run_all_tests == true) &&
    !(inputs.target_stage || inputs.target_stage_select)
  with:
    ref: ${{ inputs.pr_head_sha || inputs.ref || '' }}
    aiter_ref: ${{ inputs.aiter_ref }}
    # The extra suite always runs in continue-on-error mode from here.
    continue_on_error: true
# =============================================== sgl-kernel ====================================================
# Single-GPU sgl-kernel pytest files, executed inside the ci_sglang container.
# Runs when explicitly targeted, or (untargeted) when the gate passed or was
# skipped and the sgl_kernel change group is set.
sgl-kernel-unit-test-amd:
  name: ${{ format('sgl-kernel-unit-test-amd (linux-{0}-1gpu-sglang)', inputs.runner_arch || 'mi325') }}
  runs-on: ${{ format('linux-{0}-1gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - call-gate
  if: |
    always() && !cancelled() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped') &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      env:
        CONTINUE_ON_ERROR: ${{ needs.check-changes.outputs.continue_on_error }}
      run: |
        # With CONTINUE_ON_ERROR=true every pytest file is run and failures are
        # counted; the count becomes the step's exit code. Otherwise the first
        # failing command ends the step (fail-fast, as on PRs).
        failures=0
        run_pytest() {
          if [[ "$CONTINUE_ON_ERROR" == "true" ]]; then
            "$@" || failures=$((failures + 1))
          else
            "$@"
          fi
        }
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py
        run_pytest docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py
        exit $failures
# Two-GPU sgl-kernel suite, run through run_suite.py in the CI container.
# Same trigger rule as the single-GPU sgl-kernel job.
sgl-kernel-unit-test-2-gpu-amd:
  name: ${{ format('sgl-kernel-unit-test-2-gpu-amd (linux-{0}-2gpu-sglang)', inputs.runner_arch || 'mi325') }}
  runs-on: ${{ format('linux-{0}-2gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - call-gate
  if: |
    always() && !cancelled() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped') &&
        needs.check-changes.outputs.sgl_kernel == 'true'
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      env:
        # NOTE(review): the command below takes the flag from the expression,
        # not from this variable - confirm whether amd_ci_exec.sh reads it.
        CONTINUE_ON_ERROR: ${{ needs.check-changes.outputs.continue_on_error }}
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite sgl-kernel-unit-test-2-gpu-amd ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# =============================================== primary ====================================================
# Stage A: small single-GPU suite. Runs when explicitly targeted, or
# (untargeted) when the gate passed or was skipped and either the
# main_package or the sgl_kernel change group is set.
stage-a-test-1-gpu-small-amd:
  name: ${{ format('stage-a-test-1-gpu-small-amd (linux-{0}-1gpu-sglang)', inputs.runner_arch || 'mi325') }}
  runs-on: ${{ format('linux-{0}-1gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - call-gate
  if: |
    always() && !cancelled() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-gpu-small-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped') &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-gpu-small-amd ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# JIT kernel unit tests on one GPU. Runs when explicitly targeted, or
# (untargeted) when the gate passed or was skipped and the jit_kernel change
# group is set.
jit-kernel-unit-test-amd:
  name: ${{ format('jit-kernel-unit-test-amd (linux-{0}-1gpu-sglang)', inputs.runner_arch || 'mi325') }}
  runs-on: ${{ format('linux-{0}-1gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - call-gate
  if: |
    always() && !cancelled() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped') &&
        needs.check-changes.outputs.jit_kernel == 'true'
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run JIT kernel unit tests
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite jit-kernel-unit-test-amd ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# =============================================== Wait Jobs for Sequential PR Execution ====================================================
# Wait jobs poll the GitHub API until the previous stage has completed.
#   - PR runs: the wait jobs run, which serializes the stages.
#   - Scheduled runs: the wait jobs are skipped, so all stages run in parallel.
wait-for-stage-a-amd:
  runs-on: ubuntu-latest
  needs:
    - check-changes
    - call-gate
  # Untargeted pull_request runs only, with the gate passed or skipped and
  # either the main_package or the sgl_kernel change group set.
  if: |
    always() &&
    !cancelled() &&
    github.event_name == 'pull_request' &&
    !(inputs.target_stage || inputs.target_stage_select) &&
    (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
    (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
  outputs:
    stage_a_result: ${{ steps.wait.outputs.result }}
  steps:
    # A checkout is required because the wait action is a repository-local action.
    - uses: actions/checkout@v4
    - id: wait
      uses: ./.github/actions/wait-for-jobs
      with:
        stage-name: stage-a-amd
        jobs: '[{"prefix": "stage-a-test-1-gpu-small-amd", "expected_count": 1}]'
        max-wait-minutes: '240'
# Stage B: small single-GPU suite, split into 14 auto-partitions (one matrix
# entry per partition).
stage-b-test-1-gpu-small-amd:
  name: ${{ format('stage-b-test-1-gpu-small-amd (linux-{0}-1gpu-sglang, {1})', inputs.runner_arch || 'mi325', matrix.part) }}
  runs-on: ${{ format('linux-{0}-1gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when explicitly targeted. Untargeted: scheduled runs bypass the
  # failure/cancel check; other events require no failure and no
  # cancellation. Either way a relevant change group must be set.
  # NOTE(review): the targeted branch is guarded only by always(), without
  # the !cancelled() used by the stage-a jobs - confirm that is intended.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      # Must stay in sync with --auto-partition-size below.
      part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 2400 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: the "nondeterministic" small single-GPU suite (not partitioned).
stage-b-test-1-gpu-small-amd-nondeterministic:
  name: ${{ format('stage-b-test-1-gpu-small-amd-nondeterministic (linux-{0}-1gpu-sglang)', inputs.runner_arch || 'mi325') }}
  runs-on: ${{ format('linux-{0}-1gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when explicitly targeted. Untargeted: scheduled runs bypass the
  # failure/cancel check; other events require no failure and no
  # cancellation. Either way a relevant change group must be set.
  # NOTE(review): the targeted branch is guarded only by always(), without
  # the !cancelled() used by the stage-a jobs - confirm that is intended.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-nondeterministic,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 45
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-nondeterministic --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: small single-GPU suite on the MI35x runner. The runner label comes
# from the matrix rather than from the runner_arch input.
stage-b-test-1-gpu-small-amd-mi35x:
  runs-on: ${{matrix.runner}}
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when explicitly targeted. Untargeted: scheduled runs bypass the
  # failure/cancel check; other events require no failure and no
  # cancellation. Either way a relevant change group must be set.
  # NOTE(review): the targeted branch is guarded only by always(), without
  # the !cancelled() used by the stage-a jobs - confirm that is intended.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-mi35x,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-mi35x ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: large single-GPU suite, split into 3 auto-partitions.
stage-b-test-1-gpu-large-amd:
  name: ${{ format('stage-b-test-1-gpu-large-amd (linux-{0}-1gpu-sglang, {1})', inputs.runner_arch || 'mi325', matrix.part) }}
  runs-on: ${{ format('linux-{0}-1gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when explicitly targeted. Untargeted: scheduled runs bypass the
  # failure/cancel check; other events require no failure and no
  # cancellation. Either way a relevant change group must be set.
  # NOTE(review): the targeted branch is guarded only by always(), without
  # the !cancelled() used by the stage-a jobs - confirm that is intended.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-large-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      # Must stay in sync with --auto-partition-size below.
      part: [0, 1, 2]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 45
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 2700 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Stage B: large two-GPU suite, split into 2 auto-partitions.
stage-b-test-2-gpu-large-amd:
  name: ${{ format('stage-b-test-2-gpu-large-amd (linux-{0}-2gpu-sglang, {1})', inputs.runner_arch || 'mi325', matrix.part) }}
  runs-on: ${{ format('linux-{0}-2gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - wait-for-stage-a-amd
  # Runs when explicitly targeted. Untargeted: scheduled runs bypass the
  # failure/cancel check; other events require no failure and no
  # cancellation. Either way a relevant change group must be set.
  # NOTE(review): the targeted branch is guarded only by always(), without
  # the !cancelled() used by the stage-a jobs - confirm that is intended.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-2-gpu-large-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      # Must stay in sync with --auto-partition-size below.
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh

    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 45
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-2-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 2700 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# Diffusion (multimodal_gen) server tests on one GPU, split into 4 partitions.
# Runs when explicitly targeted, or (untargeted) when the gate passed or was
# skipped and the multimodal_gen change group is set.
multimodal-gen-test-1-gpu-amd:
  name: ${{ format('multimodal-gen-test-1-gpu-amd (linux-{0}-1gpu-sglang, {1})', inputs.runner_arch || 'mi325', matrix.part) }}
  needs: [check-changes, call-gate]
  if: |
    always() && !cancelled() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped') &&
        needs.check-changes.outputs.multimodal_gen == 'true'
      )
    )
  strategy:
    fail-fast: false
    matrix:
      # Must stay in sync with --total-partitions below.
      part: [0, 1, 2, 3]
  runs-on: ${{ format('linux-{0}-1gpu-sglang', inputs.runner_arch || 'mi325') }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

    # NOTE(review): the pattern names a CUDA wheel artifact and no job in the
    # visible part of this workflow uploads one - confirm this step is needed
    # on AMD.
    - name: Download artifacts
      if: needs.check-changes.outputs.sgl_kernel == 'true'
      uses: actions/download-artifact@v4
      with:
        path: sgl-kernel/dist/
        merge-multiple: true
        pattern: wheel-python3.10-cuda12.9

    - name: Start CI container
      run: bash scripts/ci/amd/amd_ci_start_container.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: |
        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion

    - name: Setup kernel caches
      run: |
        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
        # This directory persists across container restarts on the self-hosted runner
        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub
        # Clear pre-built AITER kernels from Docker image to avoid segfaults
        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
        echo "Clearing pre-built AITER kernels from Docker image..."
        # The *.so glob has to expand inside the container. Passed unquoted to
        # `docker exec`, it is expanded by the host shell against the host
        # filesystem (or handed to rm literally), so nothing in the container
        # is removed. Running rm through `sh -c` expands it in the container.
        docker exec ci_sglang sh -c 'rm -rf /sgl-workspace/aiter/aiter/jit/*.so' 2>/dev/null || true
        docker exec ci_sglang sh -c 'rm -rf /sgl-data/aiter-kernels/*.so' 2>/dev/null || true
        echo "AITER kernels cleared - will be rebuilt on first use"
        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
        # This tells the test cleanup code to NOT delete downloaded models
        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
          echo "Created .persistent_cache marker - HF cache will persist"
        else
          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
        fi
        # Check MIOpen cache (VAE convolution kernels)
        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
        echo "Found ${miopen_files} MIOpen cache files"

    # Informational only: every probe falls back to a message on failure.
    - name: Diagnose HF cache and system resources
      run: |
        echo "=== System Memory Status ==="
        free -h
        echo ""
        echo "=== Disk Space ==="
        df -h /home/runner/sgl-data 2>/dev/null || df -h
        echo ""
        echo "=== HF Cache Directory Structure ==="
        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
        echo ""
        echo "=== Checking for cached diffusion models (1-GPU tests) ==="
        # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
        for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
          cache_path="/sgl-data/hf-cache/hub/models--${model}"
          if docker exec ci_sglang test -d "$cache_path"; then
            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
            echo "✓ CACHED: $model ($size)"
          else
            echo "✗ NOT CACHED: $model"
          fi
        done
        echo ""
        echo "=== GPU Memory Status ==="
        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

    - name: Run diffusion server tests (1-GPU)
      timeout-minutes: 90
      run: |
        # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
        # Tests: T2V, T2I, I2V, LoRA
        #
        # HF download env vars:
        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
        docker exec \
          -e SGLANG_E2E_TOLERANCE=0.3 \
          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
          -e SGLANG_SKIP_CONSISTENCY=1 \
          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
          -e SGLANG_DIFFUSION_ARTIFACT_DIR=/sglang-checkout/diffusion-failures \
          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
          -w /sglang-checkout/python \
          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
          --suite 1-gpu \
          --partition-id ${{ matrix.part }} \
          --total-partitions 4 \
          -k "not flux_2" \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
        # Post-test diagnostics
        echo "=== Post-test System Memory Status ==="
        free -h

    # Always attempt the upload so failure artifacts survive a failed test step.
    - name: Upload diffusion failure artifacts
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: diffusion-failures-amd-1gpu-${{ matrix.part }}-${{ github.run_attempt }}
        path: diffusion-failures/
        if-no-files-found: ignore
        retention-days: 7
multimodal-gen-test-2-gpu-amd:
name: ${{ format('multimodal-gen-test-2-gpu-amd (linux-{0}-2gpu-sglang, {1})', inputs.runner_arch || 'mi325', matrix.part) }}
needs: [check-changes, call-gate]
if: |
always() && !cancelled() &&
(
(contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd,')) ||
(
!(inputs.target_stage || inputs.target_stage_select) &&
(needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped') &&
needs.check-changes.outputs.multimodal_gen == 'true'
)
)
strategy:
fail-fast: false
matrix:
part: [0, 1, 2] # 3 partitions: 2 parametrized + 1 standalone (test_disagg_server.py)
runs-on: ${{ format('linux-{0}-2gpu-sglang', inputs.runner_arch || 'mi325') }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
- name: Ensure VRAM is clear
run: bash scripts/ci/amd/ensure_vram_clear.sh rocm
- name: Download artifacts
if: needs.check-changes.outputs.sgl_kernel == 'true'
uses: actions/download-artifact@v4
with:
path: sgl-kernel/dist/
merge-multiple: true
pattern: wheel-python3.10-cuda12.9
- name: Start CI container
run: bash scripts/ci/amd/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
- name: Install dependencies
run: |
bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
- name: Setup kernel caches
run: |
# Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub
# Clear pre-built AITER kernels from Docker image to avoid segfaults
# The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
echo "Clearing pre-built AITER kernels from Docker image..."
docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true
docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true
echo "AITER kernels cleared - will be rebuilt on first use"
# Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
# This tells the test cleanup code to NOT delete downloaded models
if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
echo "Created .persistent_cache marker - HF cache will persist"
else
echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
fi
# Check MIOpen cache (VAE convolution kernels)
miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
echo "Found ${miopen_files} MIOpen cache files"
- name: Diagnose HF cache and system resources
run: |
echo "=== System Memory Status ==="
free -h
echo ""
echo "=== Disk Space ==="
df -h /home/runner/sgl-data 2>/dev/null || df -h
echo ""
echo "=== HF Cache Directory Structure ==="
docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
echo ""
echo "=== Checking for cached diffusion models (2-GPU tests) ==="
# Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
cache_path="/sgl-data/hf-cache/hub/models--${model}"
if docker exec ci_sglang test -d "$cache_path"; then
size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
echo "✓ CACHED: $model ($size)"
else
echo "✗ NOT CACHED: $model"
fi
done
echo ""
echo "=== GPU Memory Status ==="
docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"
- name: Run diffusion server tests (2-GPU)
  timeout-minutes: 150
  run: |
    # AMD CI: All 2-GPU tests including LoRA
    # Tests: T2V, T2I, I2V, LoRA
    #
    # HF download env vars:
    # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
    # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
    #
    # The step shell runs with `-e`, so a failing test command would abort the
    # script before the post-test diagnostics. Capture the exit status instead,
    # always print the diagnostics, then propagate the status at the end.
    # NOTE(review): --total-partitions is hard-coded to 3 and presumably must
    # match the size of the job's matrix.part list (defined outside this step).
    test_status=0
    docker exec \
      -e SGLANG_E2E_TOLERANCE=0.3 \
      -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
      -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
      -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
      -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
      -e SGLANG_SKIP_CONSISTENCY=1 \
      -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
      -e SGLANG_DIFFUSION_ARTIFACT_DIR=/sglang-checkout/diffusion-failures \
      -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
      -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
      -e HF_HUB_ENABLE_HF_TRANSFER=1 \
      -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
      -w /sglang-checkout/python \
      ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
      --suite 2-gpu \
      --partition-id ${{ matrix.part }} \
      --total-partitions 3 \
      ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }} || test_status=$?
    # Post-test diagnostics (printed on both success and failure)
    echo "=== Post-test System Memory Status ==="
    free -h
    # Fail the step with the test suite's own exit code.
    exit "$test_status"
- name: Upload diffusion failure artifacts
  # always(): upload even when an earlier step in this job failed.
  uses: actions/upload-artifact@v4
  if: always()
  with:
    # Artifact name is made unique per matrix partition and per run attempt.
    name: diffusion-failures-amd-2gpu-${{ matrix.part }}-${{ github.run_attempt }}
    path: diffusion-failures/
    retention-days: 7
    # An absent or empty directory is not treated as an error.
    if-no-files-found: ignore
wait-for-stage-b-amd:
  # Gate between stage B and stage C. Runs only for pull_request events with no
  # explicitly targeted stage, when main_package or sgl_kernel changed, and when
  # both wait-for-stage-a-amd and call-gate either succeeded or were skipped.
  runs-on: ubuntu-latest
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-a-amd
  if: |
    always() &&
    !cancelled() &&
    github.event_name == 'pull_request' &&
    !(inputs.target_stage || inputs.target_stage_select) &&
    (needs.check-changes.outputs.main_package == 'true' || needs.check-changes.outputs.sgl_kernel == 'true') &&
    (needs.wait-for-stage-a-amd.result == 'success' || needs.wait-for-stage-a-amd.result == 'skipped') &&
    (needs.call-gate.result == 'success' || needs.call-gate.result == 'skipped')
  outputs:
    # Result reported by the `wait` step below.
    stage_b_result: ${{ steps.wait.outputs.result }}
  steps:
    # The checkout is needed because the wait action is referenced by a local path.
    - uses: actions/checkout@v4
    - id: wait
      uses: ./.github/actions/wait-for-jobs
      with:
        stage-name: stage-b-amd
        max-wait-minutes: '480'
        # NOTE(review): expected_count presumably has to match the matrix sizes
        # of the stage-b jobs defined elsewhere in this workflow - confirm when
        # those matrices change.
        jobs: |
          [
            {"prefix": "stage-b-test-1-gpu-small-amd", "expected_count": 14},
            {"prefix": "stage-b-test-2-gpu-large-amd", "expected_count": 2}
          ]
stage-c-test-4-gpu-amd:
  # Stage C suite on a 4-GPU AMD runner. The runner pool comes from
  # inputs.runner_arch and defaults to mi325.
  name: ${{ format('stage-c-test-4-gpu-amd (linux-{0}-4gpu-sglang, {1})', inputs.runner_arch || 'mi325', matrix.part) }}
  runs-on: ${{ format('linux-{0}-4gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-b-amd
  # Runs when this stage is explicitly targeted, or - with no target given -
  # on schedule / when no upstream job failed or was cancelled, provided
  # main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-4-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      # Single partition; keep in sync with --auto-partition-size below.
      part: [0]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicit PR head SHA, then an explicit ref, then the event SHA.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm
    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh
    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 90
      run: |
        # One partition, 5400 s per test file, up to 2 attempts with a 120 s
        # pause between them and no timeout increase on retry.
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e NCCL_CUMEM_ENABLE=0 \
          -e NCCL_NVLS_ENABLE=0 \
          -e RCCL_MSCCL_ENABLE=0 \
          -e SGLANG_USE_ROCM700A=1 \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-4-gpu-amd \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 1 \
          --timeout-per-file 5400 \
          --enable-retry \
          --max-attempts 2 \
          --retry-wait-seconds 120 \
          --retry-timeout-increase 0 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
stage-c-test-large-8-gpu-amd:
  # Stage C large suite on an 8-GPU AMD runner, split into four partitions.
  name: ${{ format('stage-c-test-large-8-gpu-amd (linux-{0}-8gpu-sglang, {1})', inputs.runner_arch || 'mi325', matrix.part) }}
  runs-on: ${{ format('linux-{0}-8gpu-sglang', inputs.runner_arch || 'mi325') }}
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-b-amd
  # Runs when this stage is explicitly targeted, or - with no target given -
  # on schedule / when no upstream job failed or was cancelled, provided
  # main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  env:
    # Same label string as runs-on, exposed to every step as an env variable.
    # NOTE(review): no step visible in this job reads it directly; presumably
    # consumed by the CI scripts - confirm.
    RUNNER_LABELS: ${{ format('linux-{0}-8gpu-sglang', inputs.runner_arch || 'mi325') }}
  strategy:
    fail-fast: false
    matrix:
      # Four partitions; keep in sync with --auto-partition-size below.
      part: [0, 1, 2, 3]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicit PR head SHA, then an explicit ref, then the event SHA.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm
    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh
    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh
    - name: Test RCCL multi-GPU communication
      timeout-minutes: 5
      run: |
        # Pre-check across all 8 GPUs with verbose NCCL/RCCL logging.
        echo "Testing RCCL multi-GPU communication with debug info..."
        docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"
    - name: Run test
      timeout-minutes: 120
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-large-8-gpu-amd \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 4 \
          --timeout-per-file 5400 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
stage-c-test-large-8-gpu-amd-mi35x:
  # Stage C large suite on the 8-GPU MI35x runner, split into two partitions.
  runs-on: ${{ matrix.runner }}
  needs:
    - check-changes
    - call-gate
    - wait-for-stage-b-amd
  # Runs when this stage is explicitly targeted, or - with no target given -
  # on schedule / when no upstream job failed or was cancelled, provided
  # main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-8]
      # Two partitions; keep in sync with --auto-partition-size below.
      part: [0, 1]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicit PR head SHA, then an explicit ref, then the event SHA.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm
    - name: Start CI container
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
      run: bash scripts/ci/amd/amd_ci_start_container.sh
    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh
    - name: Run test
      timeout-minutes: 60
      run: |
        bash scripts/ci/amd/amd_ci_exec.sh \
          -w "/sglang-checkout/test" \
          python3 run_suite.py \
          --hw amd \
          --suite stage-c-test-large-8-gpu-amd-mi35x \
          --auto-partition-id ${{ matrix.part }} \
          --auto-partition-size 2 \
          --timeout-per-file 3600 \
          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
# =============================================== Disaggregation ====================================================
stage-b-test-large-8-gpu-mi35x-disaggregation-amd:
  # Disaggregation suite on the MI35x fabric runner. The job probes the host
  # for RDMA devices, exports the detected device list to later steps through
  # GITHUB_ENV, verifies RDMA visibility inside the container, runs an AITER
  # RMSNorm pre-check and finally the disaggregation test suite.
  needs: [check-changes, wait-for-stage-a-amd]
  # Runs when this stage is explicitly targeted, or - with no target given -
  # on schedule / when no upstream job failed or was cancelled, provided
  # main_package or sgl_kernel changed.
  if: |
    always() &&
    (
      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-mi35x-disaggregation-amd,')) ||
      (
        !(inputs.target_stage || inputs.target_stage_select) &&
        ((github.event_name == 'schedule') || (!failure() && !cancelled())) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    fail-fast: false
    matrix:
      runner: [linux-mi35x-gpu-8.fabric]
  runs-on: ${{ matrix.runner }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # Prefer an explicit PR head SHA, then an explicit ref, then the event SHA.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
    - name: Ensure VRAM is clear
      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm
    - name: Check Host RDMA Environment
      id: rdma_detect
      run: |
        # Purely diagnostic: individual probes are allowed to fail.
        set +e
        echo "=== Checking Host RDMA Environment ==="
        echo ""
        echo "=== 1. Ionic driver library check ==="
        ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path"
        echo ""
        echo "=== 2. Infiniband devices ==="
        ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found"
        ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found"
        echo ""
        echo "=== 3. ibv_devinfo ==="
        which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available"
        echo ""
        echo "=== 4. Kernel modules ==="
        lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded"
        echo ""
        echo "=== 5. Detect RDMA Devices for test environment ==="
        # Export a comma-separated device list (or an empty value) for later steps.
        # The redirect target is quoted so a path with spaces cannot split it.
        if [ -d "/sys/class/infiniband" ]; then
          RDMA_DEVS=$(ls /sys/class/infiniband | paste -sd "," -)
          echo "Detected RDMA Devices: $RDMA_DEVS"
          echo "SGLANG_TEST_RDMA_DEVICE=$RDMA_DEVS" >> "$GITHUB_ENV"
        else
          echo "No RDMA devices found in /sys/class/infiniband"
          echo "SGLANG_TEST_RDMA_DEVICE=" >> "$GITHUB_ENV"
        fi
        echo ""
        echo "=== Host RDMA Check Complete ==="
    - name: Start Special Container
      run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}
    - name: Install dependencies
      run: bash scripts/ci/amd/amd_ci_install_dependency.sh
    - name: Verify RDMA in Container
      run: |
        # Only reports what the container sees: a missing HCA produces a warning
        # in the log, not a step failure.
        docker exec -u root ci_sglang bash -c '
          echo "=== Container RDMA Verification ==="
          echo "Device nodes:"
          ls -la /dev/infiniband/
          echo ""
          echo "Provider libraries:"
          ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers"
          echo ""
          echo "HCA devices:"
          HCA_COUNT=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0")
          ibv_devinfo -list
          if [ "$HCA_COUNT" -gt 0 ]; then
            echo ""
            echo "=== SUCCESS: RDMA setup complete. Found $HCA_COUNT HCA(s) ==="
          else
            echo ""
            echo "=== WARNING: No HCAs detected. RDMA tests may fail ==="
          fi
        '
    - name: Run Aiter Op Test (RMSNorm)
      timeout-minutes: 10
      run: |
        echo "Running pre-check: test_rmsnorm2d.py"
        docker exec \
          -e MAX_JOBS=192 \
          ci_sglang \
          python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py
    - name: Run test_disaggregation
      timeout-minutes: 60
      run: |
        # SGLANG_TEST_RDMA_DEVICE was written to GITHUB_ENV by the host RDMA
        # check, so it is an ordinary environment variable in this step. Read it
        # from the shell rather than interpolating it into the script text.
        bash scripts/ci/amd/amd_ci_exec.sh \
          -e SGLANG_TEST_RDMA_DEVICE="$SGLANG_TEST_RDMA_DEVICE" \
          -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-8-gpu-mi35x-disaggregation-amd --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}
pr-test-amd-finish:
  # Final aggregation job: always runs, and fails if any job listed in `needs`
  # ended with result "failure" or "cancelled". "skipped" and "success" both pass.
  needs:
    - call-gate
    - check-changes
    - sgl-kernel-unit-test-amd
    - sgl-kernel-unit-test-2-gpu-amd
    - multimodal-gen-test-1-gpu-amd
    - multimodal-gen-test-2-gpu-amd
    - wait-for-stage-a-amd
    - stage-a-test-1-gpu-small-amd
    - jit-kernel-unit-test-amd
    - wait-for-stage-b-amd
    - stage-b-test-1-gpu-small-amd
    - stage-b-test-1-gpu-small-amd-nondeterministic
    - stage-b-test-1-gpu-small-amd-mi35x
    - stage-b-test-1-gpu-large-amd
    - stage-b-test-2-gpu-large-amd
    - stage-b-test-large-8-gpu-mi35x-disaggregation-amd
    - stage-c-test-4-gpu-amd
    - stage-c-test-large-8-gpu-amd
    - stage-c-test-large-8-gpu-amd-mi35x
  if: always()
  runs-on: ubuntu-latest
  steps:
    - name: Check all dependent job statuses
      env:
        # The 'needs' context is passed through the environment instead of being
        # pasted into the script: the JSON includes job outputs, and a single
        # quote inside any of them would break (or inject into) a quoted literal.
        NEEDS_JSON: ${{ toJson(needs) }}
      run: |
        # Get a list of all job names from the JSON keys
        job_names=$(jq -r 'keys_unsorted[]' <<< "$NEEDS_JSON")
        for job in $job_names; do
          # For each job, extract its result
          result=$(jq -r --arg j "$job" '.[$j].result' <<< "$NEEDS_JSON")
          # Print the job name and its result
          echo "$job: $result"
          # Check for failure or cancellation and exit if found
          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
            echo "The above jobs failed."
            exit 1
          fi
        done
        # If the loop completes, no job failed or was cancelled
        echo "All jobs completed successfully"
        exit 0