PR Test ROCm 7.2 (AMD) #117

Workflow file for this run

.github/workflows/pr-test-amd-rocm720.yml at cb1d0dd

	name: PR Test ROCm 7.2 (AMD)
	# Dynamic run-name for /rerun-stage commands to enable URL lookup
	# Format: "[stage-name] sha" for fork PRs, "[stage-name]" for non-fork, default for normal runs
	run-name: ${{ (inputs.target_stage || inputs.target_stage_select) && (inputs.pr_head_sha && format('[{0}] {1}', inputs.target_stage || inputs.target_stage_select, inputs.pr_head_sha) || format('[{0}]', inputs.target_stage || inputs.target_stage_select)) || '' }}

	on:
	  schedule:
	    # Daily run.
	    - cron: '30 17 * * *'
	    # Every 6 hours (UTC): target dsv4 accuracy jobs only.
	    # The call-gate job compares github.event.schedule against this exact string.
	    - cron: '0 */6 * * *'
	  # push:
	  #   branches: [ main ]
	  #   paths:
	  #     - "python/**"
	  #     - "scripts/ci/**"
	  #     - "test/**"
	  #     - "sgl-kernel/**"
	  #     - ".github/workflows/pr-test-amd-rocm720.yml"
	  #     - "docker/rocm.Dockerfile"
	  pull_request:
	    paths:
	      - "python/**"
	      - "scripts/ci/**"
	      - "test/**"
	      - "sgl-kernel/**"
	      - ".github/workflows/pr-test-amd-rocm720.yml"
	      - "docker/rocm.Dockerfile"
	  workflow_dispatch:
	    inputs:
	      target_stage_select:
	        description: "Select a stage to run from dropdown (leave empty for auto-detect)"
	        required: false
	        type: choice
	        default: ''
	        options:
	          - ''
	          - sgl-kernel-unit-test-amd-rocm720
	          - sgl-kernel-unit-test-2-gpu-amd-rocm720
	          - stage-a-test-1-gpu-small-amd-rocm720
	          - jit-kernel-unit-test-amd-rocm720
	          - stage-b-test-1-gpu-small-amd-rocm720
	          - stage-b-test-1-gpu-small-amd-nondeterministic-rocm720
	          - stage-b-test-1-gpu-small-amd-mi35x-rocm720
	          - stage-b-test-1-gpu-large-amd-rocm720
	          - stage-b-test-2-gpu-large-amd-rocm720
	          - multimodal-gen-test-1-gpu-amd-rocm720
	          - multimodal-gen-test-2-gpu-amd-rocm720
	          - stage-c-test-large-8-gpu-amd-rocm720
	          - stage-c-test-large-8-gpu-amd-mi35x-rocm720
	          - stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720
	          - stage-c-test-4-gpu-amd-rocm720
	          - dsv4-flash-fp4-fp8-amd-rocm720
	          - dsv4-pro-fp4-amd-rocm720
	      target_stage:
	        description: "Or type comma-separated stage names (overrides dropdown if non-empty)"
	        required: false
	        type: string
	        default: ""
	      pr_head_sha:
	        description: "PR head SHA to checkout (for /rerun-stage on fork PRs)"
	        required: false
	        type: string
	        default: ""
	      aiter_ref:
	        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
	        required: false
	        type: string
	        default: ''
	      continue_on_error:
	        description: 'Continue on error (do not fail the workflow on test failures)'
	        required: false
	        type: boolean
	        default: true
	      run_all_tests:
	        description: 'Run all tests (skip change detection). Ignored when target_stage / target_stage_select is set.'
	        required: false
	        type: boolean
	        default: false
	  workflow_call:
	    inputs:
	      ref:
	        description: 'Git ref (branch, tag, or SHA) to test. If not provided, uses the default branch.'
	        required: false
	        type: string
	        default: ''
	      run_all_tests:
	        description: "Run all tests (for releasing or testing purpose)"
	        required: false
	        type: boolean
	        default: false
	      aiter_ref:
	        description: 'Override AITER commit (optional, leave empty to use Dockerfile default)'
	        required: false
	        type: string
	        default: ''
	      continue_on_error:
	        description: 'Continue on error (do not fail the workflow on test failures)'
	        required: false
	        type: boolean
	        default: true

	# Mirror pr-test.yml: the chained extra suite (call-pr-test-amd-extra-rocm720
	# -> pr-test-amd-extra.yml) declares actions: write / issues: read /
	# pull-requests: read. A called reusable workflow can only use scopes the
	# caller already holds, so without this block the call fails workflow
	# validation ("requesting actions: write... but only allowed ...none").
	permissions:
	  actions: write
	  contents: read
	  issues: read
	  pull-requests: read

	env:
	  AITER_COMMIT_OVERRIDE: ${{ inputs.aiter_ref }}
	  DOCKERHUB_AMD_USERNAME: ${{ secrets.DOCKERHUB_AMD_USERNAME }}
	  DOCKERHUB_AMD_TOKEN: ${{ secrets.DOCKERHUB_AMD_TOKEN }}

	concurrency:
	  # When called via workflow_call with run_all_tests=true, use a unique group per run to
	  # avoid collisions with direct schedule/workflow_dispatch triggers. We use run_all_tests
	  # (not github.event_name) to detect this, because github.event_name inherits from the caller.
	  # Manual dispatch runs also get unique groups so they never cancel each other.
	  group: pr-test-amd-rocm720-${{ (inputs.run_all_tests || github.event_name == 'workflow_dispatch') && format('full-{0}', github.run_id) || inputs.pr_head_sha || inputs.ref || github.ref }}
	  cancel-in-progress: ${{ !inputs.run_all_tests && github.event_name != 'workflow_call' && github.event_name != 'workflow_dispatch' }}

	jobs:
	call-gate:
	  # Runs on PRs (pr-gate.yml enforces the run-ci label / blocks drafts), the daily
	  # '30 17 * * *' cron, workflow_dispatch and workflow_call. It is skipped only on the
	  # 6h ('0 */6 * * *') cron, which cascades a skip to check-changes and every existing
	  # test job, so that cron runs the two dsv4 jobs only. On PRs the existing jobs are
	  # additionally kept off via the change-detection guard in check-changes, so only the
	  # dsv4 jobs run there -- and only when this gate passes (they `needs` it).
	  #
	  # The string below must stay byte-identical to the 6h entry under on.schedule;
	  # github.event.schedule carries the cron expression that triggered the run.
	  if: github.event.schedule != '0 */6 * * *'
	  uses: ./.github/workflows/pr-gate.yml
	  secrets: inherit
	# Decides which test groups run: either everything (run_all_tests / schedule)
	# or only the groups whose paths changed, and whether tests continue on error.
	check-changes:
	  needs: [call-gate]
	  runs-on: ubuntu-latest
	  outputs:
	    # Each group is 'true' when its path filter matched OR a full run was requested.
	    main_package: ${{ steps.filter.outputs.main_package || steps.run-mode.outputs.run_all_tests }}
	    sgl_kernel: ${{ steps.filter.outputs.sgl_kernel || steps.run-mode.outputs.run_all_tests }}
	    jit_kernel: ${{ steps.filter.outputs.jit_kernel || steps.run-mode.outputs.run_all_tests }}
	    multimodal_gen: ${{ steps.filter.outputs.multimodal_gen || steps.run-mode.outputs.run_all_tests }}
	    continue_on_error: ${{ steps.set-continue-on-error.outputs.continue_on_error }}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Determine run mode
	      id: run-mode
	      run: |
	        # Run all tests when run_all_tests is requested (workflow_dispatch checkbox or
	        # workflow_call input) or on a scheduled run.
	        # Note: github.event_name is inherited from the caller, so workflow_call cannot be
	        # detected from it; the run_all_tests input is checked instead.
	        # Any scheduled run that reaches check-changes is the daily '30 17 * * *' cron
	        # (the 6h '0 */6 * * *' cron skips call-gate -> check-changes), so the daily run
	        # executes the full suite (existing jobs + dsv4), like pr-test-amd.yml.
	        if [[ "${{ inputs.run_all_tests }}" == "true" || "${{ github.event_name }}" == "schedule" ]]; then
	          echo "run_all_tests=true" >> "$GITHUB_OUTPUT"
	          echo "Run mode: ALL TESTS (run_all_tests=${{ inputs.run_all_tests }}, event=${{ github.event_name }})"
	        else
	          echo "run_all_tests=false" >> "$GITHUB_OUTPUT"
	          echo "Run mode: FILTERED (triggered by ${{ github.event_name }})"
	        fi

	    - name: Set continue-on-error for schedule/full runs
	      id: set-continue-on-error
	      run: |
	        # Mirror pr-test-amd.yml: continue-on-error when any of
	        # - run_all_tests was requested (workflow_dispatch checkbox, workflow_call input,
	        #   or a scheduled run, which run-mode above rolls into run_all_tests),
	        # - inputs.continue_on_error was explicitly set,
	        # - or this is a scheduled run (kept as an explicit belt-and-suspenders guard).
	        if [[ "${{ steps.run-mode.outputs.run_all_tests }}" == "true" || "${{ inputs.continue_on_error }}" == "true" || "${{ github.event_name }}" == "schedule" ]]; then
	          echo "continue_on_error=true" >> "$GITHUB_OUTPUT"
	          echo "Continue-on-error: ENABLED (run_all_tests=${{ steps.run-mode.outputs.run_all_tests }}, input=${{ inputs.continue_on_error }}, event=${{ github.event_name }})"
	        else
	          echo "continue_on_error=false" >> "$GITHUB_OUTPUT"
	          echo "Continue-on-error: DISABLED"
	        fi

	    - name: Detect file changes
	      id: filter
	      uses: dorny/paths-filter@v3
	      # On pull_request only the dsv4 jobs run; skip change detection on PRs so the
	      # per-stage outputs stay 'false' and the existing test jobs are gated off.
	      if: steps.run-mode.outputs.run_all_tests != 'true' && github.event_name != 'pull_request'
	      with:
	        # NOTE(review): the '**/!(*.md)' globs below were reconstructed from a rendering
	        # that dropped '*' characters -- confirm against the committed workflow file.
	        filters: |
	          main_package:
	            - "python/sglang/!(multimodal_gen)/**/!(*.md)"
	            - "python/pyproject_rocm.toml"
	            - "python/pyproject_other.toml"
	            - "scripts/ci/amd/*"
	            - "scripts/ci/utils/*"
	            - "test/**/!(*.md)"
	            - ".github/workflows/pr-test-amd-rocm720.yml"
	          sgl_kernel:
	            - "sgl-kernel/**/!(*.md|THIRDPARTYNOTICES.txt|LICENSE)"
	            - ".github/workflows/pr-test-amd-rocm720.yml"
	          jit_kernel:
	            - "python/sglang/jit_kernel/**"
	            - "test/registered/jit/**"
	            - ".github/workflows/pr-test-amd-rocm720.yml"
	          multimodal_gen:
	            - "python/sglang/multimodal_gen/**/!(*.md|*.ipynb)"
	            - "python/sglang/cli/**"
	            - "python/sglang/srt/observability/**"
	            - "python/sglang/jit_kernel/diffusion/**"
	            - "test/registered/jit/diffusion/**"
	            - "test/registered/jit/benchmark/diffusion/**"
	            - "python/pyproject_rocm.toml"
	            - "python/pyproject_other.toml"

	# =============================================== extra (scheduled) ====================================================
	# ROCm 7.2 mirror of pr-test-amd.yml's `call-pr-test-amd-extra`: chain the
	# label-gated AMD extra tier into this workflow's daily schedule, but in a
	# ROCm 7.2 container (`rocm_version: rocm720`). On `schedule` (and
	# run_all_tests dispatch) the extra suite runs on `main` without the
	# `run-ci-extra` label (pr-gate.yml only enforces labels on pull_request
	# events). Targeted dispatches (target_stage set) are excluded. Not wired
	# into any finish aggregator so the base rocm720 run never depends on it.
	call-pr-test-amd-extra-rocm720:
	  # Runs the extra suite on the daily schedule or a run_all_tests dispatch/call,
	  # never for targeted stage reruns. This job has no `needs`, so the skip that
	  # call-gate applies on the 6h cron does not reach it; the 6h cron is therefore
	  # excluded explicitly here to keep that trigger limited to the dsv4 jobs.
	  if: |
	    (
	      (github.event_name == 'schedule' && github.event.schedule != '0 */6 * * *') ||
	      inputs.run_all_tests == true
	    ) &&
	    !(inputs.target_stage || inputs.target_stage_select)
	  uses: ./.github/workflows/pr-test-amd-extra.yml
	  with:
	    ref: ${{ inputs.pr_head_sha || inputs.ref || '' }}
	    rocm_version: rocm720
	    aiter_ref: ${{ inputs.aiter_ref }}
	    continue_on_error: true
	  secrets: inherit

	# =============================================== sgl-kernel ====================================================
	# Single-GPU sgl-kernel pytest files, run inside the rocm720 CI container.
	# Runs when explicitly targeted, or (with no target) when sgl_kernel changed.
	sgl-kernel-unit-test-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        needs.check-changes.outputs.sgl_kernel == 'true'
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-1gpu-sglang]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: |
	        bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 30
	      # One pytest invocation per file; the step stops at the first failing command.
	      run: |
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_align.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_softmax.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests/speculative ci_sglang python3 -m pytest test_eagle_utils.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_apply_token_bitmask_inplace.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_activation.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_topk.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_kvcacheio.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_moe_topk_sigmoid.py
	        docker exec -w /sglang-checkout/sgl-kernel/tests ci_sglang python3 -m pytest test_torch_defaults_reset.py

	# Two-GPU sgl-kernel suite, run through run_suite.py in the rocm720 CI container.
	# Runs when explicitly targeted, or (with no target) when sgl_kernel changed.
	sgl-kernel-unit-test-2-gpu-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',sgl-kernel-unit-test-2-gpu-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        needs.check-changes.outputs.sgl_kernel == 'true'
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-2gpu-sglang]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: |
	        bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 30
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite sgl-kernel-unit-test-2-gpu-amd

	# =============================================== primary ====================================================

	# Stage A: small single-GPU suite in the rocm720 CI container.
	# Runs when explicitly targeted, or (with no target, and nothing failed/cancelled)
	# when main_package or sgl_kernel changed.
	stage-a-test-1-gpu-small-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-a-test-1-gpu-small-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-1gpu-sglang]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: |
	        bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 30
	      # --continue-on-error is appended only when check-changes enabled it.
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-a-test-1-gpu-small-amd ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# JIT kernel unit-test suite in the rocm720 CI container.
	# Runs when explicitly targeted, or (with no target) when jit_kernel changed.
	jit-kernel-unit-test-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',jit-kernel-unit-test-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        needs.check-changes.outputs.jit_kernel == 'true'
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-1gpu-sglang]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: |
	        bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run JIT kernel unit tests
	      timeout-minutes: 30
	      # Use the continue_on_error value computed by check-changes (as the other
	      # suites do) rather than the raw input, which is empty on scheduled runs.
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite jit-kernel-unit-test-amd ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# Stage B: small single-GPU suite, split into 14 auto-partitions (matrix.part).
	# Runs when explicitly targeted, or (with no target, and nothing failed/cancelled)
	# when main_package or sgl_kernel changed.
	stage-b-test-1-gpu-small-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-1gpu-sglang]
	      # Keep the number of entries equal to --auto-partition-size below.
	      part: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 60
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 14 --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# Stage B: the "nondeterministic" small single-GPU suite (unpartitioned).
	# Runs when explicitly targeted, or (with no target, and nothing failed/cancelled)
	# when main_package or sgl_kernel changed.
	stage-b-test-1-gpu-small-amd-nondeterministic-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-nondeterministic-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-1gpu-sglang]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 45
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-nondeterministic --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# Stage B: small single-GPU suite on the linux-mi35x-gpu-1 runner.
	# Runs when explicitly targeted, or (with no target, and nothing failed/cancelled)
	# when main_package or sgl_kernel changed.
	stage-b-test-1-gpu-small-amd-mi35x-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-small-amd-mi35x-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi35x-gpu-1]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 30
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-small-amd-mi35x ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# Stage B: large single-GPU suite, split into 2 auto-partitions (matrix.part).
	# Runs when explicitly targeted, or (with no target, and nothing failed/cancelled)
	# when main_package or sgl_kernel changed.
	stage-b-test-1-gpu-large-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-1-gpu-large-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-1gpu-sglang]
	      # Keep the number of entries equal to --auto-partition-size below.
	      part: [0, 1]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 45
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-1-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# Stage B: large two-GPU suite, split into 2 auto-partitions (matrix.part).
	# Runs when explicitly targeted, or (with no target, and nothing failed/cancelled)
	# when main_package or sgl_kernel changed.
	stage-b-test-2-gpu-large-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-2-gpu-large-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-2gpu-sglang]
	      # Keep the number of entries equal to --auto-partition-size below.
	      part: [0, 1]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 45
	      run: |
	        bash scripts/ci/amd/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-2-gpu-large-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800 ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# Single-GPU diffusion (multimodal_gen) server tests, split into 4 partitions.
	# Runs when explicitly targeted, or (with no target, and nothing failed/cancelled)
	# when main_package or sgl_kernel changed.
	# NOTE(review): check-changes also exposes a `multimodal_gen` output that this
	# condition does not consult -- confirm which output should gate this job.
	multimodal-gen-test-1-gpu-amd-rocm720:
	  needs: [check-changes]
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',multimodal-gen-test-1-gpu-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  strategy:
	    fail-fast: false
	    matrix:
	      runner: [linux-mi325-1gpu-sglang]
	      # Keep the number of entries equal to --total-partitions below.
	      part: [0, 1, 2, 3]
	  runs-on: ${{matrix.runner}}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    # NOTE(review): the pattern names a CUDA wheel artifact, and no job visible in
	    # this part of the workflow uploads it -- confirm it is still wanted on ROCm.
	    - name: Download artifacts
	      if: needs.check-changes.outputs.sgl_kernel == 'true'
	      uses: actions/download-artifact@v4
	      with:
	        path: sgl-kernel/dist/
	        merge-multiple: true
	        pattern: wheel-python3.10-cuda12.9

	    - name: Start CI container
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}

	    - name: Install dependencies
	      run: |
	        bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
	        docker exec ci_sglang pip install amdsmi

	    - name: Setup kernel caches
	      run: |
	        # Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
	        # This directory persists across container restarts on the self-hosted runner
	        docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

	        # Clear pre-built AITER kernels from Docker image to avoid segfaults
	        # The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
	        # NOTE(review): the *.so globs below are expanded by the runner's shell before
	        # `docker exec` runs, not inside the container, so they may match nothing there.
	        # Confirm the intent; wrapping in `bash -c '...'` would expand them in-container.
	        echo "Clearing pre-built AITER kernels from Docker image..."
	        docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null || true
	        docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null || true
	        echo "AITER kernels cleared - will be rebuilt on first use"

	        # Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
	        # This tells the test cleanup code to NOT delete downloaded models
	        if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
	          docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
	          echo "Created .persistent_cache marker - HF cache will persist"
	        else
	          echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
	        fi

	        # Check MIOpen cache (VAE convolution kernels)
	        miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null | wc -l || echo "0")
	        echo "Found ${miopen_files} MIOpen cache files"

	    - name: Diagnose HF cache and system resources
	      run: |
	        echo "=== System Memory Status ==="
	        free -h
	        echo ""
	        echo "=== Disk Space ==="
	        df -h /home/runner/sgl-data 2>/dev/null || df -h
	        echo ""
	        echo "=== HF Cache Directory Structure ==="
	        docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null || echo "HF cache dir not found"
	        docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null || echo "HF hub cache not found"
	        echo ""
	        echo "=== Checking for cached diffusion models (1-GPU tests) ==="
	        # Models used in 1-GPU tests: Wan2.1-T2V-1.3B, HunyuanVideo, Qwen-Image, FLUX.1, FLUX.2
	        for model in "Wan-AI--Wan2.1-T2V-1.3B-Diffusers" "tencent--HunyuanVideo" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev" "black-forest-labs--FLUX.2-dev"; do
	          cache_path="/sgl-data/hf-cache/hub/models--${model}"
	          if docker exec ci_sglang test -d "$cache_path"; then
	            size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null | cut -f1)
	            echo "✓ CACHED: $model ($size)"
	          else
	            echo "✗ NOT CACHED: $model"
	          fi
	        done
	        echo ""
	        echo "=== GPU Memory Status ==="
	        docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null || echo "rocm-smi not available"

	    - name: Run diffusion server tests (1-GPU)
	      timeout-minutes: 60
	      run: |
	        # AMD CI: All 1-GPU tests except FLUX.2 (FLUX.1 covers same code path)
	        # Tests: T2V, T2I, I2V, LoRA
	        #
	        # HF download env vars:
	        # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
	        # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
	        docker exec \
	          -e SGLANG_E2E_TOLERANCE=0.3 \
	          -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
	          -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
	          -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
	          -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
	          -e SGLANG_SKIP_CONSISTENCY=1 \
	          -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
	          -e SGLANG_DIFFUSION_ARTIFACT_DIR=/sglang-checkout/diffusion-failures \
	          -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
	          -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
	          -e HF_HUB_ENABLE_HF_TRANSFER=1 \
	          -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
	          -w /sglang-checkout/python \
	          ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
	          --suite 1-gpu \
	          --partition-id ${{ matrix.part }} \
	          --total-partitions 4 \
	          -k "not flux_2"

	        # Post-test diagnostics
	        echo "=== Post-test System Memory Status ==="
	        free -h

	    - name: Upload diffusion failure artifacts
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: diffusion-failures-amd-rocm720-1gpu-${{ matrix.part }}-${{ github.run_attempt }}
	        path: diffusion-failures/
	        if-no-files-found: ignore
	        retention-days: 7

	multimodal-gen-test-2-gpu-amd-rocm720:
	needs: [check-changes]
	if: \|
	always() &&
	(
	(contains(format(',{0},', inputs.target_stage \|\| inputs.target_stage_select), ',multimodal-gen-test-2-gpu-amd-rocm720,')) \|\|
	(
	!(inputs.target_stage \|\| inputs.target_stage_select) &&
	(!failure() && !cancelled()) &&
	((needs.check-changes.outputs.main_package == 'true') \|\| (needs.check-changes.outputs.sgl_kernel == 'true'))
	)
	)
	strategy:
	fail-fast: false
	matrix:
	runner: [linux-mi325-2gpu-sglang]
	part: [0, 1, 2] # 3 partitions: 2 parametrized + 1 standalone (test_disagg_server.py)
	runs-on: ${{matrix.runner}}
	steps:
	- name: Checkout code
	uses: actions/checkout@v4
	with:
	ref: ${{ inputs.pr_head_sha \|\| inputs.ref \|\| github.sha }}

	- name: Ensure VRAM is clear
	run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	- name: Download artifacts
	if: needs.check-changes.outputs.sgl_kernel == 'true'
	uses: actions/download-artifact@v4
	with:
	path: sgl-kernel/dist/
	merge-multiple: true
	pattern: wheel-python3.10-cuda12.9

	- name: Start CI container
	run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720
	env:
	GITHUB_WORKSPACE: ${{ github.workspace }}

	- name: Install dependencies
	run: \|
	bash scripts/ci/amd/amd_ci_install_dependency.sh diffusion
	docker exec ci_sglang pip install amdsmi

	- name: Setup kernel caches
	run: \|
	# Use the persistent /sgl-data directory (mounted from /home/runner/sgl-data)
	docker exec ci_sglang mkdir -p /sgl-data/aiter-kernels /sgl-data/miopen-cache /sgl-data/hf-cache/hub

	# Clear pre-built AITER kernels from Docker image to avoid segfaults
	# The image may have stale/incompatible kernels at /sgl-workspace/aiter/aiter/jit/
	echo "Clearing pre-built AITER kernels from Docker image..."
	docker exec ci_sglang rm -rf /sgl-workspace/aiter/aiter/jit/*.so 2>/dev/null \|\| true
	docker exec ci_sglang rm -rf /sgl-data/aiter-kernels/*.so 2>/dev/null \|\| true
	echo "AITER kernels cleared - will be rebuilt on first use"

	# Create persistent cache marker if /sgl-data is a real mount (not ephemeral)
	# This tells the test cleanup code to NOT delete downloaded models
	if docker exec ci_sglang test -d /sgl-data && docker exec ci_sglang mountpoint -q /sgl-data 2>/dev/null; then
	docker exec ci_sglang touch /sgl-data/hf-cache/.persistent_cache
	echo "Created .persistent_cache marker - HF cache will persist"
	else
	echo "WARNING: /sgl-data is not a mount point - models will be cleaned up after each test"
	fi

	# Check MIOpen cache (VAE convolution kernels)
	miopen_files=$(docker exec ci_sglang find /sgl-data/miopen-cache -name "*.udb" 2>/dev/null \| wc -l \|\| echo "0")
	echo "Found ${miopen_files} MIOpen cache files"

	- name: Diagnose HF cache and system resources
	run: \|
	echo "=== System Memory Status ==="
	free -h
	echo ""
	echo "=== Disk Space ==="
	df -h /home/runner/sgl-data 2>/dev/null \|\| df -h
	echo ""
	echo "=== HF Cache Directory Structure ==="
	docker exec ci_sglang ls -la /sgl-data/hf-cache/ 2>/dev/null \|\| echo "HF cache dir not found"
	docker exec ci_sglang ls -la /sgl-data/hf-cache/hub/ 2>/dev/null \|\| echo "HF hub cache not found"
	echo ""
	echo "=== Checking for cached diffusion models (2-GPU tests) ==="
	# Models used in 2-GPU tests: Wan2.2-T2V-A14B, Wan2.1-T2V-14B, Qwen-Image, FLUX.1
	for model in "Wan-AI--Wan2.2-T2V-A14B-Diffusers" "Wan-AI--Wan2.1-T2V-14B-Diffusers" "Qwen--Qwen-Image" "black-forest-labs--FLUX.1-dev"; do
	cache_path="/sgl-data/hf-cache/hub/models--${model}"
	if docker exec ci_sglang test -d "$cache_path"; then
	size=$(docker exec ci_sglang du -sh "$cache_path" 2>/dev/null \| cut -f1)
	echo "✓ CACHED: $model ($size)"
	else
	echo "✗ NOT CACHED: $model"
	fi
	done
	echo ""
	echo "=== GPU Memory Status ==="
	docker exec ci_sglang rocm-smi --showmeminfo vram 2>/dev/null \|\| echo "rocm-smi not available"

	- name: Run diffusion server tests (2-GPU)
	  timeout-minutes: 150
	  run: |
	    # AMD CI: All 2-GPU tests including LoRA
	    # Tests: T2V, T2I, I2V, LoRA
	    #
	    # HF download env vars:
	    # - HF_HUB_ENABLE_HF_TRANSFER=1: Use faster hf_transfer for downloads (if available)
	    # - HF_HUB_DISABLE_SYMLINKS_WARNING=1: Suppress symlink warnings
	    #
	    # The step script runs under `bash -e`, so a failing `docker exec` would
	    # abort the script before the post-test diagnostics below. Capture the
	    # exit code instead, print the diagnostics, then propagate the result.
	    test_exit_code=0
	    docker exec \
	      -e SGLANG_E2E_TOLERANCE=0.3 \
	      -e SGLANG_STAGE_TIME_TOLERANCE=0.2 \
	      -e SGLANG_NON_DENOISE_STAGE_TIME_TOLERANCE=0.6 \
	      -e SGLANG_DENOISE_STEP_TOLERANCE=0.6 \
	      -e SGLANG_DENOISE_AGG_TOLERANCE=0.3 \
	      -e SGLANG_SKIP_CONSISTENCY=1 \
	      -e SGLANG_TEST_NUM_INFERENCE_STEPS=5 \
	      -e SGLANG_DIFFUSION_ARTIFACT_DIR=/sglang-checkout/diffusion-failures \
	      -e AITER_JIT_DIR=/sgl-data/aiter-kernels \
	      -e MIOPEN_USER_DB_PATH=/sgl-data/miopen-cache \
	      -e HF_HUB_ENABLE_HF_TRANSFER=1 \
	      -e HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
	      -w /sglang-checkout/python \
	      ci_sglang python3 sglang/multimodal_gen/test/run_suite.py \
	      --suite 2-gpu \
	      --partition-id ${{ matrix.part }} \
	      --total-partitions 3 || test_exit_code=$?

	    # Post-test diagnostics (printed on success AND failure)
	    echo "=== Post-test System Memory Status ==="
	    # Diagnostics are best-effort and must never mask the real test result.
	    free -h || true

	    exit "$test_exit_code"

	- name: Upload diffusion failure artifacts
	  # always(): still upload when an earlier step in this job failed.
	  if: always()
	  uses: actions/upload-artifact@v4
	  with:
	    # The artifact name embeds the matrix partition and the run attempt.
	    name: diffusion-failures-amd-rocm720-2gpu-${{ matrix.part }}-${{ github.run_attempt }}
	    path: diffusion-failures/
	    retention-days: 7
	    # Do not fail or warn when the directory has no files.
	    if-no-files-found: ignore


	stage-c-test-4-gpu-amd-rocm720:
	  # Stage C, 4-GPU suite. Scheduled after check-changes and two stage-b jobs.
	  needs:
	    - check-changes
	    - stage-b-test-1-gpu-small-amd-rocm720
	    - stage-b-test-2-gpu-large-amd-rocm720
	  # Runs when this stage is explicitly named in target_stage /
	  # target_stage_select, or, when no stage is targeted, if nothing upstream
	  # failed or was cancelled and check-changes reports main_package or
	  # sgl_kernel changes.
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-4-gpu-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  runs-on: ${{ matrix.runner }}
	  strategy:
	    fail-fast: false
	    matrix:
	      runner:
	        - linux-mi325-4gpu-sglang
	      part: [0]
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        # Prefer an explicitly supplied PR head SHA or ref; otherwise use the
	        # commit that triggered this run.
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 60
	      run: |
	        # NOTE(review): SGLANG_USE_ROCM700A=1 is set in a ROCm 7.2 job;
	        # confirm this flag is still intended here.
	        # The final expression expands to --continue-on-error or to nothing,
	        # depending on the check-changes continue_on_error output.
	        bash scripts/ci/amd/amd_ci_exec.sh \
	          -e NCCL_CUMEM_ENABLE=0 \
	          -e NCCL_NVLS_ENABLE=0 \
	          -e RCCL_MSCCL_ENABLE=0 \
	          -e SGLANG_USE_ROCM700A=1 \
	          -w "/sglang-checkout/test" \
	          python3 run_suite.py \
	          --hw amd \
	          --suite stage-c-test-4-gpu-amd \
	          --auto-partition-id ${{ matrix.part }} \
	          --auto-partition-size 1 \
	          --timeout-per-file 1800 \
	          --enable-retry \
	          --max-attempts 2 \
	          --retry-wait-seconds 120 \
	          --retry-timeout-increase 0 \
	          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	stage-c-test-large-8-gpu-amd-rocm720:
	  # Stage C, large 8-GPU suite, split into 4 partitions via the matrix.
	  needs:
	    - check-changes
	  # Runs when this stage is explicitly targeted, or, when no stage is
	  # targeted, if nothing upstream failed or was cancelled and check-changes
	  # reports main_package or sgl_kernel changes.
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  runs-on: ${{ matrix.runner }}
	  env:
	    RUNNER_LABELS: linux-mi325-8gpu-sglang
	  strategy:
	    fail-fast: false
	    matrix:
	      runner:
	        - linux-mi325-8gpu-sglang
	      part: [0, 1, 2, 3]
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        # Prefer an explicitly supplied PR head SHA or ref; otherwise use the
	        # commit that triggered this run.
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Test RCCL multi-GPU communication
	      timeout-minutes: 5
	      run: |
	        # Launch the RCCL check with 8 processes inside the container, with
	        # NCCL/RCCL debug logging enabled.
	        echo "Testing RCCL multi-GPU communication with debug info..."
	        docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/amd/test_rccl_multi_gpu.py"

	    - name: Run test
	      timeout-minutes: 120
	      run: |
	        # The final expression expands to --continue-on-error on scheduled
	        # runs or when inputs.continue_on_error is set, otherwise to nothing.
	        bash scripts/ci/amd/amd_ci_exec.sh \
	          -w "/sglang-checkout/test" \
	          python3 run_suite.py \
	          --hw amd \
	          --suite stage-c-test-large-8-gpu-amd \
	          --auto-partition-id ${{ matrix.part }} \
	          --auto-partition-size 4 \
	          --timeout-per-file 5400 \
	          ${{ (github.event_name == 'schedule' || inputs.continue_on_error) && '--continue-on-error' || '' }}

	stage-c-test-large-8-gpu-amd-mi35x-rocm720:
	  # Stage C, large 8-GPU suite on the MI35x runner, split into 2 partitions.
	  needs:
	    - check-changes
	  # Runs when this stage is explicitly targeted, or, when no stage is
	  # targeted, if nothing upstream failed or was cancelled and check-changes
	  # reports main_package or sgl_kernel changes.
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-c-test-large-8-gpu-amd-mi35x-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  runs-on: ${{ matrix.runner }}
	  strategy:
	    fail-fast: false
	    matrix:
	      runner:
	        - linux-mi35x-gpu-8
	      part: [0, 1]
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        # Prefer an explicitly supplied PR head SHA or ref; otherwise use the
	        # commit that triggered this run.
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Start CI container
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}
	      run: bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Run test
	      timeout-minutes: 60
	      run: |
	        # The final expression expands to --continue-on-error or to nothing,
	        # depending on the check-changes continue_on_error output.
	        bash scripts/ci/amd/amd_ci_exec.sh \
	          -w "/sglang-checkout/test" \
	          python3 run_suite.py \
	          --hw amd \
	          --suite stage-c-test-large-8-gpu-amd-mi35x \
	          --auto-partition-id ${{ matrix.part }} \
	          --auto-partition-size 2 \
	          --timeout-per-file 3600 \
	          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# =============================================== Disaggregation ====================================================
	stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720:
	  # Disaggregation suite on the 8-GPU MI35x fabric runner. Probes the host
	  # for RDMA devices, starts the disaggregation-specific container, verifies
	  # RDMA inside it, runs an aiter RMSNorm op test, then runs the suite.
	  needs:
	    - check-changes
	  # Runs when this stage is explicitly targeted, or, when no stage is
	  # targeted, if nothing upstream failed or was cancelled and check-changes
	  # reports main_package or sgl_kernel changes.
	  if: |
	    always() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (!failure() && !cancelled()) &&
	        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
	      )
	    )
	  runs-on: ${{ matrix.runner }}
	  strategy:
	    fail-fast: false
	    matrix:
	      runner:
	        - linux-mi35x-gpu-8.fabric

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        # Prefer an explicitly supplied PR head SHA or ref; otherwise use the
	        # commit that triggered this run.
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Check Host RDMA Environment
	      id: rdma_detect
	      run: |
	        # Purely diagnostic: disable errexit so a failing probe cannot fail the step.
	        set +e

	        # Print a blank separator line followed by a "=== title ===" banner.
	        section() {
	          echo ""
	          echo "=== $1 ==="
	        }

	        echo "=== Checking Host RDMA Environment ==="

	        section "1. Ionic driver library check"
	        ls -l /usr/lib/x86_64-linux-gnu/libibverbs/libionic* 2>/dev/null || echo "libionic not found in standard path"

	        section "2. Infiniband devices"
	        ls -la /dev/infiniband/ 2>/dev/null || echo "/dev/infiniband not found"
	        ls -la /sys/class/infiniband/ 2>/dev/null || echo "/sys/class/infiniband not found"

	        section "3. ibv_devinfo"
	        which ibv_devinfo 2>/dev/null && ibv_devinfo 2>&1 || echo "ibv_devinfo not available"

	        section "4. Kernel modules"
	        lsmod 2>/dev/null | grep -E "ib_|rdma|ionic" || echo "No RDMA kernel modules loaded"

	        section "5. Detect RDMA Devices for test environment"
	        # Export a comma-separated device list (empty when the sysfs class
	        # directory is missing) to later steps via GITHUB_ENV.
	        rdma_devices=""
	        if [ -d "/sys/class/infiniband" ]; then
	          rdma_devices=$(ls /sys/class/infiniband | paste -sd "," -)
	          echo "Detected RDMA Devices: $rdma_devices"
	        else
	          echo "No RDMA devices found in /sys/class/infiniband"
	        fi
	        echo "SGLANG_TEST_RDMA_DEVICE=$rdma_devices" >> $GITHUB_ENV

	        section "Host RDMA Check Complete"

	    - name: Start Special Container
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}
	      run: bash scripts/ci/amd/amd_ci_start_container_disagg.sh --rocm-version rocm720

	    - name: Install dependencies
	      run: bash scripts/ci/amd/amd_ci_install_dependency.sh

	    - name: Verify RDMA in Container
	      run: |
	        # The whole check runs as root inside the container. A zero HCA count
	        # only prints a warning; the script does not exit non-zero for it.
	        docker exec -u root ci_sglang bash -c '
	          echo "=== Container RDMA Verification ==="
	          echo "Device nodes:"
	          ls -la /dev/infiniband/
	          echo ""
	          echo "Provider libraries:"
	          ls /usr/lib/x86_64-linux-gnu/libibverbs/ | grep -E "ionic|mlx" || echo "No Ionic/Mellanox providers"
	          echo ""
	          echo "HCA devices:"
	          hca_count=$(ibv_devinfo -list 2>&1 | grep -oE "^[0-9]+ HCAs? found" | grep -oE "^[0-9]+" || echo "0")
	          ibv_devinfo -list
	          echo ""
	          if [ "$hca_count" -gt 0 ]; then
	            echo "=== SUCCESS: RDMA setup complete. Found $hca_count HCA(s) ==="
	          else
	            echo "=== WARNING: No HCAs detected. RDMA tests may fail ==="
	          fi
	        '

	    - name: Run Aiter Op Test (RMSNorm)
	      timeout-minutes: 10
	      run: |
	        echo "Running pre-check: test_rmsnorm2d.py"
	        docker exec -e MAX_JOBS=192 ci_sglang \
	          python /sgl-workspace/aiter/op_tests/test_rmsnorm2d.py

	    - name: Run test_disaggregation
	      timeout-minutes: 60
	      run: |
	        # SGLANG_TEST_RDMA_DEVICE was exported by the host RDMA check step.
	        # The final expression expands to --continue-on-error or to nothing,
	        # depending on the check-changes continue_on_error output.
	        bash scripts/ci/amd/amd_ci_exec.sh \
	          -e SGLANG_TEST_RDMA_DEVICE="${{ env.SGLANG_TEST_RDMA_DEVICE }}" \
	          -w "/sglang-checkout/test" \
	          python3 run_suite.py \
	          --hw amd \
	          --suite stage-b-test-large-8-gpu-mi35x-disaggregation-amd \
	          --timeout-per-file 1800 \
	          ${{ needs.check-changes.outputs.continue_on_error == 'true' && '--continue-on-error' || '' }}

	# =============================================== DeepSeek-V4 (MI35x, 8-GPU) ====================================================
	# On `pull_request` and the 6h ('0 */6 * * *') cron these are the ONLY jobs that run;
	# they also run on the daily cron alongside the full suite. On PRs they run only when
	# call-gate succeeds, i.e. the PR carries the run-ci label (and is not a draft), so
	# they are gated just like every other PR test job. On PR they hard-fail (a failure
	# blocks merge); on any scheduled run they pass --continue-on-error. They stay
	# selectable via workflow_dispatch / run on workflow_call full runs.
	dsv4-flash-fp4-fp8-amd-rocm720:
	  # DeepSeek-V4-Flash FP4 + FP8 accuracy job (8-GPU MI35x).
	  needs:
	    - call-gate
	  # Runs when explicitly targeted, or, when no stage is targeted, on a pull
	  # request whose call-gate succeeded, on any scheduled run, or when
	  # inputs.run_all_tests is set. Never runs once the workflow is cancelled.
	  if: |
	    always() && !cancelled() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',dsv4-flash-fp4-fp8-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (
	          (github.event_name == 'pull_request' && needs.call-gate.result == 'success') ||
	          (github.event_name == 'schedule') ||
	          inputs.run_all_tests
	        )
	      )
	    )
	  runs-on: linux-mi35x-gpu-8
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        # Prefer an explicitly supplied PR head SHA or ref; otherwise use the
	        # commit that triggered this run.
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Setup docker (ROCm 7.2)
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}
	      run: |
	        # Create the summary file in the workspace before starting the container.
	        touch github_summary.md
	        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

	    - name: Install dependencies
	      run: |
	        # --skip-test-time-deps: GSM8K + bench_one_batch_server don't need lmms-eval / human-eval.
	        bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps
	        bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

	    - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V4-Flash FP4 + FP8)
	      timeout-minutes: 300
	      run: |
	        > github_summary.md # Clear summary file

	        # SGLANG_DSV4_ACCURACY_ONLY=1 makes the dsv4 test files skip their perf test
	        # (test_b_perf_8k_1k); only the GSM8K accuracy test runs in this workflow.
	        # The suite's exit status is captured rather than aborting the script,
	        # so the summary below is published even when the suite fails.
	        bash scripts/ci/amd/amd_ci_exec.sh \
	          -w /sglang-checkout/test \
	          -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
	          -e SGLANG_DSV4_ACCURACY_ONLY=1 \
	          python3 run_suite.py \
	          --hw amd \
	          --suite nightly-amd-8-gpu-mi35x-deepseek-v4-flash \
	          --nightly \
	          --timeout-per-file 7200 \
	          ${{ (github.event_name == 'schedule' || inputs.continue_on_error) && '--continue-on-error' || '' }} || TEST_EXIT_CODE=$?

	        # Append the summary file to the real step summary (best-effort).
	        echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true

	        # Report the suite's own status (0 when it succeeded).
	        exit ${TEST_EXIT_CODE:-0}

	dsv4-pro-fp4-amd-rocm720:
	  # DeepSeek-V4-Pro FP4 accuracy job (8-GPU MI35x).
	  needs:
	    - call-gate
	  # Runs when explicitly targeted, or, when no stage is targeted, on a pull
	  # request whose call-gate succeeded, on any scheduled run, or when
	  # inputs.run_all_tests is set. Never runs once the workflow is cancelled.
	  if: |
	    always() && !cancelled() &&
	    (
	      (contains(format(',{0},', inputs.target_stage || inputs.target_stage_select), ',dsv4-pro-fp4-amd-rocm720,')) ||
	      (
	        !(inputs.target_stage || inputs.target_stage_select) &&
	        (
	          (github.event_name == 'pull_request' && needs.call-gate.result == 'success') ||
	          (github.event_name == 'schedule') ||
	          inputs.run_all_tests
	        )
	      )
	    )
	  runs-on: linux-mi35x-gpu-8
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        # Prefer an explicitly supplied PR head SHA or ref; otherwise use the
	        # commit that triggered this run.
	        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

	    - name: Ensure VRAM is clear
	      run: bash scripts/ci/amd/ensure_vram_clear.sh rocm

	    - name: Setup docker (ROCm 7.2)
	      env:
	        GITHUB_WORKSPACE: ${{ github.workspace }}
	      run: |
	        # Create the summary file in the workspace before starting the container.
	        touch github_summary.md
	        bash scripts/ci/amd/amd_ci_start_container.sh --rocm-version rocm720

	    - name: Install dependencies
	      run: |
	        # --skip-test-time-deps: GSM8K + bench_one_batch_server don't need lmms-eval / human-eval.
	        bash scripts/ci/amd/amd_ci_install_dependency.sh --skip-test-time-deps
	        bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate

	    - name: Accuracy Test MI35x ROCm 7.2 (8-GPU DeepSeek-V4-Pro FP4)
	      timeout-minutes: 480
	      run: |
	        > github_summary.md # Clear summary file

	        # SGLANG_DSV4_ACCURACY_ONLY=1 makes the dsv4 test files skip their perf test
	        # (test_b_perf_8k_1k); only the GSM8K accuracy test runs in this workflow.
	        # The test's exit status is captured rather than aborting the script,
	        # so the summary below is published even when the test fails.
	        bash scripts/ci/amd/amd_ci_exec.sh \
	          -w /sglang-checkout/test \
	          -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
	          -e SGLANG_DSV4_ACCURACY_ONLY=1 \
	          python3 registered/amd/test_deepseek_v4_pro_fp4.py || TEST_EXIT_CODE=$?

	        # Append the summary file to the real step summary (best-effort).
	        echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true

	        # Scheduled runs, and runs with inputs.continue_on_error set, always
	        # pass; every other run reports the test's own status.
	        tolerate_failure="${{ github.event_name == 'schedule' || inputs.continue_on_error }}"
	        if [[ "$tolerate_failure" == "true" ]]; then
	          exit 0
	        fi
	        exit ${TEST_EXIT_CODE:-0}

	pr-test-amd-rocm720-finish:
	  # Final gate job: always runs, and fails if any job listed in `needs`
	  # ended with result "failure" or "cancelled". Other results do not fail it.
	  needs:
	    - call-gate
	    - check-changes

	    - sgl-kernel-unit-test-amd-rocm720
	    - sgl-kernel-unit-test-2-gpu-amd-rocm720
	    - multimodal-gen-test-1-gpu-amd-rocm720
	    - multimodal-gen-test-2-gpu-amd-rocm720

	    - stage-a-test-1-gpu-small-amd-rocm720
	    - jit-kernel-unit-test-amd-rocm720
	    - stage-b-test-1-gpu-small-amd-rocm720
	    - stage-b-test-1-gpu-small-amd-nondeterministic-rocm720
	    - stage-b-test-1-gpu-small-amd-mi35x-rocm720
	    - stage-b-test-1-gpu-large-amd-rocm720
	    - stage-b-test-2-gpu-large-amd-rocm720
	    - stage-b-test-large-8-gpu-mi35x-disaggregation-amd-rocm720
	    - stage-c-test-4-gpu-amd-rocm720
	    - stage-c-test-large-8-gpu-amd-rocm720
	    - stage-c-test-large-8-gpu-amd-mi35x-rocm720

	    - dsv4-flash-fp4-fp8-amd-rocm720
	    - dsv4-pro-fp4-amd-rocm720
	  if: always()
	  runs-on: ubuntu-latest
	  steps:
	    - name: Check all dependent job statuses
	      env:
	        # Pass the `needs` context through the environment instead of pasting
	        # it into the script text: the JSON includes job outputs, and a single
	        # quote in any of them would break out of a shell string literal.
	        NEEDS_JSON: ${{ toJson(needs) }}
	      run: |
	        # One "<job>\t<result>" line per dependency, in declaration order.
	        job_results=$(jq -r 'to_entries[] | [.key, .value.result] | @tsv' <<< "$NEEDS_JSON")

	        # Print every job's result before deciding, so the log lists all
	        # failed or cancelled jobs rather than only the first one.
	        any_failed=0
	        while IFS=$'\t' read -r job result; do
	          echo "$job: $result"
	          if [[ "$result" == "failure" || "$result" == "cancelled" ]]; then
	            any_failed=1
	          fi
	        done <<< "$job_results"

	        if [[ "$any_failed" -ne 0 ]]; then
	          echo "The above jobs failed."
	          exit 1
	        fi

	        # No job reported failure or cancellation.
	        echo "All jobs completed successfully"
	        exit 0

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

PR Test ROCm 7.2 (AMD) #117

Workflow file

PR Test ROCm 7.2 (AMD) #117

Uh oh!

Workflow file for this run