Nightly Test (GB200 72GPU) #78

Workflow file for this run

.github/workflows/nightly-72-gpu-gb200.yml at cb1d0dd

	name: Nightly Test (GB200 72GPU)

	# Triggers.
	# NOTE: Nightly (schedule) runs require no approval.
	# Manual (workflow_dispatch) runs are gated by the gb200-ci environment
	# to prevent individuals from queuing arbitrary jobs on the shared GB200 cluster.
	on:
	  schedule:
	    - cron: '0 2 * * *'  # 2 AM UTC daily (offset from other nightly runs)
	  workflow_dispatch:  # allow manual trigger; gated by gb200-ci environment
	    # `image`, `pr_number` and `sglang_branch` each select an image source and
	    # are mutually exclusive; `configs` narrows the benchmark matrix.
	    inputs:
	      image:
	        description: 'Optional. SGLang Docker image to benchmark. Leave empty for the default nightly image. Mutually exclusive with pr_number and sglang_branch.'
	        required: false
	        default: ''
	      pr_number:
	        description: 'Optional. PR number to build from (works for PRs from forks too, via refs/pull/<N>/head). Preferred over sglang_branch when a PR exists. Mutually exclusive with image and sglang_branch.'
	        required: false
	        default: ''
	      sglang_branch:
	        description: 'Optional. Branch name on sgl-project/sglang to build from (use when no PR is open yet). For fork branches, open a PR and use pr_number instead. Mutually exclusive with image and pr_number.'
	        required: false
	        default: ''
	      configs:
	        description: 'Optional. Comma-separated names to run only a subset. Format: {model-prefix}-{precision}-{isl}{osl}-{recipe}. E.g. "dsr1-fp8-1k1k-max-tpt" or "dsr1-fp8-1k1k-max-tpt,dsr1-fp4-1k1k-mid-curve". Leave empty to run all. Available names are listed in the setup job log.'
	        required: false
	        default: ''

	# All runs share one concurrency group; a newly queued run waits for the
	# in-flight one instead of cancelling it.
	concurrency:
	  group: nightly-test-gb200
	  cancel-in-progress: false

	# Workflow-wide environment. Values are quoted because environment variables
	# are always strings.
	env:
	  SGLANG_IS_IN_CI: 'true'
	  SGLANG_ENABLE_ASYNC_ASSERT: 'true'
	  SRT_SLURM_BRANCH: sglang-nightly-regression
	  SLURM_PARTITION: batch
	  SLURM_ACCOUNT: sglang
	  # Docker Hub repo for ephemeral branch/PR build images (kept separate from
	  # the released `lmsysorg/sglang` repo). Cleaned up by `cleanup-image`.
	  CI_IMAGE_REPO: lmsysorg/sglang-staging
	  # How many most recent staging tags to retain after each run.
	  CI_IMAGE_KEEP_TAGS: '60'

	jobs:
	# ---------------------------------------------------------------------------
	# Reject conflicting inputs early. At most one of `image`, `pr_number`,
	# `sglang_branch` may be set — they select different image sources. Only runs
	# on manual dispatch; all downstream jobs chain through this so invalid
	# inputs halt the pipeline before cluster resources are reserved.
	# ---------------------------------------------------------------------------
	validate-inputs:
	  # Runs only for manual dispatches in the upstream repository; for any other
	  # event the job is skipped.
	  if: github.repository == 'sgl-project/sglang' && github.event_name == 'workflow_dispatch'
	  runs-on: ubuntu-latest
	  steps:
	    - name: Reject conflicting inputs
	      # Inputs reach the script through env instead of being expanded into the
	      # script text, so their contents can never be executed as shell code.
	      env:
	        IMAGE: ${{ inputs.image }}
	        PR: ${{ inputs.pr_number }}
	        BRANCH: ${{ inputs.sglang_branch }}
	      run: |
	        # Count how many of the three image sources were supplied.
	        sources=0
	        for value in "$IMAGE" "$PR" "$BRANCH"; do
	          if [ -n "$value" ]; then
	            sources=$((sources + 1))
	          fi
	        done
	        if [ "$sources" -gt 1 ]; then
	          echo "::error::Specify at most one of 'image' ('$IMAGE'), 'pr_number' ('$PR'), or 'sglang_branch' ('$BRANCH')."
	          exit 1
	        fi
	        # Whole-value match: rejects 0, leading zeros and multi-line values,
	        # none of which name a pull request.
	        if [ -n "$PR" ] && ! [[ "$PR" =~ ^[1-9][0-9]*$ ]]; then
	          echo "::error::pr_number must be a positive integer, got '$PR'."
	          exit 1
	        fi

	# ---------------------------------------------------------------------------
	# Reads scripts/ci/slurm/nightly-configs.yaml and generates one matrix entry
	# per recipe YAML. Each job runs the full concurrency sweep defined in the
	# recipe as a single Slurm job.
	# To add/remove configs, edit nightly-configs.yaml only.
	# ---------------------------------------------------------------------------
	setup:
	  needs: validate-inputs
	  # Run if validate-inputs succeeded (dispatch) or was skipped (cron).
	  # always() is needed so a skipped dependency does not skip this job too.
	  if: |
	    always() && github.repository == 'sgl-project/sglang'
	    && (needs.validate-inputs.result == 'success' || needs.validate-inputs.result == 'skipped')
	  runs-on: ubuntu-latest
	  outputs:
	    # JSON array consumed by the benchmark job's strategy.matrix.
	    matrix: ${{ steps.generate.outputs.matrix }}
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    - name: Generate benchmark matrix
	      id: generate
	      env:
	        # Optional comma-separated subset requested on manual dispatch.
	        CONFIGS_FILTER: ${{ inputs.configs }}
	      run: |
	        pip install pyyaml -q

	        # List all available config names first so they're visible in logs
	        # even when a filter rejects an unknown name.
	        ALL_MATRIX=$(python3 scripts/ci/slurm/generate_matrix.py \
	          scripts/ci/slurm/nightly-configs.yaml --runner gb200)
	        echo "Available config names for runner gb200:"
	        echo "$ALL_MATRIX" | python3 -c "import json,sys; [print(f' - {e[\"name\"]}') for e in json.load(sys.stdin)]"

	        # Build the optional --filter argument as an array so an empty filter
	        # expands to no arguments at all.
	        FILTER_ARG=()
	        if [ -n "$CONFIGS_FILTER" ]; then
	          echo ""
	          echo "Filtering to: $CONFIGS_FILTER"
	          FILTER_ARG=(--filter "$CONFIGS_FILTER")
	        fi
	        MATRIX=$(python3 scripts/ci/slurm/generate_matrix.py \
	          scripts/ci/slurm/nightly-configs.yaml --runner gb200 "${FILTER_ARG[@]}")
	        echo "matrix=$MATRIX" >> "$GITHUB_OUTPUT"

	# ---------------------------------------------------------------------------
	# When pr_number or sglang_branch is provided, build an ARM64 (GB200) image
	# from that ref and push it to Docker Hub under lmsysorg/sglang-staging.
	# Uses refs/pull/<N>/head for PRs so fork PRs work without cross-repo auth.
	# Old staging tags are pruned by `cleanup-image` at the end of the run.
	# Skipped on nightly (cron) runs and manual runs with neither pr_number nor
	# sglang_branch.
	# ---------------------------------------------------------------------------
	build-image:
	  needs: [validate-inputs, setup]
	  # Only manual dispatches that name a PR or a branch need a fresh image.
	  if: |
	    github.repository == 'sgl-project/sglang' && github.event_name == 'workflow_dispatch'
	    && (inputs.pr_number != '' || inputs.sglang_branch != '')
	  runs-on: arm-docker-build-node
	  outputs:
	    # Full `repo:tag` reference and the bare tag of the pushed image.
	    image_ref: ${{ steps.build.outputs.image_ref }}
	    image_tag: ${{ steps.build.outputs.image_tag }}
	  steps:
	    # Self-hosted runners retain the workspace across jobs. Prior `docker buildx`
	    # runs on this node leave root-owned build artifacts (e.g. sgl-kernel/build/)
	    # that actions/checkout cannot remove, causing EACCES on rmdir. Wipe them
	    # via a throwaway root container before checkout recreates the workspace.
	    - name: Clean workspace (remove root-owned files from prior runs)
	      run: |
	        docker run --rm -v "${{ github.workspace }}:/workspace" alpine:3 \
	          sh -c 'rm -rf /workspace/..?* /workspace/.[!.]* /workspace/*' || true

	    - name: Checkout code
	      uses: actions/checkout@v4
	      with:
	        # PRs (including fork PRs) resolve via refs/pull/<N>/head on upstream.
	        # Otherwise fall back to the branch name on sgl-project/sglang.
	        ref: ${{ inputs.pr_number && format('refs/pull/{0}/head', inputs.pr_number) || inputs.sglang_branch }}

	    # Prints the checked-out commit plus links for cross-checking it by hand.
	    - name: Verify checkout
	      env:
	        PR_NUMBER: ${{ inputs.pr_number }}
	        BRANCH: ${{ inputs.sglang_branch }}
	      run: |
	        SHA=$(git rev-parse HEAD)
	        echo "Commit SHA: $SHA"
	        echo "Author: $(git log -1 --format='%an <%ae>')"
	        echo "Date: $(git log -1 --format='%aI')"
	        echo "Subject: $(git log -1 --format='%s')"
	        echo ""
	        if [ -n "$PR_NUMBER" ]; then
	          echo "Cross-check: https://github.com/sgl-project/sglang/pull/${PR_NUMBER}/commits"
	        else
	          echo "Cross-check: https://github.com/sgl-project/sglang/commits/${BRANCH}"
	        fi
	        echo "Commit URL: https://github.com/sgl-project/sglang/commit/${SHA}"

	    - name: Set up Docker Buildx
	      uses: docker/setup-buildx-action@v3

	    - name: Login to Docker Hub
	      uses: docker/login-action@v3
	      with:
	        username: ${{ secrets.DOCKERHUB_USERNAME }}
	        password: ${{ secrets.DOCKERHUB_TOKEN }}

	    - name: Build and push ARM64 image
	      id: build
	      # As in "Verify checkout", inputs are passed through env so their contents
	      # are never expanded into the script text.
	      env:
	        PR_NUMBER: ${{ inputs.pr_number }}
	        BRANCH: ${{ inputs.sglang_branch }}
	      run: |
	        if [ -n "$PR_NUMBER" ]; then
	          TAG_STUB="pr-${PR_NUMBER}"
	          SOURCE_DESC="PR #${PR_NUMBER}"
	        else
	          # Map '/' to '-' and drop every character outside [alnum._-].
	          TAG_STUB=$(printf '%s' "$BRANCH" | tr '/' '-' | tr -cd '[:alnum:]._-')
	          SOURCE_DESC="branch ${BRANCH}"
	        fi
	        if [ -z "$TAG_STUB" ]; then
	          echo "::error::Could not derive an image tag from ${SOURCE_DESC}."
	          exit 1
	        fi
	        # run_attempt disambiguates "Re-run jobs" so the squash filename
	        # (derived from the image URL) doesn't collide with a stale one.
	        TAG="${TAG_STUB}-${{ github.run_id }}-${{ github.run_attempt }}"
	        IMAGE_REF="${CI_IMAGE_REPO}:${TAG}"
	        echo "Building ${IMAGE_REF} from ${SOURCE_DESC}"

	        docker buildx build \
	          --platform linux/arm64 \
	          --output "type=image,name=${IMAGE_REF},push=true" \
	          --target framework_final \
	          -f docker/Dockerfile \
	          --build-arg CUDA_VERSION=13.0.1 \
	          --build-arg BUILD_TYPE=all \
	          --build-arg "CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)" \
	          --build-arg GRACE_BLACKWELL=1 \
	          --build-arg BRANCH_TYPE=local \
	          --build-arg INSTALL_FLASHINFER_JIT_CACHE=1 \
	          --no-cache \
	          .

	        echo "image_ref=${IMAGE_REF}" >> "$GITHUB_OUTPUT"
	        echo "image_tag=${TAG}" >> "$GITHUB_OUTPUT"

	# ---------------------------------------------------------------------------
	# Import Docker images to Lustre squash files once before all benchmark jobs.
	# This avoids parallel jobs racing to enroot import the same image.
	# When build-image ran, we import the freshly built Docker Hub staging image
	# (lmsysorg/sglang-staging is public → no auth needed for enroot pull).
	# Otherwise we use the `image` input (or its default public nightly image).
	# ---------------------------------------------------------------------------
	prepare-image:
	  needs: [setup, build-image]
	  # always() keeps a skipped build-image from skipping this job; setup must
	  # have succeeded and build-image must have succeeded or been skipped.
	  if: |
	    always() && github.repository == 'sgl-project/sglang'
	    && needs.setup.result == 'success'
	    && (needs.build-image.result == 'success' || needs.build-image.result == 'skipped')
	  # Manual dispatches use the gb200-ci environment; scheduled runs use none.
	  environment: ${{ github.event_name == 'workflow_dispatch' && 'gb200-ci' || '' }}
	  runs-on: 72-gpu-gb200
	  outputs:
	    squash_file: ${{ steps.import.outputs.squash_file }}
	    nginx_squash_file: ${{ steps.import.outputs.nginx_squash_file }}
	    image: ${{ steps.resolve.outputs.image }}
	  env:
	    NGINX_IMAGE: nginx:1.27.4
	  steps:
	    - name: Resolve image to import
	      id: resolve
	      # Values are passed through env so a user-supplied image name is never
	      # expanded into the script text.
	      env:
	        BUILT_IMAGE: ${{ needs.build-image.outputs.image_ref }}
	        INPUT_IMAGE: ${{ inputs.image }}
	      run: |
	        if [ -n "$BUILT_IMAGE" ]; then
	          echo "Using freshly built image: $BUILT_IMAGE"
	          echo "image=$BUILT_IMAGE" >> "$GITHUB_OUTPUT"
	        else
	          # An empty `image` input falls back to the default image.
	          IMAGE="${INPUT_IMAGE:-lmsysorg/sglang:dev-cu13}"
	          echo "Using pre-existing image: $IMAGE"
	          echo "image=$IMAGE" >> "$GITHUB_OUTPUT"
	        fi

	    - name: Import Docker images to Lustre
	      id: import
	      env:
	        IMAGE: ${{ steps.resolve.outputs.image }}
	      run: |
	        CACHE_DIR="/mnt/lustre01/users-public/sglang-ci"

	        # Turn an image reference into a file-name-safe token.
	        sanitize() { echo "$1" | sed 's/[\/:@#]/_/g'; }

	        # The SGLang squash file name carries today's date; the nginx one does not.
	        SQUASH_FILE="${CACHE_DIR}/$(sanitize "$IMAGE")_$(date +%Y%m%d).sqsh"
	        NGINX_SQUASH_FILE="${CACHE_DIR}/$(sanitize "$NGINX_IMAGE").sqsh"

	        # import_once LABEL IMAGE DEST
	        # Imports IMAGE to DEST unless DEST already exists. The import goes to
	        # a temporary name and is renamed on success, so an interrupted import
	        # cannot leave a truncated file that later runs would reuse.
	        import_once() {
	          local label="$1" image="$2" dest="$3"
	          local tmp="${dest}.partial.$$"
	          if [ -f "$dest" ]; then
	            echo "${label} already exists, skipping import: $dest"
	            return 0
	          fi
	          rm -f "$tmp"
	          if ! enroot import -o "$tmp" "docker://$image"; then
	            rm -f "$tmp"
	            return 1
	          fi
	          mv "$tmp" "$dest"
	        }

	        import_once "Squash file" "$IMAGE" "$SQUASH_FILE"
	        import_once "Nginx squash file" "$NGINX_IMAGE" "$NGINX_SQUASH_FILE"

	        echo "squash_file=$SQUASH_FILE" >> "$GITHUB_OUTPUT"
	        echo "nginx_squash_file=$NGINX_SQUASH_FILE" >> "$GITHUB_OUTPUT"

	nightly-gb200-benchmark:
	  needs: [setup, prepare-image]
	  # Use always() + explicit success checks so a skipped transitive upstream
	  # (e.g. build-image when neither pr_number nor sglang_branch is set) does
	  # not propagate a skip to this job. Direct deps must still have succeeded.
	  if: |
	    always() && github.repository == 'sgl-project/sglang'
	    && needs.setup.result == 'success'
	    && needs.prepare-image.result == 'success'
	  runs-on: 72-gpu-gb200
	  strategy:
	    # One matrix entry failing must not cancel the others.
	    fail-fast: false
	    matrix:
	      config: ${{ fromJson(needs.setup.outputs.matrix) }}
	  env:
	    FRAMEWORK: dynamo-sglang
	    MODEL: ${{ matrix.config.model }}
	    MODEL_PREFIX: ${{ matrix.config.model_prefix }}
	    PRECISION: ${{ matrix.config.precision }}
	    ISL: ${{ matrix.config.isl }}
	    OSL: ${{ matrix.config.osl }}
	    CONFIG_FILE: ${{ matrix.config.config_file }}
	    RESULT_FILENAME: gb200-${{ matrix.config.name }}
	    MATRIX_CONFIG_NAME: ${{ matrix.config.name }}
	    SQUASH_FILE: ${{ needs.prepare-image.outputs.squash_file }}
	    NGINX_SQUASH_FILE: ${{ needs.prepare-image.outputs.nginx_squash_file }}
	    # S3 log-upload credentials — consumed by srt-slurm's postprocess stage
	    # to upload /logs after each Slurm job; prefix derived in launch_gb200.sh.
	    AWS_ACCESS_KEY_ID: ${{ secrets.NV_S3_ACCESS_KEY_ID }}
	    AWS_SECRET_ACCESS_KEY: ${{ secrets.NV_S3_SECRET_ACCESS_KEY }}
	    S3_BUCKET: ${{ secrets.NV_S3_BUCKET }}
	    S3_ENDPOINT_URL: ${{ secrets.NV_S3_ENDPOINT_URL }}

	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    # Best effort: cancels queued/running Slurm jobs whose squeue line
	    # mentions this runner's name.
	    - name: Clean up prior Slurm jobs from this runner
	      continue-on-error: true
	      env:
	        RUNNER_NAME: ${{ runner.name }}
	      run: |
	        # -F: the runner name is matched literally, not as a regex.
	        STALE_JOBS=$(squeue --noheader --format="%i %j" | grep -F -- "${RUNNER_NAME}" | awk '{print $1}')
	        if [ -n "$STALE_JOBS" ]; then
	          echo "Cancelling stale jobs: $STALE_JOBS"
	          # Intentionally unquoted: one argument per job id.
	          scancel $STALE_JOBS
	        fi

	    - name: Launch GB200 benchmark via srt-slurm
	      timeout-minutes: 360
	      env:
	        RUNNER_NAME: ${{ runner.name }}
	      run: bash scripts/ci/slurm/launch_gb200.sh

	    - name: Process results
	      if: always()
	      env:
	        RUNNER_NAME: ${{ runner.name }}
	      run: |
	        pip install tabulate pyyaml -q
	        SRT_REPO_DIR="/mnt/lustre01/users-public/sglang-ci/workspace/${RUNNER_NAME}/srt-slurm"
	        for result_file in "${{ github.workspace }}/${RESULT_FILENAME}"_*.json; do
	          # An unmatched glob stays literal; skip it.
	          [ -f "$result_file" ] || continue
	          basename_file=$(basename "$result_file")
	          # Pull the numbers out of "..._ctx_<N>_gen_<M>.json"; a name that
	          # does not fit yields empty strings and is skipped below.
	          ctx=$(echo "$basename_file" | sed -n 's/.*_ctx_\([0-9]*\)_gen.*/\1/p')
	          gen=$(echo "$basename_file" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p')
	          [ -n "$ctx" ] && [ -n "$gen" ] || continue
	          # The variables below are set for this one command only.
	          RESULT_FILENAME="${result_file%.json}" PREFILL_GPUS="$ctx" DECODE_GPUS="$gen" \
	            RECIPE_FILE="$SRT_REPO_DIR/$CONFIG_FILE" \
	            python3 scripts/ci/slurm/process_result.py
	        done

	    - name: Upload results
	      if: always()
	      uses: actions/upload-artifact@v4
	      with:
	        name: gb200-${{ matrix.config.name }}-${{ github.run_id }}
	        path: |
	          ${{ github.workspace }}/*.json
	          ${{ github.workspace }}/multinode_server_logs.tar.gz
	        retention-days: 30
	        if-no-files-found: warn

	    - name: Analyze logs with AI on failure
	      if: failure()
	      continue-on-error: true
	      env:
	        MODAL_TOKEN_ID: ${{ secrets.NV_MODAL_TOKEN_ID }}
	        MODAL_TOKEN_SECRET: ${{ secrets.NV_MODAL_TOKEN_SECRET }}
	      run: |
	        TARBALL="${{ github.workspace }}/multinode_server_logs.tar.gz"
	        ANALYSIS="${{ github.workspace }}/ai_analysis.md"
	        if [ -f "$TARBALL" ]; then
	          uv run --with modal python scripts/ci/slurm/analyze_logs_with_modal.py \
	            --tarball "$TARBALL" \
	            --job-id "${MATRIX_CONFIG_NAME}-${{ github.run_id }}" \
	            --output "$ANALYSIS"
	          # Append the report to the job summary when one was produced.
	          if [ -f "$ANALYSIS" ]; then
	            echo "## AI Log Analysis" >> "$GITHUB_STEP_SUMMARY"
	            cat "$ANALYSIS" >> "$GITHUB_STEP_SUMMARY"
	          fi
	        else
	          echo "No log tarball found, skipping analysis"
	        fi

	    - name: Upload AI analysis to S3
	      if: failure()
	      continue-on-error: true
	      env:
	        ISL: ${{ matrix.config.isl }}
	        OSL: ${{ matrix.config.osl }}
	      run: |
	        ANALYSIS="${{ github.workspace }}/ai_analysis.md"
	        [ -f "$ANALYSIS" ] || { echo "no ai_analysis.md, skipping"; exit 0; }
	        # First path segment of the S3 key names the trigger.
	        case "${{ github.event_name }}" in
	          schedule) TRIGGER=cron ;;
	          workflow_dispatch) TRIGGER=manual ;;
	          *) TRIGGER="${{ github.event_name }}" ;;
	        esac
	        # Render multiples of 1024 as "<n>k" (1024 -> 1k); other values as-is.
	        fmt() { if [ $(( $1 % 1024 )) -eq 0 ]; then echo "$(( $1 / 1024 ))k"; else echo "$1"; fi; }
	        SEQ_LEN="$(fmt "$ISL")$(fmt "$OSL")"
	        KEY="${TRIGGER}/${{ github.run_id }}-${{ github.run_attempt }}/${SEQ_LEN}/${MATRIX_CONFIG_NAME}/ai_analysis.md"
	        aws --endpoint-url "$S3_ENDPOINT_URL" s3 cp "$ANALYSIS" "s3://${S3_BUCKET}/${KEY}"
	        echo "uploaded to s3://${S3_BUCKET}/${KEY}"

	    # Best effort: same matching as the pre-run cleanup step.
	    - name: Clean up Slurm jobs on failure/cancel
	      if: failure() || cancelled()
	      continue-on-error: true
	      env:
	        RUNNER_NAME: ${{ runner.name }}
	      run: |
	        ACTIVE_JOBS=$(squeue --noheader --format="%i %j" | grep -F -- "${RUNNER_NAME}" | awk '{print $1}')
	        if [ -n "$ACTIVE_JOBS" ]; then
	          echo "Cancelling jobs: $ACTIVE_JOBS"
	          # Intentionally unquoted: one argument per job id.
	          scancel $ACTIVE_JOBS
	        fi

	collect-results:
	  needs: nightly-gb200-benchmark
	  # always(): summarise whatever artifacts exist even if benchmarks failed.
	  if: github.repository == 'sgl-project/sglang' && always()
	  runs-on: ubuntu-latest
	  steps:
	    - name: Checkout code
	      uses: actions/checkout@v4

	    # Fetch every artifact whose name starts with "gb200-" into results/.
	    - name: Download artifacts
	      uses: actions/download-artifact@v4
	      with:
	        path: results/
	        pattern: gb200-*

	    # The summariser's stdout is appended to the job summary page.
	    - name: Print summary
	      run: |
	        pip install tabulate -q
	        python3 scripts/ci/slurm/summarize.py results/ >> "$GITHUB_STEP_SUMMARY"

	# ---------------------------------------------------------------------------
	# Prune old tags in the staging repo, keeping only the most recent N. Mirrors
	# the pattern used by release-docker-dev.yml. Runs after benchmarks so the
	# freshly built image (whose sqsh is already on Lustre) becomes a regular
	# aged-out tag over time. No-op when the repo has ≤ CI_IMAGE_KEEP_TAGS tags.
	# ---------------------------------------------------------------------------
	cleanup-image:
	  needs: [build-image, nightly-gb200-benchmark]
	  # Prune only when this run actually pushed an image; always() keeps a
	  # failed or skipped benchmark job from skipping the cleanup.
	  if: always() && needs.build-image.result == 'success'
	  runs-on: ubuntu-latest
	  steps:
	    - name: Prune old staging tags on Docker Hub
	      env:
	        DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
	        DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
	      run: |
	        # Build the login body with jq so quotes or backslashes in the
	        # credentials cannot produce malformed JSON.
	        LOGIN_BODY=$(jq -n --arg u "$DOCKERHUB_USERNAME" --arg p "$DOCKERHUB_TOKEN" \
	          '{username: $u, password: $p}')
	        TOKEN=$(curl -s -H "Content-Type: application/json" \
	          -X POST -d "$LOGIN_BODY" \
	          https://hub.docker.com/v2/users/login/ | jq -r .token)
	        if [ -z "$TOKEN" ] || [ "$TOKEN" = "null" ]; then
	          echo "::error::Docker Hub login failed"
	          exit 1
	        fi

	        # Only the first page (up to 100 tags) is inspected.
	        TAGS_RESPONSE=$(curl -s -H "Authorization: JWT $TOKEN" \
	          "https://hub.docker.com/v2/repositories/${CI_IMAGE_REPO}/tags/?page_size=100")
	        # Fail loudly instead of reporting "0 tags" when the listing is not
	        # the expected JSON document.
	        if ! echo "$TAGS_RESPONSE" | jq -e '.results | type == "array"' > /dev/null; then
	          echo "::error::Could not list tags for ${CI_IMAGE_REPO}"
	          exit 1
	        fi

	        # Sort tags by last_updated (newest first), keep names only.
	        TAGS=$(echo "$TAGS_RESPONSE" | jq -r \
	          '.results[] | "\(.last_updated)|\(.name)"' \
	          | sort -r | cut -d'|' -f2)

	        # grep -c exits non-zero on a zero count; `|| true` keeps the step alive.
	        TAG_COUNT=$(echo "$TAGS" | grep -c . || true)
	        if [ "$TAG_COUNT" -gt "$CI_IMAGE_KEEP_TAGS" ]; then
	          echo "Found $TAG_COUNT tags in ${CI_IMAGE_REPO}, keeping $CI_IMAGE_KEEP_TAGS most recent"
	          # Everything after the first CI_IMAGE_KEEP_TAGS lines is deleted.
	          TAGS_TO_DELETE=$(echo "$TAGS" | tail -n +$((CI_IMAGE_KEEP_TAGS + 1)))
	          while IFS= read -r tag; do
	            [ -n "$tag" ] || continue
	            echo "Deleting ${CI_IMAGE_REPO}:${tag}"
	            # A failed delete is reported but does not stop the remaining ones.
	            curl -fsS -X DELETE -H "Authorization: JWT $TOKEN" \
	              "https://hub.docker.com/v2/repositories/${CI_IMAGE_REPO}/tags/${tag}/" \
	              || echo "::warning::Failed to delete ${CI_IMAGE_REPO}:${tag}"
	          done <<< "$TAGS_TO_DELETE"
	        else
	          echo "Only $TAG_COUNT tags in ${CI_IMAGE_REPO}, no cleanup needed"
	        fi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Nightly Test (GB200 72GPU) #78

Workflow file

Nightly Test (GB200 72GPU) #78

Uh oh!

Workflow file for this run