From de6166f6d4e6d624338ba30f6d4d1c4a22dbfd57 Mon Sep 17 00:00:00 2001
From: Tamilmani
Date: Fri, 23 Jan 2026 14:38:00 -0800
Subject: [PATCH 1/2] Added mpijob and rdma_tests with dranet

usage:
./tests/scenarios/test.sh dranet mpijob
./tests/scenarios/test.sh dranet rdma-test

Signed-off-by: Tamilmani
---
Notes (review commentary placed after the `---` marker, so it is not part
of the commit message):
- dranet_nic_policy rejects the `mpijob` subcommand ("Can't run mpijob
  without GPUs"); the MPI job path is only reachable through the
  `dranet-gpu` command (dranet_nic_policy_gpu).
- SKIP_CLEANUP=true makes cleanup() skip `helm uninstall`, makes
  cleanup_cm() skip the configmap delete, and adds
  `--set mpiJob.cleanPodPolicy=None` to the MPI job installs.
- dranet.nicCount defaults to 8 in values.yaml, while the template falls
  back to 1 only when the value is unset; test.sh always sets it explicitly.
- NOTE(review): this patch was re-flowed from a whitespace-collapsed copy.
  Indentation, help-text column alignment and the whitespace-only
  NCCL_IB_DISABLE change are best-effort reconstructions; verify the
  context lines against the target tree before applying.

 tests/scenarios/k8s/files/dranet-ds.yaml      | 182 ++++++++++++++++++
 .../k8s/templates/dranet-deviceclass.yaml     |  13 ++
 .../dranet-resourceclaimtemplate.yaml         |  30 +++
 tests/scenarios/k8s/templates/leader-job.yaml |   5 +
 tests/scenarios/k8s/templates/mpi-job.yaml    |   8 +-
 tests/scenarios/k8s/templates/worker-job.yaml |   5 +
 tests/scenarios/k8s/values.yaml               |  14 +-
 tests/scenarios/test.sh                       | 135 +++++++++++--
 tests/scenarios/util.sh                       |  13 ++
 9 files changed, 387 insertions(+), 18 deletions(-)
 create mode 100644 tests/scenarios/k8s/files/dranet-ds.yaml
 create mode 100644 tests/scenarios/k8s/templates/dranet-deviceclass.yaml
 create mode 100644 tests/scenarios/k8s/templates/dranet-resourceclaimtemplate.yaml

diff --git a/tests/scenarios/k8s/files/dranet-ds.yaml b/tests/scenarios/k8s/files/dranet-ds.yaml
new file mode 100644
index 0000000..393a1b4
--- /dev/null
+++ b/tests/scenarios/k8s/files/dranet-ds.yaml
@@ -0,0 +1,182 @@
+# Copyright The Kubernetes Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: dranet
+rules:
+  - apiGroups:
+      - ""
+    resources:
+      - nodes
+    verbs:
+      - get
+  - apiGroups:
+      - "resource.k8s.io"
+    resources:
+      - resourceslices
+    verbs:
+      - list
+      - watch
+      - create
+      - update
+      - delete
+  - apiGroups:
+      - "resource.k8s.io"
+    resources:
+      - resourceclaims
+      - deviceclasses
+    verbs:
+      - get
+  - apiGroups:
+      - "resource.k8s.io"
+    resources:
+      - resourceclaims/status
+    verbs:
+      - patch
+      - update
+---
+
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: dranet
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: dranet
+subjects:
+- kind: ServiceAccount
+  name: dranet
+  namespace: kube-system
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: dranet
+  namespace: kube-system
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: dranet
+  namespace: kube-system
+  labels:
+    tier: node
+    app: dranet
+    k8s-app: dranet
+spec:
+  selector:
+    matchLabels:
+      app: dranet
+  template:
+    metadata:
+      labels:
+        tier: node
+        app: dranet
+        k8s-app: dranet
+    spec:
+      hostNetwork: true
+      tolerations:
+      - operator: Exists
+        effect: NoSchedule
+      serviceAccountName: dranet
+      hostPID: true
+      initContainers:
+      - name: enable-nri
+        image: busybox:stable
+        volumeMounts:
+        - mountPath: /etc
+          name: etc
+        securityContext:
+          privileged: true
+        command:
+        - /bin/sh
+        - -c
+        - |
+          set -o errexit
+          set -o pipefail
+          set -o nounset
+          set -x
+          if grep -q "io.containerd.nri.v1.nri" /etc/containerd/config.toml
+          then
+            echo "containerd config contains NRI reference already; taking no action"
+          else
+            echo "containerd config does not mention NRI, thus enabling it";
+            printf '%s\n' "[plugins.\"io.containerd.nri.v1.nri\"]" "  disable = false" "  disable_connections = false" "  plugin_config_path = \"/etc/nri/conf.d\"" "  plugin_path = \"/opt/nri/plugins\"" "  plugin_registration_timeout = \"5s\"" "  plugin_request_timeout = \"5s\"" "  socket_path = \"/var/run/nri/nri.sock\"" >> /etc/containerd/config.toml
+            echo "restarting containerd"
+            nsenter -t 1 -m -u -i -n -p -- systemctl restart containerd
+          fi
+      containers:
+      - name: dranet
+        args:
+        - /dranet
+        - --v=4
+        - --hostname-override=$(NODE_NAME)
+        image: acnpublic.azurecr.io/dranet:dev8
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        resources:
+          requests:
+            cpu: "100m"
+            memory: "50Mi"
+        securityContext:
+          privileged: true
+        readinessProbe:
+          httpGet:
+            path: /healthz
+            port: 9177
+        volumeMounts:
+        - name: device-plugin
+          mountPath: /var/lib/kubelet/plugins
+        - name: plugin-registry
+          mountPath: /var/lib/kubelet/plugins_registry
+        - name: nri-plugin
+          mountPath: /var/run/nri
+        - name: netns
+          mountPath: /var/run/netns
+          mountPropagation: HostToContainer
+        - name: infiniband
+          mountPath: /dev/infiniband
+          mountPropagation: HostToContainer
+        - name: bpf-programs
+          mountPath: /sys/fs/bpf
+          mountPropagation: HostToContainer
+      volumes:
+      - name: device-plugin
+        hostPath:
+          path: /var/lib/kubelet/plugins
+      - name: plugin-registry
+        hostPath:
+          path: /var/lib/kubelet/plugins_registry
+      - name: nri-plugin
+        hostPath:
+          path: /var/run/nri
+      - name: netns
+        hostPath:
+          path: /var/run/netns
+      - name: infiniband
+        hostPath:
+          path: /dev/infiniband
+      - name: etc
+        hostPath:
+          path: /etc
+      - name: bpf-programs
+        hostPath:
+          path: /sys/fs/bpf
+---
\ No newline at end of file
diff --git a/tests/scenarios/k8s/templates/dranet-deviceclass.yaml b/tests/scenarios/k8s/templates/dranet-deviceclass.yaml
new file mode 100644
index 0000000..4402f4a
--- /dev/null
+++ b/tests/scenarios/k8s/templates/dranet-deviceclass.yaml
@@ -0,0 +1,13 @@
+{{- if .Values.dranet.enabled }}
+# DeviceClass for DRANET - selects RDMA-capable NICs managed by DRANET
+apiVersion: resource.k8s.io/v1
+kind: DeviceClass
+metadata:
+  name: {{ .Values.dranet.deviceClassName | default "dranet-rdma" }}
+spec:
+  selectors:
+    - cel:
+        expression: device.driver == "dra.net"
+    - cel:
+        expression: device.attributes["dra.net"].rdma == true
+{{- end }}
diff --git a/tests/scenarios/k8s/templates/dranet-resourceclaimtemplate.yaml b/tests/scenarios/k8s/templates/dranet-resourceclaimtemplate.yaml
new file mode 100644
index 0000000..4a42d5d
--- /dev/null
+++ b/tests/scenarios/k8s/templates/dranet-resourceclaimtemplate.yaml
@@ -0,0 +1,30 @@
+{{- if .Values.dranet.enabled }}
+# ResourceClaimTemplate for requesting RDMA NICs via DRA
+apiVersion: resource.k8s.io/v1
+kind: ResourceClaimTemplate
+metadata:
+  name: {{ .Values.dranet.resourceClaimTemplateName | default "rdma-nic-template" }}
+spec:
+  spec:
+    devices:
+      requests:
+      - name: rdma-nic
+        exactly:
+          deviceClassName: {{ .Values.dranet.deviceClassName | default "dranet-rdma" }}
+          count: {{ .Values.dranet.nicCount | default 1 }}
+          {{- if .Values.dranet.selectors }}
+          selectors:
+          {{- range .Values.dranet.selectors }}
+          - cel:
+              expression: {{ . | quote }}
+          {{- end }}
+          {{- end }}
+      {{- if .Values.dranet.interfaceConfig }}
+      config:
+      - opaque:
+          driver: dra.net
+          parameters:
+            interface:
+              {{- toYaml .Values.dranet.interfaceConfig | nindent 14 }}
+      {{- end }}
+{{- end }}
diff --git a/tests/scenarios/k8s/templates/leader-job.yaml b/tests/scenarios/k8s/templates/leader-job.yaml
index f689fa9..edf2265 100644
--- a/tests/scenarios/k8s/templates/leader-job.yaml
+++ b/tests/scenarios/k8s/templates/leader-job.yaml
@@ -17,6 +17,11 @@ spec:
       labels:
         role: leader
     spec:
+      {{- if .Values.dranet.enabled }}
+      resourceClaims:
+      - name: rdma-nic
+        resourceClaimTemplateName: {{ .Values.dranet.resourceClaimTemplateName | default "rdma-nic-template" }}
+      {{- end }}
       containers:
       - name: runner
         image: {{ .Values.job.image }}
diff --git a/tests/scenarios/k8s/templates/mpi-job.yaml b/tests/scenarios/k8s/templates/mpi-job.yaml
index a94d4b4..6d19e1f 100644
--- a/tests/scenarios/k8s/templates/mpi-job.yaml
+++ b/tests/scenarios/k8s/templates/mpi-job.yaml
@@ -8,7 +8,7 @@ metadata:
 spec:
   slotsPerWorker: 8
   runPolicy:
-    cleanPodPolicy: Running
+    cleanPodPolicy: {{ .Values.mpiJob.cleanPodPolicy | default "Running" }}
   mpiReplicaSpecs:
     Launcher:
       replicas: 1
@@ -76,6 +76,11 @@ spec:
           labels:
             task: test
         spec:
+          {{- if .Values.dranet.enabled }}
+          resourceClaims:
+          - name: rdma-nic
+            resourceClaimTemplateName: {{ .Values.dranet.resourceClaimTemplateName | default "rdma-nic-template" }}
+          {{- end }}
           containers:
           - image: {{ .Values.mpiJob.image }}
             name: nccl
@@ -118,3 +123,4 @@ spec:
           enableServiceLinks: false
           automountServiceAccountToken: false
 {{- end }}
+
diff --git a/tests/scenarios/k8s/templates/worker-job.yaml b/tests/scenarios/k8s/templates/worker-job.yaml
index 088ec9b..839884d 100644
--- a/tests/scenarios/k8s/templates/worker-job.yaml
+++ b/tests/scenarios/k8s/templates/worker-job.yaml
@@ -17,6 +17,11 @@ spec:
       labels:
         role: worker
     spec:
+      {{- if .Values.dranet.enabled }}
+      resourceClaims:
+      - name: rdma-nic
+        resourceClaimTemplateName: {{ .Values.dranet.resourceClaimTemplateName | default "rdma-nic-template" }}
+      {{- end }}
       containers:
       - name: runner
         image: {{ .Values.job.image }}
diff --git a/tests/scenarios/k8s/values.yaml b/tests/scenarios/k8s/values.yaml
index 149f24e..00cff4a 100644
--- a/tests/scenarios/k8s/values.yaml
+++ b/tests/scenarios/k8s/values.yaml
@@ -2,6 +2,7 @@ mpiJob:
   enabled: false
   image: ghcr.io/azure/aks-rdma-infiniband/nccl-tests
   numberOfProcesses: 0
+  cleanPodPolicy: Running # Options: Running, All, None
 
 job:
   enabled: false
@@ -21,11 +22,22 @@ job:
 
   ipoib: false
 
+# DRANET (DRA) Configuration for RDMA NICs via Resource Claims
+# Enable this to use Dynamic Resource Allocation instead of device plugin
+dranet:
+  enabled: false
+  # DeviceClass name for RDMA NICs
+  deviceClassName: dranet-rdma
+  # ResourceClaimTemplate name
+  resourceClaimTemplateName: rdma-nic-template
+  # Number of RDMA NICs to request per pod
+  nicCount: 8
+
 # --------------------------------------------------------------------
 # Below this point it is gonna be constant for all the tests
 ncclEnvVars:
   NCCL_NET_GDR_LEVEL: SYS # Needed for MPI Job.
-  NCCL_IB_DISABLE: "0"    # Force NCCL to use Infiniband.
+  NCCL_IB_DISABLE: "0" # Force NCCL to use Infiniband.
   # NCCL_DEBUG: INFO # Valid values: VERSION, WARN, INFO, TRACE
   # NCCL_DEBUG_SUBSYS: INIT,NET
   # DEBUG: true # Enable script in verbose mode.
diff --git a/tests/scenarios/test.sh b/tests/scenarios/test.sh
index 0961710..d3c19f2 100755
--- a/tests/scenarios/test.sh
+++ b/tests/scenarios/test.sh
@@ -18,6 +18,25 @@ if [ "${DEBUG:-false}" = "true" ]; then
     )
 fi
 
+# Set MPI job flags based on SKIP_CLEANUP
+export SKIP_CLEANUP_FLAGS=()
+if [ "${SKIP_CLEANUP:-false}" = "true" ]; then
+    export SKIP_CLEANUP_FLAGS=(
+        --set "mpiJob.cleanPodPolicy=None"
+    )
+fi
+
+# Check if SKIP_CLEANUP env var is set to true
+function cleanup() {
+    if [ "${SKIP_CLEANUP:-false}" = "true" ]; then
+        echo "⏭️ Skipping cleanup (SKIP_CLEANUP=true)"
+        echo " To cleanup manually run: helm uninstall test --wait"
+    else
+        echo "🧹 Cleaning up..."
+        $HELM_UNINSTALL_CMD
+    fi
+}
+
 function deploy_root_nic_policy() {
     kubectl apply -k "${SCRIPT_DIR}/../../configs/nicclusterpolicy/base"
     wait_until_mofed_is_ready
@@ -39,8 +58,7 @@ function root_nic_policy() {
         exit 1
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
 }
 
 function root_nic_policy_gpu() {
@@ -64,6 +82,7 @@ function root_nic_policy_gpu() {
         fail_on_job_failure "role=worker" "default"
     else
         $HELM_INSTALL_CMD ${TEST_DEBUG_FLAGS[@]+"${TEST_DEBUG_FLAGS[@]}"} \
+            ${SKIP_CLEANUP_FLAGS[@]+"${SKIP_CLEANUP_FLAGS[@]}"} \
             "${test_flags[@]}" \
             --set mpiJob.enabled=true \
             --set mpiJob.numberOfProcesses="${NUMBER_OF_PROCESSES}"
@@ -71,8 +90,7 @@ function root_nic_policy_gpu() {
         fail_on_job_failure "app=nccl-tests" "default"
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
 }
 
 function deploy_sriov_nic_policy() {
@@ -102,8 +120,7 @@ function sriov_nic_policy() {
         exit 1
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
 }
 
 function sriov_nic_policy_gpu() {
@@ -128,6 +145,7 @@ function sriov_nic_policy_gpu() {
         fail_on_job_failure "role=worker" "default"
     else
         $HELM_INSTALL_CMD ${TEST_DEBUG_FLAGS[@]+"${TEST_DEBUG_FLAGS[@]}"} \
+            ${SKIP_CLEANUP_FLAGS[@]+"${SKIP_CLEANUP_FLAGS[@]}"} \
             "${test_flags[@]}" \
             --set mpiJob.enabled=true \
             --set mpiJob.numberOfProcesses="${NUMBER_OF_PROCESSES}"
@@ -135,8 +153,7 @@ function sriov_nic_policy_gpu() {
         fail_on_job_failure "app=nccl-tests" "default"
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
 }
 
 function deploy_ipoib_nic_policy() {
@@ -167,8 +184,7 @@ function ipoib_nic_policy() {
         exit 1
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
 }
 
 function ipoib_nic_policy_gpu() {
@@ -194,6 +210,7 @@ function ipoib_nic_policy_gpu() {
         fail_on_job_failure "role=worker" "default"
     else
         $HELM_INSTALL_CMD ${TEST_DEBUG_FLAGS[@]+"${TEST_DEBUG_FLAGS[@]}"} \
+            ${SKIP_CLEANUP_FLAGS[@]+"${SKIP_CLEANUP_FLAGS[@]}"} \
             "${test_flags[@]}" \
             --set mpiJob.enabled=true \
             --set mpiJob.numberOfProcesses="${NUMBER_OF_PROCESSES}"
@@ -201,8 +218,7 @@ function ipoib_nic_policy_gpu() {
         fail_on_job_failure "app=nccl-tests" "default"
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
 }
 
 function deploy_rdma_shared_device_plugin() {
@@ -232,8 +248,7 @@ function rdma_shared_device_plugin() {
         exit 1
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
 }
 
 function rdma_shared_device_plugin_gpu() {
@@ -258,6 +273,7 @@ function rdma_shared_device_plugin_gpu() {
         fail_on_job_failure "role=worker" "default"
     else
         $HELM_INSTALL_CMD ${TEST_DEBUG_FLAGS[@]+"${TEST_DEBUG_FLAGS[@]}"} \
+            ${SKIP_CLEANUP_FLAGS[@]+"${SKIP_CLEANUP_FLAGS[@]}"} \
             "${test_flags[@]}" \
             --set mpiJob.enabled=true \
             --set mpiJob.numberOfProcesses="${NUMBER_OF_PROCESSES}"
@@ -265,8 +281,89 @@ function rdma_shared_device_plugin_gpu() {
         fail_on_job_failure "app=nccl-tests" "default"
     fi
 
-    echo "🧹 Cleaning up..."
-    $HELM_UNINSTALL_CMD
+    cleanup
+}
+
+function install_dranet() {
+    echo "📦 Installing DRANET..."
+    kubectl apply -f "${SCRIPT_DIR}/k8s/files/dranet-ds.yaml"
+
+    # Wait for dranet daemonset to be ready
+    echo "⏳ Waiting for DRANET daemonset to be ready..."
+    kubectl rollout status daemonset/dranet -n kube-system --timeout=300s
+
+    # Give some time for resource slices to be populated
+    sleep 10
+
+    echo "✅ DRANET installed. Checking available RDMA devices..."
+    kubectl get resourceslices -o json | jq -r '.items[].spec.devices[]? | select(.basic.attributes["dra.net/rdma"].bool == true) | .name' | head -10 || echo "No RDMA devices found yet"
+}
+
+function deploy_dranet() {
+    kubectl apply -k "${SCRIPT_DIR}/../../configs/nicclusterpolicy/base"
+    wait_until_mofed_is_ready
+    install_dranet
+}
+
+function dranet_nic_policy() {
+    deploy_dranet
+
+    local test_flags=(
+        --set "securityContext.capabilities.add={IPC_LOCK}"
+        --set "dranet.enabled=true"
+        --set "dranet.nicCount=1"
+        --set "ncclEnvVars.NCCL_SOCKET_IFNAME=eth0"
+    )
+
+    if [[ ${subcmd} != "mpijob" ]]; then
+        $HELM_INSTALL_CMD ${TEST_DEBUG_FLAGS[@]+"${TEST_DEBUG_FLAGS[@]}"} \
+            "${test_flags[@]}" \
+            --set job.enabled=true \
+            --set job.testFunctionName="${subcmd}"
+
+        fail_on_job_failure "role=leader" "default"
+        fail_on_job_failure "role=worker" "default"
+    else
+        echo "❌ Can't run mpijob without GPUs"
+        exit 1
+    fi
+
+    cleanup
+}
+
+function dranet_nic_policy_gpu() {
+    deploy_dranet
+
+    find_gpu_per_node
+    mpi_job_number_of_processes
+
+    local test_flags=(
+        --set "securityContext.capabilities.add={IPC_LOCK}"
+        --set "resources.nvidia\.com/gpu=${GPU_PER_NODE_NUMBER}"
+        --set "dranet.enabled=true"
+        --set "dranet.nicCount=${GPU_PER_NODE_NUMBER}"
+        --set "ncclEnvVars.NCCL_SOCKET_IFNAME=eth0"
+    )
+
+    if [[ ${subcmd} != "mpijob" ]]; then
+        $HELM_INSTALL_CMD ${TEST_DEBUG_FLAGS[@]+"${TEST_DEBUG_FLAGS[@]}"} \
+            "${test_flags[@]}" \
+            --set job.enabled=true \
+            --set job.testFunctionName="${subcmd}"
+
+        fail_on_job_failure "role=leader" "default"
+        fail_on_job_failure "role=worker" "default"
+    else
+        $HELM_INSTALL_CMD ${TEST_DEBUG_FLAGS[@]+"${TEST_DEBUG_FLAGS[@]}"} \
+            ${SKIP_CLEANUP_FLAGS[@]+"${SKIP_CLEANUP_FLAGS[@]}"} \
+            "${test_flags[@]}" \
+            --set mpiJob.enabled=true \
+            --set mpiJob.numberOfProcesses="${NUMBER_OF_PROCESSES}"
+
+        fail_on_job_failure "app=nccl-tests" "default"
+    fi
+
+    cleanup
 }
 
 cmd="${1:-}"
@@ -295,6 +392,12 @@ rdma-shared-device-plugin | rdma_shared_device_plugin)
 rdma-shared-device-plugin-gpu | rdma_shared_device_plugin_gpu)
     DEPLOY_METHOD_FUNC="rdma_shared_device_plugin_gpu"
     ;;
+dranet | dranet-nic-policy | dranet_nic_policy)
+    DEPLOY_METHOD_FUNC="dranet_nic_policy"
+    ;;
+dranet-gpu | dranet-nic-policy-gpu | dranet_nic_policy_gpu)
+    DEPLOY_METHOD_FUNC="dranet_nic_policy_gpu"
+    ;;
 *)
     echo "Unknown command: ${cmd}"
     print_help $0
diff --git a/tests/scenarios/util.sh b/tests/scenarios/util.sh
index 52235f9..2ec8ba3 100644
--- a/tests/scenarios/util.sh
+++ b/tests/scenarios/util.sh
@@ -167,6 +167,9 @@ function find_gpu_per_node() {
 }
 
 function cleanup_cm() {
+    if [ "${SKIP_CLEANUP:-false}" = "true" ]; then
+        return
+    fi
     kubectl delete configmap nvidia-topology || true
 }
 
@@ -257,12 +260,14 @@ Available Commands (GPU):
     rdma-shared-device-plugin-gpu   Run a test with RDMA shared device plugin
     ipoib-nic-policy-gpu            Run a test with IP over IB
     root-nic-policy-gpu             Run a test with no shared device plugin
+    dranet-gpu                      Run a test with DRANET (DRA) for RDMA NICs via Resource Claims
 
 Available Commands (non-GPU):
     sriov-nic-policy                Run a test with SR-IOV shared device plugin without GPU
     rdma-shared-device-plugin       Run a test with RDMA shared device plugin wihtout GPU
     ipoib-nic-policy                Run a test with IP over IB without GPU
     root-nic-policy                 Run a test with no shared device plugin without GPU
+    dranet                          Run a test with DRANET (DRA) for RDMA NICs via Resource Claims
 
 Available Subcommands:
     mpijob                          Run MPI job to see the total speed
@@ -272,5 +277,13 @@ Available Subcommands:
     sockperf                        Run tests with sockperf utility
     all                             Run all tests in the order sockperf, rdma-test and nccl-tests
     debug                           The tests sleep infinitely for debugging
+
+Environment Variables:
+    DEBUG=true                      Enable verbose debug output
+    SKIP_CLEANUP=true               Skip cleanup after test (keep resources for debugging)
+    NODE_POOL_VM_SIZE               Required for GPU tests (e.g., Standard_ND96isr_H100_v5)
+
+Note: DRANET requires Kubernetes with DRA (Dynamic Resource Allocation) enabled.
+      See: https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/
 EOF
 }

From 1db8006c052840a464d86312b2af35e5f15874d6 Mon Sep 17 00:00:00 2001
From: Tamilmani
Date: Tue, 24 Feb 2026 13:41:44 -0800
Subject: [PATCH 2/2] temp commit

Signed-off-by: Tamilmani
---
Notes (review commentary placed after the `---` marker, so it is not part
of the commit message):
- The encapsulation check is added as its own `- cel:` selector. The
  previous form put a second `expression:` key under the existing `cel:`
  mapping, which is a duplicate YAML key, so the rdma and encapsulation
  checks could not both take effect.
- deploy_dranet no longer calls install_dranet. Nothing else in this
  series calls it, so k8s/files/dranet-ds.yaml is not applied by test.sh
  after this commit.
  NOTE(review): presumably DRANET is expected to be pre-installed on the
  cluster; confirm before merging a commit titled "temp commit".
- NOTE(review): the post-image blob id on the dranet-deviceclass.yaml
  `index` line predates the selector fix and was not recomputed.

 tests/scenarios/k8s/templates/dranet-deviceclass.yaml | 2 ++
 tests/scenarios/test.sh                               | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/scenarios/k8s/templates/dranet-deviceclass.yaml b/tests/scenarios/k8s/templates/dranet-deviceclass.yaml
index 4402f4a..f1ceb42 100644
--- a/tests/scenarios/k8s/templates/dranet-deviceclass.yaml
+++ b/tests/scenarios/k8s/templates/dranet-deviceclass.yaml
@@ -10,4 +10,6 @@ spec:
         expression: device.driver == "dra.net"
     - cel:
         expression: device.attributes["dra.net"].rdma == true
+    - cel:
+        expression: device.attributes["dra.net"].encapsulation == "infiniband"
 {{- end }}
diff --git a/tests/scenarios/test.sh b/tests/scenarios/test.sh
index d3c19f2..e271d4b 100755
--- a/tests/scenarios/test.sh
+++ b/tests/scenarios/test.sh
@@ -302,7 +302,7 @@ function install_dranet() {
 function deploy_dranet() {
     kubectl apply -k "${SCRIPT_DIR}/../../configs/nicclusterpolicy/base"
     wait_until_mofed_is_ready
-    install_dranet
+    #install_dranet
 }
 
 function dranet_nic_policy() {