deepspeedai · sfc-gh-truwase · Mar 30, 2026 · Mar 30, 2026
@@ -2,13 +2,13 @@
 # DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
 #
 # Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
-# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances.
+# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
+# fallback to 8x A100 nodes when L40S capacity is unavailable.
 #
 # This workflow runs:
 # - Parallel tests with pytest-xdist (-n 8)
 # - Sequential tests marked with @pytest.mark.sequential
-#
-# Nightly schedule: skips if no new commits since last successful run.
+# - Nightly schedule: skipped when there are no new commits since the last successful run
 ################################################################################
 
 name: aws-torch-latest-full
@@ -26,7 +26,6 @@ jobs:
   check-changes:
     name: Check for new commits
     runs-on: ubuntu-latest
-    # Only check on schedule; workflow_dispatch always runs
     if: github.event_name == 'schedule'
     outputs:
       has_changes: ${{ steps.check.outputs.has_changes }}
@@ -38,28 +37,26 @@ jobs:
         run: |
           default_branch="${{ github.event.repository.default_branch }}"
 
-          # Get the HEAD SHA of the last successful run of this workflow
           last_sha=$(gh api \
             "repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${default_branch}&per_page=1" \
             --jq '.workflow_runs[0].head_sha // empty')
 
           current_sha="${{ github.sha }}"
 
           if [ -z "$last_sha" ]; then
-            echo "No previous successful run found — running tests"
+            echo "No previous successful run found - running tests"
             echo "has_changes=true" >> "$GITHUB_OUTPUT"
           elif [ "$last_sha" = "$current_sha" ]; then
-            echo "No new commits since last successful run ($last_sha) — skipping"
+            echo "No new commits since last successful run ($last_sha) - skipping"
             echo "has_changes=false" >> "$GITHUB_OUTPUT"
           else
-            echo "New commits detected: $last_sha -> $current_sha — running tests"
+            echo "New commits detected: $last_sha -> $current_sha - running tests"
             echo "has_changes=true" >> "$GITHUB_OUTPUT"
           fi
 
   unit-tests:
     name: Unit Tests (Full)
     needs: [check-changes]
-    # Run if: (a) workflow_dispatch, or (b) schedule with new commits
     if: |
       always() &&
       (github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
@@ -134,8 +131,30 @@ jobs:
           echo "CUTLASS_PATH: $CUTLASS_PATH"
           ls -la $CUTLASS_PATH/include/ | head -5
 
+      - name: Detect GPU architecture
+        run: |
+          python - <<'PY'
+          # Record the runner's GPU arch/count in GITHUB_ENV for later steps.
+          import os
+          import torch
+
+          torch.cuda.init()
+          cc_major, cc_minor = torch.cuda.get_device_capability(0)
+          settings = {
+              "TORCH_CUDA_ARCH_LIST": f"{cc_major}.{cc_minor}",
+              "GPU_COUNT": torch.cuda.device_count(),
+          }
+          with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as fh:
+              fh.writelines(f"{key}={value}\n" for key, value in settings.items())
+
+          print(f"Detected GPU: {torch.cuda.get_device_name(0)}")
+          print(f"Detected compute capability: {settings['TORCH_CUDA_ARCH_LIST']}")
+          print(f"Detected GPU count: {settings['GPU_COUNT']}")
+          PY
+
       - name: Install DeepSpeed
         run: |
+          echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
           # Initialize CUDA before install so setup.py can detect NCCL version
           python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
           # Use --no-build-isolation so setup.py can access pre-installed PyTorch
@@ -148,7 +167,7 @@ jobs:
 
       - name: Unit tests (parallel)
         run: |
-          export TORCH_CUDA_ARCH_LIST="8.9"
+          echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
           cd tests
           # Skip tests requiring unavailable hardware or known issues:
           # - nvme checkpointing: no nvme device
@@ -166,7 +185,7 @@ jobs:
 
       - name: Unit tests (sequential)
         run: |
-          export TORCH_CUDA_ARCH_LIST="8.9"
+          echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
           cd tests
           rm -rf /mnt/aio/pytest
           pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \