From 86d834b3e57b272cea3b7db9eff582bfcf74b896 Mon Sep 17 00:00:00 2001
From: Masahiro Tanaka
Date: Sun, 29 Mar 2026 23:42:45 -0700
Subject: [PATCH] add fallback to full test

Signed-off-by: Masahiro Tanaka
---
 .github/workflows/aws-torch-latest-full.yml | 41 +++++++++++++++------
 1 file changed, 30 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/aws-torch-latest-full.yml b/.github/workflows/aws-torch-latest-full.yml
index cc35a1784fc3..9f0ef1180f00 100644
--- a/.github/workflows/aws-torch-latest-full.yml
+++ b/.github/workflows/aws-torch-latest-full.yml
@@ -2,13 +2,13 @@
 # DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
 #
 # Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
-# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances.
+# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
+# fallback to 8x A100 nodes when L40S capacity is unavailable.
 #
 # This workflow runs:
 # - Parallel tests with pytest-xdist (-n 8)
 # - Sequential tests marked with @pytest.mark.sequential
-#
-# Nightly schedule: skips if no new commits since last successful run.
+# - Nightly schedule: skips if no new commits since last successful run
 ################################################################################
 
 name: aws-torch-latest-full
@@ -26,7 +26,6 @@ jobs:
   check-changes:
     name: Check for new commits
     runs-on: ubuntu-latest
-    # Only check on schedule; workflow_dispatch always runs
     if: github.event_name == 'schedule'
     outputs:
       has_changes: ${{ steps.check.outputs.has_changes }}
@@ -38,7 +37,6 @@ jobs:
         run: |
           default_branch="${{ github.event.repository.default_branch }}"
 
-          # Get the HEAD SHA of the last successful run of this workflow
           last_sha=$(gh api \
             "repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${default_branch}&per_page=1" \
             --jq '.workflow_runs[0].head_sha // empty')
@@ -46,20 +44,19 @@ jobs:
           current_sha="${{ github.sha }}"
 
           if [ -z "$last_sha" ]; then
-            echo "No previous successful run found — running tests"
+            echo "No previous successful run found - running tests"
             echo "has_changes=true" >> "$GITHUB_OUTPUT"
           elif [ "$last_sha" = "$current_sha" ]; then
-            echo "No new commits since last successful run ($last_sha) — skipping"
+            echo "No new commits since last successful run ($last_sha) - skipping"
             echo "has_changes=false" >> "$GITHUB_OUTPUT"
           else
-            echo "New commits detected: $last_sha -> $current_sha — running tests"
+            echo "New commits detected: $last_sha -> $current_sha - running tests"
             echo "has_changes=true" >> "$GITHUB_OUTPUT"
           fi
 
   unit-tests:
     name: Unit Tests (Full)
     needs: [check-changes]
-    # Run if: (a) workflow_dispatch, or (b) schedule with new commits
     if: |
       always() &&
       (github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
@@ -134,8 +131,30 @@ jobs:
           echo "CUTLASS_PATH: $CUTLASS_PATH"
           ls -la $CUTLASS_PATH/include/ | head -5
 
+      - name: Detect GPU architecture
+        run: |
+          python - <<'PY'
+          import os
+          import torch
+
+          torch.cuda.init()
+          major, minor = torch.cuda.get_device_capability(0)
+          arch = f"{major}.{minor}"
+          gpu_count = torch.cuda.device_count()
+          gpu_name = torch.cuda.get_device_name(0)
+
+          with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
+              env_file.write(f"TORCH_CUDA_ARCH_LIST={arch}\n")
+              env_file.write(f"GPU_COUNT={gpu_count}\n")
+
+          print(f"Detected GPU: {gpu_name}")
+          print(f"Detected compute capability: {arch}")
+          print(f"Detected GPU count: {gpu_count}")
+          PY
+
       - name: Install DeepSpeed
         run: |
+          echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
           # Initialize CUDA before install so setup.py can detect NCCL version
           python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
           # Use --no-build-isolation so setup.py can access pre-installed PyTorch
@@ -148,7 +167,7 @@ jobs:
 
       - name: Unit tests (parallel)
         run: |
-          export TORCH_CUDA_ARCH_LIST="8.9"
+          echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
           cd tests
           # Skip tests requiring unavailable hardware or known issues:
           # - nvme checkpointing: no nvme device
@@ -166,7 +185,7 @@ jobs:
 
       - name: Unit tests (sequential)
         run: |
-          export TORCH_CUDA_ARCH_LIST="8.9"
+          echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
           cd tests
           rm -rf /mnt/aio/pytest
           pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \