Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 30 additions & 11 deletions .github/workflows/aws-torch-latest-full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
# DeepSpeed CI - AWS L40S GPU Full Tests (PyTorch Latest)
#
# Runs the full DeepSpeed unit test suite on AWS self-hosted runners.
# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances.
# Prefers 4x NVIDIA L40S GPUs on g6e.12xlarge instances, with AWS-side
# fallback to 8x A100 nodes when L40S capacity is unavailable.
#
# This workflow runs:
# - Parallel tests with pytest-xdist (-n 8)
# - Sequential tests marked with @pytest.mark.sequential
#
# Nightly schedule: skips if no new commits since last successful run.
# - Nightly schedule: skips if no new commits since last successful run
################################################################################

name: aws-torch-latest-full
Expand All @@ -26,7 +26,6 @@ jobs:
check-changes:
name: Check for new commits
runs-on: ubuntu-latest
# Only check on schedule; workflow_dispatch always runs
if: github.event_name == 'schedule'
outputs:
has_changes: ${{ steps.check.outputs.has_changes }}
Expand All @@ -38,28 +37,26 @@ jobs:
run: |
default_branch="${{ github.event.repository.default_branch }}"

# Get the HEAD SHA of the last successful run of this workflow
last_sha=$(gh api \
"repos/${{ github.repository }}/actions/workflows/aws-torch-latest-full.yml/runs?status=success&branch=${default_branch}&per_page=1" \
--jq '.workflow_runs[0].head_sha // empty')

current_sha="${{ github.sha }}"

if [ -z "$last_sha" ]; then
echo "No previous successful run found running tests"
echo "No previous successful run found - running tests"
echo "has_changes=true" >> "$GITHUB_OUTPUT"
elif [ "$last_sha" = "$current_sha" ]; then
echo "No new commits since last successful run ($last_sha) skipping"
echo "No new commits since last successful run ($last_sha) - skipping"
echo "has_changes=false" >> "$GITHUB_OUTPUT"
else
echo "New commits detected: $last_sha -> $current_sha running tests"
echo "New commits detected: $last_sha -> $current_sha - running tests"
echo "has_changes=true" >> "$GITHUB_OUTPUT"
fi

unit-tests:
name: Unit Tests (Full)
needs: [check-changes]
# Run if: (a) workflow_dispatch, or (b) schedule with new commits
if: |
always() &&
(github.event_name == 'workflow_dispatch' || needs.check-changes.outputs.has_changes == 'true')
Expand Down Expand Up @@ -134,8 +131,30 @@ jobs:
echo "CUTLASS_PATH: $CUTLASS_PATH"
ls -la $CUTLASS_PATH/include/ | head -5

# Probe the runner's GPUs with PyTorch and append TORCH_CUDA_ARCH_LIST and
# GPU_COUNT to the file named by $GITHUB_ENV, so the values are not
# hard-coded in the workflow.
- name: Detect GPU architecture
  run: |
    python - <<'PY'
    import os
    import torch

    # Initialise CUDA explicitly before querying any device properties.
    torch.cuda.init()

    # Name and compute capability are read from device index 0 only.
    gpu_name = torch.cuda.get_device_name(0)
    gpu_count = torch.cuda.device_count()
    major, minor = torch.cuda.get_device_capability(0)
    arch = f"{major}.{minor}"

    # One KEY=value line per exported variable, appended in a single call.
    exported = (
        f"TORCH_CUDA_ARCH_LIST={arch}\n",
        f"GPU_COUNT={gpu_count}\n",
    )
    with open(os.environ["GITHUB_ENV"], "a", encoding="utf-8") as env_file:
        env_file.writelines(exported)

    # Echo what was detected so it shows up in the job log.
    for message in (
        f"Detected GPU: {gpu_name}",
        f"Detected compute capability: {arch}",
        f"Detected GPU count: {gpu_count}",
    ):
        print(message)
    PY

- name: Install DeepSpeed
run: |
echo "Using TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
# Initialize CUDA before install so setup.py can detect NCCL version
python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
# Use --no-build-isolation so setup.py can access pre-installed PyTorch
Expand All @@ -148,7 +167,7 @@ jobs:

- name: Unit tests (parallel)
run: |
export TORCH_CUDA_ARCH_LIST="8.9"
echo "Running parallel tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
cd tests
# Skip tests requiring unavailable hardware or known issues:
# - nvme checkpointing: no nvme device
Expand All @@ -166,7 +185,7 @@ jobs:

- name: Unit tests (sequential)
run: |
export TORCH_CUDA_ARCH_LIST="8.9"
echo "Running sequential tests with TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST on $GPU_COUNT GPUs"
cd tests
rm -rf /mnt/aio/pytest
pytest --instafail --timeout 600 --forked -m 'sequential' --basetemp=/mnt/aio/pytest unit/ \
Expand Down
Loading