Skip to content

Commit a0aef71

Browse files
committed
Update CUDA installation script and CI workflow for version 13.0 support
- Updated NVSHMEM and cuDNN versions in install_cuda.sh.
- Added support for CUDA 13.0 installation, including NCCL and cuSparseLt.
- Modified test_install_cuda.yml to include a matrix for testing CUDA versions 12.8 and 13.0.
1 parent 08afcce commit a0aef71

File tree

2 files changed

+50
-36
lines changed

2 files changed

+50
-36
lines changed

.github/workflows/test_install_cuda.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ jobs:
1515
test-install-cuda:
1616
runs-on: ubuntu-latest
1717
timeout-minutes: 30
18+
strategy:
19+
fail-fast: false
20+
matrix:
21+
cuda_version: ["12.8", "13.0"]  # quoted: an unquoted 13.0 is a YAML number and would be passed to install_cuda.sh as "13"
1822

1923
steps:
2024
- name: Checkout code
@@ -28,7 +32,7 @@ jobs:
2832
- name: Make script executable
2933
run: chmod +x install_cuda.sh
3034

31-
- name: Run install_cuda.sh (skip GPU driver install)
35+
- name: Run install_cuda.sh (CUDA ${{ matrix.cuda_version }})
3236
env:
3337
CUDA_INSTALL_PREFIX: /tmp/test-cuda-install
3438
SKIP_PRUNE: 1
@@ -37,9 +41,9 @@ jobs:
3741
INSTALL_NCCL: 0
3842
run: |
3943
# Only test CUDA Toolkit installation, no GPU driver install (no root, no real GPU)
40-
# Use a supported version, e.g. 12.8
44+
# Use a supported version from the matrix
4145
# NCCL installation is disabled to avoid compilation errors in CI environment
42-
bash install_cuda.sh 12.8 || (cat /tmp/${USER}/cuda_install/error.log || true; exit 1)
46+
bash install_cuda.sh ${{ matrix.cuda_version }} || (cat /tmp/${USER}/cuda_install/error.log || true; exit 1)
4347
4448
- name: Check summary output
4549
run: |

install_cuda.sh

Lines changed: 43 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ echo "🚀 ===== CUDA Installation Script Started ====="
1414
CUDA_INSTALL_PREFIX=${CUDA_INSTALL_PREFIX:-$HOME/opt}
1515
CUDA_INSTALL_PREFIX=${CUDA_INSTALL_PREFIX%/}
1616
CUDA_VERSION=${CUDA_VERSION:-12.8}
17-
SKIP_PRUNE=${SKIP_PRUNE:-1}
18-
NVSHMEM_VERSION=${NVSHMEM_VERSION:-3.3.9}
17+
NVSHMEM_VERSION=${NVSHMEM_VERSION:-3.3.20}
1918
INSTALL_NCCL=${INSTALL_NCCL:-1}
2019

2120
echo "CUDA_INSTALL_PREFIX=${CUDA_INSTALL_PREFIX}"
@@ -195,6 +194,8 @@ function install_nccl {
195194
echo "Getting NCCL version information..."
196195
if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
197196
NCCL_VERSION=$(curl -sL https://github.com/pytorch/pytorch/raw/refs/heads/main/.ci/docker/ci_commit_pins/nccl-cu12.txt)
197+
elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
198+
NCCL_VERSION=$(curl -sL https://github.com/pytorch/pytorch/raw/refs/heads/main/.ci/docker/ci_commit_pins/nccl-cu13.txt)
198199
else
199200
error_exit "Unsupported CUDA version: ${CUDA_VERSION}"
200201
fi
@@ -249,8 +250,13 @@ function install_cusparselt {
249250
local cusparselt_version
250251
local arch_path=${ARCH_PATH}
251252

252-
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
253+
local CUSPARSELT_NAME
254+
if [[ ${CUDA_VERSION:0:2} == "13" ]]; then
255+
cusparselt_version="0.8.0.4"
256+
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-${cusparselt_version}_cuda13-archive"
257+
elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
253258
cusparselt_version="0.7.1.0"
259+
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-${cusparselt_version}-archive"
254260
else
255261
popd
256262
rm -rf tmp_cusparselt
@@ -259,7 +265,6 @@ function install_cusparselt {
259265

260266
echo "${cusparselt_version}" >"${USER_TMPDIR}/cusparselt_version.txt"
261267

262-
local CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-${cusparselt_version}-archive"
263268
echo "Downloading cuSparseLt: ${CUSPARSELT_NAME}.tar.xz"
264269

265270
if ! curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz; then
@@ -310,26 +315,27 @@ function install_nvshmem {
310315
mkdir -p "${tmpdir}" && cd "${tmpdir}"
311316

312317
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
313-
local filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
314-
local url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
318+
local filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
319+
local suffix=".tar.xz"
320+
local url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
315321

316-
echo "Downloading nvSHMEM: ${filename}.tar.gz"
322+
echo "Downloading nvSHMEM: ${filename}${suffix}"
317323
if ! wget -q "${url}"; then
318324
cd ..
319325
rm -rf "${tmpdir}"
320-
error_exit "nvSHMEM download failed: ${filename}.tar.gz"
326+
error_exit "nvSHMEM download failed: ${filename}${suffix}"
321327
fi
322328

323329
echo "Extracting nvSHMEM..."
324-
if ! tar xf "${filename}.tar.gz"; then
330+
if ! tar xf "${filename}${suffix}"; then
325331
cd ..
326332
rm -rf "${tmpdir}"
327-
error_exit "nvSHMEM extraction failed: ${filename}.tar.gz"
333+
error_exit "nvSHMEM extraction failed: ${filename}${suffix}"
328334
fi
329335

330336
echo "Installing nvSHMEM to CUDA directory..."
331-
cp -a "libnvshmem/include/"* ${CUDA_INSTALL_PREFIX}/cuda/include/
332-
cp -a "libnvshmem/lib/"* ${CUDA_INSTALL_PREFIX}/cuda/lib64/
337+
cp -a "${filename}/include/"* ${CUDA_INSTALL_PREFIX}/cuda/include/
338+
cp -a "${filename}/lib/"* ${CUDA_INSTALL_PREFIX}/cuda/lib64/
333339

334340
cd ..
335341
rm -rf "${tmpdir}"
@@ -371,7 +377,7 @@ function install_126 {
371377

372378
# CUDA 12.8 installation function
373379
function install_128 {
374-
local CUDNN_VERSION=9.10.2.21
380+
local CUDNN_VERSION=9.8.0.87
375381
echo "Starting installation for CUDA 12.8..."
376382

377383
echo "📦 STEP 1: Installing CUDA toolkit..."
@@ -425,31 +431,39 @@ function install_129 {
425431
return 0
426432
}
427433

428-
# Simplified pruning function - enable as needed
429-
function prune_cuda {
430-
local cuda_version=$1
431-
local major_minor=$2
434+
# CUDA 13.0 installation function
435+
function install_130 {
436+
local CUDNN_VERSION=9.12.0.46
437+
echo "Starting installation for CUDA 13.0..."
438+
439+
echo "📦 STEP 1: Installing CUDA toolkit..."
440+
install_cuda "13.0.0" "cuda_13.0.0_580.65.06_linux" || error_exit "CUDA 13.0.0 toolkit installation failed"
441+
442+
echo "🧠 STEP 2: Installing cuDNN..."
443+
install_cudnn "13" "${CUDNN_VERSION}" || error_exit "cuDNN installation failed"
444+
445+
echo "🔗 STEP 3: Installing NCCL..."
446+
install_nccl || error_exit "NCCL installation failed"
432447

433-
echo "Pruning CUDA ${major_minor}..."
448+
echo "⚡ STEP 4: Installing cuSparseLt..."
449+
install_cusparselt || error_exit "cuSparseLt installation failed"
434450

435-
# CUDA pruning logic can be added back as needed
436-
# Kept empty for now for easier troubleshooting
451+
echo "💾 STEP 5: Installing nvSHMEM..."
452+
install_nvshmem "13" || error_exit "nvSHMEM installation failed"
437453

438-
# Pruning complete marker
439-
touch "${USER_TMPDIR}/cuda_${major_minor}_pruned"
454+
if [ "$(id -u)" -eq 0 ]; then
455+
ldconfig
456+
fi
440457

441-
echo "CUDA ${major_minor} pruning completed"
458+
echo "CUDA 13.0 installation completed"
442459
return 0
443460
}
444461

445-
# Version-specific pruning functions
446-
function prune_126 {
447-
prune_cuda "126" "12.6"
448-
}
462+
449463

450464
# Main execution logic
451465
echo "🔧 ===== Parsing command line arguments ====="
452-
VALID_VERSIONS=("12.6" "12.8" "12.9")
466+
VALID_VERSIONS=("12.6" "12.8" "12.9" "13.0")
453467

454468
# Parse command line arguments
455469
while test $# -gt 0; do
@@ -487,11 +501,7 @@ if [ $INSTALL_RESULT -ne 0 ]; then
487501
error_exit "Installation failed, exit code: $INSTALL_RESULT"
488502
fi
489503

490-
# Perform pruning if requested
491-
if [ "$SKIP_PRUNE" -eq 0 ]; then
492-
echo "Performing CUDA pruning operations..."
493-
eval "prune_${version_no_dot}" || error_exit "Pruning failed"
494-
fi
504+
495505

496506
# Final cleanup
497507
cleanup_temp_dirs

0 commit comments

Comments
 (0)