From c286ff80a8d1827acff357f0b047943089e8b067 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Wed, 19 Jun 2024 19:33:42 -0400
Subject: [PATCH 1/3] chore(ci): workaround to retry with uv

Signed-off-by: Jinzhe Zeng
---
 .github/workflows/build_cc.yml    | 2 +-
 .github/workflows/test_cc.yml     | 4 ++--
 .github/workflows/test_cuda.yml   | 4 ++--
 .github/workflows/test_python.yml | 6 +++---
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml
index bf16b67656..775b88cfd3 100644
--- a/.github/workflows/build_cc.yml
+++ b/.github/workflows/build_cc.yml
@@ -32,7 +32,7 @@ jobs:
         python-version: '3.11'
     - uses: lukka/get-cmake@latest
     - run: python -m pip install uv
-    - run: python -m uv pip install --system tensorflow
+    - run: source/install/uv_with_retry.sh pip install --system tensorflow
     - name: Download libtorch
       run: |
         wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml
index 799a55e9ff..ebbfc4d960 100644
--- a/.github/workflows/test_cc.yml
+++ b/.github/workflows/test_cc.yml
@@ -27,7 +27,7 @@ jobs:
         mpi: mpich
     - uses: lukka/get-cmake@latest
     - run: python -m pip install uv
-    - run: python -m uv pip install --system tensorflow
+    - run: source/install/uv_with_retry.sh pip install --system tensorflow
     - name: Download libtorch
       run: |
         wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.2%2Bcpu.zip -O libtorch.zip
@@ -49,7 +49,7 @@ jobs:
     # test lammps
     - run: |
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        python -m uv pip install --system -e .[cpu,test,lmp] mpi4py
+        source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp] mpi4py
       env:
         DP_BUILD_TESTING: 1
       if: ${{ !matrix.check_memleak }}
diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml
index d97b1f9431..703d0ea2fe 100644
--- a/.github/workflows/test_cuda.yml
+++ b/.github/workflows/test_cuda.yml
@@ -47,10 +47,10 @@ jobs:
         && sudo apt-get -y install cuda-12-3 libcudnn8=8.9.5.*-1+cuda12.3
       if: false # skip as we use nvidia image
     - run: python -m pip install -U uv
-    - run: python -m uv pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0"
+    - run: source/install/uv_with_retry.sh pip install --system "tensorflow>=2.15.0rc0" "torch>=2.2.0"
     - run: |
         export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')
-        python -m uv pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py
+        source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch] mpi4py
       env:
         DP_VARIANT: cuda
         DP_ENABLE_NATIVE_OPTIMIZATION: 1
diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml
index 0f9fc61acd..3cf56ecbd3 100644
--- a/.github/workflows/test_python.yml
+++ b/.github/workflows/test_python.yml
@@ -25,10 +25,10 @@ jobs:
         python-version: ${{ matrix.python }}
     - run: python -m pip install -U uv
     - run: |
-        uv pip install --system mpich
-        uv pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/
+        source/install/uv_with_retry.sh pip install --system mpich
+        source/install/uv_with_retry.sh pip install --system "torch==2.3.0+cpu.cxx11.abi" -i https://download.pytorch.org/whl/
         export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')
-        uv pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py
+        source/install/uv_with_retry.sh pip install --system --only-binary=horovod -e .[cpu,test] horovod[tensorflow-cpu] mpi4py
       env:
         # Please note that uv has some issues with finding
         # existing TensorFlow package. Currently, it uses

From 287b04577e629fa7614220d541bcfceabb4cf432 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Wed, 19 Jun 2024 19:34:16 -0400
Subject: [PATCH 2/3] sh

Add source/install/uv_with_retry.sh, a wrapper that re-runs uv when it
fails with "error decoding response body".

Signed-off-by: Jinzhe Zeng
---
 source/install/uv_with_retry.sh | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100755 source/install/uv_with_retry.sh

diff --git a/source/install/uv_with_retry.sh b/source/install/uv_with_retry.sh
new file mode 100755
index 0000000000..6de95edbfa
--- /dev/null
+++ b/source/install/uv_with_retry.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# This script is used to retry the uv command if the error "error decoding response body" is encountered.
+# See also:
+# https://github.com/astral-sh/uv/issues/2586
+# https://github.com/astral-sh/uv/issues/3456
+# https://github.com/astral-sh/uv/issues/3514
+# https://github.com/astral-sh/uv/issues/4402
+tmpstderr=$(mktemp)
+max_retry=3
+while true; do
+  uv "$@" 2> >(tee -a ${tmpstderr} >&2)
+  exit_code=$?
+  # exit if ok
+  if [ $exit_code -eq 0 ]; then
+    rm -f ${tmpstderr}
+    exit 0
+  fi
+  # check if "error decoding response body" is in the stderr
+  if grep -q "error decoding response body" ${tmpstderr}; then
+    echo "Retrying uv in 1 s..."
+    max_retry=$((max_retry - 1))
+    if [ $max_retry -eq 0 ]; then
+      echo "Max retry reached, exiting..."
+      rm -f ${tmpstderr}
+      exit 1
+    fi
+    sleep 1
+  else
+    rm -f ${tmpstderr}
+    exit $exit_code
+  fi
+done

From facf45b8552406d232a29ef66236414523292f9a Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Wed, 19 Jun 2024 20:00:38 -0400
Subject: [PATCH 3/3] Update source/install/uv_with_retry.sh

Quote the variable expansions and harden the retry wrapper:

- capture stderr through a pipe instead of a process substitution, so
  that tee has finished writing before grep inspects the capture;
- truncate the capture on every attempt, so a match left over from an
  earlier attempt cannot trigger a retry for an unrelated failure;
- remove the temporary file from an EXIT trap instead of by hand on
  each exit path;
- print "Retrying" only when a retry actually follows, send the
  wrapper's own messages to stderr, and exit with uv's status once the
  retries are exhausted.

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
Signed-off-by: Jinzhe Zeng
---
 source/install/uv_with_retry.sh | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/source/install/uv_with_retry.sh b/source/install/uv_with_retry.sh
index 6de95edbfa..2d9a524f6b 100755
--- a/source/install/uv_with_retry.sh
+++ b/source/install/uv_with_retry.sh
@@ -5,28 +5,33 @@
 # https://github.com/astral-sh/uv/issues/3456
 # https://github.com/astral-sh/uv/issues/3514
 # https://github.com/astral-sh/uv/issues/4402
-tmpstderr=$(mktemp)
+tmpstderr=$(mktemp) || exit 1
+# Remove the stderr capture on every exit path.
+trap 'rm -f "${tmpstderr}"' EXIT
 max_retry=3
 while true; do
-  uv "$@" 2> >(tee -a ${tmpstderr} >&2)
-  exit_code=$?
+  # Pipe stderr through tee instead of a process substitution: bash does not
+  # wait for >(...) to finish, so grep could read an incomplete capture.
+  # fd 3 carries uv's stdout around the pipe; tee truncates the capture so
+  # every attempt is judged on its own stderr only.
+  {
+    uv "$@" 2>&1 >&3 | tee "${tmpstderr}" >&2
+    exit_code=${PIPESTATUS[0]}
+  } 3>&1
   # exit if ok
-  if [ $exit_code -eq 0 ]; then
-    rm -f ${tmpstderr}
+  if [ "${exit_code}" -eq 0 ]; then
     exit 0
   fi
   # check if "error decoding response body" is in the stderr
-  if grep -q "error decoding response body" ${tmpstderr}; then
-    echo "Retrying uv in 1 s..."
+  if grep -q "error decoding response body" "${tmpstderr}"; then
     max_retry=$((max_retry - 1))
-    if [ $max_retry -eq 0 ]; then
-      echo "Max retry reached, exiting..."
-      rm -f ${tmpstderr}
-      exit 1
+    if [ "${max_retry}" -eq 0 ]; then
+      echo "Max retry reached, exiting..." >&2
+      exit "${exit_code}"
     fi
+    echo "Retrying uv in 1 s..." >&2
     sleep 1
   else
-    rm -f ${tmpstderr}
-    exit $exit_code
+    exit "${exit_code}"
   fi
 done