From 530f53c11c2b3f3034ea4498967b51707e3c1ecf Mon Sep 17 00:00:00 2001 From: Charlie Lye Date: Sun, 29 Oct 2023 21:08:15 +0000 Subject: [PATCH 1/6] Retry on request spot. --- build-system/scripts/request_spot | 11 ++++++++--- build-system/scripts/spot_run_script | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/build-system/scripts/request_spot b/build-system/scripts/request_spot index 40d7d9a585ad..999292bf732b 100755 --- a/build-system/scripts/request_spot +++ b/build-system/scripts/request_spot @@ -110,6 +110,11 @@ done # Wait till ssh port is open. >&2 echo "Waiting for SSH at $IP..." -while ! nc -z $IP 22; do sleep 1; done; - -echo $IP +for I in {1..60}; do + if nc -z $IP 22; then + echo $IP + exit 0 + fi + sleep 1 +done +exit 1 diff --git a/build-system/scripts/spot_run_script b/build-system/scripts/spot_run_script index d8cd288070a9..c9b45d3fd38c 100755 --- a/build-system/scripts/spot_run_script +++ b/build-system/scripts/spot_run_script @@ -23,7 +23,7 @@ function on_exit { trap on_exit EXIT # Get spot instance. -IP=$(request_spot $CONTENT_HASH:$JOB_NAME $CPUS) +IP=$(retry request_spot $CONTENT_HASH:$JOB_NAME $CPUS) # Run script remotely on spot instance, capturing success or failure. set +e From 902a78873d513f5aa31c5de62d1ced5d4c17ff0f Mon Sep 17 00:00:00 2001 From: Charlie Lye Date: Sun, 29 Oct 2023 21:08:29 +0000 Subject: [PATCH 2/6] [ci rebuild] From a4b7aacd37c5ce7fdaf53b8077b85fca31687d52 Mon Sep 17 00:00:00 2001 From: Charlie Lye Date: Mon, 30 Oct 2023 11:24:01 +0000 Subject: [PATCH 3/6] wip [ci rebuild] --- build-system/scripts/create_ecr_manifest | 4 ++-- build-system/scripts/remote_run_script | 1 + build-system/scripts/request_spot | 2 +- build-system/scripts/retry | 2 +- build-system/scripts/spot_run_script | 23 +++++++++++++---------- 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/build-system/scripts/create_ecr_manifest b/build-system/scripts/create_ecr_manifest index 44d887a190f2..2e9fad6ae68b 100755 --- a/build-system/scripts/create_ecr_manifest +++ b/build-system/scripts/create_ecr_manifest @@ -27,9 +27,9 @@ for A in $ARCH_LIST do ARCH_IMAGE=$IMAGE_URI-$A echo "Adding image $ARCH_IMAGE to manifest list." - docker manifest create $IMAGE_URI --amend $ARCH_IMAGE + retry docker manifest create $IMAGE_URI --amend $ARCH_IMAGE done IFS=$OLD_IFS unset OLD_IFS -docker manifest push --purge $IMAGE_URI +retry docker manifest push --purge $IMAGE_URI diff --git a/build-system/scripts/remote_run_script b/build-system/scripts/remote_run_script index 8aa4e0ea64f4..e9ef84d723d4 100755 --- a/build-system/scripts/remote_run_script +++ b/build-system/scripts/remote_run_script @@ -15,6 +15,7 @@ shift SSH_CONFIG_PATH=${SSH_CONFIG_PATH:-$BUILD_SYSTEM_PATH/remote/ssh_config} # Copy the runner script to spot instance. This is what we actually run. +echo "Copying ./remote_runner to $IP..." scp -rF $SSH_CONFIG_PATH $BUILD_SYSTEM_PATH/scripts/remote_runner $IP:. # Run script on remote instance, passing environment variables. diff --git a/build-system/scripts/request_spot b/build-system/scripts/request_spot index 999292bf732b..9e35b9b769ad 100755 --- a/build-system/scripts/request_spot +++ b/build-system/scripts/request_spot @@ -24,7 +24,7 @@ INSTANCE_TYPE_SUFFIX=${cpu_map[$CPUS]} # Check if INSTANCE_TYPE_SUFFIX is set, if not, the CPU count is not recognized. if [ -z "$INSTANCE_TYPE_SUFFIX" ]; then - echo "Unrecognized CPU count: $CPUS" + >&2 echo "Unrecognized CPU count: $CPUS" exit 1 fi diff --git a/build-system/scripts/retry b/build-system/scripts/retry index 88cbeb6789ee..0489aa226fd2 100755 --- a/build-system/scripts/retry +++ b/build-system/scripts/retry @@ -3,5 +3,5 @@ ATTEMPTS=3 for i in $(seq 1 $ATTEMPTS); do "$@" && exit || sleep 10 done -echo "$@ failed after $ATTEMPTS attempts" +>&2 echo "$@ failed after $ATTEMPTS attempts" exit 1 diff --git a/build-system/scripts/spot_run_script b/build-system/scripts/spot_run_script index c9b45d3fd38c..de1ee2230519 100755 --- a/build-system/scripts/spot_run_script +++ b/build-system/scripts/spot_run_script @@ -12,8 +12,14 @@ CONTENT_HASH=$1 CPUS=$2 shift 2 -# On any sort of exit (error or not), kill spot request so it doesn't count against quota. +# On any sort of exit (error or not). function on_exit { + if [ -n "$IP" ]; then + echo "Terminating spot instance..." + ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1 + fi + + # Kill spot request so it doesn't count against quota. if [ -f "sir-$CONTENT_HASH:$JOB_NAME.txt" ]; then SIR=$(cat sir-$CONTENT_HASH:$JOB_NAME.txt) echo "Cancelling spot instance request $SIR (silently)" @@ -25,13 +31,10 @@ trap on_exit EXIT # Get spot instance. IP=$(retry request_spot $CONTENT_HASH:$JOB_NAME $CPUS) -# Run script remotely on spot instance, capturing success or failure. -set +e -remote_run_script $IP $@ -CODE=$? +if [ -z "$IP" ]; then + echo "Failed to get spot instance." + exit 1 +fi -# Shutdown spot. -echo "Terminating spot instance..." -ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1 - -exit $CODE +# Run script remotely on spot instance, capturing success or failure. +remote_run_script $IP $@ \ No newline at end of file From 2ff2951597dce57696ab48c3de68f75bc524c8cf Mon Sep 17 00:00:00 2001 From: Charlie Lye Date: Mon, 30 Oct 2023 11:34:19 +0000 Subject: [PATCH 4/6] wip [ci rebuild] [ci debug] From 0bcd3a068b38e510c6b41ff18f8aa5c2e7708d27 Mon Sep 17 00:00:00 2001 From: Charlie Lye Date: Mon, 30 Oct 2023 11:42:32 +0000 Subject: [PATCH 5/6] wip [ci rebuild] [ci debug] --- build-system/scripts/spot_run_script | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build-system/scripts/spot_run_script b/build-system/scripts/spot_run_script index de1ee2230519..69e46824e054 100755 --- a/build-system/scripts/spot_run_script +++ b/build-system/scripts/spot_run_script @@ -16,7 +16,7 @@ shift 2 function on_exit { if [ -n "$IP" ]; then echo "Terminating spot instance..." - ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1 + ssh -F $SSH_CONFIG_PATH $IP sudo halt -p fi # Kill spot request so it doesn't count against quota. From 3d9c34939a81742c01dcc8eae0addd18cebe0d36 Mon Sep 17 00:00:00 2001 From: Charlie Lye Date: Mon, 30 Oct 2023 11:46:24 +0000 Subject: [PATCH 6/6] wip [ci rebuild] [ci debug] --- build-system/scripts/spot_run_script | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/build-system/scripts/spot_run_script b/build-system/scripts/spot_run_script index 69e46824e054..69707de660f4 100755 --- a/build-system/scripts/spot_run_script +++ b/build-system/scripts/spot_run_script @@ -14,15 +14,17 @@ shift 2 # On any sort of exit (error or not). function on_exit { + set +e + if [ -n "$IP" ]; then echo "Terminating spot instance..." - ssh -F $SSH_CONFIG_PATH $IP sudo halt -p + ssh -F $SSH_CONFIG_PATH $IP sudo halt -p > /dev/null 2>&1 fi # Kill spot request so it doesn't count against quota. if [ -f "sir-$CONTENT_HASH:$JOB_NAME.txt" ]; then SIR=$(cat sir-$CONTENT_HASH:$JOB_NAME.txt) - echo "Cancelling spot instance request $SIR (silently)" + echo "Cancelling spot instance request $SIR..." aws ec2 cancel-spot-instance-requests --spot-instance-request-ids $SIR >/dev/null 2>&1 || true fi }