From 4f14e19df979a11497dc040866d7afb6eaeed17c Mon Sep 17 00:00:00 2001 From: Navin Soni Date: Sat, 6 Aug 2022 00:21:50 +0000 Subject: [PATCH] fix: Add retry in session.py to check if training is finished --- src/sagemaker/session.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index 145bf41cbe..e7dc0171df 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -41,6 +41,7 @@ secondary_training_status_changed, secondary_training_status_message, sts_regional_endpoint, + retries, ) from sagemaker import exceptions from sagemaker.session_settings import SessionSettings @@ -4691,21 +4692,30 @@ def _train_done(sagemaker_client, job_name, last_desc): """Placeholder docstring""" in_progress_statuses = ["InProgress", "Created"] - desc = sagemaker_client.describe_training_job(TrainingJobName=job_name) - status = desc["TrainingJobStatus"] + for _ in retries( + max_retry_count=10, # 10*30 = 5min + exception_message_prefix="Waiting for schedule to leave 'Pending' status", + seconds_to_sleep=30, + ): + try: + desc = sagemaker_client.describe_training_job(TrainingJobName=job_name) + status = desc["TrainingJobStatus"] - if secondary_training_status_changed(desc, last_desc): - print() - print(secondary_training_status_message(desc, last_desc), end="") - else: - print(".", end="") - sys.stdout.flush() + if secondary_training_status_changed(desc, last_desc): + print() + print(secondary_training_status_message(desc, last_desc), end="") + else: + print(".", end="") + sys.stdout.flush() - if status in in_progress_statuses: - return desc, False + if status in in_progress_statuses: + return desc, False - print() - return desc, True + print() + return desc, True + except botocore.exceptions.ClientError as err: + if err.response["Error"]["Code"] == "AccessDeniedException": + pass def _processing_job_status(sagemaker_client, job_name):