Skip to content

Commit be3c597

Browse files
authored
[Monitoring] Update auto scaling scripts to use metrics by PK API (Cloud-CV#4355)
1 parent 6535fe9 commit be3c597

4 files changed

Lines changed: 47 additions & 31 deletions

File tree

scripts/monitoring/auto_scale_ec2_workers.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,17 @@ def start_instance(challenge, evalai_interface):
7272
)
7373

7474

75-
def start_or_stop_workers(challenge, challenge_metrics, evalai_interface):
75+
def start_or_stop_workers(challenge, evalai_interface):
7676
try:
77+
challenge_metrics = evalai_interface.get_challenge_submission_metrics_by_pk(challenge["id"])
7778
pending_submissions = get_pending_submission_count(challenge_metrics)
78-
except Exception: # noqa: F841
79+
except Exception as e: # noqa: F841
7980
print(
8081
"Unable to get the pending submissions for challenge ID: {}, Title: {}. Skipping.".format(
8182
challenge["id"], challenge["title"]
8283
)
8384
)
85+
print(e)
8486
return
8587

8688
print("Pending Submissions: {}, Challenge PK: {}, Title: {}".format(pending_submissions, challenge["id"], challenge["title"]))
@@ -94,11 +96,11 @@ def start_or_stop_workers(challenge, challenge_metrics, evalai_interface):
9496

9597

9698
# TODO: Factor in limits for the APIs
97-
def start_or_stop_workers_for_challenges(response, metrics, evalai_interface):
99+
def start_or_stop_workers_for_challenges(response, evalai_interface):
98100
for challenge in response["results"]:
99101
if challenge["uses_ec2_worker"]:
100102
try:
101-
start_or_stop_workers(challenge, metrics[str(challenge["id"])], evalai_interface)
103+
start_or_stop_workers(challenge, evalai_interface)
102104
except Exception as e:
103105
print(e)
104106

@@ -112,12 +114,11 @@ def create_evalai_interface(auth_token, evalai_endpoint):
112114
def start_job():
113115
evalai_interface = create_evalai_interface(auth_token, evalai_endpoint)
114116
response = evalai_interface.get_challenges()
115-
metrics = evalai_interface.get_challenges_submission_metrics()
116-
start_or_stop_workers_for_challenges(response, metrics, evalai_interface)
117+
start_or_stop_workers_for_challenges(response, evalai_interface)
117118
next_page = response["next"]
118119
while next_page is not None:
119120
response = evalai_interface.make_request(next_page, "GET")
120-
start_or_stop_workers_for_challenges(response, metrics, evalai_interface)
121+
start_or_stop_workers_for_challenges(response, evalai_interface)
121122
next_page = response["next"]
122123

123124

scripts/monitoring/auto_scale_eks_nodes.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
# Env Variables
2727
ENV = os.environ.get("ENV", "production")
28-
AUTH_TOKEN = os.environ.get("AUTH_TOKEN")
28+
STAFF_AUTH_TOKEN = os.environ.get("AUTH_TOKEN")
2929
EVALAI_ENDPOINT = os.environ.get("API_HOST_URL", "https://eval.ai")
3030

3131
json_path = os.environ.get("JSON_PATH", "~/prod_eks_auth_tokens.json")
@@ -107,8 +107,7 @@ def stop_eks_worker(challenge, evalai_interface, aws_keys):
107107
return response
108108

109109

110-
def get_pending_submission_count_by_pk(metrics, challenge_pk):
111-
challenge_metrics = metrics[str(challenge_pk)]
110+
def get_pending_submission_count(challenge_metrics):
112111
pending_submissions = 0
113112
for status in ["running", "submitted", "queued", "resuming"]:
114113
pending_submissions += challenge_metrics.get(status, 0)
@@ -151,15 +150,17 @@ def scale_up_workers(challenge, original_desired_size, pending_submissions, eval
151150
)
152151

153152

154-
def scale_up_or_down_workers(challenge, metrics, evalai_interface, aws_keys, scale_up_desired_size):
153+
def scale_up_or_down_workers(challenge, evalai_interface, staff_evalai_interface, aws_keys, scale_up_desired_size):
155154
try:
156-
pending_submissions = get_pending_submission_count_by_pk(metrics, challenge["id"])
157-
except Exception: # noqa: F841
155+
challenge_metrics = staff_evalai_interface.get_challenge_submission_metrics_by_pk(challenge["id"])
156+
pending_submissions = get_pending_submission_count(challenge_metrics)
157+
except Exception as e: # noqa: F841
158158
print(
159159
"Unable to get the pending submissions for challenge ID: {}, Title: {}. Skipping.".format(
160160
challenge["id"], challenge["title"]
161161
)
162162
)
163+
print(e)
163164
return
164165

165166
eks_client, cluster_name, nodegroup_name = get_eks_meta(
@@ -209,8 +210,7 @@ def scale_up_or_down_workers(challenge, metrics, evalai_interface, aws_keys, sca
209210
def start_job():
210211

211212
# Get metrics
212-
evalai_interface = create_evalai_interface(AUTH_TOKEN)
213-
metrics = evalai_interface.get_challenges_submission_metrics()
213+
staff_evalai_interface = create_evalai_interface(STAFF_AUTH_TOKEN)
214214

215215
for challenge_id, details in INCLUDED_CHALLENGE_PKS.items():
216216
# Auth Token
@@ -237,7 +237,7 @@ def start_job():
237237
), "Challenge ID: {}, Title: {} is either not docker-based or remote-evaluation. Skipping.".format(
238238
challenge["id"], challenge["title"]
239239
)
240-
scale_up_or_down_workers(challenge, metrics, evalai_interface, aws_keys, scale_up_desired_size)
240+
scale_up_or_down_workers(challenge, evalai_interface, staff_evalai_interface, aws_keys, scale_up_desired_size)
241241
time.sleep(1)
242242
except Exception as e:
243243
print(e)

scripts/monitoring/auto_scale_workers.py

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -88,19 +88,27 @@ def scale_up_or_down_workers(challenge, challenge_metrics):
8888

8989

9090
# TODO: Factor in limits for the APIs
91-
def scale_up_or_down_workers_for_challenges(response, metrics):
91+
def scale_up_or_down_workers_for_challenge(challenge, challenge_metrics):
92+
if ENV == "prod":
93+
try:
94+
if challenge["remote_evaluation"] is False:
95+
scale_up_or_down_workers(challenge, challenge_metrics)
96+
except Exception as e:
97+
print(e)
98+
else:
99+
try:
100+
scale_up_or_down_workers(challenge, challenge_metrics)
101+
except Exception as e:
102+
print(e)
103+
104+
105+
def scale_up_or_down_workers_for_challenges(response, evalai_interface):
92106
for challenge in response["results"]:
93-
if ENV == "prod":
94-
try:
95-
if challenge["remote_evaluation"] is False:
96-
scale_up_or_down_workers(challenge, metrics[str(challenge["id"])])
97-
except Exception as e:
98-
print(e)
99-
else:
100-
try:
101-
scale_up_or_down_workers(challenge, metrics[str(challenge["id"])])
102-
except Exception as e:
103-
print(e)
107+
try:
108+
challenge_metrics = evalai_interface.get_challenge_submission_metrics_by_pk(challenge["id"])
109+
scale_up_or_down_workers_for_challenge(challenge, challenge_metrics)
110+
except Exception as e:
111+
print(e)
104112

105113

106114
def create_evalai_interface(auth_token, evalai_endpoint):
@@ -112,12 +120,11 @@ def create_evalai_interface(auth_token, evalai_endpoint):
112120
def start_job():
113121
evalai_interface = create_evalai_interface(auth_token, evalai_endpoint)
114122
response = evalai_interface.get_challenges()
115-
metrics = evalai_interface.get_challenges_submission_metrics()
116-
scale_up_or_down_workers_for_challenges(response, metrics)
123+
scale_up_or_down_workers_for_challenges(response, evalai_interface)
117124
next_page = response["next"]
118125
while next_page is not None:
119126
response = evalai_interface.make_request(next_page, "GET")
120-
scale_up_or_down_workers_for_challenges(response, metrics)
127+
scale_up_or_down_workers_for_challenges(response, evalai_interface)
121128
next_page = response["next"]
122129

123130

scripts/monitoring/evalai_interface.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
"get_challenges": "/api/challenges/challenge/all/all/all",
2020
"get_submissions_for_challenge": "/api/jobs/challenge/{}/submission/",
2121
"get_challenges_submission_metrics": "/api/challenges/challenge/get_submission_metrics",
22+
"get_challenge_submission_metrics_by_pk": "/api/challenges/challenge/get_submission_metrics_by_pk/{}/",
2223
"manage_ec2_instance": "/api/challenges/{}/manage_ec2_instance/{}",
2324
"get_ec2_instance_details": "/api/challenges/{}/get_ec2_instance_details/",
2425
}
@@ -144,6 +145,13 @@ def get_challenges_submission_metrics(self):
144145
response = self.make_request(url, "GET")
145146
return response
146147

148+
def get_challenge_submission_metrics_by_pk(self, challenge_pk):
149+
url_template = URLS.get("get_challenge_submission_metrics_by_pk")
150+
url = url_template.format(challenge_pk)
151+
url = self.return_url_per_environment(url)
152+
response = self.make_request(url, "GET")
153+
return response
154+
147155
def get_ec2_instance_details(self, challenge_pk):
148156
url_template = URLS.get("get_ec2_instance_details")
149157
url = url_template.format(challenge_pk)

0 commit comments

Comments (0)