From 3e9560579e4baa691de38512d601189e514cd766 Mon Sep 17 00:00:00 2001 From: Johann Fuechsl Date: Mon, 15 Jun 2020 14:18:37 +0200 Subject: [PATCH 1/2] Retry on potential ReplicaSet create race-condition. --- rootfs/scheduler/resources/deployment.py | 45 +++++++++++++++--------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/rootfs/scheduler/resources/deployment.py b/rootfs/scheduler/resources/deployment.py index 0618dc274..74d9f7ce8 100644 --- a/rootfs/scheduler/resources/deployment.py +++ b/rootfs/scheduler/resources/deployment.py @@ -378,22 +378,35 @@ def _check_for_failed_events(self, namespace, labels): Request for new ReplicaSet of Deployment and search for failed events involved by that RS Raises: KubeException when RS have events with FailedCreate reason """ - response = self.rs.get(namespace, labels=labels) - data = response.json() - fields = { - 'involvedObject.kind': 'ReplicaSet', - 'involvedObject.name': data['items'][0]['metadata']['name'], - 'involvedObject.namespace': namespace, - 'involvedObject.uid': data['items'][0]['metadata']['uid'], - } - events_list = self.ns.events(namespace, fields=fields).json() - events = events_list.get('items', []) - if events is not None and len(events) != 0: - for event in events: - if event['reason'] == 'FailedCreate': - log = self._get_formatted_messages(events) - self.log(namespace, log) - raise KubeException(log) + max_retries = 3 + retry_sleep_sec = 3.0 + for try_ in range(max_retries): + response = self.rs.get(namespace, labels=labels) + data = response.json() + try: + fields = { + 'involvedObject.kind': 'ReplicaSet', + 'involvedObject.name': data['items'][0]['metadata']['name'], + 'involvedObject.namespace': namespace, + 'involvedObject.uid': data['items'][0]['metadata']['uid'], + } + except Exception as e: + if try_ + 1 < max_retries: + self.log(namespace, "Got an empty ReplicaSet list. Trying one more time. {}".format( + json.dumps(labels))) + time.sleep(retry_sleep_sec) + continue + self.log(namespace, "Did not find the ReplicaSet for {}".format( + json.dumps(labels)), "WARN") + raise e + events_list = self.ns.events(namespace, fields=fields).json() + events = events_list.get('items', []) + if events is not None and len(events) != 0: + for event in events: + if event['reason'] == 'FailedCreate': + log = self._get_formatted_messages(events) + self.log(namespace, log) + raise KubeException(log) @staticmethod def _get_formatted_messages(events): From 7f3fea72af67de37596a01c2364d4700fd62d4b0 Mon Sep 17 00:00:00 2001 From: Johann Fuechsl Date: Tue, 16 Jun 2020 07:27:05 +0200 Subject: [PATCH 2/2] Fix style --- rootfs/scheduler/resources/deployment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rootfs/scheduler/resources/deployment.py b/rootfs/scheduler/resources/deployment.py index 74d9f7ce8..b30cbc000 100644 --- a/rootfs/scheduler/resources/deployment.py +++ b/rootfs/scheduler/resources/deployment.py @@ -392,8 +392,9 @@ def _check_for_failed_events(self, namespace, labels): } except Exception as e: if try_ + 1 < max_retries: - self.log(namespace, "Got an empty ReplicaSet list. Trying one more time. {}".format( - json.dumps(labels))) + self.log(namespace, + "Got an empty ReplicaSet list. Trying one more time. {}".format( + json.dumps(labels))) time.sleep(retry_sleep_sec) continue self.log(namespace, "Did not find the ReplicaSet for {}".format(