From 6eac4f61a226ee633b09fcbc7a6bb23a83d53c4b Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Sun, 16 Mar 2025 17:23:39 +0200 Subject: [PATCH 01/11] Avoid race when removing interfaces via NNCP Remoivng an interface that was created using an NNCP, is done by editing the same NNCP. This sometimes resulted in a race, in which the NNCP success status actually presented the prvious status, leading to deleting the NNCP before the configuration was completed, leaving hanging interfaces in the cluster nodes, with node native interfaces occupied as the ports of these tests-created interfaces. A recent PR made this failed flow to always occur. This PR aims to assure that the timestamp of the AVAIALBLE status is updated for the recent change (the interface removal) and not the previous change (setup or modification). This PR is based on the fix that was presented in https://github.com/RedHatQE/openshift-virtualization-tests/pull/430. --- .../node_network_configuration_policy.py | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index b9f21e2f0b..28d56b0d43 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -1,8 +1,9 @@ import re +from datetime import datetime from kubernetes.dynamic.exceptions import ConflictError -from ocp_resources.utils.constants import TIMEOUT_4MINUTES +from ocp_resources.utils.constants import TIMEOUT_1MINUTE, TIMEOUT_4MINUTES, TIMEOUT_5SEC from ocp_resources.exceptions import NNCPConfigurationFailed from ocp_resources.node import Node from ocp_resources.node_network_configuration_enactment import ( @@ -10,7 +11,7 @@ ) from ocp_resources.node_network_state import NodeNetworkState from ocp_resources.resource import Resource, ResourceEditor -from timeout_sampler import TimeoutExpiredError, TimeoutSampler, TimeoutWatch +from timeout_sampler import TimeoutExpiredError, TimeoutSampler, TimeoutWatch, retry IPV4_STR = "ipv4" IPV6_STR = "ipv6" @@ -447,3 +448,33 @@ def _get_failed_nnce(self): for nnce_cond in nnce.instance.status.conditions: if nnce_cond.type == "Failing" and nnce_cond.status == Resource.Condition.Status.TRUE: yield nnce + + def _get_nncp_configured_last_transition_time(self): + for condition in self.instance.status.conditions: + if ( + condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE + and condition["status"] == "True" + and condition["reason"] == NodeNetworkConfigurationPolicy.Conditions.Reason.SUCCESSFULLY_CONFIGURED + ): + return condition["lastTransitionTime"] + + @retry( + wait_timeout=TIMEOUT_1MINUTE, + sleep=TIMEOUT_5SEC, + ) + def _wait_for_nncp_with_different_transition_time(self, initial_transition_time): + date_format = "%Y-%m-%dT%H:%M:%SZ" + for condition in self.instance.get("status", {}).get("conditions", []): + if ( + condition + and condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE + and datetime.strptime(condition["lastTransitionTime"], date_format) + > datetime.strptime(initial_transition_time, date_format) + ): + return True + return False + + def update(self, resource_dict=None): + initial_transition_time = self._get_nncp_configured_last_transition_time() + super().update(resource_dict=resource_dict) + self._wait_for_nncp_with_different_transition_time(initial_transition_time=initial_transition_time) From f331d4bd76e8b800f803365fec6dc22c549af8b9 Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Mon, 17 Mar 2025 13:26:04 +0200 Subject: [PATCH 02/11] Store fixed initial time outside loop --- ocp_resources/node_network_configuration_policy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index 28d56b0d43..ee1ca3759f 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -464,12 +464,13 @@ def _get_nncp_configured_last_transition_time(self): ) def _wait_for_nncp_with_different_transition_time(self, initial_transition_time): date_format = "%Y-%m-%dT%H:%M:%SZ" + formatted_initial_transition_time = datetime.strptime(initial_transition_time, date_format) for condition in self.instance.get("status", {}).get("conditions", []): if ( condition and condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE and datetime.strptime(condition["lastTransitionTime"], date_format) - > datetime.strptime(initial_transition_time, date_format) + > formatted_initial_transition_time ): return True return False From 85b043a0cfd713992e3630a181723a3fb04de6aa Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Mon, 17 Mar 2025 17:53:57 +0200 Subject: [PATCH 03/11] Handle CR comments --- .../node_network_configuration_policy.py | 60 +++++++++---------- 1 file changed, 29 insertions(+), 31 deletions(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index ee1ca3759f..29bf53b220 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -327,6 +327,35 @@ def _absent_interface(self): patches={self: {"spec": {"desiredState": {"interfaces": self.desired_state["interfaces"]}}}} ).update() + def update(self, resource_dict=None): + initial_transition_time = self._get_nncp_configured_last_transition_time() + super().update(resource_dict=resource_dict) + self._wait_for_nncp_status_update(initial_transition_time=initial_transition_time) + + def _get_nncp_configured_last_transition_time(self): + for condition in self.instance.status.conditions: + if ( + condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE + and condition["status"] == "True" + and condition["reason"] == NodeNetworkConfigurationPolicy.Conditions.Reason.SUCCESSFULLY_CONFIGURED + ): + return condition["lastTransitionTime"] + + @retry( + wait_timeout=TIMEOUT_1MINUTE, + sleep=TIMEOUT_5SEC, + ) + def _wait_for_nncp_status_update(self, initial_transition_time): + date_format = "%Y-%m-%dT%H:%M:%SZ" + formatted_initial_transition_time = datetime.strptime(initial_transition_time, date_format) + if any( + condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE + and datetime.strptime(condition["lastTransitionTime"], date_format) > formatted_initial_transition_time + for condition in self.instance.get("status", {}).get("conditions", []) + ): + return True + return False + @property def status(self): for condition in self.instance.status.conditions: @@ -448,34 +477,3 @@ def _get_failed_nnce(self): for nnce_cond in nnce.instance.status.conditions: if nnce_cond.type == "Failing" and nnce_cond.status == Resource.Condition.Status.TRUE: yield nnce - - def _get_nncp_configured_last_transition_time(self): - for condition in self.instance.status.conditions: - if ( - condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE - and condition["status"] == "True" - and condition["reason"] == NodeNetworkConfigurationPolicy.Conditions.Reason.SUCCESSFULLY_CONFIGURED - ): - return condition["lastTransitionTime"] - - @retry( - wait_timeout=TIMEOUT_1MINUTE, - sleep=TIMEOUT_5SEC, - ) - def _wait_for_nncp_with_different_transition_time(self, initial_transition_time): - date_format = "%Y-%m-%dT%H:%M:%SZ" - formatted_initial_transition_time = datetime.strptime(initial_transition_time, date_format) - for condition in self.instance.get("status", {}).get("conditions", []): - if ( - condition - and condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE - and datetime.strptime(condition["lastTransitionTime"], date_format) - > formatted_initial_transition_time - ): - return True - return False - - def update(self, resource_dict=None): - initial_transition_time = self._get_nncp_configured_last_transition_time() - super().update(resource_dict=resource_dict) - self._wait_for_nncp_with_different_transition_time(initial_transition_time=initial_transition_time) From bed9b315242d741502d0037db53987d91208a290 Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Tue, 18 Mar 2025 14:17:15 +0200 Subject: [PATCH 04/11] More CR fixes Verify orderly teardown of a failed NNCP. --- ocp_resources/node_network_configuration_policy.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index 29bf53b220..a531007971 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -330,7 +330,10 @@ def _absent_interface(self): def update(self, resource_dict=None): initial_transition_time = self._get_nncp_configured_last_transition_time() super().update(resource_dict=resource_dict) - self._wait_for_nncp_status_update(initial_transition_time=initial_transition_time) + + # If the setup NNCP failed - we don't need to verify its teardown status time is updated. + if initial_transition_time: + self._wait_for_nncp_status_update(initial_transition_time=initial_transition_time) def _get_nncp_configured_last_transition_time(self): for condition in self.instance.status.conditions: @@ -348,13 +351,11 @@ def _get_nncp_configured_last_transition_time(self): def _wait_for_nncp_status_update(self, initial_transition_time): date_format = "%Y-%m-%dT%H:%M:%SZ" formatted_initial_transition_time = datetime.strptime(initial_transition_time, date_format) - if any( + return any( condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE and datetime.strptime(condition["lastTransitionTime"], date_format) > formatted_initial_transition_time for condition in self.instance.get("status", {}).get("conditions", []) - ): - return True - return False + ) @property def status(self): From 99c506e43c958bc5875dbdf29ccbfe95039c3268 Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Tue, 18 Mar 2025 16:35:55 +0200 Subject: [PATCH 05/11] More CR fixes Remove redundant overriding update(). --- ocp_resources/node_network_configuration_policy.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index a531007971..72214d7c47 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -323,14 +323,11 @@ def _absent_interface(self): if self.ports: self.add_ports() + initial_transition_time = self._get_nncp_configured_last_transition_time() ResourceEditor( patches={self: {"spec": {"desiredState": {"interfaces": self.desired_state["interfaces"]}}}} ).update() - def update(self, resource_dict=None): - initial_transition_time = self._get_nncp_configured_last_transition_time() - super().update(resource_dict=resource_dict) - # If the setup NNCP failed - we don't need to verify its teardown status time is updated. if initial_transition_time: self._wait_for_nncp_status_update(initial_transition_time=initial_transition_time) From 273fde65d74f2d2d645568f3cf8bcd0f187eafcc Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Tue, 18 Mar 2025 17:54:51 +0200 Subject: [PATCH 06/11] Fix CI failure Add datetime to flake8. --- .flake8 | 1 + 1 file changed, 1 insertion(+) diff --git a/.flake8 b/.flake8 index b29653fe2d..614ff503e9 100644 --- a/.flake8 +++ b/.flake8 @@ -58,6 +58,7 @@ fcn_exclude_functions = click, ast, filecmp, + datetime, enable-extensions = FCN, From e9a1776878bddff0dec44aa1929817b4ca7701ba Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Wed, 19 Mar 2025 11:26:44 +0200 Subject: [PATCH 07/11] Fix more CR comments Clarify comment Rename args for clarification Add typing for new functions --- .../node_network_configuration_policy.py | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index 72214d7c47..5da244b578 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -323,21 +323,22 @@ def _absent_interface(self): if self.ports: self.add_ports() - initial_transition_time = self._get_nncp_configured_last_transition_time() + initial_success_status_time = self._get_last_successful_transition_time() ResourceEditor( patches={self: {"spec": {"desiredState": {"interfaces": self.desired_state["interfaces"]}}}} ).update() - # If the setup NNCP failed - we don't need to verify its teardown status time is updated. - if initial_transition_time: - self._wait_for_nncp_status_update(initial_transition_time=initial_transition_time) + # If the NNCP failed on setup, then at this point it's not in AVAILABLE status, so the next time it will be in + # AVAILABLE status will necessarily be the updated one. + if initial_success_status_time: + self._wait_for_nncp_status_update(initial_transition_time=initial_success_status_time) - def _get_nncp_configured_last_transition_time(self): + def _get_last_successful_transition_time(self) -> str: for condition in self.instance.status.conditions: if ( - condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE - and condition["status"] == "True" - and condition["reason"] == NodeNetworkConfigurationPolicy.Conditions.Reason.SUCCESSFULLY_CONFIGURED + condition["type"] == self.Conditions.Type.AVAILABLE + and condition["status"] == Resource.Condition.Status.TRUE + and condition["reason"] == self.Conditions.Reason.SUCCESSFULLY_CONFIGURED ): return condition["lastTransitionTime"] @@ -345,14 +346,15 @@ def _get_nncp_configured_last_transition_time(self): wait_timeout=TIMEOUT_1MINUTE, sleep=TIMEOUT_5SEC, ) - def _wait_for_nncp_status_update(self, initial_transition_time): + def _wait_for_nncp_status_update(self, initial_transition_time: str) -> bool: date_format = "%Y-%m-%dT%H:%M:%SZ" formatted_initial_transition_time = datetime.strptime(initial_transition_time, date_format) - return any( - condition["type"] == NodeNetworkConfigurationPolicy.Conditions.Type.AVAILABLE - and datetime.strptime(condition["lastTransitionTime"], date_format) > formatted_initial_transition_time - for condition in self.instance.get("status", {}).get("conditions", []) - ) + for condition in self.instance.get("status", {}).get("conditions", []): + if ( + condition["type"] == self.Conditions.Type.AVAILABLE + and datetime.strptime(condition["lastTransitionTime"], date_format) > formatted_initial_transition_time + ): + return True @property def status(self): From 9706dccded5d728ecb0b91390d30663e85209107 Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Wed, 19 Mar 2025 12:51:20 +0200 Subject: [PATCH 08/11] Fix CI failure --- ocp_resources/node_network_configuration_policy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index 5da244b578..56223db8bf 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -333,7 +333,7 @@ def _absent_interface(self): if initial_success_status_time: self._wait_for_nncp_status_update(initial_transition_time=initial_success_status_time) - def _get_last_successful_transition_time(self) -> str: + def _get_last_successful_transition_time(self) -> str | None: for condition in self.instance.status.conditions: if ( condition["type"] == self.Conditions.Type.AVAILABLE @@ -341,6 +341,7 @@ def _get_last_successful_transition_time(self) -> str: and condition["reason"] == self.Conditions.Reason.SUCCESSFULLY_CONFIGURED ): return condition["lastTransitionTime"] + return None @retry( wait_timeout=TIMEOUT_1MINUTE, @@ -355,6 +356,7 @@ def _wait_for_nncp_status_update(self, initial_transition_time: str) -> bool: and datetime.strptime(condition["lastTransitionTime"], date_format) > formatted_initial_transition_time ): return True + return False @property def status(self): From 477df8f02eb8a557abcc48e06a7ee2d5881ba9ca Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Wed, 19 Mar 2025 13:26:27 +0200 Subject: [PATCH 09/11] Add clarifying comment --- ocp_resources/node_network_configuration_policy.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index 56223db8bf..6a262f1cfe 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -323,6 +323,8 @@ def _absent_interface(self): if self.ports: self.add_ports() + # The time-stamp that is returned by _get_last_successful_transition_time() will chnage after the call to + # update(), therefore it must be fetched and stored before, and comapred with the new time-stamp after. initial_success_status_time = self._get_last_successful_transition_time() ResourceEditor( patches={self: {"spec": {"desiredState": {"interfaces": self.desired_state["interfaces"]}}}} From 0b1ce1af90638615bc7b3a6ac171165309f5dc36 Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Wed, 19 Mar 2025 14:00:06 +0200 Subject: [PATCH 10/11] Apply code-reabbit suggestions --- ocp_resources/node_network_configuration_policy.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index 6a262f1cfe..22d3986f46 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -323,8 +323,8 @@ def _absent_interface(self): if self.ports: self.add_ports() - # The time-stamp that is returned by _get_last_successful_transition_time() will chnage after the call to - # update(), therefore it must be fetched and stored before, and comapred with the new time-stamp after. + # The time-stamp that is returned by _get_last_successful_transition_time() will change after the call to + # update(), therefore it must be fetched and stored before, and compared with the new time-stamp after. initial_success_status_time = self._get_last_successful_transition_time() ResourceEditor( patches={self: {"spec": {"desiredState": {"interfaces": self.desired_state["interfaces"]}}}} @@ -355,6 +355,7 @@ def _wait_for_nncp_status_update(self, initial_transition_time: str) -> bool: for condition in self.instance.get("status", {}).get("conditions", []): if ( condition["type"] == self.Conditions.Type.AVAILABLE + and condition["status"] == Resource.Condition.Status.TRUE and datetime.strptime(condition["lastTransitionTime"], date_format) > formatted_initial_transition_time ): return True From b8fc77929245d7d982bb007486ee7a047668efc9 Mon Sep 17 00:00:00 2001 From: Yossi Segev Date: Wed, 19 Mar 2025 16:10:41 +0200 Subject: [PATCH 11/11] Fix comments --- ocp_resources/node_network_configuration_policy.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ocp_resources/node_network_configuration_policy.py b/ocp_resources/node_network_configuration_policy.py index 22d3986f46..5f16f2dd3f 100644 --- a/ocp_resources/node_network_configuration_policy.py +++ b/ocp_resources/node_network_configuration_policy.py @@ -323,15 +323,14 @@ def _absent_interface(self): if self.ports: self.add_ports() - # The time-stamp that is returned by _get_last_successful_transition_time() will change after the call to - # update(), therefore it must be fetched and stored before, and compared with the new time-stamp after. + # The current time-stamp of the NNCP's available status will change after the NNCP is updated, therefore + # it must be fetched and stored before the update, and compared with the new time-stamp after. initial_success_status_time = self._get_last_successful_transition_time() ResourceEditor( patches={self: {"spec": {"desiredState": {"interfaces": self.desired_state["interfaces"]}}}} ).update() - # If the NNCP failed on setup, then at this point it's not in AVAILABLE status, so the next time it will be in - # AVAILABLE status will necessarily be the updated one. + # If the NNCP failed on setup, then its tear-down AVAIALBLE status will necessarily be the first. if initial_success_status_time: self._wait_for_nncp_status_update(initial_transition_time=initial_success_status_time)