From e3647b0c31cbf90afdf00d4b092ecf18285b46a0 Mon Sep 17 00:00:00 2001 From: javvaji-devops Date: Wed, 6 May 2026 19:50:34 +0100 Subject: [PATCH 1/3] fix GCP spec gap --- .../providers/gcp/rules/ai/workbench_idle.py | 98 ++- .../gcp/ai/test_gcp_workbench_idle.py | 689 ++++++++++-------- 2 files changed, 484 insertions(+), 303 deletions(-) diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py index e906198..78f4572 100644 --- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py +++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py @@ -70,9 +70,13 @@ def find_idle_workbench_instances( Currently EMITTING_DISABLED: no qualifying canonical signal exists for per-instance kernel activity. updateTime and createTime MUST NOT be used - as idle signals. + as idle signals. Always returns an empty list. - Always returns an empty list until a qualifying signal is available. + Performs full candidate classification and surfaces results via warnings: + - PARTIAL scope: unreachable[] locations or discovery failures warn once. + - NOT_EVALUABLE: ACTIVE candidates with NO_SIGNAL warn with resource names. + - INVALID resources: malformed name or missing state; counted but not warned. + - OUT_OF_SCOPE: non-ACTIVE instances; excluded silently. IAM permissions required: notebooks.instances.list (roles/notebooks.viewer) @@ -81,7 +85,90 @@ def find_idle_workbench_instances( raise ValueError(f"idle_days must be >= 1, got {idle_days!r}") session = AuthorizedSession(credentials) - _list_instances(session, project_id) + raw_instances, unreachable_locations, discovery_failed = _list_instances( + session, project_id + ) + + # --- Partial scan scope --- + # The spec explicitly covers unreachable[] from the API as a PARTIAL trigger. + # discovery_failed (400/5xx/network) is an implementation extension: those + # errors make enumeration incomplete in the same way, so PARTIAL is the + # conservative and correct choice even though the spec does not spell it out. + if unreachable_locations or discovery_failed: + parts = [] + if unreachable_locations: + parts.append(f"unreachable locations: {unreachable_locations}") + if discovery_failed: + parts.append("discovery incomplete due to transport/server error") + warnings.warn( + f"gcp.vertex.workbench.idle: scan scope PARTIAL for project '{project_id}'" + f" — {'; '.join(parts)}", + UserWarning, + stacklevel=2, + ) + + # --- Classify resource records --- + # INVALID: resource name absent/malformed or state absent/empty — counted. + # OUT_OF_SCOPE: valid name+state but not ACTIVE — excluded silently. + excluded_invalid_count = 0 + candidate_resources: list[dict] = [] + + for raw in raw_instances: + name = (raw.get("name") or "").strip() + state = (raw.get("state") or "").strip() + + if not name or not _INSTANCE_NAME_RE.match(name): + excluded_invalid_count += 1 + continue + + if not state: + excluded_invalid_count += 1 + continue + + # OUT_OF_SCOPE: valid but not ACTIVE — excluded silently, not counted. + if state != "ACTIVE": + continue + + # location is segment index 3 of the validated name. + location = name.split("/")[3] + + # Region filter: exact string equality, no aliasing or case folding. + if region_filter and location != region_filter: + continue + + candidate_resources.append({"name": name, "location": location}) + + # --- INVALID count --- + # Surface the exact count of records excluded as INVALID so operators can + # detect data-quality issues without the rule silently discarding records. + if excluded_invalid_count: + warnings.warn( + f"gcp.vertex.workbench.idle: {excluded_invalid_count} resource record(s)" + f" in project '{project_id}' excluded as INVALID" + f" (malformed resource name or missing state field)", + UserWarning, + stacklevel=2, + ) + + # --- NOT_EVALUABLE: ACTIVE candidates exist but no canonical signal --- + # Warn with a capped list of resource names (first 5 + overflow count) so + # the warning stays readable even in large projects. + if candidate_resources: + _CAP = 5 + shown = [r["name"] for r in candidate_resources[:_CAP]] + overflow = len(candidate_resources) - _CAP + name_summary = ", ".join(shown) + if overflow > 0: + name_summary += f", ... (+{overflow} more)" + warnings.warn( + f"gcp.vertex.workbench.idle: {len(candidate_resources)} ACTIVE instance(s)" + f" in project '{project_id}' cannot be evaluated (NO_SIGNAL) —" + f" no qualifying canonical kernel-activity signal exists;" + f" rule is EMITTING_DISABLED. Instances: {name_summary}", + UserWarning, + stacklevel=2, + ) + return [] @@ -91,7 +178,7 @@ def find_idle_workbench_instances( def _list_instances( session: AuthorizedSession, project_id: str, -) -> tuple: +) -> tuple[list, list, bool]: """ List all Vertex AI Workbench instances across all locations using the v2 API. @@ -138,6 +225,9 @@ def _list_instances( ) if resp.status_code == 404: + # Implementation choice: 404 = Notebooks API not enabled for this + # project, meaning provably no instances exist. Treated as a clean + # empty scope (FULL, EVALUABLE). Not explicitly defined in the spec. return [], [], False if resp.status_code == 400: diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py index e7d9ffc..e453b18 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py @@ -2,31 +2,29 @@ Tests for gcp.vertex.workbench.idle rule. The rule is EMITTING_DISABLED and always returns an empty List[Finding]. -No qualifying canonical kernel-activity signal exists; updateTime, createTime, -age, and CPU utilization are all explicitly non-canonical. +Classification is performed and surfaced via UserWarning: + - PARTIAL scope: unreachable[] locations or discovery failures. + - NOT_EVALUABLE: ACTIVE candidates warn with resource names (NO_SIGNAL). + - INVALID: malformed name or missing state; counted, not warned. + - OUT_OF_SCOPE: non-ACTIVE instances; excluded silently. Coverage: - Public API (find_idle_workbench_instances): - - return type and value - - idle_days validation (zero, negative, boundary, error message) - - region_filter parameter accepted - - 403/404/400/5xx/network error handling - - warning type, message content (project, HTTP code, rule ID) - - Internal (_list_instances): - - empty response - - instance accumulation - - pagination over 2 and 3 pages - - pageToken forwarded on subsequent requests - - pageSize=100 in initial request - - unreachable[] collected and deduplicated across pages - - empty unreachable entries skipped - - 404 returns clean ([], [], False) - - 400 returns ([], [], True) - - 5xx sets discovery_failed; preserves already-fetched instances - - network error sets discovery_failed; preserves already-fetched instances - - 403 raises PermissionError - - URL contains project ID and locations/- wildcard + find_idle_workbench_instances (public API): + - always returns [] + - idle_days validation + - region_filter (exact match, no case folding) + - ACTIVE instances warn NOT_EVALUABLE/NO_SIGNAL + - unreachable locations warn PARTIAL scope + - discovery_failed warns PARTIAL scope + - INVALID resources (name/state) counted, not surfaced individually + - OUT_OF_SCOPE (non-ACTIVE) excluded silently + - 403/404/400/5xx/network HTTP error handling + + _list_instances (internal): + - pagination, pageToken forwarding, pageSize + - unreachable[] accumulation and deduplication + - 404/400/5xx/network/403 error handling + - URL shape (project ID, locations/- wildcard, v2) """ import warnings @@ -41,6 +39,8 @@ ) _PROJECT = "my-project" +_LOCATION = "us-central1" +_INSTANCE_NAME = f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-1" # --------------------------------------------------------------------------- @@ -49,7 +49,6 @@ def _ok(body: dict = None): - """Build a 200 response mock with the given JSON body.""" resp = MagicMock() resp.status_code = 200 resp.json.return_value = body or {} @@ -58,71 +57,84 @@ def _ok(body: dict = None): def _err(status_code: int): - """Build an error response mock with the given status code.""" resp = MagicMock() resp.status_code = status_code return resp def _session(*responses): - """Build a mock session whose .get() returns responses in order.""" mock = MagicMock() mock.get.side_effect = list(responses) return mock -def _invoke(**kwargs): +def _run(instances, unreachable=None, discovery_failed=False, **kwargs): """ - Call find_idle_workbench_instances with a default 200/empty mock session. - Extra kwargs are forwarded to the rule function. + Invoke find_idle_workbench_instances with _list_instances patched to return + (instances, unreachable, discovery_failed). """ with patch( - "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", - return_value=_session(_ok()), + "cleancloud.providers.gcp.rules.ai.workbench_idle._list_instances", + return_value=(instances, unreachable or [], discovery_failed), ): - return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs) + return find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), **kwargs + ) + + +def _run_with_warnings(instances, unreachable=None, discovery_failed=False, **kwargs): + """Like _run but also captures warnings.""" + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + result = _run(instances, unreachable, discovery_failed, **kwargs) + return result, [w for w in caught if issubclass(w.category, UserWarning)] + +def _active(name=_INSTANCE_NAME): + return {"name": name, "state": "ACTIVE"} -def _invoke_with_session(mock_session, **kwargs): - """Call find_idle_workbench_instances with a custom session mock.""" + +def _invoke_http(mock_session, **kwargs): with patch( "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", return_value=mock_session, ): - return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs) + return find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), **kwargs + ) + + +def _invoke_http_with_warnings(mock_session, **kwargs): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + result = _invoke_http(mock_session, **kwargs) + return result, [w for w in caught if issubclass(w.category, UserWarning)] # --------------------------------------------------------------------------- -# Return type and value +# Return value — always [] # --------------------------------------------------------------------------- class TestReturnValue: def test_returns_list(self): - assert isinstance(_invoke(), list) - - def test_always_empty(self): - assert _invoke() == [] - - def test_empty_when_api_returns_active_instances(self): - """EMITTING_DISABLED: ACTIVE instances in API response still yield no findings.""" - inst = { - "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1", - "state": "ACTIVE", - } - result = _invoke_with_session(_session(_ok({"instances": [inst]}))) - assert result == [] + assert isinstance(_run([]), list) + + def test_always_empty_no_instances(self): + assert _run([]) == [] + + def test_always_empty_with_active_instances(self): + assert _run([_active()]) == [] - def test_empty_when_api_returns_multiple_instances(self): + def test_always_empty_with_multiple_active(self): instances = [ - { - "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}", - "state": "ACTIVE", - } + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(5) ] - result = _invoke_with_session(_session(_ok({"instances": instances}))) - assert result == [] + assert _run(instances) == [] + + def test_always_empty_with_stopped_instances(self): + assert _run([{"name": _INSTANCE_NAME, "state": "STOPPED"}]) == [] # --------------------------------------------------------------------------- @@ -131,20 +143,16 @@ def test_empty_when_api_returns_multiple_instances(self): class TestIdleDaysValidation: - def test_zero_raises_value_error(self): - with pytest.raises(ValueError, match="idle_days must be >= 1"): - find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), idle_days=0) - - def test_negative_one_raises(self): + def test_zero_raises(self): with pytest.raises(ValueError, match="idle_days must be >= 1"): find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), idle_days=-1 + project_id=_PROJECT, credentials=MagicMock(), idle_days=0 ) - def test_large_negative_raises(self): + def test_negative_raises(self): with pytest.raises(ValueError, match="idle_days must be >= 1"): find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), idle_days=-999 + project_id=_PROJECT, credentials=MagicMock(), idle_days=-1 ) def test_error_message_includes_bad_value(self): @@ -154,138 +162,316 @@ def test_error_message_includes_bad_value(self): ) def test_one_is_valid(self): - assert _invoke(idle_days=1) == [] + assert _run([], idle_days=1) == [] def test_default_14_is_valid(self): - assert _invoke() == [] + assert _run([]) == [] def test_large_value_is_valid(self): - assert _invoke(idle_days=365) == [] + assert _run([], idle_days=365) == [] + + +# --------------------------------------------------------------------------- +# NOT_EVALUABLE / NO_SIGNAL warning for ACTIVE candidates +# --------------------------------------------------------------------------- + + +class TestNotEvaluableWarning: + def test_active_instance_emits_warning(self): + _, warns = _run_with_warnings([_active()]) + assert len(warns) == 1 + + def test_warning_is_user_warning(self): + _, warns = _run_with_warnings([_active()]) + assert issubclass(warns[0].category, UserWarning) + + def test_warning_mentions_no_signal(self): + _, warns = _run_with_warnings([_active()]) + assert "NO_SIGNAL" in str(warns[0].message) + + def test_warning_mentions_emitting_disabled(self): + _, warns = _run_with_warnings([_active()]) + assert "EMITTING_DISABLED" in str(warns[0].message) + + def test_warning_mentions_project(self): + _, warns = _run_with_warnings([_active()]) + assert _PROJECT in str(warns[0].message) + + def test_warning_mentions_resource_name(self): + _, warns = _run_with_warnings([_active()]) + assert _INSTANCE_NAME in str(warns[0].message) + + def test_warning_mentions_count(self): + instances = [ + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") + for i in range(3) + ] + _, warns = _run_with_warnings(instances) + assert "3" in str(warns[0].message) + + def test_warning_mentions_all_resource_names_when_under_cap(self): + inst1 = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-1") + inst2 = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-2") + _, warns = _run_with_warnings([inst1, inst2]) + msg = str(warns[0].message) + assert "wb-1" in msg + assert "wb-2" in msg + + def test_warning_caps_name_list_at_five(self): + instances = [ + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") + for i in range(8) + ] + _, warns = _run_with_warnings(instances) + msg = str(warns[0].message) + assert "+3 more" in msg + + def test_warning_no_overflow_label_when_at_cap(self): + instances = [ + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") + for i in range(5) + ] + _, warns = _run_with_warnings(instances) + assert "more" not in str(warns[0].message) + + def test_no_warning_when_no_active_instances(self): + _, warns = _run_with_warnings([]) + assert len(warns) == 0 + + def test_no_warning_for_stopped_instances(self): + _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "STOPPED"}]) + assert len(warns) == 0 + + +# --------------------------------------------------------------------------- +# PARTIAL scope warning — unreachable locations +# --------------------------------------------------------------------------- + + +class TestPartialScopeUnreachable: + def test_unreachable_location_emits_warning(self): + _, warns = _run_with_warnings([], unreachable=["asia-east1"]) + assert len(warns) == 1 + + def test_warning_mentions_partial(self): + _, warns = _run_with_warnings([], unreachable=["asia-east1"]) + assert "PARTIAL" in str(warns[0].message) + + def test_warning_mentions_unreachable_location(self): + _, warns = _run_with_warnings([], unreachable=["asia-east1"]) + assert "asia-east1" in str(warns[0].message) + + def test_warning_mentions_project(self): + _, warns = _run_with_warnings([], unreachable=["us-east1"]) + assert _PROJECT in str(warns[0].message) + + def test_multiple_unreachable_locations_in_single_warning(self): + _, warns = _run_with_warnings([], unreachable=["asia-east1", "europe-west3"]) + assert len(warns) == 1 + msg = str(warns[0].message) + assert "asia-east1" in msg + assert "europe-west3" in msg + + def test_no_partial_warning_when_no_unreachable(self): + _, warns = _run_with_warnings([]) + assert len(warns) == 0 + + +# --------------------------------------------------------------------------- +# PARTIAL scope warning — discovery_failed +# --------------------------------------------------------------------------- + + +class TestPartialScopeDiscoveryFailed: + def test_discovery_failed_emits_warning(self): + _, warns = _run_with_warnings([], discovery_failed=True) + assert len(warns) == 1 + + def test_warning_mentions_partial(self): + _, warns = _run_with_warnings([], discovery_failed=True) + assert "PARTIAL" in str(warns[0].message) + + def test_warning_mentions_project(self): + _, warns = _run_with_warnings([], discovery_failed=True) + assert _PROJECT in str(warns[0].message) + + def test_both_partial_and_not_evaluable_can_warn(self): + """ACTIVE instances + unreachable locations = two warnings.""" + _, warns = _run_with_warnings([_active()], unreachable=["asia-east1"]) + assert len(warns) == 2 + + def test_discovery_failed_and_active_instances_both_warn(self): + _, warns = _run_with_warnings([_active()], discovery_failed=True) + assert len(warns) == 2 + + +# --------------------------------------------------------------------------- +# INVALID resource classification +# --------------------------------------------------------------------------- + + +class TestInvalidResources: + def test_empty_name_emits_invalid_warning(self): + inst = {"name": "", "state": "ACTIVE"} + _, warns = _run_with_warnings([inst]) + assert any("INVALID" in str(w.message) for w in warns) + + def test_missing_name_emits_invalid_warning(self): + inst = {"state": "ACTIVE"} + _, warns = _run_with_warnings([inst]) + assert any("INVALID" in str(w.message) for w in warns) + + def test_wrong_pattern_emits_invalid_warning(self): + inst = {"name": "projects/p/locations/l/wrongType/id", "state": "ACTIVE"} + _, warns = _run_with_warnings([inst]) + assert any("INVALID" in str(w.message) for w in warns) + + def test_extra_path_segments_emits_invalid_warning(self): + inst = {"name": f"{_INSTANCE_NAME}/extra", "state": "ACTIVE"} + _, warns = _run_with_warnings([inst]) + assert any("INVALID" in str(w.message) for w in warns) + + def test_empty_state_emits_invalid_warning(self): + inst = {"name": _INSTANCE_NAME, "state": ""} + _, warns = _run_with_warnings([inst]) + assert any("INVALID" in str(w.message) for w in warns) + + def test_missing_state_emits_invalid_warning(self): + inst = {"name": _INSTANCE_NAME} + _, warns = _run_with_warnings([inst]) + assert any("INVALID" in str(w.message) for w in warns) + + def test_invalid_warning_mentions_count(self): + bad1 = {"name": "", "state": "ACTIVE"} + bad2 = {"name": _INSTANCE_NAME, "state": ""} + _, warns = _run_with_warnings([bad1, bad2]) + invalid_warn = next(w for w in warns if "INVALID" in str(w.message)) + assert "2" in str(invalid_warn.message) + + def test_invalid_warning_mentions_project(self): + inst = {"name": "", "state": "ACTIVE"} + _, warns = _run_with_warnings([inst]) + invalid_warn = next(w for w in warns if "INVALID" in str(w.message)) + assert _PROJECT in str(invalid_warn.message) + + def test_no_invalid_warning_when_all_valid(self): + _, warns = _run_with_warnings([_active()]) + assert not any("INVALID" in str(w.message) for w in warns) + + def test_invalid_not_in_not_evaluable_candidates(self): + """INVALID records must not appear in the NOT_EVALUABLE warning.""" + bad = {"name": "", "state": "ACTIVE"} + good = _active() + _, warns = _run_with_warnings([bad, good]) + not_eval_warn = next(w for w in warns if "NO_SIGNAL" in str(w.message)) + assert _INSTANCE_NAME in str(not_eval_warn.message) + # count should be 1 (only the valid ACTIVE instance) + assert "1 ACTIVE" in str(not_eval_warn.message) + + +# --------------------------------------------------------------------------- +# OUT_OF_SCOPE resources — non-ACTIVE states +# --------------------------------------------------------------------------- + + +class TestOutOfScope: + def test_stopped_excluded_silently(self): + _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "STOPPED"}]) + assert len(warns) == 0 + + def test_suspended_excluded_silently(self): + _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "SUSPENDED"}]) + assert len(warns) == 0 + + def test_provisioning_excluded_silently(self): + _, warns = _run_with_warnings( + [{"name": _INSTANCE_NAME, "state": "PROVISIONING"}] + ) + assert len(warns) == 0 + + def test_mixed_active_and_stopped_only_active_warned(self): + active = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/active") + stopped = {"name": f"projects/{_PROJECT}/locations/{_LOCATION}/instances/stopped", + "state": "STOPPED"} + _, warns = _run_with_warnings([active, stopped]) + assert len(warns) == 1 + msg = str(warns[0].message) + assert "active" in msg + assert "stopped" not in msg # --------------------------------------------------------------------------- -# region_filter parameter +# Region filter — exact string equality, no case folding # --------------------------------------------------------------------------- class TestRegionFilter: - def test_region_filter_string_accepted(self): - assert _invoke(region_filter="us-central1") == [] + def test_matching_region_produces_warning(self): + _, warns = _run_with_warnings([_active()], region_filter=_LOCATION) + assert len(warns) == 1 + + def test_non_matching_region_excluded(self): + _, warns = _run_with_warnings([_active()], region_filter="europe-west1") + assert len(warns) == 0 - def test_region_filter_none_accepted(self): - assert _invoke(region_filter=None) == [] + def test_uppercase_region_does_not_match(self): + """Exact equality — 'US-CENTRAL1' must NOT match 'us-central1'.""" + _, warns = _run_with_warnings([_active()], region_filter="US-CENTRAL1") + assert len(warns) == 0 + + def test_none_region_filter_includes_all(self): + inst1 = _active("projects/p/locations/europe-west1/instances/i1") + inst2 = _active("projects/p/locations/us-central1/instances/i2") + _, warns = _run_with_warnings([inst1, inst2], region_filter=None) + assert len(warns) == 1 + msg = str(warns[0].message) + assert "i1" in msg + assert "i2" in msg # --------------------------------------------------------------------------- -# HTTP error handling via public API +# HTTP error handling (via AuthorizedSession) # --------------------------------------------------------------------------- class TestHttpErrors: def test_403_raises_permission_error(self): - with pytest.raises(PermissionError): - _invoke_with_session(_session(_err(403))) - - def test_403_message_mentions_permission(self): with pytest.raises(PermissionError, match="notebooks.instances.list"): - _invoke_with_session(_session(_err(403))) + _invoke_http(_session(_err(403))) - def test_403_message_mentions_role(self): + def test_403_mentions_role(self): with pytest.raises(PermissionError, match="roles/notebooks.viewer"): - _invoke_with_session(_session(_err(403))) - - def test_404_returns_empty_list(self): - assert _invoke_with_session(_session(_err(404))) == [] - - def test_404_no_warning_emitted(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(404))) - assert not any(issubclass(w.category, UserWarning) for w in caught) - - def test_400_returns_empty_list(self): - assert _invoke_with_session(_session(_err(400))) == [] - - def test_400_emits_user_warning(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(400))) - assert any(issubclass(w.category, UserWarning) for w in caught) - - def test_400_warning_mentions_status_code(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(400))) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert "400" in msgs - - def test_400_warning_mentions_project(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(400))) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert _PROJECT in msgs - - def test_500_returns_empty_list(self): - assert _invoke_with_session(_session(_err(500))) == [] - - def test_500_emits_user_warning(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(500))) - assert any(issubclass(w.category, UserWarning) for w in caught) - - def test_500_warning_mentions_status_code(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(500))) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert "500" in msgs - - def test_503_warning_mentions_status_code(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(503))) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert "503" in msgs - - def test_5xx_warning_mentions_project(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(_session(_err(500))) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert _PROJECT in msgs - - def test_network_error_returns_empty_list(self): - session = MagicMock() - session.get.side_effect = ConnectionError("timeout") - assert _invoke_with_session(session) == [] + _invoke_http(_session(_err(403))) - def test_network_error_emits_user_warning(self): - session = MagicMock() - session.get.side_effect = ConnectionError("timeout") - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(session) - assert any(issubclass(w.category, UserWarning) for w in caught) + def test_404_returns_empty_no_partial_warning(self): + result, warns = _invoke_http_with_warnings(_session(_err(404))) + assert result == [] + assert len(warns) == 0 - def test_network_error_warning_mentions_project(self): - session = MagicMock() - session.get.side_effect = OSError("no route to host") - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(session) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert _PROJECT in msgs - - def test_network_error_warning_mentions_exception_type(self): + def test_400_returns_empty_with_partial_warning(self): + result, warns = _invoke_http_with_warnings(_session(_err(400))) + assert result == [] + # _list_instances warns about 400, caller warns about PARTIAL + msgs = " ".join(str(w.message) for w in warns) + assert "400" in msgs or "PARTIAL" in msgs + + def test_500_returns_empty_with_warning(self): + result, warns = _invoke_http_with_warnings(_session(_err(500))) + assert result == [] + msgs = " ".join(str(w.message) for w in warns) + assert "500" in msgs or "PARTIAL" in msgs + + def test_503_returns_empty_with_warning(self): + result, warns = _invoke_http_with_warnings(_session(_err(503))) + assert result == [] + assert len(warns) >= 1 + + def test_network_error_returns_empty_with_warning(self): session = MagicMock() - session.get.side_effect = ConnectionError("dropped") - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _invoke_with_session(session) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert "ConnectionError" in msgs + session.get.side_effect = ConnectionError("timeout") + result, warns = _invoke_http_with_warnings(session) + assert result == [] + assert len(warns) >= 1 # --------------------------------------------------------------------------- @@ -296,52 +482,38 @@ def test_network_error_warning_mentions_exception_type(self): class TestListInstancesBasic: def test_empty_response(self): instances, unreachable, failed = _list_instances(_session(_ok()), _PROJECT) - assert instances == [] - assert unreachable == [] - assert failed is False + assert instances == [] and unreachable == [] and failed is False def test_instances_returned(self): - inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + inst = {"name": _INSTANCE_NAME, "state": "ACTIVE"} instances, _, _ = _list_instances(_session(_ok({"instances": [inst]})), _PROJECT) assert instances == [inst] - def test_multiple_instances_in_single_page(self): - inst_list = [ - {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"} - for i in range(3) - ] - instances, _, _ = _list_instances(_session(_ok({"instances": inst_list})), _PROJECT) - assert instances == inst_list - def test_page_size_100_in_initial_request(self): session = _session(_ok()) _list_instances(session, _PROJECT) - params = session.get.call_args.kwargs["params"] - assert params["pageSize"] == 100 + assert session.get.call_args.kwargs["params"]["pageSize"] == 100 def test_url_contains_project_id(self): session = _session(_ok()) _list_instances(session, "target-project-xyz") - url = session.get.call_args.args[0] - assert "target-project-xyz" in url + assert "target-project-xyz" in session.get.call_args.args[0] def test_url_uses_wildcard_location(self): session = _session(_ok()) _list_instances(session, _PROJECT) - url = session.get.call_args.args[0] - assert "locations/-" in url + assert "locations/-" in session.get.call_args.args[0] def test_url_uses_v2_api(self): session = _session(_ok()) _list_instances(session, _PROJECT) - url = session.get.call_args.args[0] - assert "/v2/" in url + assert "/v2/" in session.get.call_args.args[0] class TestListInstancesPagination: def test_two_pages_accumulates_instances(self): - inst1 = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} - inst2 = {"name": "projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"} + inst1 = {"name": f"projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + inst2 = {"name": f"projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"} session = _session( _ok({"instances": [inst1], "nextPageToken": "tok1"}), _ok({"instances": [inst2]}), @@ -350,62 +522,31 @@ def test_two_pages_accumulates_instances(self): assert instances == [inst1, inst2] def test_three_pages_all_accumulated(self): - def _inst(i): - return {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"} - - session = _session( - _ok({"instances": [_inst(1)], "nextPageToken": "t1"}), - _ok({"instances": [_inst(2)], "nextPageToken": "t2"}), - _ok({"instances": [_inst(3)]}), - ) - instances, _, _ = _list_instances(session, _PROJECT) + pages = [ + _ok({"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}], + "nextPageToken": f"t{i}"} if i < 2 else + {"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}]}) + for i in range(3) + ] + instances, _, _ = _list_instances(_session(*pages), _PROJECT) assert len(instances) == 3 def test_page_token_forwarded_on_second_request(self): - session = _session( - _ok({"nextPageToken": "tok-abc"}), - _ok({}), - ) - _list_instances(session, _PROJECT) - second_params = session.get.call_args_list[1].kwargs["params"] - assert second_params.get("pageToken") == "tok-abc" - - def test_page_token_forwarded_on_third_request(self): - session = _session( - _ok({"nextPageToken": "t1"}), - _ok({"nextPageToken": "t2"}), - _ok({}), - ) + session = _session(_ok({"nextPageToken": "tok-abc"}), _ok({})) _list_instances(session, _PROJECT) - third_params = session.get.call_args_list[2].kwargs["params"] - assert third_params.get("pageToken") == "t2" + assert session.get.call_args_list[1].kwargs["params"]["pageToken"] == "tok-abc" def test_stops_when_no_next_token(self): session = _session(_ok({})) _list_instances(session, _PROJECT) assert session.get.call_count == 1 - def test_exactly_two_calls_for_two_pages(self): - session = _session( - _ok({"nextPageToken": "t1"}), - _ok({}), - ) - _list_instances(session, _PROJECT) - assert session.get.call_count == 2 - class TestListInstancesUnreachable: - def test_single_unreachable_location_collected(self): - session = _session(_ok({"unreachable": ["asia-east1"]})) - _, unreachable, _ = _list_instances(session, _PROJECT) + def test_unreachable_location_collected(self): + _, unreachable, _ = _list_instances(_session(_ok({"unreachable": ["asia-east1"]})), _PROJECT) assert "asia-east1" in unreachable - def test_multiple_unreachable_locations(self): - session = _session(_ok({"unreachable": ["asia-east1", "europe-west3"]})) - _, unreachable, _ = _list_instances(session, _PROJECT) - assert "asia-east1" in unreachable - assert "europe-west3" in unreachable - def test_unreachable_deduplicated_across_pages(self): session = _session( _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}), @@ -415,24 +556,10 @@ def test_unreachable_deduplicated_across_pages(self): assert unreachable.count("asia-east1") == 1 def test_empty_string_in_unreachable_skipped(self): - session = _session(_ok({"unreachable": ["", "us-east1"]})) - _, unreachable, _ = _list_instances(session, _PROJECT) - assert "" not in unreachable - assert "us-east1" in unreachable - - def test_no_unreachable_when_field_absent(self): - session = _session(_ok({})) - _, unreachable, _ = _list_instances(session, _PROJECT) - assert unreachable == [] - - def test_unreachable_from_multiple_pages_merged(self): - session = _session( - _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}), - _ok({"unreachable": ["europe-west3"]}), + _, unreachable, _ = _list_instances( + _session(_ok({"unreachable": ["", "us-east1"]})), _PROJECT ) - _, unreachable, _ = _list_instances(session, _PROJECT) - assert "asia-east1" in unreachable - assert "europe-west3" in unreachable + assert "" not in unreachable and "us-east1" in unreachable class TestListInstancesErrors: @@ -440,36 +567,26 @@ def test_403_raises_permission_error(self): with pytest.raises(PermissionError, match="notebooks.instances.list"): _list_instances(_session(_err(403)), _PROJECT) - def test_404_returns_empty_clean(self): + def test_404_returns_clean_empty(self): instances, unreachable, failed = _list_instances(_session(_err(404)), _PROJECT) - assert instances == [] - assert unreachable == [] - assert failed is False - - def test_400_returns_empty_with_discovery_failed(self): - instances, unreachable, failed = _list_instances(_session(_err(400)), _PROJECT) - assert instances == [] - assert unreachable == [] + assert instances == [] and unreachable == [] and failed is False + + def test_400_sets_discovery_failed(self): + _, _, failed = _list_instances(_session(_err(400)), _PROJECT) assert failed is True def test_500_sets_discovery_failed(self): _, _, failed = _list_instances(_session(_err(500)), _PROJECT) assert failed is True - def test_503_sets_discovery_failed(self): - _, _, failed = _list_instances(_session(_err(503)), _PROJECT) - assert failed is True - - def test_5xx_preserves_instances_from_earlier_pages(self): - """Instances already fetched before a 5xx error must be returned.""" - inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + def test_5xx_preserves_earlier_instances(self): + inst = {"name": _INSTANCE_NAME, "state": "ACTIVE"} session = _session( _ok({"instances": [inst], "nextPageToken": "t1"}), _err(503), ) instances, _, failed = _list_instances(session, _PROJECT) - assert instances == [inst] - assert failed is True + assert instances == [inst] and failed is True def test_network_error_sets_discovery_failed(self): session = MagicMock() @@ -478,40 +595,14 @@ def test_network_error_sets_discovery_failed(self): assert failed is True def test_network_error_preserves_earlier_instances(self): - inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} - session = _session( - _ok({"instances": [inst], "nextPageToken": "t1"}), - ) + inst = {"name": _INSTANCE_NAME, "state": "ACTIVE"} + session = MagicMock() session.get.side_effect = [ _ok({"instances": [inst], "nextPageToken": "t1"}), ConnectionError("dropped"), ] instances, _, failed = _list_instances(session, _PROJECT) - assert instances == [inst] - assert failed is True - - def test_400_emits_warning_with_project(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _list_instances(_session(_err(400)), _PROJECT) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert _PROJECT in msgs - - def test_500_emits_warning_with_status_code(self): - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _list_instances(_session(_err(500)), _PROJECT) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert "500" in msgs - - def test_network_error_emits_warning_with_project(self): - session = MagicMock() - session.get.side_effect = OSError("no route to host") - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - _list_instances(session, _PROJECT) - msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) - assert _PROJECT in msgs + assert instances == [inst] and failed is True # --------------------------------------------------------------------------- @@ -532,5 +623,5 @@ def test_service(self): def test_cost_impact(self): assert RULE_METADATA["cost_impact"] == "high" - def test_rule_id_attribute_on_function(self): + def test_rule_id_attribute(self): assert find_idle_workbench_instances.RULE_ID == "gcp.vertex.workbench.idle" From 099f542e2adbdbc0be9e2c586b36a7e3689b94c2 Mon Sep 17 00:00:00 2001 From: javvaji-devops Date: Wed, 6 May 2026 21:44:50 +0100 Subject: [PATCH 2/3] aws.ec2.gpu.idle --- .../providers/aws/rules/ai/ec2_gpu_idle.py | 153 ++++--- .../providers/gcp/rules/ai/workbench_idle.py | 49 +- docs/rules/aws.md | 8 +- docs/specs/aws/ai/ec2_gpu_idle.md | 433 ++++++++++++++++++ .../providers/aws/ai/test_aws_ec2_gpu_idle.py | 239 +++++++++- .../gcp/ai/test_gcp_workbench_idle.py | 133 ++++-- 6 files changed, 884 insertions(+), 131 deletions(-) create mode 100644 docs/specs/aws/ai/ec2_gpu_idle.md diff --git a/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py b/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py index 818a152..66e4dea 100644 --- a/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py +++ b/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py @@ -132,7 +132,7 @@ } _DEFAULT_MONTHLY_COST = 600.0 -# GPU utilisation thresholds +# GPU utilization thresholds _GPU_UTIL_THRESHOLD_PCT = 5.0 # below this = idle (when GPU metric available) _CPU_UTIL_THRESHOLD_PCT = 10.0 # below this = idle (CPU fallback) @@ -151,7 +151,7 @@ def find_idle_gpu_instances( cpu_threshold: float = _CPU_UTIL_THRESHOLD_PCT, ) -> List[Finding]: """ - Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilisation. + Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilization. GPU instances (raw EC2, outside SageMaker) incur continuous charges while running regardless of whether GPUs are being utilised. A p4d.24xlarge costs ~$23K/month @@ -162,15 +162,19 @@ def find_idle_gpu_instances( - Instance state is running - Instance type is a known GPU/accelerator family - Instance is older than idle_days (avoids flagging newly launched instances) - - GPU utilisation < gpu_threshold % over idle_days (HIGH confidence, when NVIDIA - CloudWatch agent publishes nvidia_smi_utilization_gpu under CWAgent namespace) - - OR CPU utilisation < cpu_threshold % over idle_days (MEDIUM confidence fallback, + - GPU utilization < gpu_threshold % over idle_days (HIGH confidence, when the + nvidia_smi_utilization_gpu metric is discoverable for the instance in CloudWatch) + - OR CPU utilization < cpu_threshold % over idle_days (MEDIUM confidence fallback, used when GPU metrics are not available — CPU alone is a weaker signal) GPU metric detection: - The NVIDIA CloudWatch agent publishes nvidia_smi_utilization_gpu under the CWAgent - namespace with an InstanceId dimension. Availability is probed via ListMetrics per - instance — not assumed. Instances without the agent fall back to CPU utilisation. + The rule probes CloudWatch ListMetrics for nvidia_smi_utilization_gpu in the CWAgent + namespace, filtered by InstanceId dimension. This depends on the CloudWatch agent + being installed and configured to append EC2 instance dimensions (e.g. via + append_dimensions = {"InstanceId": ...}). AWS does not guarantee the InstanceId + dimension is present by default; its presence is implementation-dependent. If the + metric is absent or the agent is misconfigured, the rule falls back to CPU + utilization. Absence of the metric is NOT proof the GPU is idle. Multi-GPU handling: For multi-GPU instances (e.g., p4d.24xlarge has 8 A100s), the MAX statistic is @@ -178,8 +182,8 @@ def find_idle_gpu_instances( would be averaged away using AVG, producing a misleadingly low reading. Confidence: - - HIGH: GPU metric available AND max GPU utilisation < gpu_threshold over idle_days - - MEDIUM: GPU metric unavailable, CPU utilisation < cpu_threshold over idle_days + - HIGH: GPU metric discoverable AND max GPU utilization < gpu_threshold over idle_days + - MEDIUM: GPU metric not discoverable; CPU utilization < cpu_threshold over idle_days IAM permissions: - ec2:DescribeInstances @@ -208,53 +212,62 @@ def find_idle_gpu_instances( if not _is_gpu_instance(instance_type): continue - instance_id = inst["InstanceId"] + # Normalize InstanceId — skip if missing or empty (spec section 5) + instance_id = (inst.get("InstanceId") or "").strip() + if not instance_id: + continue + tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])} # "spot" | "scheduled" | None (on-demand) instance_lifecycle = inst.get("InstanceLifecycle") purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand" - launch_time = inst.get("LaunchTime") - - age_days: Optional[int] = None - if launch_time: - if launch_time.tzinfo is None: - launch_time = launch_time.replace(tzinfo=timezone.utc) - age_days = (now - launch_time).days - # Skip instances younger than idle_days — too new to classify - if age_days is not None and age_days < idle_days: + # Normalize LaunchTime — skip if missing, naive, or future (spec section 5, section 8.3) + launch_time = inst.get("LaunchTime") + if not launch_time: + continue # missing LaunchTime → SKIP ITEM + if launch_time.tzinfo is None: + continue # naive timestamp is not tz-aware UTC → SKIP ITEM + age_days = (now - launch_time).days + if age_days < 0: + continue # future LaunchTime → SKIP ITEM + + # Skip instances younger than effective_idle_days — too new to classify + if age_days < idle_days: continue # Probe for GPU metrics — single ListMetrics call reused for stats gpu_metrics = _list_gpu_metrics(cloudwatch, instance_id) if gpu_metrics: - max_gpu_util = _get_max_gpu_utilisation( + max_gpu_util = _get_max_gpu_utilization( cloudwatch, gpu_metrics, idle_days, now ) if max_gpu_util is None or max_gpu_util >= gpu_threshold: continue confidence = ConfidenceLevel.HIGH - idle_signal = "gpu_utilisation" + idle_signal = "gpu_utilization" util_value = max_gpu_util - util_label = f"Max GPU utilisation: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)" + util_label = f"Max GPU utilization: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)" else: - avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now) - if avg_cpu is None or avg_cpu >= cpu_threshold: + max_cpu = _get_max_daily_cpu_utilization( + cloudwatch, instance_id, idle_days, now + ) + if max_cpu is None or max_cpu >= cpu_threshold: continue # CPU fallback is a weak heuristic for GPU workloads: - # accelerator utilisation is invisible to CPU metrics, so a GPU + # accelerator utilization is invisible to CPU metrics, so a GPU # instance running a compute-bound model can show near-zero CPU # while doing real work. Confidence is capped at MEDIUM to reflect # this limitation. Absence of the CWAgent GPU metric is NOT proof - # that the GPU is idle — the agent may simply not be installed. + # that the GPU is idle — the agent may be absent or misconfigured. confidence = ConfidenceLevel.MEDIUM - idle_signal = "cpu_utilisation_fallback" - util_value = avg_cpu + idle_signal = "cpu_utilization_fallback" + util_value = max_cpu util_label = ( - f"Peak daily CPU utilisation: {avg_cpu:.1f}% " + f"Peak daily CPU utilization: {max_cpu:.1f}% " f"(threshold: {cpu_threshold}%) — " - f"heuristic only; GPU/accelerator utilisation not directly measured" + f"heuristic only; GPU/accelerator utilization not directly measured" ) monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST) @@ -272,15 +285,14 @@ def find_idle_gpu_instances( f"Instance type: {instance_type} (GPU/accelerator family)", f"Purchasing model: {purchasing_model}", util_label, + f"Instance age: {age_days} days", ] - if age_days is not None: - signals.append(f"Instance age: {age_days} days") if not gpu_metrics: if _is_neuron_instance(instance_type): signals.append( "Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not " "applicable; CPU used as heuristic fallback; confidence MEDIUM. " - "Neuron utilisation requires AWS Neuron SDK metrics." + "Neuron utilization requires AWS Neuron SDK metrics." ) else: signals.append( @@ -289,11 +301,18 @@ def find_idle_gpu_instances( "the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM." ) + # signals_not_checked: GPU note only applies on CPU fallback path (spec section 11.1) not_checked = [ - "GPU/accelerator utilisation (not directly measurable without CWAgent)", "Scheduled batch jobs that run outside the observation window", "Planned future use", ] + if not gpu_metrics: + not_checked.insert( + 0, + "Direct GPU/accelerator utilization — nvidia_smi_utilization_gpu was not " + "discoverable in CloudWatch (CWAgent may be absent or not configured with " + "InstanceId dimension); absence of the metric does not confirm the GPU is idle", + ) if purchasing_model == "spot": not_checked.append( "Spot interruption history — Spot instances may appear idle " @@ -307,6 +326,17 @@ def find_idle_gpu_instances( ) metric_label = "GPU" if gpu_metrics else "CPU (fallback)" + if gpu_metrics: + reason = ( + f"GPU EC2 instance has low GPU utilization " + f"({util_value:.1f}%) over {idle_days} days" + ) + else: + reason = ( + f"GPU EC2 instance shows low CPU proxy signal " + f"({util_value:.1f}%) over {idle_days} days — " + f"GPU activity not directly measured" + ) findings.append( Finding( provider="aws", @@ -316,22 +346,18 @@ def find_idle_gpu_instances( region=region, estimated_monthly_cost_usd=monthly_cost, title=( - f"Idle GPU EC2 Instance ({metric_label} utilisation " + f"Idle GPU EC2 Instance ({metric_label} utilization " f"<{gpu_threshold if gpu_metrics else cpu_threshold}% " f"over {idle_days} days)" ), summary=( f"EC2 instance '{name_tag}' ({instance_type}) has had " - f"{'GPU' if gpu_metrics else 'CPU'} utilisation below " + f"{'GPU' if gpu_metrics else 'CPU'} utilization below " f"{gpu_threshold if gpu_metrics else cpu_threshold}% " f"for {idle_days} days while running, incurring " f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)." ), - reason=( - f"GPU EC2 instance has low " - f"{'GPU' if gpu_metrics else 'CPU'} utilisation " - f"({util_value:.1f}%) for {idle_days} days" - ), + reason=reason, risk=risk, confidence=confidence, detected_at=now, @@ -340,11 +366,11 @@ def find_idle_gpu_instances( "instance_id": instance_id, "instance_type": instance_type, "name": name_tag, - "age_days": (age_days if age_days is not None else "unknown"), + "age_days": age_days, "idle_days_threshold": idle_days, "idle_ratio": idle_ratio, "idle_signal": idle_signal, - "utilisation_pct": round(util_value, 2), + "utilization_pct": round(util_value, 2), "purchasing_model": purchasing_model, "gpu_metric_available": bool(gpu_metrics), "gpu_metric_note": ( @@ -387,25 +413,34 @@ def _list_gpu_metrics(cloudwatch, instance_id: str) -> list: """ Probe CloudWatch ListMetrics for nvidia_smi_utilization_gpu under CWAgent namespace. - Returns the Metrics list (one entry per GPU index) so the caller can reuse it + Exhausts pagination via NextToken (spec section 2 key fact 6: ListMetrics returns up to + 500 results per call). Returns all Metrics entries so the caller can reuse them for GetMetricStatistics without a second ListMetrics call. Returns [] on any error. """ + metrics: list = [] + kwargs: dict = { + "Namespace": "CWAgent", + "MetricName": "nvidia_smi_utilization_gpu", + "Dimensions": [{"Name": "InstanceId", "Value": instance_id}], + } try: - resp = cloudwatch.list_metrics( - Namespace="CWAgent", - MetricName="nvidia_smi_utilization_gpu", - Dimensions=[{"Name": "InstanceId", "Value": instance_id}], - ) - return resp.get("Metrics", []) + while True: + resp = cloudwatch.list_metrics(**kwargs) + metrics.extend(resp.get("Metrics", [])) + next_token = resp.get("NextToken") + if not next_token: + break + kwargs["NextToken"] = next_token except Exception: return [] + return metrics -def _get_max_gpu_utilisation( +def _get_max_gpu_utilization( cloudwatch, gpu_metrics: list, days: int, now: datetime ) -> Optional[float]: """ - Return the maximum GPU utilisation across all GPU indices over the window. + Return the maximum GPU utilization across all GPU indices over the window. Takes the gpu_metrics list already fetched by _list_gpu_metrics — no second ListMetrics call. Uses MAX statistic so a single active GPU on a multi-GPU @@ -440,17 +475,17 @@ def _get_max_gpu_utilisation( return max_util -def _get_avg_cpu_utilisation( +def _get_max_daily_cpu_utilization( cloudwatch, instance_id: str, days: int, now: datetime ) -> Optional[float]: """ - Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization. + Return the maximum daily CPU peak over the window using AWS/EC2 CPUUtilization. - Uses Maximum statistic per day and returns the highest daily peak. This avoids - flagging burst workloads where a short but significant CPU spike would be averaged - away — if the max CPU across any day is below threshold, the instance is truly idle. + Uses Maximum statistic at daily (86400s) period and returns the highest value + across all returned datapoints (spec section 6.2). This avoids flagging burst workloads + where a short but significant CPU spike would be averaged away. - Returns None on error — caller treats None as "not idle" (safe default). + Returns None on error or no datapoints — caller treats None as "not idle" (safe default). """ start = now - timedelta(days=days) try: diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py index 78f4572..bc94624 100644 --- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py +++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py @@ -85,24 +85,25 @@ def find_idle_workbench_instances( raise ValueError(f"idle_days must be >= 1, got {idle_days!r}") session = AuthorizedSession(credentials) - raw_instances, unreachable_locations, discovery_failed = _list_instances( - session, project_id - ) + raw_instances, unreachable_locations, discovery_failed = _list_instances(session, project_id) # --- Partial scan scope --- - # The spec explicitly covers unreachable[] from the API as a PARTIAL trigger. - # discovery_failed (400/5xx/network) is an implementation extension: those - # errors make enumeration incomplete in the same way, so PARTIAL is the - # conservative and correct choice even though the spec does not spell it out. - if unreachable_locations or discovery_failed: - parts = [] - if unreachable_locations: - parts.append(f"unreachable locations: {unreachable_locations}") - if discovery_failed: - parts.append("discovery incomplete due to transport/server error") + # The spec defines PARTIAL exclusively from discovery-layer reachability: + # unreachable[] locations reported by the API (section 12.1, 12.2.7). + # 400/5xx/network errors (discovery_failed) make enumeration incomplete but + # are NOT defined as PARTIAL by the spec; they get a separate warning. + if unreachable_locations: warnings.warn( f"gcp.vertex.workbench.idle: scan scope PARTIAL for project '{project_id}'" - f" — {'; '.join(parts)}", + f" — unreachable locations: {unreachable_locations}", + UserWarning, + stacklevel=2, + ) + + if discovery_failed: + warnings.warn( + f"gcp.vertex.workbench.idle: discovery incomplete for project '{project_id}'" + f" — transport or server error encountered; scan results may be incomplete", UserWarning, stacklevel=2, ) @@ -154,9 +155,9 @@ def find_idle_workbench_instances( # Warn with a capped list of resource names (first 5 + overflow count) so # the warning stays readable even in large projects. if candidate_resources: - _CAP = 5 - shown = [r["name"] for r in candidate_resources[:_CAP]] - overflow = len(candidate_resources) - _CAP + _cap = 5 + shown = [r["name"] for r in candidate_resources[:_cap]] + overflow = len(candidate_resources) - _cap name_summary = ", ".join(shown) if overflow > 0: name_summary += f", ... (+{overflow} more)" @@ -225,9 +226,17 @@ def _list_instances( ) if resp.status_code == 404: - # Implementation choice: 404 = Notebooks API not enabled for this - # project, meaning provably no instances exist. Treated as a clean - # empty scope (FULL, EVALUABLE). Not explicitly defined in the spec. + # 404 = Notebooks API not enabled for this project. + # Assumption: if the API is not enabled, no instances can exist. + # Treated as a clean empty scope (FULL, EVALUABLE). + # A warning is emitted so callers are aware of the assumption. + warnings.warn( + f"gcp.vertex.workbench.idle: Notebooks API not enabled for project" + f" '{project_id}' (HTTP 404) — assuming no instances exist;" + f" scope treated as FULL", + UserWarning, + stacklevel=3, + ) return [], [], False if resp.status_code == 400: diff --git a/docs/rules/aws.md b/docs/rules/aws.md index 82adc6e..402a756 100644 --- a/docs/rules/aws.md +++ b/docs/rules/aws.md @@ -250,17 +250,17 @@ **Spec:** [specs/aws/ai/sagemaker_notebook_idle.md](../specs/aws/ai/sagemaker_notebook_idle.md) #### `aws.ec2.gpu.idle` -**Detects:** EC2 GPU/accelerator instances (p/g/trn/inf/dl families) with <5% GPU utilization or <10% CPU over `idle_days` +**Detects:** EC2 GPU/accelerator instances (p/g/trn/inf/dl families) with max GPU utilization < 5% or max daily CPU < 10% over `idle_days` -**Confidence / Risk:** HIGH (NVIDIA CW agent present, max GPU < 5%); MEDIUM (no NVIDIA agent, avg CPU < 10%) / CRITICAL (`idle_ratio ≥ 2.0`); HIGH (all other GPU/accelerator instances) +**Confidence / Risk:** HIGH (`nvidia_smi_utilization_gpu` discoverable in CloudWatch and max GPU < threshold); MEDIUM (metric not discoverable — CPU proxy only, max daily CPU < threshold) / CRITICAL (`idle_ratio >= 2.0`); HIGH (otherwise) **Permissions:** `ec2:DescribeInstances`, `cloudwatch:GetMetricStatistics`, `cloudwatch:ListMetrics` **Params:** `idle_days` (default: 7), `gpu_threshold` (default: 5.0%), `cpu_threshold` (default: 10.0%) -**Exclusions:** non-GPU instance families; instances younger than threshold +**Exclusions:** non-GPU instance families; missing, naive, or future `LaunchTime`; instances younger than threshold; no CloudWatch datapoints on chosen metric path -**Spec:** — +**Spec:** [specs/aws/ai/ec2_gpu_idle.md](../specs/aws/ai/ec2_gpu_idle.md) #### `aws.bedrock.provisioned_throughput.idle` **Detects:** Bedrock Provisioned Throughput (Model Units) with zero invocations for `idle_days`; bills per MU per hour regardless of traffic diff --git a/docs/specs/aws/ai/ec2_gpu_idle.md b/docs/specs/aws/ai/ec2_gpu_idle.md new file mode 100644 index 0000000..a9877a5 --- /dev/null +++ b/docs/specs/aws/ai/ec2_gpu_idle.md @@ -0,0 +1,433 @@ +# aws.ec2.gpu.idle -- Canonical Rule Specification + +## 1. Intent + +Detect raw Amazon EC2 accelerated-computing instances that are still `running`, old enough to +evaluate, and show persistently low accelerator-related utilization over the configured evaluation +window. + +This is a **CleanCloud-derived review-candidate heuristic**, not an AWS-native idle-state signal. +It is intentionally conservative and read-only. It is **not** proof that an instance is safe to +stop, **not** proof that no scheduled or intermittent workload exists, and **not** proof that no +future use is planned. + +Two signal tiers are used: + +1. **HIGH confidence** when an NVIDIA CloudWatch agent GPU metric is available and stays below + threshold. +2. **MEDIUM confidence** CPU fallback when no qualifying per-instance NVIDIA GPU metric is + available. + +--- + +## 2. AWS API Grounding + +Based on current AWS EC2 and CloudWatch documentation. + +### Key facts + +1. `DescribeInstances` supports filtering by `instance-state-name`. +2. EC2 instance state names include `running`, `pending`, `stopping`, `stopped`, and others. +3. `DescribeInstances` returns core fields needed by this rule, including `InstanceId`, + `InstanceType`, `LaunchTime`, `Tags`, `State`, and `InstanceLifecycle`. +4. The EC2 CloudWatch metrics reference documents `CPUUtilization` in namespace `AWS/EC2`, + with meaningful statistics including `Average`, `Minimum`, and `Maximum`. +5. AWS documents `GPUPowerUtilization` in namespace `AWS/EC2` for only a subset of accelerated + instance types. +6. `ListMetrics` lists metrics by namespace, metric name, and dimensions, returns up to 500 + results per call, and does not return metrics that have not published data in the past two + weeks. +7. `GetMetricStatistics` retrieves CloudWatch datapoints for a metric and requires exact + dimensions that match the published metric. +8. `GetMetricStatistics` retains 1-hour metric resolution for 455 days, and when the start time + is greater than 63 days ago, 3600-second periods are valid. +9. AWS documents NVIDIA GPU metrics collected by the CloudWatch agent, including + `nvidia_smi_utilization_gpu`, which represents the percentage of time over the sample period + during which one or more kernels on the GPU was running. +10. AWS documents CloudWatch agent metric dimensions for NVIDIA GPU metrics as GPU-level + dimensions such as `index`, `name`, and `arch`. +11. AWS also documents CloudWatch agent `append_dimensions`, including `InstanceId`, when the + agent is configured to append EC2 dimensions to collected metrics. +12. AWS accelerated-computing instance documentation includes GPU and AI-accelerator families such + as P, G, Trn, Inf, and DL families, plus non-AI families such as F and VT. +13. AWS pricing docs exist, but fixed monthly USD estimates used by CleanCloud are implementation + context, not canonical AWS API outputs. + +### Implications + +- `DescribeInstances(Filters=[{"Name": "instance-state-name", "Values": ["running"]}])` is the + canonical inventory API for this rule. +- Only `running` instances are in scope because they incur live compute charges. +- `LaunchTime` is the canonical age gate for suppressing newly launched instances. +- `CPUUtilization` is a documented EC2 metric and can support a fallback heuristic. +- `GPUPowerUtilization` is excluded due to inconsistent emission and non-idle semantics across + instance families. +- `nvidia_smi_utilization_gpu` is the strongest available GPU activity signal used by this rule, + but it is **agent-dependent**, not AWS-native by default. +- The current implementation's high-confidence GPU path depends on the CloudWatch agent publishing + GPU metrics that are discoverable per instance under the exact dimension pattern the rule probes. + In practice, this often requires agent `append_dimensions` such as `InstanceId`, but AWS does not + guarantee that GPU metrics include that dimension by default. +- Because `ListMetrics` does not return metrics with no datapoints in the past two weeks, GPU + metric discovery is inherently recency-limited even though `GetMetricStatistics` can query older + datapoints at coarser resolution. In quiet, low-noise, or intermittently reporting fleets, this + can systematically under-detect GPU metrics, suppress the GPU path entirely, and bias + classification toward CPU fallback findings rather than GPU-based findings. +- Absence of `nvidia_smi_utilization_gpu` is **not** proof that the instance is idle; it may mean + the CloudWatch agent is absent, misconfigured, or not appending the dimensions the rule probes. +- Trainium, Inferentia, and other non-NVIDIA accelerator families do not expose + `nvidia_smi_utilization_gpu`; those families necessarily fall back to CPU in the current rule, + and AWS does not document a standard EC2-native GPU-equivalent utilization metric for them that + this rule can use as a direct substitute. +- In many real fleets, the CWAgent GPU metric path will be unavailable more often than available, + so CPU fallback may dominate in practice. + +--- + +## 3. Scope and Terminology + +- **EC2 accelerated instance** - an EC2 instance whose type belongs to the rule's targeted + AI/accelerator family allowlist. +- **Running instance** - an EC2 instance whose `State.Name` is exactly `running`. +- **Age gate** - `LaunchTime` is at least `effective_idle_days` old. +- **GPU metric path** - NVIDIA CloudWatch agent metric `CWAgent / nvidia_smi_utilization_gpu` + exists for the instance and remains below threshold over the evaluation window. +- **CPU fallback path** - no qualifying GPU metric is available, so `AWS/EC2 / CPUUtilization` + is used as a weaker heuristic. +- **effective_idle_days** - `max(idle_days, 1)`. +- **age_days** - `floor((now_utc - launch_time_utc) / 86400 seconds)`. +- **idle_ratio** - `age_days / effective_idle_days`. +- **evaluation_window_end_utc** - `now_utc` +- **evaluation_window_start_utc** - `evaluation_window_end_utc - effective_idle_days x 86400 seconds` +- **GPU metric available** - at least one matching `nvidia_smi_utilization_gpu` metric was + discoverable for the instance. + +### 3.1 Explicit scope boundary + +This rule applies only to **raw EC2 accelerated-computing instances**, not managed ML resources. + +Out of scope: + +- SageMaker endpoints, notebooks, Studio apps, or training jobs +- stopped, stopping, pending, shutting-down, or terminated EC2 instances +- FPGA-only and video-transcode families not targeted by this rule +- storage-only cost after instance stop +- workload-semantic reconstruction from application logs + +### 3.2 Current family allowlist + +The current rule targets these instance-type prefixes: + +- `p2.`, `p3.`, `p3dn.`, `p4d.`, `p4de.`, `p5.`, `p5en.`, `p6.` +- `g4dn.`, `g4ad.`, `g5.`, `g5g.`, `g6.`, `g6e.`, `gr6.` +- `trn1.`, `trn1n.`, `trn2.` +- `inf1.`, `inf2.` +- `dl1.`, `dl2q.` + +Notes: + +- This is a **rule allowlist**, not a complete inventory of every accelerated EC2 family AWS may + document. +- Non-AI accelerated families such as `f1`, `f2`, and `vt1` are intentionally out of scope. +- Future AWS family-name additions may require allowlist updates even when they are documented in + the latest accelerated-computing pages. + +--- + +## 4. Canonical Rule Statement + +An EC2 instance is eligible only when **all** of the following are true: + +1. `State.Name == "running"` +2. `InstanceType` matches the rule allowlist +3. `LaunchTime` is valid and `age_days >= effective_idle_days` +4. one of the utilization branches below is satisfied: + - **GPU path:** maximum observed `nvidia_smi_utilization_gpu` over the evaluation window is below + `gpu_threshold` + - **CPU fallback path:** no qualifying GPU metric is available and maximum observed daily + `CPUUtilization` over the evaluation window is below `cpu_threshold` + +No finding may be emitted from age alone, from instance family alone, or from missing GPU metrics +alone. + +--- + +## 5. Normalization Contract + +All logic must operate on normalized fields only. + +| Canonical field | Source field | Absent / invalid | +|---|---|---| +| `resource_id` | `InstanceId` | skip item | +| `instance_id` | `InstanceId` | skip item | +| `instance_type` | `InstanceType` | skip item | +| `normalized_state` | `State.Name` | skip item | +| `launch_time_utc` | `LaunchTime` (tz-aware UTC) | skip item | +| `age_days` | floor((now - launch_time_utc) / 86400) | skip item | +| `name_tag` | tag `Name` | fall back to `instance_id` | +| `purchasing_model` | `InstanceLifecycle` | `"on-demand"` when absent | +| `tags` | `Tags` | `{}` | + +Normalization requirements: + +- `InstanceId` and `InstanceType` must be non-empty strings. +- `State.Name` must be a non-empty string. +- `LaunchTime` must be timezone-aware UTC before use. +- future `LaunchTime` must skip the item. +- `effective_idle_days = max(idle_days, 1)`. + +--- + +## 6. Signal Contract + +This rule uses a two-tier utilization model. + +### 6.1 HIGH-confidence GPU path + +The preferred signal is: + +- namespace: `CWAgent` +- metric: `nvidia_smi_utilization_gpu` + +Signal interpretation: + +- AWS documents this metric as the percentage of time over the sample period during which one or + more GPU kernels were running. +- The rule first probes metric presence with `ListMetrics`. +- When GPU metrics are present, the rule queries `GetMetricStatistics` using `Statistics=["Maximum"]` + and `Period=3600`. +- For multi-GPU instances, the rule takes the **maximum** observed utilization across all returned + GPU-index metrics in the evaluation window. +- If the GPU metric has not published in the last two weeks, `ListMetrics` may not return it even + if older datapoints still exist in CloudWatch retention; in that case the rule falls back to CPU. +- The current rule does **not** require continuous evaluation-window GPU datapoint coverage. Any returned + datapoints on the chosen GPU metric path are treated as sufficient for evaluation; complete + absence of datapoints causes a safe-default skip instead. +- This is intentionally a loose coverage rule matching current implementation behavior: sparse or + intermittent GPU datapoints can still drive a decision, which is weaker than requiring dense or + complete coverage of the evaluation window. +- The current rule defines no minimum datapoint count or density threshold for the GPU path. +- In the current implementation, even a single returned GPU datapoint can drive the GPU-path + decision. + +Canonical threshold: + +- emit only when `max_gpu_utilization_pct < gpu_threshold` + +Why MAX is required: + +- on multi-GPU hosts, averaging across GPU indices can hide a single active accelerator +- if any GPU is materially active, the instance must not be flagged idle + +### 6.2 MEDIUM-confidence CPU fallback + +If no qualifying GPU metric is discoverable, the rule falls back to: + +- namespace: `AWS/EC2` +- metric: `CPUUtilization` + +Signal interpretation: + +- The rule queries `GetMetricStatistics` with `Statistics=["Maximum"]` and `Period=86400`. +- It then takes the **maximum** daily CPU peak across the evaluation window. +- The current rule does **not** require continuous evaluation-window CPU datapoint coverage. Any returned + datapoints on the chosen CPU path are treated as sufficient for evaluation; complete absence of + datapoints causes a safe-default skip instead. +- This is intentionally a loose coverage rule matching current implementation behavior: sparse CPU + datapoints can still drive a decision, which increases false-positive risk relative to a denser + or complete-coverage requirement. +- The current rule defines no minimum datapoint count or density threshold for the CPU path. +- In the current implementation, even a very small number of CPU datapoints can drive the CPU-path + decision. + +Canonical threshold: + +- emit only when `max_daily_cpu_utilization_pct < cpu_threshold` + +Why this is weaker: + +- CPU is not a direct accelerator activity metric +- accelerator-heavy workloads can perform real work with low CPU +- absence of the CWAgent GPU metric is not proof of GPU inactivity +- the CPU fallback path therefore has materially higher false-positive risk than the GPU path + +Therefore CPU fallback is **MEDIUM confidence only**. + +### 6.3 Neuron and non-NVIDIA accelerators + +For `trn*`, `inf*`, `dl1`, and `dl2q` families: + +- `nvidia_smi_utilization_gpu` is not expected to exist +- CPU fallback is the only supported signal in the current rule +- no EC2-native CloudWatch GPU-equivalent metric exists that this rule can query as a drop-in + replacement across these families +- finding text must make this limitation explicit + +### 6.4 Non-canonical or out-of-contract signals + +The following are not used for eligibility: + +- `GPUPowerUtilization` +- EC2 network, disk, or status-check metrics +- instance age alone +- purchasing model (`spot`, `scheduled`, `capacity-block`, `on-demand`) +- tags +- CloudWatch Logs or application logs + +--- + +## 7. Confidence, Risk, and Cost + +### 7.1 Confidence + +- **HIGH** when GPU metric path is used and threshold is met +- **MEDIUM** when CPU fallback path is used and threshold is met + +### 7.2 Risk + +- `idle_ratio = age_days / effective_idle_days` +- **CRITICAL** when `idle_ratio >= 2.0` +- **HIGH** otherwise + +### 7.3 Cost + +- `estimated_monthly_cost_usd` is advisory context only +- CleanCloud may use an implementation-defined us-east-1 on-demand monthly cost table +- if the instance type is unknown to that table, a default fallback estimate may be used +- cost must not affect eligibility, confidence, or risk gating + +--- + +## 8. Deterministic Evaluation Order + +1. Set `effective_idle_days = max(idle_days, 1)`. +2. Call `DescribeInstances` and fully paginate only `running` instances. +3. For each returned instance: + - state missing or not `running` -> **SKIP ITEM** + - `InstanceType` missing or not in allowlist -> **SKIP ITEM** + - invalid / naive / future `LaunchTime` -> **SKIP ITEM** + - `age_days < effective_idle_days` -> **SKIP ITEM** +4. Probe GPU metrics with `ListMetrics` for `CWAgent / nvidia_smi_utilization_gpu`. +5. If GPU metrics exist: + - fetch 1-hour `Maximum` datapoints per metric within `[evaluation_window_start_utc, evaluation_window_end_utc]` + - if no datapoints or metric retrieval fails -> **SKIP ITEM** + - compute global max across all GPU metrics + - if `global_max_gpu >= gpu_threshold` -> **SKIP ITEM** + - otherwise -> **EMIT HIGH-confidence finding** +6. If no GPU metrics exist: + - fetch daily `Maximum` datapoints for `AWS/EC2 / CPUUtilization` within `[evaluation_window_start_utc, evaluation_window_end_utc]` + - if no datapoints or metric retrieval fails -> **SKIP ITEM** + - compute max daily CPU peak + - if `max_daily_cpu >= cpu_threshold` -> **SKIP ITEM** + - otherwise -> **EMIT MEDIUM-confidence finding** + +No raw field access after normalization. + +--- + +## 9. Exclusion Rules + +1. non-`running` instance state +2. instance type not in the rule allowlist +3. missing `InstanceId` +4. missing `InstanceType` +5. missing, naive, or future `LaunchTime` +6. `age_days < effective_idle_days` +7. GPU metric present but any observed GPU maximum meets or exceeds threshold +8. GPU metric absent and CPU daily peak meets or exceeds threshold +9. missing datapoints on the chosen metric path + +--- + +## 10. Failure Model + +**Rule-level failures (FAIL RULE):** + +- `DescribeInstances` permission failure (`UnauthorizedOperation`, `AccessDenied`) + +**Item-level safe-default skips (SKIP ITEM):** + +- malformed identity or timestamps +- unsupported instance family +- too young to classify +- CloudWatch metric retrieval returns no datapoints +- CloudWatch metric retrieval errors on the chosen path + +Design principle: + +- metric uncertainty must resolve to **not idle**, never to a finding +- CloudWatch errors are treated as safe defaults, not as idle evidence + +--- + +## 11. Output Contract + +Each emitted finding must preserve the standard `Finding` contract and include, at minimum: + +- `provider = "aws"` +- `rule_id = "aws.ec2.gpu.idle"` +- `resource_type = "aws.ec2.instance"` +- `resource_id = InstanceId` +- `region = evaluated region` +- `risk` +- `confidence` +- `detected_at` +- `estimated_monthly_cost_usd` +- `evidence` +- `details` + +### 11.1 Evidence contract + +`evidence.signals_used` should include: + +- instance running state +- instance type +- purchasing model +- the utilization signal used for eligibility +- age when available + +`evidence.signals_not_checked` should include: + +- direct accelerator utilization when CPU fallback is used +- scheduled workloads outside the observation window +- planned future use + +Additional context should note: + +- CPU fallback is heuristic only +- CPU fallback has higher false-positive risk than GPU-based detection +- Neuron families require different telemetry for true accelerator utilization +- missing CWAgent GPU metrics do not prove idleness + +### 11.2 Details contract + +`details` should include: + +- `instance_id` +- `instance_type` +- `name` +- `age_days` +- `idle_days_threshold` +- `idle_ratio` +- `idle_signal` +- `utilization_pct` +- `purchasing_model` +- `gpu_metric_available` +- `gpu_threshold_pct` +- `cpu_threshold_pct` +- `estimated_monthly_cost` +- `cost_basis` +- `tags` + +--- + +## 12. Summary of Intended Semantics + +This rule detects **running raw EC2 accelerated instances** that are old enough to evaluate and +appear idle under one of two heuristics: + +1. **HIGH confidence** - CloudWatch agent GPU metric proves very low GPU execution activity +2. **MEDIUM confidence** - GPU metric unavailable, and CPU peak remains low over the evaluation window + +It is intentionally cost-focused and review-oriented. It does **not** prove that an accelerator +workload is absent; it identifies likely expensive idle compute for human review. diff --git a/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py b/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py index 716dd29..b67cb64 100644 --- a/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py +++ b/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py @@ -8,12 +8,19 @@ - Instance state filter: only running instances - GPU family filter: non-GPU instances skipped - Age guard: instances younger than idle_days skipped +- Normalization: missing InstanceId → skip (spec section 5) +- Normalization: missing LaunchTime → skip (spec section 5) +- Normalization: naive (tz-unaware) LaunchTime → skip, not fixed up (spec section 5) +- Normalization: future LaunchTime → skip (spec section 5) - Utilisation thresholds: configurable gpu_threshold and cpu_threshold - Risk levels: CRITICAL (idle_ratio >= 2.0), HIGH otherwise - Active GPU instance not flagged - Active CPU instance not flagged (fallback path) - CloudWatch errors treated as active (safe default) - Permission errors: PermissionError raised on AccessDenied from describe_instances +- _list_gpu_metrics paginates via NextToken (spec section 2 key fact 6) +- signals_not_checked: GPU note only in CPU fallback path (spec section 11.1) +- details: utilization_pct uses US spelling (spec section 11.2) - RULE_METADATA and RULE_ID attributes """ @@ -67,7 +74,7 @@ def _make_session(instances, gpu_util=None, cpu_util=None, has_gpu_metric=False) Build a mock boto3 session. gpu_util: if not None, NVIDIA CloudWatch agent metrics are available with this max value. - cpu_util: avg CPU utilisation returned from AWS/EC2 namespace. + cpu_util: maximum daily CPU utilization value returned from AWS/EC2 namespace. has_gpu_metric: whether ListMetrics returns NVIDIA metrics. """ session = MagicMock() @@ -99,7 +106,7 @@ def _make_session(instances, gpu_util=None, cpu_util=None, has_gpu_metric=False) else: cw.list_metrics.return_value = {"Metrics": []} - # GetMetricStatistics for GPU utilisation + # GetMetricStatistics for GPU/CPU utilization if gpu_util is not None: cw.get_metric_statistics.return_value = { "Datapoints": [{"Maximum": gpu_util, "Timestamp": NOW}] @@ -223,7 +230,7 @@ def test_g5_is_not_neuron(self): class TestFindIdleGpuInstances: def test_idle_gpu_instance_flagged_high_confidence(self): - """GPU metric available, utilisation below threshold → HIGH confidence.""" + """GPU metric available, utilization below threshold → HIGH confidence.""" # age=7, idle_days=7 → idle_ratio=1.0 < 2.0 → HIGH risk (not CRITICAL) findings = _run([_make_instance(age_days=7)], gpu_util=2.0, has_gpu_metric=True) assert len(findings) == 1 @@ -242,12 +249,12 @@ def test_idle_gpu_instance_cpu_fallback_medium_confidence(self): assert findings[0].confidence == ConfidenceLevel.MEDIUM def test_active_gpu_instance_not_flagged(self): - """GPU utilisation above threshold → not flagged.""" + """GPU utilization above threshold → not flagged.""" findings = _run([_make_instance()], gpu_util=50.0, has_gpu_metric=True) assert findings == [] def test_active_cpu_fallback_not_flagged(self): - """CPU utilisation above threshold in fallback path → not flagged.""" + """CPU utilization above threshold in fallback path → not flagged.""" findings = _run([_make_instance()], cpu_util=25.0, has_gpu_metric=False) assert findings == [] @@ -388,11 +395,11 @@ def test_permission_error_on_describe_instances(self): def test_details_include_idle_signal_source(self): findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) - assert findings[0].details["idle_signal"] == "gpu_utilisation" + assert findings[0].details["idle_signal"] == "gpu_utilization" def test_details_include_cpu_fallback_signal_source(self): findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False) - assert findings[0].details["idle_signal"] == "cpu_utilisation_fallback" + assert findings[0].details["idle_signal"] == "cpu_utilization_fallback" def test_details_include_gpu_metric_available_flag(self): findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) @@ -424,3 +431,221 @@ def _client(service, **kwargs): mock_dt.fromisoformat = datetime.fromisoformat findings = find_idle_gpu_instances(session, _REGION) assert len(findings) == 2 + + +# --------------------------------------------------------------------------- +# Normalization: skip rules (spec section 5) +# --------------------------------------------------------------------------- + + +class TestNormalizationSkips: + """ + Instances with missing/invalid identity or timestamp fields must be skipped + without raising an exception (spec section 5, section 8.3). + """ + + def test_missing_instance_id_is_skipped(self): + """InstanceId absent → skip item, not KeyError.""" + inst = _make_instance() + del inst["InstanceId"] + findings = _run([inst], cpu_util=1.0, has_gpu_metric=False) + assert findings == [] + + def test_empty_instance_id_is_skipped(self): + inst = _make_instance() + inst["InstanceId"] = "" + findings = _run([inst], cpu_util=1.0, has_gpu_metric=False) + assert findings == [] + + def test_missing_launch_time_is_skipped(self): + """Missing LaunchTime → skip item (spec section 5).""" + inst = _make_instance() + del inst["LaunchTime"] + findings = _run([inst], cpu_util=1.0, has_gpu_metric=False) + assert findings == [] + + def test_naive_launch_time_is_skipped(self): + """ + Naive (tz-unaware) LaunchTime must be skipped, not coerced to UTC. + Spec section 5: 'LaunchTime must be timezone-aware UTC before use.' + """ + inst = _make_instance() + inst["LaunchTime"] = NOW.replace(tzinfo=None) - timedelta(days=30) + findings = _run([inst], cpu_util=1.0, has_gpu_metric=False) + assert findings == [] + + def test_future_launch_time_is_skipped(self): + """Future LaunchTime → skip item (spec section 5).""" + inst = _make_instance() + inst["LaunchTime"] = NOW + timedelta(days=1) + findings = _run([inst], cpu_util=1.0, has_gpu_metric=False) + assert findings == [] + + def test_valid_instance_alongside_invalid_is_still_flagged(self): + """A bad record next to a good one must not suppress the valid finding.""" + bad = _make_instance(instance_id="i-bad") + del bad["LaunchTime"] + good = _make_instance(instance_id="i-good", age_days=30) + findings = _run([bad, good], cpu_util=1.0, has_gpu_metric=False) + assert len(findings) == 1 + assert findings[0].resource_id == "i-good" + + +# --------------------------------------------------------------------------- +# _list_gpu_metrics — pagination (spec section 2 key fact 6) +# --------------------------------------------------------------------------- + + +class TestListGpuMetricsPagination: + def test_follows_next_token(self): + """NextToken on first page must be followed to collect all metrics.""" + cw = MagicMock() + metric1 = { + "MetricName": "nvidia_smi_utilization_gpu", + "Dimensions": [{"Name": "gpu_id", "Value": "0"}], + } + metric2 = { + "MetricName": "nvidia_smi_utilization_gpu", + "Dimensions": [{"Name": "gpu_id", "Value": "1"}], + } + cw.list_metrics.side_effect = [ + {"Metrics": [metric1], "NextToken": "tok1"}, + {"Metrics": [metric2]}, + ] + result = _list_gpu_metrics(cw, "i-abc") + assert result == [metric1, metric2] + assert cw.list_metrics.call_count == 2 + + def test_second_call_includes_next_token(self): + cw = MagicMock() + cw.list_metrics.side_effect = [ + {"Metrics": [], "NextToken": "tok-xyz"}, + {"Metrics": []}, + ] + _list_gpu_metrics(cw, "i-abc") + second_call_kwargs = cw.list_metrics.call_args_list[1][1] + assert second_call_kwargs.get("NextToken") == "tok-xyz" + + def test_three_pages_accumulated(self): + cw = MagicMock() + + def make_metric(idx): + return { + "MetricName": "nvidia_smi_utilization_gpu", + "Dimensions": [{"Name": "gpu_id", "Value": str(idx)}], + } + + cw.list_metrics.side_effect = [ + {"Metrics": [make_metric(0)], "NextToken": "t1"}, + {"Metrics": [make_metric(1)], "NextToken": "t2"}, + {"Metrics": [make_metric(2)]}, + ] + result = _list_gpu_metrics(cw, "i-abc") + assert len(result) == 3 + + def test_error_on_second_page_returns_empty(self): + """Any exception during pagination returns [] (safe default).""" + cw = MagicMock() + cw.list_metrics.side_effect = [ + {"Metrics": [], "NextToken": "tok1"}, + ClientError({"Error": {"Code": "Throttling", "Message": ""}}, "ListMetrics"), + ] + assert _list_gpu_metrics(cw, "i-abc") == [] + + +# --------------------------------------------------------------------------- +# signals_not_checked: GPU note only in CPU fallback path (spec section 11.1) +# --------------------------------------------------------------------------- + + +class TestSignalsNotChecked: + def test_gpu_path_does_not_include_gpu_note_in_not_checked(self): + """ + When GPU metric is used, 'direct accelerator utilization' must NOT appear + in signals_not_checked — the GPU was checked and was the basis for the finding. + """ + findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) + not_checked = findings[0].evidence.signals_not_checked + assert not any("accelerator" in s.lower() for s in not_checked) + + def test_cpu_path_includes_gpu_note_in_not_checked(self): + """ + When CPU fallback is used, 'direct accelerator utilization' MUST appear + in signals_not_checked (spec section 11.1). + """ + findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False) + not_checked = findings[0].evidence.signals_not_checked + assert any("accelerator" in s.lower() for s in not_checked) + + def test_scheduled_workloads_in_not_checked_for_gpu_path(self): + findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) + not_checked = findings[0].evidence.signals_not_checked + assert any("scheduled" in s.lower() or "batch" in s.lower() for s in not_checked) + + def test_planned_future_use_in_not_checked_for_gpu_path(self): + findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) + not_checked = findings[0].evidence.signals_not_checked + assert any("planned future" in s.lower() or "future use" in s.lower() for s in not_checked) + + +# --------------------------------------------------------------------------- +# Details contract (spec section 11.2) +# --------------------------------------------------------------------------- + + +class TestDetailsContract: + def test_utilization_pct_us_spelling_gpu_path(self): + """Details field must use 'utilization_pct' (US spelling, spec section 11.2).""" + findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) + assert "utilization_pct" in findings[0].details + assert "utilisation_pct" not in findings[0].details + + def test_utilization_pct_us_spelling_cpu_path(self): + findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False) + assert "utilization_pct" in findings[0].details + assert "utilisation_pct" not in findings[0].details + + def test_age_days_is_integer_not_unknown(self): + """age_days must be an integer — 'unknown' fallback is removed since LaunchTime is required.""" + findings = _run([_make_instance(age_days=30)], cpu_util=5.0, has_gpu_metric=False) + assert isinstance(findings[0].details["age_days"], int) + assert findings[0].details["age_days"] == 30 + + +# --------------------------------------------------------------------------- +# Reason string — per-path framing (spec section 11.1, drift fix) +# --------------------------------------------------------------------------- + + +class TestReasonString: + """ + GPU path: factual assertion — GPU utilization was directly measured. + CPU path: softened proxy framing — GPU activity was NOT directly measured. + """ + + def test_gpu_path_reason_asserts_gpu_utilization(self): + findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) + assert "GPU utilization" in findings[0].reason + + def test_gpu_path_reason_does_not_say_proxy(self): + findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True) + assert "proxy" not in findings[0].reason.lower() + + def test_cpu_path_reason_says_proxy_signal(self): + """CPU fallback reason must use proxy framing, not assert GPU facts.""" + findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False) + assert "proxy" in findings[0].reason.lower() + + def test_cpu_path_reason_says_not_directly_measured(self): + findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False) + assert "not directly measured" in findings[0].reason + + def test_cpu_path_reason_includes_utilization_value(self): + findings = _run([_make_instance()], cpu_util=3.7, has_gpu_metric=False) + assert "3.7" in findings[0].reason + + def test_not_checked_cpu_path_mentions_absence_does_not_confirm_idle(self): + """The not_checked entry for CPU path must carry the absence ≠ idle clarification.""" + findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False) + not_checked = " ".join(findings[0].evidence.signals_not_checked) + assert "does not confirm" in not_checked or "absence" in not_checked diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py index e453b18..3ca9d70 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py @@ -77,9 +77,7 @@ def _run(instances, unreachable=None, discovery_failed=False, **kwargs): "cleancloud.providers.gcp.rules.ai.workbench_idle._list_instances", return_value=(instances, unreachable or [], discovery_failed), ): - return find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), **kwargs - ) + return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs) def _run_with_warnings(instances, unreachable=None, discovery_failed=False, **kwargs): @@ -99,9 +97,7 @@ def _invoke_http(mock_session, **kwargs): "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", return_value=mock_session, ): - return find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), **kwargs - ) + return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs) def _invoke_http_with_warnings(mock_session, **kwargs): @@ -128,8 +124,7 @@ def test_always_empty_with_active_instances(self): def test_always_empty_with_multiple_active(self): instances = [ - _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") - for i in range(5) + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(5) ] assert _run(instances) == [] @@ -145,9 +140,7 @@ def test_always_empty_with_stopped_instances(self): class TestIdleDaysValidation: def test_zero_raises(self): with pytest.raises(ValueError, match="idle_days must be >= 1"): - find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), idle_days=0 - ) + find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), idle_days=0) def test_negative_raises(self): with pytest.raises(ValueError, match="idle_days must be >= 1"): @@ -203,8 +196,7 @@ def test_warning_mentions_resource_name(self): def test_warning_mentions_count(self): instances = [ - _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") - for i in range(3) + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(3) ] _, warns = _run_with_warnings(instances) assert "3" in str(warns[0].message) @@ -219,8 +211,7 @@ def test_warning_mentions_all_resource_names_when_under_cap(self): def test_warning_caps_name_list_at_five(self): instances = [ - _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") - for i in range(8) + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(8) ] _, warns = _run_with_warnings(instances) msg = str(warns[0].message) @@ -228,8 +219,7 @@ def test_warning_caps_name_list_at_five(self): def test_warning_no_overflow_label_when_at_cap(self): instances = [ - _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") - for i in range(5) + _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(5) ] _, warns = _run_with_warnings(instances) assert "more" not in str(warns[0].message) @@ -278,32 +268,58 @@ def test_no_partial_warning_when_no_unreachable(self): # --------------------------------------------------------------------------- -# PARTIAL scope warning — discovery_failed +# Discovery-incomplete warning — discovery_failed (NOT PARTIAL per spec) # --------------------------------------------------------------------------- -class TestPartialScopeDiscoveryFailed: +class TestDiscoveryFailedWarning: + """ + discovery_failed (400/5xx/network) emits a 'discovery incomplete' warning. + It is NOT labelled PARTIAL — PARTIAL is reserved for unreachable[] per spec + (sections 12.1 and 12.2.7). + """ + def test_discovery_failed_emits_warning(self): _, warns = _run_with_warnings([], discovery_failed=True) assert len(warns) == 1 - def test_warning_mentions_partial(self): + def test_warning_does_not_mention_partial(self): + """discovery_failed must NOT say PARTIAL — that label is only for unreachable[].""" _, warns = _run_with_warnings([], discovery_failed=True) - assert "PARTIAL" in str(warns[0].message) + assert "PARTIAL" not in str(warns[0].message) + + def test_warning_mentions_incomplete(self): + _, warns = _run_with_warnings([], discovery_failed=True) + assert "incomplete" in str(warns[0].message).lower() def test_warning_mentions_project(self): _, warns = _run_with_warnings([], discovery_failed=True) assert _PROJECT in str(warns[0].message) - def test_both_partial_and_not_evaluable_can_warn(self): - """ACTIVE instances + unreachable locations = two warnings.""" + def test_unreachable_and_not_evaluable_each_emit_separate_warning(self): + """ACTIVE instances + unreachable locations = two warnings (PARTIAL + NOT_EVALUABLE).""" _, warns = _run_with_warnings([_active()], unreachable=["asia-east1"]) assert len(warns) == 2 def test_discovery_failed_and_active_instances_both_warn(self): + """discovery_failed + ACTIVE instances = two separate warnings.""" _, warns = _run_with_warnings([_active()], discovery_failed=True) assert len(warns) == 2 + def test_all_three_conditions_emit_three_warnings(self): + """unreachable + discovery_failed + ACTIVE = three separate warnings.""" + _, warns = _run_with_warnings( + [_active()], unreachable=["asia-east1"], discovery_failed=True + ) + assert len(warns) == 3 + + def test_unreachable_warning_still_says_partial(self): + """unreachable[] warning must still say PARTIAL even when discovery_failed is also True.""" + _, warns = _run_with_warnings([], unreachable=["asia-east1"], discovery_failed=True) + partial_warns = [w for w in warns if "PARTIAL" in str(w.message)] + assert len(partial_warns) == 1 + assert "asia-east1" in str(partial_warns[0].message) + # --------------------------------------------------------------------------- # INVALID resource classification @@ -384,15 +400,15 @@ def test_suspended_excluded_silently(self): assert len(warns) == 0 def test_provisioning_excluded_silently(self): - _, warns = _run_with_warnings( - [{"name": _INSTANCE_NAME, "state": "PROVISIONING"}] - ) + _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "PROVISIONING"}]) assert len(warns) == 0 def test_mixed_active_and_stopped_only_active_warned(self): active = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/active") - stopped = {"name": f"projects/{_PROJECT}/locations/{_LOCATION}/instances/stopped", - "state": "STOPPED"} + stopped = { + "name": f"projects/{_PROJECT}/locations/{_LOCATION}/instances/stopped", + "state": "STOPPED", + } _, warns = _run_with_warnings([active, stopped]) assert len(warns) == 1 msg = str(warns[0].message) @@ -443,23 +459,36 @@ def test_403_mentions_role(self): with pytest.raises(PermissionError, match="roles/notebooks.viewer"): _invoke_http(_session(_err(403))) - def test_404_returns_empty_no_partial_warning(self): + def test_404_returns_empty_with_assumption_warning(self): + """404 emits a warning surfacing the 'no instances assumed' assumption.""" result, warns = _invoke_http_with_warnings(_session(_err(404))) assert result == [] - assert len(warns) == 0 + assert len(warns) == 1 + assert "404" in str(warns[0].message) - def test_400_returns_empty_with_partial_warning(self): + def test_404_warning_does_not_say_partial(self): + """404 is a clean-scope assumption, not a PARTIAL scan.""" + _, warns = _invoke_http_with_warnings(_session(_err(404))) + assert "PARTIAL" not in str(warns[0].message) + + def test_400_returns_empty_with_incomplete_warning(self): result, warns = _invoke_http_with_warnings(_session(_err(400))) assert result == [] - # _list_instances warns about 400, caller warns about PARTIAL msgs = " ".join(str(w.message) for w in warns) - assert "400" in msgs or "PARTIAL" in msgs + assert "400" in msgs or "incomplete" in msgs.lower() + + def test_400_warning_does_not_say_partial(self): + """discovery_failed (400) must NOT say PARTIAL — that label is only for unreachable[].""" + _, warns = _invoke_http_with_warnings(_session(_err(400))) + # _list_instances warns about 400; find_ warns about discovery incomplete + partial_warns = [w for w in warns if "PARTIAL" in str(w.message)] + assert len(partial_warns) == 0 def test_500_returns_empty_with_warning(self): result, warns = _invoke_http_with_warnings(_session(_err(500))) assert result == [] msgs = " ".join(str(w.message) for w in warns) - assert "500" in msgs or "PARTIAL" in msgs + assert "500" in msgs or "incomplete" in msgs.lower() def test_503_returns_empty_with_warning(self): result, warns = _invoke_http_with_warnings(_session(_err(503))) @@ -512,8 +541,8 @@ def test_url_uses_v2_api(self): class TestListInstancesPagination: def test_two_pages_accumulates_instances(self): - inst1 = {"name": f"projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} - inst2 = {"name": f"projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"} + inst1 = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + inst2 = {"name": "projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"} session = _session( _ok({"instances": [inst1], "nextPageToken": "tok1"}), _ok({"instances": [inst2]}), @@ -523,9 +552,20 @@ def test_two_pages_accumulates_instances(self): def test_three_pages_all_accumulated(self): pages = [ - _ok({"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}], - "nextPageToken": f"t{i}"} if i < 2 else - {"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}]}) + _ok( + { + "instances": [ + {"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"} + ], + "nextPageToken": f"t{i}", + } + if i < 2 + else { + "instances": [ + {"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"} + ] + } + ) for i in range(3) ] instances, _, _ = _list_instances(_session(*pages), _PROJECT) @@ -544,7 +584,9 @@ def test_stops_when_no_next_token(self): class TestListInstancesUnreachable: def test_unreachable_location_collected(self): - _, unreachable, _ = _list_instances(_session(_ok({"unreachable": ["asia-east1"]})), _PROJECT) + _, unreachable, _ = _list_instances( + _session(_ok({"unreachable": ["asia-east1"]})), _PROJECT + ) assert "asia-east1" in unreachable def test_unreachable_deduplicated_across_pages(self): @@ -568,9 +610,18 @@ def test_403_raises_permission_error(self): _list_instances(_session(_err(403)), _PROJECT) def test_404_returns_clean_empty(self): - instances, unreachable, failed = _list_instances(_session(_err(404)), _PROJECT) + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + instances, unreachable, failed = _list_instances(_session(_err(404)), _PROJECT) assert instances == [] and unreachable == [] and failed is False + def test_404_emits_assumption_warning(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _list_instances(_session(_err(404)), _PROJECT) + msgs = [str(w.message) for w in caught if issubclass(w.category, UserWarning)] + assert any("404" in m for m in msgs) + def test_400_sets_discovery_failed(self): _, _, failed = _list_instances(_session(_err(400)), _PROJECT) assert failed is True From 2b4346af16370cd4a9d625f661d2406bc425797d Mon Sep 17 00:00:00 2001 From: javvaji-devops Date: Wed, 6 May 2026 21:45:26 +0100 Subject: [PATCH 3/3] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7e2a2bf..217504b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cleancloud" -version = "1.30.0" +version = "1.31.0" description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry." readme = "README.md" requires-python = ">=3.10"