From e3647b0c31cbf90afdf00d4b092ecf18285b46a0 Mon Sep 17 00:00:00 2001
From: javvaji-devops <venkata.javvaji.91@gmail.com>
Date: Wed, 6 May 2026 19:50:34 +0100
Subject: [PATCH 1/3] fix GCP spec gap

---
 .../providers/gcp/rules/ai/workbench_idle.py  |  98 ++-
 .../gcp/ai/test_gcp_workbench_idle.py         | 689 ++++++++++--------
 2 files changed, 484 insertions(+), 303 deletions(-)

diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
index e906198..78f4572 100644
--- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py
+++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
@@ -70,9 +70,13 @@ def find_idle_workbench_instances(
 
     Currently EMITTING_DISABLED: no qualifying canonical signal exists for
     per-instance kernel activity. updateTime and createTime MUST NOT be used
-    as idle signals.
+    as idle signals. Always returns an empty list.
 
-    Always returns an empty list until a qualifying signal is available.
+    Performs full candidate classification and surfaces results via warnings:
+      - PARTIAL scope: unreachable[] locations or discovery failures warn once.
+      - NOT_EVALUABLE: ACTIVE candidates with NO_SIGNAL warn with resource names.
+      - INVALID resources: malformed name or missing state; counted but not warned.
+      - OUT_OF_SCOPE: non-ACTIVE instances; excluded silently.
 
     IAM permissions required:
         notebooks.instances.list (roles/notebooks.viewer)
@@ -81,7 +85,90 @@ def find_idle_workbench_instances(
         raise ValueError(f"idle_days must be >= 1, got {idle_days!r}")
 
     session = AuthorizedSession(credentials)
-    _list_instances(session, project_id)
+    raw_instances, unreachable_locations, discovery_failed = _list_instances(
+        session, project_id
+    )
+
+    # --- Partial scan scope ---
+    # The spec explicitly covers unreachable[] from the API as a PARTIAL trigger.
+    # discovery_failed (400/5xx/network) is an implementation extension: those
+    # errors make enumeration incomplete in the same way, so PARTIAL is the
+    # conservative and correct choice even though the spec does not spell it out.
+    if unreachable_locations or discovery_failed:
+        parts = []
+        if unreachable_locations:
+            parts.append(f"unreachable locations: {unreachable_locations}")
+        if discovery_failed:
+            parts.append("discovery incomplete due to transport/server error")
+        warnings.warn(
+            f"gcp.vertex.workbench.idle: scan scope PARTIAL for project '{project_id}'"
+            f" — {'; '.join(parts)}",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # --- Classify resource records ---
+    # INVALID: resource name absent/malformed or state absent/empty — counted.
+    # OUT_OF_SCOPE: valid name+state but not ACTIVE — excluded silently.
+    excluded_invalid_count = 0
+    candidate_resources: list[dict] = []
+
+    for raw in raw_instances:
+        name = (raw.get("name") or "").strip()
+        state = (raw.get("state") or "").strip()
+
+        if not name or not _INSTANCE_NAME_RE.match(name):
+            excluded_invalid_count += 1
+            continue
+
+        if not state:
+            excluded_invalid_count += 1
+            continue
+
+        # OUT_OF_SCOPE: valid but not ACTIVE — excluded silently, not counted.
+        if state != "ACTIVE":
+            continue
+
+        # location is segment index 3 of the validated name.
+        location = name.split("/")[3]
+
+        # Region filter: exact string equality, no aliasing or case folding.
+        if region_filter and location != region_filter:
+            continue
+
+        candidate_resources.append({"name": name, "location": location})
+
+    # --- INVALID count ---
+    # Surface the exact count of records excluded as INVALID so operators can
+    # detect data-quality issues without the rule silently discarding records.
+    if excluded_invalid_count:
+        warnings.warn(
+            f"gcp.vertex.workbench.idle: {excluded_invalid_count} resource record(s)"
+            f" in project '{project_id}' excluded as INVALID"
+            f" (malformed resource name or missing state field)",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # --- NOT_EVALUABLE: ACTIVE candidates exist but no canonical signal ---
+    # Warn with a capped list of resource names (first 5 + overflow count) so
+    # the warning stays readable even in large projects.
+    if candidate_resources:
+        _CAP = 5
+        shown = [r["name"] for r in candidate_resources[:_CAP]]
+        overflow = len(candidate_resources) - _CAP
+        name_summary = ", ".join(shown)
+        if overflow > 0:
+            name_summary += f", ... (+{overflow} more)"
+        warnings.warn(
+            f"gcp.vertex.workbench.idle: {len(candidate_resources)} ACTIVE instance(s)"
+            f" in project '{project_id}' cannot be evaluated (NO_SIGNAL) —"
+            f" no qualifying canonical kernel-activity signal exists;"
+            f" rule is EMITTING_DISABLED. Instances: {name_summary}",
+            UserWarning,
+            stacklevel=2,
+        )
+
     return []
 
 
@@ -91,7 +178,7 @@ def find_idle_workbench_instances(
 def _list_instances(
     session: AuthorizedSession,
     project_id: str,
-) -> tuple:
+) -> tuple[list, list, bool]:
     """
     List all Vertex AI Workbench instances across all locations using the v2 API.
 
@@ -138,6 +225,9 @@ def _list_instances(
             )
 
         if resp.status_code == 404:
+            # Implementation choice: 404 = Notebooks API not enabled for this
+            # project, meaning provably no instances exist. Treated as a clean
+            # empty scope (FULL, EVALUABLE). Not explicitly defined in the spec.
             return [], [], False
 
         if resp.status_code == 400:
diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
index e7d9ffc..e453b18 100644
--- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
+++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
@@ -2,31 +2,29 @@
 Tests for gcp.vertex.workbench.idle rule.
 
 The rule is EMITTING_DISABLED and always returns an empty List[Finding].
-No qualifying canonical kernel-activity signal exists; updateTime, createTime,
-age, and CPU utilization are all explicitly non-canonical.
+Classification is performed and surfaced via UserWarning:
+  - PARTIAL scope: unreachable[] locations or discovery failures.
+  - NOT_EVALUABLE: ACTIVE candidates warn with resource names (NO_SIGNAL).
+  - INVALID: malformed name or missing state; counted, not warned.
+  - OUT_OF_SCOPE: non-ACTIVE instances; excluded silently.
 
 Coverage:
-  Public API (find_idle_workbench_instances):
-    - return type and value
-    - idle_days validation (zero, negative, boundary, error message)
-    - region_filter parameter accepted
-    - 403/404/400/5xx/network error handling
-    - warning type, message content (project, HTTP code, rule ID)
-
-  Internal (_list_instances):
-    - empty response
-    - instance accumulation
-    - pagination over 2 and 3 pages
-    - pageToken forwarded on subsequent requests
-    - pageSize=100 in initial request
-    - unreachable[] collected and deduplicated across pages
-    - empty unreachable entries skipped
-    - 404 returns clean ([], [], False)
-    - 400 returns ([], [], True)
-    - 5xx sets discovery_failed; preserves already-fetched instances
-    - network error sets discovery_failed; preserves already-fetched instances
-    - 403 raises PermissionError
-    - URL contains project ID and locations/- wildcard
+  find_idle_workbench_instances (public API):
+    - always returns []
+    - idle_days validation
+    - region_filter (exact match, no case folding)
+    - ACTIVE instances warn NOT_EVALUABLE/NO_SIGNAL
+    - unreachable locations warn PARTIAL scope
+    - discovery_failed warns PARTIAL scope
+    - INVALID resources (name/state) counted, not surfaced individually
+    - OUT_OF_SCOPE (non-ACTIVE) excluded silently
+    - 403/404/400/5xx/network HTTP error handling
+
+  _list_instances (internal):
+    - pagination, pageToken forwarding, pageSize
+    - unreachable[] accumulation and deduplication
+    - 404/400/5xx/network/403 error handling
+    - URL shape (project ID, locations/- wildcard, v2)
 """
 
 import warnings
@@ -41,6 +39,8 @@
 )
 
 _PROJECT = "my-project"
+_LOCATION = "us-central1"
+_INSTANCE_NAME = f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-1"
 
 
 # ---------------------------------------------------------------------------
@@ -49,7 +49,6 @@
 
 
 def _ok(body: dict = None):
-    """Build a 200 response mock with the given JSON body."""
     resp = MagicMock()
     resp.status_code = 200
     resp.json.return_value = body or {}
@@ -58,71 +57,84 @@ def _ok(body: dict = None):
 
 
 def _err(status_code: int):
-    """Build an error response mock with the given status code."""
     resp = MagicMock()
     resp.status_code = status_code
     return resp
 
 
 def _session(*responses):
-    """Build a mock session whose .get() returns responses in order."""
     mock = MagicMock()
     mock.get.side_effect = list(responses)
     return mock
 
 
-def _invoke(**kwargs):
+def _run(instances, unreachable=None, discovery_failed=False, **kwargs):
     """
-    Call find_idle_workbench_instances with a default 200/empty mock session.
-    Extra kwargs are forwarded to the rule function.
+    Invoke find_idle_workbench_instances with _list_instances patched to return
+    (instances, unreachable, discovery_failed).
     """
     with patch(
-        "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
-        return_value=_session(_ok()),
+        "cleancloud.providers.gcp.rules.ai.workbench_idle._list_instances",
+        return_value=(instances, unreachable or [], discovery_failed),
     ):
-        return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs)
+        return find_idle_workbench_instances(
+            project_id=_PROJECT, credentials=MagicMock(), **kwargs
+        )
+
+
+def _run_with_warnings(instances, unreachable=None, discovery_failed=False, **kwargs):
+    """Like _run but also captures warnings."""
+    with warnings.catch_warnings(record=True) as caught:
+        warnings.simplefilter("always")
+        result = _run(instances, unreachable, discovery_failed, **kwargs)
+    return result, [w for w in caught if issubclass(w.category, UserWarning)]
+
 
+def _active(name=_INSTANCE_NAME):
+    return {"name": name, "state": "ACTIVE"}
 
-def _invoke_with_session(mock_session, **kwargs):
-    """Call find_idle_workbench_instances with a custom session mock."""
+
+def _invoke_http(mock_session, **kwargs):
     with patch(
         "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
         return_value=mock_session,
     ):
-        return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs)
+        return find_idle_workbench_instances(
+            project_id=_PROJECT, credentials=MagicMock(), **kwargs
+        )
+
+
+def _invoke_http_with_warnings(mock_session, **kwargs):
+    with warnings.catch_warnings(record=True) as caught:
+        warnings.simplefilter("always")
+        result = _invoke_http(mock_session, **kwargs)
+    return result, [w for w in caught if issubclass(w.category, UserWarning)]
 
 
 # ---------------------------------------------------------------------------
-# Return type and value
+# Return value — always []
 # ---------------------------------------------------------------------------
 
 
 class TestReturnValue:
     def test_returns_list(self):
-        assert isinstance(_invoke(), list)
-
-    def test_always_empty(self):
-        assert _invoke() == []
-
-    def test_empty_when_api_returns_active_instances(self):
-        """EMITTING_DISABLED: ACTIVE instances in API response still yield no findings."""
-        inst = {
-            "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1",
-            "state": "ACTIVE",
-        }
-        result = _invoke_with_session(_session(_ok({"instances": [inst]})))
-        assert result == []
+        assert isinstance(_run([]), list)
+
+    def test_always_empty_no_instances(self):
+        assert _run([]) == []
+
+    def test_always_empty_with_active_instances(self):
+        assert _run([_active()]) == []
 
-    def test_empty_when_api_returns_multiple_instances(self):
+    def test_always_empty_with_multiple_active(self):
         instances = [
-            {
-                "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}",
-                "state": "ACTIVE",
-            }
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
             for i in range(5)
         ]
-        result = _invoke_with_session(_session(_ok({"instances": instances})))
-        assert result == []
+        assert _run(instances) == []
+
+    def test_always_empty_with_stopped_instances(self):
+        assert _run([{"name": _INSTANCE_NAME, "state": "STOPPED"}]) == []
 
 
 # ---------------------------------------------------------------------------
@@ -131,20 +143,16 @@ def test_empty_when_api_returns_multiple_instances(self):
 
 
 class TestIdleDaysValidation:
-    def test_zero_raises_value_error(self):
-        with pytest.raises(ValueError, match="idle_days must be >= 1"):
-            find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), idle_days=0)
-
-    def test_negative_one_raises(self):
+    def test_zero_raises(self):
         with pytest.raises(ValueError, match="idle_days must be >= 1"):
             find_idle_workbench_instances(
-                project_id=_PROJECT, credentials=MagicMock(), idle_days=-1
+                project_id=_PROJECT, credentials=MagicMock(), idle_days=0
             )
 
-    def test_large_negative_raises(self):
+    def test_negative_raises(self):
         with pytest.raises(ValueError, match="idle_days must be >= 1"):
             find_idle_workbench_instances(
-                project_id=_PROJECT, credentials=MagicMock(), idle_days=-999
+                project_id=_PROJECT, credentials=MagicMock(), idle_days=-1
             )
 
     def test_error_message_includes_bad_value(self):
@@ -154,138 +162,316 @@ def test_error_message_includes_bad_value(self):
             )
 
     def test_one_is_valid(self):
-        assert _invoke(idle_days=1) == []
+        assert _run([], idle_days=1) == []
 
     def test_default_14_is_valid(self):
-        assert _invoke() == []
+        assert _run([]) == []
 
     def test_large_value_is_valid(self):
-        assert _invoke(idle_days=365) == []
+        assert _run([], idle_days=365) == []
+
+
+# ---------------------------------------------------------------------------
+# NOT_EVALUABLE / NO_SIGNAL warning for ACTIVE candidates
+# ---------------------------------------------------------------------------
+
+
+class TestNotEvaluableWarning:
+    def test_active_instance_emits_warning(self):
+        _, warns = _run_with_warnings([_active()])
+        assert len(warns) == 1
+
+    def test_warning_is_user_warning(self):
+        _, warns = _run_with_warnings([_active()])
+        assert issubclass(warns[0].category, UserWarning)
+
+    def test_warning_mentions_no_signal(self):
+        _, warns = _run_with_warnings([_active()])
+        assert "NO_SIGNAL" in str(warns[0].message)
+
+    def test_warning_mentions_emitting_disabled(self):
+        _, warns = _run_with_warnings([_active()])
+        assert "EMITTING_DISABLED" in str(warns[0].message)
+
+    def test_warning_mentions_project(self):
+        _, warns = _run_with_warnings([_active()])
+        assert _PROJECT in str(warns[0].message)
+
+    def test_warning_mentions_resource_name(self):
+        _, warns = _run_with_warnings([_active()])
+        assert _INSTANCE_NAME in str(warns[0].message)
+
+    def test_warning_mentions_count(self):
+        instances = [
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
+            for i in range(3)
+        ]
+        _, warns = _run_with_warnings(instances)
+        assert "3" in str(warns[0].message)
+
+    def test_warning_mentions_all_resource_names_when_under_cap(self):
+        inst1 = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-1")
+        inst2 = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-2")
+        _, warns = _run_with_warnings([inst1, inst2])
+        msg = str(warns[0].message)
+        assert "wb-1" in msg
+        assert "wb-2" in msg
+
+    def test_warning_caps_name_list_at_five(self):
+        instances = [
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
+            for i in range(8)
+        ]
+        _, warns = _run_with_warnings(instances)
+        msg = str(warns[0].message)
+        assert "+3 more" in msg
+
+    def test_warning_no_overflow_label_when_at_cap(self):
+        instances = [
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
+            for i in range(5)
+        ]
+        _, warns = _run_with_warnings(instances)
+        assert "more" not in str(warns[0].message)
+
+    def test_no_warning_when_no_active_instances(self):
+        _, warns = _run_with_warnings([])
+        assert len(warns) == 0
+
+    def test_no_warning_for_stopped_instances(self):
+        _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "STOPPED"}])
+        assert len(warns) == 0
+
+
+# ---------------------------------------------------------------------------
+# PARTIAL scope warning — unreachable locations
+# ---------------------------------------------------------------------------
+
+
+class TestPartialScopeUnreachable:
+    def test_unreachable_location_emits_warning(self):
+        _, warns = _run_with_warnings([], unreachable=["asia-east1"])
+        assert len(warns) == 1
+
+    def test_warning_mentions_partial(self):
+        _, warns = _run_with_warnings([], unreachable=["asia-east1"])
+        assert "PARTIAL" in str(warns[0].message)
+
+    def test_warning_mentions_unreachable_location(self):
+        _, warns = _run_with_warnings([], unreachable=["asia-east1"])
+        assert "asia-east1" in str(warns[0].message)
+
+    def test_warning_mentions_project(self):
+        _, warns = _run_with_warnings([], unreachable=["us-east1"])
+        assert _PROJECT in str(warns[0].message)
+
+    def test_multiple_unreachable_locations_in_single_warning(self):
+        _, warns = _run_with_warnings([], unreachable=["asia-east1", "europe-west3"])
+        assert len(warns) == 1
+        msg = str(warns[0].message)
+        assert "asia-east1" in msg
+        assert "europe-west3" in msg
+
+    def test_no_partial_warning_when_no_unreachable(self):
+        _, warns = _run_with_warnings([])
+        assert len(warns) == 0
+
+
+# ---------------------------------------------------------------------------
+# PARTIAL scope warning — discovery_failed
+# ---------------------------------------------------------------------------
+
+
+class TestPartialScopeDiscoveryFailed:
+    def test_discovery_failed_emits_warning(self):
+        _, warns = _run_with_warnings([], discovery_failed=True)
+        assert len(warns) == 1
+
+    def test_warning_mentions_partial(self):
+        _, warns = _run_with_warnings([], discovery_failed=True)
+        assert "PARTIAL" in str(warns[0].message)
+
+    def test_warning_mentions_project(self):
+        _, warns = _run_with_warnings([], discovery_failed=True)
+        assert _PROJECT in str(warns[0].message)
+
+    def test_both_partial_and_not_evaluable_can_warn(self):
+        """ACTIVE instances + unreachable locations = two warnings."""
+        _, warns = _run_with_warnings([_active()], unreachable=["asia-east1"])
+        assert len(warns) == 2
+
+    def test_discovery_failed_and_active_instances_both_warn(self):
+        _, warns = _run_with_warnings([_active()], discovery_failed=True)
+        assert len(warns) == 2
+
+
+# ---------------------------------------------------------------------------
+# INVALID resource classification
+# ---------------------------------------------------------------------------
+
+
+class TestInvalidResources:
+    def test_empty_name_emits_invalid_warning(self):
+        inst = {"name": "", "state": "ACTIVE"}
+        _, warns = _run_with_warnings([inst])
+        assert any("INVALID" in str(w.message) for w in warns)
+
+    def test_missing_name_emits_invalid_warning(self):
+        inst = {"state": "ACTIVE"}
+        _, warns = _run_with_warnings([inst])
+        assert any("INVALID" in str(w.message) for w in warns)
+
+    def test_wrong_pattern_emits_invalid_warning(self):
+        inst = {"name": "projects/p/locations/l/wrongType/id", "state": "ACTIVE"}
+        _, warns = _run_with_warnings([inst])
+        assert any("INVALID" in str(w.message) for w in warns)
+
+    def test_extra_path_segments_emits_invalid_warning(self):
+        inst = {"name": f"{_INSTANCE_NAME}/extra", "state": "ACTIVE"}
+        _, warns = _run_with_warnings([inst])
+        assert any("INVALID" in str(w.message) for w in warns)
+
+    def test_empty_state_emits_invalid_warning(self):
+        inst = {"name": _INSTANCE_NAME, "state": ""}
+        _, warns = _run_with_warnings([inst])
+        assert any("INVALID" in str(w.message) for w in warns)
+
+    def test_missing_state_emits_invalid_warning(self):
+        inst = {"name": _INSTANCE_NAME}
+        _, warns = _run_with_warnings([inst])
+        assert any("INVALID" in str(w.message) for w in warns)
+
+    def test_invalid_warning_mentions_count(self):
+        bad1 = {"name": "", "state": "ACTIVE"}
+        bad2 = {"name": _INSTANCE_NAME, "state": ""}
+        _, warns = _run_with_warnings([bad1, bad2])
+        invalid_warn = next(w for w in warns if "INVALID" in str(w.message))
+        assert "2" in str(invalid_warn.message)
+
+    def test_invalid_warning_mentions_project(self):
+        inst = {"name": "", "state": "ACTIVE"}
+        _, warns = _run_with_warnings([inst])
+        invalid_warn = next(w for w in warns if "INVALID" in str(w.message))
+        assert _PROJECT in str(invalid_warn.message)
+
+    def test_no_invalid_warning_when_all_valid(self):
+        _, warns = _run_with_warnings([_active()])
+        assert not any("INVALID" in str(w.message) for w in warns)
+
+    def test_invalid_not_in_not_evaluable_candidates(self):
+        """INVALID records must not appear in the NOT_EVALUABLE warning."""
+        bad = {"name": "", "state": "ACTIVE"}
+        good = _active()
+        _, warns = _run_with_warnings([bad, good])
+        not_eval_warn = next(w for w in warns if "NO_SIGNAL" in str(w.message))
+        assert _INSTANCE_NAME in str(not_eval_warn.message)
+        # count should be 1 (only the valid ACTIVE instance)
+        assert "1 ACTIVE" in str(not_eval_warn.message)
+
+
+# ---------------------------------------------------------------------------
+# OUT_OF_SCOPE resources — non-ACTIVE states
+# ---------------------------------------------------------------------------
+
+
+class TestOutOfScope:
+    def test_stopped_excluded_silently(self):
+        _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "STOPPED"}])
+        assert len(warns) == 0
+
+    def test_suspended_excluded_silently(self):
+        _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "SUSPENDED"}])
+        assert len(warns) == 0
+
+    def test_provisioning_excluded_silently(self):
+        _, warns = _run_with_warnings(
+            [{"name": _INSTANCE_NAME, "state": "PROVISIONING"}]
+        )
+        assert len(warns) == 0
+
+    def test_mixed_active_and_stopped_only_active_warned(self):
+        active = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/active")
+        stopped = {"name": f"projects/{_PROJECT}/locations/{_LOCATION}/instances/stopped",
+                   "state": "STOPPED"}
+        _, warns = _run_with_warnings([active, stopped])
+        assert len(warns) == 1
+        msg = str(warns[0].message)
+        assert "active" in msg
+        assert "stopped" not in msg
 
 
 # ---------------------------------------------------------------------------
-# region_filter parameter
+# Region filter — exact string equality, no case folding
 # ---------------------------------------------------------------------------
 
 
 class TestRegionFilter:
-    def test_region_filter_string_accepted(self):
-        assert _invoke(region_filter="us-central1") == []
+    def test_matching_region_produces_warning(self):
+        _, warns = _run_with_warnings([_active()], region_filter=_LOCATION)
+        assert len(warns) == 1
+
+    def test_non_matching_region_excluded(self):
+        _, warns = _run_with_warnings([_active()], region_filter="europe-west1")
+        assert len(warns) == 0
 
-    def test_region_filter_none_accepted(self):
-        assert _invoke(region_filter=None) == []
+    def test_uppercase_region_does_not_match(self):
+        """Exact equality — 'US-CENTRAL1' must NOT match 'us-central1'."""
+        _, warns = _run_with_warnings([_active()], region_filter="US-CENTRAL1")
+        assert len(warns) == 0
+
+    def test_none_region_filter_includes_all(self):
+        inst1 = _active("projects/p/locations/europe-west1/instances/i1")
+        inst2 = _active("projects/p/locations/us-central1/instances/i2")
+        _, warns = _run_with_warnings([inst1, inst2], region_filter=None)
+        assert len(warns) == 1
+        msg = str(warns[0].message)
+        assert "i1" in msg
+        assert "i2" in msg
 
 
 # ---------------------------------------------------------------------------
-# HTTP error handling via public API
+# HTTP error handling (via AuthorizedSession)
 # ---------------------------------------------------------------------------
 
 
 class TestHttpErrors:
     def test_403_raises_permission_error(self):
-        with pytest.raises(PermissionError):
-            _invoke_with_session(_session(_err(403)))
-
-    def test_403_message_mentions_permission(self):
         with pytest.raises(PermissionError, match="notebooks.instances.list"):
-            _invoke_with_session(_session(_err(403)))
+            _invoke_http(_session(_err(403)))
 
-    def test_403_message_mentions_role(self):
+    def test_403_mentions_role(self):
         with pytest.raises(PermissionError, match="roles/notebooks.viewer"):
-            _invoke_with_session(_session(_err(403)))
-
-    def test_404_returns_empty_list(self):
-        assert _invoke_with_session(_session(_err(404))) == []
-
-    def test_404_no_warning_emitted(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(404)))
-        assert not any(issubclass(w.category, UserWarning) for w in caught)
-
-    def test_400_returns_empty_list(self):
-        assert _invoke_with_session(_session(_err(400))) == []
-
-    def test_400_emits_user_warning(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(400)))
-        assert any(issubclass(w.category, UserWarning) for w in caught)
-
-    def test_400_warning_mentions_status_code(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(400)))
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert "400" in msgs
-
-    def test_400_warning_mentions_project(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(400)))
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert _PROJECT in msgs
-
-    def test_500_returns_empty_list(self):
-        assert _invoke_with_session(_session(_err(500))) == []
-
-    def test_500_emits_user_warning(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(500)))
-        assert any(issubclass(w.category, UserWarning) for w in caught)
-
-    def test_500_warning_mentions_status_code(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(500)))
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert "500" in msgs
-
-    def test_503_warning_mentions_status_code(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(503)))
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert "503" in msgs
-
-    def test_5xx_warning_mentions_project(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(_session(_err(500)))
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert _PROJECT in msgs
-
-    def test_network_error_returns_empty_list(self):
-        session = MagicMock()
-        session.get.side_effect = ConnectionError("timeout")
-        assert _invoke_with_session(session) == []
+            _invoke_http(_session(_err(403)))
 
-    def test_network_error_emits_user_warning(self):
-        session = MagicMock()
-        session.get.side_effect = ConnectionError("timeout")
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(session)
-        assert any(issubclass(w.category, UserWarning) for w in caught)
+    def test_404_returns_empty_no_partial_warning(self):
+        result, warns = _invoke_http_with_warnings(_session(_err(404)))
+        assert result == []
+        assert len(warns) == 0
 
-    def test_network_error_warning_mentions_project(self):
-        session = MagicMock()
-        session.get.side_effect = OSError("no route to host")
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(session)
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert _PROJECT in msgs
-
-    def test_network_error_warning_mentions_exception_type(self):
+    def test_400_returns_empty_with_partial_warning(self):
+        result, warns = _invoke_http_with_warnings(_session(_err(400)))
+        assert result == []
+        # _list_instances warns about 400, caller warns about PARTIAL
+        msgs = " ".join(str(w.message) for w in warns)
+        assert "400" in msgs or "PARTIAL" in msgs
+
+    def test_500_returns_empty_with_warning(self):
+        result, warns = _invoke_http_with_warnings(_session(_err(500)))
+        assert result == []
+        msgs = " ".join(str(w.message) for w in warns)
+        assert "500" in msgs or "PARTIAL" in msgs
+
+    def test_503_returns_empty_with_warning(self):
+        result, warns = _invoke_http_with_warnings(_session(_err(503)))
+        assert result == []
+        assert len(warns) >= 1
+
+    def test_network_error_returns_empty_with_warning(self):
         session = MagicMock()
-        session.get.side_effect = ConnectionError("dropped")
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _invoke_with_session(session)
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert "ConnectionError" in msgs
+        session.get.side_effect = ConnectionError("timeout")
+        result, warns = _invoke_http_with_warnings(session)
+        assert result == []
+        assert len(warns) >= 1
 
 
 # ---------------------------------------------------------------------------
@@ -296,52 +482,38 @@ def test_network_error_warning_mentions_exception_type(self):
 class TestListInstancesBasic:
     def test_empty_response(self):
         instances, unreachable, failed = _list_instances(_session(_ok()), _PROJECT)
-        assert instances == []
-        assert unreachable == []
-        assert failed is False
+        assert instances == [] and unreachable == [] and failed is False
 
     def test_instances_returned(self):
-        inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+        inst = {"name": _INSTANCE_NAME, "state": "ACTIVE"}
         instances, _, _ = _list_instances(_session(_ok({"instances": [inst]})), _PROJECT)
         assert instances == [inst]
 
-    def test_multiple_instances_in_single_page(self):
-        inst_list = [
-            {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"}
-            for i in range(3)
-        ]
-        instances, _, _ = _list_instances(_session(_ok({"instances": inst_list})), _PROJECT)
-        assert instances == inst_list
-
     def test_page_size_100_in_initial_request(self):
         session = _session(_ok())
         _list_instances(session, _PROJECT)
-        params = session.get.call_args.kwargs["params"]
-        assert params["pageSize"] == 100
+        assert session.get.call_args.kwargs["params"]["pageSize"] == 100
 
     def test_url_contains_project_id(self):
         session = _session(_ok())
         _list_instances(session, "target-project-xyz")
-        url = session.get.call_args.args[0]
-        assert "target-project-xyz" in url
+        assert "target-project-xyz" in session.get.call_args.args[0]
 
     def test_url_uses_wildcard_location(self):
         session = _session(_ok())
         _list_instances(session, _PROJECT)
-        url = session.get.call_args.args[0]
-        assert "locations/-" in url
+        assert "locations/-" in session.get.call_args.args[0]
 
     def test_url_uses_v2_api(self):
         session = _session(_ok())
         _list_instances(session, _PROJECT)
-        url = session.get.call_args.args[0]
-        assert "/v2/" in url
+        assert "/v2/" in session.get.call_args.args[0]
 
 
 class TestListInstancesPagination:
     def test_two_pages_accumulates_instances(self):
-        inst1 = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
-        inst2 = {"name": "projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"}
+        inst1 = {"name": f"projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+        inst2 = {"name": f"projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"}
         session = _session(
             _ok({"instances": [inst1], "nextPageToken": "tok1"}),
             _ok({"instances": [inst2]}),
@@ -350,62 +522,31 @@ def test_two_pages_accumulates_instances(self):
         assert instances == [inst1, inst2]
 
     def test_three_pages_all_accumulated(self):
-        def _inst(i):
-            return {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"}
-
-        session = _session(
-            _ok({"instances": [_inst(1)], "nextPageToken": "t1"}),
-            _ok({"instances": [_inst(2)], "nextPageToken": "t2"}),
-            _ok({"instances": [_inst(3)]}),
-        )
-        instances, _, _ = _list_instances(session, _PROJECT)
+        pages = [
+            _ok({"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}],
+                 "nextPageToken": f"t{i}"} if i < 2 else
+                {"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}]})
+            for i in range(3)
+        ]
+        instances, _, _ = _list_instances(_session(*pages), _PROJECT)
         assert len(instances) == 3
 
     def test_page_token_forwarded_on_second_request(self):
-        session = _session(
-            _ok({"nextPageToken": "tok-abc"}),
-            _ok({}),
-        )
-        _list_instances(session, _PROJECT)
-        second_params = session.get.call_args_list[1].kwargs["params"]
-        assert second_params.get("pageToken") == "tok-abc"
-
-    def test_page_token_forwarded_on_third_request(self):
-        session = _session(
-            _ok({"nextPageToken": "t1"}),
-            _ok({"nextPageToken": "t2"}),
-            _ok({}),
-        )
+        session = _session(_ok({"nextPageToken": "tok-abc"}), _ok({}))
         _list_instances(session, _PROJECT)
-        third_params = session.get.call_args_list[2].kwargs["params"]
-        assert third_params.get("pageToken") == "t2"
+        assert session.get.call_args_list[1].kwargs["params"]["pageToken"] == "tok-abc"
 
     def test_stops_when_no_next_token(self):
         session = _session(_ok({}))
         _list_instances(session, _PROJECT)
         assert session.get.call_count == 1
 
-    def test_exactly_two_calls_for_two_pages(self):
-        session = _session(
-            _ok({"nextPageToken": "t1"}),
-            _ok({}),
-        )
-        _list_instances(session, _PROJECT)
-        assert session.get.call_count == 2
-
 
 class TestListInstancesUnreachable:
-    def test_single_unreachable_location_collected(self):
-        session = _session(_ok({"unreachable": ["asia-east1"]}))
-        _, unreachable, _ = _list_instances(session, _PROJECT)
+    def test_unreachable_location_collected(self):
+        _, unreachable, _ = _list_instances(_session(_ok({"unreachable": ["asia-east1"]})), _PROJECT)
         assert "asia-east1" in unreachable
 
-    def test_multiple_unreachable_locations(self):
-        session = _session(_ok({"unreachable": ["asia-east1", "europe-west3"]}))
-        _, unreachable, _ = _list_instances(session, _PROJECT)
-        assert "asia-east1" in unreachable
-        assert "europe-west3" in unreachable
-
     def test_unreachable_deduplicated_across_pages(self):
         session = _session(
             _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}),
@@ -415,24 +556,10 @@ def test_unreachable_deduplicated_across_pages(self):
         assert unreachable.count("asia-east1") == 1
 
     def test_empty_string_in_unreachable_skipped(self):
-        session = _session(_ok({"unreachable": ["", "us-east1"]}))
-        _, unreachable, _ = _list_instances(session, _PROJECT)
-        assert "" not in unreachable
-        assert "us-east1" in unreachable
-
-    def test_no_unreachable_when_field_absent(self):
-        session = _session(_ok({}))
-        _, unreachable, _ = _list_instances(session, _PROJECT)
-        assert unreachable == []
-
-    def test_unreachable_from_multiple_pages_merged(self):
-        session = _session(
-            _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}),
-            _ok({"unreachable": ["europe-west3"]}),
+        _, unreachable, _ = _list_instances(
+            _session(_ok({"unreachable": ["", "us-east1"]})), _PROJECT
         )
-        _, unreachable, _ = _list_instances(session, _PROJECT)
-        assert "asia-east1" in unreachable
-        assert "europe-west3" in unreachable
+        assert "" not in unreachable and "us-east1" in unreachable
 
 
 class TestListInstancesErrors:
@@ -440,36 +567,26 @@ def test_403_raises_permission_error(self):
         with pytest.raises(PermissionError, match="notebooks.instances.list"):
             _list_instances(_session(_err(403)), _PROJECT)
 
-    def test_404_returns_empty_clean(self):
+    def test_404_returns_clean_empty(self):
         instances, unreachable, failed = _list_instances(_session(_err(404)), _PROJECT)
-        assert instances == []
-        assert unreachable == []
-        assert failed is False
-
-    def test_400_returns_empty_with_discovery_failed(self):
-        instances, unreachable, failed = _list_instances(_session(_err(400)), _PROJECT)
-        assert instances == []
-        assert unreachable == []
+        assert instances == [] and unreachable == [] and failed is False
+
+    def test_400_sets_discovery_failed(self):
+        _, _, failed = _list_instances(_session(_err(400)), _PROJECT)
         assert failed is True
 
     def test_500_sets_discovery_failed(self):
         _, _, failed = _list_instances(_session(_err(500)), _PROJECT)
         assert failed is True
 
-    def test_503_sets_discovery_failed(self):
-        _, _, failed = _list_instances(_session(_err(503)), _PROJECT)
-        assert failed is True
-
-    def test_5xx_preserves_instances_from_earlier_pages(self):
-        """Instances already fetched before a 5xx error must be returned."""
-        inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+    def test_5xx_preserves_earlier_instances(self):
+        inst = {"name": _INSTANCE_NAME, "state": "ACTIVE"}
         session = _session(
             _ok({"instances": [inst], "nextPageToken": "t1"}),
             _err(503),
         )
         instances, _, failed = _list_instances(session, _PROJECT)
-        assert instances == [inst]
-        assert failed is True
+        assert instances == [inst] and failed is True
 
     def test_network_error_sets_discovery_failed(self):
         session = MagicMock()
@@ -478,40 +595,14 @@ def test_network_error_sets_discovery_failed(self):
         assert failed is True
 
     def test_network_error_preserves_earlier_instances(self):
-        inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
-        session = _session(
-            _ok({"instances": [inst], "nextPageToken": "t1"}),
-        )
+        inst = {"name": _INSTANCE_NAME, "state": "ACTIVE"}
+        session = MagicMock()
         session.get.side_effect = [
             _ok({"instances": [inst], "nextPageToken": "t1"}),
             ConnectionError("dropped"),
         ]
         instances, _, failed = _list_instances(session, _PROJECT)
-        assert instances == [inst]
-        assert failed is True
-
-    def test_400_emits_warning_with_project(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _list_instances(_session(_err(400)), _PROJECT)
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert _PROJECT in msgs
-
-    def test_500_emits_warning_with_status_code(self):
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _list_instances(_session(_err(500)), _PROJECT)
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert "500" in msgs
-
-    def test_network_error_emits_warning_with_project(self):
-        session = MagicMock()
-        session.get.side_effect = OSError("no route to host")
-        with warnings.catch_warnings(record=True) as caught:
-            warnings.simplefilter("always")
-            _list_instances(session, _PROJECT)
-        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
-        assert _PROJECT in msgs
+        assert instances == [inst] and failed is True
 
 
 # ---------------------------------------------------------------------------
@@ -532,5 +623,5 @@ def test_service(self):
     def test_cost_impact(self):
         assert RULE_METADATA["cost_impact"] == "high"
 
-    def test_rule_id_attribute_on_function(self):
+    def test_rule_id_attribute(self):
         assert find_idle_workbench_instances.RULE_ID == "gcp.vertex.workbench.idle"

From 099f542e2adbdbc0be9e2c586b36a7e3689b94c2 Mon Sep 17 00:00:00 2001
From: javvaji-devops <venkata.javvaji.91@gmail.com>
Date: Wed, 6 May 2026 21:44:50 +0100
Subject: [PATCH 2/3] aws.ec2.gpu.idle

---
 .../providers/aws/rules/ai/ec2_gpu_idle.py    | 153 ++++---
 .../providers/gcp/rules/ai/workbench_idle.py  |  49 +-
 docs/rules/aws.md                             |   8 +-
 docs/specs/aws/ai/ec2_gpu_idle.md             | 433 ++++++++++++++++++
 .../providers/aws/ai/test_aws_ec2_gpu_idle.py | 239 +++++++++-
 .../gcp/ai/test_gcp_workbench_idle.py         | 133 ++++--
 6 files changed, 884 insertions(+), 131 deletions(-)
 create mode 100644 docs/specs/aws/ai/ec2_gpu_idle.md

diff --git a/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py b/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py
index 818a152..66e4dea 100644
--- a/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py
+++ b/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py
@@ -132,7 +132,7 @@
 }
 _DEFAULT_MONTHLY_COST = 600.0
 
-# GPU utilisation thresholds
+# GPU utilization thresholds
 _GPU_UTIL_THRESHOLD_PCT = 5.0  # below this = idle (when GPU metric available)
 _CPU_UTIL_THRESHOLD_PCT = 10.0  # below this = idle (CPU fallback)
 
@@ -151,7 +151,7 @@ def find_idle_gpu_instances(
     cpu_threshold: float = _CPU_UTIL_THRESHOLD_PCT,
 ) -> List[Finding]:
     """
-    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilisation.
+    Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilization.
 
     GPU instances (raw EC2, outside SageMaker) incur continuous charges while running
     regardless of whether GPUs are being utilised. A p4d.24xlarge costs ~$23K/month
@@ -162,15 +162,19 @@ def find_idle_gpu_instances(
     - Instance state is running
     - Instance type is a known GPU/accelerator family
     - Instance is older than idle_days (avoids flagging newly launched instances)
-    - GPU utilisation < gpu_threshold % over idle_days (HIGH confidence, when NVIDIA
-      CloudWatch agent publishes nvidia_smi_utilization_gpu under CWAgent namespace)
-    - OR CPU utilisation < cpu_threshold % over idle_days (MEDIUM confidence fallback,
+    - GPU utilization < gpu_threshold % over idle_days (HIGH confidence, when the
+      nvidia_smi_utilization_gpu metric is discoverable for the instance in CloudWatch)
+    - OR CPU utilization < cpu_threshold % over idle_days (MEDIUM confidence fallback,
       used when GPU metrics are not available — CPU alone is a weaker signal)
 
     GPU metric detection:
-    The NVIDIA CloudWatch agent publishes nvidia_smi_utilization_gpu under the CWAgent
-    namespace with an InstanceId dimension. Availability is probed via ListMetrics per
-    instance — not assumed. Instances without the agent fall back to CPU utilisation.
+    The rule probes CloudWatch ListMetrics for nvidia_smi_utilization_gpu in the CWAgent
+    namespace, filtered by InstanceId dimension. This depends on the CloudWatch agent
+    being installed and configured to append EC2 instance dimensions (e.g. via
+    append_dimensions = {"InstanceId": ...}). AWS does not guarantee the InstanceId
+    dimension is present by default; its presence is implementation-dependent. If the
+    metric is absent or the agent is misconfigured, the rule falls back to CPU
+    utilization. Absence of the metric is NOT proof the GPU is idle.
 
     Multi-GPU handling:
     For multi-GPU instances (e.g., p4d.24xlarge has 8 A100s), the MAX statistic is
@@ -178,8 +182,8 @@ def find_idle_gpu_instances(
     would be averaged away using AVG, producing a misleadingly low reading.
 
     Confidence:
-    - HIGH: GPU metric available AND max GPU utilisation < gpu_threshold over idle_days
-    - MEDIUM: GPU metric unavailable, CPU utilisation < cpu_threshold over idle_days
+    - HIGH: GPU metric discoverable AND max GPU utilization < gpu_threshold over idle_days
+    - MEDIUM: GPU metric not discoverable; CPU utilization < cpu_threshold over idle_days
 
     IAM permissions:
     - ec2:DescribeInstances
@@ -208,53 +212,62 @@ def find_idle_gpu_instances(
                     if not _is_gpu_instance(instance_type):
                         continue
 
-                    instance_id = inst["InstanceId"]
+                    # Normalize InstanceId — skip if missing or empty (spec section 5)
+                    instance_id = (inst.get("InstanceId") or "").strip()
+                    if not instance_id:
+                        continue
+
                     tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
                     # "spot" | "scheduled" | None (on-demand)
                     instance_lifecycle = inst.get("InstanceLifecycle")
                     purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand"
-                    launch_time = inst.get("LaunchTime")
-
-                    age_days: Optional[int] = None
-                    if launch_time:
-                        if launch_time.tzinfo is None:
-                            launch_time = launch_time.replace(tzinfo=timezone.utc)
-                        age_days = (now - launch_time).days
 
-                    # Skip instances younger than idle_days — too new to classify
-                    if age_days is not None and age_days < idle_days:
+                    # Normalize LaunchTime — skip if missing, naive, or future (spec section 5, section 8.3)
+                    launch_time = inst.get("LaunchTime")
+                    if not launch_time:
+                        continue  # missing LaunchTime → SKIP ITEM
+                    if launch_time.tzinfo is None:
+                        continue  # naive timestamp is not tz-aware UTC → SKIP ITEM
+                    age_days = (now - launch_time).days
+                    if age_days < 0:
+                        continue  # future LaunchTime → SKIP ITEM
+
+                    # Skip instances younger than effective_idle_days — too new to classify
+                    if age_days < idle_days:
                         continue
 
                     # Probe for GPU metrics — single ListMetrics call reused for stats
                     gpu_metrics = _list_gpu_metrics(cloudwatch, instance_id)
 
                     if gpu_metrics:
-                        max_gpu_util = _get_max_gpu_utilisation(
+                        max_gpu_util = _get_max_gpu_utilization(
                             cloudwatch, gpu_metrics, idle_days, now
                         )
                         if max_gpu_util is None or max_gpu_util >= gpu_threshold:
                             continue
                         confidence = ConfidenceLevel.HIGH
-                        idle_signal = "gpu_utilisation"
+                        idle_signal = "gpu_utilization"
                         util_value = max_gpu_util
-                        util_label = f"Max GPU utilisation: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
+                        util_label = f"Max GPU utilization: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
                     else:
-                        avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
-                        if avg_cpu is None or avg_cpu >= cpu_threshold:
+                        max_cpu = _get_max_daily_cpu_utilization(
+                            cloudwatch, instance_id, idle_days, now
+                        )
+                        if max_cpu is None or max_cpu >= cpu_threshold:
                             continue
                         # CPU fallback is a weak heuristic for GPU workloads:
-                        # accelerator utilisation is invisible to CPU metrics, so a GPU
+                        # accelerator utilization is invisible to CPU metrics, so a GPU
                         # instance running a compute-bound model can show near-zero CPU
                         # while doing real work. Confidence is capped at MEDIUM to reflect
                         # this limitation. Absence of the CWAgent GPU metric is NOT proof
-                        # that the GPU is idle — the agent may simply not be installed.
+                        # that the GPU is idle — the agent may be absent or misconfigured.
                         confidence = ConfidenceLevel.MEDIUM
-                        idle_signal = "cpu_utilisation_fallback"
-                        util_value = avg_cpu
+                        idle_signal = "cpu_utilization_fallback"
+                        util_value = max_cpu
                         util_label = (
-                            f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
+                            f"Peak daily CPU utilization: {max_cpu:.1f}% "
                             f"(threshold: {cpu_threshold}%) — "
-                            f"heuristic only; GPU/accelerator utilisation not directly measured"
+                            f"heuristic only; GPU/accelerator utilization not directly measured"
                         )
 
                     monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
@@ -272,15 +285,14 @@ def find_idle_gpu_instances(
                         f"Instance type: {instance_type} (GPU/accelerator family)",
                         f"Purchasing model: {purchasing_model}",
                         util_label,
+                        f"Instance age: {age_days} days",
                     ]
-                    if age_days is not None:
-                        signals.append(f"Instance age: {age_days} days")
                     if not gpu_metrics:
                         if _is_neuron_instance(instance_type):
                             signals.append(
                                 "Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
                                 "applicable; CPU used as heuristic fallback; confidence MEDIUM. "
-                                "Neuron utilisation requires AWS Neuron SDK metrics."
+                                "Neuron utilization requires AWS Neuron SDK metrics."
                             )
                         else:
                             signals.append(
@@ -289,11 +301,18 @@ def find_idle_gpu_instances(
                                 "the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
                             )
 
+                    # signals_not_checked: GPU note only applies on CPU fallback path (spec section 11.1)
                     not_checked = [
-                        "GPU/accelerator utilisation (not directly measurable without CWAgent)",
                         "Scheduled batch jobs that run outside the observation window",
                         "Planned future use",
                     ]
+                    if not gpu_metrics:
+                        not_checked.insert(
+                            0,
+                            "Direct GPU/accelerator utilization — nvidia_smi_utilization_gpu was not "
+                            "discoverable in CloudWatch (CWAgent may be absent or not configured with "
+                            "InstanceId dimension); absence of the metric does not confirm the GPU is idle",
+                        )
                     if purchasing_model == "spot":
                         not_checked.append(
                             "Spot interruption history — Spot instances may appear idle "
@@ -307,6 +326,17 @@ def find_idle_gpu_instances(
                     )
 
                     metric_label = "GPU" if gpu_metrics else "CPU (fallback)"
+                    if gpu_metrics:
+                        reason = (
+                            f"GPU EC2 instance has low GPU utilization "
+                            f"({util_value:.1f}%) over {idle_days} days"
+                        )
+                    else:
+                        reason = (
+                            f"GPU EC2 instance shows low CPU proxy signal "
+                            f"({util_value:.1f}%) over {idle_days} days — "
+                            f"GPU activity not directly measured"
+                        )
                     findings.append(
                         Finding(
                             provider="aws",
@@ -316,22 +346,18 @@ def find_idle_gpu_instances(
                             region=region,
                             estimated_monthly_cost_usd=monthly_cost,
                             title=(
-                                f"Idle GPU EC2 Instance ({metric_label} utilisation "
+                                f"Idle GPU EC2 Instance ({metric_label} utilization "
                                 f"<{gpu_threshold if gpu_metrics else cpu_threshold}% "
                                 f"over {idle_days} days)"
                             ),
                             summary=(
                                 f"EC2 instance '{name_tag}' ({instance_type}) has had "
-                                f"{'GPU' if gpu_metrics else 'CPU'} utilisation below "
+                                f"{'GPU' if gpu_metrics else 'CPU'} utilization below "
                                 f"{gpu_threshold if gpu_metrics else cpu_threshold}% "
                                 f"for {idle_days} days while running, incurring "
                                 f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)."
                             ),
-                            reason=(
-                                f"GPU EC2 instance has low "
-                                f"{'GPU' if gpu_metrics else 'CPU'} utilisation "
-                                f"({util_value:.1f}%) for {idle_days} days"
-                            ),
+                            reason=reason,
                             risk=risk,
                             confidence=confidence,
                             detected_at=now,
@@ -340,11 +366,11 @@ def find_idle_gpu_instances(
                                 "instance_id": instance_id,
                                 "instance_type": instance_type,
                                 "name": name_tag,
-                                "age_days": (age_days if age_days is not None else "unknown"),
+                                "age_days": age_days,
                                 "idle_days_threshold": idle_days,
                                 "idle_ratio": idle_ratio,
                                 "idle_signal": idle_signal,
-                                "utilisation_pct": round(util_value, 2),
+                                "utilization_pct": round(util_value, 2),
                                 "purchasing_model": purchasing_model,
                                 "gpu_metric_available": bool(gpu_metrics),
                                 "gpu_metric_note": (
@@ -387,25 +413,34 @@ def _list_gpu_metrics(cloudwatch, instance_id: str) -> list:
     """
     Probe CloudWatch ListMetrics for nvidia_smi_utilization_gpu under CWAgent namespace.
 
-    Returns the Metrics list (one entry per GPU index) so the caller can reuse it
+    Exhausts pagination via NextToken (spec section 2 key fact 6: ListMetrics returns up to
+    500 results per call). Returns all Metrics entries so the caller can reuse them
     for GetMetricStatistics without a second ListMetrics call. Returns [] on any error.
     """
+    metrics: list = []
+    kwargs: dict = {
+        "Namespace": "CWAgent",
+        "MetricName": "nvidia_smi_utilization_gpu",
+        "Dimensions": [{"Name": "InstanceId", "Value": instance_id}],
+    }
     try:
-        resp = cloudwatch.list_metrics(
-            Namespace="CWAgent",
-            MetricName="nvidia_smi_utilization_gpu",
-            Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
-        )
-        return resp.get("Metrics", [])
+        while True:
+            resp = cloudwatch.list_metrics(**kwargs)
+            metrics.extend(resp.get("Metrics", []))
+            next_token = resp.get("NextToken")
+            if not next_token:
+                break
+            kwargs["NextToken"] = next_token
     except Exception:
         return []
+    return metrics
 
 
-def _get_max_gpu_utilisation(
+def _get_max_gpu_utilization(
     cloudwatch, gpu_metrics: list, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the maximum GPU utilisation across all GPU indices over the window.
+    Return the maximum GPU utilization across all GPU indices over the window.
 
     Takes the gpu_metrics list already fetched by _list_gpu_metrics — no second
     ListMetrics call. Uses MAX statistic so a single active GPU on a multi-GPU
@@ -440,17 +475,17 @@ def _get_max_gpu_utilisation(
     return max_util
 
 
-def _get_avg_cpu_utilisation(
+def _get_max_daily_cpu_utilization(
     cloudwatch, instance_id: str, days: int, now: datetime
 ) -> Optional[float]:
     """
-    Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization.
+    Return the maximum daily CPU peak over the window using AWS/EC2 CPUUtilization.
 
-    Uses Maximum statistic per day and returns the highest daily peak. This avoids
-    flagging burst workloads where a short but significant CPU spike would be averaged
-    away — if the max CPU across any day is below threshold, the instance is truly idle.
+    Uses Maximum statistic at daily (86400s) period and returns the highest value
+    across all returned datapoints (spec section 6.2). This avoids flagging burst workloads
+    where a short but significant CPU spike would be averaged away.
 
-    Returns None on error — caller treats None as "not idle" (safe default).
+    Returns None on error or no datapoints — caller treats None as "not idle" (safe default).
     """
     start = now - timedelta(days=days)
     try:
diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
index 78f4572..bc94624 100644
--- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py
+++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
@@ -85,24 +85,25 @@ def find_idle_workbench_instances(
         raise ValueError(f"idle_days must be >= 1, got {idle_days!r}")
 
     session = AuthorizedSession(credentials)
-    raw_instances, unreachable_locations, discovery_failed = _list_instances(
-        session, project_id
-    )
+    raw_instances, unreachable_locations, discovery_failed = _list_instances(session, project_id)
 
     # --- Partial scan scope ---
-    # The spec explicitly covers unreachable[] from the API as a PARTIAL trigger.
-    # discovery_failed (400/5xx/network) is an implementation extension: those
-    # errors make enumeration incomplete in the same way, so PARTIAL is the
-    # conservative and correct choice even though the spec does not spell it out.
-    if unreachable_locations or discovery_failed:
-        parts = []
-        if unreachable_locations:
-            parts.append(f"unreachable locations: {unreachable_locations}")
-        if discovery_failed:
-            parts.append("discovery incomplete due to transport/server error")
+    # The spec defines PARTIAL exclusively from discovery-layer reachability:
+    # unreachable[] locations reported by the API (section 12.1, 12.2.7).
+    # 400/5xx/network errors (discovery_failed) make enumeration incomplete but
+    # are NOT defined as PARTIAL by the spec; they get a separate warning.
+    if unreachable_locations:
         warnings.warn(
             f"gcp.vertex.workbench.idle: scan scope PARTIAL for project '{project_id}'"
-            f" — {'; '.join(parts)}",
+            f" — unreachable locations: {unreachable_locations}",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    if discovery_failed:
+        warnings.warn(
+            f"gcp.vertex.workbench.idle: discovery incomplete for project '{project_id}'"
+            f" — transport or server error encountered; scan results may be incomplete",
             UserWarning,
             stacklevel=2,
         )
@@ -154,9 +155,9 @@ def find_idle_workbench_instances(
     # Warn with a capped list of resource names (first 5 + overflow count) so
     # the warning stays readable even in large projects.
     if candidate_resources:
-        _CAP = 5
-        shown = [r["name"] for r in candidate_resources[:_CAP]]
-        overflow = len(candidate_resources) - _CAP
+        _cap = 5
+        shown = [r["name"] for r in candidate_resources[:_cap]]
+        overflow = len(candidate_resources) - _cap
         name_summary = ", ".join(shown)
         if overflow > 0:
             name_summary += f", ... (+{overflow} more)"
@@ -225,9 +226,17 @@ def _list_instances(
             )
 
         if resp.status_code == 404:
-            # Implementation choice: 404 = Notebooks API not enabled for this
-            # project, meaning provably no instances exist. Treated as a clean
-            # empty scope (FULL, EVALUABLE). Not explicitly defined in the spec.
+            # 404 = Notebooks API not enabled for this project.
+            # Assumption: if the API is not enabled, no instances can exist.
+            # Treated as a clean empty scope (FULL, EVALUABLE).
+            # A warning is emitted so callers are aware of the assumption.
+            warnings.warn(
+                f"gcp.vertex.workbench.idle: Notebooks API not enabled for project"
+                f" '{project_id}' (HTTP 404) — assuming no instances exist;"
+                f" scope treated as FULL",
+                UserWarning,
+                stacklevel=3,
+            )
             return [], [], False
 
         if resp.status_code == 400:
diff --git a/docs/rules/aws.md b/docs/rules/aws.md
index 82adc6e..402a756 100644
--- a/docs/rules/aws.md
+++ b/docs/rules/aws.md
@@ -250,17 +250,17 @@
 **Spec:** [specs/aws/ai/sagemaker_notebook_idle.md](../specs/aws/ai/sagemaker_notebook_idle.md)
 
 #### `aws.ec2.gpu.idle`
-**Detects:** EC2 GPU/accelerator instances (p/g/trn/inf/dl families) with <5% GPU utilization or <10% CPU over `idle_days`
+**Detects:** EC2 GPU/accelerator instances (p/g/trn/inf/dl families) with max GPU utilization < 5% or max daily CPU < 10% over `idle_days`
 
-**Confidence / Risk:** HIGH (NVIDIA CW agent present, max GPU < 5%); MEDIUM (no NVIDIA agent, avg CPU < 10%) / CRITICAL (`idle_ratio ≥ 2.0`); HIGH (all other GPU/accelerator instances)
+**Confidence / Risk:** HIGH (`nvidia_smi_utilization_gpu` discoverable in CloudWatch and max GPU < threshold); MEDIUM (metric not discoverable — CPU proxy only, max daily CPU < threshold) / CRITICAL (`idle_ratio >= 2.0`); HIGH (otherwise)
 
 **Permissions:** `ec2:DescribeInstances`, `cloudwatch:GetMetricStatistics`, `cloudwatch:ListMetrics`
 
 **Params:** `idle_days` (default: 7), `gpu_threshold` (default: 5.0%), `cpu_threshold` (default: 10.0%)
 
-**Exclusions:** non-GPU instance families; instances younger than threshold
+**Exclusions:** non-GPU instance families; missing, naive, or future `LaunchTime`; instances younger than threshold; no CloudWatch datapoints on chosen metric path
 
-**Spec:** —
+**Spec:** [specs/aws/ai/ec2_gpu_idle.md](../specs/aws/ai/ec2_gpu_idle.md)
 
 #### `aws.bedrock.provisioned_throughput.idle`
 **Detects:** Bedrock Provisioned Throughput (Model Units) with zero invocations for `idle_days`; bills per MU per hour regardless of traffic
diff --git a/docs/specs/aws/ai/ec2_gpu_idle.md b/docs/specs/aws/ai/ec2_gpu_idle.md
new file mode 100644
index 0000000..a9877a5
--- /dev/null
+++ b/docs/specs/aws/ai/ec2_gpu_idle.md
@@ -0,0 +1,433 @@
+# aws.ec2.gpu.idle -- Canonical Rule Specification
+
+## 1. Intent
+
+Detect raw Amazon EC2 accelerated-computing instances that are still `running`, old enough to
+evaluate, and show persistently low accelerator-related utilization over the configured evaluation
+window.
+
+This is a **CleanCloud-derived review-candidate heuristic**, not an AWS-native idle-state signal.
+It is intentionally conservative and read-only. It is **not** proof that an instance is safe to
+stop, **not** proof that no scheduled or intermittent workload exists, and **not** proof that no
+future use is planned.
+
+Two signal tiers are used:
+
+1. **HIGH confidence** when an NVIDIA CloudWatch agent GPU metric is available and stays below
+   threshold.
+2. **MEDIUM confidence** CPU fallback when no qualifying per-instance NVIDIA GPU metric is
+   available.
+
+---
+
+## 2. AWS API Grounding
+
+Based on current AWS EC2 and CloudWatch documentation.
+
+### Key facts
+
+1. `DescribeInstances` supports filtering by `instance-state-name`.
+2. EC2 instance state names include `running`, `pending`, `stopping`, `stopped`, and others.
+3. `DescribeInstances` returns core fields needed by this rule, including `InstanceId`,
+   `InstanceType`, `LaunchTime`, `Tags`, `State`, and `InstanceLifecycle`.
+4. The EC2 CloudWatch metrics reference documents `CPUUtilization` in namespace `AWS/EC2`,
+   with meaningful statistics including `Average`, `Minimum`, and `Maximum`.
+5. AWS documents `GPUPowerUtilization` in namespace `AWS/EC2` for only a subset of accelerated
+   instance types.
+6. `ListMetrics` lists metrics by namespace, metric name, and dimensions, returns up to 500
+   results per call, and does not return metrics that have not published data in the past two
+   weeks.
+7. `GetMetricStatistics` retrieves CloudWatch datapoints for a metric and requires exact
+   dimensions that match the published metric.
+8. `GetMetricStatistics` retains 1-hour metric resolution for 455 days, and when the start time
+   is greater than 63 days ago, 3600-second periods are valid.
+9. AWS documents NVIDIA GPU metrics collected by the CloudWatch agent, including
+   `nvidia_smi_utilization_gpu`, which represents the percentage of time over the sample period
+   during which one or more kernels on the GPU was running.
+10. AWS documents CloudWatch agent metric dimensions for NVIDIA GPU metrics as GPU-level
+    dimensions such as `index`, `name`, and `arch`.
+11. AWS also documents CloudWatch agent `append_dimensions`, including `InstanceId`, when the
+    agent is configured to append EC2 dimensions to collected metrics.
+12. AWS accelerated-computing instance documentation includes GPU and AI-accelerator families such
+    as P, G, Trn, Inf, and DL families, plus non-AI families such as F and VT.
+13. AWS pricing docs exist, but fixed monthly USD estimates used by CleanCloud are implementation
+    context, not canonical AWS API outputs.
+
+### Implications
+
+- `DescribeInstances(Filters=[{"Name": "instance-state-name", "Values": ["running"]}])` is the
+  canonical inventory API for this rule.
+- Only `running` instances are in scope because they incur live compute charges.
+- `LaunchTime` is the canonical age gate for suppressing newly launched instances.
+- `CPUUtilization` is a documented EC2 metric and can support a fallback heuristic.
+- `GPUPowerUtilization` is excluded due to inconsistent emission and non-idle semantics across
+  instance families.
+- `nvidia_smi_utilization_gpu` is the strongest available GPU activity signal used by this rule,
+  but it is **agent-dependent**, not AWS-native by default.
+- The current implementation's high-confidence GPU path depends on the CloudWatch agent publishing
+  GPU metrics that are discoverable per instance under the exact dimension pattern the rule probes.
+  In practice, this often requires agent `append_dimensions` such as `InstanceId`, but AWS does not
+  guarantee that GPU metrics include that dimension by default.
+- Because `ListMetrics` does not return metrics with no datapoints in the past two weeks, GPU
+  metric discovery is inherently recency-limited even though `GetMetricStatistics` can query older
+  datapoints at coarser resolution. In quiet, low-noise, or intermittently reporting fleets, this
+  can systematically under-detect GPU metrics, suppress the GPU path entirely, and bias
+  classification toward CPU fallback findings rather than GPU-based findings.
+- Absence of `nvidia_smi_utilization_gpu` is **not** proof that the instance is idle; it may mean
+  the CloudWatch agent is absent, misconfigured, or not appending the dimensions the rule probes.
+- Trainium, Inferentia, and other non-NVIDIA accelerator families do not expose
+  `nvidia_smi_utilization_gpu`; those families necessarily fall back to CPU in the current rule,
+  and AWS does not document a standard EC2-native GPU-equivalent utilization metric for them that
+  this rule can use as a direct substitute.
+- In many real fleets, the CWAgent GPU metric path will be unavailable more often than available,
+  so CPU fallback may dominate in practice.
+
+---
+
+## 3. Scope and Terminology
+
+- **EC2 accelerated instance** - an EC2 instance whose type belongs to the rule's targeted
+  AI/accelerator family allowlist.
+- **Running instance** - an EC2 instance whose `State.Name` is exactly `running`.
+- **Age gate** - `LaunchTime` is at least `effective_idle_days` old.
+- **GPU metric path** - NVIDIA CloudWatch agent metric `CWAgent / nvidia_smi_utilization_gpu`
+  exists for the instance and remains below threshold over the evaluation window.
+- **CPU fallback path** - no qualifying GPU metric is available, so `AWS/EC2 / CPUUtilization`
+  is used as a weaker heuristic.
+- **effective_idle_days** - `max(idle_days, 1)`.
+- **age_days** - `floor((now_utc - launch_time_utc) / 86400 seconds)`.
+- **idle_ratio** - `age_days / effective_idle_days`.
+- **evaluation_window_end_utc** - `now_utc`
+- **evaluation_window_start_utc** - `evaluation_window_end_utc - effective_idle_days x 86400 seconds`
+- **GPU metric available** - at least one matching `nvidia_smi_utilization_gpu` metric was
+  discoverable for the instance.
+
+### 3.1 Explicit scope boundary
+
+This rule applies only to **raw EC2 accelerated-computing instances**, not managed ML resources.
+
+Out of scope:
+
+- SageMaker endpoints, notebooks, Studio apps, or training jobs
+- stopped, stopping, pending, shutting-down, or terminated EC2 instances
+- FPGA-only and video-transcode families not targeted by this rule
+- storage-only cost after instance stop
+- workload-semantic reconstruction from application logs
+
+### 3.2 Current family allowlist
+
+The current rule targets these instance-type prefixes:
+
+- `p2.`, `p3.`, `p3dn.`, `p4d.`, `p4de.`, `p5.`, `p5en.`, `p6.`
+- `g4dn.`, `g4ad.`, `g5.`, `g5g.`, `g6.`, `g6e.`, `gr6.`
+- `trn1.`, `trn1n.`, `trn2.`
+- `inf1.`, `inf2.`
+- `dl1.`, `dl2q.`
+
+Notes:
+
+- This is a **rule allowlist**, not a complete inventory of every accelerated EC2 family AWS may
+  document.
+- Non-AI accelerated families such as `f1`, `f2`, and `vt1` are intentionally out of scope.
+- Future AWS family-name additions may require allowlist updates even when they are documented in
+  the latest accelerated-computing pages.
+
+---
+
+## 4. Canonical Rule Statement
+
+An EC2 instance is eligible only when **all** of the following are true:
+
+1. `State.Name == "running"`
+2. `InstanceType` matches the rule allowlist
+3. `LaunchTime` is valid and `age_days >= effective_idle_days`
+4. one of the utilization branches below is satisfied:
+   - **GPU path:** maximum observed `nvidia_smi_utilization_gpu` over the evaluation window is below
+      `gpu_threshold`
+   - **CPU fallback path:** no qualifying GPU metric is available and maximum observed daily
+     `CPUUtilization` over the evaluation window is below `cpu_threshold`
+
+No finding may be emitted from age alone, from instance family alone, or from missing GPU metrics
+alone.
+
+---
+
+## 5. Normalization Contract
+
+All logic must operate on normalized fields only.
+
+| Canonical field | Source field | Absent / invalid |
+|---|---|---|
+| `resource_id` | `InstanceId` | skip item |
+| `instance_id` | `InstanceId` | skip item |
+| `instance_type` | `InstanceType` | skip item |
+| `normalized_state` | `State.Name` | skip item |
+| `launch_time_utc` | `LaunchTime` (tz-aware UTC) | skip item |
+| `age_days` | floor((now - launch_time_utc) / 86400) | skip item |
+| `name_tag` | tag `Name` | fall back to `instance_id` |
+| `purchasing_model` | `InstanceLifecycle` | `"on-demand"` when absent |
+| `tags` | `Tags` | `{}` |
+
+Normalization requirements:
+
+- `InstanceId` and `InstanceType` must be non-empty strings.
+- `State.Name` must be a non-empty string.
+- `LaunchTime` must be timezone-aware UTC before use.
+- future `LaunchTime` must skip the item.
+- `effective_idle_days = max(idle_days, 1)`.
+
+---
+
+## 6. Signal Contract
+
+This rule uses a two-tier utilization model.
+
+### 6.1 HIGH-confidence GPU path
+
+The preferred signal is:
+
+- namespace: `CWAgent`
+- metric: `nvidia_smi_utilization_gpu`
+
+Signal interpretation:
+
+- AWS documents this metric as the percentage of time over the sample period during which one or
+  more GPU kernels were running.
+- The rule first probes metric presence with `ListMetrics`.
+- When GPU metrics are present, the rule queries `GetMetricStatistics` using `Statistics=["Maximum"]`
+  and `Period=3600`.
+- For multi-GPU instances, the rule takes the **maximum** observed utilization across all returned
+  GPU-index metrics in the evaluation window.
+- If the GPU metric has not published in the last two weeks, `ListMetrics` may not return it even
+  if older datapoints still exist in CloudWatch retention; in that case the rule falls back to CPU.
+- The current rule does **not** require continuous evaluation-window GPU datapoint coverage. Any returned
+  datapoints on the chosen GPU metric path are treated as sufficient for evaluation; complete
+  absence of datapoints causes a safe-default skip instead.
+- This is intentionally a loose coverage rule matching current implementation behavior: sparse or
+  intermittent GPU datapoints can still drive a decision, which is weaker than requiring dense or
+  complete coverage of the evaluation window.
+- The current rule defines no minimum datapoint count or density threshold for the GPU path.
+- In the current implementation, even a single returned GPU datapoint can drive the GPU-path
+  decision.
+
+Canonical threshold:
+
+- emit only when `max_gpu_utilization_pct < gpu_threshold`
+
+Why MAX is required:
+
+- on multi-GPU hosts, averaging across GPU indices can hide a single active accelerator
+- if any GPU is materially active, the instance must not be flagged idle
+
+### 6.2 MEDIUM-confidence CPU fallback
+
+If no qualifying GPU metric is discoverable, the rule falls back to:
+
+- namespace: `AWS/EC2`
+- metric: `CPUUtilization`
+
+Signal interpretation:
+
+- The rule queries `GetMetricStatistics` with `Statistics=["Maximum"]` and `Period=86400`.
+- It then takes the **maximum** daily CPU peak across the evaluation window.
+- The current rule does **not** require continuous evaluation-window CPU datapoint coverage. Any returned
+  datapoints on the chosen CPU path are treated as sufficient for evaluation; complete absence of
+  datapoints causes a safe-default skip instead.
+- This is intentionally a loose coverage rule matching current implementation behavior: sparse CPU
+  datapoints can still drive a decision, which increases false-positive risk relative to a denser
+  or complete-coverage requirement.
+- The current rule defines no minimum datapoint count or density threshold for the CPU path.
+- In the current implementation, even a very small number of CPU datapoints can drive the CPU-path
+  decision.
+
+Canonical threshold:
+
+- emit only when `max_daily_cpu_utilization_pct < cpu_threshold`
+
+Why this is weaker:
+
+- CPU is not a direct accelerator activity metric
+- accelerator-heavy workloads can perform real work with low CPU
+- absence of the CWAgent GPU metric is not proof of GPU inactivity
+- the CPU fallback path therefore has materially higher false-positive risk than the GPU path
+
+Therefore CPU fallback is **MEDIUM confidence only**.
+
+### 6.3 Neuron and non-NVIDIA accelerators
+
+For `trn*`, `inf*`, `dl1`, and `dl2q` families:
+
+- `nvidia_smi_utilization_gpu` is not expected to exist
+- CPU fallback is the only supported signal in the current rule
+- no EC2-native CloudWatch GPU-equivalent metric exists that this rule can query as a drop-in
+  replacement across these families
+- finding text must make this limitation explicit
+
+### 6.4 Non-canonical or out-of-contract signals
+
+The following are not used for eligibility:
+
+- `GPUPowerUtilization`
+- EC2 network, disk, or status-check metrics
+- instance age alone
+- purchasing model (`spot`, `scheduled`, `capacity-block`, `on-demand`)
+- tags
+- CloudWatch Logs or application logs
+
+---
+
+## 7. Confidence, Risk, and Cost
+
+### 7.1 Confidence
+
+- **HIGH** when GPU metric path is used and threshold is met
+- **MEDIUM** when CPU fallback path is used and threshold is met
+
+### 7.2 Risk
+
+- `idle_ratio = age_days / effective_idle_days`
+- **CRITICAL** when `idle_ratio >= 2.0`
+- **HIGH** otherwise
+
+### 7.3 Cost
+
+- `estimated_monthly_cost_usd` is advisory context only
+- CleanCloud may use an implementation-defined us-east-1 on-demand monthly cost table
+- if the instance type is unknown to that table, a default fallback estimate may be used
+- cost must not affect eligibility, confidence, or risk gating
+
+---
+
+## 8. Deterministic Evaluation Order
+
+1. Set `effective_idle_days = max(idle_days, 1)`.
+2. Call `DescribeInstances` and fully paginate only `running` instances.
+3. For each returned instance:
+   - state missing or not `running` -> **SKIP ITEM**
+   - `InstanceType` missing or not in allowlist -> **SKIP ITEM**
+   - invalid / naive / future `LaunchTime` -> **SKIP ITEM**
+   - `age_days < effective_idle_days` -> **SKIP ITEM**
+4. Probe GPU metrics with `ListMetrics` for `CWAgent / nvidia_smi_utilization_gpu`.
+5. If GPU metrics exist:
+   - fetch 1-hour `Maximum` datapoints per metric within `[evaluation_window_start_utc, evaluation_window_end_utc]`
+   - if no datapoints or metric retrieval fails -> **SKIP ITEM**
+   - compute global max across all GPU metrics
+   - if `global_max_gpu >= gpu_threshold` -> **SKIP ITEM**
+   - otherwise -> **EMIT HIGH-confidence finding**
+6. If no GPU metrics exist:
+   - fetch daily `Maximum` datapoints for `AWS/EC2 / CPUUtilization` within `[evaluation_window_start_utc, evaluation_window_end_utc]`
+   - if no datapoints or metric retrieval fails -> **SKIP ITEM**
+   - compute max daily CPU peak
+   - if `max_daily_cpu >= cpu_threshold` -> **SKIP ITEM**
+   - otherwise -> **EMIT MEDIUM-confidence finding**
+
+No raw field access after normalization.
+
+---
+
+## 9. Exclusion Rules
+
+1. non-`running` instance state
+2. instance type not in the rule allowlist
+3. missing `InstanceId`
+4. missing `InstanceType`
+5. missing, naive, or future `LaunchTime`
+6. `age_days < effective_idle_days`
+7. GPU metric present but any observed GPU maximum meets or exceeds threshold
+8. GPU metric absent and CPU daily peak meets or exceeds threshold
+9. missing datapoints on the chosen metric path
+
+---
+
+## 10. Failure Model
+
+**Rule-level failures (FAIL RULE):**
+
+- `DescribeInstances` permission failure (`UnauthorizedOperation`, `AccessDenied`)
+
+**Item-level safe-default skips (SKIP ITEM):**
+
+- malformed identity or timestamps
+- unsupported instance family
+- too young to classify
+- CloudWatch metric retrieval returns no datapoints
+- CloudWatch metric retrieval errors on the chosen path
+
+Design principle:
+
+- metric uncertainty must resolve to **not idle**, never to a finding
+- CloudWatch errors are treated as safe defaults, not as idle evidence
+
+---
+
+## 11. Output Contract
+
+Each emitted finding must preserve the standard `Finding` contract and include, at minimum:
+
+- `provider = "aws"`
+- `rule_id = "aws.ec2.gpu.idle"`
+- `resource_type = "aws.ec2.instance"`
+- `resource_id = InstanceId`
+- `region = evaluated region`
+- `risk`
+- `confidence`
+- `detected_at`
+- `estimated_monthly_cost_usd`
+- `evidence`
+- `details`
+
+### 11.1 Evidence contract
+
+`evidence.signals_used` should include:
+
+- instance running state
+- instance type
+- purchasing model
+- the utilization signal used for eligibility
+- age when available
+
+`evidence.signals_not_checked` should include:
+
+- direct accelerator utilization when CPU fallback is used
+- scheduled workloads outside the observation window
+- planned future use
+
+Additional context should note:
+
+- CPU fallback is heuristic only
+- CPU fallback has higher false-positive risk than GPU-based detection
+- Neuron families require different telemetry for true accelerator utilization
+- missing CWAgent GPU metrics do not prove idleness
+
+### 11.2 Details contract
+
+`details` should include:
+
+- `instance_id`
+- `instance_type`
+- `name`
+- `age_days`
+- `idle_days_threshold`
+- `idle_ratio`
+- `idle_signal`
+- `utilization_pct`
+- `purchasing_model`
+- `gpu_metric_available`
+- `gpu_threshold_pct`
+- `cpu_threshold_pct`
+- `estimated_monthly_cost`
+- `cost_basis`
+- `tags`
+
+---
+
+## 12. Summary of Intended Semantics
+
+This rule detects **running raw EC2 accelerated instances** that are old enough to evaluate and
+appear idle under one of two heuristics:
+
+1. **HIGH confidence** - CloudWatch agent GPU metric proves very low GPU execution activity
+2. **MEDIUM confidence** - GPU metric unavailable, and CPU peak remains low over the evaluation window
+
+It is intentionally cost-focused and review-oriented. It does **not** prove that an accelerator
+workload is absent; it identifies likely expensive idle compute for human review.
diff --git a/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py b/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py
index 716dd29..b67cb64 100644
--- a/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py
+++ b/tests/cleancloud/providers/aws/ai/test_aws_ec2_gpu_idle.py
@@ -8,12 +8,19 @@
 - Instance state filter: only running instances
 - GPU family filter: non-GPU instances skipped
 - Age guard: instances younger than idle_days skipped
+- Normalization: missing InstanceId → skip (spec section 5)
+- Normalization: missing LaunchTime → skip (spec section 5)
+- Normalization: naive (tz-unaware) LaunchTime → skip, not fixed up (spec section 5)
+- Normalization: future LaunchTime → skip (spec section 5)
 - Utilisation thresholds: configurable gpu_threshold and cpu_threshold
 - Risk levels: CRITICAL (idle_ratio >= 2.0), HIGH otherwise
 - Active GPU instance not flagged
 - Active CPU instance not flagged (fallback path)
 - CloudWatch errors treated as active (safe default)
 - Permission errors: PermissionError raised on AccessDenied from describe_instances
+- _list_gpu_metrics paginates via NextToken (spec section 2 key fact 6)
+- signals_not_checked: GPU note only in CPU fallback path (spec section 11.1)
+- details: utilization_pct uses US spelling (spec section 11.2)
 - RULE_METADATA and RULE_ID attributes
 """
 
@@ -67,7 +74,7 @@ def _make_session(instances, gpu_util=None, cpu_util=None, has_gpu_metric=False)
     Build a mock boto3 session.
 
     gpu_util: if not None, NVIDIA CloudWatch agent metrics are available with this max value.
-    cpu_util: avg CPU utilisation returned from AWS/EC2 namespace.
+    cpu_util: maximum daily CPU utilization value returned from AWS/EC2 namespace.
     has_gpu_metric: whether ListMetrics returns NVIDIA metrics.
     """
     session = MagicMock()
@@ -99,7 +106,7 @@ def _make_session(instances, gpu_util=None, cpu_util=None, has_gpu_metric=False)
     else:
         cw.list_metrics.return_value = {"Metrics": []}
 
-    # GetMetricStatistics for GPU utilisation
+    # GetMetricStatistics for GPU/CPU utilization
     if gpu_util is not None:
         cw.get_metric_statistics.return_value = {
             "Datapoints": [{"Maximum": gpu_util, "Timestamp": NOW}]
@@ -223,7 +230,7 @@ def test_g5_is_not_neuron(self):
 
 class TestFindIdleGpuInstances:
     def test_idle_gpu_instance_flagged_high_confidence(self):
-        """GPU metric available, utilisation below threshold → HIGH confidence."""
+        """GPU metric available, utilization below threshold → HIGH confidence."""
         # age=7, idle_days=7 → idle_ratio=1.0 < 2.0 → HIGH risk (not CRITICAL)
         findings = _run([_make_instance(age_days=7)], gpu_util=2.0, has_gpu_metric=True)
         assert len(findings) == 1
@@ -242,12 +249,12 @@ def test_idle_gpu_instance_cpu_fallback_medium_confidence(self):
         assert findings[0].confidence == ConfidenceLevel.MEDIUM
 
     def test_active_gpu_instance_not_flagged(self):
-        """GPU utilisation above threshold → not flagged."""
+        """GPU utilization above threshold → not flagged."""
         findings = _run([_make_instance()], gpu_util=50.0, has_gpu_metric=True)
         assert findings == []
 
     def test_active_cpu_fallback_not_flagged(self):
-        """CPU utilisation above threshold in fallback path → not flagged."""
+        """CPU utilization above threshold in fallback path → not flagged."""
         findings = _run([_make_instance()], cpu_util=25.0, has_gpu_metric=False)
         assert findings == []
 
@@ -388,11 +395,11 @@ def test_permission_error_on_describe_instances(self):
 
     def test_details_include_idle_signal_source(self):
         findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
-        assert findings[0].details["idle_signal"] == "gpu_utilisation"
+        assert findings[0].details["idle_signal"] == "gpu_utilization"
 
     def test_details_include_cpu_fallback_signal_source(self):
         findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False)
-        assert findings[0].details["idle_signal"] == "cpu_utilisation_fallback"
+        assert findings[0].details["idle_signal"] == "cpu_utilization_fallback"
 
     def test_details_include_gpu_metric_available_flag(self):
         findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
@@ -424,3 +431,221 @@ def _client(service, **kwargs):
             mock_dt.fromisoformat = datetime.fromisoformat
             findings = find_idle_gpu_instances(session, _REGION)
         assert len(findings) == 2
+
+
+# ---------------------------------------------------------------------------
+# Normalization: skip rules (spec section 5)
+# ---------------------------------------------------------------------------
+
+
+class TestNormalizationSkips:
+    """
+    Instances with missing/invalid identity or timestamp fields must be skipped
+    without raising an exception (spec section 5, section 8.3).
+    """
+
+    def test_missing_instance_id_is_skipped(self):
+        """InstanceId absent → skip item, not KeyError."""
+        inst = _make_instance()
+        del inst["InstanceId"]
+        findings = _run([inst], cpu_util=1.0, has_gpu_metric=False)
+        assert findings == []
+
+    def test_empty_instance_id_is_skipped(self):
+        inst = _make_instance()
+        inst["InstanceId"] = ""
+        findings = _run([inst], cpu_util=1.0, has_gpu_metric=False)
+        assert findings == []
+
+    def test_missing_launch_time_is_skipped(self):
+        """Missing LaunchTime → skip item (spec section 5)."""
+        inst = _make_instance()
+        del inst["LaunchTime"]
+        findings = _run([inst], cpu_util=1.0, has_gpu_metric=False)
+        assert findings == []
+
+    def test_naive_launch_time_is_skipped(self):
+        """
+        Naive (tz-unaware) LaunchTime must be skipped, not coerced to UTC.
+        Spec section 5: 'LaunchTime must be timezone-aware UTC before use.'
+        """
+        inst = _make_instance()
+        inst["LaunchTime"] = NOW.replace(tzinfo=None) - timedelta(days=30)
+        findings = _run([inst], cpu_util=1.0, has_gpu_metric=False)
+        assert findings == []
+
+    def test_future_launch_time_is_skipped(self):
+        """Future LaunchTime → skip item (spec section 5)."""
+        inst = _make_instance()
+        inst["LaunchTime"] = NOW + timedelta(days=1)
+        findings = _run([inst], cpu_util=1.0, has_gpu_metric=False)
+        assert findings == []
+
+    def test_valid_instance_alongside_invalid_is_still_flagged(self):
+        """A bad record next to a good one must not suppress the valid finding."""
+        bad = _make_instance(instance_id="i-bad")
+        del bad["LaunchTime"]
+        good = _make_instance(instance_id="i-good", age_days=30)
+        findings = _run([bad, good], cpu_util=1.0, has_gpu_metric=False)
+        assert len(findings) == 1
+        assert findings[0].resource_id == "i-good"
+
+
+# ---------------------------------------------------------------------------
+# _list_gpu_metrics — pagination (spec section 2 key fact 6)
+# ---------------------------------------------------------------------------
+
+
+class TestListGpuMetricsPagination:
+    def test_follows_next_token(self):
+        """NextToken on first page must be followed to collect all metrics."""
+        cw = MagicMock()
+        metric1 = {
+            "MetricName": "nvidia_smi_utilization_gpu",
+            "Dimensions": [{"Name": "gpu_id", "Value": "0"}],
+        }
+        metric2 = {
+            "MetricName": "nvidia_smi_utilization_gpu",
+            "Dimensions": [{"Name": "gpu_id", "Value": "1"}],
+        }
+        cw.list_metrics.side_effect = [
+            {"Metrics": [metric1], "NextToken": "tok1"},
+            {"Metrics": [metric2]},
+        ]
+        result = _list_gpu_metrics(cw, "i-abc")
+        assert result == [metric1, metric2]
+        assert cw.list_metrics.call_count == 2
+
+    def test_second_call_includes_next_token(self):
+        cw = MagicMock()
+        cw.list_metrics.side_effect = [
+            {"Metrics": [], "NextToken": "tok-xyz"},
+            {"Metrics": []},
+        ]
+        _list_gpu_metrics(cw, "i-abc")
+        second_call_kwargs = cw.list_metrics.call_args_list[1][1]
+        assert second_call_kwargs.get("NextToken") == "tok-xyz"
+
+    def test_three_pages_accumulated(self):
+        cw = MagicMock()
+
+        def make_metric(idx):
+            return {
+                "MetricName": "nvidia_smi_utilization_gpu",
+                "Dimensions": [{"Name": "gpu_id", "Value": str(idx)}],
+            }
+
+        cw.list_metrics.side_effect = [
+            {"Metrics": [make_metric(0)], "NextToken": "t1"},
+            {"Metrics": [make_metric(1)], "NextToken": "t2"},
+            {"Metrics": [make_metric(2)]},
+        ]
+        result = _list_gpu_metrics(cw, "i-abc")
+        assert len(result) == 3
+
+    def test_error_on_second_page_returns_empty(self):
+        """Any exception during pagination returns [] (safe default)."""
+        cw = MagicMock()
+        cw.list_metrics.side_effect = [
+            {"Metrics": [], "NextToken": "tok1"},
+            ClientError({"Error": {"Code": "Throttling", "Message": ""}}, "ListMetrics"),
+        ]
+        assert _list_gpu_metrics(cw, "i-abc") == []
+
+
+# ---------------------------------------------------------------------------
+# signals_not_checked: GPU note only in CPU fallback path (spec section 11.1)
+# ---------------------------------------------------------------------------
+
+
+class TestSignalsNotChecked:
+    def test_gpu_path_does_not_include_gpu_note_in_not_checked(self):
+        """
+        When GPU metric is used, 'direct accelerator utilization' must NOT appear
+        in signals_not_checked — the GPU was checked and was the basis for the finding.
+        """
+        findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
+        not_checked = findings[0].evidence.signals_not_checked
+        assert not any("accelerator" in s.lower() for s in not_checked)
+
+    def test_cpu_path_includes_gpu_note_in_not_checked(self):
+        """
+        When CPU fallback is used, 'direct accelerator utilization' MUST appear
+        in signals_not_checked (spec section 11.1).
+        """
+        findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False)
+        not_checked = findings[0].evidence.signals_not_checked
+        assert any("accelerator" in s.lower() for s in not_checked)
+
+    def test_scheduled_workloads_in_not_checked_for_gpu_path(self):
+        findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
+        not_checked = findings[0].evidence.signals_not_checked
+        assert any("scheduled" in s.lower() or "batch" in s.lower() for s in not_checked)
+
+    def test_planned_future_use_in_not_checked_for_gpu_path(self):
+        findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
+        not_checked = findings[0].evidence.signals_not_checked
+        assert any("planned future" in s.lower() or "future use" in s.lower() for s in not_checked)
+
+
+# ---------------------------------------------------------------------------
+# Details contract (spec section 11.2)
+# ---------------------------------------------------------------------------
+
+
+class TestDetailsContract:
+    def test_utilization_pct_us_spelling_gpu_path(self):
+        """Details field must use 'utilization_pct' (US spelling, spec section 11.2)."""
+        findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
+        assert "utilization_pct" in findings[0].details
+        assert "utilisation_pct" not in findings[0].details
+
+    def test_utilization_pct_us_spelling_cpu_path(self):
+        findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False)
+        assert "utilization_pct" in findings[0].details
+        assert "utilisation_pct" not in findings[0].details
+
+    def test_age_days_is_integer_not_unknown(self):
+        """age_days must be an integer — 'unknown' fallback is removed since LaunchTime is required."""
+        findings = _run([_make_instance(age_days=30)], cpu_util=5.0, has_gpu_metric=False)
+        assert isinstance(findings[0].details["age_days"], int)
+        assert findings[0].details["age_days"] == 30
+
+
+# ---------------------------------------------------------------------------
+# Reason string — per-path framing (spec section 11.1, drift fix)
+# ---------------------------------------------------------------------------
+
+
+class TestReasonString:
+    """
+    GPU path: factual assertion — GPU utilization was directly measured.
+    CPU path: softened proxy framing — GPU activity was NOT directly measured.
+    """
+
+    def test_gpu_path_reason_asserts_gpu_utilization(self):
+        findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
+        assert "GPU utilization" in findings[0].reason
+
+    def test_gpu_path_reason_does_not_say_proxy(self):
+        findings = _run([_make_instance()], gpu_util=1.0, has_gpu_metric=True)
+        assert "proxy" not in findings[0].reason.lower()
+
+    def test_cpu_path_reason_says_proxy_signal(self):
+        """CPU fallback reason must use proxy framing, not assert GPU facts."""
+        findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False)
+        assert "proxy" in findings[0].reason.lower()
+
+    def test_cpu_path_reason_says_not_directly_measured(self):
+        findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False)
+        assert "not directly measured" in findings[0].reason
+
+    def test_cpu_path_reason_includes_utilization_value(self):
+        findings = _run([_make_instance()], cpu_util=3.7, has_gpu_metric=False)
+        assert "3.7" in findings[0].reason
+
+    def test_not_checked_cpu_path_mentions_absence_does_not_confirm_idle(self):
+        """The not_checked entry for CPU path must carry the absence ≠ idle clarification."""
+        findings = _run([_make_instance()], cpu_util=5.0, has_gpu_metric=False)
+        not_checked = " ".join(findings[0].evidence.signals_not_checked)
+        assert "does not confirm" in not_checked or "absence" in not_checked
diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
index e453b18..3ca9d70 100644
--- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
+++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
@@ -77,9 +77,7 @@ def _run(instances, unreachable=None, discovery_failed=False, **kwargs):
         "cleancloud.providers.gcp.rules.ai.workbench_idle._list_instances",
         return_value=(instances, unreachable or [], discovery_failed),
     ):
-        return find_idle_workbench_instances(
-            project_id=_PROJECT, credentials=MagicMock(), **kwargs
-        )
+        return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs)
 
 
 def _run_with_warnings(instances, unreachable=None, discovery_failed=False, **kwargs):
@@ -99,9 +97,7 @@ def _invoke_http(mock_session, **kwargs):
         "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
         return_value=mock_session,
     ):
-        return find_idle_workbench_instances(
-            project_id=_PROJECT, credentials=MagicMock(), **kwargs
-        )
+        return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs)
 
 
 def _invoke_http_with_warnings(mock_session, **kwargs):
@@ -128,8 +124,7 @@ def test_always_empty_with_active_instances(self):
 
     def test_always_empty_with_multiple_active(self):
         instances = [
-            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
-            for i in range(5)
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(5)
         ]
         assert _run(instances) == []
 
@@ -145,9 +140,7 @@ def test_always_empty_with_stopped_instances(self):
 class TestIdleDaysValidation:
     def test_zero_raises(self):
         with pytest.raises(ValueError, match="idle_days must be >= 1"):
-            find_idle_workbench_instances(
-                project_id=_PROJECT, credentials=MagicMock(), idle_days=0
-            )
+            find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), idle_days=0)
 
     def test_negative_raises(self):
         with pytest.raises(ValueError, match="idle_days must be >= 1"):
@@ -203,8 +196,7 @@ def test_warning_mentions_resource_name(self):
 
     def test_warning_mentions_count(self):
         instances = [
-            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
-            for i in range(3)
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(3)
         ]
         _, warns = _run_with_warnings(instances)
         assert "3" in str(warns[0].message)
@@ -219,8 +211,7 @@ def test_warning_mentions_all_resource_names_when_under_cap(self):
 
     def test_warning_caps_name_list_at_five(self):
         instances = [
-            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
-            for i in range(8)
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(8)
         ]
         _, warns = _run_with_warnings(instances)
         msg = str(warns[0].message)
@@ -228,8 +219,7 @@ def test_warning_caps_name_list_at_five(self):
 
     def test_warning_no_overflow_label_when_at_cap(self):
         instances = [
-            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}")
-            for i in range(5)
+            _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-{i}") for i in range(5)
         ]
         _, warns = _run_with_warnings(instances)
         assert "more" not in str(warns[0].message)
@@ -278,32 +268,58 @@ def test_no_partial_warning_when_no_unreachable(self):
 
 
 # ---------------------------------------------------------------------------
-# PARTIAL scope warning — discovery_failed
+# Discovery-incomplete warning — discovery_failed (NOT PARTIAL per spec)
 # ---------------------------------------------------------------------------
 
 
-class TestPartialScopeDiscoveryFailed:
+class TestDiscoveryFailedWarning:
+    """
+    discovery_failed (400/5xx/network) emits a 'discovery incomplete' warning.
+    It is NOT labelled PARTIAL — PARTIAL is reserved for unreachable[] per spec
+    (sections 12.1 and 12.2.7).
+    """
+
     def test_discovery_failed_emits_warning(self):
         _, warns = _run_with_warnings([], discovery_failed=True)
         assert len(warns) == 1
 
-    def test_warning_mentions_partial(self):
+    def test_warning_does_not_mention_partial(self):
+        """discovery_failed must NOT say PARTIAL — that label is only for unreachable[]."""
         _, warns = _run_with_warnings([], discovery_failed=True)
-        assert "PARTIAL" in str(warns[0].message)
+        assert "PARTIAL" not in str(warns[0].message)
+
+    def test_warning_mentions_incomplete(self):
+        _, warns = _run_with_warnings([], discovery_failed=True)
+        assert "incomplete" in str(warns[0].message).lower()
 
     def test_warning_mentions_project(self):
         _, warns = _run_with_warnings([], discovery_failed=True)
         assert _PROJECT in str(warns[0].message)
 
-    def test_both_partial_and_not_evaluable_can_warn(self):
-        """ACTIVE instances + unreachable locations = two warnings."""
+    def test_unreachable_and_not_evaluable_each_emit_separate_warning(self):
+        """ACTIVE instances + unreachable locations = two warnings (PARTIAL + NOT_EVALUABLE)."""
         _, warns = _run_with_warnings([_active()], unreachable=["asia-east1"])
         assert len(warns) == 2
 
     def test_discovery_failed_and_active_instances_both_warn(self):
+        """discovery_failed + ACTIVE instances = two separate warnings."""
         _, warns = _run_with_warnings([_active()], discovery_failed=True)
         assert len(warns) == 2
 
+    def test_all_three_conditions_emit_three_warnings(self):
+        """unreachable + discovery_failed + ACTIVE = three separate warnings."""
+        _, warns = _run_with_warnings(
+            [_active()], unreachable=["asia-east1"], discovery_failed=True
+        )
+        assert len(warns) == 3
+
+    def test_unreachable_warning_still_says_partial(self):
+        """unreachable[] warning must still say PARTIAL even when discovery_failed is also True."""
+        _, warns = _run_with_warnings([], unreachable=["asia-east1"], discovery_failed=True)
+        partial_warns = [w for w in warns if "PARTIAL" in str(w.message)]
+        assert len(partial_warns) == 1
+        assert "asia-east1" in str(partial_warns[0].message)
+
 
 # ---------------------------------------------------------------------------
 # INVALID resource classification
@@ -384,15 +400,15 @@ def test_suspended_excluded_silently(self):
         assert len(warns) == 0
 
     def test_provisioning_excluded_silently(self):
-        _, warns = _run_with_warnings(
-            [{"name": _INSTANCE_NAME, "state": "PROVISIONING"}]
-        )
+        _, warns = _run_with_warnings([{"name": _INSTANCE_NAME, "state": "PROVISIONING"}])
         assert len(warns) == 0
 
     def test_mixed_active_and_stopped_only_active_warned(self):
         active = _active(f"projects/{_PROJECT}/locations/{_LOCATION}/instances/active")
-        stopped = {"name": f"projects/{_PROJECT}/locations/{_LOCATION}/instances/stopped",
-                   "state": "STOPPED"}
+        stopped = {
+            "name": f"projects/{_PROJECT}/locations/{_LOCATION}/instances/stopped",
+            "state": "STOPPED",
+        }
         _, warns = _run_with_warnings([active, stopped])
         assert len(warns) == 1
         msg = str(warns[0].message)
@@ -443,23 +459,36 @@ def test_403_mentions_role(self):
         with pytest.raises(PermissionError, match="roles/notebooks.viewer"):
             _invoke_http(_session(_err(403)))
 
-    def test_404_returns_empty_no_partial_warning(self):
+    def test_404_returns_empty_with_assumption_warning(self):
+        """404 emits a warning surfacing the 'no instances assumed' assumption."""
         result, warns = _invoke_http_with_warnings(_session(_err(404)))
         assert result == []
-        assert len(warns) == 0
+        assert len(warns) == 1
+        assert "404" in str(warns[0].message)
 
-    def test_400_returns_empty_with_partial_warning(self):
+    def test_404_warning_does_not_say_partial(self):
+        """404 is a clean-scope assumption, not a PARTIAL scan."""
+        _, warns = _invoke_http_with_warnings(_session(_err(404)))
+        assert "PARTIAL" not in str(warns[0].message)
+
+    def test_400_returns_empty_with_incomplete_warning(self):
         result, warns = _invoke_http_with_warnings(_session(_err(400)))
         assert result == []
-        # _list_instances warns about 400, caller warns about PARTIAL
         msgs = " ".join(str(w.message) for w in warns)
-        assert "400" in msgs or "PARTIAL" in msgs
+        assert "400" in msgs or "incomplete" in msgs.lower()
+
+    def test_400_warning_does_not_say_partial(self):
+        """discovery_failed (400) must NOT say PARTIAL — that label is only for unreachable[]."""
+        _, warns = _invoke_http_with_warnings(_session(_err(400)))
+        # _list_instances warns about 400; find_ warns about discovery incomplete
+        partial_warns = [w for w in warns if "PARTIAL" in str(w.message)]
+        assert len(partial_warns) == 0
 
     def test_500_returns_empty_with_warning(self):
         result, warns = _invoke_http_with_warnings(_session(_err(500)))
         assert result == []
         msgs = " ".join(str(w.message) for w in warns)
-        assert "500" in msgs or "PARTIAL" in msgs
+        assert "500" in msgs or "incomplete" in msgs.lower()
 
     def test_503_returns_empty_with_warning(self):
         result, warns = _invoke_http_with_warnings(_session(_err(503)))
@@ -512,8 +541,8 @@ def test_url_uses_v2_api(self):
 
 class TestListInstancesPagination:
     def test_two_pages_accumulates_instances(self):
-        inst1 = {"name": f"projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
-        inst2 = {"name": f"projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"}
+        inst1 = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+        inst2 = {"name": "projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"}
         session = _session(
             _ok({"instances": [inst1], "nextPageToken": "tok1"}),
             _ok({"instances": [inst2]}),
@@ -523,9 +552,20 @@ def test_two_pages_accumulates_instances(self):
 
     def test_three_pages_all_accumulated(self):
         pages = [
-            _ok({"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}],
-                 "nextPageToken": f"t{i}"} if i < 2 else
-                {"instances": [{"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}]})
+            _ok(
+                {
+                    "instances": [
+                        {"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}
+                    ],
+                    "nextPageToken": f"t{i}",
+                }
+                if i < 2
+                else {
+                    "instances": [
+                        {"name": f"projects/p/locations/l/instances/i{i}", "state": "ACTIVE"}
+                    ]
+                }
+            )
             for i in range(3)
         ]
         instances, _, _ = _list_instances(_session(*pages), _PROJECT)
@@ -544,7 +584,9 @@ def test_stops_when_no_next_token(self):
 
 class TestListInstancesUnreachable:
     def test_unreachable_location_collected(self):
-        _, unreachable, _ = _list_instances(_session(_ok({"unreachable": ["asia-east1"]})), _PROJECT)
+        _, unreachable, _ = _list_instances(
+            _session(_ok({"unreachable": ["asia-east1"]})), _PROJECT
+        )
         assert "asia-east1" in unreachable
 
     def test_unreachable_deduplicated_across_pages(self):
@@ -568,9 +610,18 @@ def test_403_raises_permission_error(self):
             _list_instances(_session(_err(403)), _PROJECT)
 
     def test_404_returns_clean_empty(self):
-        instances, unreachable, failed = _list_instances(_session(_err(404)), _PROJECT)
+        with warnings.catch_warnings(record=True):
+            warnings.simplefilter("always")
+            instances, unreachable, failed = _list_instances(_session(_err(404)), _PROJECT)
         assert instances == [] and unreachable == [] and failed is False
 
+    def test_404_emits_assumption_warning(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _list_instances(_session(_err(404)), _PROJECT)
+        msgs = [str(w.message) for w in caught if issubclass(w.category, UserWarning)]
+        assert any("404" in m for m in msgs)
+
     def test_400_sets_discovery_failed(self):
         _, _, failed = _list_instances(_session(_err(400)), _PROJECT)
         assert failed is True

From 2b4346af16370cd4a9d625f661d2406bc425797d Mon Sep 17 00:00:00 2001
From: javvaji-devops <venkata.javvaji.91@gmail.com>
Date: Wed, 6 May 2026 21:45:26 +0100
Subject: [PATCH 3/3] Update pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7e2a2bf..217504b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cleancloud"
-version = "1.30.0"
+version = "1.31.0"
 description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry."
 readme = "README.md"
 requires-python = ">=3.10"