Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 94 additions & 59 deletions cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@
}
_DEFAULT_MONTHLY_COST = 600.0

# GPU utilisation thresholds
# GPU utilization thresholds
_GPU_UTIL_THRESHOLD_PCT = 5.0 # below this = idle (when GPU metric available)
_CPU_UTIL_THRESHOLD_PCT = 10.0 # below this = idle (CPU fallback)

Expand All @@ -151,7 +151,7 @@ def find_idle_gpu_instances(
cpu_threshold: float = _CPU_UTIL_THRESHOLD_PCT,
) -> List[Finding]:
"""
Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilisation.
Find EC2 GPU instances (p2/p3/p4/p5/g4/g5/g6/trn/inf/dl) with low utilization.

GPU instances (raw EC2, outside SageMaker) incur continuous charges while running
regardless of whether GPUs are being utilized. A p4d.24xlarge costs ~$23K/month
Expand All @@ -162,24 +162,28 @@ def find_idle_gpu_instances(
- Instance state is running
- Instance type is a known GPU/accelerator family
- Instance is older than idle_days (avoids flagging newly launched instances)
- GPU utilisation < gpu_threshold % over idle_days (HIGH confidence, when NVIDIA
CloudWatch agent publishes nvidia_smi_utilization_gpu under CWAgent namespace)
- OR CPU utilisation < cpu_threshold % over idle_days (MEDIUM confidence fallback,
- GPU utilization < gpu_threshold % over idle_days (HIGH confidence, when the
nvidia_smi_utilization_gpu metric is discoverable for the instance in CloudWatch)
- OR CPU utilization < cpu_threshold % over idle_days (MEDIUM confidence fallback,
used when GPU metrics are not available — CPU alone is a weaker signal)

GPU metric detection:
The NVIDIA CloudWatch agent publishes nvidia_smi_utilization_gpu under the CWAgent
namespace with an InstanceId dimension. Availability is probed via ListMetrics per
instance — not assumed. Instances without the agent fall back to CPU utilisation.
The rule probes CloudWatch ListMetrics for nvidia_smi_utilization_gpu in the CWAgent
namespace, filtered by InstanceId dimension. This depends on the CloudWatch agent
being installed and configured to append EC2 instance dimensions (e.g. via
"append_dimensions": {"InstanceId": "${aws:InstanceId}"} in the agent's metrics
section). AWS does not guarantee the InstanceId
dimension is present by default; its presence is implementation-dependent. If the
metric is absent or the agent is misconfigured, the rule falls back to CPU
utilization. Absence of the metric is NOT proof the GPU is idle.

Multi-GPU handling:
For multi-GPU instances (e.g., p4d.24xlarge has 8 A100s), the MAX statistic is
used across all GPU index dimensions. A single active GPU on an 8-GPU instance
would be averaged away using AVG, producing a misleadingly low reading.

Confidence:
- HIGH: GPU metric available AND max GPU utilisation < gpu_threshold over idle_days
- MEDIUM: GPU metric unavailable, CPU utilisation < cpu_threshold over idle_days
- HIGH: GPU metric discoverable AND max GPU utilization < gpu_threshold over idle_days
- MEDIUM: GPU metric not discoverable; CPU utilization < cpu_threshold over idle_days

IAM permissions:
- ec2:DescribeInstances
Expand Down Expand Up @@ -208,53 +212,62 @@ def find_idle_gpu_instances(
if not _is_gpu_instance(instance_type):
continue

instance_id = inst["InstanceId"]
# Normalize InstanceId — skip if missing or empty (spec section 5)
instance_id = (inst.get("InstanceId") or "").strip()
if not instance_id:
continue

tags = {t["Key"]: t["Value"] for t in inst.get("Tags", [])}
# "spot" | "scheduled" | None (on-demand)
instance_lifecycle = inst.get("InstanceLifecycle")
purchasing_model = instance_lifecycle if instance_lifecycle else "on-demand"
launch_time = inst.get("LaunchTime")

age_days: Optional[int] = None
if launch_time:
if launch_time.tzinfo is None:
launch_time = launch_time.replace(tzinfo=timezone.utc)
age_days = (now - launch_time).days

# Skip instances younger than idle_days — too new to classify
if age_days is not None and age_days < idle_days:
# Normalize LaunchTime — skip if missing, naive, or future (spec section 5, section 8.3)
launch_time = inst.get("LaunchTime")
if not launch_time:
continue # missing LaunchTime → SKIP ITEM
if launch_time.tzinfo is None:
continue # naive timestamp is not tz-aware UTC → SKIP ITEM
age_days = (now - launch_time).days
if age_days < 0:
continue # future LaunchTime → SKIP ITEM

# Skip instances younger than idle_days — too new to classify
if age_days < idle_days:
continue

# Probe for GPU metrics — single ListMetrics call reused for stats
gpu_metrics = _list_gpu_metrics(cloudwatch, instance_id)

if gpu_metrics:
max_gpu_util = _get_max_gpu_utilisation(
max_gpu_util = _get_max_gpu_utilization(
cloudwatch, gpu_metrics, idle_days, now
)
if max_gpu_util is None or max_gpu_util >= gpu_threshold:
continue
confidence = ConfidenceLevel.HIGH
idle_signal = "gpu_utilisation"
idle_signal = "gpu_utilization"
util_value = max_gpu_util
util_label = f"Max GPU utilisation: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
util_label = f"Max GPU utilization: {max_gpu_util:.1f}% (threshold: {gpu_threshold}%)"
else:
avg_cpu = _get_avg_cpu_utilisation(cloudwatch, instance_id, idle_days, now)
if avg_cpu is None or avg_cpu >= cpu_threshold:
max_cpu = _get_max_daily_cpu_utilization(
cloudwatch, instance_id, idle_days, now
)
if max_cpu is None or max_cpu >= cpu_threshold:
continue
# CPU fallback is a weak heuristic for GPU workloads:
# accelerator utilisation is invisible to CPU metrics, so a GPU
# accelerator utilization is invisible to CPU metrics, so a GPU
# instance running a compute-bound model can show near-zero CPU
# while doing real work. Confidence is capped at MEDIUM to reflect
# this limitation. Absence of the CWAgent GPU metric is NOT proof
# that the GPU is idle — the agent may simply not be installed.
# that the GPU is idle — the agent may be absent or misconfigured.
confidence = ConfidenceLevel.MEDIUM
idle_signal = "cpu_utilisation_fallback"
util_value = avg_cpu
idle_signal = "cpu_utilization_fallback"
util_value = max_cpu
util_label = (
f"Peak daily CPU utilisation: {avg_cpu:.1f}% "
f"Peak daily CPU utilization: {max_cpu:.1f}% "
f"(threshold: {cpu_threshold}%) — "
f"heuristic only; GPU/accelerator utilisation not directly measured"
f"heuristic only; GPU/accelerator utilization not directly measured"
)

monthly_cost = _MONTHLY_COST.get(instance_type, _DEFAULT_MONTHLY_COST)
Expand All @@ -272,15 +285,14 @@ def find_idle_gpu_instances(
f"Instance type: {instance_type} (GPU/accelerator family)",
f"Purchasing model: {purchasing_model}",
util_label,
f"Instance age: {age_days} days",
]
if age_days is not None:
signals.append(f"Instance age: {age_days} days")
if not gpu_metrics:
if _is_neuron_instance(instance_type):
signals.append(
"Neuron instance (Trainium/Inferentia) — NVIDIA GPU metric not "
"applicable; CPU used as heuristic fallback; confidence MEDIUM. "
"Neuron utilisation requires AWS Neuron SDK metrics."
"Neuron utilization requires AWS Neuron SDK metrics."
)
else:
signals.append(
Expand All @@ -289,11 +301,18 @@ def find_idle_gpu_instances(
"the GPU is idle. CPU used as heuristic fallback; confidence MEDIUM."
)

# signals_not_checked: GPU note only applies on CPU fallback path (spec section 11.1)
not_checked = [
"GPU/accelerator utilisation (not directly measurable without CWAgent)",
"Scheduled batch jobs that run outside the observation window",
"Planned future use",
]
if not gpu_metrics:
not_checked.insert(
0,
"Direct GPU/accelerator utilization — nvidia_smi_utilization_gpu was not "
"discoverable in CloudWatch (CWAgent may be absent or not configured with "
"InstanceId dimension); absence of the metric does not confirm the GPU is idle",
)
if purchasing_model == "spot":
not_checked.append(
"Spot interruption history — Spot instances may appear idle "
Expand All @@ -307,6 +326,17 @@ def find_idle_gpu_instances(
)

metric_label = "GPU" if gpu_metrics else "CPU (fallback)"
if gpu_metrics:
reason = (
f"GPU EC2 instance has low GPU utilization "
f"({util_value:.1f}%) over {idle_days} days"
)
else:
reason = (
f"GPU EC2 instance shows low CPU proxy signal "
f"({util_value:.1f}%) over {idle_days} days — "
f"GPU activity not directly measured"
)
findings.append(
Finding(
provider="aws",
Expand All @@ -316,22 +346,18 @@ def find_idle_gpu_instances(
region=region,
estimated_monthly_cost_usd=monthly_cost,
title=(
f"Idle GPU EC2 Instance ({metric_label} utilisation "
f"Idle GPU EC2 Instance ({metric_label} utilization "
f"<{gpu_threshold if gpu_metrics else cpu_threshold}% "
f"over {idle_days} days)"
),
summary=(
f"EC2 instance '{name_tag}' ({instance_type}) has had "
f"{'GPU' if gpu_metrics else 'CPU'} utilisation below "
f"{'GPU' if gpu_metrics else 'CPU'} utilization below "
f"{gpu_threshold if gpu_metrics else cpu_threshold}% "
f"for {idle_days} days while running, incurring "
f"continuous charges (~${monthly_cost:,.0f}/month us-east-1 estimate)."
),
reason=(
f"GPU EC2 instance has low "
f"{'GPU' if gpu_metrics else 'CPU'} utilisation "
f"({util_value:.1f}%) for {idle_days} days"
),
reason=reason,
risk=risk,
confidence=confidence,
detected_at=now,
Expand All @@ -340,11 +366,11 @@ def find_idle_gpu_instances(
"instance_id": instance_id,
"instance_type": instance_type,
"name": name_tag,
"age_days": (age_days if age_days is not None else "unknown"),
"age_days": age_days,
"idle_days_threshold": idle_days,
"idle_ratio": idle_ratio,
"idle_signal": idle_signal,
"utilisation_pct": round(util_value, 2),
"utilization_pct": round(util_value, 2),
"purchasing_model": purchasing_model,
"gpu_metric_available": bool(gpu_metrics),
"gpu_metric_note": (
Expand Down Expand Up @@ -387,25 +413,34 @@ def _list_gpu_metrics(cloudwatch, instance_id: str) -> list:
"""
Probe CloudWatch ListMetrics for nvidia_smi_utilization_gpu under CWAgent namespace.

Returns the Metrics list (one entry per GPU index) so the caller can reuse it
Exhausts pagination via NextToken (spec section 2 key fact 6: ListMetrics returns up to
500 results per call). Returns all Metrics entries so the caller can reuse them
for GetMetricStatistics without a second ListMetrics call. Returns [] on any error.
"""
metrics: list = []
kwargs: dict = {
"Namespace": "CWAgent",
"MetricName": "nvidia_smi_utilization_gpu",
"Dimensions": [{"Name": "InstanceId", "Value": instance_id}],
}
try:
resp = cloudwatch.list_metrics(
Namespace="CWAgent",
MetricName="nvidia_smi_utilization_gpu",
Dimensions=[{"Name": "InstanceId", "Value": instance_id}],
)
return resp.get("Metrics", [])
while True:
resp = cloudwatch.list_metrics(**kwargs)
metrics.extend(resp.get("Metrics", []))
next_token = resp.get("NextToken")
if not next_token:
break
kwargs["NextToken"] = next_token
except Exception:
return []
return metrics


def _get_max_gpu_utilisation(
def _get_max_gpu_utilization(
cloudwatch, gpu_metrics: list, days: int, now: datetime
) -> Optional[float]:
"""
Return the maximum GPU utilisation across all GPU indices over the window.
Return the maximum GPU utilization across all GPU indices over the window.

Takes the gpu_metrics list already fetched by _list_gpu_metrics — no second
ListMetrics call. Uses MAX statistic so a single active GPU on a multi-GPU
Expand Down Expand Up @@ -440,17 +475,17 @@ def _get_max_gpu_utilisation(
return max_util


def _get_avg_cpu_utilisation(
def _get_max_daily_cpu_utilization(
cloudwatch, instance_id: str, days: int, now: datetime
) -> Optional[float]:
"""
Return the peak CPU utilisation over the window using AWS/EC2 CPUUtilization.
Return the maximum daily CPU peak over the window using AWS/EC2 CPUUtilization.

Uses Maximum statistic per day and returns the highest daily peak. This avoids
flagging burst workloads where a short but significant CPU spike would be averaged
away — if the max CPU across any day is below threshold, the instance is truly idle.
Uses Maximum statistic at daily (86400s) period and returns the highest value
across all returned datapoints (spec section 6.2). This avoids flagging burst workloads
where a short but significant CPU spike would be averaged away.

Returns None on error — caller treats None as "not idle" (safe default).
Returns None on error or no datapoints — caller treats None as "not idle" (safe default).
"""
start = now - timedelta(days=days)
try:
Expand Down
Loading
Loading