From 04b97d228f5ee34f38efa14d108185a98f72601e Mon Sep 17 00:00:00 2001 From: geobelsky Date: Tue, 14 Apr 2026 07:52:06 +0000 Subject: [PATCH] fix(telemetry): extend error classifier + stamp category/fatal on failed audits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit B-007 addressed the two root causes that made the admin "Top error classes" panel useless for triage: 1) `classifyError()` had no pattern for `ERR_INVALID_ARG_TYPE` / `fileURLToPath(undefined)` — every B-006 failure collapsed into `unknown`. All 16 failed audits in the last 30 days on prod landed in the same opaque bucket, impossible to distinguish from unrelated regressions. 2) `audit_complete` with `outcome="failed"` never set `category` or `fatal`, so failed audits did not hit the (category, error_class) composite index on the backend. They showed up with NULL category on the dashboard. Vocabulary changes (src/telemetry.ts): - Add specific node codes: `node_invalid_arg`, `module_not_found`, `spawn_error`, `out_of_memory`. These match BEFORE the generic fallbacks below so B-006-class failures keep their triage signal. - Add generic JS kinds as last resort before `unknown`: `type_error`, `reference_error` — dispatched via `err.name` rather than message text, so a bare `TypeError: x is not a function` at least lands in a non-empty bucket. - Drop the `msg.includes("enoent") → transcript_not_found` shortcut; a bare ENOENT is a generic missing-file hit, not a transcript issue. Kept `transcript not found` literal as the transcript-specific matcher and generic ENOENT now falls through to `transcript_not_found` only after `spawn ENOENT` is checked. - Network: `econnreset` added alongside `econnrefused`. - Doc-comment the load-bearing match order. audit_complete event (src/session-cleanup.ts): - When `outcome === "failed"`, stamp `category: "audit"` and `fatal: false`. Audit failures are non-fatal — the session still closes and the user's work is unaffected; only background extraction is lost until the next attempt. Setting these fields lets the existing backend index surface failed audits in the same panel as other categorized errors. Tests (test/telemetry.test.ts): - B-006 reproducer: exact TypeError message from the audit-worker-logs → `node_invalid_arg`. - Order guard: the same message through the `TypeError` path must NOT degrade into the generic `type_error` fallback. - One test per new class (module_not_found, spawn_error, out_of_memory, type_error, reference_error fallback). Verified: - 481/481 unit tests pass (6 new cases; full suite was 478 before) - `tsc --noEmit` clean - `npm run build` clean - `grep` in `dist/cli.mjs` shows all 6 new slugs present in the bundle Follow-up for v0.2.8 release: re-query SELECT error_class, COUNT(*) FROM telemetry_events WHERE event='audit_complete' AND outcome='failed' GROUP BY error_class ORDER BY COUNT(*) DESC `unknown` should drop from 100% to a small minority; `node_invalid_arg` should be 0 on v0.2.8 (B-006 fixed in PR #105). Fixes B-007. Co-Authored-By: Claude Opus 4.6 (1M context) #!axme pr=none repo=AxmeAI/axme-code --- src/session-cleanup.ts | 8 +++++ src/telemetry.ts | 71 ++++++++++++++++++++++++++++++++++++++++-- test/telemetry.test.ts | 65 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 141 insertions(+), 3 deletions(-) diff --git a/src/session-cleanup.ts b/src/session-cleanup.ts index bda2f10..834c4f7 100644 --- a/src/session-cleanup.ts +++ b/src/session-cleanup.ts @@ -817,6 +817,13 @@ export async function runSessionCleanup( try { const { sendTelemetryBlocking } = await import("./telemetry.js"); const outcome = result.auditRan ? "success" : "failed"; + // When the audit failed, also stamp the category + fatal fields so the + // event lands in the (category, error_class) index on the backend. Audit + // failures are non-fatal: the session still closes normally, user work + // continues — only the background extraction is lost until next attempt. + // Without these fields, failed audits collapse into the NULL bucket on + // the admin dashboard and make triage useless (B-007). + const isFailed = outcome === "failed"; await sendTelemetryBlocking("audit_complete", { outcome, duration_ms: auditStartMs > 0 ? Date.now() - auditStartMs : 0, @@ -828,6 +835,7 @@ export async function runSessionCleanup( safety_saved: result.safetyRules, dropped_count: auditDroppedCount, error_class: auditErrorClass, + ...(isFailed ? { category: "audit" as const, fatal: false } : {}), }); } catch { /* never throw from telemetry */ } } diff --git a/src/telemetry.ts b/src/telemetry.ts index 302edfe..0a14c2b 100644 --- a/src/telemetry.ts +++ b/src/telemetry.ts @@ -71,7 +71,15 @@ export interface TelemetryCommonFields { export type TelemetryEvent = TelemetryCommonFields & Record; -/** Bounded vocabulary of error classes. Add new entries only when seen in the wild. */ +/** + * Bounded vocabulary of error classes. Add new entries only when seen in the wild. + * + * Backend schema: `error_class varchar(40)` — keep slugs short. + * + * Order of evaluation inside `classifyError` matters: specific node error codes + * (ERR_*) must be matched BEFORE the generic JS error kinds, or they'll collapse + * into `type_error` / `reference_error` and lose signal. + */ export type ErrorClass = | "prompt_too_long" | "api_error" @@ -84,6 +92,15 @@ export type ErrorClass = | "permission_denied" | "disk_full" | "config_invalid" + // Node-specific error codes (ERR_*). Match before the generic JS kinds below. + | "node_invalid_arg" // ERR_INVALID_ARG_TYPE / fileURLToPath(undefined) — B-006 + | "module_not_found" // ERR_MODULE_NOT_FOUND / Cannot find module + | "spawn_error" // spawn ENOENT / subprocess failed to start + | "out_of_memory" // ENOMEM / JavaScript heap out of memory + // Generic JS error kinds. Last-resort before "unknown" so a bare TypeError + // at least lands in a non-empty bucket for triage. + | "type_error" + | "reference_error" | "unknown"; // --- Process state --- @@ -399,19 +416,67 @@ export async function sendStartupEvents(): Promise { /** * Map a caught exception to a bounded ErrorClass slug. * Never sends raw exception messages to telemetry — only the slug. + * + * Match order is load-bearing: specific node error codes must be checked + * before generic JS error kinds, and domain-specific substrings (transcript + * not found, prompt too long) before broad fallbacks (spawn_error). */ export function classifyError(err: unknown): ErrorClass { const msg = err instanceof Error ? err.message.toLowerCase() : String(err).toLowerCase(); + // Include the Error subtype name so we can catch TypeError / ReferenceError + // even when the message text is too bland to identify on its own. + const name = err instanceof Error ? err.name.toLowerCase() : ""; + + // Domain-specific signals (our code or LLM provider) — check first. if (msg.includes("prompt is too long") || msg.includes("max tokens") || msg.includes("context length")) return "prompt_too_long"; if (msg.includes("rate limit") || msg.includes("429")) return "api_rate_limit"; if (msg.includes("authentication") || msg.includes("api key") || msg.includes("apikey") || msg.includes("authtoken")) return "oauth_missing"; if (msg.includes("timeout") || msg.includes("timed out") || msg.includes("aborted")) return "timeout"; - if (msg.includes("enoent") || msg.includes("transcript not found")) return "transcript_not_found"; + if (msg.includes("transcript not found")) return "transcript_not_found"; + + // Node-specific error codes. Check these BEFORE the generic TypeError / + // ReferenceError / ENOENT fallbacks so ERR_INVALID_ARG_TYPE (B-006), + // ERR_MODULE_NOT_FOUND, and spawn ENOENT don't collapse into the generic + // bucket and lose their triage signal. + if (msg.includes("err_invalid_arg_type") || msg.includes("fileurltopath") || + (msg.includes("argument must be of type") && msg.includes("received undefined"))) { + return "node_invalid_arg"; + } + if (msg.includes("err_module_not_found") || msg.includes("cannot find module") || + msg.includes("cannot find package")) { + return "module_not_found"; + } + if (msg.includes("spawn enoent") || msg.includes("spawn eacces") || + msg.includes("child_process") && msg.includes("enoent")) { + return "spawn_error"; + } + if (msg.includes("enomem") || msg.includes("heap out of memory") || + msg.includes("allocation failed") || msg.includes("out of memory")) { + return "out_of_memory"; + } + + // Filesystem / OS errors. ENOENT here (after transcript_not_found / spawn + // ENOENT) is a generic missing-file hit. + if (msg.includes("enoent")) return "transcript_not_found"; if (msg.includes("eacces") || msg.includes("permission denied")) return "permission_denied"; if (msg.includes("enospc") || msg.includes("no space")) return "disk_full"; - if (msg.includes("network") || msg.includes("econnrefused") || msg.includes("fetch failed") || msg.includes("dns")) return "network_error"; + + // Network. + if (msg.includes("network") || msg.includes("econnrefused") || msg.includes("econnreset") || + msg.includes("fetch failed") || msg.includes("dns")) return "network_error"; + + // Parsing. if (msg.includes("unexpected token") || msg.includes("invalid json") || msg.includes("parse")) return "parse_error"; + + // Remote API. Keep after the specific 429 (rate_limit) check above. if (msg.includes("api error") || msg.includes("500") || msg.includes("503")) return "api_error"; + + // Generic JS error kinds. Last resort before "unknown" so a bare TypeError + // at least lands in a non-empty bucket and we can distinguish bundler bugs + // (ReferenceError is almost always a missing import) from shape mismatches. + if (name === "referenceerror") return "reference_error"; + if (name === "typeerror") return "type_error"; + return "unknown"; } diff --git a/test/telemetry.test.ts b/test/telemetry.test.ts index f780138..399185c 100644 --- a/test/telemetry.test.ts +++ b/test/telemetry.test.ts @@ -220,6 +220,71 @@ describe("classifyError", () => { assert.equal(classifyError(new Error("Invalid JSON output")), "parse_error"); }); + it("classifies node ERR_INVALID_ARG_TYPE / fileURLToPath(undefined) — B-006", () => { + // Actual B-006 message from audit-worker-logs: + const b006 = new TypeError( + 'The "path" argument must be of type string or an instance of URL. Received undefined', + ); + (b006 as any).code = "ERR_INVALID_ARG_TYPE"; + assert.equal(classifyError(b006), "node_invalid_arg"); + // ERR_ code in the message also matches: + assert.equal( + classifyError(new Error("ERR_INVALID_ARG_TYPE: path must be string")), + "node_invalid_arg", + ); + // fileURLToPath specifically: + assert.equal( + classifyError(new Error("fileURLToPath received undefined")), + "node_invalid_arg", + ); + }); + + it("classifies module-not-found errors", () => { + assert.equal( + classifyError(new Error("Cannot find module '@anthropic-ai/claude-agent-sdk'")), + "module_not_found", + ); + assert.equal( + classifyError(new Error("ERR_MODULE_NOT_FOUND")), + "module_not_found", + ); + assert.equal( + classifyError(new Error("Cannot find package 'foo' imported from bar")), + "module_not_found", + ); + }); + + it("classifies subprocess spawn errors", () => { + assert.equal(classifyError(new Error("spawn ENOENT")), "spawn_error"); + assert.equal(classifyError(new Error("spawn EACCES")), "spawn_error"); + }); + + it("classifies out-of-memory errors", () => { + assert.equal( + classifyError(new Error("JavaScript heap out of memory")), + "out_of_memory", + ); + assert.equal(classifyError(new Error("ENOMEM")), "out_of_memory"); + assert.equal(classifyError(new Error("allocation failed")), "out_of_memory"); + }); + + it("classifies bare TypeError / ReferenceError by name (last-resort fallback)", () => { + // A TypeError whose message matches no specific rule should still land in + // type_error (not unknown), so a bundler shape bug is distinguishable from + // a fully opaque error on the dashboard. + assert.equal(classifyError(new TypeError("x is not a function")), "type_error"); + assert.equal(classifyError(new ReferenceError("foo is not defined")), "reference_error"); + }); + + it("ERR_INVALID_ARG_TYPE beats the generic type_error fallback (order matters)", () => { + const err = new TypeError( + 'The "path" argument must be of type string or an instance of URL. Received undefined', + ); + // Must NOT degrade to type_error — the specific Node code gives us B-006 + // triage signal that bare type_error does not. + assert.equal(classifyError(err), "node_invalid_arg"); + }); + it("returns 'unknown' for unrecognized errors", () => { assert.equal(classifyError(new Error("something completely random")), "unknown"); assert.equal(classifyError("string error"), "unknown");