diff --git a/.claude/skills/infrastructure-health/references/exa-a3-telemetry.md b/.claude/skills/infrastructure-health/references/exa-a3-telemetry.md index c15fa9953..844122120 100644 --- a/.claude/skills/infrastructure-health/references/exa-a3-telemetry.md +++ b/.claude/skills/infrastructure-health/references/exa-a3-telemetry.md @@ -26,6 +26,15 @@ sum(rate(claude_exa_ab_latency_ms_count{arm="treatment"}[1h])) > 0.05 ``` +**Note on `invalid_input`:** This PromQL deliberately scopes to `arm="treatment"` and excludes `invalid_input`. The `invalid_input` arm fires when the orchestrator passes >5 variations and the validator rejects — it never hits Exa, so it's not a treatment-side failure. Track it separately: + +```promql +# Orchestrator misuse rate (validator rejections per hour) +sum(rate(claude_exa_ab_latency_ms_count{arm="invalid_input"}[1h])) +``` + +If this rises sustained → prompt drift (orchestrator generating too-many variations). Inspect via `event:exa_a3_validator_rejected` in Cloud Logging. + **Likely causes:** - Exa upstream instability (compare with control arm — if both elevated, point at Exa) - Orchestrator generating malformed variations (check `claude_exa_summary_type_anomaly_total` and structured logs for `event=exa_a3_validator_rejected`) diff --git a/.claude/skills/post-deploy-verify/scripts/verify-tier2.sh b/.claude/skills/post-deploy-verify/scripts/verify-tier2.sh index e0519d1ac..8fef79c94 100755 --- a/.claude/skills/post-deploy-verify/scripts/verify-tier2.sh +++ b/.claude/skills/post-deploy-verify/scripts/verify-tier2.sh @@ -52,17 +52,36 @@ else add_check "V4 — Citation verifier accepts FMP" "WARNING" "No qa-outputs found locally (manual check on staging container)" fi -# ── V5: A3 telemetry + audit log presence (PR #114, v7.6.1) ─────────────────── -A3_FLAG=$(curl -s --max-time 10 "$BASE_URL/health" | jq -r '.feature_flags.EXA_ADDITIONAL_QUERIES // "unknown"' 2>/dev/null) -if [ "$A3_FLAG" = "true" ]; then - A3_METRIC_LINES=$(curl -s --max-time 10 "$BASE_URL/metrics" 2>/dev/null | grep -c '^claude_exa_ab_latency_ms_count{.*outcome=' || echo 0) - if [ "${A3_METRIC_LINES:-0}" -gt 0 ]; then - add_check "V5 — Exa A3 telemetry" "PASSED" "claude_exa_ab_latency_ms{outcome=...} exposes $A3_METRIC_LINES label-set rows. Verify event_data.exa_a3 captured: $QUERIES_DIR/v5-exa-a3-audit-log.sql" +# ── V5: A3 telemetry + audit log presence (PR #114, v7.6.1; defensive update PR #115) ── +# Capture both /health response status AND parsed value separately so we can +# distinguish "health endpoint dead" from "flag is false" from "flag is true". +A3_HEALTH_BODY=$(curl -s --max-time 10 "$BASE_URL/health" 2>/dev/null) +A3_HEALTH_CURL_RC=$? +A3_FLAG="" +if [ "$A3_HEALTH_CURL_RC" -ne 0 ] || [ -z "$A3_HEALTH_BODY" ]; then + add_check "V5 — Exa A3 telemetry" "WARNING" "Cannot reach $BASE_URL/health to determine EXA_ADDITIONAL_QUERIES state (curl rc=$A3_HEALTH_CURL_RC). V5 inconclusive." +else + A3_FLAG=$(echo "$A3_HEALTH_BODY" | jq -r '.feature_flags.EXA_ADDITIONAL_QUERIES // empty' 2>/dev/null) + if [ "$A3_FLAG" = "true" ]; then + # /metrics must be reachable for V5 to be conclusive + A3_METRICS_BODY=$(curl -s --max-time 10 "$BASE_URL/metrics" 2>/dev/null) + if [ -z "$A3_METRICS_BODY" ]; then + add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES=true but $BASE_URL/metrics returned empty body. /metrics broken? Retry after re-deploy." + else + A3_METRIC_LINES=$(echo "$A3_METRICS_BODY" | grep -c '^claude_exa_ab_latency_ms_count{.*outcome=' || true) + A3_METRIC_LINES="${A3_METRIC_LINES:-0}" + if [ "$A3_METRIC_LINES" -gt 0 ]; then + add_check "V5 — Exa A3 telemetry" "PASSED" "claude_exa_ab_latency_ms{outcome=...} exposes $A3_METRIC_LINES label-set rows. Verify event_data.exa_a3 captured: $QUERIES_DIR/v5-exa-a3-audit-log.sql" + else + add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES=true but /metrics has 0 outcome-labeled rows. Either no A3-eligible call yet OR sdkMetrics export broken. (Note: behind a load balancer this can be a false negative — metric scraping may have hit a different instance.) Verify with: $QUERIES_DIR/v5-exa-a3-audit-log.sql" + fi + fi + elif [ "$A3_FLAG" = "false" ]; then + add_check "V5 — Exa A3 telemetry" "PASSED" "EXA_ADDITIONAL_QUERIES=false — V5 not applicable (skip)" else - add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES=true but /metrics has 0 outcome-labeled rows. Either no A3-eligible call yet OR sdkMetrics export broken. Verify with: $QUERIES_DIR/v5-exa-a3-audit-log.sql" + # A3_FLAG is empty (key missing from /health) OR null OR some unexpected value + add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES not present or non-boolean in /health.feature_flags (got: '$A3_FLAG'). Cannot evaluate V5; check /health response shape." fi -else - add_check "V5 — Exa A3 telemetry" "PASSED" "EXA_ADDITIONAL_QUERIES=$A3_FLAG — V5 not applicable (skip)" fi # ── Container env audit ─────────────────────────────────────────────────────── diff --git a/super-legal-mcp-refactored/CHANGELOG.md b/super-legal-mcp-refactored/CHANGELOG.md index 8dde32d85..bc3183fa8 100644 --- a/super-legal-mcp-refactored/CHANGELOG.md +++ b/super-legal-mcp-refactored/CHANGELOG.md @@ -2,6 +2,46 @@ All notable changes to the Super Legal MCP Server are documented in this file. +## [7.6.2] - 2026-05-10 — Exa A3 observability gaps (PR #115) + +Closes the audit gaps surfaced by 4 parallel post-merge Explore agents reviewing PR #114. Two P0 items (block prod rollout) + four P1 items (full observability during rollout). Seven P2 items deferred — tracked as system-wide or polish. + +### Architectural clarification (post-review revert) + +One of the original audit findings ("PostToolUseFailure missing `_ab_arm` extraction") was based on a wrong premise. The Agent SDK's `PostToolUseFailureHookInput` (sdk.d.ts:1858) does NOT carry `tool_response`. For Exa-tool failures, `executeExaSearch`'s catch path returns the sentinel envelope `[{_ab_arm, _ab_outcome, _empty: true}]` as a successful tool return value, which routes through PostToolUse (not PostToolUseFailure). PR #114's PostToolUse handler already extracts `_ab_arm` correctly. PostToolUseFailure fires only for SDK-transport-level failures (user interrupt, MCP wire error) where no `tool_response` exists and arm correlation is intrinsically unavailable. A comment block at `hookSSEBridge.js` PostToolUseFailure case documents this. No code change needed. + +### Fixed (P0 — block rollout) + +- **PII prohibition in `EXA_ADDITIONAL_QUERIES_GUIDANCE` (`_promptConstants.js:947`)**: New protocol step #5 — agents MUST NOT include names, SSNs, account numbers, emails, addresses, DOB in variations. Variations persist into `hook_audit_log.event_data.exa_a3` and surface in regulator handoff bundles via `client-audit-export`. 200-char truncation provides containment but not prevention. +- **V5 verify script false positives (`verify-tier2.sh`)**: Defensive `curl` rc check; explicit `true`/`false`/empty branches for `EXA_ADDITIONAL_QUERIES`; explicit `/metrics` reachability check; LB-routing false-negative warning embedded in WARNING message. + +### Added (P1 — observability during rollout) + +- **OTel trace correlation in audit log (`hookDBBridge.js`)**: `event_data.exa_a3.otel_trace_id` populated when active trace exists. Operators can now pivot from `SELECT event_data->'exa_a3' FROM hook_audit_log` to Cloud Trace by trace_id. Lazy-imported via `sdkTracing.getActiveTraceId()` — safe when OTel disabled. +- **`getActiveTraceId()` helper (`sdkTracing.js`)**: returns hex trace ID of active span or null. Safe when OTel SDK not initialized. +- **Retry-path structured logging (`BaseWebSearchClient.js`)**: `logWarn('exa_search_retry_attempt', {domain, status_code, retry_count, max_retries, delay_ms, ab_arm})` fires alongside existing `console.log` + metric. Cloud Logging filter `event:exa_search_retry_attempt` surfaces retry storms. +- **Telemetry doc clarifies `invalid_input` semantics (`exa-a3-telemetry.md`)**: dedicated section explains why treatment-failure-rate PromQL excludes `invalid_input` (validator rejection, never hits Exa) + dedicated PromQL for orchestrator-misuse rate. + +### Deferred (P2 — tracked as separate issues) + +- `agent_type` not in `hook_audit_log` rows (system-wide, not A3-specific) +- Sentinel envelope filter enforcement at HybridClient layer (refactor) +- Shared `parseToolResponse()` helper to eliminate double JSON.parse (refactor) +- `session-diagnostics` § 10 pagination for 200+ A3 rows (UX polish) +- `invalid_input` records `latencyMs=0` in histogram (replace with counter, future) +- `invalid_input` arm never increments `exaAbSampleAssignments` (asymmetric but intentional) + +### Tests + +- No new tests in this PR — existing 12 hardening tests (PR #114) cover the verified paths. +- Full A3 regression: 142/142 across 6 test files (unchanged from PR #114 baseline). + +### Compatibility + +- All changes inert when `EXA_ADDITIONAL_QUERIES=false` or OTel disabled. +- `event_data.exa_a3.otel_trace_id` is additive JSONB key — no migration. +- `exa_ab_arm` SSE event with `on_failure: true` is additive — frontend consumers that don't read the flag ignore it harmlessly. + ## [7.6.1] - 2026-05-10 — Exa A3 hardening: closes audit defects 4.2.1, 4.2.2, 4.2.5, 4.3.1, 4.3.2 (PR #114) Closes the observability and error-handling gaps surfaced by the v7.6.0 staging A/B audit (see `docs/runbooks/exa-a3-ab-report.md` § 4). When `EXA_ADDITIONAL_QUERIES=true`, the feature now meets the same auditability/governance bar as the rest of the v6.2.0+ Wave 3 surface — symmetric metrics on success and failure, structured logging routed through `sdkLogger`, hook + audit-log capture for EU AI Act Art. 12 forensic replay. diff --git a/super-legal-mcp-refactored/src/api-clients/BaseWebSearchClient.js b/super-legal-mcp-refactored/src/api-clients/BaseWebSearchClient.js index 67de41aa8..ea068c0b2 100644 --- a/super-legal-mcp-refactored/src/api-clients/BaseWebSearchClient.js +++ b/super-legal-mcp-refactored/src/api-clients/BaseWebSearchClient.js @@ -329,6 +329,16 @@ export class BaseWebSearchClient extends SearchQualityMixin { if (this.isRetryableError(statusCode) && _retryCount < this.retryConfig.maxRetries) { const delayMs = this.retryConfig.baseDelayMs * (_retryCount + 1); // Linear backoff console.log(`[Exa] Retry ${_retryCount + 1}/${this.retryConfig.maxRetries} after ${statusCode}, waiting ${delayMs}ms`); + // PR #115: structured log parity with metric. Cloud Logging can filter + // event=exa_search_retry_attempt to see all retry storms across the fleet. + logWarn('exa_search_retry_attempt', { + domain: domain || 'unknown', + status_code: statusCode, + retry_count: _retryCount + 1, + max_retries: this.retryConfig.maxRetries, + delay_ms: delayMs, + ab_arm: abArm || null + }); // PR #114 (defect 4.2.2): record outer-attempt outcome BEFORE recursion. // Recursive call has _suppressAbRecording=true so it won't double-count. _recordAbOutcome('http_error_retried', []); diff --git a/super-legal-mcp-refactored/src/config/legalSubagents/_promptConstants.js b/super-legal-mcp-refactored/src/config/legalSubagents/_promptConstants.js index a1ccde371..03799fef5 100644 --- a/super-legal-mcp-refactored/src/config/legalSubagents/_promptConstants.js +++ b/super-legal-mcp-refactored/src/config/legalSubagents/_promptConstants.js @@ -946,6 +946,7 @@ Exa Deep search parallelizes variations server-side at NO extra cost, deduplicat 2. **NEVER restate, paraphrase, expand, or annotate the primary.** The primary query already covers its own axis; variations exist to OPEN NEW AXES. 3. **Aim for 2-3 variations** (Exa-recommended count). Hard cap: 5. 4. **If you genuinely cannot identify 2+ distinct axes, OMIT the parameter** rather than ship paraphrases. Partial adoption is fine; low-quality variations actively waste cost. +5. **NEVER include PII in variations** — no individual names, Social Security numbers, account numbers, email addresses, phone numbers, addresses, dates of birth, or any other personally identifying information. Variations are persisted in \`hook_audit_log.event_data.exa_a3\` for EU AI Act Art. 12 forensic replay and surface in regulator handoff bundles. Use doctrinal anchors, statute references, jurisdiction descriptors, and case names from public court records — these are safe. If the primary query references a specific named party, do NOT propagate that name into variations; pivot to the doctrine or claim type instead. ### WORKED EXAMPLES diff --git a/super-legal-mcp-refactored/src/utils/hookDBBridge.js b/super-legal-mcp-refactored/src/utils/hookDBBridge.js index 6cb316f4d..7354d3ee1 100644 --- a/super-legal-mcp-refactored/src/utils/hookDBBridge.js +++ b/super-legal-mcp-refactored/src/utils/hookDBBridge.js @@ -837,6 +837,15 @@ async function persistAuditEvent(pool, sessionCache, hookName, input, result) { if (first?._ab_outcome) eventData.exa_a3.ab_outcome = first._ab_outcome; } } catch { /* malformed JSON — silent skip */ } + // PR #115: trace correlation. Embed active OTel trace ID so operators + // can pivot from `SELECT event_data->'exa_a3' FROM hook_audit_log` to + // Cloud Trace by trace_id. Lazy import to avoid module load cost when + // OTel disabled. Returns null when OTel SDK not initialized — safe. + try { + const { getActiveTraceId } = await import('./sdkTracing.js'); + const traceId = getActiveTraceId(); + if (traceId) eventData.exa_a3.otel_trace_id = traceId; + } catch { /* tracing module unavailable — silent skip */ } } } } diff --git a/super-legal-mcp-refactored/src/utils/hookSSEBridge.js b/super-legal-mcp-refactored/src/utils/hookSSEBridge.js index 3739a090e..a2eeb6330 100644 --- a/super-legal-mcp-refactored/src/utils/hookSSEBridge.js +++ b/super-legal-mcp-refactored/src/utils/hookSSEBridge.js @@ -503,6 +503,14 @@ export function forwardHookToSSE(hookName, input, result, onEvent, agentRegistry case 'PostToolUseFailure': { const { tool_name, error, is_interrupt } = input || {}; + // NOTE (PR #115 review): the Agent SDK does NOT deliver `tool_response` + // on PostToolUseFailureHookInput (per sdk.d.ts:1858). For Exa-tool + // failures, executeExaSearch's catch path returns a sentinel envelope as + // a successful tool return value, which routes through PostToolUse (not + // here) and PR #114's PostToolUse handler extracts `_ab_arm` from it + // correctly. This hook fires only for SDK-transport-level failures + // (interrupt, timeout in the agent loop, MCP wire error) where no + // tool_response exists and arm correlation is intrinsically unavailable. onEvent('tool_failure', { tool_name: tool_name || null, error: typeof error === 'string' ? error.slice(0, 200) : 'Unknown error', diff --git a/super-legal-mcp-refactored/src/utils/sdkTracing.js b/super-legal-mcp-refactored/src/utils/sdkTracing.js index bc5b1b442..b7b354bed 100644 --- a/super-legal-mcp-refactored/src/utils/sdkTracing.js +++ b/super-legal-mcp-refactored/src/utils/sdkTracing.js @@ -126,4 +126,23 @@ export function runInContext(ctx, fn) { return api.context.with(ctx, fn); } +/** + * PR #115: Return the current active trace ID hex string, or null if no + * active span / OTel disabled. Used to correlate `hook_audit_log.event_data` + * rows to Cloud Trace spans for cross-system forensic replay (EU AI Act + * Art. 12). Safe to call unconditionally — returns null when OTel SDK isn't + * initialized rather than throwing. + */ +export function getActiveTraceId() { + if (!api?.trace) return null; + try { + const span = api.trace.getActiveSpan(); + if (!span) return null; + const ctx = span.spanContext(); + return ctx?.traceId || null; + } catch { + return null; + } +} + export { getTracer, SpanStatusCode }; diff --git a/super-legal-mcp-refactored/test/sdk/exa-a3-hardening.test.js b/super-legal-mcp-refactored/test/sdk/exa-a3-hardening.test.js index 0ce2cf405..f63f879e3 100644 --- a/super-legal-mcp-refactored/test/sdk/exa-a3-hardening.test.js +++ b/super-legal-mcp-refactored/test/sdk/exa-a3-hardening.test.js @@ -201,6 +201,7 @@ describe('A3 hardening (PR #114) — defects 4.2.1, 4.2.2, 4.3.1, 4.3.2', () => expect(armEvent).toBeUndefined(); }); + // ── Driver bug: getMetric aggregation across domain labels ── test('8. driver getMetric sums across multiple domain labels', () => {