Number531 · Number531 · May 11, 2026 · May 11, 2026
diff --git a/.claude/skills/infrastructure-health/references/exa-a3-telemetry.md b/.claude/skills/infrastructure-health/references/exa-a3-telemetry.md
@@ -26,6 +26,15 @@ sum(rate(claude_exa_ab_latency_ms_count{arm="treatment"}[1h]))
 > 0.05
 ```
 
+**Note on `invalid_input`:** This PromQL deliberately scopes to `arm="treatment"` and excludes `invalid_input`. The `invalid_input` arm fires when the orchestrator passes >5 variations and the validator rejects — it never hits Exa, so it's not a treatment-side failure. Track it separately:
+
+```promql
+# Orchestrator misuse rate (validator rejections per hour)
+sum(rate(claude_exa_ab_latency_ms_count{arm="invalid_input"}[1h]))
+```
+
+If this rises sustained → prompt drift (orchestrator generating too-many variations). Inspect via `event:exa_a3_validator_rejected` in Cloud Logging.
+
 **Likely causes:**
 - Exa upstream instability (compare with control arm — if both elevated, point at Exa)
 - Orchestrator generating malformed variations (check `claude_exa_summary_type_anomaly_total` and structured logs for `event=exa_a3_validator_rejected`)

diff --git a/.claude/skills/post-deploy-verify/scripts/verify-tier2.sh b/.claude/skills/post-deploy-verify/scripts/verify-tier2.sh
@@ -52,17 +52,36 @@ else
   add_check "V4 — Citation verifier accepts FMP" "WARNING" "No qa-outputs found locally (manual check on staging container)"
 fi
 
-# ── V5: A3 telemetry + audit log presence (PR #114, v7.6.1) ───────────────────
-A3_FLAG=$(curl -s --max-time 10 "$BASE_URL/health" | jq -r '.feature_flags.EXA_ADDITIONAL_QUERIES // "unknown"' 2>/dev/null)
-if [ "$A3_FLAG" = "true" ]; then
-  A3_METRIC_LINES=$(curl -s --max-time 10 "$BASE_URL/metrics" 2>/dev/null | grep -c '^claude_exa_ab_latency_ms_count{.*outcome=' || echo 0)
-  if [ "${A3_METRIC_LINES:-0}" -gt 0 ]; then
-    add_check "V5 — Exa A3 telemetry" "PASSED" "claude_exa_ab_latency_ms{outcome=...} exposes $A3_METRIC_LINES label-set rows. Verify event_data.exa_a3 captured: $QUERIES_DIR/v5-exa-a3-audit-log.sql"
+# ── V5: A3 telemetry + audit log presence (PR #114, v7.6.1; defensive update PR #115) ──
+# Capture both /health response status AND parsed value separately so we can
+# distinguish "health endpoint dead" from "flag is false" from "flag is true".
+A3_HEALTH_BODY=$(curl -s --max-time 10 "$BASE_URL/health" 2>/dev/null)
+A3_HEALTH_CURL_RC=$?
+A3_FLAG=""
+if [ "$A3_HEALTH_CURL_RC" -ne 0 ] || [ -z "$A3_HEALTH_BODY" ]; then
+  add_check "V5 — Exa A3 telemetry" "WARNING" "Cannot reach $BASE_URL/health to determine EXA_ADDITIONAL_QUERIES state (curl rc=$A3_HEALTH_CURL_RC). V5 inconclusive."
+else
+  A3_FLAG=$(echo "$A3_HEALTH_BODY" | jq -r '.feature_flags.EXA_ADDITIONAL_QUERIES // empty' 2>/dev/null)
+  if [ "$A3_FLAG" = "true" ]; then
+    # /metrics must be reachable for V5 to be conclusive
+    A3_METRICS_BODY=$(curl -s --max-time 10 "$BASE_URL/metrics" 2>/dev/null)
+    if [ -z "$A3_METRICS_BODY" ]; then
+      add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES=true but $BASE_URL/metrics returned empty body. /metrics broken? Retry after re-deploy."
+    else
+      A3_METRIC_LINES=$(echo "$A3_METRICS_BODY" | grep -c '^claude_exa_ab_latency_ms_count{.*outcome=' || true)
+      A3_METRIC_LINES="${A3_METRIC_LINES:-0}"
+      if [ "$A3_METRIC_LINES" -gt 0 ]; then
+        add_check "V5 — Exa A3 telemetry" "PASSED" "claude_exa_ab_latency_ms{outcome=...} exposes $A3_METRIC_LINES label-set rows. Verify event_data.exa_a3 captured: $QUERIES_DIR/v5-exa-a3-audit-log.sql"
+      else
+        add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES=true but /metrics has 0 outcome-labeled rows. Either no A3-eligible call yet OR sdkMetrics export broken. (Note: behind a load balancer this can be a false negative — metric scraping may have hit a different instance.) Verify with: $QUERIES_DIR/v5-exa-a3-audit-log.sql"
+      fi
+    fi
+  elif [ "$A3_FLAG" = "false" ]; then
+    add_check "V5 — Exa A3 telemetry" "PASSED" "EXA_ADDITIONAL_QUERIES=false — V5 not applicable (skip)"
   else
-    add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES=true but /metrics has 0 outcome-labeled rows. Either no A3-eligible call yet OR sdkMetrics export broken. Verify with: $QUERIES_DIR/v5-exa-a3-audit-log.sql"
+    # A3_FLAG is empty (key missing from /health) OR null OR some unexpected value
+    add_check "V5 — Exa A3 telemetry" "WARNING" "EXA_ADDITIONAL_QUERIES not present or non-boolean in /health.feature_flags (got: '$A3_FLAG'). Cannot evaluate V5; check /health response shape."
   fi
-else
-  add_check "V5 — Exa A3 telemetry" "PASSED" "EXA_ADDITIONAL_QUERIES=$A3_FLAG — V5 not applicable (skip)"
 fi
 
 # ── Container env audit ───────────────────────────────────────────────────────

diff --git a/super-legal-mcp-refactored/CHANGELOG.md b/super-legal-mcp-refactored/CHANGELOG.md
@@ -2,6 +2,46 @@
 
 All notable changes to the Super Legal MCP Server are documented in this file.
 
+## [7.6.2] - 2026-05-10 — Exa A3 observability gaps (PR #115)
+
+Closes the audit gaps surfaced by 4 parallel post-merge Explore agents reviewing PR #114. Two P0 items (block prod rollout) + four P1 items (full observability during rollout). Seven P2 items deferred — tracked as system-wide or polish.
+
+### Architectural clarification (post-review revert)
+
+One of the original audit findings ("PostToolUseFailure missing `_ab_arm` extraction") was based on a wrong premise. The Agent SDK's `PostToolUseFailureHookInput` (sdk.d.ts:1858) does NOT carry `tool_response`. For Exa-tool failures, `executeExaSearch`'s catch path returns the sentinel envelope `[{_ab_arm, _ab_outcome, _empty: true}]` as a successful tool return value, which routes through PostToolUse (not PostToolUseFailure). PR #114's PostToolUse handler already extracts `_ab_arm` correctly. PostToolUseFailure fires only for SDK-transport-level failures (user interrupt, MCP wire error) where no `tool_response` exists and arm correlation is intrinsically unavailable. A comment block at `hookSSEBridge.js` PostToolUseFailure case documents this. No code change needed.
+
+### Fixed (P0 — block rollout)
+
+- **PII prohibition in `EXA_ADDITIONAL_QUERIES_GUIDANCE` (`_promptConstants.js:947`)**: New protocol step #5 — agents MUST NOT include names, SSNs, account numbers, emails, addresses, DOB in variations. Variations persist into `hook_audit_log.event_data.exa_a3` and surface in regulator handoff bundles via `client-audit-export`. 200-char truncation provides containment but not prevention.
+- **V5 verify script false positives (`verify-tier2.sh`)**: Defensive `curl` rc check; explicit `true`/`false`/empty branches for `EXA_ADDITIONAL_QUERIES`; explicit `/metrics` reachability check; LB-routing false-negative warning embedded in WARNING message.
+
+### Added (P1 — observability during rollout)
+
+- **OTel trace correlation in audit log (`hookDBBridge.js`)**: `event_data.exa_a3.otel_trace_id` populated when active trace exists. Operators can now pivot from `SELECT event_data->'exa_a3' FROM hook_audit_log` to Cloud Trace by trace_id. Lazy-imported via `sdkTracing.getActiveTraceId()` — safe when OTel disabled.
+- **`getActiveTraceId()` helper (`sdkTracing.js`)**: returns hex trace ID of active span or null. Safe when OTel SDK not initialized.
+- **Retry-path structured logging (`BaseWebSearchClient.js`)**: `logWarn('exa_search_retry_attempt', {domain, status_code, retry_count, max_retries, delay_ms, ab_arm})` fires alongside existing `console.log` + metric. Cloud Logging filter `event:exa_search_retry_attempt` surfaces retry storms.
+- **Telemetry doc clarifies `invalid_input` semantics (`exa-a3-telemetry.md`)**: dedicated section explains why treatment-failure-rate PromQL excludes `invalid_input` (validator rejection, never hits Exa) + dedicated PromQL for orchestrator-misuse rate.
+
+### Deferred (P2 — tracked as separate issues)
+
+- `agent_type` not in `hook_audit_log` rows (system-wide, not A3-specific)
+- Sentinel envelope filter enforcement at HybridClient layer (refactor)
+- Shared `parseToolResponse()` helper to eliminate double JSON.parse (refactor)
+- `session-diagnostics` § 10 pagination for 200+ A3 rows (UX polish)
+- `invalid_input` records `latencyMs=0` in histogram (replace with counter, future)
+- `invalid_input` arm never increments `exaAbSampleAssignments` (asymmetric but intentional)
+
+### Tests
+
+- No new tests in this PR — existing 12 hardening tests (PR #114) cover the verified paths.
+- Full A3 regression: 142/142 across 6 test files (unchanged from PR #114 baseline).
+
+### Compatibility
+
+- All changes inert when `EXA_ADDITIONAL_QUERIES=false` or OTel disabled.
+- `event_data.exa_a3.otel_trace_id` is additive JSONB key — no migration.
+- `exa_ab_arm` SSE event with `on_failure: true` is additive — frontend consumers that don't read the flag ignore it harmlessly.
+
 ## [7.6.1] - 2026-05-10 — Exa A3 hardening: closes audit defects 4.2.1, 4.2.2, 4.2.5, 4.3.1, 4.3.2 (PR #114)
 
 Closes the observability and error-handling gaps surfaced by the v7.6.0 staging A/B audit (see `docs/runbooks/exa-a3-ab-report.md` § 4). When `EXA_ADDITIONAL_QUERIES=true`, the feature now meets the same auditability/governance bar as the rest of the v6.2.0+ Wave 3 surface — symmetric metrics on success and failure, structured logging routed through `sdkLogger`, hook + audit-log capture for EU AI Act Art. 12 forensic replay.

diff --git a/super-legal-mcp-refactored/src/api-clients/BaseWebSearchClient.js b/super-legal-mcp-refactored/src/api-clients/BaseWebSearchClient.js
@@ -329,6 +329,16 @@ export class BaseWebSearchClient extends SearchQualityMixin {
         if (this.isRetryableError(statusCode) && _retryCount < this.retryConfig.maxRetries) {
           const delayMs = this.retryConfig.baseDelayMs * (_retryCount + 1); // Linear backoff
           console.log(`[Exa] Retry ${_retryCount + 1}/${this.retryConfig.maxRetries} after ${statusCode}, waiting ${delayMs}ms`);
+          // PR #115: structured log parity with metric. Cloud Logging can filter
+          // event=exa_search_retry_attempt to see all retry storms across the fleet.
+          logWarn('exa_search_retry_attempt', {
+            domain: domain || 'unknown',
+            status_code: statusCode,
+            retry_count: _retryCount + 1,
+            max_retries: this.retryConfig.maxRetries,
+            delay_ms: delayMs,
+            ab_arm: abArm || null
+          });
           // PR #114 (defect 4.2.2): record outer-attempt outcome BEFORE recursion.
           // Recursive call has _suppressAbRecording=true so it won't double-count.
           _recordAbOutcome('http_error_retried', []);

diff --git a/super-legal-mcp-refactored/src/config/legalSubagents/_promptConstants.js b/super-legal-mcp-refactored/src/config/legalSubagents/_promptConstants.js
@@ -946,6 +946,7 @@ Exa Deep search parallelizes variations server-side at NO extra cost, deduplicat
 2. **NEVER restate, paraphrase, expand, or annotate the primary.** The primary query already covers its own axis; variations exist to OPEN NEW AXES.
 3. **Aim for 2-3 variations** (Exa-recommended count). Hard cap: 5.
 4. **If you genuinely cannot identify 2+ distinct axes, OMIT the parameter** rather than ship paraphrases. Partial adoption is fine; low-quality variations actively waste cost.
+5. **NEVER include PII in variations** — no individual names, Social Security numbers, account numbers, email addresses, phone numbers, addresses, dates of birth, or any other personally identifying information. Variations are persisted in \`hook_audit_log.event_data.exa_a3\` for EU AI Act Art. 12 forensic replay and surface in regulator handoff bundles. Use doctrinal anchors, statute references, jurisdiction descriptors, and case names from public court records — these are safe. If the primary query references a specific named party, do NOT propagate that name into variations; pivot to the doctrine or claim type instead.
 
 ### WORKED EXAMPLES
 

diff --git a/super-legal-mcp-refactored/src/utils/hookDBBridge.js b/super-legal-mcp-refactored/src/utils/hookDBBridge.js
@@ -837,6 +837,15 @@ async function persistAuditEvent(pool, sessionCache, hookName, input, result) {
             if (first?._ab_outcome) eventData.exa_a3.ab_outcome = first._ab_outcome;
           }
         } catch { /* malformed JSON — silent skip */ }
+        // PR #115: trace correlation. Embed active OTel trace ID so operators
+        // can pivot from `SELECT event_data->'exa_a3' FROM hook_audit_log` to
+        // Cloud Trace by trace_id. Lazy import to avoid module load cost when
+        // OTel disabled. Returns null when OTel SDK not initialized — safe.
+        try {
+          const { getActiveTraceId } = await import('./sdkTracing.js');
+          const traceId = getActiveTraceId();
+          if (traceId) eventData.exa_a3.otel_trace_id = traceId;
+        } catch { /* tracing module unavailable — silent skip */ }
       }
     }
   }

diff --git a/super-legal-mcp-refactored/src/utils/hookSSEBridge.js b/super-legal-mcp-refactored/src/utils/hookSSEBridge.js
@@ -503,6 +503,14 @@ export function forwardHookToSSE(hookName, input, result, onEvent, agentRegistry
 
     case 'PostToolUseFailure': {
       const { tool_name, error, is_interrupt } = input || {};
+      // NOTE (PR #115 review): the Agent SDK does NOT deliver `tool_response`
+      // on PostToolUseFailureHookInput (per sdk.d.ts:1858). For Exa-tool
+      // failures, executeExaSearch's catch path returns a sentinel envelope as
+      // a successful tool return value, which routes through PostToolUse (not
+      // here) and PR #114's PostToolUse handler extracts `_ab_arm` from it
+      // correctly. This hook fires only for SDK-transport-level failures
+      // (interrupt, timeout in the agent loop, MCP wire error) where no
+      // tool_response exists and arm correlation is intrinsically unavailable.
       onEvent('tool_failure', {
         tool_name: tool_name || null,
         error: typeof error === 'string' ? error.slice(0, 200) : 'Unknown error',

diff --git a/super-legal-mcp-refactored/src/utils/sdkTracing.js b/super-legal-mcp-refactored/src/utils/sdkTracing.js
@@ -126,4 +126,23 @@ export function runInContext(ctx, fn) {
   return api.context.with(ctx, fn);
 }
 
+/**
+ * PR #115: Return the current active trace ID hex string, or null if no
+ * active span / OTel disabled. Used to correlate `hook_audit_log.event_data`
+ * rows to Cloud Trace spans for cross-system forensic replay (EU AI Act
+ * Art. 12). Safe to call unconditionally — returns null when OTel SDK isn't
+ * initialized rather than throwing.
+ */
+export function getActiveTraceId() {
+  if (!api?.trace) return null;
+  try {
+    const span = api.trace.getActiveSpan();
+    if (!span) return null;
+    const ctx = span.spanContext();
+    return ctx?.traceId || null;
+  } catch {
+    return null;
+  }
+}
+
 export { getTracer, SpanStatusCode };
diff --git a/super-legal-mcp-refactored/test/sdk/exa-a3-hardening.test.js b/super-legal-mcp-refactored/test/sdk/exa-a3-hardening.test.js
@@ -201,6 +201,7 @@ describe('A3 hardening (PR #114) — defects 4.2.1, 4.2.2, 4.3.1, 4.3.2', () =>
     expect(armEvent).toBeUndefined();
   });
 
+
   // ── Driver bug: getMetric aggregation across domain labels ──
 
   test('8. driver getMetric sums across multiple domain labels', () => {
-Original file line number
+Diff line change
@@ Expand Up @@
         expect(armEvent).toBeUndefined();
       });
       // ── Driver bug: getMetric aggregation across domain labels ──
       test('8. driver getMetric sums across multiple domain labels', () => {
@@ Expand Down @@