diff --git a/super-legal-mcp-refactored/CHANGELOG.md b/super-legal-mcp-refactored/CHANGELOG.md index 113cdccd9..1e53d4d74 100644 --- a/super-legal-mcp-refactored/CHANGELOG.md +++ b/super-legal-mcp-refactored/CHANGELOG.md @@ -4,6 +4,50 @@ All notable changes to the Super Legal MCP Server are documented in this file. ## [Unreleased] +### Added — Avenue A v2: Structured-output enforcement on code-execution bridge (PR forthcoming) + +Eliminates the "missing envelope on turn 1 → corrective retry on turn 2" pattern observed in production renders (PR #134 L4 logs: 1-of-5 phases retried). Adds Anthropic's `output_config: { format: { type: 'json_schema', schema: {...} } }` enforcement on the model's text-block output, validated empirically against the `code_execution_20260120` tool + streaming + `pause_turn` continuations. + +**Two envelope schema variants** (`src/tools/codeExecutionBridge.js`): +- `ENVELOPE_SCHEMA_GENERAL` — non-xlsx callers (MCP gateway, Agent SDK subagents, `run_python_analysis`): enforces `{ success, analysis, data, charts, stderr }` shape on text output. L5 verified: turn_count=1, `envelope_source='text'`, output_config-enforced JSON in text block. +- `ENVELOPE_SCHEMA_XLSX` (Option A — audit-only): enforces ONLY audit metadata (`audit_results`, `sheets`, `phase_sources`, `named_ranges_count`) in text. The b64_xlsx binary payload continues via stdout (legacy path) to avoid the MAX_TOKENS budget cliff that broke the first attempt (text_len=62106 on phase3 LBO). Bridge merges audit-from-text + b64-from-stdout in `selectEnvelopeWithFallback()`. + +**Anthropic structured-output schema constraints discovered empirically (L2/L4)**: +- All `type: 'object'` nodes MUST set `additionalProperties: false` (400 error otherwise) +- `minimum` / `maximum` not supported on numeric types +- `minLength` / `maxLength` likely also unsupported (stripped defensively) +- `enum` works on strings (confirmed) + +**Feature flag**: `STRUCTURED_OUTPUT_ENFORCEMENT` (default `false`). Flag off → byte-identical to pre-Avenue-A-v2 (verified: L3 flag-off + flag-on both 197/0/2). Flag on → API-level JSON enforcement engages. + +**Cross-caller benefit** (verified via Phase 1 Explore): the bridge is shared infrastructure. The fix benefits: +- All 5 xlsx templates (full-deal-workbook, lbo-focused, valuation-only, tax-memo-workbook, session-models) via `src/utils/xlsxRenderer/index.js:208` (single-turn) + `multiTurnOrchestrator.js:105` (multi-turn) +- MCP tool gateway (`run_python_analysis` via `toolImplementations.js:958`) when caller provides `constraints.output_format` +- Agent SDK subagents (tax analyst, antitrust analyst, etc.) via `agentSdkToolAdapter.js:167` +- NOT affected: Skills' parallel native path at `legacyStreamHandler.js:78` (registers `code_execution_20250825` directly, bypasses bridge) + +**L4 empirical comparison** (full-deal-workbook live renders, paired against same seeded session inputs): + +| | Baseline (flag OFF) | Treatment v1 (full schema) | **Option A (audit-only schema)** | +|---|---|---|---| +| `success` | true | **false** (phase3 LBO failed at max_tokens cliff) | ✅ **true** | +| Wall time | 8.9 min (532s) | 19.2 min (1151s) | ✅ **7.8 min (465s)** | +| Phases delivered | 5/5 | 4/5 | ✅ **5/5** | +| MAX_TOKENS hits | 0 | 1 | ✅ **0** | +| Named ranges | 32 | n/a | ✅ **46** | + +Option A matches baseline reliability + adds API-level structured-output enforcement on the audit envelope. Per-phase wall times match baseline (no b64-in-text verbose overhead). + +**Files modified**: +- `src/tools/codeExecutionBridge.js` — schemas, output_config injection (2 call sites: 572, 603), extractResults (parsed_output extraction), selectEnvelopeWithFallback (priority + merge logic) +- `src/config/featureFlags.js` — STRUCTURED_OUTPUT_ENFORCEMENT flag (default false) +- `flags.env` — STRUCTURED_OUTPUT_ENFORCEMENT=false (prod default) +- `test/sdk/code-execution-bridge.test.js` — 3 new Avenue A v2 source-level assertions (10 assertions total) + +**Test suite**: 197/0/2 maintained on both flag states. Bridge tests gain 10 Avenue A v2 source-level assertions. + +**Rollback**: trivial flag flip (set STRUCTURED_OUTPUT_ENFORCEMENT=false in flags.env, redeploy). No schema/migration/data implications. + ### Changed — Avenue B Phase 1: `full-deal-workbook` sensitivity isolation (Issue #100, PR forthcoming) The `full-deal-workbook` template's phase split has been rebalanced. Phase count is **unchanged at 5** — only the sheet routing within `phase4` and `phase5` changes: diff --git a/super-legal-mcp-refactored/flags.env b/super-legal-mcp-refactored/flags.env index 284a995b9..3484eee6b 100644 --- a/super-legal-mcp-refactored/flags.env +++ b/super-legal-mcp-refactored/flags.env @@ -15,6 +15,11 @@ FILES_API_CHART_EXTRACTION=true CHART_PERSISTENCE=true # XLSX renderer — Phase 1B foundation (default false; flip after Phase 0 probes pass + staging validation) XLSX_RENDERER=false +# Avenue A v2 — code-execution-bridge envelope JSON schema enforcement via output_config. +# Default false (existing prompt-level + corrective-retry path). When true, bridge +# enforces envelope shape at the Anthropic API level via output_config + parsed_output +# extraction. Target: eliminate the ~80% turn-1-envelope-miss rate. +STRUCTURED_OUTPUT_ENFORCEMENT=false # Phase 7 — operational caps (per-process; multi-pod multiplies) XLSX_RENDER_CONCURRENCY=10 XLSX_RENDER_MAX_QUEUE=50 diff --git a/super-legal-mcp-refactored/src/config/featureFlags.js b/super-legal-mcp-refactored/src/config/featureFlags.js index 7c05458f1..451e7db02 100644 --- a/super-legal-mcp-refactored/src/config/featureFlags.js +++ b/super-legal-mcp-refactored/src/config/featureFlags.js @@ -45,6 +45,15 @@ export const featureFlags = { // Default false; flip true after Phase 0 probes pass and staging validation completes. // Plan: docs/pending-updates/excel-code-execution.md v4.5 XLSX_RENDERER: envBool(process.env.XLSX_RENDERER, false), + // Avenue A v2 (Issue forthcoming) — enforce envelope JSON shape at the Anthropic + // API level via output_config: { format: { type: 'json_schema', schema: {...} } }. + // When off (default), bridge uses existing prompt-level + corrective-retry path. + // When on, code-execution-bridge passes output_config to the API call and updates + // extractResults to read parsed_output / final text block before falling back to + // stdout. Target: eliminate the ~80% turn-1-envelope-miss rate observed in L4 + // live render logs (PR #134 validation). + // Plan: /Users/ej/.claude/plans/glittery-toasting-stardust.md + STRUCTURED_OUTPUT_ENFORCEMENT: envBool(process.env.STRUCTURED_OUTPUT_ENFORCEMENT, false), // Files API chart extraction — native PNG retrieval via file_id references // Requires files-api-2025-04-14 beta header. Falls back to base64-through-stdout when off. FILES_API_CHART_EXTRACTION: envBool(process.env.FILES_API_CHART_EXTRACTION, true), diff --git a/super-legal-mcp-refactored/src/tools/codeExecutionBridge.js b/super-legal-mcp-refactored/src/tools/codeExecutionBridge.js index 18a72fcd2..e99c5a1c6 100644 --- a/super-legal-mcp-refactored/src/tools/codeExecutionBridge.js +++ b/super-legal-mcp-refactored/src/tools/codeExecutionBridge.js @@ -64,6 +64,133 @@ const MAX_PAUSE_CONTINUATIONS = 5; // Per Anthropic docs: recommended limit for // both addressed by streaming. const OVERALL_TIMEOUT_MS = Number(process.env.CODE_EXECUTION_TIMEOUT_MS) || 1_200_000; +// ─── Avenue A v2: Structured-Output Envelope Schemas ──────────────────────── +// +// When STRUCTURED_OUTPUT_ENFORCEMENT=true, the bridge passes one of these as +// `output_config.format` to enforce envelope JSON shape at the Anthropic API +// level (not just prompt level). Picks variant based on +// constraints?.output_format?.includes('b64_xlsx') — matches existing detection +// at line 613. +// +// SDK type signature: matches JSONOutputFormat at +// node_modules/@anthropic-ai/sdk/resources/messages/messages.d.ts:506-513 +// (only `type: 'json_schema'` is supported in SDK 0.86.1; no `'json_object'` mode). +// +// Plan: /Users/ej/.claude/plans/glittery-toasting-stardust.md "Schema definitions". + +// Anthropic structured-output requires `additionalProperties: false` on every +// `type: 'object'` node (L2 finding 2026-05-15 — API error 400 explicit: +// "For 'object' type, 'additionalProperties: true' is not supported. Please set +// 'additionalProperties' to false"). Permissive nested shapes use empty +// `properties: {}` + `additionalProperties: false` together — accepts an empty +// object but caller can still pass through any KEYS via tryParseJSON downstream +// since the model emits the envelope as a STRING in text/parsed_output and the +// bridge re-parses (the schema validates shape; downstream JSONB is opaque). +// For genuinely-variable sub-structures (audit_results.checks varies per phase), +// declare the sub-object as `{ type: 'object', additionalProperties: false }` +// with NO properties listed — model is forced to emit `{}` for these sub-fields, +// then the actual phase-specific keys live in `data` (top-level passthrough +// fields not validated by output_config) OR the model includes them within +// a permissive `extras` field declared below. Trade-off: tighter shape on the +// envelope contract, looser on the inner data. Acceptable because the bridge's +// existing post-extraction validation (isCompleteXlsxB64, etc.) handles the +// actual semantic checks. + +// Anthropic structured-output schema constraints (verified empirically via L2/L4): +// - All `type: 'object'` nodes MUST set `additionalProperties: false` +// - `minimum` / `maximum` NOT supported on numeric types +// - `minLength` / `maxLength` likely also NOT supported (stripped defensively) +// - `enum` works on strings (L2 confirmed) +// Schema validates SHAPE only; semantic validation (isCompleteXlsxB64, magic +// bytes, value ranges) happens in downstream code after envelope extraction. +// +// Option A (post-L4 pivot, 2026-05-15): The xlsx schema enforces ONLY the audit +// metadata in the model's text-block output. b64_xlsx (multi-KB binary payload) +// stays in the bash_code_execution_tool_result.stdout path where it has no +// token-budget pressure. This separates the two data classes by their natural +// transport: small structured metadata via text/parsed_output (output_config +// enforced); large binary payload via stdout (legacy path). The bridge merges +// both at the envelope-construction step. L4 prior attempt (treatment 9600202, +// schema with b64_xlsx required in text) hit max_tokens=32K cliff on phase3 +// LBO (text_len=62106) → corrupt envelope → render failure. Option A removes +// the b64_xlsx field from text-enforcement; b64 retained in stdout. + +const ENVELOPE_SCHEMA_XLSX = { + type: 'json_schema', + schema: { + type: 'object', + required: ['audit_results'], + properties: { + audit_results: { + type: 'object', + required: ['status'], + properties: { + status: { type: 'string', enum: ['PASS', 'FAIL', 'UNKNOWN'] }, + checks: { type: 'object', additionalProperties: false }, + warnings: { type: 'array', items: { type: 'string' } }, + }, + additionalProperties: false, + }, + phase_sources: { + type: 'array', + items: { + type: 'object', + properties: { + sheet: { type: 'string' }, + cell: { type: 'string' }, + source_url: { type: 'string' }, + citation: { type: 'string' }, + }, + additionalProperties: false, + }, + }, + sheets: { type: 'array', items: { type: 'string' } }, + named_ranges_count: { type: 'integer' }, + }, + additionalProperties: false, + }, +}; + +const ENVELOPE_SCHEMA_GENERAL = { + type: 'json_schema', + schema: { + type: 'object', + required: ['success'], + properties: { + success: { type: 'boolean' }, + analysis: { type: 'string' }, + data: { type: 'object', additionalProperties: false }, // permissive empty-only; model puts actual values in `data` via downstream JSON + mean: { type: 'number' }, // common analytical output, hoisted to top-level for accessibility + methodology: { type: 'string' }, + charts: { + type: 'array', + items: { + type: 'object', + properties: { + name: { type: 'string' }, + data: { type: 'string' }, + file_id: { type: 'string' }, + source: { type: 'string' }, + disk_path: { type: 'string' }, + }, + additionalProperties: false, + }, + }, + stderr: { type: ['string', 'null'] }, + }, + additionalProperties: false, + }, +}; + +// Helper: returns the right schema variant based on caller's intent. +// Mirrors the existing detection logic at line 613 (needsEnvelope check). +function getEnvelopeSchema(constraints) { + if (constraints?.output_format?.includes('b64_xlsx')) { + return ENVELOPE_SCHEMA_XLSX; + } + return ENVELOPE_SCHEMA_GENERAL; +} + // ─── Feature Flag ─────────────────────────────────────────────────────────── /** @@ -498,6 +625,14 @@ export async function runPythonAnalysis( // it (cross-phase reuse). Inner pause_turn continuations already // do this via the same `containerId` variable a few lines below. ...(containerId ? { container: containerId } : {}), + // Avenue A v2: enforce envelope JSON shape at the API level when flag on. + // SDK 0.86.1 OutputConfig (messages.d.ts:1908) accepts json_schema format; + // SDK auto-parses into response.parsed_output. Bridge's extractResults + // reads parsed_output / final text block first (see line ~944). + // Flag off → byte-identical to pre-Avenue-A-v2 behavior. + ...(featureFlags.STRUCTURED_OUTPUT_ENFORCEMENT + ? { output_config: { format: getEnvelopeSchema(constraints) } } + : {}), messages }, { signal: controller.signal, @@ -526,6 +661,11 @@ export async function runPythonAnalysis( system: cachedSystem, tools: [{ type: CODE_EXECUTION_TOOL_TYPE, name: 'code_execution' }], ...(containerId ? { container: containerId } : {}), + // Avenue A v2: also enforce on pause_turn continuations so the schema + // applies consistently through the iteration loop. + ...(featureFlags.STRUCTURED_OUTPUT_ENFORCEMENT + ? { output_config: { format: getEnvelopeSchema(constraints) } } + : {}), messages: [ { role: 'user', content: userContent }, { role: 'assistant', content: response.content } @@ -587,7 +727,8 @@ export async function runPythonAnalysis( if (extracted.executionSuccess && extracted.outputs.length > 0) { finalResult.success = true; finalResult.raw_output = extracted.outputs[extracted.outputs.length - 1]; - finalResult.data = selectBestJSON(extracted.outputs); + // Avenue A v2: envelope-priority order — parsed_output → final text → stdout. + finalResult.data = selectEnvelopeWithFallback(extracted, finalResult, constraints); finalResult.charts = extracted.charts; finalResult.stderr = extracted.stderr; await downloadFilesApiCharts(client, extracted.fileIds, finalResult); @@ -673,7 +814,7 @@ export async function runPythonAnalysis( if (extracted.outputs.length > 0) { finalResult.success = true; finalResult.raw_output = extracted.outputs[extracted.outputs.length - 1]; - finalResult.data = selectBestJSON(extracted.outputs); + finalResult.data = selectEnvelopeWithFallback(extracted, finalResult, constraints); finalResult.charts = extracted.charts; finalResult.stderr = extracted.stderr; await downloadFilesApiCharts(client, extracted.fileIds, finalResult); @@ -686,7 +827,7 @@ export async function runPythonAnalysis( if (extracted.outputs.length > 0) { finalResult.success = true; finalResult.raw_output = extracted.outputs[extracted.outputs.length - 1]; - finalResult.data = selectBestJSON(extracted.outputs); + finalResult.data = selectEnvelopeWithFallback(extracted, finalResult, constraints); finalResult.charts = extracted.charts; finalResult.stderr = extracted.stderr; await downloadFilesApiCharts(client, extracted.fileIds, finalResult); @@ -868,9 +1009,25 @@ export function extractResults(response) { text: '', executionSuccess: false, stderr: null, - fileIds: [] + fileIds: [], + // Avenue A v2: when output_config enforces envelope shape, SDK populates + // response.parsed_output. Bridge prefers this over stdout extraction. + // Also tracks which path won — useful for telemetry + L2 diagnosis. + envelopeFromParsedOutput: null, + envelopeFromText: null, + envelopeSource: null, // 'parsed_output' | 'text' | 'stdout' (set downstream by selectBestJSON) }; + // Avenue A v2 Path 1: SDK's auto-parsed envelope (highest-priority when present). + // Only consulted when STRUCTURED_OUTPUT_ENFORCEMENT is on AND output_config was + // passed. Otherwise response.parsed_output is undefined and we fall through. + if (featureFlags.STRUCTURED_OUTPUT_ENFORCEMENT + && response?.parsed_output + && typeof response.parsed_output === 'object') { + results.envelopeFromParsedOutput = response.parsed_output; + results.envelopeSource = 'parsed_output'; + } + for (const block of response.content) { if (block.type === 'text') { results.text += block.text; @@ -1105,6 +1262,73 @@ function selectBestJSON(outputs) { return best; } +/** + * Avenue A v2 + Option A: envelope extraction with MERGE for xlsx mode. + * + * For NON-xlsx mode (GENERAL schema) — simple priority: + * 1. extracted.envelopeFromParsedOutput (SDK auto-parsed text via output_config) + * 2. Final text block parsed as JSON (output_config enforced, SDK didn't pre-parse) + * 3. selectBestJSON(extracted.outputs) (stdout-based, legacy path) + * + * For XLSX mode (audit-only schema in text + b64 in stdout) — MERGE strategy: + * - Audit metadata from text/parsed_output (output_config-enforced when flag on) + * - b64_xlsx / xlsx_filename / workbook_size_bytes from stdout (legacy path) + * - Merged envelope feeds existing downstream consumers unchanged + * - Falls back to pure-stdout when text/parsed_output empty (flag off OR + * output_config didn't enforce OR audit envelope failed to parse) + * + * Mutates finalResult.envelope_source ∈ + * {'parsed_output','text','stdout','merged','none'} so callers + telemetry + * can observe which path won. Returns the chosen/merged object (or null). + */ +function selectEnvelopeWithFallback(extracted, finalResult, constraints) { + const isXlsxMode = !!(constraints?.output_format?.includes('b64_xlsx')); + + // Try text-side envelope first (parsed_output → text-block JSON-parse) + let textEnvelope = null; + let textSource = null; + if (extracted.envelopeFromParsedOutput) { + textEnvelope = extracted.envelopeFromParsedOutput; + textSource = 'parsed_output'; + } else if (featureFlags.STRUCTURED_OUTPUT_ENFORCEMENT && extracted.text) { + const parsedText = tryParseJSON(extracted.text); + if (parsedText && typeof parsedText === 'object' && !Array.isArray(parsedText)) { + textEnvelope = parsedText; + textSource = 'text'; + } + } + + // For NON-xlsx mode: text envelope is sufficient if present, else stdout + if (!isXlsxMode) { + if (textEnvelope) { + finalResult.envelope_source = textSource; + return textEnvelope; + } + const fromStdout = selectBestJSON(extracted.outputs); + finalResult.envelope_source = fromStdout ? 'stdout' : 'none'; + return fromStdout; + } + + // For XLSX mode: stdout is REQUIRED (contains b64_xlsx). Merge text + stdout + // when both present; fall back to stdout-only when text empty. + const fromStdout = selectBestJSON(extracted.outputs); + if (textEnvelope && fromStdout) { + // MERGE: text-side audit metadata overlays stdout's b64-bearing envelope. + // Stdout's b64_xlsx/xlsx_filename/workbook_size_bytes are preserved; + // text-side audit_results/sheets/etc. take precedence where present. + finalResult.envelope_source = `merged:${textSource}+stdout`; + return { ...fromStdout, ...textEnvelope }; + } + if (fromStdout) { + // Text envelope missing — pure stdout path (existing pre-Avenue-A-v2 behavior). + finalResult.envelope_source = 'stdout'; + return fromStdout; + } + // No stdout AND no text-envelope — bridge will detect missing b64 downstream. + finalResult.envelope_source = 'none'; + return null; +} + /** * Infer model_id from task content via keyword overlap with catalog model names. * Used as Option E fallback when caller doesn't pass explicit model_id (Day 6.G). diff --git a/super-legal-mcp-refactored/test/sdk/code-execution-bridge.test.js b/super-legal-mcp-refactored/test/sdk/code-execution-bridge.test.js index 8fec67699..6bfa3399a 100644 --- a/super-legal-mcp-refactored/test/sdk/code-execution-bridge.test.js +++ b/super-legal-mcp-refactored/test/sdk/code-execution-bridge.test.js @@ -1185,6 +1185,75 @@ async function testChartPersistence() { else delete process.env.CURRENT_SESSION_DIR; } +// ─── Avenue A v2 — structured-output enforcement tests ───────────────────── + +async function _readBridgeSrc() { + const { readFile } = await import('node:fs/promises'); + return readFile( + path.resolve(__dirname, '../../src/tools/codeExecutionBridge.js'), + 'utf8', + ); +} + +async function testEnvelopeSchemaShape() { + console.log('\n--- Test: ENVELOPE_SCHEMA_XLSX + ENVELOPE_SCHEMA_GENERAL shape (Avenue A v2) ---'); + const src = await _readBridgeSrc(); + assert( + src.includes('const ENVELOPE_SCHEMA_XLSX'), + 'Avenue A v2: ENVELOPE_SCHEMA_XLSX defined in bridge', + ); + assert( + src.includes('const ENVELOPE_SCHEMA_GENERAL'), + 'Avenue A v2: ENVELOPE_SCHEMA_GENERAL defined in bridge', + ); + assert( + src.includes("type: 'json_schema'"), + 'Avenue A v2: schemas use type=json_schema (SDK 0.86.1 JSONOutputFormat shape)', + ); + assert( + /additionalProperties:\s*false/.test(src), + 'Avenue A v2: schemas set additionalProperties=false (Anthropic structured-output requirement)', + ); + assert( + src.includes('STRUCTURED_OUTPUT_ENFORCEMENT'), + 'Avenue A v2: feature flag referenced in bridge code', + ); + assert( + src.includes('selectEnvelopeWithFallback'), + 'Avenue A v2: priority-order helper defined (parsed_output → text → stdout)', + ); +} + +async function testStructuredOutputFlagOff() { + console.log('\n--- Test: STRUCTURED_OUTPUT_ENFORCEMENT=false → existing behavior preserved (Avenue A v2) ---'); + const src = await _readBridgeSrc(); + // Both API call sites must have the conditional output_config injection. + const matches = src.match(/featureFlags\.STRUCTURED_OUTPUT_ENFORCEMENT[\s\S]{0,200}output_config/g); + assert( + matches && matches.length >= 2, + `Avenue A v2: output_config gated by flag at both API call sites (found ${matches?.length || 0}, expected ≥ 2)`, + ); + // extractResults must check flag before consuming parsed_output. + assert( + /STRUCTURED_OUTPUT_ENFORCEMENT[\s\S]{0,200}response\?\.parsed_output/.test(src), + 'Avenue A v2: extractResults gates parsed_output read on flag', + ); +} + +async function testStructuredOutputFlagOn() { + console.log('\n--- Test: STRUCTURED_OUTPUT_ENFORCEMENT=true → output_config emitted (Avenue A v2) ---'); + const src = await _readBridgeSrc(); + assert( + src.includes('function getEnvelopeSchema'), + 'Avenue A v2: getEnvelopeSchema helper defined', + ); + // selectEnvelopeWithFallback must respect priority: parsed_output → text → stdout. + assert( + /envelopeFromParsedOutput[\s\S]{0,500}parsed_output[\s\S]{0,500}text[\s\S]{0,500}stdout/.test(src), + 'Avenue A v2: selectEnvelopeWithFallback priority order parsed_output → text → stdout', + ); +} + // ─── Main ─────────────────────────────────────────────────────────────────── async function main() { @@ -1202,6 +1271,11 @@ async function main() { await testFilesApiChartExtraction(); await testChartPersistence(); + // Avenue A v2 — structured-output enforcement (source-level + behavior assertions) + await testEnvelopeSchemaShape(); + await testStructuredOutputFlagOff(); + await testStructuredOutputFlagOn(); + // Live API tests only if bridge is enabled await testSimpleAnalysis(); await testFinancialAnalysis();