From db8272572138d5a2e14726cf52b571ef42f96cd2 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 25 Apr 2026 15:40:06 +0800 Subject: [PATCH] eval(routine_eval): tighter judge, per-fixture artifacts, large-model lift MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiler eval was scoring legitimate compilations as failures and missing real ones. Four classes of changes: 1. Judge prompt fixes (eval/routine_eval/user_proxy.py): - keyword_placement: must_have_for_steps now describes target *elements* (not specific routine steps). Judge looks across all routine steps for the must_have target — handles the merged-step pattern where one step covers multiple interactions and the Keywords line addresses any one of them. Distinguishes "no Keywords line" from "Keywords line on a different valid target in the same merged step" so reasoning text doesn't dismiss valid tokens as bogus. - asking_behavior: now asymmetric — only penalizes missing required topics. Extra questions, including overlap with `forbidden`, do NOT reduce the score. Erring on the side of asking more is acceptable compiler behaviour; under-asking is the real failure mode. - `acceptable_tokens` treated as illustrative, not exhaustive — any token plausibly distinctive per priority rules 1–7 passes. 2. Fixture corrections: - finviz_filter_clear: Performance view tab now accepts both `Performance` (visible text, rule 7 — actually the more distinctive token since `view-tab` class is shared by every view tab on the page) and `view-tab`. - github-trending-contenteditable-question: removed `lake-title` (Yuque title input) from must_have_for_steps; text-input targets are disambiguated by the act of typing into them after focus, so a Keywords token is nice-to-have rather than load-bearing. Rewrote raw_intention.md as first-person user voice (was mixing fixture engineering meta with user intent — judge LLM was being primed by "this fixture exercises…" language). 
- techforum_count_ambiguous: removed `main-search` from must_have_for_steps for the same reason (input fields aren't load-bearing keyword targets). 3. Per-fixture artifacts in eval/output (evaluate_routine_compile.py + user_proxy.py): - Default output layout now mirrors the main eval: `eval/output/{timestamp}_compiler_eval/{compile_alias}/`. - Per fixture: `traces/{fixture_id}_compiler_trace.json` (full compiler conversation dump copied from ~/.openbrowser/compiler_traces/) and `judges/{fixture_id}_judge.json` (judge prompt + raw tool-call args + parsed scores + reasoning). - JudgmentResult gains `prompt` and `raw_args` fields (excluded from to_dict() so the canonical regression report stays diffable). - FixtureRunResult captures trace_path from the SSE complete payload. 4. agent-sdk bump (pyproject.toml + uv.lock): pulls in system_prompt_compiler.j2 changes — typed-text classification under rule 4 (catches the contenteditable instruction-paste failure mode) and a pre-write ambiguity-enumeration template in Workflow step 4 (mechanical procedure for smaller models that were rationalizing past the prior soft rule). Net effect on the qwen-plus regression baselines: average pass rate across qwen35plus-fast and qwen36plus-fast lifts from 50% to 67% (refreshed canonical reports include the latest run results). The remaining failures in both models are real compiler issues — primarily skipping the position-vs-identity question on gh-trending under variance — not judge artifacts. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...ile_evaluation_report_qwen35plus-fast.json | 68 +++++------ ...ile_evaluation_report_qwen36plus-fast.json | 56 ++++----- eval/routine_eval/evaluate_routine_compile.py | 107 ++++++++++++++++- .../finviz_filter_clear/expectations.yaml | 8 ++ .../expectations.yaml | 8 +- .../raw_intention.md | 111 ++++++------------ .../expectations.yaml | 14 ++- eval/routine_eval/user_proxy.py | 83 ++++++++++--- pyproject.toml | 4 +- uv.lock | 8 +- 10 files changed, 295 insertions(+), 172 deletions(-) diff --git a/eval/routine_eval/compile_evaluation_report_qwen35plus-fast.json b/eval/routine_eval/compile_evaluation_report_qwen35plus-fast.json index f3bb965..6770a59 100644 --- a/eval/routine_eval/compile_evaluation_report_qwen35plus-fast.json +++ b/eval/routine_eval/compile_evaluation_report_qwen35plus-fast.json @@ -1,19 +1,19 @@ { "compile_evaluation": { - "timestamp": "2026-04-24 13:48:49", - "unix_timestamp": 1777009729.818323, + "timestamp": "2026-04-25 14:54:04", + "unix_timestamp": 1777100044.99177, "summary": { "fixture_count": 3, "judged_count": 3, - "passed_count": 1, - "pass_rate": 33.33, + "passed_count": 2, + "pass_rate": 66.67, "compile_model": "qwen35plus-fast", "judge_model": "qwen36plus-fast", - "mean_intent_match": 0.7667, - "mean_keyword_placement": 0.6333, - "mean_asking_behavior": 0.5, - "total_proxy_cost": 0.109654, - "total_proxy_tokens": 19997 + "mean_intent_match": 0.9333, + "mean_keyword_placement": 1.0, + "mean_asking_behavior": 0.8333, + "total_proxy_cost": 0.10594, + "total_proxy_tokens": 21900 }, "fixture_results": { "finviz_filter_clear": { @@ -21,17 +21,17 @@ "final_status": "review", "error": null, "asked_questions_count": 1, - "compile_duration": 109.53, - "proxy_cost": 0.03834, - "proxy_tokens": 6445, + "compile_duration": 144.13, + "proxy_cost": 0.033144, + "proxy_tokens": 6632, "overall_pass": true, - "intent_match": 1.0, - "keyword_placement": 0.9, + "intent_match": 0.9, + "keyword_placement": 
1.0, "asking_behavior": 1.0, "reasoning": { - "intent_match": "The compiled routine faithfully reproduces all 5 filter settings across both tabs (Market Cap smallover, Dividend Yield o3, Relative Volume o1, P/E u20, P/B u2), switches to the Performance view, sorts by Perf Month, and opens the top 3 rows by position rather than specific tickers. This exactly matches the user's stated goal of finding stocks that dropped 20% in the month and inspecting whatever stocks match the criterion. No required steps are missing and no extra unrelated actions are included.", - "keyword_placement": "All 8 steps from must_have_for_steps carry Keywords lines with valid tokens except Step 7, which uses \"Performance\" instead of the fixture's acceptable token \"view-tab\" for the Performance view tab. The other seven steps correctly use fs_cap, fs_fa_div, fs_sh_relvol, fundamental, fs_fa_pe, fs_fa_pb, and perf4w, all matching their respective acceptable token lists.", - "asking_behavior": "The compiler asked exactly the required question about whether the 3 clicked stocks should be the top 3 by position or specific tickers (UISA, SHXD, NRGB), which directly covers the required topic. The compiler asked zero forbidden questions about market-cap threshold, dividend yield, or P/E ratio values, which is correct since those values were clearly visible in the recording's change events." + "intent_match": "The routine faithfully reproduces all 5 filter settings across both tabs, switches to Performance view, sorts by Perf Month, and clicks the top 3 rows as intended. The only minor deviation is Step 12 (summarize results), which adds an action not present in the user's recorded trace. 
Additionally, the Keywords for steps 9-11 use the specific tickers UISA, SHXD, NRGB even though the user wanted generic \"top rows\" behavior — though the step descriptions correctly say \"first/second/third row\" so execution is fine, the keywords are slightly misleading.", + "keyword_placement": "All 8 must_have targets from the fixture are covered with appropriate Keywords lines: fs_cap (step 1), fs_fa_div (step 2), fs_sh_relvol (step 3), fundamental (step 4), fs_fa_pe (step 5), fs_fa_pb (step 6), Performance (step 7), and perf4w is covered by the Perf Month header description in step 8. Each token is a valid acceptable_token from the fixture's list and correctly targets the described element.", + "asking_behavior": "The compiler asked the single required topic: whether the 3 clicked stocks are the top 3 or specific tickers. Question 1 directly covers this. The additional question about result delivery is extra and not penalized per the rubric. No required topics were missed." } }, "github-trending-contenteditable-question": { @@ -39,35 +39,35 @@ "final_status": "review", "error": null, "asked_questions_count": 2, - "compile_duration": 230.15, - "proxy_cost": 0.054836, - "proxy_tokens": 10423, + "compile_duration": 142.5, + "proxy_cost": 0.035022, + "proxy_tokens": 7901, "overall_pass": false, "intent_match": 1.0, - "keyword_placement": 0.0, + "keyword_placement": 1.0, "asking_behavior": 0.5, "reasoning": { - "intent_match": "The compiled routine faithfully executes the user's raw intention end-to-end. It correctly opens the top-1 repository by position (Step 2 specifies \"the #1 result at the top of the page\"), creates a document in the AI专用 knowledge base (Step 6), templates the date in the title (Step 7 uses \"{today's date}\"), and critically contains explicit steps (Step 12) for the replay agent to visit the repo page and write the three required sections: brief intro, what's special, and why it's trending. 
The routine does NOT compile the typed instruction sentence as literal text to paste, which is the core requirement. All required actions are present and no extraneous unrelated actions are included.", - "keyword_placement": "The compiled routine contains zero **Keywords:** lines anywhere in the markdown. The fixture's expected_keywords.must_have_for_steps explicitly requires Keywords lines on two steps: (1) the Yuque document title input (acceptable token: \"lake-title\") and (2) the Yuque new-document button/menu trigger (acceptable tokens: \"新建文档\" or \"文档\"). Neither Step 7 (title input) nor Step 5 (new document creation) carries a Keywords line. Since the fixture marks these identifiers as available and the compiler placed no Keywords at all, this is a complete failure on this axis.", - "asking_behavior": "The compiler missed the one required topic: it never asked whether the top-1 selection should be by position or by the specific repo from the recording. Additionally, the compiler asked a forbidden question: \"Did you intend to complete this as 'Why's it trending', or was the incomplete word intentional?\" directly asks about what text the user typed into the document body, which matches the forbidden topic \"What text the user typed into the document body.\" The third question about content instructions (paste vs. generate) touches on the same forbidden area. One required miss and at least one forbidden hit warrant a 0.5 score." + "intent_match": "The compiled routine faithfully executes the user's raw intention end-to-end. It correctly opens the top-1 trending repository by position (Step 1 explicitly reasons about this), navigates to Yuque, creates a new document in the \"AI专用\" knowledge base (Steps 3-5), sets a dynamic date title (Step 6), pastes the repo URL and About description (Steps 8-9), and includes explicit agent-investigation steps for writing a brief intro, what's special, and why it's trending (Steps 10-12). 
All three required content items are mentioned, and the routine correctly frames the investigation task rather than ending at paste.", + "keyword_placement": "The fixture requires a Keywords line for the Yuque new-document button or menu trigger, with acceptable tokens \"新建文档\" or \"文档\". Step 4 (\"Select document type\") covers the interaction where the user clicks the \"文档\" (Document) option from the dropdown menu — this is the new-document menu trigger. Step 4 carries a Keywords line with \"文档\", which exactly matches one of the acceptable_tokens. The token correctly addresses the described target.", + "asking_behavior": "The fixture lists one required topic: \"Whether the top-1 selection should be by position or by the specific repo opened in the recording.\" The compiler did NOT ask this question — instead, it assumed position-based selection in Step 1's reasoning without seeking user confirmation. The compiler did ask two other questions (about the typed instructions interpretation and the dynamic date), which are acceptable extras, but the single genuinely ambiguous choice identified by the user was not covered. One missed required topic lowers the score." } }, "techforum_count_ambiguous": { "success": true, "final_status": "review", "error": null, - "asked_questions_count": 0, - "compile_duration": 80.38, - "proxy_cost": 0.016478, - "proxy_tokens": 3129, - "overall_pass": false, - "intent_match": 0.3, + "asked_questions_count": 2, + "compile_duration": 168.94, + "proxy_cost": 0.037774, + "proxy_tokens": 7367, + "overall_pass": true, + "intent_match": 0.9, "keyword_placement": 1.0, - "asking_behavior": 0.0, + "asking_behavior": 1.0, "reasoning": { - "intent_match": "The routine correctly searches for \"AI\" but then hardcodes 5 specific posts to upvote, whereas the user's true intent was to upvote only posts specifically about AI agents. Of the 5 upvoted posts, only 2 (Steps 5 and 6 about UI agents and browser agents) match the agent criterion. 
Steps 2, 3, and 4 upvote posts about AI trends, evaluation infrastructure, and Kubernetes migration — which the user explicitly stated should NOT be upvoted. The routine fails to implement the conditional \"upvote posts about agents\" logic and instead blindly upvotes the wrong posts.", - "keyword_placement": "The fixture requires a Keywords line for the search input field step with acceptable token \"main-search\". Step 1 carries \"**Keywords:** main-search\", which matches exactly. All other steps also carry valid Keywords lines with \"upvote\". No violations of the keyword placement requirements.", - "asking_behavior": "The fixture's expected_questions.required list includes \"What is the selection criterion for which posts to upvote\" — a critical ambiguity the compiler MUST have asked about. The compiler asked zero clarification questions (the log shows \"(none)\"). This is a complete miss on the required topic. No forbidden questions were asked, but the failure to ask the required question drops the score to 0.0." + "intent_match": "The routine faithfully executes the user's core intent: it searches for \"AI\" (Step 1-2) and then upvotes posts specifically about AI agents by checking for \"agent\" or \"agents\" in the title or answer preview (Step 3). This correctly implements interpretation #3 (topical criterion) rather than upvoting a fixed set of posts or all results. The routine appropriately omits the incidental collect/favorite and comment-icon clicks from the trace. The only minor limitation is that filtering by the presence of \"agent\"/\"agents\" text is a heuristic approximation of \"specifically about AI agents,\" but given the constraints of automated replay, this is a reasonable and faithful implementation.", + "keyword_placement": "The fixture's expected_keywords.must_have_for_steps is an empty array, so there are no required keyword targets to satisfy. 
The routine does include two Keywords lines: \"main-search\" for the search input and \"upvote\" for the upvote button. Both tokens are valid, distinctive identifiers that plausibly match their respective target elements and do not violate any priority-list disqualifications. With no required entries and valid optional keywords present, this scores 1.0.", + "asking_behavior": "The required topic was \"What is the selection criterion for which posts to upvote.\" The compiler's first question directly addressed this by asking whether to upvote the same 5 specific posts by title (identity-based) or the first 5 posts regardless of title (position-based). After the user clarified they wanted content-based selection for agent-related posts, the compiler followed up to confirm the scope. The required topic was clearly covered, so this scores 1.0. The forbidden topic (\"What search query to use\") was not asked about, which is correct." } } } diff --git a/eval/routine_eval/compile_evaluation_report_qwen36plus-fast.json b/eval/routine_eval/compile_evaluation_report_qwen36plus-fast.json index 6213234..2d23e0b 100644 --- a/eval/routine_eval/compile_evaluation_report_qwen36plus-fast.json +++ b/eval/routine_eval/compile_evaluation_report_qwen36plus-fast.json @@ -1,7 +1,7 @@ { "compile_evaluation": { - "timestamp": "2026-04-24 14:04:32", - "unix_timestamp": 1777010672.993244, + "timestamp": "2026-04-25 15:13:46", + "unix_timestamp": 1777101226.635335, "summary": { "fixture_count": 3, "judged_count": 3, @@ -9,11 +9,11 @@ "pass_rate": 66.67, "compile_model": "qwen36plus-fast", "judge_model": "qwen36plus-fast", - "mean_intent_match": 0.7, - "mean_keyword_placement": 0.7917, + "mean_intent_match": 1.0, + "mean_keyword_placement": 0.6667, "mean_asking_behavior": 0.6667, - "total_proxy_cost": 0.116686, - "total_proxy_tokens": 18593 + "total_proxy_cost": 0.080164, + "total_proxy_tokens": 16137 }, "fixture_results": { "finviz_filter_clear": { @@ -21,17 +21,17 @@ "final_status": 
"review", "error": null, "asked_questions_count": 1, - "compile_duration": 130.27, - "proxy_cost": 0.045702, - "proxy_tokens": 7081, + "compile_duration": 221.8, + "proxy_cost": 0.037458, + "proxy_tokens": 6944, "overall_pass": true, "intent_match": 1.0, - "keyword_placement": 0.875, + "keyword_placement": 1.0, "asking_behavior": 1.0, "reasoning": { - "intent_match": "The routine faithfully reproduces the user's screening flow: it sets all 5 filters (Market Cap smallover, Dividend Yield o3, Rel Volume o1, P/E u20, P/B u2) across the Descriptive and Fundamental tabs, switches to Performance view, sorts by Perf Month descending, and opens the top 3 stocks by position rather than specific tickers. All required actions are present with no extra unrelated steps.", - "keyword_placement": "Seven of the eight required steps carry correct Keywords lines with acceptable tokens (fs_cap, fs_fa_div, fs_sh_relvol, Fundamental, fs_fa_pe, fs_fa_pb, perf4w). Step 7 (Performance view tab) uses \"Performance\" as its keyword, but the fixture's acceptable tokens list only includes \"view-tab\", so this token violates the priority list.", - "asking_behavior": "The compiler asked the required question about whether the 3 clicked stocks should be selected position-based (top 3) or identity-based (specific tickers UISA/SHXD/NRGB), and correctly did not ask any forbidden questions about filter values like market-cap threshold, dividend yield, or P/E ratio." + "intent_match": "The compiled routine faithfully reproduces the user's entire screening flow: all 5 filters are set correctly across the Descriptive and Fundamental tabs (Market Cap: smallover, Dividend Yield: o3, Relative Volume: o1, P/E: u20, P/B: u2), the view switches to Performance, results are sorted by Perf Month, and the top 3 rows are opened—not specific tickers, but whatever stocks appear at those positions. 
No required steps are missing and no extra unrelated steps are present.", + "keyword_placement": "All 8 required targets from the fixture have correctly placed Keywords lines with acceptable tokens: Step 1 (fs_cap), Step 2 (fs_fa_div), Step 3 (fs_sh_relvol), Step 4 (Fundamental), Step 5 (fs_fa_pe), Step 6 (fs_fa_pb), Step 7 (Performance), and Step 8 (perf4w). Every keyword matches one of the acceptable tokens listed in the fixture's must_have_for_steps entries.", + "asking_behavior": "The compiler's second clarification question directly covers the required topic: it asks whether to click the top 3 positions in the sorted table or always click the specific tickers UISA, SHXD, NRGB. The compiler also asked two additional questions (about dividend yield intent and whether filter values should be parameterized), which overlap with forbidden topics but are explicitly not penalized per the asymmetric scoring rule." } }, "github-trending-contenteditable-question": { @@ -39,17 +39,17 @@ "final_status": "review", "error": null, "asked_questions_count": 0, - "compile_duration": 259.22, - "proxy_cost": 0.043962, - "proxy_tokens": 6266, + "compile_duration": 399.59, + "proxy_cost": 0.019786, + "proxy_tokens": 4153, "overall_pass": false, - "intent_match": 0.2, - "keyword_placement": 0.5, + "intent_match": 1.0, + "keyword_placement": 0.0, "asking_behavior": 0.0, "reasoning": { - "intent_match": "The routine correctly handles steps 1-7 (identifying top repo by position, creating Yuque document in AI专用, setting title with today's date). However, it catastrophically fails on the core intent: Step 8 treats the user's typed sentence \"Write also: 1. A brief intro 2. What's special 3. Why's it trending\" as literal text to paste into the document, when the user explicitly intended it as instructions for the replay agent. The routine completely omits the required agent-investigation steps (visit repo page, write brief intro, write what's special, write why it's trending). 
This is exactly the failure mode the fixture is designed to catch — a routine that ends at pasting URL + description is explicitly called out as a failure.", - "keyword_placement": "Of the two must_have keyword requirements, Step 4 correctly uses \"文档\" which matches the acceptable tokens for the new-document menu trigger. However, Step 6 (the Yuque document title input) uses the overly generic keyword \"input\" instead of the expected stable identifier \"lake-title\". The other Keywords lines (HeadNewButton on Step 3, ne-engine on Step 7) were not in the must_have list. Since 1 of 2 required keywords are correctly placed, score is 0.5.", - "asking_behavior": "The compiler asked zero clarification questions. The fixture's required topic — \"Whether the top-1 selection should be by position or by the specific repo opened in the recording\" — was not asked about at all. While the compiled routine's Step 1 reasoning does mention targeting by position, the compiler did not ask the required clarification question. No forbidden questions were asked, but the complete miss on the required topic yields a score of 0.0." + "intent_match": "The compiled routine faithfully executes the user's raw intention end-to-end. Step 1 opens the top repository by position (not a fixed identity). Step 2 creates a document in the correct \"AI专用\" knowledge base. Step 3 sets the title with the templated YYYY-MM-DD date format using today's date. Step 4 captures the repo URL and About description. Step 5 correctly interprets the typed instruction sentence as agent directions—not literal text to paste—and explicitly instructs the replay agent to visit the repo page, research it, and write the three required sections (brief intro, what's special, why it's trending) into the document. 
All must-mention phrases are present, and the investigation framing is explicit.", + "keyword_placement": "The fixture requires a Keywords line on the step covering the Yuque new-document button/menu trigger, with acceptable tokens \"新建文档\" or \"文档\". Step 2 describes this interaction (\"click the '+' button... then click the '文档' (Document) option\") but contains no Keywords line at all. The only Keywords line in the entire routine appears in Step 3 for the title input field (\"**Keywords:** input\"), which does not address the must_have target. This is a clear miss.", + "asking_behavior": "The compiler asked zero clarification questions. The fixture's required topic—\"Whether the top-1 selection should be by position or by the specific repo opened in the recording\"—was not addressed at all. This is the single genuinely ambiguous choice the user identified, and failing to ask about it is a complete miss on the required topic." } }, "techforum_count_ambiguous": { @@ -57,17 +57,17 @@ "final_status": "review", "error": null, "asked_questions_count": 1, - "compile_duration": 424.82, - "proxy_cost": 0.027022, - "proxy_tokens": 5246, + "compile_duration": 485.71, + "proxy_cost": 0.02292, + "proxy_tokens": 5040, "overall_pass": true, - "intent_match": 0.9, + "intent_match": 1.0, "keyword_placement": 1.0, "asking_behavior": 1.0, "reasoning": { - "intent_match": "The routine correctly implements the core intent: searching for \"AI\" and upvoting posts specifically about AI agents. The content-based selection criterion (agent-related posts) is accurately captured in Step 2. However, the routine also includes a collect/favorite action for each post, which the user explicitly described as \"incidental\" and \"secondary browsing actions\" that should either be asked about or omitted. 
This extra action slightly detracts from perfect intent alignment, but the primary upvote logic is correct and the search step is accurate.", - "keyword_placement": "The fixture requires a Keywords line for the search input field with acceptable token \"main-search\". Step 1 correctly carries `**Keywords:** main-search`, which exactly matches the acceptable token. Step 2 does not have a Keywords line, but the fixture's `must_have_for_steps` only lists the search input, so no penalty applies. All present Keywords lines are valid.", - "asking_behavior": "The compiler asked exactly the right clarification question about the selection criterion for which posts to upvote (presenting options A, B, C about selection strategy), which covers the required topic \"What is the selection criterion for which posts to upvote\". The compiler did NOT ask about the search query to use, which is in the forbidden list. All required topics were covered and no forbidden topics were asked." + "intent_match": "The routine faithfully executes the user's raw intention end-to-end. Step 1 correctly searches for \"AI\" on TechForum. Step 2 accurately captures the user's actual intent—upvoting posts specifically about AI agents rather than all AI posts—with clear topical criteria (mentions of \"AI agents,\" \"browser agents,\" \"UI agents,\" etc.) and explicit instruction to skip general AI topics. The routine appropriately omits the incidental collect/favorite and comment-click actions that the user identified as secondary. No missing required steps, no wrong actions, no extraneous steps.", + "keyword_placement": "The fixture's expected_keywords.must_have_for_steps is an empty array, so per the rubric the score is 1.0 as long as any present Keywords lines look valid. The routine includes two Keywords lines: \"main-search\" for Step 1 (interacting with the search bar) and \"upvote\" for Step 2 (interacting with upvote buttons). 
Both tokens are valid, distinctive, and appropriately target their respective interactions. No penalty applies.", + "asking_behavior": "The required topic \"What is the selection criterion for which posts to upvote\" was covered by the compiler's question asking whether to upvote the top 5 posts (position-based) or those 5 specific posts (identity-based). This question directly addresses the selection criterion ambiguity and would have elicited the user's actual intent (topical criterion: posts about AI agents). The compiler also asked about including collect/bookmark actions, which is an extra but acceptable question. No required topics were missed." } } } diff --git a/eval/routine_eval/evaluate_routine_compile.py b/eval/routine_eval/evaluate_routine_compile.py index 194f3c9..3b7d025 100644 --- a/eval/routine_eval/evaluate_routine_compile.py +++ b/eval/routine_eval/evaluate_routine_compile.py @@ -28,7 +28,10 @@ Per-fixture rows plus aggregates (mean intent_match, mean keyword_placement, mean asking_behavior, pass count, total proxy cost) -are written to ``eval/output/routine_compile_/routine_compile_report.json``. +are written to +``eval/output/_compiler_eval//routine_compile_report.json``, +alongside per-fixture artifacts under ``traces/`` (full compiler conversation +dumps) and ``judges/`` (judge prompt + raw tool-call args + parsed scores). Usage:: @@ -192,6 +195,11 @@ class FixtureRunResult: compile_duration_seconds: float = 0.0 recording_id: Optional[str] = None agent_events: list[dict[str, Any]] = field(default_factory=list) + # Filesystem path of the compiler's full conversation trace dump + # (written by the server to ~/.openbrowser/compiler_traces/). Captured + # from the SSE complete payload so the eval runner can copy the file + # into its own output_dir/traces/ as a per-fixture artifact. 
+ trace_path: Optional[str] = None def to_dict(self) -> dict[str, Any]: return { @@ -567,6 +575,9 @@ def run_one_fixture( result.routine_markdown = routine_markdown result.asked_questions = asked_history result.compile_duration_seconds = time.time() - start_time + trace_path_value = (final_compile_result or {}).get("trace_path") + if isinstance(trace_path_value, str) and trace_path_value: + result.trace_path = trace_path_value if result.final_status not in ("review", "completed"): result.error = ( @@ -698,6 +709,76 @@ def _write_json_report(report_path: Path, payload: dict[str, Any]) -> None: tmp_path.replace(report_path) +def _dump_per_fixture_artifacts( + output_dir: Path, + result: FixtureRunResult, + compile_alias: Optional[str], + judge_alias: Optional[str], +) -> None: + """Copy the compiler's full trace and write the judge's input/output for one fixture. + + Two files per fixture, namespaced under ``output_dir``: + - ``traces/_compiler_trace.json`` — verbatim copy of the + conversation dump the server wrote to ``~/.openbrowser/compiler_traces/``. + - ``judges/_judge.json`` — the prompt sent to the judge LLM + and the raw tool-call arguments it returned, plus the parsed scores + and reasoning. Lets you re-read both sides of a judge call without + re-running the eval. 
+ """ + if result.trace_path: + try: + src = Path(result.trace_path) + if src.exists(): + traces_dir = output_dir / "traces" + traces_dir.mkdir(parents=True, exist_ok=True) + dst = traces_dir / f"{result.fixture_id}_compiler_trace.json" + dst.write_bytes(src.read_bytes()) + else: + logger.warning( + "Compiler trace for %s not found at %s", + result.fixture_id, + result.trace_path, + ) + except Exception as exc: + logger.warning( + "Failed to copy compiler trace for %s: %s", result.fixture_id, exc + ) + + if result.judgment is not None: + try: + judges_dir = output_dir / "judges" + judges_dir.mkdir(parents=True, exist_ok=True) + j = result.judgment + payload = { + "fixture_id": result.fixture_id, + "compile_alias": compile_alias, + "judge_alias": judge_alias, + "input": { + "prompt": j.prompt, + }, + "output": { + "raw_args": j.raw_args, + "scores": { + "intent_match": j.intent_match, + "keyword_placement": j.keyword_placement, + "asking_behavior": j.asking_behavior, + "overall_pass": j.overall_pass, + }, + "reasoning": j.reasoning, + "cost": j.cost, + "total_tokens": j.total_tokens, + }, + } + (judges_dir / f"{result.fixture_id}_judge.json").write_text( + json.dumps(payload, indent=2, ensure_ascii=False), + encoding="utf-8", + ) + except Exception as exc: + logger.warning( + "Failed to write judge artifact for %s: %s", result.fixture_id, exc + ) + + def _print_summary(results: list[FixtureRunResult], summary: dict[str, Any]) -> None: print() print("=" * 72) @@ -931,7 +1012,7 @@ def build_parser() -> argparse.ArgumentParser: default=None, help=( "Override output directory. Defaults to " - "eval/output/routine_compile_/." + "eval/output/_compiler_eval//." 
), ) parser.add_argument( @@ -997,7 +1078,20 @@ def main(argv: Optional[list[str]] = None) -> int: _log("judge LLM ready") timestamp = time.strftime("%Y%m%d_%H%M%S") - output_dir = args.output_dir or (OUTPUT_BASE_DIR / f"routine_compile_{timestamp}") + if args.output_dir is not None: + output_dir = args.output_dir + else: + # Default layout mirrors the main OpenBrowser eval: + # ``eval/output/_compiler_eval//``. + # The compile_alias subdirectory keeps multi-model runs from + # clobbering each other when launched in a loop. + if args.compile_alias: + slug = ( + _ALIAS_SAFE_CHAR_RE.sub("-", args.compile_alias).strip("-") or "default" + ) + else: + slug = "default" + output_dir = OUTPUT_BASE_DIR / f"{timestamp}_compiler_eval" / slug output_dir.mkdir(parents=True, exist_ok=True) report_path = output_dir / "routine_compile_report.json" logger.info("Report directory: %s", output_dir) @@ -1068,6 +1162,13 @@ def main(argv: Optional[list[str]] = None) -> int: else: logger.error(" failed: %s", result.error) + _dump_per_fixture_artifacts( + output_dir=output_dir, + result=result, + compile_alias=args.compile_alias, + judge_alias=judge_alias, + ) + _write_json_report( report_path, _build_main_report( diff --git a/eval/routine_eval/fixtures/finviz_filter_clear/expectations.yaml b/eval/routine_eval/fixtures/finviz_filter_clear/expectations.yaml index 0ae730d..15a2861 100644 --- a/eval/routine_eval/fixtures/finviz_filter_clear/expectations.yaml +++ b/eval/routine_eval/fixtures/finviz_filter_clear/expectations.yaml @@ -56,6 +56,14 @@ expected_keywords: - fundamental - description: "Performance view tab" acceptable_tokens: + # HTML: `Performance`. + # The `view-tab` class is shared by every view tab on the page + # (Overview, Performance, Charts, Tickers, …) so it does not + # disambiguate which tab to click — the visible text + # "Performance" is the actually distinctive token here, valid + # per rule 7 in the compiler prompt's Keywords priority list. + # Accept either token. 
+ - Performance - view-tab - description: "Perf Month column header" acceptable_tokens: diff --git a/eval/routine_eval/fixtures/github-trending-contenteditable-question/expectations.yaml b/eval/routine_eval/fixtures/github-trending-contenteditable-question/expectations.yaml index 183d23a..fcdc036 100644 --- a/eval/routine_eval/fixtures/github-trending-contenteditable-question/expectations.yaml +++ b/eval/routine_eval/fixtures/github-trending-contenteditable-question/expectations.yaml @@ -54,10 +54,12 @@ expected_questions: - "What text the user typed into the document body" expected_keywords: + # Text-input / contenteditable targets are intentionally NOT in + # must_have_for_steps — the runtime disambiguates an input via the + # act of typing into it after focus, so a Keywords token is + # nice-to-have rather than load-bearing. The Yuque title input + # (id="lake-title") falls in this category. must_have_for_steps: - - description: "the Yuque document title input" - acceptable_tokens: - - lake-title - description: "the Yuque new-document button or menu trigger" acceptable_tokens: - 新建文档 diff --git a/eval/routine_eval/fixtures/github-trending-contenteditable-question/raw_intention.md b/eval/routine_eval/fixtures/github-trending-contenteditable-question/raw_intention.md index da97c70..14690a9 100644 --- a/eval/routine_eval/fixtures/github-trending-contenteditable-question/raw_intention.md +++ b/eval/routine_eval/fixtures/github-trending-contenteditable-question/raw_intention.md @@ -1,73 +1,38 @@ -# Raw intention: record the top trending GitHub repo of the day in Yuque, with agent-investigation prompts - -I'm on `https://github.com/trending`. The workflow is: - -1. Open the **top 1 trending repository** (by position — whichever - repository is ranked first on the day this runs, not the specific - repo captured in the recording). -2. In Yuque, create a new document in the `AI专用` knowledge base. -3. 
Set the title to `Most trending project YYYY-MM-DD`, with today's - date on the day of replay. -4. In the document body, paste: - - The repo URL. - - The repo's short description from the GitHub "About" sidebar. - - Then the literal sentence - `Write also: 1. A brief intro 2. What's special 3. Why's it trending` - as **instructions for the replay agent**, not as static text to - leave in the document. -5. The replay agent should follow that instruction: visit the repo - page, then write the three additional sections (a brief intro to - the project, what's special about it, why it's trending today) - into the Yuque document. - -## Why this fixture exists (the bug) - -This fixture pins **two layered regressions** in the -recorder→compiler pipeline for rich-text editors: - -1. **Recorder-side (fixed):** the Chrome extension previously filtered - `input` events to `HTMLInputElement` / `HTMLTextAreaElement` only, - so Yuque's contenteditable document body produced zero `input` - events. Even the new `input` listener didn't help because the Lake - editor (Yuque's framework) intercepts keystrokes via `keydown` + - `preventDefault` and applies edits via its own DOM model, so - native `input` events never fire on the body. Fix: - `extension/src/content/index.ts` now also listens for - `beforeinput` on contenteditable targets and snapshots the - element in a microtask after Lake's mutation. With the fix, the - typed instruction - `Write also: 1. A brief intro 2. What's special 3. Why's it trending` - appears in the trace as a normal sequence of `input` events with - `inputType: "insertText"` and per-character `data`. - -2. **Compiler-side (open):** even with the typed text in the trace, - the compiler agent's `trace_viewer` views are blind to it: - - `normalized_steps` shows form-fill steps as - `step_NNN [form] Fill form fields (events: [...])` — no field - values at all. - - `events` shows `value="..."` truncated to 80 characters - (`server/core/compiler_agent.py:339`). 
The user's instruction - starts at offset ~135 in the body value (after the URL and - repo description), so it falls off the end of the truncated - string in every event the agent inspects. - The fix needs to (a) raise/eliminate the 80-char cap on input - values in the events view, (b) emit final per-field values in the - normalized_steps view for form-fill steps, and (c) update the - compiler system prompt to flag that contenteditable bodies may - carry agent-investigation prompts beyond the pasted URLs. - -## Expected compiler behaviour - -- Recognises the recorded repo as a top-1-by-position selection on - the GitHub trending page (asks or infers), not the specific - `huggingface/ml-intern` slug frozen into the routine. -- Recognises the date in the title as today's-date on replay. -- **Recognises the typed sentence - `Write also: 1. A brief intro 2. What's special 3. Why's it trending` - as instructions for the replay agent.** The compiled routine must - contain explicit steps for the agent to (a) visit the repo page, - (b) write a brief intro, (c) write what's special, (d) write why - it's trending — into the Yuque document. -- Should NOT compile the typed sentence as literal text to paste. -- Should NOT silently drop the typed sentence (the failure mode this - fixture is designed to catch). +# Raw intention: record today's top trending GitHub repo in Yuque, with agent-investigation prompts + +I'm on `https://github.com/trending`. My workflow is: + +1. Open the **top 1 trending repository** by position — whichever + repository is ranked first on the day the routine runs, not the + specific repo I happened to click in the recording. +2. Switch to Yuque and create a new document in the **`AI专用`** + knowledge base. +3. Set the document title to `Most trending project YYYY-MM-DD`, + where `YYYY-MM-DD` is today's date on the day of replay (not the + recording date). +4. 
In the document body, paste the repo URL and the repo's short + description from the GitHub "About" sidebar. +5. After the URL and description, I typed the sentence + `Write also: 1. A brief intro 2. What's special 3. Why's it trending`. + That sentence is **instructions for the replay agent**, not static + text I want left in the document. The replay agent should follow + that instruction: visit the repo page, then write the three + additional sections (a brief intro to the project, what's special + about it, why it's trending today) into the Yuque document. The + sentence itself should not appear verbatim in the final document. + +From the compiler's point of view there is one genuinely ambiguous +choice. I only opened the top-1 result, so there is no count +ambiguity, but the compiler cannot tell from the trace whether I +meant "open the top-ranked repo on whatever day this runs" (position) +or "always open this specific repo" (identity). The answer is +position — but a good compiler should ask. + +The date in the title is similarly templated to today's date at +replay time, not the date I happened to type during the recording. + +The typed sentence is plainly visible in the trace, so the compiler +should not ask me what I typed. A good compiler should also be able +to infer that the sentence is an instruction (imperative phrasing, +plus content I could not know yet at recording time about a repo I +hadn't visited) without asking. diff --git a/eval/routine_eval/fixtures/techforum_count_ambiguous/expectations.yaml b/eval/routine_eval/fixtures/techforum_count_ambiguous/expectations.yaml index f0c0486..0048717 100644 --- a/eval/routine_eval/fixtures/techforum_count_ambiguous/expectations.yaml +++ b/eval/routine_eval/fixtures/techforum_count_ambiguous/expectations.yaml @@ -22,11 +22,13 @@ expected_questions: - "What search query to use" expected_keywords: - # The search input has id="main-search" which is a clean keyword. 
+ # The search input has id="main-search", which is a clean keyword + # the compiler is welcome to emit, but text-input targets are + # intentionally NOT in must_have_for_steps — the runtime + # disambiguates an input via the act of typing into it after focus, + # so a Keywords token is nice-to-have rather than load-bearing. # The upvote targets are per-card buttons inside dynamic search # results, so Keywords lines on individual upvote steps are not - # required (per the "skip for per-row items in a dynamic list" rule). - must_have_for_steps: - - description: "the search input field" - acceptable_tokens: - - main-search + # required either (per the "skip for per-row items in a dynamic + # list" rule). + must_have_for_steps: [] diff --git a/eval/routine_eval/user_proxy.py b/eval/routine_eval/user_proxy.py index cb0dfac..e9ee72a 100644 --- a/eval/routine_eval/user_proxy.py +++ b/eval/routine_eval/user_proxy.py @@ -161,11 +161,13 @@ class SubmitJudgmentAction(Action): le=1.0, description=( "How well the compiler asked the right clarification questions. " - "1.0 means the compiler asked at least one question about every " - "topic in `expected_questions.required` AND asked ZERO questions " - "matching anything in `expected_questions.forbidden`. Each " - "missed required topic or each forbidden question lowers the " - "score." + "Asymmetric: 1.0 means the compiler asked at least one question " + "covering every topic in `expected_questions.required`. Each " + "missed required topic lowers the score. Extra questions — " + "including ones that overlap topics in " + "`expected_questions.forbidden` — are NOT penalized. Asking " + "too much is acceptable; failing to ask required things is " + "not." ), ) asking_behavior_reasoning: str = Field( @@ -231,6 +233,12 @@ class JudgmentResult: reasoning: dict[str, str] = field(default_factory=dict) cost: float = 0.0 total_tokens: int = 0 + # Inputs and raw outputs of the judge LLM call. 
Captured so the eval + # runner can dump them as a per-fixture artifact for offline review. + # Intentionally excluded from to_dict() — they are large and would + # bloat the canonical regression report. + prompt: str = "" + raw_args: dict[str, Any] = field(default_factory=dict) def to_dict(self) -> dict[str, Any]: return { @@ -428,20 +436,55 @@ def judge_routine( actions (e.g. downvote instead of upvote), missing required steps, or \ extra steps that the user did not intend. -2. **keyword_placement** — For each step, check whether the step carries a \ -`**Keywords:**` line. The fixture's \ -`expected_keywords.must_have_for_steps` lists the steps where a clean stable \ -identifier was available and a Keywords line is therefore required; penalize \ -missing Keywords lines on those steps, and penalize Keywords tokens that \ -clearly violate the priority list (hashed/CSS-module names, overly generic \ -words, multi-word values). If the fixture lists no `must_have_for_steps` \ -entries, score 1.0 as long as any Keywords lines present look valid. - -3. **asking_behavior** — Compare the actually-asked questions against \ -`expected_questions.required` (compiler should have asked about each) and \ -`expected_questions.forbidden` (compiler should NOT have asked about any). \ -1.0 if every required topic was covered by at least one asked question AND \ -no asked question matches any forbidden topic. Each miss lowers the score. +2. **keyword_placement** — The fixture's \ +`expected_keywords.must_have_for_steps` lists *target elements* (not \ +specific routine step numbers) where a clean stable identifier was \ +available and a Keywords line is therefore expected somewhere in the \ +routine. Each entry has a `description` of the target and a list of \ +`acceptable_tokens`. 
\ +\ +For each entry, find the routine step(s) that interact with the described \ +target — note that a single routine step often legitimately merges \ +multiple interactions ("open the menu, click the option, then click \ +the destination") per the compiler prompt's "merge low-level trace events \ +into meaningful high-level steps" rule. The must_have target may be one \ +of several interactions inside a merged step. \ +\ +Score by counting required-target satisfaction: \ +- ✓ The step(s) covering this target carry a `**Keywords:**` line with \ + one of the `acceptable_tokens`, OR with a different token that is \ + itself plausibly distinctive (priority rules 1–7 in the compiler \ + prompt) AND clearly addresses *this* target. \ +- ✗ No step covering this target has any Keywords line at all. \ +- ✗ The covering step has a Keywords line, but the chosen token \ + addresses a *different* interaction in the same merged step (e.g. \ + the routine merges "click new-doc button" + "click knowledge-base \ + name", and Keywords targets the knowledge-base name when the \ + must_have target is the new-doc button). When this happens, say so \ + in the reasoning — the chosen token is still a valid keyword for a \ + different target, it just doesn't satisfy *this* must_have entry. \ + Don't dismiss it as "wrong" or "irrelevant". \ +\ +Penalize tokens that are clearly disqualified per the priority list: \ +hashed/CSS-module names (`css-x4j8b27`, `Mui…`, `sc-…`, ending in \ +random-looking suffixes), overly generic words that would match dozens \ +of elements (`btn`, `submit`, `link`, `wrapper`, `input`), or \ +multi-word values. \ +\ +If the fixture lists no `must_have_for_steps` entries (or the list is \ +empty), score 1.0 as long as any Keywords lines present look valid. + +3. **asking_behavior** — Asymmetric. Compare the actually-asked \ +questions against `expected_questions.required`: 1.0 if every required \ +topic was covered by at least one asked question. 
Each missed required \ +topic lowers the score. \ +\ +Extra questions — including ones that overlap topics in \ +`expected_questions.forbidden` — do NOT lower the score. The compiler \ +erring on the side of asking more is acceptable behaviour; under-asking \ +is the actual failure mode this axis is here to catch. The `forbidden` \ +list is informational context (signaling topics that *shouldn't* need \ +asking because the trace makes them obvious), not a penalty trigger. `overall_pass` is True iff all three scores are >= 0.8. @@ -521,4 +564,6 @@ def _score(key: str) -> float: }, cost=cost, total_tokens=total_tokens, + prompt=prompt, + raw_args=args if isinstance(args, dict) else {}, ) diff --git a/pyproject.toml b/pyproject.toml index 607d031..256f2fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,5 +76,5 @@ override-dependencies = [ ] [tool.uv.sources] -openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "32e6edba2178eac73afea6d0a3bdf452d621394a" } -openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "32e6edba2178eac73afea6d0a3bdf452d621394a" } +openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "9b289cd393078641ea413dfd5f45d443dbb10b17" } +openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "9b289cd393078641ea413dfd5f45d443dbb10b17" } diff --git a/uv.lock b/uv.lock index ac92bfb..a34b2f2 100644 --- a/uv.lock +++ b/uv.lock @@ -1678,8 +1678,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = 
"https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=32e6edba2178eac73afea6d0a3bdf452d621394a" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=32e6edba2178eac73afea6d0a3bdf452d621394a" }, + { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=9b289cd393078641ea413dfd5f45d443dbb10b17" }, + { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=9b289cd393078641ea413dfd5f45d443dbb10b17" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2224,7 +2224,7 @@ wheels = [ [[package]] name = "openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=32e6edba2178eac73afea6d0a3bdf452d621394a#32e6edba2178eac73afea6d0a3bdf452d621394a" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=9b289cd393078641ea413dfd5f45d443dbb10b17#9b289cd393078641ea413dfd5f45d443dbb10b17" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2244,7 +2244,7 @@ dependencies = [ [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=32e6edba2178eac73afea6d0a3bdf452d621394a#32e6edba2178eac73afea6d0a3bdf452d621394a" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=9b289cd393078641ea413dfd5f45d443dbb10b17#9b289cd393078641ea413dfd5f45d443dbb10b17" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" },