From a80f3de679930f0b106fb952c2108c686d437cf9 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 16 Apr 2026 21:43:34 +0800 Subject: [PATCH 1/9] skill(open-browser): document portable ~/.claude/skills invocation path The skill is now symlinked into ~/.claude/skills/open-browser/ for global use. Update every `python3 skill/claude/open-browser/...` reference to `python3 ~/.claude/skills/open-browser/...` so the same command works from any project's CWD (including inside the OpenBrowser repo, where the symlink still resolves back here). Co-Authored-By: Claude Opus 4.6 (1M context) --- skill/claude/open-browser/SKILL.md | 16 +++++++++------- skill/claude/open-browser/references/setup.md | 2 +- .../claude/open-browser/scripts/check_status.py | 2 +- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/skill/claude/open-browser/SKILL.md b/skill/claude/open-browser/SKILL.md index 1130574..3b7478c 100644 --- a/skill/claude/open-browser/SKILL.md +++ b/skill/claude/open-browser/SKILL.md @@ -37,7 +37,7 @@ Before sending a browser task, confirm all of the following: Run this first: ```bash -python3 skill/claude/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" +python3 ~/.claude/skills/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" ``` If readiness fails, read [references/setup.md](references/setup.md) or @@ -72,7 +72,7 @@ Code, because the SSE stream becomes part of your conversation context without any extra plumbing: ```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Open https://example.com and report the page title" \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" ``` @@ -115,7 +115,7 @@ encoded, and sent as data URIs — no upload endpoint or static server is required. Limit: 10 MB per image, up to 8 images per message. 
```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Open the local dashboard and tell me which section looks different from this screenshot." \ --image /tmp/reference.png \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" @@ -141,7 +141,7 @@ keeps its prior screenshots and observations), reuse the conversation ID from the previous run: ```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Now click the 'Sign in' button you just identified" \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" \ --conversation-id 1b32b26a-1a7e-4b6c-9599-139fc6b9c89b @@ -153,14 +153,16 @@ report a value it already saw. ## Working Directory -Run commands from the OpenBrowser repo root so the relative script -paths resolve cleanly. +The skill's scripts live at `~/.claude/skills/open-browser/` so they +work from any project's current working directory. The OpenBrowser +server itself must still be started from the repo root +(`uv run local-chrome-server serve` in `~/git/OpenBrowser`). Use `--cwd` when the browser task should operate with context from another workspace: ```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Open the local app and verify the login flow" \ --cwd /absolute/path/to/project \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" diff --git a/skill/claude/open-browser/references/setup.md b/skill/claude/open-browser/references/setup.md index 5abbc1c..477596e 100644 --- a/skill/claude/open-browser/references/setup.md +++ b/skill/claude/open-browser/references/setup.md @@ -45,7 +45,7 @@ drive the browser that registered it. 
## Quick verification ```bash -python3 skill/claude/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" +python3 ~/.claude/skills/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" ``` Expected outcome: diff --git a/skill/claude/open-browser/scripts/check_status.py b/skill/claude/open-browser/scripts/check_status.py index c218162..8752bf7 100644 --- a/skill/claude/open-browser/scripts/check_status.py +++ b/skill/claude/open-browser/scripts/check_status.py @@ -136,7 +136,7 @@ def main() -> int: print("Ready for browser automation.") return 0 - print("Not ready. See skill/claude/open-browser/references/setup.md if needed.") + print("Not ready. See ~/.claude/skills/open-browser/references/setup.md if needed.") return 1 From ac7fa57da1c8ebbf58a18bfb871908a8aa090609 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Fri, 17 Apr 2026 13:08:10 +0800 Subject: [PATCH 2/9] chore: update openhands-sdk and openhands-tools to bd4cb296355c3d03dd411883e78527b1915fa8c4 Co-Authored-By: Claude Haiku 4.5 --- pyproject.toml | 4 ++-- uv.lock | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dd933be..69ae578 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,5 +76,5 @@ override-dependencies = [ ] [tool.uv.sources] -openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "764fb87256d7bc20b3eccf82c8a4d241e6740d63" } -openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "764fb87256d7bc20b3eccf82c8a4d241e6740d63" } +openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "bd4cb296355c3d03dd411883e78527b1915fa8c4" } +openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "bd4cb296355c3d03dd411883e78527b1915fa8c4" } diff --git a/uv.lock b/uv.lock index 
418acbe..36f3fc5 100644 --- a/uv.lock +++ b/uv.lock @@ -1678,8 +1678,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=2eb7db59461e9117b1e3e0519616b39f1497c0f9" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63" }, + { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4" }, + { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2224,7 +2224,7 @@ wheels = [ [[package]] name = "openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63#764fb87256d7bc20b3eccf82c8a4d241e6740d63" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4#bd4cb296355c3d03dd411883e78527b1915fa8c4" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2244,7 +2244,7 @@ dependencies = [ [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63#764fb87256d7bc20b3eccf82c8a4d241e6740d63" } +source = { git = 
"https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4#bd4cb296355c3d03dd411883e78527b1915fa8c4" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" }, From f046eed0d9455e5644fa5f465ba4be963b721cbb Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Fri, 17 Apr 2026 13:17:32 +0800 Subject: [PATCH 3/9] =?UTF-8?q?skill(ob-routines):=20add=20Browser=20Routi?= =?UTF-8?q?nes=20skill=20(record=20=E2=86=92=20compile=20=E2=86=92=20repla?= =?UTF-8?q?y)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces the ob-routines skill (alias for openbrowser-routines) for capturing, compiling, and replaying named Chrome workflows. Previously lived only in ~/.claude/skills/routines/; now versioned under skill/claude/ob-routines/ so it can be installed via symlink alongside open-browser. Co-Authored-By: Claude Opus 4.7 (1M context) --- skill/claude/ob-routines/SKILL.md | 235 +++++++++++ skill/claude/ob-routines/scripts/compile.py | 390 ++++++++++++++++++ .../ob-routines/scripts/list_routines.py | 121 ++++++ skill/claude/ob-routines/scripts/replay.py | 346 ++++++++++++++++ .../ob-routines/scripts/start_recording.py | 123 ++++++ .../ob-routines/scripts/stop_recording.py | 90 ++++ 6 files changed, 1305 insertions(+) create mode 100644 skill/claude/ob-routines/SKILL.md create mode 100644 skill/claude/ob-routines/scripts/compile.py create mode 100644 skill/claude/ob-routines/scripts/list_routines.py create mode 100644 skill/claude/ob-routines/scripts/replay.py create mode 100644 skill/claude/ob-routines/scripts/start_recording.py create mode 100644 skill/claude/ob-routines/scripts/stop_recording.py diff --git a/skill/claude/ob-routines/SKILL.md b/skill/claude/ob-routines/SKILL.md new file mode 100644 index 0000000..d9ecb7a --- /dev/null +++ b/skill/claude/ob-routines/SKILL.md @@ -0,0 +1,235 @@ +--- +name: ob-routines +description: Record, compile, and replay Browser 
Routines — saved, named browser workflows. (Alias for openbrowser-routines.) Supports subcommands: "list [query]" to list/search routines, "new" to record a new routine, "execute <name>" to replay a saved routine. Use when the user says "list routines", "record a routine", "replay X", "execute X", or "/ob-routines <subcommand>". +--- + +# Browser Routines + +Browser Routines are named, compiled workflows captured from real Chrome sessions. +The pipeline has four stages: **record → compile → name → replay**. + +## Subcommand dispatch + +When invoked with arguments, act immediately — do not ask the user what they want: + +| Invocation | Action | +|---|---| +| `/ob-routines` | Show available routines and ask what to do | +| `/ob-routines list [query]` | Run `list_routines.py [query]` and display results | +| `/ob-routines new` | Ask what flow to record, then start the full record → compile pipeline | +| `/ob-routines execute <name>` | Run `replay.py <name>` immediately | + +--- + +## Your role during compilation + +You are a **bridge and quality gate**, not the compiler. The Compiler Agent does +the reasoning; you ensure it did its job correctly before finalizing. + +### Bridge duties +1. Run `compile.py` in a tmux pane (mandatory — see below). +2. Watch for `[compiler:question]` — relay it to the user, send their answer back. +3. Watch for `[compiler:stalled]` — show the agent's message, optionally prompt a follow-up. +4. At `[compiler:name_prompt]` — help the user pick a short slug. + +### Quality gate (run before every finalize) + +After the compiler reports `status=review`, read the compiled routine markdown +and check **both** of the following before calling `/compile/finalize`: + +#### Gate 1 — Intent clarity +Did the compiler understand *why* the user performed each action, not just *what* +they clicked? 
Red flags: +- Steps that say "click X" with no explanation of goal or condition +- A position-based selection from a sorted/filtered list without asking whether + to replay by position or by identity (e.g. "upvote the top 3 posts" — top 3 + today vs. the same 3 posts always?) +- A value (date, search query, ticker, ID) that will obviously change between + runs, not parameterized + +If any red flag is present and the compiler did NOT ask about it: relay the +ambiguity to the user yourself, get their answer, then send it via +`POST /recordings/{id}/compile/answer` so the compiler can revise. + +#### Gate 2 — Delivery goal for read-only workflows + +A workflow is **read-only** if it has no form submission, no purchase, no +send/post/create/delete action — the user only navigated, read, filtered, or +inspected. For read-only workflows, ask: does the compiled routine end with a +delivery step (a `file_editor` write, a `terminal` command, or an explicit +instruction to report results in chat)? + +**If the routine is read-only AND has no delivery step, the compiler made an +error.** Do not finalize. Instead: + +1. Tell the user: "This routine reads data but doesn't capture results anywhere. + How do you want results delivered on replay?" + - (a) Summary shown in chat (brief / structured table / full details?) + - (b) Written to a local file (path + format: plain text, Markdown, CSV, JSON?) + - (c) Both +2. Get their answer. +3. Send it to the compiler via `POST /recordings/{id}/compile/answer` — the + compiler will revise the routine to include the delivery step. +4. Wait for the next `status=review`, then re-run both gates. + +> **Why this matters:** A routine that just clicks through pages is useless on +> replay — OpenBrowser will navigate and stop with no output. The delivery step +> is what makes the routine meaningful. + +--- + +## Preconditions + +**First time?** Complete the full setup in `skill/claude/open-browser/references/setup.md` +before using this skill. 
That guide covers: loading the Chrome extension, connecting +it to the server, and obtaining a valid `OPENBROWSER_CHROME_UUID`. Without that, +recording and replay will fail immediately. + +For subsequent uses, confirm: +- OpenBrowser server at `http://127.0.0.1:8765` +- Chrome extension connected +- `OPENBROWSER_CHROME_UUID` set (or passed via `--chrome-uuid`) + +Quick check: +```bash +python3 skill/claude/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" +``` + +Start the server if needed: +```bash +cd /Users/yangxiao/git/OpenBrowser && uv run local-chrome-server serve +``` + +Scripts path: `skill/claude/ob-routines/scripts/` (run from repo root). + +--- + +## List & search routines + +```bash +python3 skill/claude/ob-routines/scripts/list_routines.py +python3 skill/claude/ob-routines/scripts/list_routines.py "login" +python3 skill/claude/ob-routines/scripts/list_routines.py --recordings +``` + +--- + +## Record a routine + +### Step 1 — start recording +```bash +python3 skill/claude/ob-routines/scripts/start_recording.py \ + --chrome-uuid "$OPENBROWSER_CHROME_UUID" \ + --name "xiaohongshu-messages" \ + --intent "check messages on Xiaohongshu" +``` + +Prints `[recording:started] <recording_id>`. **Save this ID.** + +Tell the user: **"Perform your actions in the browser window, then come back and say done."** +Do NOT proceed until the user confirms. + +### Step 2 — stop recording +```bash +python3 skill/claude/ob-routines/scripts/stop_recording.py <recording_id> +``` + +--- + +## Compile to a routine — MANDATORY: tmux interactive session + +**compile.py uses `input()` for Q&A and the name prompt. It MUST run in an +interactive shell. 
Never invoke it directly via the Bash tool — it will block +and then be killed, losing the compiler session.** + +### Launch in tmux +```bash +tmux new-window -n "compile" \ + "cd /Users/yangxiao/git/OpenBrowser && python3 skill/claude/ob-routines/scripts/compile.py <recording_id> ; echo '[compile-done]'" +``` + +### Monitor output +```bash +tmux capture-pane -t "compile" -p +``` + +### Send an answer +```bash +tmux send-keys -t "compile" "the answer" Enter +``` + +### Markers to watch for + +| Marker | Your action | +|---|---| +| `[compiler:thought]` / `[compiler:action]` | Relay as progress to user | +| `[compiler:question] <question>` | Relay to user, wait for answer, send via `tmux send-keys` | +| `[compiler:stalled] <message>` | Show message, ask user for follow-up | +| `[compiler:complete] goal=… steps=N` | Compilation reached review state | +| `[compiler:routine_draft]` | Full routine markdown printed for inspection | +| `[compiler:gate_check]` | **Run both quality gates here.** Send feedback or press Enter | +| `[compiler:name_prompt]` | Gates passed — help user pick slug | +| `[compiler:saved]` | Done — report name and id | + +### Quality gate checkpoint +When `[compiler:gate_check]` appears in the pane, compile.py is explicitly +paused waiting for your review of `[compiler:routine_draft]`. Run Gate 1 and Gate 2: + +- **Gates pass** → send an empty Enter: `tmux send-keys -t "compile" "" Enter` +- **Gate fails** → send corrective feedback: + `tmux send-keys -t "compile" "Please add a delivery step: summarise results in chat as a structured list of tickers with metrics." Enter` + +compile.py forwards non-empty input back to the compiler, streams the revision, +and loops back to another `[compiler:gate_check]`. Only an empty Enter advances +to `[compiler:name_prompt]`. + +**Never send gate feedback at the `[compiler:name_prompt]` stage** — that input +goes directly to the routine name field, not the compiler. 
+ +--- + +## Replay a routine + +```bash +python3 skill/claude/ob-routines/scripts/replay.py "routine-name" \ + --chrome-uuid "$OPENBROWSER_CHROME_UUID" + +# List without replaying +python3 skill/claude/ob-routines/scripts/replay.py --list +``` + +Name matching: exact → ID → prefix → substring. + +--- + +## Full example workflow + +``` +1. /ob-routines new → ask user what to record +2. start_recording → [recording:started] abc123 +3. (user records in browser, says "done") +4. stop_recording abc123 → [recording:events] 21 events +5. tmux new-window "compile.py abc123" +6. monitor pane → relay questions → send answers +7. [compiler:complete] → run Gate 1 + Gate 2 + Gate 2 fails: routine is read-only, no delivery step + → ask user: chat summary, file, or both? + → send answer via tmux send-keys + → wait for next [compiler:complete] +8. Gates pass → [compiler:name_prompt] → user picks slug +9. [compiler:saved] name='…' id=… +10. /ob-routines execute → streams [action] … [complete] +``` + +--- + +## Failure handling + +- **Server unreachable**: `uv run local-chrome-server serve` +- **Browser UUID invalid**: reconnect Chrome extension, get fresh UUID +- **0 events captured**: browser disconnected; re-record +- **tmux not found**: `brew install tmux` +- **tmux window conflict**: check `tmux list-windows`, use a unique `-n` name +- **Compiler session expired** (pane exited before finalize): call + `POST /recordings/{id}/compile` again to restart — session is fresh +- **Relay stuck**: `[observation:error]` lines in SSE stream; relay to user diff --git a/skill/claude/ob-routines/scripts/compile.py b/skill/claude/ob-routines/scripts/compile.py new file mode 100644 index 0000000..5ed9a92 --- /dev/null +++ b/skill/claude/ob-routines/scripts/compile.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +"""Compile a stopped recording into a named Browser Routine. 
+ +Starts the Compiler Agent, streams its SSE output, and acts as a bridge +between the agent and the user: + - Agent reasoning and tool calls are printed to stdout as they arrive. + - When the compiler agent asks a clarification question (status=asking), + this script prints the question and reads the user's answer from stdin, + then resumes compilation via /compile/answer. + - When the agent stalls (status=stalled), the agent's last message is + shown and the user can send a follow-up. + - When compilation completes (status=review), the script prompts the user + to name the routine, then calls /compile/finalize to save it. + +The outer agent (Claude Code / Codex) should relay the printed questions to +the user and feed their responses back via stdin — it should NOT try to +re-implement compiler logic. + +Example: + python3 compile.py abc123-recording-id + python3 compile.py abc123-recording-id --model-alias fast +""" + +from __future__ import annotations + +import argparse +import json +import sys +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 15, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +# --------------------------------------------------------------------------- +# SSE event formatting (same conventions as send_task.py) +# --------------------------------------------------------------------------- + + +def _format_compiler_event(event_type: str, data: dict) -> None: 
+ """Print one SSE event from the compiler agent stream.""" + if event_type == "error": + print(f"[compiler:error] {data.get('error', data)}", flush=True) + return + + if event_type != "agent_event": + # Pass-through for unknown top-level event types + print(f"[{event_type}] {json.dumps(data, ensure_ascii=False)}", flush=True) + return + + data_type = data.get("type", "unknown") + + if data_type == "SystemPromptEvent": + text_len = len(data.get("text", "")) + print( + f"[compiler:system_prompt] suppressed ({text_len} chars)", + flush=True, + ) + return + + if data_type == "ThoughtEvent": + thought = data.get("thought", data.get("content", "")) + print(f"[compiler:thought] {thought}", flush=True) + return + + if data_type == "ActionEvent": + action = data.get("action", {}) + if isinstance(action, dict): + action_name = action.get("action", "unknown") + if action_name == "ask_user": + question = action.get("question", "") + print(f"[compiler:ask_user] {question}", flush=True) + else: + # FileEditorTool, TraceViewerTool, SubmitWorkflowTool, etc. 
+ extras = { + k: v for k, v in action.items() + if k != "action" and v is not None + } + suffix = (" " + json.dumps(extras, ensure_ascii=False)) if extras else "" + print(f"[compiler:action] {action_name}{suffix}", flush=True) + else: + print(f"[compiler:action] {action}", flush=True) + return + + if data_type == "ObservationEvent": + success = data.get("success", False) + message = data.get("message", "") + state = "ok" if success else "error" + print(f"[compiler:observation:{state}] {message}", flush=True) + return + + if data_type == "MessageEvent": + role = data.get("role", "unknown") + text = data.get("text", "") + print(f"[compiler:message:{role}] {text}", flush=True) + return + + if data_type == "ErrorEvent": + print(f"[compiler:error] {data.get('error', 'unknown error')}", flush=True) + return + + print( + f"[compiler:agent_event:{data_type}] {json.dumps(data, ensure_ascii=False)}", + flush=True, + ) + + +# --------------------------------------------------------------------------- +# SSE streaming +# --------------------------------------------------------------------------- + + +def _stream_sse(url: str, body: dict) -> dict | None: + """POST to url with body, stream SSE events, return the final complete result. + + Returns the ``result`` dict from the complete event, or None on error. 
+ """ + req = Request( + url, + data=json.dumps(body).encode("utf-8"), + headers={ + "Content-Type": "application/json", + "Accept": "text/event-stream", + }, + method="POST", + ) + + complete_result: dict | None = None + sse_event: str | None = None + sse_data: str | None = None + + try: + with urlopen(req, timeout=None) as response: + for raw_line in response: + line = raw_line.decode("utf-8").rstrip("\n") + if not line: + if sse_event and sse_data is not None: + try: + parsed = json.loads(sse_data) + except json.JSONDecodeError: + parsed = {"raw": sse_data} + + if sse_event == "complete": + complete_result = parsed.get("result", parsed) + else: + _format_compiler_event(sse_event, parsed) + + sse_event = None + sse_data = None + continue + + if line.startswith("event:"): + sse_event = line[6:].strip() + elif line.startswith("data:"): + sse_data = line[5:].lstrip() + + except HTTPError as exc: + body_text = exc.read().decode("utf-8", errors="replace") + print(f"[compiler:http_error] {exc.code} {exc.reason}: {body_text}", file=sys.stderr) + return None + + return complete_result + + +# --------------------------------------------------------------------------- +# Compile loop +# --------------------------------------------------------------------------- + + +def compile_recording(base_url: str, recording_id: str, model_alias: str | None) -> int: + """Run the compile → Q&A → finalize flow. 
Returns exit code.""" + print(f"[compiler:start] recording={recording_id}", flush=True) + + # ── Phase 1: initial compile ────────────────────────────────────────── + compile_body: dict = {} + if model_alias: + compile_body["model_alias"] = model_alias + + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile", + body=compile_body, + ) + if result is None: + return 1 + + # ── Phase 2: Q&A loop ───────────────────────────────────────────────── + while True: + status = result.get("status") + + if status == "asking": + question = result.get("question", "") + print(f"\n[compiler:question] {question}", flush=True) + print("[compiler:waiting_for_answer] Type your answer and press Enter:", + flush=True) + try: + answer = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Compilation cancelled.", flush=True) + return 130 + + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile/answer", + body={"answer": answer}, + ) + if result is None: + return 1 + + elif status == "stalled": + # Agent replied in prose instead of calling ask_user. + # Show the message and let the user send a follow-up. + message = result.get("message", "") + if message: + print(f"\n[compiler:stalled] {message}", flush=True) + print( + "[compiler:waiting_for_follow_up] Agent stalled — send a follow-up " + "(or press Enter to continue without one):", + flush=True, + ) + try: + follow_up = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Compilation cancelled.", flush=True) + return 130 + + if not follow_up: + follow_up = "Please continue." + + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile/answer", + body={"answer": follow_up}, + ) + if result is None: + return 1 + + elif status == "review": + # Compilation done — show the draft and pause for quality gate + # before proceeding to the name prompt. 
The outer agent (Claude + # Code / Codex) reads the routine here and may send corrective + # feedback (e.g. missing delivery step) via the gate prompt. + # Only an empty Enter moves forward to naming. + goal = result.get("goal", "") + step_count = result.get("step_count", "?") + routine_markdown = result.get("routine_markdown", "") + print(f"\n[compiler:complete] goal={goal!r} steps={step_count}", flush=True) + if routine_markdown: + print(f"[compiler:routine_draft]\n{routine_markdown}", flush=True) + print( + "\n[compiler:gate_check] Review the routine above.\n" + "Press Enter to proceed to naming, or type feedback to send back to the compiler:", + flush=True, + ) + try: + gate_input = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Compilation cancelled.", flush=True) + return 130 + + if gate_input: + # Outer agent has feedback — send it back to the compiler + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile/answer", + body={"answer": gate_input}, + ) + if result is None: + return 1 + # Loop back to handle the next status + continue + + # Gate passed — proceed to naming + break + + else: + print( + f"[compiler:unexpected_status] {status} — result: {result}", + file=sys.stderr, + ) + return 1 + + # ── Phase 3: name the routine and finalize ──────────────────────────── + goal = result.get("goal", "") + step_count = result.get("step_count", "?") + + # Suggest a slug derived from the goal + suggested = _slugify(goal) if goal else "my-routine" + print( + f"\n[compiler:name_prompt] Suggested name: {suggested!r}\n" + f"Accept (press Enter) or type a new name:", + flush=True, + ) + try: + chosen_name = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Finalization cancelled.", flush=True) + return 130 + + if not chosen_name: + chosen_name = suggested + + # ── Phase 4: finalize ───────────────────────────────────────────────── + try: + finalize_result = request_json( + 
f"{base_url}/recordings/{recording_id}/compile/finalize", + method="POST", + body={"name": chosen_name}, + ) + except HTTPError as exc: + body_text = exc.read().decode("utf-8", errors="replace") + print(f"[compiler:finalize_error] {exc.code}: {body_text}", file=sys.stderr) + return 1 + except Exception as exc: + print(f"[compiler:finalize_error] {exc}", file=sys.stderr) + return 1 + + routine = finalize_result.get("routine", {}) + routine_id = routine.get("routine_id", "?") + name = routine.get("name", chosen_name) + steps = routine.get("step_count", "?") + + print(f"[compiler:saved] name={name!r} id={routine_id} steps={steps}", flush=True) + print( + f"\nRoutine saved. To replay it, run:\n\n" + f" python3 replay.py {name!r}\n", + flush=True, + ) + return 0 + + +def _slugify(text: str) -> str: + """Turn a goal string into a short, lowercase, hyphenated slug.""" + import re + # Lowercase, keep only alnum and spaces, collapse and replace with hyphens + slug = re.sub(r"[^\w\s]", "", text.lower()) + slug = re.sub(r"\s+", "-", slug.strip()) + # Truncate to 40 chars, trim trailing hyphens + slug = slug[:40].rstrip("-") + return slug or "routine" + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Compile a stopped recording into a named Browser Routine", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("recording_id", help="Recording ID from stop_recording.py") + parser.add_argument( + "--model-alias", + help="LLM model alias to use for compilation (uses server default if omitted)", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + try: + return compile_recording(args.url, args.recording_id, args.model_alias) + except URLError as 
exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except KeyboardInterrupt: + print("Interrupted.", file=sys.stderr) + return 130 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/list_routines.py b/skill/claude/ob-routines/scripts/list_routines.py new file mode 100644 index 0000000..24a1dac --- /dev/null +++ b/skill/claude/ob-routines/scripts/list_routines.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 +"""List saved routines and/or stopped recordings. + +Routines are named, compiled browser workflows ready to replay. +Recordings are raw captured traces that may not yet be compiled. + +Examples: + python3 list_routines.py # list all routines + python3 list_routines.py login # filter by name/goal substring + python3 list_routines.py --recordings # list stopped recordings instead + python3 list_routines.py --recordings login # filter recordings by name +""" + +from __future__ import annotations + +import argparse +import json +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + + +def request_json(url: str, *, timeout: int = 10) -> dict: + req = Request(url, headers={"Accept": "application/json"}) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +def list_routines(base_url: str, query: str | None) -> int: + try: + data = request_json(f"{base_url}/routines") + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + + items = data.get("routines", []) + if query: + q = query.lower() + items = [ + r for r in items + if q in r["name"].lower() or q in r.get("goal", "").lower() + ] + + if not items: + suffix = f" matching {query!r}" if query else "" + print(f"No routines found{suffix}.") + return 0 + + print(f"{'NAME':<30} {'STEPS':>5} {'GOAL'}") + print("-" * 72) + for r in items: + name = r["name"] + steps = r.get("step_count", "?") + goal = r.get("goal", "") 
+ routine_id = r["routine_id"] + print(f"{name:<30} {steps:>5} {goal}") + print(f" id={routine_id}") + return 0 + + +def list_recordings(base_url: str, query: str | None) -> int: + try: + data = request_json(f"{base_url}/recordings?status=stopped") + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + + items = data.get("recordings", []) + if query: + q = query.lower() + items = [ + r for r in items + if q in (r.get("name") or "").lower() + ] + + if not items: + suffix = f" matching {query!r}" if query else "" + print(f"No stopped recordings found{suffix}.") + return 0 + + print(f"{'NAME':<30} {'EVENTS':>6} {'RECORDING ID'}") + print("-" * 72) + for r in items: + name = r.get("name") or "(unnamed)" + events = r.get("event_count", "?") + recording_id = r["recording_id"] + compiled = "(compiled)" if (r.get("metadata") or {}).get("routine_id") else "" + print(f"{name:<30} {events:>6} {recording_id} {compiled}") + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser( + description="List saved routines or stopped recordings", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "query", + nargs="?", + help="Filter by name or goal substring (case-insensitive)", + ) + parser.add_argument( + "--recordings", + action="store_true", + help="List stopped recordings instead of compiled routines", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + if args.recordings: + return list_recordings(args.url, args.query) + return list_routines(args.url, args.query) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/replay.py b/skill/claude/ob-routines/scripts/replay.py new file mode 100644 index 0000000..5520fb4 --- /dev/null +++ b/skill/claude/ob-routines/scripts/replay.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +"""Execute a saved 
Browser Routine in Chrome. + +Looks up the routine by name (exact or prefix match, case-insensitive), +creates an agent conversation in routine_replay mode, sends the routine +markdown as the task, and streams execution output. + +Examples: + python3 replay.py "techforum-upvote" --chrome-uuid "$OPENBROWSER_CHROME_UUID" + python3 replay.py login # prefix match + python3 replay.py --list # list all available routines +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 10, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +# --------------------------------------------------------------------------- +# Routine lookup +# --------------------------------------------------------------------------- + + +def find_routine(base_url: str, query: str) -> dict | None: + """Return a single routine matching query by exact name, then prefix, then substring.""" + data = request_json(f"{base_url}/routines") + routines = data.get("routines", []) + if not routines: + return None + + q = query.lower() + + # 1. Exact name match + for r in routines: + if r["name"].lower() == q: + return r + + # 2. Exact routine_id match + for r in routines: + if r["routine_id"].lower() == q: + return r + + # 3. 
Prefix match on name + prefix = [r for r in routines if r["name"].lower().startswith(q)] + if len(prefix) == 1: + return prefix[0] + if len(prefix) > 1: + print("[replay:ambiguous] Multiple routines match that prefix:", flush=True) + for r in prefix: + print(f" {r['name']} (id={r['routine_id']})", flush=True) + print("Provide a more specific name or the full routine_id.", flush=True) + return None + + # 4. Substring match on name or goal + sub = [ + r for r in routines + if q in r["name"].lower() or q in r.get("goal", "").lower() + ] + if len(sub) == 1: + return sub[0] + if len(sub) > 1: + print("[replay:ambiguous] Multiple routines match that substring:", flush=True) + for r in sub: + print(f" {r['name']} (id={r['routine_id']})", flush=True) + print("Provide a more specific name or the full routine_id.", flush=True) + return None + + return None + + +# --------------------------------------------------------------------------- +# SSE streaming (same conventions as send_task.py) +# --------------------------------------------------------------------------- + + +def _format_event(event_type: str, data: dict) -> None: + if event_type == "complete": + print(f"[complete] {data.get('message', '')}", flush=True) + return + + if event_type == "usage_metrics": + metrics = data.get("metrics", {}) + model_name = metrics.get("model_name", "unknown") + cost = metrics.get("accumulated_cost", 0) + token_usage = metrics.get("accumulated_token_usage", {}) + total_tokens = token_usage.get("total_tokens", 0) + if total_tokens == 0: + total_tokens = ( + token_usage.get("prompt_tokens", 0) + + token_usage.get("completion_tokens", 0) + + token_usage.get("reasoning_tokens", 0) + ) + print( + f"[usage] model={model_name} cost_rmb={cost:.6f} tokens={total_tokens}", + flush=True, + ) + return + + if event_type != "agent_event": + print(f"[{event_type}] {json.dumps(data, ensure_ascii=False)}", flush=True) + return + + data_type = data.get("type", "unknown") + + if data_type == 
"SystemPromptEvent": + text_len = len(data.get("text", "")) + print( + f"[system_prompt] suppressed ({text_len} chars)", + flush=True, + ) + return + + if data_type == "MessageEvent": + role = data.get("role", "unknown") + text = data.get("text", "") + print(f"[message:{role}] {text}", flush=True) + return + + if data_type == "ThoughtEvent": + thought = data.get("thought", data.get("content", "")) + print(f"[thought] {thought}", flush=True) + return + + if data_type == "ActionEvent": + action = data.get("action", {}) + if isinstance(action, dict): + action_name = action.get("action", "unknown") + element_id = action.get("element_id") + url = action.get("url") + text = action.get("text") + extras = [] + if element_id: + extras.append(f"element_id={element_id}") + if url: + extras.append(f"url={url}") + if text: + extras.append(f"text={text!r}") + suffix = (" " + " ".join(extras)) if extras else "" + print(f"[action] {action_name}{suffix}", flush=True) + else: + print(f"[action] {action}", flush=True) + return + + if data_type == "ObservationEvent": + success = data.get("success", False) + message = data.get("message", "") + state = "ok" if success else "error" + print(f"[observation:{state}] {message}", flush=True) + return + + if data_type == "ErrorEvent": + print(f"[error] {data.get('error', 'unknown error')}", flush=True) + return + + print( + f"[agent_event:{data_type}] {json.dumps(data, ensure_ascii=False)}", + flush=True, + ) + + +def stream_replay( + base_url: str, + conversation_id: str, + task: str, + cwd: str, + chrome_uuid: str, +) -> None: + req = Request( + f"{base_url}/agent/conversations/{conversation_id}/messages", + data=json.dumps({ + "text": task, + "cwd": cwd, + "browser_id": chrome_uuid, + }).encode("utf-8"), + headers={ + "Content-Type": "application/json", + "Accept": "text/event-stream", + }, + method="POST", + ) + + with urlopen(req, timeout=None) as response: + sse_event: str | None = None + sse_data: str | None = None + for raw_line in 
response: + line = raw_line.decode("utf-8").rstrip("\n") + if not line: + if sse_event and sse_data is not None: + try: + _format_event(sse_event, json.loads(sse_data)) + except json.JSONDecodeError: + print(f"[{sse_event}] {sse_data}", flush=True) + sse_event = None + sse_data = None + continue + + if line.startswith("event:"): + sse_event = line[6:].strip() + elif line.startswith("data:"): + sse_data = line[5:].lstrip() + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Replay a saved Browser Routine in Chrome", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "routine", + nargs="?", + help="Routine name, ID, or prefix to replay", + ) + parser.add_argument( + "--chrome-uuid", + default=os.environ.get("OPENBROWSER_CHROME_UUID"), + help="Browser UUID capability token (or set OPENBROWSER_CHROME_UUID)", + ) + parser.add_argument( + "--cwd", + default=".", + help="Working directory passed to the agent", + ) + parser.add_argument( + "--list", + action="store_true", + help="List available routines and exit", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + try: + if args.list or not args.routine: + data = request_json(f"{args.url}/routines") + routines = data.get("routines", []) + if not routines: + print("No routines saved yet.") + return 0 + print(f"{'NAME':<30} {'STEPS':>5} GOAL") + print("-" * 72) + for r in routines: + print(f"{r['name']:<30} {r.get('step_count', '?'):>5} {r.get('goal', '')}") + return 0 + + if not args.chrome_uuid: + print( + "Browser UUID is required. 
Set OPENBROWSER_CHROME_UUID or pass --chrome-uuid.", + file=sys.stderr, + ) + return 2 + + # ── Find the routine ────────────────────────────────────────────── + routine = find_routine(args.url, args.routine) + if routine is None: + print( + f"[replay:not_found] No routine found matching {args.routine!r}. " + "Run with --list to see available routines.", + file=sys.stderr, + ) + return 1 + + name = routine["name"] + routine_id = routine["routine_id"] + goal = routine.get("goal", "") + routine_markdown = routine.get("routine_markdown", "") + + print(f"[replay:routine] {name} id={routine_id}", flush=True) + if goal: + print(f"[replay:goal] {goal}", flush=True) + + # ── Validate browser UUID ───────────────────────────────────────── + browser_status = request_json(f"{args.url}/browsers/{args.chrome_uuid}/valid") + if not browser_status.get("valid", False): + msg = browser_status.get("message", "browser UUID is not valid") + print(f"Browser UUID validation failed: {msg}", file=sys.stderr) + return 1 + + # ── Create conversation in routine_replay mode ──────────────────── + conv_result = request_json( + f"{args.url}/agent/conversations", + method="POST", + body={ + "cwd": args.cwd, + "browser_id": args.chrome_uuid, + "mode": "routine_replay", + }, + ) + conversation_id = conv_result["conversation_id"] + print(f"[replay:conversation] {conversation_id}", flush=True) + + # ── Send routine markdown as the task ──────────────────────────── + stream_replay( + args.url, + conversation_id, + routine_markdown, + args.cwd, + args.chrome_uuid, + ) + return 0 + + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except KeyboardInterrupt: + print("Interrupted.", file=sys.stderr) + return 130 + except Exception as exc: + print(f"Replay failed: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/start_recording.py 
b/skill/claude/ob-routines/scripts/start_recording.py new file mode 100644 index 0000000..a34fad2 --- /dev/null +++ b/skill/claude/ob-routines/scripts/start_recording.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +"""Start a new browser recording session. + +The server sends a command to the Chrome extension which opens a dedicated +recording window. After this script exits, the user performs their actions +in that browser window. When done, they return to the terminal and run +stop_recording.py with the printed recording_id. + +Example: + python3 start_recording.py \\ + --chrome-uuid "$OPENBROWSER_CHROME_UUID" \\ + --name "Gmail compose flow" \\ + --intent "draft a new email to a contact and send it" +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 10, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Start a new recording session in Chrome", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--chrome-uuid", + default=os.environ.get("OPENBROWSER_CHROME_UUID"), + help="Browser UUID capability token (or set OPENBROWSER_CHROME_UUID)", + ) + parser.add_argument( + "--name", + help="Human-readable name for this recording session", + ) + parser.add_argument( + "--intent", + help="Short description of what you intend to record (guides compilation later)", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server 
URL", + ) + args = parser.parse_args() + + if not args.chrome_uuid: + print( + "Browser UUID is required. Set OPENBROWSER_CHROME_UUID or pass --chrome-uuid.", + file=sys.stderr, + ) + return 2 + + try: + # Validate browser connectivity first + browser_status = request_json(f"{args.url}/browsers/{args.chrome_uuid}/valid") + if not browser_status.get("valid", False): + msg = browser_status.get("message", "browser UUID is not valid") + print(f"Browser UUID validation failed: {msg}", file=sys.stderr) + return 1 + + # Create and start recording + payload: dict = {"browser_id": args.chrome_uuid} + if args.name: + payload["name"] = args.name + + result = request_json(f"{args.url}/recordings", method="POST", body=payload) + if not result.get("success"): + print(f"Failed to create recording: {result}", file=sys.stderr) + return 1 + + recording = result["recording"] + recording_id = recording["recording_id"] + + # Save intent note if provided + if args.intent: + request_json( + f"{args.url}/recordings/{recording_id}/intent-note", + method="POST", + body={"intent_note": args.intent}, + ) + + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except Exception as exc: + print(f"Failed to start recording: {exc}", file=sys.stderr) + return 1 + + name_display = f" ({args.name})" if args.name else "" + print(f"[recording:started] {recording_id}{name_display}", flush=True) + if args.intent: + print(f"[recording:intent] {args.intent}", flush=True) + print( + "\nA recording window has opened in Chrome.\n" + "Perform your actions in the browser, then return here and run:\n\n" + f" python3 stop_recording.py {recording_id}\n", + flush=True, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/stop_recording.py b/skill/claude/ob-routines/scripts/stop_recording.py new file mode 100644 index 0000000..6d91656 --- /dev/null +++ b/skill/claude/ob-routines/scripts/stop_recording.py 
@@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +"""Stop an active recording session. + +Sends a stop command to the Chrome extension, which closes the recording +window and flushes the event buffer. Prints the final event count so the +agent knows how much was captured before kicking off compilation. + +Example: + python3 stop_recording.py abc123-recording-id +""" + +from __future__ import annotations + +import argparse +import json +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 15, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Stop an active recording session", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("recording_id", help="Recording ID from start_recording.py") + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + try: + result = request_json( + f"{args.url}/recordings/{args.recording_id}/stop", + method="POST", + body={}, + ) + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except Exception as exc: + print(f"Failed to stop recording: {exc}", file=sys.stderr) + return 1 + + if not result.get("success"): + print(f"Stop failed: {result}", file=sys.stderr) + return 1 + + recording = result.get("recording") or {} + event_count = recording.get("event_count", "?") + name = recording.get("name") or "" + stop_reason = result.get("stop_reason", "") + + display = f" ({name})" if name else "" + 
print(f"[recording:stopped] {args.recording_id}{display}", flush=True) + print(f"[recording:events] {event_count} events captured", flush=True) + if stop_reason == "browser_disconnected": + print( + "[recording:warning] Browser was disconnected — recording marked stopped " + "locally. Event capture may be incomplete.", + flush=True, + ) + + print( + f"\nRecording stopped. To compile this recording into a routine, run:\n\n" + f" python3 compile.py {args.recording_id}\n", + flush=True, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) From f4490ffcdcd618391436e1ea8d190ca968e08ae2 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Fri, 17 Apr 2026 22:26:02 +0800 Subject: [PATCH 4/9] perf(replay): auto-confirm unique clicks and trim image window to 1 In routine-replay mode, where the compiled SOP gives the agent precise element keywords, the 2-phase click/select/keyboard_input confirmation round-trip and the 3-frame screenshot history both pay for ambiguity that does not exist. - BrowserExecutor now tracks the most recent highlight result per conversation. When the agent targets the unique element that highlight just returned, click/select/keyboard_input skip the pending-confirmation round-trip and execute directly. Falls back to 2PC in any other case. - get_context_image_window(routine_replay=True) returns 1, overriding the default of 3 for replay conversations only. - ob-routines SKILL.md: tighten /ob-routines new to ask only for the one-line goal and defer URL/site/parameter questions to the compiler. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- server/agent/context_image_window.py | 12 +++- server/agent/manager.py | 8 ++- server/agent/tools/browser_executor.py | 91 ++++++++++++++++++++++++++ skill/claude/ob-routines/SKILL.md | 17 ++++- 4 files changed, 124 insertions(+), 4 deletions(-) diff --git a/server/agent/context_image_window.py b/server/agent/context_image_window.py index c2da913..39f09cc 100644 --- a/server/agent/context_image_window.py +++ b/server/agent/context_image_window.py @@ -12,9 +12,16 @@ DEFAULT_CONTEXT_IMAGE_WINDOW = 3 -def get_context_image_window() -> int | None: +ROUTINE_REPLAY_CONTEXT_IMAGE_WINDOW = 1 + + +def get_context_image_window(routine_replay: bool = False) -> int | None: """Return the tool-image window passed to the SDK Agent. + Routine-replay conversations use a fixed window of 1: the SOP already + spells out each step, so a single most-recent screenshot is enough to + ground the next action and three-frame history would only pad context. + The default is to keep only the latest screenshot-bearing tool message. 
Environment variable semantics: - `-1`: disable SDK filtering entirely (`None`) @@ -22,6 +29,9 @@ def get_context_image_window() -> int | None: - `N >= 1`: keep the latest N screenshot-bearing tool messages """ + if routine_replay: + return ROUTINE_REPLAY_CONTEXT_IMAGE_WINDOW + raw_value = os.getenv(ENV_CONTEXT_IMAGE_WINDOW) if raw_value is None or raw_value.strip() == "": return DEFAULT_CONTEXT_IMAGE_WINDOW diff --git a/server/agent/manager.py b/server/agent/manager.py index cef99c0..7e87026 100644 --- a/server/agent/manager.py +++ b/server/agent/manager.py @@ -329,7 +329,9 @@ def _create_conversation_in_process( agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) - tool_image_window = get_context_image_window() + tool_image_window = get_context_image_window( + routine_replay=self._is_routine_replay_mode(mode) + ) condenser_llm = llm_instance.model_copy(update={"usage_id": "condenser"}) agent = Agent( llm=llm_instance, @@ -576,7 +578,9 @@ def get_or_create_conversation( agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) - tool_image_window = get_context_image_window() + tool_image_window = get_context_image_window( + routine_replay=self._is_routine_replay_mode(mode) + ) condenser_llm = llm_instance.model_copy(update={"usage_id": "condenser"}) agent = Agent( llm=llm_instance, diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index 26b3f35..81feb97 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -105,6 +105,11 @@ def __init__(self): self.conversation_id = None # Pending confirmations per conversation for 2PC actions. self.pending_confirmations: Dict[str, Dict[str, Any]] = {} + # Most recent highlight result per conversation. 
Keyed by conversation_id, + # value is the list of element dicts returned by the last highlight call. + # Used in routine-replay mode to auto-confirm clicks/selects/keyboard_input + # when the target was just uniquely highlighted. + self.last_highlight_elements: Dict[str, List[Dict[str, Any]]] = {} def _uses_small_model(self) -> bool: """Whether the active conversation uses the small-model profile.""" @@ -132,6 +137,38 @@ def _uses_small_model(self) -> bool: return is_small_model(model_name) + def _is_routine_replay_mode(self) -> bool: + """Whether the active conversation is running in routine-replay mode.""" + if not self.conversation_id: + return False + + session = session_manager.get_session(str(self.conversation_id)) + if session is None: + return False + + return session.metadata.get("mode") == "routine_replay" + + def _auto_confirm_target_id(self, requested_element_id: str) -> str | None: + """Return the resolved element id if auto-confirm applies, else None. + + In routine-replay mode, when the most recent highlight call in this + conversation returned exactly one element whose id matches the one the + agent is now targeting, we can skip the two-phase confirmation round + trip: the routine SOP's precise keywords already disambiguated the + target, so a confirmation prompt adds latency without adding safety. 
+ """ + if not self._is_routine_replay_mode(): + return None + if not self.conversation_id or not requested_element_id: + return None + recent = self.last_highlight_elements.get(self.conversation_id) + if not recent or len(recent) != 1: + return None + only_id = recent[0].get("id") + if not only_id or only_id != requested_element_id: + return None + return only_id + def __call__( self, action: OpenBrowserAction, conversation ) -> OpenBrowserObservation: @@ -333,6 +370,8 @@ def _execute_highlight_action( # Extract elements and pagination info elements = result_dict.get("data", {}).get("elements", []) total_elements = result_dict.get("data", {}).get("totalElements", 0) + if self.conversation_id: + self.last_highlight_elements[self.conversation_id] = list(elements) element_label = self._format_highlight_element_label( element_type=element_type, count=len(elements) ) @@ -366,6 +405,22 @@ def _execute_element_interaction_action( if action_type == "click": if not action.element_id: raise ValueError("click requires element_id parameter") + auto_id = self._auto_confirm_target_id(action.element_id) + if auto_id: + command = ClickElementCommand( + element_id=auto_id, + conversation_id=self.conversation_id, + tab_id=action.tab_id, + ) + result_dict = self._execute_command_sync(command) + if not result_dict or not result_dict.get("success"): + ext_error = self._extract_result_error(result_dict) + raise RuntimeError(f"Failed to click element: {ext_error}") + return self._build_observation_from_result( + result_dict, + f"Auto-confirmed and clicked element: {auto_id}", + element_id=auto_id, + ) element_preview = self._get_element_full_html(action.element_id, "click") full_html = element_preview[0] screenshot = element_preview[1] @@ -572,6 +627,23 @@ def _execute_element_interaction_action( raise ValueError("keyboard_input requires element_id parameter") if not action.text: raise ValueError("keyboard_input requires text parameter") + auto_id = 
self._auto_confirm_target_id(action.element_id) + if auto_id: + command = KeyboardInputCommand( + element_id=auto_id, + text=action.text, + conversation_id=self.conversation_id, + tab_id=action.tab_id, + ) + result_dict = self._execute_command_sync(command) + if not result_dict or not result_dict.get("success"): + ext_error = self._extract_result_error(result_dict) + raise RuntimeError(f"Failed to input text: {ext_error}") + return self._build_observation_from_result( + result_dict, + f"Auto-confirmed and input text to element: {auto_id}", + element_id=auto_id, + ) element_preview = self._get_element_full_html( action.element_id, "keyboard_input" ) @@ -622,6 +694,25 @@ def _execute_element_interaction_action( raise ValueError("select requires element_id parameter") if action.value is None: raise ValueError("select requires value parameter") + auto_id = self._auto_confirm_target_id(action.element_id) + if auto_id: + command = SelectElementCommand( + element_id=auto_id, + value=action.value, + conversation_id=self.conversation_id, + tab_id=action.tab_id, + ) + result_dict = self._execute_command_sync(command) + if not result_dict or not result_dict.get("success"): + ext_error = self._extract_result_error(result_dict) + raise RuntimeError(f"Failed to select option: {ext_error}") + value_preview = self._format_select_value_preview(action.value) + return self._build_observation_from_result( + result_dict, + f"Auto-confirmed and selected option {value_preview} in element: " + f"{auto_id}", + element_id=auto_id, + ) element_preview = self._get_element_full_html(action.element_id, "select") full_html = element_preview[0] screenshot = element_preview[1] diff --git a/skill/claude/ob-routines/SKILL.md b/skill/claude/ob-routines/SKILL.md index d9ecb7a..589bd0e 100644 --- a/skill/claude/ob-routines/SKILL.md +++ b/skill/claude/ob-routines/SKILL.md @@ -16,7 +16,7 @@ When invoked with arguments, act immediately — do not ask the user what they w |---|---| | `/ob-routines` | Show 
available routines and ask what to do | | `/ob-routines list [query]` | Run `list_routines.py [query]` and display results | -| `/ob-routines new` | Ask what flow to record, then start the full record → compile pipeline | +| `/ob-routines new` | Ask **only** for the one-line goal/intention, then start recording immediately (see "Before recording" below) | | `/ob-routines execute ` | Run `replay.py ` immediately | --- @@ -116,6 +116,21 @@ python3 skill/claude/ob-routines/scripts/list_routines.py --recordings ## Record a routine +### Before recording — DO NOT interrogate the user + +The whole point of record → compile is that the browser actions are **observed**, +and the Compiler Agent asks clarifying questions *after* it has seen them. + +Ask the user **only** for a short goal/intention (one line). Do **NOT** ask: +- which site or URL to start from +- which tool/screener to use +- how to define filter terms ("what's high-value?", "what's significant?") +- which parameters should vary between runs + +All of that is the compiler's job during Gate 1. Pre-record interrogation +defeats the pipeline and wastes the user's time. If the user's goal is vague +("find good stocks"), that's fine — start recording. The compiler will ask. 
+ ### Step 1 — start recording ```bash python3 skill/claude/ob-routines/scripts/start_recording.py \ From a9c0c7c13a268f1f2d28284b024c3ce0cc9a0c81 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 18 Apr 2026 10:22:06 +0800 Subject: [PATCH 5/9] chore: apply pre-commit formatting (black) on ob-routines scripts Co-Authored-By: Claude Opus 4.7 (1M context) --- skill/claude/ob-routines/scripts/compile.py | 27 ++++++++++++------- .../ob-routines/scripts/list_routines.py | 8 ++---- skill/claude/ob-routines/scripts/replay.py | 20 +++++++------- 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/skill/claude/ob-routines/scripts/compile.py b/skill/claude/ob-routines/scripts/compile.py index 5ed9a92..5f56b6e 100644 --- a/skill/claude/ob-routines/scripts/compile.py +++ b/skill/claude/ob-routines/scripts/compile.py @@ -29,7 +29,6 @@ from urllib.error import HTTPError, URLError from urllib.request import Request, urlopen - # --------------------------------------------------------------------------- # HTTP helpers # --------------------------------------------------------------------------- @@ -90,10 +89,11 @@ def _format_compiler_event(event_type: str, data: dict) -> None: else: # FileEditorTool, TraceViewerTool, SubmitWorkflowTool, etc. 
extras = { - k: v for k, v in action.items() - if k != "action" and v is not None + k: v for k, v in action.items() if k != "action" and v is not None } - suffix = (" " + json.dumps(extras, ensure_ascii=False)) if extras else "" + suffix = ( + (" " + json.dumps(extras, ensure_ascii=False)) if extras else "" + ) print(f"[compiler:action] {action_name}{suffix}", flush=True) else: print(f"[compiler:action] {action}", flush=True) @@ -173,7 +173,10 @@ def _stream_sse(url: str, body: dict) -> dict | None: except HTTPError as exc: body_text = exc.read().decode("utf-8", errors="replace") - print(f"[compiler:http_error] {exc.code} {exc.reason}: {body_text}", file=sys.stderr) + print( + f"[compiler:http_error] {exc.code} {exc.reason}: {body_text}", + file=sys.stderr, + ) return None return complete_result @@ -207,8 +210,10 @@ def compile_recording(base_url: str, recording_id: str, model_alias: str | None) if status == "asking": question = result.get("question", "") print(f"\n[compiler:question] {question}", flush=True) - print("[compiler:waiting_for_answer] Type your answer and press Enter:", - flush=True) + print( + "[compiler:waiting_for_answer] Type your answer and press Enter:", + flush=True, + ) try: answer = input().strip() except (EOFError, KeyboardInterrupt): @@ -258,7 +263,9 @@ def compile_recording(base_url: str, recording_id: str, model_alias: str | None) goal = result.get("goal", "") step_count = result.get("step_count", "?") routine_markdown = result.get("routine_markdown", "") - print(f"\n[compiler:complete] goal={goal!r} steps={step_count}", flush=True) + print( + f"\n[compiler:complete] goal={goal!r} steps={step_count}", flush=True + ) if routine_markdown: print(f"[compiler:routine_draft]\n{routine_markdown}", flush=True) print( @@ -335,8 +342,7 @@ def compile_recording(base_url: str, recording_id: str, model_alias: str | None) print(f"[compiler:saved] name={name!r} id={routine_id} steps={steps}", flush=True) print( - f"\nRoutine saved. 
To replay it, run:\n\n" - f" python3 replay.py {name!r}\n", + f"\nRoutine saved. To replay it, run:\n\n" f" python3 replay.py {name!r}\n", flush=True, ) return 0 @@ -345,6 +351,7 @@ def compile_recording(base_url: str, recording_id: str, model_alias: str | None) def _slugify(text: str) -> str: """Turn a goal string into a short, lowercase, hyphenated slug.""" import re + # Lowercase, keep only alnum and spaces, collapse and replace with hyphens slug = re.sub(r"[^\w\s]", "", text.lower()) slug = re.sub(r"\s+", "-", slug.strip()) diff --git a/skill/claude/ob-routines/scripts/list_routines.py b/skill/claude/ob-routines/scripts/list_routines.py index 24a1dac..a1ab1e7 100644 --- a/skill/claude/ob-routines/scripts/list_routines.py +++ b/skill/claude/ob-routines/scripts/list_routines.py @@ -37,8 +37,7 @@ def list_routines(base_url: str, query: str | None) -> int: if query: q = query.lower() items = [ - r for r in items - if q in r["name"].lower() or q in r.get("goal", "").lower() + r for r in items if q in r["name"].lower() or q in r.get("goal", "").lower() ] if not items: @@ -68,10 +67,7 @@ def list_recordings(base_url: str, query: str | None) -> int: items = data.get("recordings", []) if query: q = query.lower() - items = [ - r for r in items - if q in (r.get("name") or "").lower() - ] + items = [r for r in items if q in (r.get("name") or "").lower()] if not items: suffix = f" matching {query!r}" if query else "" diff --git a/skill/claude/ob-routines/scripts/replay.py b/skill/claude/ob-routines/scripts/replay.py index 5520fb4..8b61d7b 100644 --- a/skill/claude/ob-routines/scripts/replay.py +++ b/skill/claude/ob-routines/scripts/replay.py @@ -20,7 +20,6 @@ from urllib.error import URLError from urllib.request import Request, urlopen - # --------------------------------------------------------------------------- # HTTP helpers # --------------------------------------------------------------------------- @@ -77,8 +76,7 @@ def find_routine(base_url: str, query: str) -> dict 
| None: # 4. Substring match on name or goal sub = [ - r for r in routines - if q in r["name"].lower() or q in r.get("goal", "").lower() + r for r in routines if q in r["name"].lower() or q in r.get("goal", "").lower() ] if len(sub) == 1: return sub[0] @@ -191,11 +189,13 @@ def stream_replay( ) -> None: req = Request( f"{base_url}/agent/conversations/{conversation_id}/messages", - data=json.dumps({ - "text": task, - "cwd": cwd, - "browser_id": chrome_uuid, - }).encode("utf-8"), + data=json.dumps( + { + "text": task, + "cwd": cwd, + "browser_id": chrome_uuid, + } + ).encode("utf-8"), headers={ "Content-Type": "application/json", "Accept": "text/event-stream", @@ -272,7 +272,9 @@ def main() -> int: print(f"{'NAME':<30} {'STEPS':>5} GOAL") print("-" * 72) for r in routines: - print(f"{r['name']:<30} {r.get('step_count', '?'):>5} {r.get('goal', '')}") + print( + f"{r['name']:<30} {r.get('step_count', '?'):>5} {r.get('goal', '')}" + ) return 0 if not args.chrome_uuid: From 85dd4f7007bfb8ef06c7fe555f46314326b769d8 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 18 Apr 2026 00:44:02 +0800 Subject: [PATCH 6/9] perf(highlight): cache layout reads + spatial-index pagination MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tab init's two heaviest phases share the same shape: per-element loops that re-do work the previous step already paid for. Cut both. Scanner (highlight-detection.injected.js): wrap collectHighlightCandidates in withScanLayoutCache, which monkey-patches Element.prototype.getBoundingClientRect, SVGGraphicsElement.prototype.getBoundingClientRect, window.getComputedStyle, and Document.prototype.elementsFromPoint with per-scan WeakMap/Map caches. The scan runs in one synchronous Runtime.evaluate, so layout cannot change mid-task and caching is safe; originals are restored in finally. Also skip inert tags (script/style/meta/...) before the first layout read. 
Pagination (collision-detection.ts): SelectedSpatialIndex (96px grid) keyed on union(bbox, labelBBox) of placed elements. isPlacementFeasible now iterates only nearby placed elements via nearbySelectedFor, which queries by inflate (union(candidate.bbox, candidate.labelBBox), CLEARANCE) — covering all four collision tests. chooseLeastBlockingPlacement also uses an "influence rect" to skip re-evaluating spatially-far future candidates when a hypothetical placement cannot affect them. Measured (best run, fresh tab init): - finviz.com (349 elements): 17.8s -> 13.7s (-23%) - bluebook mock (50): 6.3s -> 5.4s (-14%) - techforum mock (34): 4.3s -> 3.9s (-11%) - 16 mock sites aggregate: -4% to -14% Correctness: - 181/181 extension unit tests pass. - Strict integration check (selector + type + labelPosition + bbox + element ORDER) passes on all 16 deterministic mock sites. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../commands/highlight-detection.injected.js | 112 ++++++++ extension/src/utils/collision-detection.ts | 269 ++++++++++++++++-- 2 files changed, 354 insertions(+), 27 deletions(-) diff --git a/extension/src/commands/highlight-detection.injected.js b/extension/src/commands/highlight-detection.injected.js index 72f3e8e..95ef232 100644 --- a/extension/src/commands/highlight-detection.injected.js +++ b/extension/src/commands/highlight-detection.injected.js @@ -77,6 +77,106 @@ function hasCallableMethod(value, methodNames) { ); } +// Layout reads (getBoundingClientRect, getComputedStyle) and elementsFromPoint +// are the single biggest cost in collectHighlightCandidates: every visibility +// predicate re-reads them for the same element. Within one synchronous +// Runtime.evaluate task no page JS runs concurrently, so the values cannot +// change mid-scan. We monkey-patch the prototypes for the duration of one +// scan, populate a per-element WeakMap, and restore originals at the end. 
+const SCAN_NON_INTERACTIVE_TAGS = new Set([ + 'script', + 'style', + 'link', + 'meta', + 'head', + 'title', + 'noscript', + 'br', + 'hr', + 'source', + 'track', + 'template', + 'param', + 'col', + 'colgroup', +]); + +function isScanSkippableTag(el) { + if (!el || !el.tagName) return false; + return SCAN_NON_INTERACTIVE_TAGS.has(el.tagName.toLowerCase()); +} + +function withScanLayoutCache(fn) { + const rectCache = new WeakMap(); + const styleCache = new WeakMap(); + // elementsFromPoint dedup keyed by rounded "x:y" + const efpCache = new Map(); + + const origElementRect = Element.prototype.getBoundingClientRect; + const SVGGraphicsProto = + typeof SVGGraphicsElement !== 'undefined' + ? SVGGraphicsElement.prototype + : null; + const origSVGRect = + SVGGraphicsProto && SVGGraphicsProto.getBoundingClientRect; + const origGetComputedStyle = window.getComputedStyle; + // Patch Document.prototype rather than the document instance so we don't + // leave an own-property shadowing the prototype after the scan finishes. + const DocumentProto = + typeof Document !== 'undefined' ? 
Document.prototype : null; + const origElementsFromPoint = + DocumentProto && DocumentProto.elementsFromPoint; + + function patchedRect() { + let r = rectCache.get(this); + if (r === undefined) { + r = origElementRect.call(this); + rectCache.set(this, r); + } + return r; + } + + Element.prototype.getBoundingClientRect = patchedRect; + if (SVGGraphicsProto && origSVGRect) { + SVGGraphicsProto.getBoundingClientRect = patchedRect; + } + + window.getComputedStyle = function (el, pseudo) { + if (pseudo) return origGetComputedStyle.call(window, el, pseudo); + let s = styleCache.get(el); + if (s === undefined) { + s = origGetComputedStyle.call(window, el); + styleCache.set(el, s); + } + return s; + }; + + if (DocumentProto && origElementsFromPoint) { + DocumentProto.elementsFromPoint = function (x, y) { + const key = Math.round(x) + ':' + Math.round(y); + let stack = efpCache.get(key); + if (stack === undefined) { + stack = origElementsFromPoint.call(this, x, y); + efpCache.set(key, stack); + } + return stack; + }; + } + + try { + return fn(); + } finally { + Element.prototype.getBoundingClientRect = origElementRect; + if (SVGGraphicsProto && origSVGRect) { + SVGGraphicsProto.getBoundingClientRect = origSVGRect; + } + window.getComputedStyle = origGetComputedStyle; + if (DocumentProto && origElementsFromPoint) { + DocumentProto.elementsFromPoint = origElementsFromPoint; + } + } +} + function createHighlightTrace() { const traceStart = performance.now(); @@ -2473,6 +2573,12 @@ function collectUploadableCandidates(trace) { } function collectHighlightCandidates(config, trace, layoutStability) { + return withScanLayoutCache(() => + collectHighlightCandidatesImpl(config, trace, layoutStability), + ); +} + +function collectHighlightCandidatesImpl(config, trace, layoutStability) { const activeTopLayerRoot = getActiveTopLayerRoot(); const registry = new Map(); @@ -2529,6 +2635,12 @@ function collectHighlightCandidates(config, trace, layoutStability) { ); } + // Tag-only fast 
reject before any layout read. Saves rect/style work on + // the long tail of inert markup (script/style/meta/...). + if (isScanSkippableTag(element)) { + continue; + } + if (!isElementInViewportForDetection(element)) { continue; } diff --git a/extension/src/utils/collision-detection.ts b/extension/src/utils/collision-detection.ts index 054abc2..da4911a 100644 --- a/extension/src/utils/collision-detection.ts +++ b/extension/src/utils/collision-detection.ts @@ -36,6 +36,108 @@ interface RemainingCandidate { element: InteractiveElement; } +// Coarse spatial grid used to skip O(N) scans of `selected` and `remaining` +// when checking collisions. Cell size is a heuristic — large enough that most +// label rects touch only a couple of cells, small enough that a typical +// query returns far fewer than the full set. +const SPATIAL_INDEX_CELL_PX = 96; + +class SelectedSpatialIndex { + private cells = new Map(); + + add(element: InteractiveElement): void { + const labelBBox = getLabelBBox( + element.bbox, + element.labelPosition ?? 'above', + element.id, + ); + const union = unionBBox(element.bbox, labelBBox); + this.forEachCell(union, (key) => { + let bucket = this.cells.get(key); + if (!bucket) { + bucket = []; + this.cells.set(key, bucket); + } + // Avoid duplicate registration when a single element straddles cells we + // visit out of order — the per-call dedup Set in queryNear handles dup + // results across cells. + if (bucket[bucket.length - 1] !== element) { + bucket.push(element); + } + }); + } + + // Returns elements whose registered union-rect lies in any cell touched by + // the query rect (inflated by clearance on each side). Includes elements + // whose registration cells are *adjacent* to the query rect — see + // `queryNear` callers, which already inflate the query rect with clearance. 
+ queryNear(query: BBox): InteractiveElement[] { + const seen = new Set(); + const out: InteractiveElement[] = []; + this.forEachCell(query, (key) => { + const bucket = this.cells.get(key); + if (!bucket) return; + for (const el of bucket) { + if (!seen.has(el)) { + seen.add(el); + out.push(el); + } + } + }); + return out; + } + + private forEachCell(rect: BBox, fn: (key: number) => void): void { + // Real bboxes from getBoundingClientRect are always finite, but synthetic + // test inputs or future callers might pass NaN/Infinity. Without this + // guard Math.floor would yield NaN, the loop would skip, and we'd + // silently drop a registration — masking real collisions. + if ( + !Number.isFinite(rect.x) || + !Number.isFinite(rect.y) || + !Number.isFinite(rect.width) || + !Number.isFinite(rect.height) + ) { + // Single sentinel cell so the registration is still discoverable. + fn(Number.MIN_SAFE_INTEGER); + return; + } + const minCx = Math.floor(rect.x / SPATIAL_INDEX_CELL_PX); + const maxCx = Math.floor( + (rect.x + Math.max(0, rect.width)) / SPATIAL_INDEX_CELL_PX, + ); + const minCy = Math.floor(rect.y / SPATIAL_INDEX_CELL_PX); + const maxCy = Math.floor( + (rect.y + Math.max(0, rect.height)) / SPATIAL_INDEX_CELL_PX, + ); + for (let cy = minCy; cy <= maxCy; cy++) { + for (let cx = minCx; cx <= maxCx; cx++) { + // Cantor-pair-ish key: cy gets the high bits, cx the low bits. + // Negative coords are uncommon for label rects but still encode safely + // because Math.floor preserves order under shift. 
+ fn(cy * 100000 + cx); + } + } + } +} + +function unionBBox(a: BBox, b: BBox): BBox { + const x = Math.min(a.x, b.x); + const y = Math.min(a.y, b.y); + const xMax = Math.max(a.x + a.width, b.x + b.width); + const yMax = Math.max(a.y + a.height, b.y + b.height); + return { x, y, width: xMax - x, height: yMax - y }; +} + +function inflateBBox(rect: BBox, padding: number): BBox { + return { + x: rect.x - padding, + y: rect.y - padding, + width: rect.width + 2 * padding, + height: rect.height + 2 * padding, + }; +} + interface PlacementEvaluation { position: LabelPosition; blockedCandidateCount: number; @@ -302,12 +404,14 @@ function buildCollisionFreePages( while (remaining.length > 0) { const selected: InteractiveElement[] = []; + const selectedIndex = new SelectedSpatialIndex(); let pageRemaining = remaining; while (pageRemaining.length > 0) { const nextSelection = chooseNextCandidate( pageRemaining, selected, + selectedIndex, viewportWidth, viewportHeight, ); @@ -316,10 +420,12 @@ function buildCollisionFreePages( break; } - selected.push({ + const placed: InteractiveElement = { ...nextSelection.candidate.element, labelPosition: nextSelection.position, - }); + }; + selected.push(placed); + selectedIndex.add(placed); pageRemaining = pageRemaining.filter( (candidate) => candidate.sourceIndex !== nextSelection.candidate.sourceIndex, @@ -347,14 +453,16 @@ function tryBuildUniformPositionPage( viewportHeight?: number, ): InteractiveElement[] | null { const selected: InteractiveElement[] = []; + const index = new SelectedSpatialIndex(); for (const element of elements) { + const nearby = nearbySelectedFor(element, position, element.id, index); if ( !isPlacementFeasible( element, element.id, position, - selected, + nearby, viewportWidth, viewportHeight, ) @@ -362,10 +470,12 @@ function tryBuildUniformPositionPage( return null; } - selected.push({ + const placed: InteractiveElement = { ...element, labelPosition: position, - }); + }; + selected.push(placed); + 
index.add(placed); } return selected; @@ -374,6 +484,7 @@ function tryBuildUniformPositionPage( function chooseNextCandidate( remaining: RemainingCandidate[], selected: InteractiveElement[], + selectedIndex: SelectedSpatialIndex, viewportWidth?: number, viewportHeight?: number, ): (PlacementEvaluation & { candidate: RemainingCandidate }) | null { @@ -388,6 +499,7 @@ function chooseNextCandidate( candidate.element, candidate.element.id, selected, + selectedIndex, viewportWidth, viewportHeight, ); @@ -415,6 +527,7 @@ function chooseNextCandidate( constrainedCandidate.feasiblePositions, remaining, selected, + selectedIndex, viewportWidth, viewportHeight, ), @@ -426,6 +539,7 @@ function chooseLeastBlockingPlacement( feasiblePositions: LabelPosition[], remaining: RemainingCandidate[], selected: InteractiveElement[], + selectedIndex: SelectedSpatialIndex, viewportWidth?: number, viewportHeight?: number, ): PlacementEvaluation { @@ -435,31 +549,106 @@ function chooseLeastBlockingPlacement( ); let bestPlacement: PlacementEvaluation | null = null; + // Pre-compute each future candidate's baseline feasible positions against + // the current `selected` set. When we test a hypothetical placement of + // `candidate@position`, only future candidates whose bbox/label is + // geometrically near that placement can have their feasibility change. The + // rest keep their baseline feasibility — saving the O(|future|×4×|selected|) + // recomputation per position. 
+ interface FutureBaseline { + candidate: RemainingCandidate; + elementUnion: BBox; // bbox ∪ all four label rects + feasibleCount: number; + totalLength: number; + } + const futureBaselines: FutureBaseline[] = futureCandidates.map((fc) => { + const baseline = getFeasiblePositions( + fc.element, + fc.element.id, + selected, + selectedIndex, + viewportWidth, + viewportHeight, + ); + let union = fc.element.bbox; + for (const pos of POSITION_PRIORITY) { + union = unionBBox(union, getLabelBBox(fc.element.bbox, pos, fc.element.id)); + } + return { + candidate: fc, + elementUnion: union, + feasibleCount: baseline.length, + totalLength: baseline.length, + }; + }); + + const baselineBlockedCount = futureBaselines.reduce( + (acc, fb) => (fb.feasibleCount === 0 ? acc + 1 : acc), + 0, + ); + const baselineTotalOptions = futureBaselines.reduce( + (acc, fb) => acc + fb.totalLength, + 0, + ); + for (const position of feasiblePositions) { - const hypotheticalSelected = [ - ...selected, - { - ...candidate.element, - labelPosition: position, - }, - ]; - let blockedCandidateCount = 0; - let totalFutureOptions = 0; - - futureCandidates.forEach((candidate) => { - const futureOptions = getFeasiblePositions( - candidate.element, - candidate.element.id, - hypotheticalSelected, - viewportWidth, - viewportHeight, - ); + const hypotheticalElement: InteractiveElement = { + ...candidate.element, + labelPosition: position, + }; + const hypotheticalLabelBBox = getLabelBBox( + candidate.element.bbox, + position, + candidate.element.id, + ); + // Influence rect: anything whose elementUnion does NOT intersect this + // (inflated by clearance) cannot be affected by adding the hypothetical + // candidate. We only need to recompute for future candidates inside it. 
+ const influenceRect = inflateBBox( + unionBBox(candidate.element.bbox, hypotheticalLabelBBox), + VISUAL_LABEL_CLEARANCE_PX, + ); - if (futureOptions.length === 0) { + let blockedCandidateCount = baselineBlockedCount; + let totalFutureOptions = baselineTotalOptions; + + for (const fb of futureBaselines) { + if (!bboxesIntersect(fb.elementUnion, influenceRect)) { + continue; + } + // Feasibility can change for this future candidate. Re-test against + // the spatially-near selected set plus the hypothetical candidate. + let updatedFeasibleLen = 0; + for (const pos of POSITION_PRIORITY) { + const nearby = nearbySelectedFor( + fb.candidate.element, + pos, + fb.candidate.element.id, + selectedIndex, + [hypotheticalElement], + ); + if ( + isPlacementFeasible( + fb.candidate.element, + fb.candidate.element.id, + pos, + nearby, + viewportWidth, + viewportHeight, + ) + ) { + updatedFeasibleLen++; + } + } + + // Adjust baseline aggregates for the delta on this single future. + if (fb.feasibleCount === 0 && updatedFeasibleLen > 0) { + blockedCandidateCount--; + } else if (fb.feasibleCount > 0 && updatedFeasibleLen === 0) { blockedCandidateCount++; } - totalFutureOptions += futureOptions.length; - }); + totalFutureOptions += updatedFeasibleLen - fb.totalLength; + } if ( !bestPlacement || @@ -492,18 +681,22 @@ function getFeasiblePositions( element: InteractiveElement, labelText: string, selected: InteractiveElement[], + selectedIndex: SelectedSpatialIndex | null, viewportWidth?: number, viewportHeight?: number, ): LabelPosition[] { const feasiblePositions: LabelPosition[] = []; for (const position of POSITION_PRIORITY) { + const nearby = selectedIndex + ? 
nearbySelectedFor(element, position, labelText, selectedIndex) + : selected; if ( isPlacementFeasible( element, labelText, position, - selected, + nearby, viewportWidth, viewportHeight, ) @@ -515,6 +708,28 @@ function getFeasiblePositions( return feasiblePositions; } +// Returns the subset of `selected` that could plausibly collide with the +// candidate placement. The query rect is the union of the candidate's bbox +// and its label rect for the requested position, inflated by the visible +// clearance threshold. Optional `extras` are appended (e.g. a hypothetical +// candidate not yet inserted into the index). +function nearbySelectedFor( + element: InteractiveElement, + position: LabelPosition, + labelText: string, + index: SelectedSpatialIndex, + extras: InteractiveElement[] = [], +): InteractiveElement[] { + const labelBBox = getLabelBBox(element.bbox, position, labelText); + const query = inflateBBox( + unionBBox(element.bbox, labelBBox), + VISUAL_LABEL_CLEARANCE_PX, + ); + const near = index.queryNear(query); + if (extras.length === 0) return near; + return near.concat(extras); +} + function isPlacementFeasible( element: InteractiveElement, labelText: string, From 9e9cc021fab4e155f67376f41b8ff18a7bbc7ea7 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 18 Apr 2026 01:18:07 +0800 Subject: [PATCH 7/9] perf(highlight): cache resolve-phase classifiers + emit perf breakdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pagination win revealed that scan-phase resolve was the new bottleneck: on finviz the resolve phase alone was 6.1s of a 6.5s scan. Per candidate, resolveClickableCandidate walks up to 5 ancestors, each calling isClickableCandidate, which calls hasExplicitClickableAncestor that walks ALL ancestors back to body, calling getSemanticClickableSignal at each. For deep DOM (finviz tables) the same elements were classified dozens of times per scan. 
Add per-scan WeakMap memoization (cleared by withScanLayoutCache) for the classifiers that are pure functions of element + DOM state: - getSemanticClickableSignal - isClickableCandidate - getBaseClickableSignal - hasExplicitClickableAncestor - getElementTextForDetection (textContent walk) - getElementSearchText Also add scan_stats / scan_times to the response payload so harness/tooling can attribute time per phase without parsing console output. Measured (best run, finviz.com/screener.ashx, ~349 candidates): - in-page scan: 6537ms -> 585ms (-91%, ~11x) - pagination: 397ms -> 300ms (already optimized in prior commit) - end-to-end: 17787ms -> 4975ms (-72%, ~3.6x) Resolve-phase breakdown after caching: 6121ms -> 51ms. Correctness: 181/181 unit tests pass. Strict integration check (selector, type, labelPosition, bbox, element ORDER) passes on all 16 deterministic mock sites — same elements, same labels, same order. finviz_real returns identical 336/6/138 element/page/page1 counts. Caching is safe because the scan runs in one synchronous Runtime.evaluate call and these classifiers depend only on DOM state that cannot mutate during the scan. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- extension/src/background/index.ts | 21 ++- .../commands/highlight-detection.injected.js | 170 ++++++++++++++++-- 2 files changed, 175 insertions(+), 16 deletions(-) diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index b35132c..5228fd9 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -583,7 +583,11 @@ async function captureHighlightedPageState( : ''; const detectedViewport = detectionResult.result.value.viewport || {}; const layoutStability = detectionResult.result.value.layoutStability; + const inPagePerf = detectionResult.result.value._perf || {}; const highlightTraceStart = Date.now(); + let paginationMs = 0; + let screenshotMs = 0; + let consistencyMs = 0; const detectedViewportWidth = typeof detectedViewport.width === 'number' ? detectedViewport.width : 0; const detectedViewportHeight = @@ -656,8 +660,9 @@ async function captureHighlightedPageState( console.log( `📄 [${logLabel}] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`, ); + paginationMs = Date.now() - paginationBuildStart; console.log( - `⏱️ [HighlightTrace] background pagination build-pages=${Date.now() - paginationBuildStart}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, + `⏱️ [HighlightTrace] background pagination build-pages=${paginationMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, ); } @@ -702,8 +707,9 @@ async function captureHighlightedPageState( console.log( `📸 [${logLabel}] Screenshot captured (with in-page highlights), size: ${screenshotResult.imageData.length} bytes`, ); + screenshotMs = Date.now() - screenshotStart; console.log( - `⏱️ [HighlightTrace] background screenshot ${Date.now() - screenshotStart}ms`, + `⏱️ [HighlightTrace] background screenshot ${screenshotMs}ms`, ); // Apply bboxes returned from the highlight injection script @@ 
-766,8 +772,9 @@ async function captureHighlightedPageState( })), currentConsistencySamples, ); + consistencyMs = Date.now() - consistencyCheckStart; console.log( - `⏱️ [HighlightTrace] background consistency-check ${Date.now() - consistencyCheckStart}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, + `⏱️ [HighlightTrace] background consistency-check ${consistencyMs}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, ); const repeatedDrift = isRepeatedHighlightDrift( highlightConsistency, @@ -841,6 +848,14 @@ async function captureHighlightedPageState( page: currentPage, pageState, readinessReasons, + _perf: { + scan_ms: typeof inPagePerf.scan_ms === 'number' ? 
inPagePerf.scan_ms : 0, + scan_stats: inPagePerf.scan_stats || {}, + scan_times: inPagePerf.scan_times || {}, + pagination_ms: paginationMs, + screenshot_ms: screenshotMs, + consistency_ms: consistencyMs, + }, ...buildScreenshotPayload(compressedScreenshotResult), }; } diff --git a/extension/src/commands/highlight-detection.injected.js b/extension/src/commands/highlight-detection.injected.js index 95ef232..9581d94 100644 --- a/extension/src/commands/highlight-detection.injected.js +++ b/extension/src/commands/highlight-detection.injected.js @@ -106,11 +106,29 @@ function isScanSkippableTag(el) { return SCAN_NON_INTERACTIVE_TAGS.has(el.tagName.toLowerCase()); } +// Per-scan memoization caches for pure-function classifiers that get hit many +// times for the same element during the resolve phase (each candidate walks +// up to 5 ancestors, each ancestor calls hasExplicitClickableAncestor which +// walks ALL ancestors, etc.). Reset at the start of each scan, leak nothing +// outside it. WeakMap so any GC'd nodes drop out automatically. 
+let _scanSemanticSignalCache = null; +let _scanClickableCandidateCache = null; +let _scanBaseClickableSignalCache = null; +let _scanTextContentCache = null; +let _scanSearchTextCache = null; +let _scanExplicitAncestorCache = null; + function withScanLayoutCache(fn) { const rectCache = new WeakMap(); const styleCache = new WeakMap(); // elementsFromPoint dedup keyed by rounded "x:y" const efpCache = new Map(); + _scanSemanticSignalCache = new WeakMap(); + _scanClickableCandidateCache = new WeakMap(); + _scanBaseClickableSignalCache = new WeakMap(); + _scanTextContentCache = new WeakMap(); + _scanSearchTextCache = new WeakMap(); + _scanExplicitAncestorCache = new WeakMap(); const origElementRect = Element.prototype.getBoundingClientRect; const SVGGraphicsProto = @@ -174,6 +192,12 @@ function withScanLayoutCache(fn) { if (DocumentProto && origElementsFromPoint) { DocumentProto.elementsFromPoint = origElementsFromPoint; } + _scanSemanticSignalCache = null; + _scanClickableCandidateCache = null; + _scanBaseClickableSignalCache = null; + _scanTextContentCache = null; + _scanSearchTextCache = null; + _scanExplicitAncestorCache = null; } } @@ -405,6 +429,15 @@ function getSwipeMarkerText(el) { } function getElementTextForDetection(el) { + if (_scanTextContentCache && _scanTextContentCache.has(el)) { + return _scanTextContentCache.get(el); + } + const r = getElementTextForDetectionImpl(el); + if (_scanTextContentCache) _scanTextContentCache.set(el, r); + return r; +} + +function getElementTextForDetectionImpl(el) { if (el instanceof HTMLInputElement) { const inputType = (el.type || '').toLowerCase(); if ( @@ -416,10 +449,22 @@ function getElementTextForDetection(el) { } } + // textContent on a deep node walks the entire subtree of text nodes — for + // a table row with hundreds of descendants this is expensive enough to + // dominate the resolve phase. Cache so each candidate pays at most once. 
return normalizeWhitespace(el.textContent || '', 240); } function getElementSearchText(el) { + if (_scanSearchTextCache && _scanSearchTextCache.has(el)) { + return _scanSearchTextCache.get(el); + } + const r = getElementSearchTextImpl(el); + if (_scanSearchTextCache) _scanSearchTextCache.set(el, r); + return r; +} + +function getElementSearchTextImpl(el) { const tokens = [ el.tagName.toLowerCase(), ...getAttributeTextTokens(el, [ @@ -580,6 +625,15 @@ function hasPointerCursor(el) { } function getBaseClickableSignal(el) { + if (_scanBaseClickableSignalCache && _scanBaseClickableSignalCache.has(el)) { + return _scanBaseClickableSignalCache.get(el); + } + const r = getBaseClickableSignalImpl(el); + if (_scanBaseClickableSignalCache) _scanBaseClickableSignalCache.set(el, r); + return r; +} + +function getBaseClickableSignalImpl(el) { const semanticSignal = getSemanticClickableSignal(el); if (semanticSignal) { return semanticSignal; @@ -673,6 +727,15 @@ function getControlAffinityScore(el) { } function getSemanticClickableSignal(el) { + if (_scanSemanticSignalCache && _scanSemanticSignalCache.has(el)) { + return _scanSemanticSignalCache.get(el); + } + const r = getSemanticClickableSignalImpl(el); + if (_scanSemanticSignalCache) _scanSemanticSignalCache.set(el, r); + return r; +} + +function getSemanticClickableSignalImpl(el) { const tag = el.tagName.toLowerCase(); const role = (el.getAttribute('role') || '').toLowerCase(); @@ -869,18 +932,30 @@ function countDirectClickableChildren(el) { } function hasExplicitClickableAncestor(el) { + if (_scanExplicitAncestorCache && _scanExplicitAncestorCache.has(el)) { + return _scanExplicitAncestorCache.get(el); + } + // Per-call top-level memoization only. 
A previous version tried to + // walk-and-memoize each visited ancestor too, but that's incorrect — + // a node's own `hasExplicitClickableAncestor` is about ITS ancestors, + // not about its own signal, and it's also influenced by its own signal + // when answering the same question for *its* descendants. Doing the full + // walk per unique element (with getSemanticClickableSignal cached) is + // already cheap enough thanks to the upstream caches. let current = el.parentElement; - + let answer = false; while (current && current !== document.body) { const signal = getSemanticClickableSignal(current); if (signal === 'semantic' || signal === 'attribute') { - return true; + answer = true; + break; } - current = current.parentElement; } - - return false; + if (_scanExplicitAncestorCache) { + _scanExplicitAncestorCache.set(el, answer); + } + return answer; } function isInputableCandidate(el) { @@ -1011,6 +1086,15 @@ function hasStructuredInteractiveDescendant(el) { } function isClickableCandidate(el) { + if (_scanClickableCandidateCache && _scanClickableCandidateCache.has(el)) { + return _scanClickableCandidateCache.get(el); + } + const r = isClickableCandidateImpl(el); + if (_scanClickableCandidateCache) _scanClickableCandidateCache.set(el, r); + return r; +} + +function isClickableCandidateImpl(el) { if (isDisabledForDetection(el)) { return null; } @@ -2625,6 +2709,27 @@ function collectHighlightCandidatesImpl(config, trace, layoutStability) { ); let scannedCount = 0; + // Per-phase reject counters and timings — gated behind the trace, helps + // identify where the scan budget is spent without per-element console spam. 
+ const phaseStats = { + tagSkip: 0, + notInViewport: 0, + notVisible: 0, + scrollParentClipped: 0, + notInActiveTopLayer: 0, + hitTestOccluded: 0, + notResolvable: 0, + matched: 0, + }; + const phaseTimes = { + tag: 0, + viewport: 0, + visible: 0, + scrollParent: 0, + topLayer: 0, + hitTest: 0, + resolve: 0, + }; for (const element of allElements) { scannedCount += 1; @@ -2635,40 +2740,65 @@ function collectHighlightCandidatesImpl(config, trace, layoutStability) { ); } - // Tag-only fast reject before any layout read. Saves rect/style work on - // the long tail of inert markup (script/style/meta/...). + let t = performance.now(); if (isScanSkippableTag(element)) { + phaseStats.tagSkip += 1; + phaseTimes.tag += performance.now() - t; continue; } + phaseTimes.tag += performance.now() - t; - if (!isElementInViewportForDetection(element)) { + t = performance.now(); + const inViewport = isElementInViewportForDetection(element); + phaseTimes.viewport += performance.now() - t; + if (!inViewport) { + phaseStats.notInViewport += 1; continue; } - if (!isElementVisibleForDetection(element)) { + t = performance.now(); + const visible = isElementVisibleForDetection(element); + phaseTimes.visible += performance.now() - t; + if (!visible) { + phaseStats.notVisible += 1; continue; } - if (!isElementVisibleInScrollParent(element)) { + t = performance.now(); + const scrollOk = isElementVisibleInScrollParent(element); + phaseTimes.scrollParent += performance.now() - t; + if (!scrollOk) { + phaseStats.scrollParentClipped += 1; continue; } - if (!isElementInActiveTopLayer(element, activeTopLayerRoot)) { + t = performance.now(); + const topLayerOk = isElementInActiveTopLayer(element, activeTopLayerRoot); + phaseTimes.topLayer += performance.now() - t; + if (!topLayerOk) { + phaseStats.notInActiveTopLayer += 1; continue; } + t = performance.now(); const hitTestVisibility = getElementHitTestVisibility(element); + phaseTimes.hitTest += performance.now() - t; if 
(!hitTestVisibility.visible) { + phaseStats.hitTestOccluded += 1; continue; } + t = performance.now(); const resolvedCandidate = resolveElementCandidate( element, config.elementType, ); + phaseTimes.resolve += performance.now() - t; if (!resolvedCandidate) { + phaseStats.notResolvable += 1; continue; } + phaseStats.matched += 1; const candidate = { element: resolvedCandidate.element, @@ -2717,14 +2847,20 @@ function collectHighlightCandidatesImpl(config, trace, layoutStability) { return element; }); + const roundedTimes = {}; + for (const k of Object.keys(phaseTimes)) { + roundedTimes[k] = Math.round(phaseTimes[k]); + } trace( 'scan:done', - `processed=${scannedCount} matched=${elements.length} counts=${JSON.stringify(counts)}`, + `processed=${scannedCount} matched=${elements.length} counts=${JSON.stringify(counts)} reject=${JSON.stringify(phaseStats)} ms=${JSON.stringify(roundedTimes)}`, ); return { elements, counts, + _scan_stats: phaseStats, + _scan_times: roundedTimes, }; } @@ -2737,11 +2873,14 @@ async function runOpenBrowserHighlightDetection(config) { const layoutStability = evaluateReadinessSnapshot(trace); - const { elements, counts } = collectHighlightCandidates( + const scanStart = performance.now(); + const scanResult = collectHighlightCandidates( config, trace, layoutStability, ); + const { elements, counts } = scanResult; + const scanMs = Math.round(performance.now() - scanStart); trace('return', `elements=${elements.length}`); return { @@ -2753,5 +2892,10 @@ async function runOpenBrowserHighlightDetection(config) { width: window.innerWidth, height: window.innerHeight, }, + _perf: { + scan_ms: scanMs, + scan_stats: scanResult._scan_stats || {}, + scan_times: scanResult._scan_times || {}, + }, }; } From e8268a86605b8d34428bec018cdcee774bb15fbb Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 18 Apr 2026 10:06:28 +0800 Subject: [PATCH 8/9] set extension auto reload timeout to 40s --- extension/vite.config.ts | 8 +++++--- 1 file changed, 5 
insertions(+), 3 deletions(-) diff --git a/extension/vite.config.ts b/extension/vite.config.ts index bf660fc..3ebf0bd 100644 --- a/extension/vite.config.ts +++ b/extension/vite.config.ts @@ -122,16 +122,18 @@ const devReloadPlugin = () => { return; } - // Otherwise wait for the extension to connect (up to 10s) + // Otherwise wait for the extension to connect (up to 40s — covers a + // full chrome.alarms keepalive cycle when the MV3 service worker has + // been terminated by Chrome). console.log( '🔄 [DevReload] Build complete — waiting for extension to connect...', ); const timeout = setTimeout(() => { console.warn( - '🔄 [DevReload] No extension connected within 10s. Reload the extension manually once, then future `npm run dev` runs will auto-reload.', + '🔄 [DevReload] No extension connected within 40s. Reload the extension manually once, then future `npm run dev` runs will auto-reload.', ); process.exit(0); - }, 10_000); + }, 40_000); // Check periodically if a client has connected const poll = setInterval(() => { From 6aac696f972b606665e28450904a0d4193ab8547 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Sat, 18 Apr 2026 10:22:17 +0800 Subject: [PATCH 9/9] chore: apply pre-commit formatting (prettier) on highlight perf changes Co-Authored-By: Claude Opus 4.7 (1M context) --- extension/src/background/index.ts | 7 +++---- extension/src/commands/highlight-detection.injected.js | 6 +----- extension/src/utils/collision-detection.ts | 5 ++++- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 5228fd9..3b0f24b 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -708,9 +708,7 @@ async function captureHighlightedPageState( `📸 [${logLabel}] Screenshot captured (with in-page highlights), size: ${screenshotResult.imageData.length} bytes`, ); screenshotMs = Date.now() - screenshotStart; - console.log( - `⏱️ [HighlightTrace] background screenshot 
${screenshotMs}ms`, - ); + console.log(`⏱️ [HighlightTrace] background screenshot ${screenshotMs}ms`); // Apply bboxes returned from the highlight injection script const preCaptureData = screenshotResult.preCaptureResult; @@ -849,7 +847,8 @@ async function captureHighlightedPageState( pageState, readinessReasons, _perf: { - scan_ms: typeof inPagePerf.scan_ms === 'number' ? inPagePerf.scan_ms : 0, + scan_ms: + typeof inPagePerf.scan_ms === 'number' ? inPagePerf.scan_ms : 0, scan_stats: inPagePerf.scan_stats || {}, scan_times: inPagePerf.scan_times || {}, pagination_ms: paginationMs, diff --git a/extension/src/commands/highlight-detection.injected.js b/extension/src/commands/highlight-detection.injected.js index 9581d94..4140016 100644 --- a/extension/src/commands/highlight-detection.injected.js +++ b/extension/src/commands/highlight-detection.injected.js @@ -2874,11 +2874,7 @@ async function runOpenBrowserHighlightDetection(config) { const layoutStability = evaluateReadinessSnapshot(trace); const scanStart = performance.now(); - const scanResult = collectHighlightCandidates( - config, - trace, - layoutStability, - ); + const scanResult = collectHighlightCandidates(config, trace, layoutStability); const { elements, counts } = scanResult; const scanMs = Math.round(performance.now() - scanStart); diff --git a/extension/src/utils/collision-detection.ts b/extension/src/utils/collision-detection.ts index da4911a..a409c64 100644 --- a/extension/src/utils/collision-detection.ts +++ b/extension/src/utils/collision-detection.ts @@ -572,7 +572,10 @@ function chooseLeastBlockingPlacement( ); let union = fc.element.bbox; for (const pos of POSITION_PRIORITY) { - union = unionBBox(union, getLabelBBox(fc.element.bbox, pos, fc.element.id)); + union = unionBBox( + union, + getLabelBBox(fc.element.bbox, pos, fc.element.id), + ); } return { candidate: fc,