From e1429fac5478f9e7d98da8d0229e07454d342e87 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 29 Apr 2026 20:03:16 +0800
Subject: [PATCH 01/14] feat(agent): pure pixel-level browser interaction with
 virtual cursor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the highlight + element_id paradigm with a human-like pixel
control loop: the agent sees a clean screenshot with a visible cursor
sprite and drives the page via a virtual mouse and keyboard. Same
toolset for fresh tasks and routine replay; the legacy highlight /
element-interaction modules stay on disk for non-agent flows but are
no longer exposed.

Tools (live agent surface): tab, mouse, keyboard, dialog.
- mouse: move (eased lerp), click (in-place — must move first), drag,
  scroll, reset. Coordinates in Qwen-VL [0,1000] normalized space; the
  server denormalizes to CSS pixels via the captured viewport.
- keyboard: type (one char at a time, real keydown/keypress/input
  events for ASCII printables, fallback insertText for CJK/emoji),
  press (named keys + modifiers; Enter/Tab/Space carry text so
  keypress fires and form-submit works), clear (Ctrl+A → Backspace).
- tab: clean screenshots with the cursor in-frame on every action;
  refresh/view/back/forward auto-fill the active tab_id.

Cursor sprite is a 36x36 white-and-black arrow with a red dot and
pulsing red ring at the click point, injected via preCaptureScript so
it lands in the captured frame even after navigation.

Schema: extend MouseClickCommand with optional x/y (accepted for wire
compatibility but ignored — click always fires at the cursor), add
MouseDragCommand, drop the le=1280/le=720 bounds on MouseMoveCommand,
and add a live_mode flag to BaseCommand for extension routing. Pixel
commands now actually reach the wire: MouseDragCommand is routed in
CommandProcessor.execute, and the extension switch has case handlers
for every pixel command.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/background/index.ts             | 377 ++++++++--
 extension/src/commands/pixel-actions.ts       | 693 ++++++++++++++++++
 extension/src/commands/tab-manager.ts         |   6 +
 extension/src/commands/virtual-cursor.ts      | 215 ++++++
 extension/src/types.ts                        |  20 +
 server/agent/api.py                           |  18 +-
 server/agent/manager.py                       |  24 +-
 server/agent/prompts/big_model/dialog_tool.j2 |   4 +-
 .../agent/prompts/big_model/keyboard_tool.j2  |  56 ++
 server/agent/prompts/big_model/mouse_tool.j2  |  84 +++
 server/agent/prompts/big_model/tab_tool.j2    |  47 +-
 .../agent/prompts/small_model/dialog_tool.j2  |   2 +-
 .../prompts/small_model/keyboard_tool.j2      |  38 +
 .../agent/prompts/small_model/mouse_tool.j2   |  59 ++
 server/agent/prompts/small_model/tab_tool.j2  |  22 +-
 server/agent/tools/base.py                    |  15 +
 server/agent/tools/browser_executor.py        | 297 ++++++++
 .../agent/tools/element_interaction_tool.py   |   3 +
 server/agent/tools/highlight_tool.py          |   3 +
 server/agent/tools/keyboard_tool.py           | 107 +++
 server/agent/tools/mouse_tool.py              | 157 ++++
 server/api/routes/commands.py                 |  10 +-
 server/core/processor.py                      |  35 +-
 server/models/commands.py                     |  68 +-
 24 files changed, 2249 insertions(+), 111 deletions(-)
 create mode 100644 extension/src/commands/pixel-actions.ts
 create mode 100644 extension/src/commands/virtual-cursor.ts
 create mode 100644 server/agent/prompts/big_model/keyboard_tool.j2
 create mode 100644 server/agent/prompts/big_model/mouse_tool.j2
 create mode 100644 server/agent/prompts/small_model/keyboard_tool.j2
 create mode 100644 server/agent/prompts/small_model/mouse_tool.j2
 create mode 100644 server/agent/tools/keyboard_tool.py
 create mode 100644 server/agent/tools/mouse_tool.py

diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 3cf9d86..280660c 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -19,6 +19,20 @@ import { tabManager } from '../commands/tab-manager';
 import { javascript } from '../commands/javascript';
 import { debuggerSessionManager } from '../commands/debugger-manager';
 import { dialogManager } from '../commands/dialog';
+import {
+  buildCursorInjectScript,
+  resolveCursorOrCenter,
+  getCursorPosition,
+} from '../commands/virtual-cursor';
+import {
+  performMouseMove,
+  performMouseClick,
+  performMouseDrag,
+  performMouseScroll,
+  performKeyboardType,
+  performKeyboardPress,
+  performResetMouse,
+} from '../commands/pixel-actions';
 import { clearScreenshotCache } from '../commands/computer';
 
 import {
@@ -688,6 +702,12 @@ interface ScreenshotPayload {
   screenshot?: string;
   dialog_auto_accepted?: unknown;
   dialog_auto_accepted_list?: unknown;
+  // Viewport metadata in CSS pixels — required by the live agent for
+  // denormalizing Qwen-VL [0,1000] coordinates to real pixels before
+  // dispatching CDP input events.
+  viewport_width?: number;
+  viewport_height?: number;
+  device_pixel_ratio?: number;
 }
 
 interface HighlightedPageStateData extends ScreenshotPayload {
@@ -716,10 +736,16 @@ function buildScreenshotPayload(
         imageData?: string;
         dialog_auto_accepted?: unknown;
         dialog_auto_accepted_list?: unknown;
+        metadata?: {
+          viewportWidth?: number;
+          viewportHeight?: number;
+          devicePixelRatio?: number;
+        };
       }
     | null
     | undefined,
 ): ScreenshotPayload {
+  const meta = screenshotResult?.metadata;
   return {
     screenshot: screenshotResult?.imageData,
     ...(screenshotResult?.dialog_auto_accepted
@@ -732,6 +758,15 @@ function buildScreenshotPayload(
           dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list,
         }
       : {}),
+    ...(typeof meta?.viewportWidth === 'number'
+      ? { viewport_width: meta.viewportWidth }
+      : {}),
+    ...(typeof meta?.viewportHeight === 'number'
+      ? { viewport_height: meta.viewportHeight }
+      : {}),
+    ...(typeof meta?.devicePixelRatio === 'number'
+      ? { device_pixel_ratio: meta.devicePixelRatio }
+      : {}),
   };
 }
 
@@ -1042,6 +1077,7 @@ async function captureHighlightedPageState(
       imageData: screenshotResult.imageData,
       dialog_auto_accepted: screenshotResult.dialog_auto_accepted,
       dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list,
+      metadata: screenshotResult.metadata,
     });
     console.log(
       `⏱️ [HighlightTrace] background compress ${Date.now() - compressStart}ms`,
@@ -1073,6 +1109,44 @@ async function captureHighlightedPageState(
   throw new Error('Failed to produce a stable highlight screenshot');
 }
 
+/**
+ * Live-mode capture: takes a clean (no-highlight) screenshot of `tabId` with
+ * the virtual cursor drawn by the pre-capture script at the position returned
+ * by `resolveCursorOrCenter`. Used for `live_mode` tab and dialog actions in
+ * place of `captureDefaultHighlightedPageState`. Resolves with a plain
+ * `ScreenshotPayload` built from the compressed capture, so callers need no
+ * shape-aware branching; capture and compression errors propagate.
+ */
+async function captureLiveCleanPageState(options: {
+  tabId: number;
+  conversationId: string;
+  logLabel: string; // tag used in the success log line
+  waitForRender?: number; // forwarded to captureScreenshot; defaults to 350
+  captureOptions?: ScreenshotCaptureOptions; // defaults to TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS
+}): Promise<ScreenshotPayload> {
+  const {
+    tabId,
+    conversationId,
+    logLabel,
+    waitForRender = 350,
+    captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
+  } = options;
+  const cursor = await resolveCursorOrCenter(tabId, conversationId);
+  const screenshotResult = await captureScreenshot(
+    tabId,
+    conversationId,
+    true, // includeCursor (no-op for CDP, kept for legacy callers)
+    90, // quality
+    false, // resizeToPreset
+    waitForRender,
+    captureOptions,
+    buildCursorInjectScript(cursor.x, cursor.y), // pre-capture script that draws the cursor
+  );
+  const compressed = await compressScreenshotResult(screenshotResult);
+  console.log(`✅ [${logLabel}] Live clean screenshot captured`);
+  return buildScreenshotPayload(compressed);
+}
+
 async function captureDefaultHighlightedPageState(options: {
   tabId: number;
   conversationId: string;
@@ -1185,6 +1259,13 @@ function isHeavyBrowserCommand(data: any): boolean {
     case 'select_element':
     case 'upload_file':
     case 'handle_dialog':
+    case 'mouse_move':
+    case 'mouse_click':
+    case 'mouse_drag':
+    case 'mouse_scroll':
+    case 'keyboard_type':
+    case 'keyboard_press':
+    case 'reset_mouse':
       return true;
     case 'tab':
       return (
@@ -1664,6 +1745,17 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         await tabManager.ensureTabManaged(activeTabId, conversationId);
         tabManager.updateTabActivity(activeTabId, conversationId);
 
+        // Resolve the virtual cursor position (defaults to viewport center on
+        // first call) and inject it into the page DOM via preCaptureScript so
+        // it appears in the captured frame. Live agents always see a cursor.
+        const cursorBeforeShot =
+          command.include_visual_mouse !== false
+            ? await resolveCursorOrCenter(activeTabId, conversationId)
+            : null;
+        const cursorPreCaptureScript = cursorBeforeShot
+          ? buildCursorInjectScript(cursorBeforeShot.x, cursorBeforeShot.y)
+          : undefined;
+
         // Take screenshot in background (no tab activation)
         const screenshotResult = await captureScreenshot(
           activeTabId,
@@ -1672,6 +1764,8 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
           command.quality || 90,
           false, // resizeToPreset: false for WYSIWYG mode
           0, // waitForRender
+          undefined, // capture options
+          cursorPreCaptureScript,
         );
         const compressedScreenshotResult =
           await compressScreenshotResult(screenshotResult);
@@ -1686,6 +1780,151 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         };
       }
 
+      // ============== Pixel-level mouse / keyboard ==============
+      // The live agent uses these instead of the highlight + element-id flow.
+      // The server has already denormalized Qwen [0,1000] coords to CSS px.
+      case 'mouse_move':
+      case 'mouse_click':
+      case 'mouse_drag':
+      case 'mouse_scroll':
+      case 'keyboard_type':
+      case 'keyboard_press':
+      case 'reset_mouse': {
+        if (!command.conversation_id) {
+          throw new Error(
+            `conversation_id is required for ${command.type} command (strict mode)`,
+          );
+        }
+        const conversationId = command.conversation_id;
+        const activeTabId = tabManager.getCurrentActiveTabId(conversationId);
+        if (!activeTabId) {
+          throw new Error(
+            `No active tab found for conversation ${conversationId}. Use tab init first.`,
+          );
+        }
+        await tabManager.ensureTabManaged(activeTabId, conversationId);
+        tabManager.updateTabActivity(activeTabId, conversationId);
+
+        let actionDetail: Record<string, unknown> = {};
+        try {
+          switch (command.type) {
+            case 'mouse_move': {
+              const r = await performMouseMove(
+                activeTabId,
+                conversationId,
+                command.x,
+                command.y,
+              );
+              actionDetail = r;
+              break;
+            }
+            case 'mouse_click': {
+              const r = await performMouseClick(
+                activeTabId,
+                conversationId,
+                command.x,
+                command.y,
+                command.button || 'left',
+                command.count || (command.double ? 2 : 1),
+              );
+              actionDetail = r;
+              break;
+            }
+            case 'mouse_drag': {
+              const r = await performMouseDrag(
+                activeTabId,
+                conversationId,
+                command.start_x,
+                command.start_y,
+                command.end_x,
+                command.end_y,
+                command.button || 'left',
+                command.steps || 10,
+              );
+              actionDetail = r;
+              break;
+            }
+            case 'mouse_scroll': {
+              const r = await performMouseScroll(
+                activeTabId,
+                conversationId,
+                command.direction || 'down',
+                command.amount || 300,
+              );
+              actionDetail = r;
+              break;
+            }
+            case 'keyboard_type': {
+              const r = await performKeyboardType(
+                activeTabId,
+                conversationId,
+                command.text || '',
+              );
+              actionDetail = r;
+              break;
+            }
+            case 'keyboard_press': {
+              const r = await performKeyboardPress(
+                activeTabId,
+                conversationId,
+                command.key || '',
+                command.modifiers,
+              );
+              actionDetail = r;
+              break;
+            }
+            case 'reset_mouse': {
+              const r = await performResetMouse(activeTabId, conversationId);
+              actionDetail = r;
+              break;
+            }
+          }
+        } catch (err) {
+          throw new Error(
+            `Pixel action ${command.type} failed: ${err instanceof Error ? err.message : String(err)}`,
+          );
+        }
+
+        // Capture a fresh post-action screenshot with the cursor visible.
+        // Actions that can navigate or trigger a heavy re-render
+        // (`mouse_click`, `mouse_drag`, and any `keyboard_press`, e.g. Enter)
+        // get a 350 ms settle window so the captured frame reflects the new
+        // state instead of a transitional DOM. Lighter actions
+        // (mouse_move, mouse_scroll, keyboard_type, reset_mouse) wait 0 ms.
+        const settleMs =
+          command.type === 'mouse_click' ||
+          command.type === 'mouse_drag' ||
+          command.type === 'keyboard_press'
+            ? 350
+            : 0;
+        const cursorAfter =
+          getCursorPosition(activeTabId) ??
+          (await resolveCursorOrCenter(activeTabId, conversationId));
+        const postScreenshotResult = await captureScreenshot(
+          activeTabId,
+          conversationId,
+          true,
+          90,
+          false,
+          settleMs,
+          undefined,
+          buildCursorInjectScript(cursorAfter.x, cursorAfter.y),
+        );
+        const compressedPost =
+          await compressScreenshotResult(postScreenshotResult);
+
+        return {
+          success: true,
+          message: `Pixel action ${command.type} completed`,
+          data: {
+            ...(compressedPost || {}),
+            pixel_action: command.type,
+            ...actionDetail,
+          },
+          timestamp: Date.now(),
+        };
+      }
+
       case 'tab': {
         // ✅ STRICT MODE: conversation_id is REQUIRED
         if (!command.conversation_id) {
@@ -1713,13 +1952,20 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
             // Set the newly created tab as active
             tabManager.setCurrentActiveTabId(conversationId, initResult.tabId);
 
-            // Capture screenshot after initialization
-            const initPageState = await captureDefaultHighlightedPageState({
-              tabId: initResult.tabId,
-              conversationId,
-              logLabel: 'Tab Init',
-              primeWithRawScreenshot: true,
-            });
+            // Capture screenshot after initialization. Live agent path
+            // returns clean+cursor; replay returns highlight inventory.
+            const initPageState = command.live_mode
+              ? await captureLiveCleanPageState({
+                  tabId: initResult.tabId,
+                  conversationId,
+                  logLabel: 'Tab Init',
+                })
+              : await captureDefaultHighlightedPageState({
+                  tabId: initResult.tabId,
+                  conversationId,
+                  logLabel: 'Tab Init',
+                  primeWithRawScreenshot: true,
+                });
 
             return {
               success: true,
@@ -1751,12 +1997,18 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
 
             // Capture screenshot after opening
             const openPageState = openResult.tabId
-              ? await captureDefaultHighlightedPageState({
-                  tabId: openResult.tabId,
-                  conversationId,
-                  logLabel: 'Tab Open',
-                  primeWithRawScreenshot: true,
-                })
+              ? command.live_mode
+                ? await captureLiveCleanPageState({
+                    tabId: openResult.tabId,
+                    conversationId,
+                    logLabel: 'Tab Open',
+                  })
+                : await captureDefaultHighlightedPageState({
+                    tabId: openResult.tabId,
+                    conversationId,
+                    logLabel: 'Tab Open',
+                    primeWithRawScreenshot: true,
+                  })
               : {};
 
             return {
@@ -1797,12 +2049,18 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
             tabManager.setCurrentActiveTabId(conversationId, command.tab_id);
 
             // Capture screenshot after switching
-            const switchPageState = await captureDefaultHighlightedPageState({
-              tabId: command.tab_id,
-              conversationId,
-              logLabel: 'Tab Switch',
-              primeWithRawScreenshot: true,
-            });
+            const switchPageState = command.live_mode
+              ? await captureLiveCleanPageState({
+                  tabId: command.tab_id,
+                  conversationId,
+                  logLabel: 'Tab Switch',
+                })
+              : await captureDefaultHighlightedPageState({
+                  tabId: command.tab_id,
+                  conversationId,
+                  logLabel: 'Tab Switch',
+                  primeWithRawScreenshot: true,
+                });
 
             return {
               success: true,
@@ -1839,12 +2097,18 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
             const refreshResult = await tabs.refreshTab(command.tab_id);
 
             // Capture screenshot after refresh
-            const refreshPageState = await captureDefaultHighlightedPageState({
-              tabId: command.tab_id,
-              conversationId,
-              logLabel: 'Tab Refresh',
-              primeWithRawScreenshot: true,
-            });
+            const refreshPageState = command.live_mode
+              ? await captureLiveCleanPageState({
+                  tabId: command.tab_id,
+                  conversationId,
+                  logLabel: 'Tab Refresh',
+                })
+              : await captureDefaultHighlightedPageState({
+                  tabId: command.tab_id,
+                  conversationId,
+                  logLabel: 'Tab Refresh',
+                  primeWithRawScreenshot: true,
+                });
 
             return {
               success: true,
@@ -1873,6 +2137,14 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
               `👁️ [Tab View] Capturing screenshot for tab ${viewActiveTabId}, conversation: ${conversationId}`,
             );
 
+            // Inject the virtual cursor before capture so live-mode screenshots
+            // always show the pointer. For replay/legacy callers, this is
+            // harmless — the cursor is just an extra DOM element below the
+            // highlight overlay's z-index.
+            const viewCursor = await resolveCursorOrCenter(
+              viewActiveTabId,
+              conversationId,
+            );
             const viewScreenshotResult = await captureScreenshot(
               viewActiveTabId,
               conversationId,
@@ -1881,6 +2153,7 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
               false,
               350,
               TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS,
+              buildCursorInjectScript(viewCursor.x, viewCursor.y),
             );
             const compressedViewScreenshotResult =
               await compressScreenshotResult(viewScreenshotResult);
@@ -1945,14 +2218,20 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
                 ? await tabs.goBack(targetTabId)
                 : await tabs.goForward(targetTabId);
 
-            const navigationPageState =
-              await captureDefaultHighlightedPageState({
-                tabId: targetTabId,
-                conversationId,
-                logLabel:
-                  command.action === 'back' ? 'Tab Back' : 'Tab Forward',
-                primeWithRawScreenshot: true,
-              });
+            const navigationPageState = command.live_mode
+              ? await captureLiveCleanPageState({
+                  tabId: targetTabId,
+                  conversationId,
+                  logLabel:
+                    command.action === 'back' ? 'Tab Back' : 'Tab Forward',
+                })
+              : await captureDefaultHighlightedPageState({
+                  tabId: targetTabId,
+                  conversationId,
+                  logLabel:
+                    command.action === 'back' ? 'Tab Back' : 'Tab Forward',
+                  primeWithRawScreenshot: true,
+                });
 
             return {
               success: true,
@@ -2213,11 +2492,17 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
               console.log(`💬 [HandleDialog] Auto-accepting cascading alert`);
               await dialogManager.autoAcceptDialog(activeTabId);
 
-              const dialogPageState = await captureDefaultHighlightedPageState({
-                tabId: activeTabId,
-                conversationId,
-                logLabel: 'HandleDialog',
-              });
+              const dialogPageState = command.live_mode
+                ? await captureLiveCleanPageState({
+                    tabId: activeTabId,
+                    conversationId,
+                    logLabel: 'HandleDialog',
+                  })
+                : await captureDefaultHighlightedPageState({
+                    tabId: activeTabId,
+                    conversationId,
+                    logLabel: 'HandleDialog',
+                  });
 
               return {
                 success: true,
@@ -2254,11 +2539,17 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
             };
           }
 
-          const dialogPageState = await captureDefaultHighlightedPageState({
-            tabId: activeTabId,
-            conversationId,
-            logLabel: 'HandleDialog',
-          });
+          const dialogPageState = command.live_mode
+            ? await captureLiveCleanPageState({
+                tabId: activeTabId,
+                conversationId,
+                logLabel: 'HandleDialog',
+              })
+            : await captureDefaultHighlightedPageState({
+                tabId: activeTabId,
+                conversationId,
+                logLabel: 'HandleDialog',
+              });
 
           console.log(
             `✅ [HandleDialog] Dialog handling complete, screenshot captured`,
diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
new file mode 100644
index 0000000..f00a150
--- /dev/null
+++ b/extension/src/commands/pixel-actions.ts
@@ -0,0 +1,693 @@
+/**
+ * Pixel-level mouse and keyboard dispatch via CDP.
+ *
+ * Used by the live agent path: the model emits Qwen-VL [0,1000] coordinates,
+ * the server denormalizes them to CSS pixels, and these helpers turn the CSS
+ * pixels into `Input.dispatchMouseEvent` / `Input.dispatchKeyEvent` calls.
+ *
+ * All entry points clamp coordinates to the live viewport before dispatch
+ * (defense-in-depth) and refresh the in-DOM virtual cursor on every call.
+ *
+ * Coordinates are CSS viewport pixels — same space CDP `Input.*` consumes.
+ */
+
+import { CdpCommander } from './cdp-commander';
+import { debuggerSessionManager } from './debugger-manager';
+import { dialogManager } from './dialog';
+import {
+  buildCursorInjectScript,
+  setCursorPosition,
+  getCursorPosition,
+  resolveCursorOrCenter,
+  buildViewportProbeScript,
+} from './virtual-cursor';
+
+/**
+ * Common pre-flight for the pixel actions: attach the CDP debugger to the
+ * tab, then enable dialog tracking so a click that opens a confirm/prompt
+ * surfaces a `dialog_opened` state instead of blocking later CDP calls.
+ * Attach failures propagate; a dialog-tracking failure is only logged.
+ */
+async function attachWithDialogTracking(
+  tabId: number,
+  conversationId: string,
+): Promise<void> {
+  await debuggerSessionManager.attachDebugger(tabId, conversationId);
+  try {
+    await dialogManager.enableForTab(tabId);
+  } catch (err) { // best-effort: the action proceeds without dialog tracking
+    console.warn(
+      `⚠️ [PixelActions] dialogManager.enableForTab failed on tab ${tabId}:`,
+      err,
+    );
+  }
+}
+
+const MODIFIER_BITS: Record<string, number> = { // lowercase modifier name -> bit flag
+  alt: 1,
+  control: 2,
+  ctrl: 2, // alias of `control`
+  meta: 4,
+  cmd: 4, // alias of `meta`
+  command: 4, // alias of `meta`
+  shift: 8,
+}; // values follow the CDP `modifiers` bit field: Alt=1, Ctrl=2, Meta/Command=4, Shift=8
+
+function modifiersBitmask(modifiers: string[] | undefined | null): number { // OR together the bits of every named modifier
+  if (!modifiers || modifiers.length === 0) return 0; // nothing held -> 0
+  let mask = 0;
+  for (const m of modifiers) {
+    const bit = MODIFIER_BITS[m.toLowerCase()]; // lookup is case-insensitive
+    if (bit) mask |= bit; // names missing from MODIFIER_BITS are silently ignored
+  }
+  return mask;
+}
+
+async function getViewport( // Probe the page for its viewport size; falls back to 1280x720 and never throws.
+  cdp: CdpCommander,
+): Promise<{ width: number; height: number }> {
+  try {
+    const probe = await cdp.sendCommand<{
+      result?: { value?: { width?: number; height?: number } };
+    }>(
+      'Runtime.evaluate',
+      {
+        expression: buildViewportProbeScript(),
+        returnByValue: true,
+      },
+      8000, // NOTE(review): presumably a timeout in ms — confirm against CdpCommander
+      0, // NOTE(review): presumably a retry count — confirm against CdpCommander
+    );
+    const value = probe?.result?.value;
+    const w = typeof value?.width === 'number' && value.width > 0 // each dimension falls back on its own
+      ? value.width
+      : 1280;
+    const h = typeof value?.height === 'number' && value.height > 0
+      ? value.height
+      : 720;
+    return { width: w, height: h };
+  } catch { // probe failed: assume a 1280x720 viewport rather than abort the action
+    return { width: 1280, height: 720 };
+  }
+}
+
+// Rounds (x, y) and clamps them into the hittable viewport [0, vw-1] x
+// [0, vh-1] (x === vw is already one pixel off-screen). Throws on NaN or
+// Infinity so a malformed target can never be stored as the cursor position.
+function clampToViewport(
+  x: number, y: number, vw: number, vh: number,
+): { x: number; y: number; warning?: string } {
+  if (!Number.isFinite(x) || !Number.isFinite(y)) {
+    throw new Error(`Invalid coordinates (${x}, ${y}): expected finite numbers`);
+  }
+  const rx = Math.round(x);
+  const ry = Math.round(y);
+  const cx = Math.max(0, Math.min(Math.max(0, vw - 1), rx));
+  const cy = Math.max(0, Math.min(Math.max(0, vh - 1), ry));
+  const clamped = cx !== rx || cy !== ry;
+  return { x: cx, y: cy, warning: clamped ? `(${rx}, ${ry}) outside viewport ${vw}x${vh}; clamped` : undefined };
+}
+
+async function refreshCursor( // Record the cursor position for `tabId` and re-run the cursor inject script in the page.
+  cdp: CdpCommander,
+  tabId: number,
+  x: number,
+  y: number,
+): Promise<void> {
+  setCursorPosition(tabId, x, y); // stored first, so it is kept even if the inject below fails
+  try {
+    await cdp.sendCommand(
+      'Runtime.evaluate',
+      {
+        expression: buildCursorInjectScript(x, y),
+        returnByValue: true,
+      },
+      8000,
+      0,
+    );
+  } catch (err) { // best-effort: a failed inject is logged, never rethrown
+    console.warn(
+      `⚠️ [PixelActions] Cursor refresh failed on tab ${tabId}:`,
+      err,
+    );
+  }
+}
+
+// Cubic ease-in-out for t in [0, 1] (0 -> 0, 0.5 -> 0.5, 1 -> 1): slow start,
+// fast middle, slow stop, so a move reads as a human reach, not a teleport.
+function easeInOut(t: number): number {
+  return t < 0.5 ? 4 * t * t * t : 1 - Math.pow(-2 * t + 2, 3) / 2;
+}
+
+function sleep(ms: number): Promise<void> { // resolves once a `setTimeout` of `ms` milliseconds fires
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+export async function performMouseMove( // Glide the virtual mouse to (x, y); resolves with the clamped target and any clamp warning.
+  tabId: number,
+  conversationId: string,
+  x: number,
+  y: number,
+): Promise<{ x: number; y: number; warning?: string }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const { width: vw, height: vh } = await getViewport(cdp);
+  const target = clampToViewport(x, y, vw, vh);
+
+  // Lerp the cursor from its last stored position to the target with eased
+  // intermediate `mouseMoved` events, so the move looks like a real human
+  // stroke: hover/mouseenter events fire in order along the path, and live
+  // observers see the cursor sprite glide instead of teleporting. With no
+  // stored position the start equals the target and the move is in place.
+  // The CSS transition on the cursor div smooths the visible sprite.
+  const start = getCursorPosition(tabId) ?? { x: target.x, y: target.y };
+  const dx = target.x - start.x;
+  const dy = target.y - start.y;
+  const distance = Math.sqrt(dx * dx + dy * dy);
+  const steps = Math.max(2, Math.min(30, Math.round(distance / 24))); // about one step per 24 px, bounded to 2–30
+  // Total move duration scales with distance (≈ 1.5 ms / px), bounded to
+  // 60–450 ms so even cross-screen sweeps complete in well under a second.
+  const totalMs = Math.max(60, Math.min(450, distance * 1.5));
+  const stepDelay = steps > 1 ? totalMs / (steps - 1) : 0; // there are steps - 1 gaps between steps
+
+  for (let i = 1; i <= steps; i++) {
+    const t = easeInOut(i / steps); // the last step has t === 1, i.e. exactly the target
+    const ix = Math.round(start.x + dx * t);
+    const iy = Math.round(start.y + dy * t);
+    await cdp.sendCommand(
+      'Input.dispatchMouseEvent',
+      {
+        type: 'mouseMoved',
+        x: ix,
+        y: iy,
+        button: 'none',
+        buttons: 0,
+      },
+      8000,
+      0,
+    );
+    if (stepDelay > 4 && i < steps) { // skip negligible sleeps and the wait after the final step
+      await sleep(stepDelay);
+    }
+  }
+
+  await refreshCursor(cdp, tabId, target.x, target.y); // store the new position and re-inject the sprite
+  return target;
+}
+
+/**
+ * Clicks in place at the virtual cursor's current position and re-injects
+ * the cursor sprite there.
+ *
+ * @param _x Ignored; kept only for wire-schema compatibility.
+ * @param _y Ignored; kept only for wire-schema compatibility.
+ * @param button Mouse button to press; defaults to `'left'`.
+ * @param count Click count, truncated to an integer and limited to 1–3.
+ * @returns The clicked position, the button, and a clamp warning if any.
+ */
+export async function performMouseClick(
+  tabId: number,
+  conversationId: string,
+  // `click` is an in-place action: to click somewhere new the agent must
+  // `move` there first, so the click always commits exactly where the agent
+  // (and any human observer) sees the cursor.
+  _x: number | undefined,
+  _y: number | undefined,
+  button: 'left' | 'right' | 'middle' = 'left',
+  count: number = 1,
+): Promise<{ x: number; y: number; button: string; warning?: string }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const { width: vw, height: vh } = await getViewport(cdp);
+
+  // Click happens at the cursor's last known position. On the very
+  // first action of a tab, fall back to viewport center — same default
+  // the cursor sprite uses on first inject.
+  const cursor =
+    getCursorPosition(tabId) ??
+    (await resolveCursorOrCenter(tabId, conversationId));
+  const clamped = clampToViewport(cursor.x, cursor.y, vw, vh);
+
+  // CDP `buttons` bitmask for this button while it is held down.
+  const heldMask = button === 'left' ? 1 : button === 'right' ? 2 : 4;
+  const safeCount = Math.max(1, Math.min(3, count | 0));
+
+  // CDP convention: emit one press/release pair per click and increment
+  // `clickCount` (1, 2, 3) so Chrome interprets it as a single → double →
+  // triple click sequence. Sending N pairs each with `clickCount:N` produces
+  // N independent N-clicks, which is wrong for double-click semantics.
+  for (let i = 1; i <= safeCount; i++) {
+    for (const type of ['mousePressed', 'mouseReleased'] as const) {
+      await cdp.sendCommand(
+        'Input.dispatchMouseEvent',
+        {
+          type,
+          x: clamped.x,
+          y: clamped.y,
+          button,
+          // `buttons` is the set of buttons still held when the event fires,
+          // so the release must report 0; echoing the pressed mask there makes
+          // the page's `mouseup` claim the button is still down.
+          buttons: type === 'mousePressed' ? heldMask : 0,
+          clickCount: i,
+        },
+        8000,
+        0,
+      );
+    }
+  }
+
+  await refreshCursor(cdp, tabId, clamped.x, clamped.y);
+  return { x: clamped.x, y: clamped.y, button, warning: clamped.warning };
+}
+
+/**
+ * Drag the virtual mouse from (startX, startY) to (endX, endY).
+ *
+ * Both endpoints are passed through `clampToViewport` using the size
+ * reported by `getViewport`. The gesture is then dispatched over CDP
+ * (`Input.dispatchMouseEvent`) as: an unpressed move to the start point,
+ * a press, interpolated moves with the button held, and a release at the
+ * end point. Finally the cursor overlay is refreshed at the end point.
+ *
+ * @param tabId Tab that receives the events.
+ * @param conversationId Conversation passed to `attachWithDialogTracking`.
+ * @param startX X coordinate where the button goes down.
+ * @param startY Y coordinate where the button goes down.
+ * @param endX X coordinate where the button is released.
+ * @param endY Y coordinate where the button is released.
+ * @param button Mouse button held during the drag (default `'left'`).
+ * @param steps Interpolated move count, truncated and limited to 2..40.
+ * @returns The clamped start/end points plus a clamp warning, if any.
+ */
+export async function performMouseDrag(
+  tabId: number,
+  conversationId: string,
+  startX: number,
+  startY: number,
+  endX: number,
+  endY: number,
+  button: 'left' | 'right' | 'middle' = 'left',
+  steps: number = 10,
+): Promise<{
+  start: { x: number; y: number };
+  end: { x: number; y: number };
+  warning?: string;
+}> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const { width, height } = await getViewport(cdp);
+  const from = clampToViewport(startX, startY, width, height);
+  const to = clampToViewport(endX, endY, width, height);
+  const moveCount = Math.max(2, Math.min(40, steps | 0));
+  // Value sent in the `buttons` field while the drag button is held.
+  let heldMask = 4;
+  if (button === 'left') heldMask = 1;
+  else if (button === 'right') heldMask = 2;
+  // Every event uses the same CDP method and timeout arguments.
+  const emit = (payload: Record<string, unknown>) =>
+    cdp.sendCommand('Input.dispatchMouseEvent', payload, 8000, 0);
+
+  // 1. Hover over the start point with nothing pressed.
+  await emit({
+    type: 'mouseMoved',
+    x: from.x,
+    y: from.y,
+    button: 'none',
+    buttons: 0,
+  });
+  // 2. Press at the start point.
+  await emit({
+    type: 'mousePressed',
+    x: from.x,
+    y: from.y,
+    button,
+    buttons: heldMask,
+    clickCount: 1,
+  });
+  // 3. Walk towards the end point; the final step uses fraction 1.
+  for (let step = 1; step <= moveCount; step += 1) {
+    const fraction = step / moveCount;
+    await emit({
+      type: 'mouseMoved',
+      x: Math.round(from.x + (to.x - from.x) * fraction),
+      y: Math.round(from.y + (to.y - from.y) * fraction),
+      button,
+      buttons: heldMask,
+    });
+  }
+  // 4. Release at the end point.
+  await emit({
+    type: 'mouseReleased',
+    x: to.x,
+    y: to.y,
+    button,
+    buttons: heldMask,
+    clickCount: 1,
+  });
+  await refreshCursor(cdp, tabId, to.x, to.y);
+  return {
+    start: { x: from.x, y: from.y },
+    end: { x: to.x, y: to.y },
+    warning: from.warning || to.warning,
+  };
+}
+
+/**
+ * Scroll with a mouse-wheel event dispatched at the current cursor position.
+ *
+ * The position is the one cached for the tab, or the result of
+ * `resolveCursorOrCenter` when nothing is cached. `amount` is truncated to
+ * an integer and limited to 1..2000, then applied as a signed delta on the
+ * axis implied by `direction` (down/right positive, up/left negative).
+ *
+ * @param tabId Tab that receives the wheel event.
+ * @param conversationId Conversation passed to `attachWithDialogTracking`.
+ * @param direction Scroll direction.
+ * @param amount Requested wheel delta magnitude.
+ * @returns The event coordinates and the deltas that were sent.
+ */
+export async function performMouseScroll(
+  tabId: number,
+  conversationId: string,
+  direction: 'up' | 'down' | 'left' | 'right',
+  amount: number,
+): Promise<{ x: number; y: number; deltaX: number; deltaY: number }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const cached = getCursorPosition(tabId);
+  const at = cached ?? (await resolveCursorOrCenter(tabId, conversationId));
+  // `| 0` truncates toward zero (and turns NaN into 0) before clamping.
+  const magnitude = Math.max(1, Math.min(2000, amount | 0));
+
+  // Only one axis is non-zero; an unrecognised direction yields 0 / 0.
+  const deltaX =
+    direction === 'right' ? magnitude : direction === 'left' ? -magnitude : 0;
+  const deltaY =
+    direction === 'down' ? magnitude : direction === 'up' ? -magnitude : 0;
+
+  await cdp.sendCommand(
+    'Input.dispatchMouseEvent',
+    { type: 'mouseWheel', x: at.x, y: at.y, deltaX, deltaY },
+    8000,
+    0,
+  );
+  // Redraw the overlay at the same point; the wheel event does not move it.
+  await refreshCursor(cdp, tabId, at.x, at.y);
+  return { x: at.x, y: at.y, deltaX, deltaY };
+}
+
+// US-keyboard lookup tables for ASCII punctuation, keyed by the character
+// typed. `keyParamsForChar` reads SHIFT_PUNCT for glyphs it reports with
+// `shift: true` and PLAIN_PUNCT for glyphs it reports with `shift: false`.
+// Both glyphs of one physical key share the same `code` and `keyCode`
+// (e.g. '`' and '~' are both Backquote / 192).
+const SHIFT_PUNCT: Record<string, { key: string; code: string; keyCode: number }> = {
+  '!': { key: '!', code: 'Digit1', keyCode: 49 },
+  '@': { key: '@', code: 'Digit2', keyCode: 50 },
+  '#': { key: '#', code: 'Digit3', keyCode: 51 },
+  $: { key: '$', code: 'Digit4', keyCode: 52 },
+  '%': { key: '%', code: 'Digit5', keyCode: 53 },
+  '^': { key: '^', code: 'Digit6', keyCode: 54 },
+  '&': { key: '&', code: 'Digit7', keyCode: 55 },
+  '*': { key: '*', code: 'Digit8', keyCode: 56 },
+  '(': { key: '(', code: 'Digit9', keyCode: 57 },
+  ')': { key: ')', code: 'Digit0', keyCode: 48 },
+  _: { key: '_', code: 'Minus', keyCode: 189 },
+  '+': { key: '+', code: 'Equal', keyCode: 187 },
+  '{': { key: '{', code: 'BracketLeft', keyCode: 219 },
+  '}': { key: '}', code: 'BracketRight', keyCode: 221 },
+  '|': { key: '|', code: 'Backslash', keyCode: 220 },
+  ':': { key: ':', code: 'Semicolon', keyCode: 186 },
+  '"': { key: '"', code: 'Quote', keyCode: 222 },
+  '<': { key: '<', code: 'Comma', keyCode: 188 },
+  '>': { key: '>', code: 'Period', keyCode: 190 },
+  '?': { key: '?', code: 'Slash', keyCode: 191 },
+  '~': { key: '~', code: 'Backquote', keyCode: 192 },
+};
+const PLAIN_PUNCT: Record<string, { key: string; code: string; keyCode: number }> = {
+  '`': { key: '`', code: 'Backquote', keyCode: 192 },
+  '-': { key: '-', code: 'Minus', keyCode: 189 },
+  '=': { key: '=', code: 'Equal', keyCode: 187 },
+  '[': { key: '[', code: 'BracketLeft', keyCode: 219 },
+  ']': { key: ']', code: 'BracketRight', keyCode: 221 },
+  '\\': { key: '\\', code: 'Backslash', keyCode: 220 },
+  ';': { key: ';', code: 'Semicolon', keyCode: 186 },
+  "'": { key: "'", code: 'Quote', keyCode: 222 },
+  ',': { key: ',', code: 'Comma', keyCode: 188 },
+  '.': { key: '.', code: 'Period', keyCode: 190 },
+  '/': { key: '/', code: 'Slash', keyCode: 191 },
+};
+
+/**
+ * Describe how to type one printable ASCII character on a US keyboard.
+ *
+ * @returns The `key`/`code`/`keyCode` for the character and whether Shift
+ *   must be held, or `null` when `ch` is not a single code unit in the
+ *   range 0x20..0x7e or has no mapping.
+ */
+function keyParamsForChar(
+  ch: string,
+): { key: string; code: string; keyCode: number; shift: boolean } | null {
+  const unit = ch.length === 1 ? ch.charCodeAt(0) : -1;
+  if (unit < 0x20 || unit > 0x7e) return null;
+  if (unit === 0x20) {
+    return { key: ' ', code: 'Space', keyCode: 32, shift: false };
+  }
+  if (unit >= 0x30 && unit <= 0x39) {
+    return { key: ch, code: `Digit${ch}`, keyCode: unit, shift: false };
+  }
+  const isUpper = unit >= 0x41 && unit <= 0x5a;
+  if (isUpper || (unit >= 0x61 && unit <= 0x7a)) {
+    // Both cases of a letter use the uppercase code point as key code.
+    const upper = ch.toUpperCase();
+    return {
+      key: ch,
+      code: `Key${upper}`,
+      keyCode: upper.charCodeAt(0),
+      shift: isUpper,
+    };
+  }
+  const plain = PLAIN_PUNCT[ch];
+  if (plain) return { ...plain, shift: false };
+  const shifted = SHIFT_PUNCT[ch];
+  if (shifted) return { ...shifted, shift: true };
+  return null;
+}
+
+/**
+ * Type `text` at the current keyboard focus, one character at a time.
+ *
+ * Printable ASCII is sent as real `keyDown`/`keyUp` pairs (the `text` on
+ * keyDown makes Chrome fire keypress + input too) so per-character page
+ * handlers react in order. Everything else (CJK, emoji, accented Latin)
+ * is inserted with `Input.insertText`, which fires `input` but no keydown.
+ *
+ * @param text Literal text to type.
+ * @returns `text.length` in UTF-16 code units.
+ */
+export async function performKeyboardType(
+  tabId: number,
+  conversationId: string,
+  text: string,
+): Promise<{ length: number }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+
+  // Small pause between characters keeps the cadence human-paced.
+  const PER_CHAR_DELAY_MS = 28;
+  // Iterate by code point: indexing by UTF-16 unit would split an emoji
+  // into two lone surrogates and insert each half separately.
+  const chars = Array.from(text);
+  for (const [i, ch] of chars.entries()) {
+    const params = keyParamsForChar(ch);
+    if (params) {
+      const common = {
+        key: params.key,
+        code: params.code,
+        windowsVirtualKeyCode: params.keyCode,
+        modifiers: params.shift ? 8 : 0, // 8 = Shift in the CDP bitmask
+      };
+      await cdp.sendCommand(
+        'Input.dispatchKeyEvent',
+        {
+          type: 'keyDown',
+          ...common,
+          text: params.key,
+          unmodifiedText: params.key,
+        },
+        8000,
+        0,
+      );
+      await cdp.sendCommand(
+        'Input.dispatchKeyEvent',
+        { type: 'keyUp', ...common },
+        8000,
+        0,
+      );
+    } else {
+      await cdp.sendCommand('Input.insertText', { text: ch }, 8000, 0);
+    }
+    if (PER_CHAR_DELAY_MS > 0 && i < chars.length - 1) {
+      await sleep(PER_CHAR_DELAY_MS);
+    }
+  }
+  return { length: text.length };
+}
+
+const NAMED_KEY_MAP: Record<
+  string,
+  { key: string; code: string; keyCode?: number }
+> = {
+  enter: { key: 'Enter', code: 'Enter', keyCode: 13 },
+  return: { key: 'Enter', code: 'Enter', keyCode: 13 },
+  escape: { key: 'Escape', code: 'Escape', keyCode: 27 },
+  esc: { key: 'Escape', code: 'Escape', keyCode: 27 },
+  tab: { key: 'Tab', code: 'Tab', keyCode: 9 },
+  backspace: { key: 'Backspace', code: 'Backspace', keyCode: 8 },
+  delete: { key: 'Delete', code: 'Delete', keyCode: 46 },
+  arrowup: { key: 'ArrowUp', code: 'ArrowUp', keyCode: 38 },
+  arrowdown: { key: 'ArrowDown', code: 'ArrowDown', keyCode: 40 },
+  arrowleft: { key: 'ArrowLeft', code: 'ArrowLeft', keyCode: 37 },
+  arrowright: { key: 'ArrowRight', code: 'ArrowRight', keyCode: 39 },
+  pageup: { key: 'PageUp', code: 'PageUp', keyCode: 33 },
+  pagedown: { key: 'PageDown', code: 'PageDown', keyCode: 34 },
+  home: { key: 'Home', code: 'Home', keyCode: 36 },
+  end: { key: 'End', code: 'End', keyCode: 35 },
+  space: { key: ' ', code: 'Space', keyCode: 32 },
+};
+
+/**
+ * Resolve a key name from the agent into CDP `key`/`code`/`keyCode` values.
+ *
+ * Lookup order: case-insensitive match in NAMED_KEY_MAP; then a single
+ * printable ASCII character via `keyParamsForChar`, so punctuation such as
+ * "/" gets a real `code` and key code instead of echoing the character as
+ * its `code`; finally a verbatim pass-through without `keyCode`.
+ */
+function resolveNamedKey(rawKey: string): {
+  key: string;
+  code: string;
+  keyCode?: number;
+} {
+  // Own-property check: a plain index lookup would also return inherited
+  // Object.prototype members (e.g. for "constructor") as if they were keys.
+  const name = rawKey.toLowerCase();
+  const named = Object.prototype.hasOwnProperty.call(NAMED_KEY_MAP, name)
+    ? NAMED_KEY_MAP[name]
+    : undefined;
+  if (named) return named;
+
+  // Letters, digits, space and punctuation share the typing-path mapping.
+  const printable = keyParamsForChar(rawKey);
+  if (printable) {
+    const { key, code, keyCode } = printable;
+    return { key, code, keyCode };
+  }
+  // Unknown: pass through verbatim.
+  return { key: rawKey, code: rawKey };
+}
+
+// Keys that produce a character — these need a `text` field on the
+// keyDown so Chrome fires `keypress` (which is what most form-submit
+// handlers and search shortcuts listen for). Without the text, keypress
+// never fires and pressing Enter looks like nothing happened.
+const KEY_TEXT: Record<string, string> = {
+  Enter: '\r',
+  Tab: '\t',
+  Space: ' ',
+};
+
+/**
+ * Press and release one key, optionally with modifiers held.
+ *
+ * `text` is attached to the keyDown (so keypress fires) when the key has
+ * an entry in KEY_TEXT, or when it is a single character pressed without
+ * modifiers. Shortcuts such as Ctrl+A carry no text on purpose.
+ *
+ * @param tabId Tab that receives the key events.
+ * @param conversationId Conversation passed to `attachWithDialogTracking`.
+ * @param rawKey Key name or single character; see `resolveNamedKey`.
+ * @param modifiers Modifier names converted by `modifiersBitmask`.
+ * @returns The resolved `key` and the modifier bitmask that were sent.
+ */
+export async function performKeyboardPress(
+  tabId: number,
+  conversationId: string,
+  rawKey: string,
+  modifiers: string[] | undefined,
+): Promise<{ key: string; modifiers: number }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const resolved = resolveNamedKey(rawKey);
+  const mod = modifiersBitmask(modifiers);
+
+  // Own-property check so key names that collide with Object.prototype
+  // members (e.g. "toString") are not mistaken for KEY_TEXT entries.
+  let text: string | undefined;
+  if (Object.prototype.hasOwnProperty.call(KEY_TEXT, resolved.key)) {
+    text = KEY_TEXT[resolved.key];
+  } else if (resolved.key.length === 1 && mod === 0) {
+    text = resolved.key;
+  }
+
+  const shared: Record<string, unknown> = {
+    key: resolved.key,
+    code: resolved.code,
+    modifiers: mod,
+  };
+  if (resolved.keyCode !== undefined) {
+    shared.windowsVirtualKeyCode = resolved.keyCode;
+  }
+  const down: Record<string, unknown> = { type: 'keyDown', ...shared };
+  if (text !== undefined) {
+    down.text = text;
+    down.unmodifiedText = text;
+  }
+
+  await cdp.sendCommand('Input.dispatchKeyEvent', down, 8000, 0);
+  const up = { type: 'keyUp', ...shared };
+  await cdp.sendCommand('Input.dispatchKeyEvent', up, 8000, 0);
+  return { key: resolved.key, modifiers: mod };
+}
+
+/**
+ * Clear the currently focused input: press Control+A, wait 20 ms, then
+ * press Backspace. Saves the agent from chaining two `press` calls.
+ * Always resolves to `{ cleared: true }`; the field is not read back.
+ */
+export async function performKeyboardClear(
+  tabId: number,
+  conversationId: string,
+): Promise<{ cleared: true }> {
+  // Select all. NOTE(review): Control+A is sent on every OS — confirm macOS.
+  await performKeyboardPress(tabId, conversationId, 'a', ['Control']);
+  await sleep(20);
+  await performKeyboardPress(tabId, conversationId, 'Backspace', undefined);
+  return { cleared: true };
+}
+
+/**
+ * Move the virtual mouse to the centre of the viewport with no button
+ * pressed and redraw the cursor overlay there.
+ *
+ * @param tabId Tab whose cursor is reset.
+ * @param conversationId Conversation passed to `attachWithDialogTracking`.
+ * @returns The centre point, in the units reported by `getViewport`.
+ */
+export async function performResetMouse(
+  tabId: number,
+  conversationId: string,
+): Promise<{ x: number; y: number }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const viewport = await getViewport(cdp);
+  const center = {
+    x: Math.round(viewport.width / 2),
+    y: Math.round(viewport.height / 2),
+  };
+  const hover = { type: 'mouseMoved', ...center, button: 'none', buttons: 0 };
+  await cdp.sendCommand('Input.dispatchMouseEvent', hover, 8000, 0);
+  await refreshCursor(cdp, tabId, center.x, center.y);
+  return center;
+}
diff --git a/extension/src/commands/tab-manager.ts b/extension/src/commands/tab-manager.ts
index 37e8b3b..1d866bc 100644
--- a/extension/src/commands/tab-manager.ts
+++ b/extension/src/commands/tab-manager.ts
@@ -4,6 +4,8 @@
  * Inspired by MANUS Chrome Plugin design
  */
 
+import { clearCursorPosition } from './virtual-cursor';
+
 // Tab group constants
 const TAB_GROUP_NAME = 'OpenBrowser';
 const TAB_GROUP_COLOR = 'grey' as chrome.tabGroups.Color;
@@ -855,6 +857,10 @@ export class TabManager {
   private setupListeners(): void {
     // Listen for tab removal
     chrome.tabs.onRemoved.addListener((tabId) => {
+      // Drop any cached virtual-cursor position for this tab so the entry
+      // doesn't outlive the tab.
+      clearCursorPosition(tabId);
+
       // Find which session this tab belongs to
       for (const [conversationId, session] of this.sessions.entries()) {
         if (session.managedTabs.has(tabId)) {
diff --git a/extension/src/commands/virtual-cursor.ts b/extension/src/commands/virtual-cursor.ts
new file mode 100644
index 0000000..f62fd59
--- /dev/null
+++ b/extension/src/commands/virtual-cursor.ts
@@ -0,0 +1,215 @@
+/**
+ * Virtual on-screen cursor — DOM overlay rendered into the page so the agent
+ * can see where the pointer is in every screenshot.
+ *
+ * Why an in-page DOM overlay instead of the native OS cursor: CDP
+ * `Page.captureScreenshot` does not include the OS cursor (see
+ * `screenshot.ts` includeCursor docstring). So the live agent path renders a
+ * 36×36 SVG arrow into the document with `position:fixed; pointer-events:none;
+ * z-index: 2147483646` (one below the highlight overlay's z-index).
+ *
+ * The cursor is **always** injected via `preCaptureScript` immediately before
+ * `Page.captureScreenshot` runs, so it appears fresh on whatever DOM exists at
+ * the moment of capture. This sidesteps races against navigation / async
+ * layout that a `chrome.webNavigation.onCommitted` listener would have.
+ *
+ * The hotspot (click point) is the upper-left tip of the arrow (pixel (2, 2)
+ * inside the 36×36 sprite), matching OS cursor convention. The agent's `(x,
+ * y)` coordinate aligns with the arrow's tip — the body extends down-right
+ * away from the target.
+ */
+
+import { CdpCommander } from './cdp-commander';
+import { debuggerSessionManager } from './debugger-manager';
+
+const CURSOR_OVERLAY_ID = '__ob_cursor_overlay__';
+const CURSOR_Z_INDEX = 2147483646;
+// Cursor sprite size in CSS pixels.
+const CURSOR_SIZE = 36;
+// Hotspot (arrow tip / dot centre) offset inside the sprite, in CSS pixels.
+const CURSOR_HOTSPOT = 2;
+
+/**
+ * Build a JS source string that creates or moves the virtual cursor so its
+ * hotspot sits at (x, y), then returns viewport metadata. Pass it as the
+ * `preCaptureScript` of `captureScreenshot` so the cursor is in the frame.
+ *
+ * Coordinates are CSS viewport pixels; negative or non-finite values become
+ * 0. Idempotent: the overlay is created once, later calls only move it.
+ */
+export function buildCursorInjectScript(x: number, y: number): string {
+  const safeX = Number.isFinite(x) ? Math.max(0, Math.round(x)) : 0;
+  const safeY = Number.isFinite(y) ? Math.max(0, Math.round(y)) : 0;
+  const spriteX = safeX - CURSOR_HOTSPOT;
+  const spriteY = safeY - CURSOR_HOTSPOT;
+  return `
+    (() => {
+      try {
+        const ID = ${JSON.stringify(CURSOR_OVERLAY_ID)};
+        const Z = ${CURSOR_Z_INDEX};
+        const SZ = ${CURSOR_SIZE};
+        let host = document.getElementById(ID);
+        if (!host) {
+          host = document.createElement('div');
+          host.id = ID;
+          host.setAttribute('data-ob-virtual-cursor', '1');
+          host.style.cssText = [
+            'position:fixed',
+            'top:0',
+            'left:0',
+            'width:' + SZ + 'px',
+            'height:' + SZ + 'px',
+            'pointer-events:none',
+            'z-index:' + Z,
+            'will-change:transform',
+            // No paint containment: it would clip the ring and dot, which
+            // extend past the sprite's top-left edge.
+            'contain:layout style',
+            // Smooth interpolation between consecutive position updates so
+            // the cursor glides instead of teleporting when watched live;
+            // a capture taken mid-transition shows the in-between frame.
+            'transition:transform 120ms cubic-bezier(.25,.46,.45,.94)',
+          ].join(';');
+          // Layered sprite:
+          // 1. A red ring + dot at the click hotspot (top-left, hotspot 2,2).
+          //    The ring pulses subtly so the agent can spot it even on busy
+          //    or low-contrast pages.
+          // 2. A white arrow with a thick black outline and strong drop
+          //    shadow on top. The arrow's tip aligns with the dot so the
+          //    intended click point is unambiguous in screenshots.
+          host.innerHTML = [
+            '<style>',
+            '  @keyframes __ob_cursor_pulse {',
+            '    0%   { transform: scale(1);   opacity: 0.85; }',
+            '    50%  { transform: scale(1.45); opacity: 0.4; }',
+            '    100% { transform: scale(1);   opacity: 0.85; }',
+            '  }',
+            '  .__ob_cursor_ring {',
+            '    position:absolute; left:-5px; top:-5px;',
+            '    width:14px; height:14px; border-radius:50%;',
+            '    background: rgba(220, 38, 38, 0.55);',
+            '    box-shadow: 0 0 0 2px #fff, 0 0 8px rgba(0,0,0,0.55);',
+            '    transform-origin: center;',
+            '    animation: __ob_cursor_pulse 1.4s ease-in-out infinite;',
+            '  }',
+            '  .__ob_cursor_dot {',
+            '    position:absolute; left:-2px; top:-2px;',
+            '    width:8px; height:8px; border-radius:50%;',
+            '    background:#dc2626;',
+            '    box-shadow: 0 0 0 1.5px #fff;',
+            '  }',
+            '</style>',
+            '<div class="__ob_cursor_ring"></div>',
+            '<div class="__ob_cursor_dot"></div>',
+            '<svg xmlns="http://www.w3.org/2000/svg" width="' + SZ + '" height="' + SZ + '"',
+            ' viewBox="0 0 36 36"',
+            ' style="display:block;position:relative;filter:drop-shadow(0 2px 3px rgba(0,0,0,0.6))">',
+            '  <path d="M2 2 L2 28 L10 20 L15 31 L20 29 L15 18 L26 18 Z"',
+            '   fill="#ffffff" stroke="#000000" stroke-width="2"',
+            '   stroke-linejoin="round" />',
+            '</svg>',
+          ].join('');
+          (document.documentElement || document.body || document).appendChild(host);
+        }
+        host.style.transform = 'translate(' + ${spriteX} + 'px,' + ${spriteY} + 'px)';
+        return {
+          ok: true,
+          x: ${safeX},
+          y: ${safeY},
+          viewportWidth: window.innerWidth,
+          viewportHeight: window.innerHeight,
+          devicePixelRatio: window.devicePixelRatio || 1,
+        };
+      } catch (err) {
+        return { ok: false, error: String(err) };
+      }
+    })()
+  `;
+}
+
+/**
+ * Build a JS expression that evaluates to `{ width, height }`, taken from
+ * `window.innerWidth` / `window.innerHeight`. `resolveCursorOrCenter` runs
+ * it via `Runtime.evaluate` to find the viewport center for a new cursor.
+ */
+export function buildViewportProbeScript(): string {
+  return `({ width: window.innerWidth, height: window.innerHeight })`;
+}
+
+/**
+ * Track the virtual cursor position per tab. The position is updated every
+ * time the agent issues a pixel action; the next screenshot's
+ * `preCaptureScript` reads from here.
+ *
+ * There is no default entry: `getCursorPosition` returns `undefined` until a
+ * position is stored, so callers should use `resolveCursorOrCenter()`.
+ */
+const cursorByTab = new Map<number, { x: number; y: number }>();
+
+export function setCursorPosition(tabId: number, x: number, y: number): void {
+  cursorByTab.set(tabId, {
+    x: Math.max(0, Math.round(x)),
+    y: Math.max(0, Math.round(y)),
+  });
+}
+
+export function getCursorPosition(
+  tabId: number,
+): { x: number; y: number } | undefined {
+  return cursorByTab.get(tabId);
+}
+
+export function clearCursorPosition(tabId: number): void {
+  cursorByTab.delete(tabId);
+}
+
+/**
+ * Return the cached cursor position for `tabId`, or choose and cache a
+ * default when none exists yet.
+ *
+ * The default is the viewport centre, computed from `window.innerWidth` /
+ * `innerHeight` read via CDP `Runtime.evaluate`. A missing or non-positive
+ * dimension is replaced by 1280 (width) or 720 (height); if the probe
+ * throws, (640, 360) is used. Either way the result is stored for the tab.
+ *
+ * @returns CSS pixel coordinates suitable for `buildCursorInjectScript`.
+ */
+export async function resolveCursorOrCenter(
+  tabId: number,
+  conversationId: string,
+): Promise<{ x: number; y: number }> {
+  const cached = cursorByTab.get(tabId);
+  if (cached) return cached;
+
+  // A dimension counts only when it is a positive number.
+  const positiveOr = (n: unknown, fallback: number): number =>
+    typeof n === 'number' && n > 0 ? n : fallback;
+
+  let position: { x: number; y: number };
+  try {
+    await debuggerSessionManager.attachDebugger(tabId, conversationId);
+    const reply = await new CdpCommander(tabId).sendCommand(
+      'Runtime.evaluate',
+      { expression: buildViewportProbeScript(), returnByValue: true },
+      3000,
+      0,
+    );
+    const size = (reply as { result?: { value?: unknown } } | undefined)
+      ?.result?.value as
+      | { width?: number; height?: number }
+      | undefined;
+    position = {
+      x: Math.round(positiveOr(size?.width, 1280) / 2),
+      y: Math.round(positiveOr(size?.height, 720) / 2),
+    };
+  } catch (err) {
+    console.warn(
+      `⚠️ [VirtualCursor] resolveCursorOrCenter failed on tab ${tabId}:`,
+      err,
+    );
+    // Probe failed: fall back to the centre of a 1280×720 viewport.
+    position = { x: 640, y: 360 };
+  }
+  cursorByTab.set(tabId, position);
+  return position;
+}
diff --git a/extension/src/types.ts b/extension/src/types.ts
index 97a72e0..f0dc526 100644
--- a/extension/src/types.ts
+++ b/extension/src/types.ts
@@ -21,6 +21,10 @@ export interface BaseCommand {
   timestamp?: number;
   tab_id?: number;
   conversation_id?: string; // For multi-session support
+  // When true, the live agent path is active: skip highlight injection and
+  // return a clean screenshot with the virtual cursor. Default false keeps
+  // routine-replay's highlight + element-id behavior.
+  live_mode?: boolean;
 }
 
 export interface MouseMoveCommand extends BaseCommand {
@@ -35,6 +39,21 @@ export interface MouseClickCommand extends BaseCommand {
   button?: MouseButton;
   double?: boolean;
   count?: number;
+  // Optional CSS-pixel target, kept on the wire schema for compatibility.
+  // The extension ignores it: the click always fires at the cursor's current
+  // position, so the agent must `move` to the target first.
+  x?: number;
+  y?: number;
+}
+
+export interface MouseDragCommand extends BaseCommand {
+  type: 'mouse_drag';
+  start_x: number;
+  start_y: number;
+  end_x: number;
+  end_y: number;
+  button?: MouseButton;
+  steps?: number;
 }
 
 export interface MouseScrollCommand extends BaseCommand {
@@ -294,6 +313,7 @@ export interface GroundedElementsResponse {
 export type Command =
   | MouseMoveCommand
   | MouseClickCommand
+  | MouseDragCommand
   | MouseScrollCommand
   | ResetMouseCommand
   | KeyboardTypeCommand
diff --git a/server/agent/api.py b/server/agent/api.py
index 53aa826..0e59450 100644
--- a/server/agent/api.py
+++ b/server/agent/api.py
@@ -497,14 +497,24 @@ def initialize_agent():
         # Import the old OpenBrowserTool for backward compatibility
         # logger.info("OpenBrowserTool registered (deprecated, for backward compatibility)")
 
-        # Import new focused tools to ensure they're registered
+        # Tools exposed to the agent: tab, mouse, keyboard, dialog. The
+        # legacy highlight + element_interaction modules are imported only
+        # to keep them importable for non-agent flows; they are not in the
+        # agent's toolset.
         from .tools.tab_tool import TabTool
-        from .tools.highlight_tool import HighlightTool
-        from .tools.element_interaction_tool import ElementInteractionTool
         from .tools.dialog_tool import DialogTool
+        from .tools.mouse_tool import MouseTool
+        from .tools.keyboard_tool import KeyboardTool
+        # Imported for legacy tooling (routine recording) — not registered
+        # for the live agent.
+        from .tools.highlight_tool import HighlightTool  # noqa: F401
+        from .tools.element_interaction_tool import (  # noqa: F401
+            ElementInteractionTool,
+        )
 
         logger.info(
-            "4 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog"
+            "4 OpenBrowser tools registered for the agent: "
+            "tab, mouse, keyboard, dialog"
         )
 
     except Exception as e:
diff --git a/server/agent/manager.py b/server/agent/manager.py
index 7e87026..d59c726 100644
--- a/server/agent/manager.py
+++ b/server/agent/manager.py
@@ -93,10 +93,14 @@ def __init__(self, multi_process_mode: bool = False):
         else:
             logger.info("AgentManager initialized in single-process mode")
 
+        # The agent drives the browser like a human, with a virtual mouse,
+        # keyboard, and screenshots — same toolset for fresh tasks and routine
+        # replay. The legacy highlight + element_interaction tools live on
+        # disk for non-agent flows (recording tooling) but are not exposed.
         self.browser_tools = [
             Tool(name="tab"),  # Tab management
-            Tool(name="highlight"),  # Element discovery with visual overlays
-            Tool(name="element_interaction"),  # Click/input with 2PC, others direct
+            Tool(name="mouse"),  # Virtual mouse: move/click/drag/scroll/reset
+            Tool(name="keyboard"),  # Virtual keyboard: type/press
             Tool(name="dialog"),  # Browser dialog handling
         ]
         self.general_tools = [
@@ -134,9 +138,17 @@ def _resolve_llm_settings(
         return model_to_use, base_url_to_use, selected_llm
 
     def _get_tools_for_model(
-        self, model: Optional[str] = None, model_alias: Optional[str] = None
+        self,
+        model: Optional[str] = None,
+        model_alias: Optional[str] = None,
+        mode: Optional[str] = None,
     ) -> list[Tool]:
-        """Return the tool list for a model tier."""
+        """Return the tool list for a model tier.
+
+        ``mode`` is accepted for back-compat but no longer changes the
+        toolset — every conversation, including routine replay, uses the
+        pixel paradigm.
+        """
         self._resolve_llm_settings(model=model, model_alias=model_alias)
         return list(self.browser_tools) + list(self.general_tools)
 
@@ -328,7 +340,7 @@ def _create_conversation_in_process(
         # Create agent with tools
         agent_context = self._build_agent_context()
         llm_instance = self._create_llm_from_config(model, base_url, model_alias)
-        tools = self._get_tools_for_model(model, model_alias)
+        tools = self._get_tools_for_model(model, model_alias, mode)
         tool_image_window = get_context_image_window(
             routine_replay=self._is_routine_replay_mode(mode)
         )
@@ -577,7 +589,7 @@ def get_or_create_conversation(
         # Create agent with tools
         agent_context = self._build_agent_context()
         llm_instance = self._create_llm_from_config(model, base_url, model_alias)
-        tools = self._get_tools_for_model(model, model_alias)
+        tools = self._get_tools_for_model(model, model_alias, mode)
         tool_image_window = get_context_image_window(
             routine_replay=self._is_routine_replay_mode(mode)
         )
diff --git a/server/agent/prompts/big_model/dialog_tool.j2 b/server/agent/prompts/big_model/dialog_tool.j2
index a970f18..9b2f866 100644
--- a/server/agent/prompts/big_model/dialog_tool.j2
+++ b/server/agent/prompts/big_model/dialog_tool.j2
@@ -46,11 +46,11 @@ Handle the currently open dialog.
 
 - **Handle immediately**: Automation is blocked until you handle the dialog
 - **One at a time**: After handling, check if another dialog appeared (cascading dialogs)
-- **Returns highlighted screenshot**: When dialog handling completes without another blocking dialog, you get the default `highlight` `element_type: "any"` page 1 screenshot of the resulting page state
+- **Returns clean screenshot with cursor**: When dialog handling completes without another blocking dialog, you get a clean screenshot of the resulting page state with the virtual cursor visible.
 
 ## Screenshot Behavior
 
-The dialog tool returns the default `highlight` `element_type: "any"` page 1 screenshot after handling the dialog, showing the resulting page state with fresh interactive IDs.
+The dialog tool returns a clean screenshot of the resulting page state after handling the dialog, with the virtual cursor visible.
 
 ## Error Handling
 
diff --git a/server/agent/prompts/big_model/keyboard_tool.j2 b/server/agent/prompts/big_model/keyboard_tool.j2
new file mode 100644
index 0000000..a31437b
--- /dev/null
+++ b/server/agent/prompts/big_model/keyboard_tool.j2
@@ -0,0 +1,56 @@
+# Keyboard Tool
+
+Type text and press keys at the current focus. Click a field with `mouse` first to focus it; then use this tool.
+
+## Actions
+
+### type
+Type literal text where the keyboard focus is right now. Characters are sent one at a time, with a small delay between each, so per-character handlers (autocomplete, live validation) react in order — just like a real keyboard.
+
+```json
+{ "action": "type", "text": "hello world" }
+```
+
+`type` does not interpret special characters — newlines and tabs are inserted literally. For Enter / Tab / shortcuts, use `press`.
+
+### press
+Press a single named key, optionally with modifiers.
+
+```json
+{ "action": "press", "key": "Enter" }
+{ "action": "press", "key": "Escape" }
+{ "action": "press", "key": "Tab" }
+{ "action": "press", "key": "Backspace" }
+{ "action": "press", "key": "Delete" }
+{ "action": "press", "key": "ArrowDown" }
+{ "action": "press", "key": "a", "modifiers": ["Control"] }
+{ "action": "press", "key": "Tab", "modifiers": ["Shift"] }
+```
+
+Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`, `ArrowDown`, `ArrowLeft`, `ArrowRight`, `PageUp`, `PageDown`, `Home`, `End`, single letters/digits.
+
+Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS).
+
+### clear
+Convenience: select-all + delete the contents of the currently focused field. Use this when you want to overwrite a field that already has text in it.
+
+```json
+{ "action": "clear" }
+```
+
+Equivalent to `press a` with `modifiers: ["Control"]` then `press Backspace`.
+
+## Patterns
+
+- **Fill an empty form field**: `mouse` `click` on the field → `keyboard` `type` the value.
+- **Replace existing text in a field**: `mouse` `click` on the field → `keyboard` `clear` → `keyboard` `type` the new value.
+- **Submit a form / trigger search**: `keyboard` `press` `key: "Enter"` after typing.
+- **Erase a single character**: `keyboard` `press` `key: "Backspace"`.
+- **Select all in a field**: `keyboard` `press` `key: "a"` with `modifiers: ["Control"]` (or `"Meta"` on macOS).
+- **Tab to next field**: `keyboard` `press` `key: "Tab"`.
+
+## Notes
+
+- One action per turn.
+- Typing or pressing goes to whatever has keyboard focus. If nothing is focused, nothing happens — `mouse click` on a field first.
+- Don't type into the address bar via this tool; use the `tab` tool to navigate.
diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2
new file mode 100644
index 0000000..bcd7181
--- /dev/null
+++ b/server/agent/prompts/big_model/mouse_tool.j2
@@ -0,0 +1,84 @@
+# Mouse Tool
+
+Drive a virtual mouse cursor: move, click, drag, scroll.
+
+## Coordinates
+
+`(x, y)` and `(x2, y2)` are integers in the **[0, 1000] normalized space**:
+
+- `(0, 0)` = top-left of the viewport.
+- `(1000, 1000)` = bottom-right.
+
+Estimate from the screenshot. Aim for the visual center of your target. The system rescales to real pixels.
+
+## The Cursor Is Load-Bearing
+
+A red dot with a pulsing red ring sits inside a white-and-black arrow. **The dot is the click point.** It appears in every screenshot on this page.
+
+`click` is **in-place**: it commits exactly where the dot is right now. It does not take a target coordinate. If you want to click somewhere new, **`move` there first**, then verify in the next screenshot that the red dot is on top of the intended target, then `click`.
+
+This is a hard rule. Skipping the move-first step and clicking will commit at whatever position the cursor was last left at — which is rarely what you want.
+
+## Actions
+
+### move
+Slide the cursor to a point. The cursor traces an eased path, so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way.
+
+```json
+{ "action": "move", "x": 500, "y": 320 }
+```
+
+### click
+Click **where the cursor is now**. Does not take coordinates.
+
+```json
+{ "action": "click" }
+{ "action": "click", "button": "right" }
+{ "action": "click", "count": 2 }
+```
+
+- `button`: `"left"` (default), `"right"`, `"middle"`.
+- `count`: `1` (default), `2` for double-click, `3` for triple-click (text selection).
+
+Before calling `click`, look at the most recent screenshot and confirm the red dot is on top of the element you want to act on. If it is not, call `move` first.
+
+### drag
+Press at `(x, y)`, drag to `(x2, y2)`, release. One call.
+
+```json
+{ "action": "drag", "x": 200, "y": 400, "x2": 800, "y2": 400 }
+```
+
+Use for sliders, kanban moves, marquee selection, drag-and-drop. `steps` (optional, default 10) controls the smoothness for DnD libraries that need many intermediate move events.
+
+### scroll
+Scroll at the cursor's current position by `amount` CSS pixels in `direction`.
+
+```json
+{ "action": "scroll", "direction": "down", "amount": 600 }
+{ "action": "scroll", "direction": "up", "amount": 300 }
+```
+
+`direction`: `"down"`, `"up"`, `"left"`, `"right"`. To scroll inside a specific panel/container, `move` over it first so the wheel event lands in the right scroll target.
+
+### reset
+Return the cursor to the viewport center.
+
+```json
+{ "action": "reset" }
+```
+
+## Patterns
+
+- **Click a button**: `move` to the button → check the screenshot → `click`.
+- **Hover-reveal menu**: `move` over the trigger; the next screenshot shows the menu open.
+- **Scroll to find something**: `scroll` direction `down`, then check the new screenshot. Repeat as needed.
+- **Drag a slider**: one `drag` from the handle's current position to the target position.
+- **Right-click for context menu**: `move` to the target → `click` with `button: "right"`.
+
+## Notes
+
+- One action per turn. The next observation reflects the post-action state.
+- The cursor position persists across actions — the cursor remains where you last left it until you `move` it again.
+- If a target isn't visible, `scroll` to bring it in view; don't try to click coordinates outside the viewport.
+- If a confirm/prompt dialog opens, the next mouse action will fail — handle the dialog first.
diff --git a/server/agent/prompts/big_model/tab_tool.j2 b/server/agent/prompts/big_model/tab_tool.j2
index ac7d8e0..374b17b 100644
--- a/server/agent/prompts/big_model/tab_tool.j2
+++ b/server/agent/prompts/big_model/tab_tool.j2
@@ -4,18 +4,17 @@ Manage browser tabs for the current conversation and establish the active page s
 
 ## Core Contract
 
-- `tab init` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the loaded page
-- `tab open` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the opened tab
-- `tab switch` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the switched-to tab
-- `tab refresh` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the refreshed page
-- `tab back` - Returns the default `highlight` `element_type: "any"` page 1 screenshot after navigating back
-- `tab forward` - Returns the default `highlight` `element_type: "any"` page 1 screenshot after navigating forward
-- `tab list` - Returns tab list only
-- `tab close` - Returns close result only
-- `tab view` - Returns a clean screenshot without overlays
-- If you need fresh `element_id`s after `tab view`, call `highlight`.
+- `tab init` - Returns a clean screenshot of the loaded page with the virtual cursor visible.
+- `tab open` - Returns a clean screenshot of the opened tab with the virtual cursor visible.
+- `tab switch` - Returns a clean screenshot of the switched-to tab with the virtual cursor visible.
+- `tab refresh` - Returns a clean screenshot of the refreshed page with the virtual cursor visible.
+- `tab back` - Returns a clean screenshot after navigating back with the virtual cursor visible.
+- `tab forward` - Returns a clean screenshot after navigating forward with the virtual cursor visible.
+- `tab list` - Returns tab list only.
+- `tab close` - Returns close result only.
+- `tab view` - Returns a clean screenshot of the current tab with the virtual cursor visible.
 - Keep work in one active tab unless opening or switching tabs clearly improves the task.
-- After navigation, use the returned observation first. If a likely target is already visible but clipped or cramped, use `scroll` before asking `highlight` for more pages.
+- After navigation, use the returned observation first. If a likely target is already visible but clipped or cramped, scroll to recenter it before clicking.
 
 ## Commands
 
@@ -26,7 +25,7 @@ Initialize a new browser session with its managed tab group.
 { "action": "init", "url": "https://example.com" }
 ```
 
-Start a task on a URL and get the default interactive observation for the loaded page.
+Start a task on a URL and get a screenshot of the loaded page.
 
 ### tab open
 Open a new tab in the current session and switch to it.
@@ -53,8 +52,6 @@ Switch to a specific tab in the session.
 { "action": "switch", "tab_id": 123 }
 ```
 
-Use the returned default observation before calling `highlight` again.
-
 ### tab list
 List all tabs in the current session.
 
@@ -71,16 +68,14 @@ Refresh the current active tab.
 { "action": "refresh", "tab_id": 123 }
 ```
 
-Use this to reload the page and get a fresh default observation.
-
 ### tab view
-Get a clean screenshot of the current active tab without element highlights.
+Get a fresh clean screenshot of the current active tab without performing any action.
 
 ```json
 { "action": "view" }
 ```
 
-Use this when you need the raw page image. It does not refresh the interactive inventory by itself.
+Use this when you want a current-state snapshot without moving the cursor or clicking.
 
 ### tab back
 Navigate back in the browser history (equivalent to clicking the browser's back button).
@@ -89,30 +84,26 @@ Navigate back in the browser history (equivalent to clicking the browser's back
 { "action": "back" }
 ```
 
-Use the returned observation before deciding whether more discovery is needed.
-
 ### tab forward
-Navigate forward in the browser history (equivalent to clicking the browser's forward button).
+Navigate forward in the browser history.
 
 ```json
 { "action": "forward" }
 ```
 
-Use the returned observation before deciding whether more discovery is needed.
-
 ## Workflow Integration
 
 1. **Start session**: `tab init https://example.com`
 2. **Navigate to other pages**: `tab open https://other.com`
-3. **Switch between tabs**: `tab switch` with appropriate tab_id
-4. These tab actions already return the default `highlight` `element_type: "any"` page 1 result for the new page state, so you can use the returned `element_id`s immediately.
-5. If the target is already partly visible after navigation, fix geometry with `scroll` before more discovery.
-6. If page 1 misses the target and it is not already partly visible, continue with the highlight tool on `element_type: "any"` page 2, 3, and so on before changing strategy.
+3. **Switch between tabs**: `tab switch` with the appropriate `tab_id`.
+4. These tab actions already return a clean screenshot of the new page state with the virtual cursor visible, so you can act on it directly with `mouse` and `keyboard`.
+5. If the target is already partly visible after navigation, scroll to recenter it before clicking.
+6. If the target is not visible on the current view, scroll or navigate to find it; do not click coordinates outside the viewport.
 
 ## Notes
 
 - **Tab IDs**: Integer identifiers assigned by Chrome. These are returned in tab list responses and used in other commands.
-- **Auto-resolution**: element_interaction tool actions (click, hover, scroll, swipe, keyboard_input) automatically use the conversation's active tab if `tab_id` is not provided.
+- **Auto-resolution**: `mouse` and `keyboard` actions automatically use the conversation's active tab.
 - **Session persistence**: Tabs for a conversation stay grouped together until explicitly closed or the browser restarts.
 
 ## Error Handling
diff --git a/server/agent/prompts/small_model/dialog_tool.j2 b/server/agent/prompts/small_model/dialog_tool.j2
index 5796ba9..6a2bdda 100644
--- a/server/agent/prompts/small_model/dialog_tool.j2
+++ b/server/agent/prompts/small_model/dialog_tool.j2
@@ -25,5 +25,5 @@ If a dialog is open, do not use other browser tools first. Browser execution is
 
 ## After Handling
 
-- Check the returned default `highlight` `element_type: "any"` page 1 screenshot
+- Check the returned clean screenshot of the resulting page state.
 - If another dialog appears, handle that next
diff --git a/server/agent/prompts/small_model/keyboard_tool.j2 b/server/agent/prompts/small_model/keyboard_tool.j2
new file mode 100644
index 0000000..1c5cc49
--- /dev/null
+++ b/server/agent/prompts/small_model/keyboard_tool.j2
@@ -0,0 +1,38 @@
+# Keyboard Tool
+
+Type text, press named keys, clear a field. Click a field with `mouse` first to focus it.
+
+## Actions
+
+### type
+Type literal text, one character at a time.
+```json
+{ "action": "type", "text": "hello world" }
+```
+
+### press
+Press a single named key, optionally with modifiers.
+```json
+{ "action": "press", "key": "Enter" }
+{ "action": "press", "key": "Backspace" }
+{ "action": "press", "key": "Tab" }
+{ "action": "press", "key": "a", "modifiers": ["Control"] }
+```
+
+Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`, `ArrowDown`, `ArrowLeft`, `ArrowRight`, `PageUp`, `PageDown`, `Home`, `End`, single letters/digits.
+
+Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS).
+
+### clear
+Select-all + delete the contents of the focused field. Use before overwriting a field that already has text.
+```json
+{ "action": "clear" }
+```
+
+## Patterns
+
+- **Fill an empty field**: `mouse click` → `keyboard type`.
+- **Replace text in a field**: `mouse click` on it → `keyboard clear` → `keyboard type`.
+- **Submit / search**: `keyboard press Enter`.
+- **Erase one char**: `keyboard press Backspace`.
+- **Tab to next field**: `keyboard press Tab`.
diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2
new file mode 100644
index 0000000..0630dd2
--- /dev/null
+++ b/server/agent/prompts/small_model/mouse_tool.j2
@@ -0,0 +1,59 @@
+# Mouse Tool
+
+Move, click, drag, and scroll a virtual mouse cursor.
+
+## Coordinates
+
+`(x, y)` and `(x2, y2)` are integers in **[0, 1000]**: `(0, 0)` is viewport top-left, `(1000, 1000)` is bottom-right. Estimate from the screenshot.
+
+## Cursor
+
+A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot.
+
+`click` is **in-place**: it commits where the dot is right now. It does not take coordinates. To click a new target: `move` there → check the screenshot → `click`.
+
+## Actions
+
+### move
+Slide the cursor to `(x, y)`.
+```json
+{ "action": "move", "x": 500, "y": 320 }
+```
+
+### click
+Click where the cursor is now. No coordinates.
+```json
+{ "action": "click" }
+{ "action": "click", "count": 2 }
+{ "action": "click", "button": "right" }
+```
+- `button`: `"left"` (default), `"right"`, `"middle"`.
+- `count`: 1 (default), 2 for double-click.
+
+Before clicking, verify in the screenshot that the red dot is on top of the target. If not, `move` first.
+
+### drag
+Press at `(x, y)`, drag to `(x2, y2)`, release.
+```json
+{ "action": "drag", "x": 200, "y": 400, "x2": 800, "y2": 400 }
+```
+
+### scroll
+Scroll at the cursor by `amount` CSS pixels.
+```json
+{ "action": "scroll", "direction": "down", "amount": 600 }
+```
+`direction`: `"down"`, `"up"`, `"left"`, `"right"`.
+
+### reset
+Return cursor to viewport center.
+```json
+{ "action": "reset" }
+```
+
+## Patterns
+
+- **Click a button**: `move` → check screenshot → `click`.
+- **Hover**: `move` over the trigger; next screenshot shows the result.
+- **Scroll to find**: `scroll` then check the new screenshot.
+- **Drag**: one `drag` with start and end coordinates.
diff --git a/server/agent/prompts/small_model/tab_tool.j2 b/server/agent/prompts/small_model/tab_tool.j2
index 8757d1c..db54eb5 100644
--- a/server/agent/prompts/small_model/tab_tool.j2
+++ b/server/agent/prompts/small_model/tab_tool.j2
@@ -6,12 +6,9 @@ Manage tabs for the current conversation.
 
 1. Keep the workflow in one active tab unless a new tab is clearly necessary.
 2. After navigation, look at the returned screenshot before the next action.
-3. `tab init`, `tab open`, `tab switch`, `tab back`, `tab forward`, and `tab refresh` already return the default `highlight` `element_type: "any"` page 1 screenshot and IDs for the new page state.
-4. Prefer `tab view` when you only need a clean screenshot.
-5. If the target is already partly visible after navigation, scroll first to reposition it.
-6. If `tab view` gave you only a clean screenshot and you need `element_id`s, call `highlight`.
-7. If page 1 missed the target on the same unchanged page state, continue the same highlight mode before changing strategy.
-8. If dense UI, a sidebar, a tab strip, or collision-aware label placement may have split nearby controls across pages, keep paginating that same mode before narrowing or switching strategies.
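+3. `tab init`, `tab open`, `tab switch`, `tab back`, `tab forward`, `tab refresh`, and `tab view` return a clean screenshot of the current page state with the virtual cursor visible — act on it directly with `mouse` and `keyboard`. `tab list` and `tab close` return only their result, not a screenshot.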
+4. If the target is already partly visible after navigation, scroll to recenter it before clicking.
+5. If the target is not visible at all, scroll or navigate; do not click coordinates outside the viewport.
 
 ## Commands
 
@@ -44,7 +41,7 @@ List tabs when you need to know available `tab_id`s.
 ```
 
 ### tab view
-Get a clean screenshot of the active tab.
+Get a fresh screenshot of the active tab without acting.
 
 ```json
 { "action": "view" }
@@ -74,10 +71,7 @@ Close an unused tab.
 
 ## Recommended Flow
 
-- Start with `tab init`
-- If you need a clean screenshot, use `tab view`
-- Otherwise, use the returned `element_id`s from the default `highlight` `element_type: "any"` page 1 result
-- If the target is already partly visible or clipped, use `scroll` before more discovery
-- If the page state is unchanged and the target is still missing, your default next step is the next `any` page
-- If highlight page 1 does not show the target and it is not already partly visible, continue `element_type: "any"` pagination before changing strategy
-- If dense UI or collision-aware label placement may have split nearby controls across pages, keep paginating the same mode before narrowing or switching strategies
+- Start with `tab init`.
+- After init, the returned screenshot is your working state — use `mouse` and `keyboard` to act on it.
+- If you need a fresh screenshot without acting, use `tab view`.
+- If the target is partly visible or clipped, scroll to recenter it before clicking.
diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py
index 33b14a6..514ef90 100644
--- a/server/agent/tools/base.py
+++ b/server/agent/tools/base.py
@@ -287,6 +287,17 @@ class OpenBrowserObservation(Observation):
         default=None,
         description="Whether the active conversation uses the small-model profile.",
     )
+    # Viewport dimensions in CSS pixels at the time of the most recent screenshot.
+    # Carried on the observation as data only: to_llm_content does not render
+    # them, since the server denormalizes [0,1000] coordinates on its own.
+    viewport_width: Optional[int] = Field(
+        default=None,
+        description="CSS-pixel viewport width at screenshot time (None if unknown).",
+    )
+    viewport_height: Optional[int] = Field(
+        default=None,
+        description="CSS-pixel viewport height at screenshot time (None if unknown).",
+    )
 
     def _pending_confirmation_llm_content(
         self,
@@ -397,6 +408,10 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         # Operation Status Section
         text_parts.append("## Operation Status")
         text_parts.append("")
+        # Viewport size is intentionally not surfaced to the agent — the
+        # server denormalizes [0,1000] coords to real pixels automatically,
+        # so the agent never needs to reason about page dimensions. The
+        # cached vw/vh on the executor still drives that conversion.
         if not self.success:
             text_parts.append(f"**Status**: FAILED")
             text_parts.append(f"**Error**: {self.error}")
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 81feb97..a8311b5 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -40,6 +40,15 @@
     SetSliderValueCommand,
     UploadFileCommand,
     HighlightDropPreviewCommand,
+    MouseMoveCommand,
+    MouseClickCommand,
+    MouseDragCommand,
+    MouseScrollCommand,
+    KeyboardTypeCommand,
+    KeyboardPressCommand,
+    ResetMouseCommand,
+    MouseButton,
+    ScrollDirection,
 )
 
 # Import action types for type checking
@@ -47,6 +56,8 @@
 from server.agent.tools.highlight_tool import BaseHighlightAction
 from server.agent.tools.element_interaction_tool import ElementInteractionAction
 from server.agent.tools.dialog_tool import DialogHandleAction
+from server.agent.tools.mouse_tool import MouseAction
+from server.agent.tools.keyboard_tool import KeyboardAction
 
 from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
 from server.core.llm_config import llm_config_manager
@@ -110,6 +121,11 @@ def __init__(self):
         # Used in routine-replay mode to auto-confirm clicks/selects/keyboard_input
         # when the target was just uniquely highlighted.
         self.last_highlight_elements: Dict[str, List[Dict[str, Any]]] = {}
+        # Most recent CSS-pixel viewport per conversation (vw, vh). Captured
+        # from screenshot responses; consumed by the pixel-interaction path to
+        # denormalize Qwen-VL [0,1000] coordinates before dispatching to the
+        # extension. No entry = no screenshot yet; coordinates then pass through.
+        self.last_viewport_by_conv: Dict[str, tuple[int, int]] = {}
 
     def _uses_small_model(self) -> bool:
         """Whether the active conversation uses the small-model profile."""
@@ -137,6 +153,46 @@ def _uses_small_model(self) -> bool:
 
         return is_small_model(model_name)
 
+    def _cache_viewport(self, vw: int, vh: int) -> None:
+        """Cache the latest CSS-pixel viewport for the active conversation."""
+        if not self.conversation_id:
+            return
+        if vw <= 0 or vh <= 0:
+            return
+        self.last_viewport_by_conv[str(self.conversation_id)] = (vw, vh)
+
+    def _get_viewport(self) -> Optional[tuple[int, int]]:
+        """Return the latest cached CSS-pixel viewport, or None if unknown."""
+        if not self.conversation_id:
+            return None
+        return self.last_viewport_by_conv.get(str(self.conversation_id))
+
+    def _is_qwen_model(self) -> bool:
+        """Whether the active conversation uses a Qwen vision model.
+
+        Qwen-VL emits coordinates in the [0, 1000] normalized space, so the
+        server must denormalize before dispatching CDP input. Detection is
+        prefix-based on the canonical dashscope model id.
+        """
+        if not self.conversation_id:
+            return False
+        session = session_manager.get_session(str(self.conversation_id))
+        if session is None:
+            return False
+
+        model_name = session.metadata.get("model")
+        if not isinstance(model_name, str) or not model_name:
+            raw_alias = session.metadata.get("model_alias")
+            if isinstance(raw_alias, str) and raw_alias:
+                try:
+                    model_name = llm_config_manager.get_llm_config(raw_alias).model
+                except ValueError:
+                    model_name = None
+
+        if not isinstance(model_name, str):
+            return False
+        return model_name.startswith("dashscope/qwen") or model_name.startswith("qwen")
+
     def _is_routine_replay_mode(self) -> bool:
         """Whether the active conversation is running in routine-replay mode."""
         if not self.conversation_id:
@@ -244,6 +300,10 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
                 return self._execute_element_interaction_action(action)
             elif isinstance(action, DialogHandleAction):
                 return self._execute_dialog_action(action)
+            elif isinstance(action, MouseAction):
+                return self._execute_mouse_action(action)
+            elif isinstance(action, KeyboardAction):
+                return self._execute_keyboard_action(action)
             else:
                 raise ValueError(f"Unknown action type: {type(action).__name__}")
 
@@ -930,6 +990,213 @@ def _format_select_value_preview(value: Any) -> str:
             return f"[{joined}]"
         return f"'{value}'"
 
+    def _denormalize_xy(
+        self, x: Optional[int], y: Optional[int]
+    ) -> tuple[Optional[int], Optional[int]]:
+        """Convert Qwen-VL [0,1000] coords to CSS pixels using cached viewport.
+
+        For Qwen models, the agent emits coordinates in [0,1000] normalized
+        space; the server rescales using the captured viewport size before
+        dispatching CDP input events. For non-Qwen models, coordinates pass
+        through unchanged.
+
+        Returns the same `(x, y)` shape — `None` for any input that was None.
+        """
+        if x is None and y is None:
+            return (None, None)
+        if not self._is_qwen_model():
+            return (x, y)
+        viewport = self._get_viewport()
+        if viewport is None:
+            # No screenshot has populated the viewport yet. Best we can do
+            # is interpret coords as already-CSS pixels and let the extension
+            # clamp; a warning marks this so we can audit if it happens.
+            logger.warning(
+                "Pixel action dispatched before any screenshot populated the "
+                "viewport cache; passing coordinates through without "
+                "denormalization (conversation_id=%s).",
+                self.conversation_id,
+            )
+            return (x, y)
+        vw, vh = viewport
+        px = round(x * vw / 1000) if x is not None else None
+        py = round(y * vh / 1000) if y is not None else None
+        return (px, py)
+
+    def _execute_mouse_action(
+        self, action: MouseAction
+    ) -> OpenBrowserObservation:
+        """Execute one mouse action (move/click/drag/scroll/reset).
+
+        Coordinates from Qwen models are in [0, 1000] normalized space and are
+        denormalized to CSS pixels using the most recent viewport captured
+        from a screenshot. Non-Qwen models pass through.
+        """
+        kind = action.action
+        logger.debug(f"DEBUG: _execute_mouse_action kind={kind}")
+
+        try:
+            if kind == "move":
+                if action.x is None or action.y is None:
+                    raise ValueError("mouse move requires x and y")
+                px, py = self._denormalize_xy(action.x, action.y)
+                command = MouseMoveCommand(
+                    x=px, y=py, conversation_id=self.conversation_id
+                )
+                result_dict = self._execute_command_sync(command)
+                return self._build_observation_from_result(
+                    result_dict, f"Mouse moved to ({px}, {py})"
+                )
+
+            if kind == "click":
+                # Click is in-place at the cursor's current position. Any
+                # x/y the model emitted is ignored on purpose — if it wants
+                # to click somewhere new it must `move` there first, so the
+                # visible cursor in the screenshot is the click point.
+                if action.x is not None or action.y is not None:
+                    logger.debug(
+                        "Mouse click ignored x=%s, y=%s (click is in-place)",
+                        action.x,
+                        action.y,
+                    )
+                command = MouseClickCommand(
+                    button=MouseButton(action.button),
+                    count=action.count,
+                    double=(action.count == 2),
+                    conversation_id=self.conversation_id,
+                )
+                result_dict = self._execute_command_sync(command)
+                return self._build_observation_from_result(
+                    result_dict,
+                    f"Clicked {action.button} at the cursor "
+                    f"(count={action.count})",
+                )
+
+            if kind == "drag":
+                if (
+                    action.x is None
+                    or action.y is None
+                    or action.x2 is None
+                    or action.y2 is None
+                ):
+                    raise ValueError("mouse drag requires x, y, x2, y2")
+                sx, sy = self._denormalize_xy(action.x, action.y)
+                ex, ey = self._denormalize_xy(action.x2, action.y2)
+                command = MouseDragCommand(
+                    start_x=sx,
+                    start_y=sy,
+                    end_x=ex,
+                    end_y=ey,
+                    button=MouseButton(action.button),
+                    steps=action.steps,
+                    conversation_id=self.conversation_id,
+                )
+                result_dict = self._execute_command_sync(command)
+                return self._build_observation_from_result(
+                    result_dict, f"Dragged from ({sx}, {sy}) to ({ex}, {ey})"
+                )
+
+            if kind == "scroll":
+                command = MouseScrollCommand(
+                    direction=ScrollDirection(action.direction),
+                    amount=action.amount,
+                    conversation_id=self.conversation_id,
+                )
+                result_dict = self._execute_command_sync(command)
+                return self._build_observation_from_result(
+                    result_dict,
+                    f"Scrolled {action.direction} by {action.amount}px",
+                )
+
+            if kind == "reset":
+                command = ResetMouseCommand(
+                    conversation_id=self.conversation_id
+                )
+                result_dict = self._execute_command_sync(command)
+                return self._build_observation_from_result(
+                    result_dict, "Reset cursor to viewport center"
+                )
+
+            raise ValueError(f"Unknown mouse action: {kind}")
+        except Exception as e:
+            logger.error(f"Mouse action failed (kind={kind}): {e}", exc_info=True)
+            return OpenBrowserObservation(
+                success=False, error=str(e), small_model=self._uses_small_model()
+            )
+
+    def _execute_keyboard_action(
+        self, action: KeyboardAction
+    ) -> OpenBrowserObservation:
+        """Execute one keyboard action (type/press/clear)."""
+        kind = action.action
+        logger.debug(f"DEBUG: _execute_keyboard_action kind={kind}")
+
+        try:
+            if kind == "type":
+                if not action.text:
+                    raise ValueError("keyboard type requires text")
+                command = KeyboardTypeCommand(
+                    text=action.text, conversation_id=self.conversation_id
+                )
+                result_dict = self._execute_command_sync(command)
+                preview = (
+                    action.text
+                    if len(action.text) <= 32
+                    else action.text[:29] + "..."
+                )
+                return self._build_observation_from_result(
+                    result_dict, f"Typed text: {preview!r}"
+                )
+
+            if kind == "press":
+                if not action.key:
+                    raise ValueError("keyboard press requires key")
+                command = KeyboardPressCommand(
+                    key=action.key,
+                    modifiers=list(action.modifiers or []),
+                    conversation_id=self.conversation_id,
+                )
+                result_dict = self._execute_command_sync(command)
+                mod_text = (
+                    f" with {'+'.join(action.modifiers)}"
+                    if action.modifiers
+                    else ""
+                )
+                return self._build_observation_from_result(
+                    result_dict, f"Pressed {action.key}{mod_text}"
+                )
+
+            if kind == "clear":
+                # Clear == select-all (Control+A) then Backspace, sent as two
+                # separate KeyboardPressCommands so each fires its own event
+                # sequence on the focused element, matching what the agent
+                # would have scripted manually. Only the Backspace result
+                # feeds the observation; the select-all result is discarded.
+                first = KeyboardPressCommand(
+                    key="a",
+                    modifiers=["Control"],
+                    conversation_id=self.conversation_id,
+                )
+                self._execute_command_sync(first)
+                second = KeyboardPressCommand(
+                    key="Backspace",
+                    modifiers=[],
+                    conversation_id=self.conversation_id,
+                )
+                result_dict = self._execute_command_sync(second)
+                return self._build_observation_from_result(
+                    result_dict, "Cleared focused field (select-all + Backspace)"
+                )
+
+            raise ValueError(f"Unknown keyboard action: {kind}")
+        except Exception as e:
+            logger.error(
+                f"Keyboard action failed (kind={kind}): {e}", exc_info=True
+            )
+            return OpenBrowserObservation(
+                success=False, error=str(e), small_model=self._uses_small_model()
+            )
+
     def _execute_dialog_action(
         self, action: DialogHandleAction
     ) -> OpenBrowserObservation:
@@ -1246,6 +1513,26 @@ def _build_observation_from_result(
                             f"DEBUG: Extracted screenshot from data['imageData'], length={len(screenshot_data_url) if screenshot_data_url else 0}"
                         )
 
+                    # Capture viewport dims (CSS pixels) for Qwen [0,1000]
+                    # → pixel denormalization. Two shapes:
+                    #   highlighted/buildScreenshotPayload → viewport_width/height
+                    #   raw screenshot → metadata.viewportWidth/Height
+                    raw_vw = data.get("viewport_width")
+                    raw_vh = data.get("viewport_height")
+                    if raw_vw is None or raw_vh is None:
+                        meta = data.get("metadata")
+                        if isinstance(meta, dict):
+                            raw_vw = raw_vw if raw_vw is not None else meta.get(
+                                "viewportWidth"
+                            )
+                            raw_vh = raw_vh if raw_vh is not None else meta.get(
+                                "viewportHeight"
+                            )
+                    if isinstance(raw_vw, (int, float)) and isinstance(
+                        raw_vh, (int, float)
+                    ):
+                        self._cache_viewport(int(raw_vw), int(raw_vh))
+
                     # Extract highlighted elements for highlight_elements action
                     if highlighted_elements is None and "elements" in data:
                         highlighted_elements = data["elements"]
@@ -1355,6 +1642,7 @@ def _build_observation_from_result(
         pending_confirmation = self._get_pending_confirmation()
 
         # Build observation
+        cached_viewport = self._get_viewport()
         observation = OpenBrowserObservation(
             success=success,
             message=message,
@@ -1378,6 +1666,8 @@ def _build_observation_from_result(
             scroll_warning=scroll_warning,
             pending_confirmation=pending_confirmation,
             small_model=self._uses_small_model(),
+            viewport_width=cached_viewport[0] if cached_viewport else None,
+            viewport_height=cached_viewport[1] if cached_viewport else None,
         )
 
         return observation
@@ -1393,6 +1683,13 @@ def _execute_command_sync(self, command) -> Any:
                 if command.conversation_id is None:
                     command.conversation_id = self.conversation_id
 
+            # The agent loop is pixel-only — every screenshot returned to the
+            # model should be clean and show the virtual cursor. Highlights
+            # are reserved for non-agent flows (recording tooling) that don't
+            # come through this executor.
+            if hasattr(command, "live_mode"):
+                command.live_mode = True
+
             # Convert command to dict using model_dump
             cmd_dict = command.model_dump()
             logger.info(
diff --git a/server/agent/tools/element_interaction_tool.py b/server/agent/tools/element_interaction_tool.py
index 28b4e3c..8ad3150 100644
--- a/server/agent/tools/element_interaction_tool.py
+++ b/server/agent/tools/element_interaction_tool.py
@@ -5,6 +5,9 @@
 flow. Hover, scroll, and swipe execute directly.
 """
 
+# Legacy: kept for /ob-routines recording/replay; not exposed to the live agent
+# (which now uses MouseTool / KeyboardTool for pure pixel-level mouse/keyboard control).
+
 from collections.abc import Sequence
 from typing import List, Literal, Optional, Union
 
diff --git a/server/agent/tools/highlight_tool.py b/server/agent/tools/highlight_tool.py
index 773e809..6a2fa20 100644
--- a/server/agent/tools/highlight_tool.py
+++ b/server/agent/tools/highlight_tool.py
@@ -5,6 +5,9 @@
 allowing the AI agent to see and interact with elements via labeled overlays.
 """
 
+# Legacy: kept for /ob-routines recording/replay; not exposed to the live agent
+# (which now uses MouseTool / KeyboardTool for pure pixel-level mouse/keyboard control).
+
 from collections.abc import Sequence
 from typing import Optional, List
 
diff --git a/server/agent/tools/keyboard_tool.py b/server/agent/tools/keyboard_tool.py
new file mode 100644
index 0000000..7852f61
--- /dev/null
+++ b/server/agent/tools/keyboard_tool.py
@@ -0,0 +1,107 @@
+"""
+KeyboardTool - Type text and press named keys at the current focus.
+
+Use 'type' for plain text input (after clicking a field to focus it). Use
+'press' for named keys (Enter, Escape, Tab, arrows) and shortcuts with
+modifiers (Ctrl+A, Cmd+K).
+"""
+
+from collections.abc import Sequence
+from typing import List, Literal, Optional
+
+from openhands.sdk.tool import (
+    ToolDefinition,
+    ToolAnnotations,
+    register_tool,
+)
+from pydantic import Field
+
+from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
+from server.agent.tools.prompt_context import get_prompt_render_context
+from server.agent.tools.prompt_loader import render_tool_prompt
+
+
+def get_keyboard_tool_description(conv_state=None) -> str:
+    """Render the KeyboardTool description from the `keyboard_tool.j2` Jinja2 template."""
+    return render_tool_prompt(
+        "keyboard_tool.j2",
+        conv_state,
+        context=get_prompt_render_context(conv_state),
+    )
+
+
+KeyboardActionKind = Literal["type", "press", "clear"]
+
+
+class KeyboardAction(OpenBrowserAction):
+    """Type text, press a named key, or clear the focused field."""
+
+    action: KeyboardActionKind = Field(
+        description=(
+            "'type' — type literal text one character at a time at the "
+            "current focus (click a field first to focus it). "
+            "'press' — press a single named key, optionally with modifiers. "
+            "Use this for Enter/Tab/Escape/Backspace/Delete/arrows and "
+            "shortcuts like Ctrl+A. "
+            "'clear' — convenience wrapper that selects all and deletes "
+            "the contents of the currently focused field (equivalent to "
+            "`press a` with `modifiers: ['Control']` then `press Backspace`)."
+        )
+    )
+
+    text: Optional[str] = Field(
+        default=None,
+        description="Text to type for 'type' (max 1000 chars).",
+        max_length=1000,
+    )
+    key: Optional[str] = Field(
+        default=None,
+        description=(
+            "Key name for 'press', e.g. 'Enter', 'Escape', 'Tab', "
+            "'Backspace', 'ArrowDown', 'PageDown'. Single letters/digits "
+            "also work ('a', '5')."
+        ),
+        max_length=50,
+    )
+    modifiers: List[str] = Field(
+        default_factory=list,
+        description=(
+            "Modifier keys for 'press' (e.g. ['Control'], ['Shift', 'Alt']). "
+            "Use 'Meta' for Cmd on macOS."
+        ),
+    )
+
+
+class KeyboardTool(ToolDefinition[KeyboardAction, OpenBrowserObservation]):
+    """Virtual keyboard — type text, press keys, clear the focused field."""
+
+    name = "keyboard"
+
+    @classmethod
+    def create(cls, conv_state, terminal_executor=None) -> Sequence["KeyboardTool"]:
+        if terminal_executor is not None:  # an injected executor takes precedence
+            executor = terminal_executor
+        else:
+            conversation_id = getattr(conv_state, "id", None)  # None if conv_state has no id
+            from server.agent.tools.browser_executor import get_browser_executor
+
+            executor = get_browser_executor(conversation_id)
+
+        return [
+            cls(
+                description=get_keyboard_tool_description(conv_state),
+                action_type=KeyboardAction,
+                observation_type=OpenBrowserObservation,
+                annotations=ToolAnnotations(
+                    title="Keyboard",
+                    readOnlyHint=False,
+                    destructiveHint=False,
+                    idempotentHint=False,
+                    openWorldHint=True,
+                ),
+                executor=executor,
+            )
+        ]
+
+
+register_tool("keyboard", KeyboardTool.create)
diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py
new file mode 100644
index 0000000..d9139ee
--- /dev/null
+++ b/server/agent/tools/mouse_tool.py
@@ -0,0 +1,157 @@
+"""
+MouseTool - Move, click, drag, and scroll a virtual mouse cursor.
+
+The agent emits target coordinates in the Qwen-VL [0, 1000] normalized space
+(0 = viewport top-left, 1000 = bottom-right). The server denormalizes against
+the captured viewport before dispatching CDP input events. A small arrow
+cursor is rendered into the page DOM and appears in every screenshot.
+"""
+
+from collections.abc import Sequence
+from typing import Literal, Optional
+
+from openhands.sdk.tool import (
+    ToolDefinition,
+    ToolAnnotations,
+    register_tool,
+)
+from pydantic import Field
+
+from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
+from server.agent.tools.prompt_context import get_prompt_render_context
+from server.agent.tools.prompt_loader import render_tool_prompt
+
+
+def get_mouse_tool_description(conv_state=None) -> str:
+    """Render the MouseTool description from the `mouse_tool.j2` Jinja2 template."""
+    return render_tool_prompt(
+        "mouse_tool.j2",
+        conv_state,
+        context=get_prompt_render_context(conv_state),
+    )
+
+
+MouseActionKind = Literal["move", "click", "drag", "scroll", "reset"]
+
+
+class MouseAction(OpenBrowserAction):
+    """Move, click, drag, or scroll the virtual mouse cursor.
+
+    Coordinates `x, y, x2, y2` are integers in the Qwen-VL [0, 1000]
+    normalized space, with `(0, 0)` at the top-left of the viewport and
+    `(1000, 1000)` at the bottom-right.
+    """
+
+    action: MouseActionKind = Field(
+        description=(
+            "What to do with the mouse. "
+            "'move' — slide the cursor to (x, y). The cursor traces an eased "
+            "path so hover effects fire naturally along the way. "
+            "'click' — click WHERE THE CURSOR IS NOW. This is an in-place "
+            "action: it does not accept a target coordinate. Move there "
+            "first, verify the cursor is on the intended target in the "
+            "screenshot, then click. Use `count: 2` for double-click, "
+            "`count: 3` for triple-click. `button: 'right'` for context "
+            "menus. "
+            "'drag' — press at (x, y), drag to (x2, y2), release. "
+            "'scroll' — scroll at the cursor position by `amount` in "
+            "`direction`. "
+            "'reset' — return the cursor to the viewport center."
+        )
+    )
+
+    x: Optional[int] = Field(
+        default=None,
+        description=(
+            "Target X in Qwen-VL [0, 1000] normalized space. Required for "
+            "'move' and 'drag' (start). Ignored by 'click' — click is "
+            "in-place; move first if you need to retarget."
+        ),
+        ge=0,
+        le=1000,
+    )
+    y: Optional[int] = Field(
+        default=None,
+        description=(
+            "Target Y in Qwen-VL [0, 1000] normalized space. Required for "
+            "'move' and 'drag' (start). Ignored by 'click'."
+        ),
+        ge=0,
+        le=1000,
+    )
+    x2: Optional[int] = Field(
+        default=None,
+        description="Drag end X in [0, 1000]. Required for 'drag'.",
+        ge=0,
+        le=1000,
+    )
+    y2: Optional[int] = Field(
+        default=None,
+        description="Drag end Y in [0, 1000]. Required for 'drag'.",
+        ge=0,
+        le=1000,
+    )
+
+    button: Literal["left", "right", "middle"] = Field(
+        default="left",
+        description="Mouse button for 'click' and 'drag'.",
+    )
+    count: int = Field(
+        default=1,
+        ge=1,
+        le=3,
+        description="Click count for 'click' (1 = single, 2 = double, 3 = triple).",
+    )
+
+    direction: Literal["up", "down", "left", "right"] = Field(
+        default="down",
+        description="Scroll direction for 'scroll'.",
+    )
+    amount: int = Field(
+        default=300,
+        ge=1,
+        le=2000,
+        description="Scroll amount in CSS pixels for 'scroll'.",
+    )
+
+    steps: int = Field(
+        default=10,
+        ge=2,
+        le=40,
+        description="Intermediate move steps for 'drag' (smoother for DnD libraries).",
+    )
+
+
+class MouseTool(ToolDefinition[MouseAction, OpenBrowserObservation]):
+    """Virtual mouse — move, click, drag, scroll, reset."""
+
+    name = "mouse"
+
+    @classmethod
+    def create(cls, conv_state, terminal_executor=None) -> Sequence["MouseTool"]:
+        if terminal_executor is not None:  # an injected executor takes precedence
+            executor = terminal_executor
+        else:
+            conversation_id = getattr(conv_state, "id", None)  # None if conv_state has no id
+            from server.agent.tools.browser_executor import get_browser_executor
+
+            executor = get_browser_executor(conversation_id)
+
+        return [
+            cls(
+                description=get_mouse_tool_description(conv_state),
+                action_type=MouseAction,
+                observation_type=OpenBrowserObservation,
+                annotations=ToolAnnotations(
+                    title="Mouse",
+                    readOnlyHint=False,
+                    destructiveHint=False,
+                    idempotentHint=False,
+                    openWorldHint=True,
+                ),
+                executor=executor,
+            )
+        ]
+
+
+register_tool("mouse", MouseTool.create)
diff --git a/server/api/routes/commands.py b/server/api/routes/commands.py
index e1a3854..a8f4f47 100644
--- a/server/api/routes/commands.py
+++ b/server/api/routes/commands.py
@@ -96,7 +96,15 @@ async def execute_command(command_data: dict):
 
 @router.post("/mouse/move")
 async def mouse_move(x: int, y: int, browser_id: str, duration: float = 0.1):
-    """Move mouse to absolute position in preset coordinate system (0-1280, 0-720)"""
+    """Move mouse to an absolute CSS-pixel position in the live viewport.
+
+    `x` and `y` are CSS pixels with `x, y >= 0`. The extension clamps to the
+    live viewport before dispatch, so callers don't need to know the exact
+    viewport size — but values should be within plausible browser dimensions
+    (e.g. up to 4K). The legacy 0-1280/0-720 cap is no longer enforced now
+    that the live agent path uses Qwen-VL [0,1000] coords with server-side
+    denormalization (see `BrowserExecutor._denormalize_xy`).
+    """
     command = {
         "type": "mouse_move",
         "x": x,
diff --git a/server/core/processor.py b/server/core/processor.py
index 1bb55b2..a16dc4f 100644
--- a/server/core/processor.py
+++ b/server/core/processor.py
@@ -11,6 +11,7 @@
     parse_command,
     MouseMoveCommand,
     MouseClickCommand,
+    MouseDragCommand,
     MouseScrollCommand,
     ResetMouseCommand,
     KeyboardTypeCommand,
@@ -132,6 +133,7 @@ def _prepare_command_dict(self, command: Command) -> dict:
             ScreenshotCommand,
             MouseMoveCommand,
             MouseClickCommand,
+            MouseDragCommand,
             MouseScrollCommand,
             ResetMouseCommand,
             KeyboardTypeCommand,
@@ -166,12 +168,24 @@ def _prepare_command_dict(self, command: Command) -> dict:
         ):
             # Check command type to decide if we should fill tab_id
             if isinstance(command, TabCommand):
-                # For tab commands, only fill tab_id for certain actions
-                # init and open create new tabs - don't fill
-                # close and switch need specific tab_id - don't fill if not specified
-                # list gets all tabs - don't fill
-                # So generally don't auto-fill for TabCommand
-                pass
+                # init/open create new tabs; close/switch need an explicit
+                # tab_id; list gets all tabs. But refresh/view/back/forward
+                # operate on "the current tab" semantically, so fill in the
+                # active tab when the agent didn't bother to pass it.
+                action_value = command_dict.get("action") or getattr(
+                    command, "action", None
+                )
+                action_name = (
+                    action_value.value
+                    if hasattr(action_value, "value")
+                    else action_value
+                )
+                if action_name in {"refresh", "view", "back", "forward"}:
+                    command_dict["tab_id"] = current_tab_id
+                    logger.debug(
+                        f"Auto-filled tab_id {current_tab_id} for "
+                        f"tab.{action_name} in conversation {conversation_id}"
+                    )
             elif isinstance(command, GetTabsCommand):
                 # GetTabsCommand gets all tabs, doesn't need tab_id
                 pass
@@ -214,6 +228,8 @@ async def execute(self, command: Command) -> CommandResponse:
                 return await self._execute_mouse_move(command)
             elif isinstance(command, MouseClickCommand):
                 return await self._execute_mouse_click(command)
+            elif isinstance(command, MouseDragCommand):
+                return await self._execute_mouse_drag(command)
             elif isinstance(command, MouseScrollCommand):
                 return await self._execute_mouse_scroll(command)
             elif isinstance(command, KeyboardTypeCommand):
@@ -285,6 +301,13 @@ async def _execute_mouse_click(self, command: MouseClickCommand) -> CommandRespo
         response = await self._send_prepared_command(command)
         return response
 
+    async def _execute_mouse_drag(
+        self, command: MouseDragCommand
+    ) -> CommandResponse:
+        """Send the prepared mouse drag command and return its response."""
+        response = await self._send_prepared_command(command)
+        return response
+
     async def _execute_mouse_scroll(
         self, command: MouseScrollCommand
     ) -> CommandResponse:
diff --git a/server/models/commands.py b/server/models/commands.py
index 0b4f7dd..5eb5af5 100644
--- a/server/models/commands.py
+++ b/server/models/commands.py
@@ -52,21 +52,37 @@ class BaseCommand(BaseModel):
         default=None,
         description="Browser UUID capability token for targeted routing",
     )
+    # When True, the extension returns a clean (no-highlight) screenshot with
+    # the virtual cursor rendered, instead of a highlighted screenshot. Set by
+    # BrowserExecutor for live-agent-path conversations; False for routine
+    # replay (which still depends on the highlight + element-id inventory).
+    live_mode: bool = Field(
+        default=False,
+        description=(
+            "If True, the extension skips highlight injection and returns a "
+            "clean screenshot with the virtual cursor for the live pixel-only "
+            "agent path. Default False preserves the legacy highlight flow."
+        ),
+    )
 
 
 class MouseMoveCommand(BaseCommand):
-    """Move mouse to absolute position in preset coordinate system"""
+    """Move mouse to an absolute CSS-pixel position in the live viewport.
+
+    Coordinates are CSS pixels (post-denormalization). The extension clamps
+    out-of-range values to the live viewport before dispatch, so no upper
+    bound is enforced here — server-side denormalization is the single source
+    of truth on coordinate space.
+    """
 
     type: Literal["mouse_move"] = "mouse_move"
     x: int = Field(
-        description="X coordinate in preset coordinate system (0 to 1280, left to right)",
+        description="X coordinate in CSS pixels from viewport left.",
         ge=0,
-        le=1280,
     )
     y: int = Field(
-        description="Y coordinate in preset coordinate system (0 to 720, top to bottom)",
+        description="Y coordinate in CSS pixels from viewport top.",
         ge=0,
-        le=720,
     )
     duration: Optional[float] = Field(
         default=0.1,
@@ -77,12 +93,47 @@ class MouseMoveCommand(BaseCommand):
 
 
 class MouseClickCommand(BaseCommand):
-    """Click at current mouse position"""
+    """Click at the current cursor position, or at an explicit (x, y).
+
+    When ``x`` and ``y`` are provided, the extension first dispatches a
+    ``mouseMoved`` to that position so the click registers there; when omitted,
+    the click fires at the most recent cursor position tracked per tab.
+    """
 
     type: Literal["mouse_click"] = "mouse_click"
     button: MouseButton = Field(default=MouseButton.LEFT)
     double: bool = Field(default=False, description="Double click if True")
     count: int = Field(default=1, ge=1, le=3, description="Number of clicks (1-3)")
+    x: Optional[int] = Field(
+        default=None,
+        description="Optional X in CSS pixels — pre-move cursor before clicking.",
+        ge=0,
+    )
+    y: Optional[int] = Field(
+        default=None,
+        description="Optional Y in CSS pixels — pre-move cursor before clicking.",
+        ge=0,
+    )
+
+
+class MouseDragCommand(BaseCommand):
+    """Drag from (start_x, start_y) to (end_x, end_y) in CSS pixels.
+
+    Sequence: mouseMoved → mousePressed → ``steps`` lerped mouseMoved → mouseReleased.
+    """
+
+    type: Literal["mouse_drag"] = "mouse_drag"
+    start_x: int = Field(description="Drag start X in CSS pixels.", ge=0)
+    start_y: int = Field(description="Drag start Y in CSS pixels.", ge=0)
+    end_x: int = Field(description="Drag end X in CSS pixels.", ge=0)
+    end_y: int = Field(description="Drag end Y in CSS pixels.", ge=0)
+    button: MouseButton = Field(default=MouseButton.LEFT)
+    steps: int = Field(
+        default=10,
+        ge=2,
+        le=40,
+        description="Intermediate mouseMoved events between start and end.",
+    )
 
 
 class MouseScrollCommand(BaseCommand):
@@ -259,6 +310,9 @@ class GetAccessibilityTreeCommand(BaseCommand):
     )
 
 
+# Legacy element-id commands (HighlightElementsCommand through HighlightDropPreviewCommand):
+# kept for /ob-routines recording/replay. The live agent uses pixel-level commands
+# (MouseMoveCommand, MouseClickCommand, etc.) and never references element_id.
 class HighlightElementsCommand(BaseCommand):
     """Highlight interactive elements on the page for visual selection
 
@@ -593,6 +647,7 @@ class TabsResponse(CommandResponse):
 Command = Union[
     MouseMoveCommand,
     MouseClickCommand,
+    MouseDragCommand,
     MouseScrollCommand,
     ResetMouseCommand,
     KeyboardTypeCommand,
@@ -630,6 +685,7 @@ def parse_command(data: dict) -> Command:
     command_map = {
         "mouse_move": MouseMoveCommand,
         "mouse_click": MouseClickCommand,
+        "mouse_drag": MouseDragCommand,
         "mouse_scroll": MouseScrollCommand,
         "reset_mouse": ResetMouseCommand,
         "keyboard_type": KeyboardTypeCommand,

From 9780ba730f5762276af8c7f258cd046e473c0712 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 29 Apr 2026 20:38:31 +0800
Subject: [PATCH 02/14] prompts(agent): tighten mouse/keyboard wording, drop
 implementation leaks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rewrite the mouse and keyboard tool prompts and Field descriptions to
describe only the canonical use, instead of warning the model away from
shapes that the executor already silently handles.

- mouse: drop "click does not take coordinates" / "MUST NOT supply x/y"
  language. Field descriptions now state the affirmative use only.
  Examples already only show the in-place form.
- keyboard: lead with "focus an input first" as a positive rule, drop
  the per-character / "convenience wrapper" implementation notes. Add a
  small_model `type into a field` pattern that pairs with mouse.
- mouse scroll (small_model): document that the wheel hits the
  container under the cursor — `move` first to scroll an inner panel.
- mouse_tool.py: revert the validator that rejected x/y on click; the
  executor already drops them silently. The schema reflects only what
  the agent should learn to send.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../agent/prompts/big_model/keyboard_tool.j2  | 28 ++++++++++---------
 server/agent/prompts/big_model/mouse_tool.j2  | 24 ++++++++--------
 .../prompts/small_model/keyboard_tool.j2      | 12 +++++---
 .../agent/prompts/small_model/mouse_tool.j2   |  9 +++---
 server/agent/tools/keyboard_tool.py           |  9 +++---
 server/agent/tools/mouse_tool.py              | 20 ++++++-------
 6 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/server/agent/prompts/big_model/keyboard_tool.j2 b/server/agent/prompts/big_model/keyboard_tool.j2
index a31437b..3a88ecd 100644
--- a/server/agent/prompts/big_model/keyboard_tool.j2
+++ b/server/agent/prompts/big_model/keyboard_tool.j2
@@ -1,11 +1,15 @@
 # Keyboard Tool
 
-Type text and press keys at the current focus. Click a field with `mouse` first to focus it; then use this tool.
+Type text and press keys at the current focus.
+
+## Focus an input first
+
+Keyboard events go to whatever element currently has focus. Before typing into a text field, search box, contenteditable, etc., **use `mouse move` to put the cursor over the field and `mouse click` to focus it.** Then call this tool. The same applies after navigating to a new page or opening a dialog — re-focus the input by clicking it before the next `type`.
 
 ## Actions
 
 ### type
-Type literal text where the keyboard focus is right now. Characters are sent one at a time, with a small delay between each, so per-character handlers (autocomplete, live validation) react in order — just like a real keyboard.
+Type literal text into the focused field.
 
 ```json
 { "action": "type", "text": "hello world" }
@@ -32,25 +36,23 @@ Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`, `ArrowD
 Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS).
 
 ### clear
-Convenience: select-all + delete the contents of the currently focused field. Use this when you want to overwrite a field that already has text in it.
+Select-all + delete the contents of the currently focused field. Use this to overwrite a field that already has text in it.
 
 ```json
 { "action": "clear" }
 ```
 
-Equivalent to `press a` with `modifiers: ["Control"]` then `press Backspace`.
-
 ## Patterns
 
-- **Fill an empty form field**: `mouse` `click` on the field → `keyboard` `type` the value.
-- **Replace existing text in a field**: `mouse` `click` on the field → `keyboard` `clear` → `keyboard` `type` the new value.
-- **Submit a form / trigger search**: `keyboard` `press` `key: "Enter"` after typing.
-- **Erase a single character**: `keyboard` `press` `key: "Backspace"`.
-- **Select all in a field**: `keyboard` `press` `key: "a"` with `modifiers: ["Control"]` (or `"Meta"` on macOS).
-- **Tab to next field**: `keyboard` `press` `key: "Tab"`.
+- **Fill an empty form field**: `mouse move` to the field → `mouse click` → `keyboard type`.
+- **Replace existing text**: `mouse move` to the field → `mouse click` → `keyboard clear` → `keyboard type`.
+- **Submit a form / trigger search**: `keyboard press` `key: "Enter"` after typing.
+- **Erase a single character**: `keyboard press` `key: "Backspace"`.
+- **Select all in a field**: `keyboard press` `key: "a"` with `modifiers: ["Control"]` (or `"Meta"` on macOS).
+- **Tab to next field**: `keyboard press` `key: "Tab"`.
 
 ## Notes
 
 - One action per turn.
-- Typing or pressing goes to whatever has keyboard focus. If nothing is focused, nothing happens — `mouse click` on a field first.
-- Don't type into the address bar via this tool; use the `tab` tool to navigate.
+- Click an input area before typing into it so it has focus.
+- For URL changes use the `tab` tool to navigate, not the keyboard.
diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2
index bcd7181..4f17ee6 100644
--- a/server/agent/prompts/big_model/mouse_tool.j2
+++ b/server/agent/prompts/big_model/mouse_tool.j2
@@ -4,32 +4,30 @@ Drive a virtual mouse cursor: move, click, drag, scroll.
 
 ## Coordinates
 
-`(x, y)` and `(x2, y2)` are integers in the **[0, 1000] normalized space**:
+`(x, y)` and `(x2, y2)` are integers in **[0, 1000]** normalized space:
 
 - `(0, 0)` = top-left of the viewport.
 - `(1000, 1000)` = bottom-right.
 
 Estimate from the screenshot. Aim for the visual center of your target. The system rescales to real pixels.
 
-## The Cursor Is Load-Bearing
+## The Cursor
 
-A red dot with a pulsing red ring sits inside a white-and-black arrow. **The dot is the click point.** It appears in every screenshot on this page.
+A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot.
 
-`click` is **in-place**: it commits exactly where the dot is right now. It does not take a target coordinate. If you want to click somewhere new, **`move` there first**, then verify in the next screenshot that the red dot is on top of the intended target, then `click`.
-
-This is a hard rule. Skipping the move-first step and clicking will commit at whatever position the cursor was last left at — which is rarely what you want.
+`click` commits at the cursor's current position. To click a new target: `move` there → look at the next screenshot to confirm the red dot is on the target → `click`.
 
 ## Actions
 
 ### move
-Slide the cursor to a point. The cursor traces an eased path, so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way.
+Slide the cursor to a point. The cursor traces an eased path so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way.
 
 ```json
 { "action": "move", "x": 500, "y": 320 }
 ```
 
 ### click
-Click **where the cursor is now**. Does not take coordinates.
+Click at the cursor's current position.
 
 ```json
 { "action": "click" }
@@ -40,7 +38,7 @@ Click **where the cursor is now**. Does not take coordinates.
 - `button`: `"left"` (default), `"right"`, `"middle"`.
 - `count`: `1` (default), `2` for double-click, `3` for triple-click (text selection).
 
-Before calling `click`, look at the most recent screenshot and confirm the red dot is on top of the element you want to act on. If it is not, call `move` first.
+Before calling `click`, look at the most recent screenshot and confirm the red dot is on top of the element you want to act on. If it isn't, `move` again first.
 
 ### drag
 Press at `(x, y)`, drag to `(x2, y2)`, release. One call.
@@ -59,7 +57,7 @@ Scroll at the cursor's current position by `amount` CSS pixels in `direction`.
 { "action": "scroll", "direction": "up", "amount": 300 }
 ```
 
-`direction`: `"down"`, `"up"`, `"left"`, `"right"`. To scroll inside a specific panel/container, `move` over it first so the wheel event lands in the right scroll target.
+`direction`: `"down"`, `"up"`, `"left"`, `"right"`. To scroll inside a specific panel, `move` over it first so the wheel event lands in the right scroll target.
 
 ### reset
 Return the cursor to the viewport center.
@@ -79,6 +77,6 @@ Return the cursor to the viewport center.
 ## Notes
 
 - One action per turn. The next observation reflects the post-action state.
-- The cursor position persists across actions — the cursor remains where you last left it until you `move` it again.
-- If a target isn't visible, `scroll` to bring it in view; don't try to click coordinates outside the viewport.
-- If a confirm/prompt dialog opens, the next mouse action will fail — handle the dialog first.
+- The cursor position persists across actions — it stays where you last left it until the next `move`.
+- If a target isn't in the viewport, `scroll` to bring it in view before pointing at it.
+- If a confirm/prompt dialog opens, handle it with the dialog tool before the next mouse action.
diff --git a/server/agent/prompts/small_model/keyboard_tool.j2 b/server/agent/prompts/small_model/keyboard_tool.j2
index 1c5cc49..863654a 100644
--- a/server/agent/prompts/small_model/keyboard_tool.j2
+++ b/server/agent/prompts/small_model/keyboard_tool.j2
@@ -1,11 +1,15 @@
 # Keyboard Tool
 
-Type text, press named keys, clear a field. Click a field with `mouse` first to focus it.
+Type text and press named keys at the current focus.
+
+## Focus an input first
+
+Keyboard events go to the focused element. Before typing into a text box, search field, or any input area, **`mouse move` to the field and `mouse click` it** to focus it. Then call this tool.
 
 ## Actions
 
 ### type
-Type literal text, one character at a time.
+Type literal text into the focused field.
 ```json
 { "action": "type", "text": "hello world" }
 ```
@@ -31,8 +35,8 @@ Select-all + delete the contents of the focused field. Use before overwriting a
 
 ## Patterns
 
-- **Fill an empty field**: `mouse click` → `keyboard type`.
-- **Replace text in a field**: `mouse click` on it → `keyboard clear` → `keyboard type`.
+- **Fill an empty field**: `mouse move` → `mouse click` → `keyboard type`.
+- **Replace text in a field**: `mouse move` → `mouse click` → `keyboard clear` → `keyboard type`.
 - **Submit / search**: `keyboard press Enter`.
 - **Erase one char**: `keyboard press Backspace`.
 - **Tab to next field**: `keyboard press Tab`.
diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2
index 0630dd2..c9cc716 100644
--- a/server/agent/prompts/small_model/mouse_tool.j2
+++ b/server/agent/prompts/small_model/mouse_tool.j2
@@ -10,7 +10,7 @@ Move, click, drag, and scroll a virtual mouse cursor.
 
 A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot.
 
-`click` is **in-place**: it commits where the dot is right now. It does not take coordinates. To click a new target: `move` there → check the screenshot → `click`.
+`click` commits at the cursor's current position. To click a new target: `move` there → check the screenshot → `click`.
 
 ## Actions
 
@@ -21,7 +21,7 @@ Slide the cursor to `(x, y)`.
 ```
 
 ### click
-Click where the cursor is now. No coordinates.
+Click at the cursor's current position.
 ```json
 { "action": "click" }
 { "action": "click", "count": 2 }
@@ -30,7 +30,7 @@ Click where the cursor is now. No coordinates.
 - `button`: `"left"` (default), `"right"`, `"middle"`.
 - `count`: 1 (default), 2 for double-click.
 
-Before clicking, verify in the screenshot that the red dot is on top of the target. If not, `move` first.
+Before clicking, verify in the screenshot that the red dot is on top of the target. If not, `move` again first.
 
 ### drag
 Press at `(x, y)`, drag to `(x2, y2)`, release.
@@ -39,7 +39,7 @@ Press at `(x, y)`, drag to `(x2, y2)`, release.
 ```
 
 ### scroll
-Scroll at the cursor by `amount` CSS pixels.
+Scroll at the cursor by `amount` CSS pixels. The wheel event lands on whatever container is under the cursor — to scroll inside a panel, sidebar, or modal, `move` over it first.
 ```json
 { "action": "scroll", "direction": "down", "amount": 600 }
 ```
@@ -57,3 +57,4 @@ Return cursor to viewport center.
 - **Hover**: `move` over the trigger; next screenshot shows the result.
 - **Scroll to find**: `scroll` then check the new screenshot.
 - **Drag**: one `drag` with start and end coordinates.
+- **Type into a field**: `move` to the field → `click` to focus it → `keyboard type` the text.
diff --git a/server/agent/tools/keyboard_tool.py b/server/agent/tools/keyboard_tool.py
index 7852f61..a53bf00 100644
--- a/server/agent/tools/keyboard_tool.py
+++ b/server/agent/tools/keyboard_tool.py
@@ -38,14 +38,13 @@ class KeyboardAction(OpenBrowserAction):
 
     action: KeyboardActionKind = Field(
         description=(
-            "'type' — type literal text one character at a time at the "
-            "current focus (click a field first to focus it). "
+            "'type' — type literal text into the focused field (click the "
+            "field first to focus it). "
             "'press' — press a single named key, optionally with modifiers. "
             "Use this for Enter/Tab/Escape/Backspace/Delete/arrows and "
             "shortcuts like Ctrl+A. "
-            "'clear' — convenience wrapper that selects all and deletes "
-            "the contents of the currently focused field (equivalent to "
-            "`press a` with `modifiers: ['Control']` then `press Backspace`)."
+            "'clear' — select all and delete the contents of the focused "
+            "field, then leave it empty for a fresh `type`."
         )
     )
 
diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py
index d9139ee..d7b52df 100644
--- a/server/agent/tools/mouse_tool.py
+++ b/server/agent/tools/mouse_tool.py
@@ -47,12 +47,11 @@ class MouseAction(OpenBrowserAction):
             "What to do with the mouse. "
             "'move' — slide the cursor to (x, y). The cursor traces an eased "
             "path so hover effects fire naturally along the way. "
-            "'click' — click WHERE THE CURSOR IS NOW. This is an in-place "
-            "action: it does not accept a target coordinate. Move there "
-            "first, verify the cursor is on the intended target in the "
-            "screenshot, then click. Use `count: 2` for double-click, "
-            "`count: 3` for triple-click. `button: 'right'` for context "
-            "menus. "
+            "'click' — click at the cursor's current position. To click a "
+            "different target, 'move' there first, then verify the red dot "
+            "is on the target in the next screenshot, then 'click'. Use "
+            "`count: 2` for double-click, `count: 3` for triple-click. "
+            "`button: 'right'` for context menus. "
             "'drag' — press at (x, y), drag to (x2, y2), release. "
             "'scroll' — scroll at the cursor position by `amount` in "
             "`direction`. "
@@ -63,9 +62,8 @@ class MouseAction(OpenBrowserAction):
     x: Optional[int] = Field(
         default=None,
         description=(
-            "Target X in Qwen-VL [0, 1000] normalized space. Required for "
-            "'move' and 'drag' (start). Ignored by 'click' — click is "
-            "in-place; move first if you need to retarget."
+            "Target X in Qwen-VL [0, 1000] normalized space. Used by 'move' "
+            "and 'drag' (start)."
         ),
         ge=0,
         le=1000,
@@ -73,8 +71,8 @@ class MouseAction(OpenBrowserAction):
     y: Optional[int] = Field(
         default=None,
         description=(
-            "Target Y in Qwen-VL [0, 1000] normalized space. Required for "
-            "'move' and 'drag' (start). Ignored by 'click'."
+            "Target Y in Qwen-VL [0, 1000] normalized space. Used by 'move' "
+            "and 'drag' (start)."
         ),
         ge=0,
         le=1000,

From eda0b73a41bfde7b3d5e448c1cc059665b31b7be Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 29 Apr 2026 20:43:38 +0800
Subject: [PATCH 03/14] chore(deps): bump agent-sdk pin to 37227545
 (pixel-paradigm system prompts)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Picks up the rewrite of system_prompt_{large,small}.j2 in
softpudding/agent-sdk@37227545 — drops element_id/highlight/replay
language and aligns the top-level system prompts with this repo's
mouse + keyboard tools.

uv.lock regenerated by `uv sync`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 4 ++--
 uv.lock        | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 256f2fb..7789f92 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,5 +76,5 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "9b289cd393078641ea413dfd5f45d443dbb10b17" }
-openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "9b289cd393078641ea413dfd5f45d443dbb10b17" }
+openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "37227545d9d371423757ce47cddedd9521cc62d5" }
+openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "37227545d9d371423757ce47cddedd9521cc62d5" }
diff --git a/uv.lock b/uv.lock
index a34b2f2..43f2cfc 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1678,8 +1678,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=9b289cd393078641ea413dfd5f45d443dbb10b17" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=9b289cd393078641ea413dfd5f45d443dbb10b17" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=37227545d9d371423757ce47cddedd9521cc62d5" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=37227545d9d371423757ce47cddedd9521cc62d5" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2224,7 +2224,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=9b289cd393078641ea413dfd5f45d443dbb10b17#9b289cd393078641ea413dfd5f45d443dbb10b17" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=37227545d9d371423757ce47cddedd9521cc62d5#37227545d9d371423757ce47cddedd9521cc62d5" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2244,7 +2244,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=9b289cd393078641ea413dfd5f45d443dbb10b17#9b289cd393078641ea413dfd5f45d443dbb10b17" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=37227545d9d371423757ce47cddedd9521cc62d5#37227545d9d371423757ce47cddedd9521cc62d5" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 6608c323f58463e1dad44f36d7b803de2ed964ae Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 29 Apr 2026 21:55:02 +0800
Subject: [PATCH 04/14] feat(agent): unthrottle background tab + intercept
 native <select> / file picker
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two pixel-paradigm bug fixes for the live agent.

1. Mouse/keyboard CDP events were sluggish because Chrome throttles the
   agent's background tab while the user works in their own. After
   `chrome.debugger.attach`, send `Emulation.setFocusEmulationEnabled` and
   `Page.setWebLifecycleState({state: 'active'})` so the renderer runs at
   foreground priority — CDP-only, never changes OS-level tab focus, never
   interrupts the user. Re-asserted on every attach call so the flags are
   restored if an in-page (SPA) navigation drops them.

2. Native `<select>` dropdowns and OS file pickers don't render into CDP
   screenshots, so a click on one left the agent blind. Now `mouse_click`
   hit-tests the cursor (walking through `<label>.control` for styled
   file inputs); on a hit, the native click is suppressed, the element is
   tagged with `data-ob-pending-form-target`, and the click observation
   surfaces the option list (for `<select>`) or input metadata (for
   `<input type=file>`). The agent then completes the action via two new
   tools: `select_option(values)` matches by `value` → exact label →
   case-insensitive substring; `upload_file(paths)` uses
   `DOM.setFileInputFiles`. Rich error messages on `option_not_found` /
   `no_pending_*` so the agent can self-correct.

System prompts (large + small) updated in agent-sdk to introduce the new
tools and a NATIVE_FORM_CONTROLS note; mirrored into the venv copy used
by the running server.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/background/index.ts             |  43 ++-
 extension/src/commands/debugger-manager.ts    |  52 +++
 extension/src/commands/pixel-actions.ts       | 337 +++++++++++++++++-
 extension/src/types.ts                        |  12 +
 server/agent/api.py                           |  15 +-
 server/agent/manager.py                       |   2 +
 .../prompts/big_model/select_option_tool.j2   |  34 ++
 .../prompts/big_model/upload_file_tool.j2     |  33 ++
 .../prompts/small_model/select_option_tool.j2 |  26 ++
 .../prompts/small_model/upload_file_tool.j2   |  26 ++
 server/agent/tools/browser_executor.py        | 214 ++++++++++-
 server/agent/tools/select_option_tool.py      |  89 +++++
 server/agent/tools/upload_file_tool.py        |  84 +++++
 server/core/processor.py                      |  22 ++
 server/models/commands.py                     |  49 +++
 15 files changed, 1025 insertions(+), 13 deletions(-)
 create mode 100644 server/agent/prompts/big_model/select_option_tool.j2
 create mode 100644 server/agent/prompts/big_model/upload_file_tool.j2
 create mode 100644 server/agent/prompts/small_model/select_option_tool.j2
 create mode 100644 server/agent/prompts/small_model/upload_file_tool.j2
 create mode 100644 server/agent/tools/select_option_tool.py
 create mode 100644 server/agent/tools/upload_file_tool.py

diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 280660c..80261dd 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -32,6 +32,8 @@ import {
   performKeyboardType,
   performKeyboardPress,
   performResetMouse,
+  performSelectOption,
+  performUploadFilePending,
 } from '../commands/pixel-actions';
 import { clearScreenshotCache } from '../commands/computer';
 
@@ -1789,7 +1791,9 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
       case 'mouse_scroll':
       case 'keyboard_type':
       case 'keyboard_press':
-      case 'reset_mouse': {
+      case 'reset_mouse':
+      case 'select_option':
+      case 'upload_file_pending': {
         if (!command.conversation_id) {
           throw new Error(
             `conversation_id is required for ${command.type} command (strict mode)`,
@@ -1805,7 +1809,8 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         await tabManager.ensureTabManaged(activeTabId, conversationId);
         tabManager.updateTabActivity(activeTabId, conversationId);
 
-        let actionDetail: Record<string, unknown> = {};
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        let actionDetail: Record<string, any> = {};
         try {
           switch (command.type) {
             case 'mouse_move': {
@@ -1878,6 +1883,36 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
               actionDetail = r;
               break;
             }
+            case 'select_option': {
+              const raw = command.values;
+              const values = Array.isArray(raw)
+                ? raw.map((v: unknown) => String(v))
+                : raw !== undefined && raw !== null
+                  ? [String(raw)]
+                  : [];
+              const r = await performSelectOption(
+                activeTabId,
+                conversationId,
+                values,
+              );
+              actionDetail = r;
+              break;
+            }
+            case 'upload_file_pending': {
+              const raw = command.paths;
+              const paths = Array.isArray(raw)
+                ? raw.map((v: unknown) => String(v))
+                : raw !== undefined && raw !== null
+                  ? [String(raw)]
+                  : [];
+              const r = await performUploadFilePending(
+                activeTabId,
+                conversationId,
+                paths,
+              );
+              actionDetail = r;
+              break;
+            }
           }
         } catch (err) {
           throw new Error(
@@ -1894,7 +1929,9 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         const settleMs =
           command.type === 'mouse_click' ||
           command.type === 'mouse_drag' ||
-          command.type === 'keyboard_press'
+          command.type === 'keyboard_press' ||
+          command.type === 'select_option' ||
+          command.type === 'upload_file_pending'
             ? 350
             : 0;
         const cursorAfter =
diff --git a/extension/src/commands/debugger-manager.ts b/extension/src/commands/debugger-manager.ts
index 180338d..abb725e 100644
--- a/extension/src/commands/debugger-manager.ts
+++ b/extension/src/commands/debugger-manager.ts
@@ -219,6 +219,10 @@ export class DebuggerSessionManager {
             console.log(
               `🔧 [DebuggerManager] Tab ${tabId} already attached in session ${conversationId}`,
             );
+            // Re-assert unthrottle: in-page navigation can drop the
+            // emulation flags, so refreshing them on every command is
+            // cheap insurance against a slow background tab.
+            this.unthrottleRenderer(tabId);
             return true;
           }
         }
@@ -313,12 +317,60 @@ export class DebuggerSessionManager {
           console.log(
             `✅ [DebuggerManager] Debugger attached to tab ${tabId} in session ${conversationId}`,
           );
+
+          // The agent's tab usually sits in the background while the user
+          // works in their own foreground tab. Chrome aggressively throttles
+          // background renderers (timers, rAF, transitions, focus-gated
+          // handlers) which makes mouse/keyboard CDP events feel sluggish
+          // even though the input is dispatched promptly. Tell the renderer
+          // it is focused and active so it runs at foreground priority —
+          // both calls are CDP-only and never change OS-level tab focus.
+          this.unthrottleRenderer(tabId);
+
           resolve(true);
         }
       });
     });
   }
 
+  /**
+   * Marks the tab's renderer as focused and lifecycle-active over CDP so
+   * Chrome does not throttle it while the tab sits in the background.
+   * Best-effort: both commands are issued without being awaited, and any
+   * failure is only logged, so it cannot fail the attach.
+   *
+   * @param tabId Tab whose debugger session receives the two commands.
+   */
+  private unthrottleRenderer(tabId: number): void {
+    // CDP method/params pairs, dispatched in order. Neither call changes
+    // OS-level tab focus.
+    const commands: Array<[string, Record<string, unknown>]> = [
+      ['Emulation.setFocusEmulationEnabled', { enabled: true }],
+      ['Page.setWebLifecycleState', { state: 'active' }],
+    ];
+
+    commands.forEach(([method, params]) => {
+      try {
+        chrome.debugger.sendCommand({ tabId }, method, params, () => {
+          // The callback form reports failure via chrome.runtime.lastError.
+          const failure = chrome.runtime.lastError;
+          if (!failure) {
+            return;
+          }
+          console.warn(
+            `⚠️ [DebuggerManager] ${method} on tab ${tabId} failed: ${failure.message}`,
+          );
+        });
+      } catch (thrown) {
+        // A synchronous throw is logged; the remaining command still runs.
+        console.warn(
+          `⚠️ [DebuggerManager] ${method} on tab ${tabId} threw:`,
+          thrown,
+        );
+      }
+    });
+  }
+
   /**
    * Detach 指定 tab 的 debugger
    */
diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index f00a150..3672b82 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -195,6 +195,137 @@ export async function performMouseMove(
   return target;
 }
 
+export interface NativeFormControlHit {
+  kind: 'select' | 'file'; // which native control the click landed on
+  // `multiple` is reported for both kinds; `options` only for 'select'.
+  multiple?: boolean;
+  options?: Array<{
+    index: number; // position within the select's `options` list
+    value: string;
+    label: string; // trimmed text content of the <option>
+    disabled: boolean;
+    selected: boolean;
+  }>;
+  // For 'file': the input's `accept` attribute ('' when absent).
+  accept?: string;
+  // Common: `name` falls back to the element id; both are '' when unset.
+  name?: string;
+  ariaLabel?: string;
+}
+
+export interface MouseClickResult {
+  x: number; // click point X (viewport-clamped on the intercept path)
+  y: number; // click point Y (viewport-clamped on the intercept path)
+  button: string; // mouse button the click was requested with
+  warning?: string; // passed through from viewport clamping, when present
+  intercepted_form_control?: NativeFormControlHit; // set when the native click was suppressed
+}
+
+// Returns page-side JS that hit-tests (x, y). If the point is on an enabled
+// <select> or <input type="file"> (directly, or via a <label>), the script
+// tags it `data-ob-pending-form-target`, untags any other element, and
+// returns a descriptor; otherwise it returns null and leaves tags as-is.
+// The caller then skips the native press/release so the OS dropdown / file
+// picker never opens — neither renders into CDP screenshots. Follow-up
+// tools (`select_option`, `upload_file`) operate on the tagged element.
+function nativeFormControlProbeScript(x: number, y: number): string {
+  return `(function(px, py) {
+  function find(start) {
+    var node = start;
+    while (node && node !== document.body) {
+      if (node.tagName === 'SELECT' && !node.disabled) return node;
+      if (
+        node.tagName === 'INPUT' &&
+        (node.type || '').toLowerCase() === 'file' &&
+        !node.disabled
+      ) return node;
+      if (node.tagName === 'LABEL') {
+        var ctrl = node.control;
+        if (ctrl) {
+          if (ctrl.tagName === 'SELECT' && !ctrl.disabled) return ctrl;
+          if (
+            ctrl.tagName === 'INPUT' &&
+            (ctrl.type || '').toLowerCase() === 'file' &&
+            !ctrl.disabled
+          ) return ctrl;
+        }
+        var innerSel = node.querySelector('select:not([disabled])');
+        if (innerSel) return innerSel;
+        var innerFile = node.querySelector('input[type="file"]:not([disabled])');
+        if (innerFile) return innerFile;
+      }
+      node = node.parentElement;
+    }
+    return null;
+  }
+  var hit = document.elementFromPoint(px, py);
+  if (!hit) return null;
+  var target = find(hit);
+  if (!target) return null;
+  // Clear stale markers (only one pending target per page at a time).
+  var stale = document.querySelectorAll('[data-ob-pending-form-target]');
+  for (var i = 0; i < stale.length; i++) {
+    if (stale[i] !== target) stale[i].removeAttribute('data-ob-pending-form-target');
+  }
+  if (target.tagName === 'SELECT') {
+    target.setAttribute('data-ob-pending-form-target', 'select');
+    var opts = [];
+    for (var j = 0; j < target.options.length; j++) {
+      var o = target.options[j];
+      opts.push({
+        index: j,
+        value: o.value,
+        label: (o.textContent || '').trim(),
+        disabled: !!o.disabled,
+        selected: !!o.selected,
+      });
+    }
+    return {
+      kind: 'select',
+      multiple: !!target.multiple,
+      name: target.name || target.id || '',
+      ariaLabel: target.getAttribute('aria-label') || '',
+      options: opts,
+    };
+  }
+  target.setAttribute('data-ob-pending-form-target', 'file');
+  return {
+    kind: 'file',
+    name: target.name || target.id || '',
+    accept: target.getAttribute('accept') || '',
+    multiple: !!target.multiple,
+    ariaLabel: target.getAttribute('aria-label') || '',
+  };
+})(${x}, ${y})`;
+}
+
+// Evaluates the native form-control probe at (x, y) in the page. Resolves to
+// the hit descriptor, or null on a miss or a (logged) CDP failure.
+async function detectNativeFormControl(
+  cdp: CdpCommander,
+  x: number,
+  y: number,
+): Promise<NativeFormControlHit | null> {
+  type ProbeReply = { result?: { value?: NativeFormControlHit | null } };
+  try {
+    const probe = {
+      expression: nativeFormControlProbeScript(x, y),
+      returnByValue: true,
+    };
+    const reply = await cdp.sendCommand<ProbeReply>(
+      'Runtime.evaluate',
+      probe,
+      8000,
+      0,
+    );
+    return reply?.result?.value ?? null;
+  } catch (probeError) {
+    const tag = '[PixelActions] Native form-control hit-test failed:';
+    console.warn(tag, probeError);
+    return null;
+  }
+}
+
 export async function performMouseClick(
   tabId: number,
   conversationId: string,
@@ -207,7 +338,7 @@ export async function performMouseClick(
   _y: number | undefined,
   button: 'left' | 'right' | 'middle' = 'left',
   count: number = 1,
-): Promise<{ x: number; y: number; button: string; warning?: string }> {
+): Promise<MouseClickResult> {
   await attachWithDialogTracking(tabId, conversationId);
   const cdp = new CdpCommander(tabId);
   const { width: vw, height: vh } = await getViewport(cdp);
@@ -220,6 +351,26 @@ export async function performMouseClick(
     (await resolveCursorOrCenter(tabId, conversationId));
   const clamped = clampToViewport(cursor.x, cursor.y, vw, vh);
 
+  // Intercept native <select> / <input type=file> before the click
+  // commits — only on a left single-click, where the OS-level UI would
+  // otherwise pop. Right-clicks (context menus) and double/triple-clicks
+  // pass through unchanged.
+  if (button === 'left' && (count | 0) <= 1) {
+    const hit = await detectNativeFormControl(cdp, clamped.x, clamped.y);
+    if (hit) {
+      // Refresh the cursor sprite at the click point so the agent's next
+      // screenshot matches the position they targeted.
+      await refreshCursor(cdp, tabId, clamped.x, clamped.y);
+      return {
+        x: clamped.x,
+        y: clamped.y,
+        button,
+        warning: clamped.warning,
+        intercepted_form_control: hit,
+      };
+    }
+  }
+
   const cdpButton: 'left' | 'right' | 'middle' = button;
   const buttons = button === 'left' ? 1 : button === 'right' ? 2 : 4;
   const safeCount = Math.max(1, Math.min(3, count | 0));
@@ -691,3 +842,187 @@ export async function performResetMouse(
   await refreshCursor(cdp, tabId, cx, cy);
   return { x: cx, y: cy };
 }
+
+export interface SelectOptionResult {
+  ok: boolean; // true once the option(s) were applied and events dispatched
+  selected?: string[]; // on success: `value` of each option now selected
+  error?: string; // on failure: no_pending_select | option_not_found | no_result | eval_failed: <msg>
+  wanted?: string; // option_not_found only: the requested value(s)
+  available?: string[]; // option_not_found only: trimmed labels of every option
+}
+
+/**
+ * Choose option(s) on the `<select>` most recently intercepted by a
+ * `mouse_click`, then dispatch `input` + `change` so the page's listeners
+ * run as if a human picked from the dropdown, and clear the pending marker.
+ *
+ * Single-select uses `values[0]` only and matches it, in order, against the
+ * exact `value`, the exact trimmed label, then a case-insensitive label
+ * substring. Multi-select selects every option whose `value` or trimmed
+ * label equals one of `values` (no substring fallback) and deselects the
+ * rest. On `option_not_found` the current selection is left untouched.
+ */
+export async function performSelectOption(
+  tabId: number,
+  conversationId: string,
+  values: string[],
+): Promise<SelectOptionResult> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const expr = `(function(values){
+    var el = document.querySelector('[data-ob-pending-form-target="select"]');
+    if (!el || el.tagName !== 'SELECT') {
+      return { ok: false, error: 'no_pending_select' };
+    }
+    var matched = [];
+    if (el.multiple) {
+      var picks = [], labels = []; // decide first; mutate only after a confirmed match
+      for (var i = 0; i < el.options.length; i++) {
+        var opt = el.options[i];
+        var label = (opt.textContent || '').trim();
+        var hit = values.indexOf(opt.value) >= 0 || values.indexOf(label) >= 0;
+        labels.push(label); picks.push(hit);
+        if (hit) matched.push(opt.value);
+      }
+      if (matched.length === 0) {
+        return { ok: false, error: 'option_not_found', wanted: values.join(','), available: labels };
+      }
+      for (var m = 0; m < picks.length; m++) el.options[m].selected = picks[m];
+    } else {
+      var want = values[0];
+      var idx = -1;
+      for (var i2 = 0; i2 < el.options.length; i2++) {
+        if (el.options[i2].value === want) { idx = i2; break; }
+      }
+      if (idx === -1) {
+        for (var i3 = 0; i3 < el.options.length; i3++) {
+          if ((el.options[i3].textContent || '').trim() === want) { idx = i3; break; }
+        }
+      }
+      if (idx === -1 && want) {
+        var wl = String(want).toLowerCase();
+        for (var i4 = 0; i4 < el.options.length; i4++) {
+          if ((el.options[i4].textContent || '').toLowerCase().indexOf(wl) >= 0) { idx = i4; break; }
+        }
+      }
+      if (idx === -1) {
+        var avail = [];
+        for (var k2 = 0; k2 < el.options.length; k2++) avail.push((el.options[k2].textContent || '').trim());
+        return { ok: false, error: 'option_not_found', wanted: String(want || ''), available: avail };
+      }
+      el.selectedIndex = idx;
+      matched.push(el.options[idx].value);
+    }
+    el.dispatchEvent(new Event('input', { bubbles: true }));
+    el.dispatchEvent(new Event('change', { bubbles: true }));
+    el.removeAttribute('data-ob-pending-form-target');
+    return { ok: true, selected: matched };
+  })(${JSON.stringify(values)})`;
+  try {
+    const r = await cdp.sendCommand<{ result?: { value?: SelectOptionResult } }>(
+      'Runtime.evaluate',
+      { expression: expr, returnByValue: true },
+      8000,
+      0,
+    );
+    return r?.result?.value ?? { ok: false, error: 'no_result' };
+  } catch (err) {
+    return {
+      ok: false,
+      error: `eval_failed: ${err instanceof Error ? err.message : String(err)}`,
+    };
+  }
+}
+
+export interface UploadFilePendingResult {
+  ok: boolean; // true only when DOM.setFileInputFiles succeeded
+  paths?: string[]; // on success: the paths that were attached, as given
+  error?: string; // on failure: no_pending_file_input | cannot_resolve_backend_node | lookup_failed: … | describe_node_failed: … | set_files_failed: …
+}
+
+/**
+ * Attach files to the `<input type="file">` most recently intercepted by
+ * a `mouse_click`. Uses CDP `DOM.setFileInputFiles`, which bypasses the
+ * native OS file picker entirely. Paths must exist on the host running
+ * Chrome (same machine as the server in the v1 setup). Failed CDP steps
+ * resolve to `{ ok: false, error }`; once the files are attached the result
+ * is `ok: true` even if the marker cleanup / event dispatch fails (logged).
+ */
+export async function performUploadFilePending(
+  tabId: number,
+  conversationId: string,
+  paths: string[],
+): Promise<UploadFilePendingResult> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const fail = (step: string, err: unknown): UploadFilePendingResult => ({
+    ok: false,
+    error: `${step}: ${err instanceof Error ? err.message : String(err)}`,
+  });
+  let objectId: string | undefined; // remote handle to the tagged element
+  try {
+    const findRes = await cdp.sendCommand<{ result?: { objectId?: string } }>(
+      'Runtime.evaluate',
+      {
+        expression: `document.querySelector('[data-ob-pending-form-target="file"]')`,
+        returnByValue: false,
+      },
+      8000,
+      0,
+    );
+    objectId = findRes?.result?.objectId;
+  } catch (err) {
+    return fail('lookup_failed', err);
+  }
+  if (!objectId) {
+    return { ok: false, error: 'no_pending_file_input' };
+  }
+  let backendNodeId: number | undefined; // id passed to DOM.setFileInputFiles
+  try {
+    const desc = await cdp.sendCommand<{ node?: { backendNodeId?: number } }>(
+      'DOM.describeNode',
+      { objectId },
+      8000,
+      0,
+    );
+    backendNodeId = desc?.node?.backendNodeId;
+  } catch (err) {
+    return fail('describe_node_failed', err);
+  }
+  if (!backendNodeId) {
+    return { ok: false, error: 'cannot_resolve_backend_node' };
+  }
+  try {
+    await cdp.sendCommand(
+      'DOM.setFileInputFiles',
+      { backendNodeId, files: paths },
+      15000,
+      0,
+    );
+  } catch (err) {
+    return fail('set_files_failed', err);
+  }
+  // Clear marker and dispatch the form events the page's listeners expect.
+  // Best-effort: the files are already attached, so a failure is only logged.
+  try {
+    await cdp.sendCommand(
+      'Runtime.evaluate',
+      {
+        expression: `(function(){
+        var el = document.querySelector('[data-ob-pending-form-target="file"]');
+        if (el) {
+          el.removeAttribute('data-ob-pending-form-target');
+          try { el.dispatchEvent(new Event('input', { bubbles: true })); } catch (e) {}
+          try { el.dispatchEvent(new Event('change', { bubbles: true })); } catch (e) {}
+        }
+      })()`,
+        returnByValue: true,
+      },
+      8000,
+      0,
+    );
+  } catch (err) {
+    console.warn('[PixelActions] Post-upload marker cleanup failed:', err);
+  }
+  return { ok: true, paths };
+}
diff --git a/extension/src/types.ts b/extension/src/types.ts
index f0dc526..5cc2118 100644
--- a/extension/src/types.ts
+++ b/extension/src/types.ts
@@ -77,6 +77,16 @@ export interface KeyboardPressCommand extends BaseCommand {
   modifiers?: string[];
 }
 
+export interface SelectOptionCommand extends BaseCommand {
+  type: 'select_option';
+  values: string[]; // option `value` attributes or visible labels to select
+}
+
+export interface UploadFilePendingCommand extends BaseCommand {
+  type: 'upload_file_pending';
+  paths: string[]; // absolute file paths to attach to the pending file input
+}
+
 export interface ScreenshotCommand extends BaseCommand {
   type: 'screenshot';
   tab_id?: number;
@@ -318,6 +328,8 @@ export type Command =
   | ResetMouseCommand
   | KeyboardTypeCommand
   | KeyboardPressCommand
+  | SelectOptionCommand
+  | UploadFilePendingCommand
   | ScreenshotCommand
   | TabCommand
   | GetTabsCommand
diff --git a/server/agent/api.py b/server/agent/api.py
index 0e59450..ebf6186 100644
--- a/server/agent/api.py
+++ b/server/agent/api.py
@@ -497,14 +497,17 @@ def initialize_agent():
         # Import the old OpenBrowserTool for backward compatibility
         # logger.info("OpenBrowserTool registered (deprecated, for backward compatibility)")
 
-        # Tools exposed to the agent: tab, mouse, keyboard, dialog. The
-        # legacy highlight + element_interaction modules are imported only
-        # to keep them importable for non-agent flows; they are not in the
-        # agent's toolset.
+        # Tools exposed to the agent: tab, mouse, keyboard, dialog,
+        # select_option, upload_file. The legacy highlight +
+        # element_interaction modules are imported only to keep them
+        # importable for non-agent flows; they are not in the agent's
+        # toolset.
         from .tools.tab_tool import TabTool
         from .tools.dialog_tool import DialogTool
         from .tools.mouse_tool import MouseTool
         from .tools.keyboard_tool import KeyboardTool
+        from .tools.select_option_tool import SelectOptionTool  # noqa: F401
+        from .tools.upload_file_tool import UploadFileTool  # noqa: F401
         # Imported for legacy tooling (routine recording) — not registered
         # for the live agent.
         from .tools.highlight_tool import HighlightTool  # noqa: F401
@@ -513,8 +516,8 @@ def initialize_agent():
         )
 
         logger.info(
-            "4 OpenBrowser tools registered for the agent: "
-            "tab, mouse, keyboard, dialog"
+            "6 OpenBrowser tools registered for the agent: "
+            "tab, mouse, keyboard, dialog, select_option, upload_file"
         )
 
     except Exception as e:
diff --git a/server/agent/manager.py b/server/agent/manager.py
index d59c726..cf287a9 100644
--- a/server/agent/manager.py
+++ b/server/agent/manager.py
@@ -102,6 +102,8 @@ def __init__(self, multi_process_mode: bool = False):
             Tool(name="mouse"),  # Virtual mouse: move/click/drag/scroll/reset
             Tool(name="keyboard"),  # Virtual keyboard: type/press
             Tool(name="dialog"),  # Browser dialog handling
+            Tool(name="select_option"),  # Pick from a native <select>
+            Tool(name="upload_file"),  # Attach files to <input type=file>
         ]
         self.general_tools = [
             Tool(name=PLEASE_HELP_ME_TOOL_NAME),
diff --git a/server/agent/prompts/big_model/select_option_tool.j2 b/server/agent/prompts/big_model/select_option_tool.j2
new file mode 100644
index 0000000..1307226
--- /dev/null
+++ b/server/agent/prompts/big_model/select_option_tool.j2
@@ -0,0 +1,34 @@
+# Select Option Tool
+
+Pick option(s) on a native `<select>` after clicking it.
+
+## When to use
+
+Some dropdowns are native `<select>` elements: clicking one would normally open an OS-level menu that does not appear in the screenshot. When you click such a dropdown, the click is intercepted and the observation lists the available options instead of opening the menu. Call `select_option` with the value or label you want.
+
+## Action
+
+```json
+{ "values": ["option-value"] }
+```
+
+`values`: list of option values or visible labels. Each entry matches in this order — exact `value` attribute, exact label, case-insensitive substring of the label. Pass one entry for a normal `<select>`, multiple for `<select multiple>`.
+
+## Pattern
+
+1. `move` the cursor over the dropdown.
+2. `click`. The observation lists the dropdown's options.
+3. `select_option` with the desired entry.
+
+## Examples
+
+```json
+{ "values": ["us"] }
+{ "values": ["United States"] }
+{ "values": ["red", "blue"] }
+```
+
+## Notes
+
+- This tool only works after a click intercepted a native `<select>`. For custom dropdowns built from divs, use `mouse` to click options visually.
+- The page's `change` event fires after selection, so dependent UI updates run as if a person picked the option.
diff --git a/server/agent/prompts/big_model/upload_file_tool.j2 b/server/agent/prompts/big_model/upload_file_tool.j2
new file mode 100644
index 0000000..1982dde
--- /dev/null
+++ b/server/agent/prompts/big_model/upload_file_tool.j2
@@ -0,0 +1,33 @@
+# Upload File Tool
+
+Attach file(s) to a native `<input type="file">` after clicking it.
+
+## When to use
+
+Clicking an upload control would normally open an OS-level file picker, which does not render into screenshots. When you click such a control, the click is intercepted and the observation reports that a native `<input type=file>` was clicked. Call `upload_file` with the path(s) you want to attach.
+
+## Action
+
+```json
+{ "paths": ["/absolute/path/to/file.pdf"] }
+```
+
+`paths`: list of absolute file paths on the host running the browser. One entry for a normal upload, multiple for `<input type=file multiple>`.
+
+## Pattern
+
+1. `move` the cursor over the upload button or drop zone.
+2. `click`. The observation reports that a native `<input type=file>` was clicked.
+3. `upload_file` with the absolute path(s).
+
+## Examples
+
+```json
+{ "paths": ["/tmp/resume.pdf"] }
+{ "paths": ["/tmp/photo1.jpg", "/tmp/photo2.jpg"] }
+```
+
+## Notes
+
+- Paths must exist on the same machine that runs Chrome.
+- The page's `change` event fires after attaching, so any UI that depends on a file being selected (preview, validation, submit-button enable) updates immediately.
diff --git a/server/agent/prompts/small_model/select_option_tool.j2 b/server/agent/prompts/small_model/select_option_tool.j2
new file mode 100644
index 0000000..637d639
--- /dev/null
+++ b/server/agent/prompts/small_model/select_option_tool.j2
@@ -0,0 +1,26 @@
+# Select Option Tool
+
+Pick option(s) on a native `<select>` after clicking it.
+
+Native dropdowns render in OS chrome and do not appear in screenshots. When `mouse click` lands on a `<select>`, the observation lists the options. Then call `select_option` with the value or label.
+
+## Action
+
+```json
+{ "values": ["option-value"] }
+```
+
+`values`: list of option values or labels. Match order — exact `value`, exact label, case-insensitive substring of label. One entry for normal `<select>`, multiple for `<select multiple>`.
+
+## Pattern
+
+1. `mouse move` over the dropdown.
+2. `mouse click`. The observation lists options.
+3. `select_option` with the desired entry.
+
+## Examples
+
+```json
+{ "values": ["United States"] }
+{ "values": ["red", "blue"] }
+```
diff --git a/server/agent/prompts/small_model/upload_file_tool.j2 b/server/agent/prompts/small_model/upload_file_tool.j2
new file mode 100644
index 0000000..4a979b7
--- /dev/null
+++ b/server/agent/prompts/small_model/upload_file_tool.j2
@@ -0,0 +1,26 @@
+# Upload File Tool
+
+Attach file(s) to a native `<input type="file">` after clicking it.
+
+OS file pickers do not appear in screenshots. When `mouse click` lands on an upload control, the observation reports that a native `<input type=file>` was clicked. Then call `upload_file` with absolute path(s).
+
+## Action
+
+```json
+{ "paths": ["/absolute/path/to/file.pdf"] }
+```
+
+`paths`: absolute file paths on the host running the browser. One entry for a normal upload, multiple for `<input type=file multiple>`.
+
+## Pattern
+
+1. `mouse move` over the upload button.
+2. `mouse click`. The observation reports that a native `<input type=file>` was clicked.
+3. `upload_file` with absolute path(s).
+
+## Examples
+
+```json
+{ "paths": ["/tmp/resume.pdf"] }
+{ "paths": ["/tmp/photo1.jpg", "/tmp/photo2.jpg"] }
+```
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index a8311b5..625089c 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -47,6 +47,8 @@
     KeyboardTypeCommand,
     KeyboardPressCommand,
     ResetMouseCommand,
+    SelectOptionCommand,
+    UploadFilePendingCommand,
     MouseButton,
     ScrollDirection,
 )
@@ -58,6 +60,8 @@
 from server.agent.tools.dialog_tool import DialogHandleAction
 from server.agent.tools.mouse_tool import MouseAction
 from server.agent.tools.keyboard_tool import KeyboardAction
+from server.agent.tools.select_option_tool import SelectOptionAction
+from server.agent.tools.upload_file_tool import UploadFileAction
 
 from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
 from server.core.llm_config import llm_config_manager
@@ -304,6 +308,10 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
                 return self._execute_mouse_action(action)
             elif isinstance(action, KeyboardAction):
                 return self._execute_keyboard_action(action)
+            elif isinstance(action, SelectOptionAction):
+                return self._execute_select_option_action(action)
+            elif isinstance(action, UploadFileAction):
+                return self._execute_upload_file_action(action)
             else:
                 raise ValueError(f"Unknown action type: {type(action).__name__}")
 
@@ -1066,11 +1074,22 @@ def _execute_mouse_action(
                     conversation_id=self.conversation_id,
                 )
                 result_dict = self._execute_command_sync(command)
-                return self._build_observation_from_result(
-                    result_dict,
+                message = (
                     f"Clicked {action.button} at the cursor "
-                    f"(count={action.count})",
+                    f"(count={action.count})"
                 )
+                # When the click landed on a native <select> or
+                # <input type=file>, the extension suppresses the OS-level
+                # popup and returns descriptor metadata. Surface it in the
+                # observation message so the agent knows to follow up with
+                # `select_option` or `upload_file` (the option list / file
+                # input metadata is invisible in the screenshot).
+                intercepted = self._extract_intercepted_form_control(result_dict)
+                if intercepted:
+                    message = self._format_intercepted_message(
+                        intercepted, action.button, action.count
+                    )
+                return self._build_observation_from_result(result_dict, message)
 
             if kind == "drag":
                 if (
@@ -1197,6 +1216,195 @@ def _execute_keyboard_action(
                 success=False, error=str(e), small_model=self._uses_small_model()
             )
 
+    @staticmethod
+    def _extract_intercepted_form_control(
+        result_dict: Optional[Dict[str, Any]],
+    ) -> Optional[Dict[str, Any]]:
+        """Return the click response's `intercepted_form_control` dict, if any.
+
+        The descriptor is looked up first under the nested `data` dict and
+        then at the top level of the response; the first candidate that is
+        a dict wins. Returns None for an empty response or when neither
+        location holds a dict.
+        """
+        if not result_dict:
+            return None
+        nested = result_dict.get("data")
+        # The nested location takes precedence over the top-level fallback.
+        containers = ([nested] if isinstance(nested, dict) else []) + [result_dict]
+        for container in containers:
+            descriptor = container.get("intercepted_form_control")
+            if isinstance(descriptor, dict):
+                return descriptor
+        return None
+
+    @staticmethod
+    def _format_intercepted_message(
+        ifc: Dict[str, Any], button: str, count: int
+    ) -> str:
+        """Build the observation text for a click intercepted on a native control.
+
+        `select`: lists each option (value, label, selected/disabled tags) and
+        points at `select_option`. `file`: reports accept/multiple and points
+        at `upload_file`. Other kinds: plain click text plus the raw kind.
+        """
+        kind = ifc.get("kind")
+        name = (ifc.get("name") or "").strip()
+        aria = (ifc.get("ariaLabel") or "").strip()
+        ident = " ".join(
+            f"{k}={v!r}" for k, v in (("name", name), ("aria-label", aria)) if v
+        )
+        ident_suffix = f" ({ident})" if ident else ""  # e.g. " (name='country')"
+        if kind == "select":
+            multiple = bool(ifc.get("multiple"))
+            options = ifc.get("options") or []
+            if not isinstance(options, list):
+                options = []
+            lines = []
+            for o in options:
+                if not isinstance(o, dict):
+                    continue  # ignore entries that are not dicts
+                value = o.get("value", "")
+                label = (o.get("label") or "").strip()
+                tags = []
+                if o.get("selected"):
+                    tags.append("selected")
+                if o.get("disabled"):
+                    tags.append("disabled")
+                tag_str = f" [{', '.join(tags)}]" if tags else ""
+                lines.append(f"  - value={value!r} label={label!r}{tag_str}")
+            options_block = "\n".join(lines) if lines else "  (no options found)"
+            kind_word = "multi-select" if multiple else "select"
+            return (
+                f"Clicked {button} on a native <{kind_word}>{ident_suffix}; "
+                f"the OS dropdown does not render in screenshots, so the "
+                f"options are listed below. Pick one or more by calling "
+                f"`select_option` with the desired `value` or label.\n"
+                f"options:\n{options_block}"
+            )
+        if kind == "file":
+            accept = (ifc.get("accept") or "").strip()
+            multiple = bool(ifc.get("multiple"))
+            extras = []
+            if accept:
+                extras.append(f"accept={accept!r}")
+            if multiple:
+                extras.append("multiple")
+            extras_str = f" ({', '.join(extras)})" if extras else ""
+            return (
+                f"Clicked {button} on a native <input type=file>"
+                f"{ident_suffix}{extras_str}; the OS file picker does not "
+                f"render in screenshots. Call `upload_file` with the "
+                f"absolute path(s) to attach the file(s)."
+            )
+        # Unknown kind — fall back to the plain click message but include the
+        # raw kind so the agent isn't completely blind. Only use of `count`.
+        return (
+            f"Clicked {button} at the cursor (count={count}); intercepted "
+            f"native control of unknown kind={kind!r}"
+        )
+
+    def _execute_select_option_action(
+        self, action: SelectOptionAction
+    ) -> OpenBrowserObservation:
+        """Send `select_option` for the requested values and report the result."""
+        try:
+            requested = action.values
+            response = self._execute_command_sync(
+                SelectOptionCommand(
+                    values=list(requested), conversation_id=self.conversation_id
+                )
+            )
+            head = ", ".join(requested[:3])
+            overflow = f", … ({len(requested)} total)" if len(requested) > 3 else ""
+            text = self._format_select_option_message(
+                response, requested, head + overflow
+            )
+            return self._build_observation_from_result(response, text)
+        except Exception as exc:
+            logger.error(f"select_option failed: {exc}", exc_info=True)
+            return OpenBrowserObservation(
+                success=False, error=str(exc), small_model=self._uses_small_model()
+            )
+
+    @staticmethod
+    def _format_select_option_message(
+        result_dict: Optional[Dict[str, Any]],
+        values: list,
+        preview: str,
+    ) -> str:
+        """Render a select_option response: success, missing target, or no match."""
+        payload = (result_dict or {}).get("data") or {}
+        error = payload.get("error")
+        if payload.get("ok") is not False and not error:
+            # Success: echo the response's `selected` list, else the request.
+            chosen = payload.get("selected") or values
+            head = ", ".join(str(item) for item in chosen[:3])
+            tail = f", … ({len(chosen)} total)" if len(chosen) > 3 else ""
+            return f"Selected option(s): {head}{tail} (requested: {preview})"
+        if error == "no_pending_select":
+            return (
+                "select_option had no pending target. Click a native "
+                "`<select>` first — its options are listed in that "
+                "click's observation — then call select_option."
+            )
+        if error != "option_not_found":
+            return f"select_option failed: {error!r}"
+        wanted = payload.get("wanted") or ", ".join(values)
+        available = payload.get("available") or []
+        listed = available[:30]
+        rows = [f"  - {label!r}" for label in listed]
+        if len(available) > len(listed):
+            rows.append(f"  …({len(available) - len(listed)} more)")
+        block = "\n".join(rows)
+        return (
+            f"select_option could not match {wanted!r}. Available "
+            f"option labels:\n{block}"
+        )
+
+    def _execute_upload_file_action(
+        self, action: UploadFileAction
+    ) -> OpenBrowserObservation:
+        """Send `upload_file_pending` for the given paths and report the result."""
+        try:
+            requested = action.paths
+            response = self._execute_command_sync(
+                UploadFilePendingCommand(
+                    paths=list(requested), conversation_id=self.conversation_id
+                )
+            )
+            head = ", ".join(requested[:2])
+            overflow = f", … ({len(requested)} total)" if len(requested) > 2 else ""
+            text = self._format_upload_file_message(
+                response, requested, head + overflow
+            )
+            return self._build_observation_from_result(response, text)
+        except Exception as exc:
+            logger.error(f"upload_file failed: {exc}", exc_info=True)
+            return OpenBrowserObservation(
+                success=False, error=str(exc), small_model=self._uses_small_model()
+            )
+
+    @staticmethod
+    def _format_upload_file_message(
+        result_dict: Optional[Dict[str, Any]],
+        paths: list,
+        preview: str,
+    ) -> str:
+        """Render an upload_file response; `paths` is accepted but not read."""
+        payload = (result_dict or {}).get("data") or {}
+        error = payload.get("error")
+        # Success means `ok` is not an explicit False and no error is set.
+        if payload.get("ok") is not False and not error:
+            return f"Uploaded file(s): {preview}"
+        if error != "no_pending_file_input":
+            return f"upload_file failed: {error!r}"
+        return (
+            "upload_file had no pending target. Click an "
+            "<input type=file> (or its visible label/button) "
+            "first, then call upload_file."
+        )
+
     def _execute_dialog_action(
         self, action: DialogHandleAction
     ) -> OpenBrowserObservation:
diff --git a/server/agent/tools/select_option_tool.py b/server/agent/tools/select_option_tool.py
new file mode 100644
index 0000000..4863df2
--- /dev/null
+++ b/server/agent/tools/select_option_tool.py
@@ -0,0 +1,89 @@
+"""SelectOptionTool — choose option(s) on a focused native `<select>`.
+
+Native `<select>` dropdowns render in OS chrome rather than in the page
+DOM, so they don't appear in CDP screenshots when opened. The flow is:
+the agent moves the cursor over the select and clicks; the click is
+intercepted and the available options are returned in the observation;
+the agent then calls `select_option` with the desired value or label.
+"""
+
+from collections.abc import Sequence
+from typing import List
+
+from openhands.sdk.tool import (
+    ToolDefinition,
+    ToolAnnotations,
+    register_tool,
+)
+from pydantic import Field
+
+from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
+from server.agent.tools.prompt_context import get_prompt_render_context
+from server.agent.tools.prompt_loader import render_tool_prompt
+
+
+def get_select_option_tool_description(conv_state=None) -> str:
+    """Render the `select_option_tool.j2` template into the tool description."""
+    template_name = "select_option_tool.j2"
+    render_context = get_prompt_render_context(conv_state)
+    return render_tool_prompt(
+        template_name, conv_state, context=render_context
+    )
+
+
+class SelectOptionAction(OpenBrowserAction):
+    """Choose option(s) on the native `<select>` focused by the previous click.
+
+    Match order: exact `value` attribute → exact visible label → case-
+    insensitive substring of the label. Pass multiple entries for
+    `<select multiple>`.
+    """
+
+    values: List[str] = Field(
+        description=(
+            "Option(s) to select. Single entry for a normal `<select>`, "
+            "multiple entries for `<select multiple>`. Each entry may be an "
+            "option's `value` attribute or its visible label."
+        ),
+        min_length=1,  # at least one entry is required
+        max_length=50,  # same cap as SelectOptionCommand.values
+    )
+
+
+class SelectOptionTool(
+    ToolDefinition[SelectOptionAction, OpenBrowserObservation]
+):
+    """Pick from a native `<select>` after clicking it."""
+
+    name = "select_option"
+
+    @classmethod
+    def create(
+        cls, conv_state, terminal_executor=None
+    ) -> Sequence["SelectOptionTool"]:
+        # An explicitly supplied executor wins; otherwise look one up by id.
+        executor = terminal_executor
+        if executor is None:
+            conv_id = getattr(conv_state, "id", None)
+            from server.agent.tools.browser_executor import get_browser_executor
+            executor = get_browser_executor(conv_id)
+
+        prompt = get_select_option_tool_description(conv_state)
+        hints = ToolAnnotations(
+            title="SelectOption",
+            readOnlyHint=False,
+            destructiveHint=False,
+            idempotentHint=False,
+            openWorldHint=True,
+        )
+        tool = cls(
+            description=prompt,
+            action_type=SelectOptionAction,
+            observation_type=OpenBrowserObservation,
+            annotations=hints,
+            executor=executor,
+        )
+        return [tool]
+
+
+register_tool("select_option", SelectOptionTool.create)
diff --git a/server/agent/tools/upload_file_tool.py b/server/agent/tools/upload_file_tool.py
new file mode 100644
index 0000000..88f5e92
--- /dev/null
+++ b/server/agent/tools/upload_file_tool.py
@@ -0,0 +1,84 @@
+"""UploadFileTool — attach files to a focused `<input type="file">`.
+
+OS file pickers don't render into CDP screenshots, so the flow is: the
+agent clicks the upload control, the click is intercepted and focuses
+the underlying file input, then `upload_file` attaches files via
+`DOM.setFileInputFiles`. Paths must be absolute and exist on the host
+running Chrome.
+"""
+
+from collections.abc import Sequence
+from typing import List
+
+from openhands.sdk.tool import (
+    ToolDefinition,
+    ToolAnnotations,
+    register_tool,
+)
+from pydantic import Field
+
+from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
+from server.agent.tools.prompt_context import get_prompt_render_context
+from server.agent.tools.prompt_loader import render_tool_prompt
+
+
+def get_upload_file_tool_description(conv_state=None) -> str:
+    """Render the `upload_file_tool.j2` template into the tool description."""
+    template_name = "upload_file_tool.j2"
+    render_context = get_prompt_render_context(conv_state)
+    return render_tool_prompt(
+        template_name, conv_state, context=render_context
+    )
+
+
+class UploadFileAction(OpenBrowserAction):
+    """Attach file(s) to the native file input focused by the previous click."""
+
+    paths: List[str] = Field(
+        description=(
+            "Absolute file paths to attach. One entry for a normal upload, "
+            "multiple entries for `<input type=file multiple>`. Paths must "
+            "exist on the host running the browser."
+        ),
+        min_length=1,  # at least one path is required
+        max_length=20,  # same cap as UploadFilePendingCommand.paths
+    )
+
+
+class UploadFileTool(
+    ToolDefinition[UploadFileAction, OpenBrowserObservation]
+):
+    """Upload file(s) to a native file input after clicking it."""
+
+    name = "upload_file"
+
+    @classmethod
+    def create(
+        cls, conv_state, terminal_executor=None
+    ) -> Sequence["UploadFileTool"]:
+        # An explicitly supplied executor wins; otherwise look one up by id.
+        executor = terminal_executor
+        if executor is None:
+            conv_id = getattr(conv_state, "id", None)
+            from server.agent.tools.browser_executor import get_browser_executor
+            executor = get_browser_executor(conv_id)
+
+        prompt = get_upload_file_tool_description(conv_state)
+        hints = ToolAnnotations(
+            title="UploadFile",
+            readOnlyHint=False,
+            destructiveHint=False,
+            idempotentHint=False,
+            openWorldHint=True,
+        )
+        tool = cls(
+            description=prompt,
+            action_type=UploadFileAction,
+            observation_type=OpenBrowserObservation,
+            annotations=hints,
+            executor=executor,
+        )
+        return [tool]
+
+
+register_tool("upload_file", UploadFileTool.create)
diff --git a/server/core/processor.py b/server/core/processor.py
index a16dc4f..30b2ff4 100644
--- a/server/core/processor.py
+++ b/server/core/processor.py
@@ -16,6 +16,8 @@
     ResetMouseCommand,
     KeyboardTypeCommand,
     KeyboardPressCommand,
+    SelectOptionCommand,
+    UploadFilePendingCommand,
     ScreenshotCommand,
     TabCommand,
     GetTabsCommand,
@@ -138,6 +140,8 @@ def _prepare_command_dict(self, command: Command) -> dict:
             ResetMouseCommand,
             KeyboardTypeCommand,
             KeyboardPressCommand,
+            SelectOptionCommand,
+            UploadFilePendingCommand,
             JavascriptExecuteCommand,
             HandleDialogCommand,
         )
@@ -236,6 +240,10 @@ async def execute(self, command: Command) -> CommandResponse:
                 return await self._execute_keyboard_type(command)
             elif isinstance(command, KeyboardPressCommand):
                 return await self._execute_keyboard_press(command)
+            elif isinstance(command, SelectOptionCommand):
+                return await self._execute_select_option(command)
+            elif isinstance(command, UploadFilePendingCommand):
+                return await self._execute_upload_file_pending(command)
             elif isinstance(command, ScreenshotCommand):
                 return await self._execute_screenshot(command)
             elif isinstance(command, TabCommand):
@@ -329,6 +337,20 @@ async def _execute_keyboard_press(
         response = await self._send_prepared_command(command)
         return response
 
+    async def _execute_select_option(
+        self, command: SelectOptionCommand
+    ) -> CommandResponse:
+        """Run `select_option` by delegating to `_send_prepared_command`."""
+        # Targets the pending `<select>`; nothing else happens in this method.
+        return await self._send_prepared_command(command)
+
+    async def _execute_upload_file_pending(
+        self, command: UploadFilePendingCommand
+    ) -> CommandResponse:
+        """Run `upload_file_pending` by delegating to `_send_prepared_command`."""
+        # Targets the pending file input; nothing else happens in this method.
+        return await self._send_prepared_command(command)
+
     async def _execute_screenshot(self, command: ScreenshotCommand) -> CommandResponse:
         """Execute screenshot command"""
         response = await self._send_prepared_command(command)
diff --git a/server/models/commands.py b/server/models/commands.py
index 5eb5af5..d894ea0 100644
--- a/server/models/commands.py
+++ b/server/models/commands.py
@@ -178,6 +178,51 @@ class KeyboardPressCommand(BaseCommand):
     )
 
 
+class SelectOptionCommand(BaseCommand):
+    """Choose option(s) on a `<select>` previously focused by `mouse_click`.
+
+    The agent's preceding `mouse_click` lands on a native `<select>`, the
+    extension intercepts it (the OS-level dropdown does not render into
+    screenshots) and marks the element. This command operates on that
+    pending mark — no element_id required.
+
+    `values` is matched against options in this order: exact `value`
+    attribute → exact visible label → case-insensitive substring of label.
+    Pass a list with multiple entries for `<select multiple>`.
+    """
+
+    type: Literal["select_option"] = "select_option"  # key in parse_command's map
+    values: List[str] = Field(
+        description=(
+            "Option(s) to select on the most recently clicked native `<select>`. "
+            "Pass a single-entry list for a normal dropdown, or multiple entries "
+            "for `<select multiple>`."
+        ),
+        min_length=1,  # at least one entry is required
+        max_length=50,  # same cap as SelectOptionAction.values
+    )
+
+
+class UploadFilePendingCommand(BaseCommand):
+    """Attach file(s) to the `<input type=file>` previously focused by `mouse_click`.
+
+    Bypasses the native OS file picker via CDP `DOM.setFileInputFiles`.
+    Paths must be absolute and exist on the host running Chrome (same
+    machine as the server in the v1 setup).
+    """
+
+    type: Literal["upload_file_pending"] = "upload_file_pending"  # parse_command key
+    paths: List[str] = Field(
+        description=(
+            "Absolute file paths to attach to the most recently clicked file "
+            "input. Single-entry list for a normal upload, multiple entries "
+            "for `<input type=file multiple>`."
+        ),
+        min_length=1,  # at least one path is required
+        max_length=20,  # same cap as UploadFileAction.paths
+    )
+
+
 class ScreenshotCommand(BaseCommand):
     """Capture screenshot"""
 
@@ -652,6 +697,8 @@ class TabsResponse(CommandResponse):
     ResetMouseCommand,
     KeyboardTypeCommand,
     KeyboardPressCommand,
+    SelectOptionCommand,
+    UploadFilePendingCommand,
     ScreenshotCommand,
     TabCommand,
     GetTabsCommand,
@@ -690,6 +737,8 @@ def parse_command(data: dict) -> Command:
         "reset_mouse": ResetMouseCommand,
         "keyboard_type": KeyboardTypeCommand,
         "keyboard_press": KeyboardPressCommand,
+        "select_option": SelectOptionCommand,
+        "upload_file_pending": UploadFilePendingCommand,
         "screenshot": ScreenshotCommand,
         "tab": TabCommand,
         "get_tabs": GetTabsCommand,

From fb9d299084cee8b0b5f2634764c49e925f8087bf Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 29 Apr 2026 21:57:40 +0800
Subject: [PATCH 05/14] chore(deps): bump agent-sdk pin to 962e9336
 (select_option + upload_file prompts)

Picks up the system-prompt updates that introduce the two new tools and
the NATIVE_FORM_CONTROLS guidance for native <select> / file pickers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 4 ++--
 uv.lock        | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7789f92..d13ca1e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,5 +76,5 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "37227545d9d371423757ce47cddedd9521cc62d5" }
-openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "37227545d9d371423757ce47cddedd9521cc62d5" }
+openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "962e93361757dcab26f7acab6b5756496e4ee4e8" }
+openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "962e93361757dcab26f7acab6b5756496e4ee4e8" }
diff --git a/uv.lock b/uv.lock
index 43f2cfc..3b242f9 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1678,8 +1678,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=37227545d9d371423757ce47cddedd9521cc62d5" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=37227545d9d371423757ce47cddedd9521cc62d5" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=962e93361757dcab26f7acab6b5756496e4ee4e8" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=962e93361757dcab26f7acab6b5756496e4ee4e8" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2224,7 +2224,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=37227545d9d371423757ce47cddedd9521cc62d5#37227545d9d371423757ce47cddedd9521cc62d5" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=962e93361757dcab26f7acab6b5756496e4ee4e8#962e93361757dcab26f7acab6b5756496e4ee4e8" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2244,7 +2244,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=37227545d9d371423757ce47cddedd9521cc62d5#37227545d9d371423757ce47cddedd9521cc62d5" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=962e93361757dcab26f7acab6b5756496e4ee4e8#962e93361757dcab26f7acab6b5756496e4ee4e8" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 2175a5348146b5ba0205e74f79401ce40a4b7956 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 6 May 2026 13:42:42 +0800
Subject: [PATCH 06/14] feat(agent): pixel-click density gate with
 descriptor-rich nearby candidates

Before dispatching a pixel-paradigm `mouse(click)` or `mouse(drag)`, the
executor probes the click point for the hit element and any interactables
within a 30-CSS-px edge-distance window. When two or more interactables
sit near the click, the action enters a confirmation stage: the extension
renders a zoomed crop with a YELLOW box on the hit control plus orange
dashed outlines on nearby candidates within 140 CSS px center-distance.
The agent commits via `mouse(action="confirm")` or re-aims using the
candidate centers (Qwen [0,1000] space) listed in the message.

Implementation:
- New extension command `analyze_pixel_targets` reuses the highlight
  detection engine to compute hit + neighborhood + verdict.
- New extension command `render_pixel_confirm` produces the zoomed crop
  with bright orange dashed outlines on neighbors.
- `BrowserExecutor` tracks the virtual cursor on the server side so the
  in-place pixel `click` knows where to gate. `mouse_click_pixel` /
  `mouse_drag_pixel` pendings reuse the existing 2PC machinery.
- Non-confirm actions taken while a pixel pending is set count as an
  implicit rejection: the rejected candidates are appended to the new
  observation's message so course correction has them fresh.
- Candidate rendering reuses `_format_highlighted_element_lines` so the
  agent reads the same descriptor-first phrasing the element paradigm
  produced; bare `<tag>` fallback surfaces a truncated HTML snippet.
- `mouse_move` / `reset_mouse` post-action capture waits 150 ms so the
  cursor sprite's CSS transition completes before the screenshot.
- Tool prompts (`mouse_tool.j2`) describe the confirmation in the
  affirmative voice; system prompt update ships via the bumped
  agent-sdk pin.

Bumps agent-sdk pin to 2ea1956a so the system prompt teaches the agent
about the pixel-click preview shape.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/background/index.ts             |  99 ++-
 .../src/commands/pixel-confirm-render.ts      | 395 +++++++++++
 .../src/commands/pixel-target-analyzer.ts     | 225 ++++++
 extension/src/types.ts                        |  30 +
 pyproject.toml                                |   4 +-
 server/agent/prompts/big_model/mouse_tool.j2  |  25 +
 .../agent/prompts/small_model/mouse_tool.j2   |  22 +
 server/agent/tools/browser_executor.py        | 662 +++++++++++++++++-
 server/agent/tools/mouse_tool.py              |   8 +-
 server/core/processor.py                      |  18 +
 server/models/commands.py                     |  78 +++
 uv.lock                                       |   8 +-
 12 files changed, 1543 insertions(+), 31 deletions(-)
 create mode 100644 extension/src/commands/pixel-confirm-render.ts
 create mode 100644 extension/src/commands/pixel-target-analyzer.ts

diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 80261dd..0f2ebe3 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -42,6 +42,8 @@ import {
   getConfirmationPromptText,
 } from '../commands/single-highlight';
 import { highlightDropPreview } from '../commands/drop-preview-highlight';
+import { analyzePixelTargets } from '../commands/pixel-target-analyzer';
+import { renderPixelConfirm } from '../commands/pixel-confirm-render';
 import { elementCache } from '../commands/element-cache';
 import { assignHashedElementIds } from '../commands/element-id';
 import { buildElementCacheMissMessage } from '../commands/element-cache';
@@ -1782,6 +1784,82 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         };
       }
 
+      // ============== Pixel-target analysis & confirmation render ==============
+      // Used by the server to gate dense pixel clicks: analyze probes the live
+      // viewport at (x,y) for nearby interactables, render produces a zoomed
+      // confirmation crop showing what was selected.
+      case 'analyze_pixel_targets': {
+        if (!command.conversation_id) {
+          throw new Error(
+            'conversation_id is required for analyze_pixel_targets command (strict mode)',
+          );
+        }
+        const conversationId = command.conversation_id;
+        const activeTabId = tabManager.getCurrentActiveTabId(conversationId);
+        if (!activeTabId) {
+          throw new Error(
+            `No active tab found for conversation ${conversationId}. Use tab init first.`,
+          );
+        }
+        await tabManager.ensureTabManaged(activeTabId, conversationId);
+        tabManager.updateTabActivity(activeTabId, conversationId);
+
+        const analysis = await analyzePixelTargets(
+          activeTabId,
+          conversationId,
+          command.x,
+          command.y,
+          typeof command.radius === 'number' ? command.radius : 30,
+          typeof command.candidate_limit === 'number'
+            ? command.candidate_limit
+            : 5,
+        );
+
+        return {
+          success: true,
+          message: `analyze_pixel_targets verdict=${analysis.verdict}`,
+          data: analysis,
+          timestamp: Date.now(),
+        };
+      }
+
+      case 'render_pixel_confirm': {
+        if (!command.conversation_id) {
+          throw new Error(
+            'conversation_id is required for render_pixel_confirm command (strict mode)',
+          );
+        }
+        const conversationId = command.conversation_id;
+        const activeTabId = tabManager.getCurrentActiveTabId(conversationId);
+        if (!activeTabId) {
+          throw new Error(
+            `No active tab found for conversation ${conversationId}. Use tab init first.`,
+          );
+        }
+        await tabManager.ensureTabManaged(activeTabId, conversationId);
+        tabManager.updateTabActivity(activeTabId, conversationId);
+
+        const rendered = await renderPixelConfirm(
+          activeTabId,
+          conversationId,
+          {
+            mode: command.mode,
+            x: command.x,
+            y: command.y,
+            target_bbox: command.target_bbox,
+            candidate_bboxes: command.candidate_bboxes,
+            drag_end: command.drag_end,
+          },
+        );
+
+        return {
+          success: true,
+          message: `render_pixel_confirm mode=${command.mode}`,
+          data: rendered,
+          timestamp: Date.now(),
+        };
+      }
+
       // ============== Pixel-level mouse / keyboard ==============
       // The live agent uses these instead of the highlight + element-id flow.
       // The server has already denormalized Qwen [0,1000] coords to CSS px.
@@ -1924,16 +2002,27 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         // For actions that can navigate or trigger heavy re-render
         // (`mouse_click`, `mouse_drag`, `keyboard_press` Enter), give the
         // browser a brief settle window so the captured frame reflects the
-        // new state instead of a transitional DOM. Lighter actions
-        // (mouse_move, mouse_scroll, keyboard_type, reset_mouse) take 0.
-        const settleMs =
+        // new state instead of a transitional DOM. `mouse_move` waits for
+        // the cursor sprite's CSS transition (120 ms — see
+        // `virtual-cursor.ts` `transition:transform 120ms`) to finish, so
+        // the screenshot shows the cursor at its destination rather than
+        // somewhere mid-glide. `reset_mouse` jumps to viewport center via
+        // the same sprite path and needs the same wait.
+        let settleMs = 0;
+        if (
           command.type === 'mouse_click' ||
           command.type === 'mouse_drag' ||
           command.type === 'keyboard_press' ||
           command.type === 'select_option' ||
           command.type === 'upload_file_pending'
-            ? 350
-            : 0;
+        ) {
+          settleMs = 350;
+        } else if (
+          command.type === 'mouse_move' ||
+          command.type === 'reset_mouse'
+        ) {
+          settleMs = 150;
+        }
         const cursorAfter =
           getCursorPosition(activeTabId) ??
           (await resolveCursorOrCenter(activeTabId, conversationId));
diff --git a/extension/src/commands/pixel-confirm-render.ts b/extension/src/commands/pixel-confirm-render.ts
new file mode 100644
index 0000000..935f007
--- /dev/null
+++ b/extension/src/commands/pixel-confirm-render.ts
@@ -0,0 +1,395 @@
+/**
+ * Pixel-Confirmation Render Module
+ *
+ * Produces a zoomed confirmation screenshot for a pending pixel mouse action
+ * (click or drag). Two visual modes:
+ *
+ *   - 'pixel_hit'  → YELLOW box around the hit element + zoom-crop centered on it.
+ *   - 'pixel_miss' → nothing drawn at the click coord + zoom-crop centered on
+ *                    the click. Candidate outlines are drawn in both modes.
+ *
+ * Both modes capture a fresh viewport screenshot (no virtual cursor — we draw
+ * our own box / outlines so the cursor sprite would be redundant) and return
+ * a base64 PNG data URL keyed under `screenshot_data_url` to match the shape
+ * used by other 2PC previews.
+ */
+
+import { captureScreenshot, compressIfNeeded } from './screenshot';
+
+const HIT_BORDER_COLOR = '#FFD400';
+const HIT_GLOW_COLOR = 'rgba(255, 212, 0, 0.7)';
+const HIT_LINE_WIDTH = 4;
+const HIT_BOX_PADDING = 2;
+
+const CANDIDATE_BORDER_COLOR = '#FF6B00';
+const CANDIDATE_GLOW_COLOR = 'rgba(255, 107, 0, 0.55)';
+const CANDIDATE_LINE_WIDTH = 3;
+
+const DRAG_LINE_COLOR = 'rgba(255, 212, 0, 0.85)';
+const DRAG_LINE_WIDTH = 3;
+const DRAG_ARROW_HEAD = 14;
+
+const BASE_CONTEXT_PADDING_X = 96;
+const BASE_CONTEXT_PADDING_Y = 112;
+const BASE_MIN_CROP_WIDTH = 520;
+const BASE_MIN_CROP_HEIGHT = 320;
+const MIN_CROP_RATIO = 0.58;
+
+interface BBox {
+  x: number;
+  y: number;
+  width: number;
+  height: number;
+}
+
+interface PointXY {
+  x: number;
+  y: number;
+}
+
+export interface PixelConfirmRenderRequest {
+  mode: 'pixel_hit' | 'pixel_miss';
+  x: number; // CSS px
+  y: number; // CSS px
+  target_bbox?: BBox; // CSS px (required for pixel_hit)
+  candidate_bboxes?: BBox[]; // CSS px
+  drag_end?: PointXY; // CSS px (optional second point for drag previews)
+}
+
+export interface PixelConfirmRenderResult {
+  screenshot_data_url: string;
+  viewport: { width: number; height: number };
+  scale: number;
+  crop: BBox;
+}
+
+function clamp(value: number, min: number, max: number): number {
+  return Math.max(min, Math.min(max, value));
+}
+
+function expandBbox(b: BBox, padding: number): BBox {
+  return {
+    x: b.x - padding,
+    y: b.y - padding,
+    width: b.width + padding * 2,
+    height: b.height + padding * 2,
+  };
+}
+
+function unionBbox(boxes: BBox[]): BBox {
+  if (boxes.length === 0) return { x: 0, y: 0, width: 0, height: 0 };
+  let x1 = Infinity;
+  let y1 = Infinity;
+  let x2 = -Infinity;
+  let y2 = -Infinity;
+  for (const b of boxes) {
+    x1 = Math.min(x1, b.x);
+    y1 = Math.min(y1, b.y);
+    x2 = Math.max(x2, b.x + b.width);
+    y2 = Math.max(y2, b.y + b.height);
+  }
+  return { x: x1, y: y1, width: x2 - x1, height: y2 - y1 };
+}
+
+function chooseCropCenter(
+  request: PixelConfirmRenderRequest,
+): { center: PointXY; focusBbox: BBox } {
+  if (request.mode === 'pixel_hit' && request.target_bbox) {
+    const focus = request.drag_end
+      ? unionBbox([
+          request.target_bbox,
+          {
+            x: request.drag_end.x,
+            y: request.drag_end.y,
+            width: 1,
+            height: 1,
+          },
+        ])
+      : request.target_bbox;
+    return {
+      center: {
+        x: focus.x + focus.width / 2,
+        y: focus.y + focus.height / 2,
+      },
+      focusBbox: focus,
+    };
+  }
+  // pixel_miss or hit without bbox → center on the click point.
+  const focus: BBox = request.drag_end
+    ? unionBbox([
+        { x: request.x, y: request.y, width: 1, height: 1 },
+        {
+          x: request.drag_end.x,
+          y: request.drag_end.y,
+          width: 1,
+          height: 1,
+        },
+      ])
+    : { x: request.x - 1, y: request.y - 1, width: 2, height: 2 };
+  return {
+    center: {
+      x: focus.x + focus.width / 2,
+      y: focus.y + focus.height / 2,
+    },
+    focusBbox: focus,
+  };
+}
+
+function calculateCrop(
+  imageWidth: number,
+  imageHeight: number,
+  scale: number,
+  request: PixelConfirmRenderRequest,
+): BBox {
+  const { focusBbox } = chooseCropCenter(request);
+
+  const focusDevice = {
+    x: focusBbox.x * scale,
+    y: focusBbox.y * scale,
+    width: Math.max(1, focusBbox.width * scale),
+    height: Math.max(1, focusBbox.height * scale),
+  };
+
+  const contextX = BASE_CONTEXT_PADDING_X * scale;
+  const contextY = BASE_CONTEXT_PADDING_Y * scale;
+  const minCropW = Math.min(
+    imageWidth,
+    Math.max(BASE_MIN_CROP_WIDTH * scale, imageWidth * MIN_CROP_RATIO),
+  );
+  const minCropH = Math.min(
+    imageHeight,
+    Math.max(BASE_MIN_CROP_HEIGHT * scale, imageHeight * MIN_CROP_RATIO),
+  );
+
+  const desiredW = Math.max(minCropW, focusDevice.width + contextX * 2);
+  const desiredH = Math.max(minCropH, focusDevice.height + contextY * 2);
+
+  const cropW = Math.min(imageWidth, Math.round(desiredW));
+  const cropH = Math.min(imageHeight, Math.round(desiredH));
+
+  const centerX = focusDevice.x + focusDevice.width / 2;
+  const centerY = focusDevice.y + focusDevice.height / 2;
+
+  const cropX = clamp(
+    Math.round(centerX - cropW / 2),
+    0,
+    Math.max(0, imageWidth - cropW),
+  );
+  const cropY = clamp(
+    Math.round(centerY - cropH / 2),
+    0,
+    Math.max(0, imageHeight - cropH),
+  );
+
+  return { x: cropX, y: cropY, width: cropW, height: cropH };
+}
+
+function drawCandidateOutline(
+  ctx: OffscreenCanvasRenderingContext2D,
+  rect: BBox,
+  scale: number,
+): void {
+  ctx.save();
+  ctx.strokeStyle = CANDIDATE_BORDER_COLOR;
+  ctx.lineWidth = CANDIDATE_LINE_WIDTH * scale;
+  ctx.shadowColor = CANDIDATE_GLOW_COLOR;
+  ctx.shadowBlur = 8 * scale;
+  ctx.setLineDash([6 * scale, 4 * scale]);
+  ctx.strokeRect(rect.x, rect.y, rect.width, rect.height);
+  ctx.restore();
+}
+
+function drawHitBox(
+  ctx: OffscreenCanvasRenderingContext2D,
+  rect: BBox,
+  scale: number,
+): void {
+  ctx.save();
+  ctx.strokeStyle = HIT_BORDER_COLOR;
+  ctx.lineWidth = HIT_LINE_WIDTH * scale;
+  ctx.shadowColor = HIT_GLOW_COLOR;
+  ctx.shadowBlur = 12 * scale;
+  ctx.strokeRect(rect.x, rect.y, rect.width, rect.height);
+  ctx.restore();
+}
+
+function drawDragArrow(
+  ctx: OffscreenCanvasRenderingContext2D,
+  start: PointXY,
+  end: PointXY,
+  scale: number,
+): void {
+  const dx = end.x - start.x;
+  const dy = end.y - start.y;
+  const len = Math.hypot(dx, dy);
+  if (len < 1) return;
+  const ux = dx / len;
+  const uy = dy / len;
+  const head = DRAG_ARROW_HEAD * scale;
+
+  ctx.save();
+  ctx.strokeStyle = DRAG_LINE_COLOR;
+  ctx.fillStyle = DRAG_LINE_COLOR;
+  ctx.lineWidth = DRAG_LINE_WIDTH * scale;
+  ctx.lineCap = 'round';
+
+  ctx.beginPath();
+  ctx.moveTo(start.x, start.y);
+  ctx.lineTo(end.x, end.y);
+  ctx.stroke();
+
+  // Arrowhead.
+  const baseX = end.x - ux * head;
+  const baseY = end.y - uy * head;
+  const perpX = -uy;
+  const perpY = ux;
+  ctx.beginPath();
+  ctx.moveTo(end.x, end.y);
+  ctx.lineTo(baseX + perpX * head * 0.5, baseY + perpY * head * 0.5);
+  ctx.lineTo(baseX - perpX * head * 0.5, baseY - perpY * head * 0.5);
+  ctx.closePath();
+  ctx.fill();
+  ctx.restore();
+}
+
+export async function renderPixelConfirm(
+  tabId: number,
+  conversationId: string,
+  request: PixelConfirmRenderRequest,
+): Promise<PixelConfirmRenderResult> {
+  if (typeof OffscreenCanvas === 'undefined') {
+    throw new Error('[PixelConfirmRender] OffscreenCanvas is not available');
+  }
+  if (typeof createImageBitmap === 'undefined') {
+    throw new Error('[PixelConfirmRender] createImageBitmap is not available');
+  }
+
+  // Capture a clean shot — no cursor (we draw our own box / outlines).
+  const shot = await captureScreenshot(
+    tabId,
+    conversationId,
+    /* includeCursor */ false,
+    /* quality */ 90,
+    /* resizeToPreset */ false,
+    /* waitForRender */ 0,
+    /* options */ undefined,
+    /* preCaptureScript */ undefined,
+  );
+  const screenshotDataUrl: string | undefined = shot?.imageData;
+  if (!screenshotDataUrl || !screenshotDataUrl.startsWith('data:')) {
+    throw new Error(
+      '[PixelConfirmRender] captureScreenshot returned no data URL',
+    );
+  }
+  const viewportWidth: number =
+    typeof shot?.metadata?.viewportWidth === 'number'
+      ? shot.metadata.viewportWidth
+      : 0;
+  const viewportHeight: number =
+    typeof shot?.metadata?.viewportHeight === 'number'
+      ? shot.metadata.viewportHeight
+      : 0;
+
+  const [, base64] = screenshotDataUrl.split(',');
+  const header = screenshotDataUrl.slice(
+    0,
+    screenshotDataUrl.indexOf(','),
+  );
+  const mimeType = header.substring(header.indexOf(':') + 1, header.indexOf(';'));
+  const binary = atob(base64);
+  const bytes = new Uint8Array(binary.length);
+  for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
+  const bitmap = await createImageBitmap(
+    new Blob([bytes], { type: mimeType }),
+  );
+
+  const actualScaleX =
+    viewportWidth > 0 ? bitmap.width / viewportWidth : 1;
+  const actualScaleY =
+    viewportHeight > 0 ? bitmap.height / viewportHeight : 1;
+  const scale = (actualScaleX + actualScaleY) / 2 || 1;
+
+  const crop = calculateCrop(bitmap.width, bitmap.height, scale, request);
+
+  const canvas = new OffscreenCanvas(crop.width, crop.height);
+  const ctx = canvas.getContext('2d');
+  if (!ctx) {
+    bitmap.close();
+    throw new Error('[PixelConfirmRender] Failed to acquire 2d context');
+  }
+
+  ctx.drawImage(
+    bitmap,
+    crop.x,
+    crop.y,
+    crop.width,
+    crop.height,
+    0,
+    0,
+    crop.width,
+    crop.height,
+  );
+  bitmap.close();
+
+  const toDeviceRect = (b: BBox): BBox => ({
+    x: Math.round(b.x * scale - crop.x),
+    y: Math.round(b.y * scale - crop.y),
+    width: Math.max(1, Math.round(b.width * scale)),
+    height: Math.max(1, Math.round(b.height * scale)),
+  });
+
+  const toDevicePoint = (p: PointXY): PointXY => ({
+    x: Math.round(p.x * scale - crop.x),
+    y: Math.round(p.y * scale - crop.y),
+  });
+
+  // Candidate outlines first (so the hit box / drag arrow sit on top).
+  if (request.candidate_bboxes && request.candidate_bboxes.length > 0) {
+    for (const bbox of request.candidate_bboxes) {
+      drawCandidateOutline(ctx, toDeviceRect(bbox), scale);
+    }
+  }
+
+  // Hit case: draw a yellow box around the element the click would
+  // commit to. Miss case: draw nothing for the click point — the orange
+  // candidate outlines plus the message body tell the agent everything
+  // it needs to re-aim, without a distracting crosshair on top of the
+  // page content.
+  if (request.mode === 'pixel_hit' && request.target_bbox) {
+    const padded = expandBbox(request.target_bbox, HIT_BOX_PADDING / scale);
+    drawHitBox(ctx, toDeviceRect(padded), scale);
+  }
+
+  if (request.drag_end) {
+    const start = toDevicePoint({ x: request.x, y: request.y });
+    const end = toDevicePoint(request.drag_end);
+    drawDragArrow(ctx, start, end, scale);
+  }
+
+  const blob = await canvas.convertToBlob({ type: 'image/png' });
+  const dataUrl = await new Promise<string>((resolve, reject) => {
+    const reader = new FileReader();
+    reader.onloadend = () => resolve(reader.result as string);
+    reader.onerror = () =>
+      reject(new Error('[PixelConfirmRender] Failed to read result blob'));
+    reader.readAsDataURL(blob);
+  });
+
+  const compressedRaw = await compressIfNeeded(dataUrl).catch(() => dataUrl);
+  const compressed =
+    typeof compressedRaw === 'string'
+      ? compressedRaw
+      : (compressedRaw &&
+          typeof compressedRaw === 'object' &&
+          'imageData' in compressedRaw &&
+          typeof compressedRaw.imageData === 'string'
+          ? compressedRaw.imageData
+          : dataUrl);
+
+  return {
+    screenshot_data_url: compressed,
+    viewport: { width: viewportWidth, height: viewportHeight },
+    scale,
+    crop,
+  };
+}
diff --git a/extension/src/commands/pixel-target-analyzer.ts b/extension/src/commands/pixel-target-analyzer.ts
new file mode 100644
index 0000000..8ed1dda
--- /dev/null
+++ b/extension/src/commands/pixel-target-analyzer.ts
@@ -0,0 +1,225 @@
+import { buildHighlightDetectionScript } from './highlight-detection';
+import { executeJavaScript } from './javascript';
+import type { InteractiveElement } from '../types';
+
+export interface PixelTargetCandidate {
+  selector: string;
+  tagName: string;
+  type: string;
+  interactionHints?: string[];
+  text?: string;
+  searchText?: string;
+  html?: string;
+  bbox: { x: number; y: number; width: number; height: number };
+  center: { x: number; y: number };
+  distance: number;
+  fingerprint?: string;
+  descriptor?: unknown;
+}
+
+export interface PixelTargetAnalysis {
+  viewport: { width: number; height: number };
+  hit: PixelTargetCandidate | null;
+  neighborhood: PixelTargetCandidate[];
+  verdict: 'sparse' | 'dense';
+  documentId?: string;
+}
+
+const DETECTION_TIMEOUT_MS = 12000;
+const MIN_CANDIDATE_DIM = 6;
+
+/**
+ * Min Euclidean distance from `(x, y)` to the closest point on `bbox`.
+ * Returns 0 when `(x, y)` is inside the rectangle. Used for the verdict
+ * trigger ("are the interactables overlapping the click area?").
+ */
+function distanceToBbox(
+  x: number,
+  y: number,
+  bbox: { x: number; y: number; width: number; height: number },
+): number {
+  const left = bbox.x;
+  const top = bbox.y;
+  const right = bbox.x + bbox.width;
+  const bottom = bbox.y + bbox.height;
+  const dx = x < left ? left - x : x > right ? x - right : 0;
+  const dy = y < top ? top - y : y > bottom ? y - bottom : 0;
+  return Math.hypot(dx, dy);
+}
+
+/**
+ * Euclidean distance from `(x, y)` to the bbox center. Used to rank /
+ * filter candidates for the agent's display: a wrapper container whose
+ * edge happens to be 5 px away but whose center is hundreds of pixels
+ * away is not visually adjacent to the click, so it shouldn't surface
+ * as a "nearby candidate" alternative.
+ */
+function distanceToCenter(
+  x: number,
+  y: number,
+  bbox: { x: number; y: number; width: number; height: number },
+): number {
+  const cx = bbox.x + bbox.width / 2;
+  const cy = bbox.y + bbox.height / 2;
+  return Math.hypot(x - cx, y - cy);
+}
+
+function bboxArea(bbox: { width: number; height: number }): number {
+  return Math.max(0, bbox.width) * Math.max(0, bbox.height);
+}
+
+function toCandidate(
+  el: InteractiveElement,
+  px: number,
+  py: number,
+): PixelTargetCandidate {
+  const center = {
+    x: Math.round(el.bbox.x + el.bbox.width / 2),
+    y: Math.round(el.bbox.y + el.bbox.height / 2),
+  };
+  return {
+    selector: el.selector,
+    tagName: el.tagName,
+    type: el.type,
+    interactionHints: el.interactionHints,
+    text: el.text,
+    searchText: el.searchText,
+    html: el.html,
+    bbox: {
+      x: Math.round(el.bbox.x),
+      y: Math.round(el.bbox.y),
+      width: Math.round(el.bbox.width),
+      height: Math.round(el.bbox.height),
+    },
+    center,
+    distance: Math.round(distanceToCenter(px, py, el.bbox)),
+    fingerprint: el.fingerprint,
+    descriptor: el.descriptor,
+  };
+}
+
+export async function analyzePixelTargets(
+  tabId: number,
+  conversationId: string,
+  x: number,
+  y: number,
+  radius: number,
+  candidateLimit: number,
+): Promise<PixelTargetAnalysis> {
+  const detectionScript = buildHighlightDetectionScript({
+    elementType: 'any',
+  });
+
+  const detection = await executeJavaScript(
+    tabId,
+    conversationId,
+    detectionScript,
+    true,
+    true,
+    DETECTION_TIMEOUT_MS,
+  );
+
+  if (!detection.success || !detection.result?.value) {
+    throw new Error(
+      detection.error || 'analyze_pixel_targets: failed to detect elements',
+    );
+  }
+
+  const value = detection.result.value as {
+    elements?: InteractiveElement[];
+    viewport?: { width?: number; height?: number };
+    documentId?: string;
+  };
+
+  const viewportWidth =
+    typeof value.viewport?.width === 'number' ? value.viewport!.width : 0;
+  const viewportHeight =
+    typeof value.viewport?.height === 'number' ? value.viewport!.height : 0;
+  const documentId =
+    typeof value.documentId === 'string' ? value.documentId : undefined;
+
+  const all = (value.elements || []).filter(
+    (el) =>
+      el &&
+      el.bbox &&
+      el.isVisible &&
+      el.isInViewport &&
+      (el.bbox.width >= MIN_CANDIDATE_DIM ||
+        el.bbox.height >= MIN_CANDIDATE_DIM),
+  );
+
+  // Hit detection: smallest-area bbox containing the point (first wins ties).
+  let hit: InteractiveElement | null = null;
+  let hitArea = Number.POSITIVE_INFINITY;
+  for (const el of all) {
+    const inside =
+      x >= el.bbox.x &&
+      x <= el.bbox.x + el.bbox.width &&
+      y >= el.bbox.y &&
+      y <= el.bbox.y + el.bbox.height;
+    if (!inside) continue;
+    const area = bboxArea(el.bbox);
+    if (area < hitArea) {
+      hit = el;
+      hitArea = area;
+    }
+  }
+
+  // Neighborhood: only elements whose bbox sits NEAR but not AROUND the
+  // click. A wrapper `<div>` whose bbox engulfs the click would otherwise
+  // report distance=0 (point inside) and slide into the list even when its
+  // center is hundreds of pixels away — useless guidance for course
+  // correction. The hit element (smallest containing) is reported on its
+  // own; everything else must be a true outside-but-close neighbor.
+  const isOutside = (b: { x: number; y: number; width: number; height: number }) =>
+    x < b.x || x > b.x + b.width || y < b.y || y > b.y + b.height;
+
+  // Two distinct distance metrics, each with its own threshold:
+  //
+  //   - Verdict (edge distance, threshold `radius` ≈ 30 px): "is the
+  //     click ambiguous?". Edge distance captures whether two
+  //     interactables visually overlap the click area, which is exactly
+  //     the case where the agent could have meant either. Center
+  //     distance would miss this (two adjacent buttons can have centers
+  //     80 px apart while their edges sit a few px from the click).
+  //
+  //   - Display (center distance, threshold `displayCenterRadius`
+  //     ≈ 140 px): "what to surface as alternatives if we gate?". Center
+  //     distance naturally drops bulky wrapper containers — their bbox
+  //     edge may extend close to the click, but their center sits far
+  //     away — while keeping genuinely adjacent toolbar controls.
+  const displayCenterRadius = 140;
+
+  const outsideNeighbors = all
+    .filter((el) => isOutside(el.bbox))
+    .map((el) => ({
+      el,
+      edgeDistance: distanceToBbox(x, y, el.bbox),
+      centerDistance: distanceToCenter(x, y, el.bbox),
+    }))
+    .filter((row) => row.centerDistance <= displayCenterRadius)
+    .sort((a, b) => a.centerDistance - b.centerDistance);
+
+  const hitSelector = hit?.selector;
+  const siblingCandidates = outsideNeighbors
+    .filter((row) => !hitSelector || row.el.selector !== hitSelector)
+    .slice(0, candidateLimit)
+    .map((row) => toCandidate(row.el, x, y));
+
+  // Density verdict uses edge distance against the strict `radius`, applied
+  // to `outsideNeighbors` (already capped at `displayCenterRadius`), so an
+  // unrelated control sitting 70 px away does not trip the gate by itself.
+  const closeByEdge = outsideNeighbors.filter(
+    (row) => row.edgeDistance <= radius,
+  );
+  const totalNearby = closeByEdge.length + (hit ? 1 : 0);
+  const verdict: 'sparse' | 'dense' = totalNearby >= 2 ? 'dense' : 'sparse';
+
+  return {
+    viewport: { width: viewportWidth, height: viewportHeight },
+    hit: hit ? toCandidate(hit, x, y) : null,
+    neighborhood: siblingCandidates,
+    verdict,
+    documentId,
+  };
+}
diff --git a/extension/src/types.ts b/extension/src/types.ts
index 5cc2118..115db71 100644
--- a/extension/src/types.ts
+++ b/extension/src/types.ts
@@ -299,6 +299,34 @@ export interface HighlightDropPreviewCommand extends BaseCommand {
   tab_id?: number;
 }
 
+export interface AnalyzePixelTargetsCommand extends BaseCommand {
+  type: 'analyze_pixel_targets';
+  /** Click X in CSS pixels (viewport coord). */
+  x: number;
+  /** Click Y in CSS pixels (viewport coord). */
+  y: number;
+  /** Max edge distance (CSS px) counted toward the dense verdict. Default 30. */
+  radius?: number;
+  /** Max number of nearby candidates to return. Default 5. */
+  candidate_limit?: number;
+}
+
+export interface RenderPixelConfirmCommand extends BaseCommand {
+  type: 'render_pixel_confirm';
+  /** Visual mode for the preview crop. */
+  mode: 'pixel_hit' | 'pixel_miss';
+  /** Click X in CSS pixels. */
+  x: number;
+  /** Click Y in CSS pixels. */
+  y: number;
+  /** Bbox of the hit element in CSS pixels (required for pixel_hit). */
+  target_bbox?: { x: number; y: number; width: number; height: number };
+  /** Bboxes for nearby candidate outlines (drawn in both modes). */
+  candidate_bboxes?: { x: number; y: number; width: number; height: number }[];
+  /** Optional drag end-point in CSS pixels (renders an arrow). */
+  drag_end?: { x: number; y: number };
+}
+
 export interface RecordingControlCommand extends BaseCommand {
   type: 'recording_control';
   action: RecordingControlAction;
@@ -352,6 +380,8 @@ export type Command =
   | GetElementHtmlCommand
   | HighlightSingleElementCommand
   | HighlightDropPreviewCommand
+  | AnalyzePixelTargetsCommand
+  | RenderPixelConfirmCommand
   | RecordingControlCommand;
 
 export interface CommandResponse {
diff --git a/pyproject.toml b/pyproject.toml
index d13ca1e..490bf42 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,5 +76,5 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "962e93361757dcab26f7acab6b5756496e4ee4e8" }
-openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "962e93361757dcab26f7acab6b5756496e4ee4e8" }
+openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
+openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2
index 4f17ee6..fb08c31 100644
--- a/server/agent/prompts/big_model/mouse_tool.j2
+++ b/server/agent/prompts/big_model/mouse_tool.j2
@@ -49,6 +49,15 @@ Press at `(x, y)`, drag to `(x2, y2)`, release. One call.
 
 Use for sliders, kanban moves, marquee selection, drag-and-drop. `steps` (optional, default 10) controls the smoothness for DnD libraries that need many intermediate move events.
 
+### confirm
+Commit a pending click or drag that was previewed in the previous response.
+
+```json
+{ "action": "confirm" }
+```
+
+Only valid right after a preview-style observation (zoomed crop with a yellow box or red crosshair). See **Confirmation previews** below.
+
 ### scroll
 Scroll at the cursor's current position by `amount` CSS pixels in `direction`.
 
@@ -66,6 +75,22 @@ Return the cursor to the viewport center.
 { "action": "reset" }
 ```
 
+## Confirmation previews
+
+When `click` or `drag` lands in an area with several interactable controls close together, the next observation is a zoomed crop showing exactly what your coordinate selected. Two cases:
+
+- **Yellow box** around an element: the click sits on it. Reply with `{"action": "confirm"}` to commit, or re-emit `move` + `click` (or `drag`) aimed at a corrected coordinate.
+- **No yellow box, only orange outlines**: the click sits between controls. Re-emit aimed at one of the candidate centers listed in the message.
+
+Orange dashed outlines mark nearby candidates. The message lists each one with its HTML and **center coordinates in [0, 1000] space**. Pick the center matching your goal:
+
+```json
+{ "action": "move", "x": 612, "y": 318 }
+{ "action": "click" }
+```
+
+For a drag preview, the same rules apply at each endpoint. `confirm` commits the drag as previewed; otherwise re-emit `drag` with corrected `x, y, x2, y2`.
+
 ## Patterns
 
 - **Click a button**: `move` to the button → check the screenshot → `click`.
diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2
index c9cc716..461e04a 100644
--- a/server/agent/prompts/small_model/mouse_tool.j2
+++ b/server/agent/prompts/small_model/mouse_tool.j2
@@ -51,6 +51,28 @@ Return cursor to viewport center.
 { "action": "reset" }
 ```
 
+### confirm
+Commit a previewed click or drag.
+```json
+{ "action": "confirm" }
+```
+Only valid after a preview observation (zoomed crop with a yellow box or red crosshair). See **Confirmation previews** below.
+
+## Confirmation previews
+
+If `click` or `drag` falls in a crowded area, the next observation is a zoomed crop:
+
+- **Yellow box** = your action would commit on that element. Reply `{"action": "confirm"}` to commit, or re-emit `move` + `click` (or `drag`) at a corrected target.
+- **No yellow box, only orange dashed outlines** = the click landed between controls. Re-emit aimed at one of the listed candidate centers in [0, 1000] space.
+
+Orange dashed outlines mark nearby candidates in either case.
+
+Example after a miss:
+```json
+{ "action": "move", "x": 612, "y": 318 }
+{ "action": "click" }
+```
+
 ## Patterns
 
 - **Click a button**: `move` → check screenshot → `click`.
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 625089c..1131445 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -40,6 +40,8 @@
     SetSliderValueCommand,
     UploadFileCommand,
     HighlightDropPreviewCommand,
+    AnalyzePixelTargetsCommand,
+    RenderPixelConfirmCommand,
     MouseMoveCommand,
     MouseClickCommand,
     MouseDragCommand,
@@ -130,6 +132,13 @@ def __init__(self):
         # denormalize Qwen-VL [0,1000] coordinates before dispatching to the
         # extension. None = no screenshot yet — caller must take one first.
         self.last_viewport_by_conv: Dict[str, tuple[int, int]] = {}
+        # Most recent virtual-cursor position per conversation in CSS pixels.
+        # Tracked alongside extension state so the server can identify the
+        # implicit click point for the pixel `click` action (which fires in
+        # place at the cursor). Updated after `move`, `drag`, and `reset`;
+        # falls back to viewport center when unset, matching the extension's
+        # own first-action behavior.
+        self.last_cursor_by_conv: Dict[str, tuple[int, int]] = {}
 
     def _uses_small_model(self) -> bool:
         """Whether the active conversation uses the small-model profile."""
@@ -171,6 +180,34 @@ def _get_viewport(self) -> Optional[tuple[int, int]]:
             return None
         return self.last_viewport_by_conv.get(str(self.conversation_id))
 
+    def _cache_cursor(self, x_css: int, y_css: int) -> None:
+        """Remember where the virtual cursor now sits (CSS px) for this conv."""
+        conv_id = self.conversation_id
+        if conv_id:
+            # Keyed by the stringified conversation id; both coordinates are
+            # coerced to int before being stored as an (x, y) tuple.
+            position = (int(x_css), int(y_css))
+            self.last_cursor_by_conv[str(conv_id)] = position
+
+    def _get_cursor_or_center(self) -> Optional[tuple[int, int]]:
+        """Best-known virtual-cursor position for the active conversation.
+
+        Intended to mirror the extension's `resolveCursorOrCenter`: an
+        in-place pixel `click` lands at the cached cursor, or at the viewport
+        center when no position has been cached yet. Yields None when there
+        is no conversation id, or no cached cursor and no cached viewport.
+        """
+        conv_id = self.conversation_id
+        if not conv_id:
+            return None
+        position = self.last_cursor_by_conv.get(str(conv_id))
+        if position is None:
+            size = self._get_viewport()
+            # Integer midpoint of the (width, height) pair, when known.
+            if size is not None:
+                position = (size[0] // 2, size[1] // 2)
+        return position
+
     def _is_qwen_model(self) -> bool:
         """Whether the active conversation uses a Qwen vision model.
 
@@ -289,7 +326,29 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
                 # Keep pending confirmation if this is a confirmation action
                 if action.action and action.action.startswith("confirm_"):
                     should_clear = False
+            elif isinstance(action, MouseAction):
+                # The pixel-action gate stores its pending confirmation in the
+                # same registry; preserve it when the agent commits via
+                # `mouse(action="confirm")`.
+                if action.action == "confirm":
+                    should_clear = False
+            # When the agent emits a non-confirm action while a pixel
+            # confirmation is pending, treat the action as an implicit
+            # rejection: stash the candidate list so the response to the
+            # new action can include them as structured guidance. The
+            # element-paradigm pendings are skipped here — they have their
+            # own LLM rendering path (`_pending_confirmation_llm_content`).
+            rejected_pixel_candidates: list = []
             if should_clear:
+                pending = self._get_pending_confirmation()
+                if (
+                    pending
+                    and pending.get("action_type")
+                    in ("mouse_click_pixel", "mouse_drag_pixel")
+                ):
+                    cands = (pending.get("extra_data") or {}).get("candidates")
+                    if isinstance(cands, list):
+                        rejected_pixel_candidates = cands
                 logger.debug(
                     f"DEBUG: Clearing pending confirmation before action {type(action).__name__}"
                 )
@@ -297,24 +356,55 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
 
             # Route based on action type
             if isinstance(action, TabAction):
-                return self._execute_tab_action(action)
+                obs = self._execute_tab_action(action)
             elif isinstance(action, BaseHighlightAction):
-                return self._execute_highlight_action(action)
+                obs = self._execute_highlight_action(action)
             elif isinstance(action, ElementInteractionAction):
-                return self._execute_element_interaction_action(action)
+                obs = self._execute_element_interaction_action(action)
             elif isinstance(action, DialogHandleAction):
-                return self._execute_dialog_action(action)
+                obs = self._execute_dialog_action(action)
             elif isinstance(action, MouseAction):
-                return self._execute_mouse_action(action)
+                obs = self._execute_mouse_action(action)
             elif isinstance(action, KeyboardAction):
-                return self._execute_keyboard_action(action)
+                obs = self._execute_keyboard_action(action)
             elif isinstance(action, SelectOptionAction):
-                return self._execute_select_option_action(action)
+                obs = self._execute_select_option_action(action)
             elif isinstance(action, UploadFileAction):
-                return self._execute_upload_file_action(action)
+                obs = self._execute_upload_file_action(action)
             else:
                 raise ValueError(f"Unknown action type: {type(action).__name__}")
 
+            # Append rejected-pending candidates to the response so the
+            # agent has them fresh when course-correcting. Skip if the new
+            # action triggered its own pixel gate (its message already
+            # contains a fresh candidates block).
+            if (
+                rejected_pixel_candidates
+                and getattr(obs, "success", False)
+            ):
+                new_pending = self._get_pending_confirmation()
+                new_is_pixel_gate = bool(
+                    new_pending
+                    and new_pending.get("action_type")
+                    in ("mouse_click_pixel", "mouse_drag_pixel")
+                )
+                if not new_is_pixel_gate:
+                    block = self._format_pixel_candidates_block(
+                        rejected_pixel_candidates,
+                        header=(
+                            "Candidates near the previously-previewed click "
+                            "(centers in [0,1000] space)"
+                        ),
+                    )
+                    if block:
+                        existing = obs.message or ""
+                        merged = (
+                            existing.rstrip() + ("\n\n" if existing else "") + block
+                        )
+                        # OpenBrowserObservation is frozen; rebuild via copy.
+                        obs = obs.model_copy(update={"message": merged})
+            return obs
+
         except Exception as e:
             logger.error(f"Error executing action: {e}", exc_info=True)
             return OpenBrowserObservation(
@@ -1031,6 +1121,505 @@ def _denormalize_xy(
         py = round(y * vh / 1000) if y is not None else None
         return (px, py)
 
+    # ========== Pixel-action density gate ==========
+
+    PIXEL_GATE_RADIUS_CSS = 30
+    PIXEL_GATE_CANDIDATE_LIMIT = 5
+
+    def _gate_pixel_target(
+        self, x_css: int, y_css: int
+    ) -> Optional[Dict[str, Any]]:
+        """Ask the extension what sits at (x, y) and which controls are near.
+
+        Best-effort: the extension's analysis dict is returned when the
+        command succeeds with a dict payload. Every failure mode (exception,
+        unsuccessful result, non-dict payload) yields None so the caller can
+        dispatch the action un-gated instead of blocking the agent.
+        """
+        try:
+            response = self._execute_command_sync(
+                AnalyzePixelTargetsCommand(
+                    x=int(x_css),
+                    y=int(y_css),
+                    radius=self.PIXEL_GATE_RADIUS_CSS,
+                    candidate_limit=self.PIXEL_GATE_CANDIDATE_LIMIT,
+                    conversation_id=self.conversation_id,
+                )
+            )
+        except Exception as exc:
+            logger.warning(
+                "Pixel gate analyze failed at (%s, %s): %s",
+                x_css,
+                y_css,
+                exc,
+            )
+            return None
+        succeeded = isinstance(response, dict) and response.get("success")
+        if not succeeded:
+            logger.warning(
+                "Pixel gate analyze returned no success at (%s, %s): %s",
+                x_css,
+                y_css,
+                response,
+            )
+            return None
+        payload = response.get("data")
+        return payload if isinstance(payload, dict) else None
+
+    def _serialize_pixel_candidates(
+        self,
+        candidates: list,
+        vw: int,
+        vh: int,
+    ) -> list:
+        """Project extension candidates into the agent-facing payload shape.
+
+        Each entry keeps the element fields the highlight renderer reads
+        (tagName, type, interactionHints, text, descriptor), the selector,
+        an HTML snippet capped at 512 characters, and the bbox / center
+        scaled from CSS pixels into [0, 1000] space by the given viewport.
+        Candidates whose bbox values are not numeric are skipped, and a
+        non-positive viewport dimension produces an empty list.
+        """
+        serialized = []
+        if not (vw > 0 and vh > 0):
+            return serialized
+        for cand in candidates or []:
+            box = cand.get("bbox") or {}
+            try:
+                left = float(box.get("x", 0))
+                top = float(box.get("y", 0))
+                width = float(box.get("width", 0))
+                height = float(box.get("height", 0))
+            except (TypeError, ValueError):
+                continue
+            markup = cand.get("html") or ""
+            if isinstance(markup, str) and len(markup) > 512:
+                markup = markup[:509] + "..."
+            # Box midpoint, still in CSS pixels at this point.
+            mid_x = left + width / 2
+            mid_y = top + height / 2
+            entry = {
+                # Element fields, in the shape that
+                # `_format_highlighted_element_lines` consumes.
+                "tagName": cand.get("tagName"),
+                "type": cand.get("type"),
+                "interactionHints": cand.get("interactionHints") or [],
+                "text": cand.get("text"),
+                "descriptor": cand.get("descriptor") or {},
+                # Pixel-paradigm extras: truncated markup plus coordinates
+                # pre-normalized so the agent can aim `move` directly.
+                "html": markup,
+                "selector": cand.get("selector"),
+                "bbox_norm": {
+                    "x": round(left / vw * 1000),
+                    "y": round(top / vh * 1000),
+                    "w": round(width / vw * 1000),
+                    "h": round(height / vh * 1000),
+                },
+                "center_norm": {
+                    "x": round(mid_x / vw * 1000),
+                    "y": round(mid_y / vh * 1000),
+                },
+                "distance_css": cand.get("distance"),
+            }
+            serialized.append(entry)
+        return serialized
+
+    def _format_pixel_candidates_block(
+        self,
+        candidates: list,
+        header: str = "Nearby candidates",
+    ) -> str:
+        """Render candidates the way the old highlight observation does.
+
+        Reuses `_format_highlighted_element_lines` so the agent reads the
+        same descriptor-first wording (`<button>"Search" · type=submit`)
+        the element paradigm produced. Appends each candidate's center in
+        [0, 1000] space to the header line so the agent can feed the value
+        directly back into `move`.
+
+        A bare `<tag>` line (no text or attrs, typical of decorative
+        wrappers) gets a truncated HTML snippet appended for grounding.
+        Returns "" when `candidates` is empty or when no candidate rendered
+        any lines, so callers never emit a header with nothing beneath it.
+        """
+        if not candidates:
+            return ""
+        from server.agent.tools.base import _format_highlighted_element_lines
+
+        lines: list[str] = [header + ":"]
+        for i, c in enumerate(candidates, 1):
+            display_id = str(i)
+            element_lines = _format_highlighted_element_lines(display_id, c)
+            if not element_lines:
+                continue
+
+            # Detect "bare tag" rendering: the formatter emits no
+            # segments after the opening `<tag>` so the line ends with
+            # `>`. In that case the agent has nothing to disambiguate
+            # this candidate from any other plain wrapper.
+            if element_lines[0].rstrip().endswith(">"):
+                snippet = self._html_snippet_for_candidate(c)
+                if snippet:
+                    element_lines[0] = f"{element_lines[0]} · {snippet}"
+
+            cn = c.get("center_norm") or {}
+            cx = cn.get("x")
+            cy = cn.get("y")
+            if cx is not None and cy is not None:
+                element_lines[0] = (
+                    f"{element_lines[0]}  → center=({cx}, {cy})"
+                )
+            lines.extend(element_lines)
+        return "\n".join(lines) if len(lines) > 1 else ""
+
+    @staticmethod
+    def _html_snippet_for_candidate(c: Dict[str, Any]) -> str:
+        """Build a short single-line HTML snippet for a candidate.
+
+        Collapses whitespace in `c["html"]`, keeps just the opening tag when
+        it closes early, and caps the result at 160 chars. Without usable
+        HTML it returns `selector='...'` if a selector exists, else "".
+        """
+        html = c.get("html") or ""
+        if not isinstance(html, str) or not html:
+            selector = c.get("selector") or ""
+            if isinstance(selector, str) and selector:
+                return f"selector={selector!r}"
+            return ""
+        snippet = " ".join(html.split())
+        # Keep only the text up to the first `>` (the opening tag) when it
+        # falls within the first 200 chars; its attributes identify the node.
+        close_idx = snippet.find(">")
+        if 0 < close_idx < 200:
+            snippet = snippet[: close_idx + 1]
+        if len(snippet) > 160:
+            snippet = snippet[:157] + "..."
+        return snippet
+
+    def _render_pixel_preview(
+        self,
+        mode: str,
+        x_css: int,
+        y_css: int,
+        target_bbox: Optional[Dict[str, Any]],
+        candidate_bboxes: Optional[list],
+        drag_end: Optional[Dict[str, Any]] = None,
+    ) -> Optional[str]:
+        """Ask the extension to render a confirmation crop.
+
+        Returns the `data:` URL on success. Any failure (command exception,
+        unsuccessful result, non-dict or URL-less payload) returns None.
+        """
+        try:
+            cmd = RenderPixelConfirmCommand(
+                mode=mode,
+                x=int(x_css),
+                y=int(y_css),
+                target_bbox=target_bbox,
+                candidate_bboxes=candidate_bboxes,
+                drag_end=drag_end,
+                conversation_id=self.conversation_id,
+            )
+            result_dict = self._execute_command_sync(cmd)
+        except Exception as e:
+            logger.warning("Pixel preview render failed: %s", e)
+            return None
+        if not isinstance(result_dict, dict) or not result_dict.get("success"):
+            logger.warning("Pixel preview render returned no success: %s", result_dict)
+            return None
+        data = result_dict.get("data")
+        url = data.get("screenshot_data_url") if isinstance(data, dict) else None
+        if isinstance(url, str) and url.startswith("data:"):
+            return url
+        return None
+
+    def _build_pixel_gate_message(
+        self,
+        kind: str,
+        verdict: str,
+        hit: Optional[Dict[str, Any]],
+        candidates: list,
+        drag_endpoints: Optional[Dict[str, str]] = None,
+    ) -> str:
+        """Compose the agent-facing confirmation text; `verdict` is accepted but not read."""
+        lines: list[str] = []
+        if kind == "click":
+            if hit:
+                lines.append(
+                    "The yellow box marks the element `click` would commit. "
+                    "Orange dashed outlines mark nearby alternatives."
+                )
+            else:
+                lines.append(
+                    "Orange dashed outlines mark nearby interactable "
+                    "alternatives near the cursor."
+                )
+            lines.append(
+                "Reply with `mouse` `action: \"confirm\"` to commit, or "
+                "re-emit `move` + `click` aimed at one of the candidate "
+                "centers below."
+            )
+        elif kind == "drag":
+            note = ""
+            if drag_endpoints:
+                note = (
+                    f" Start={drag_endpoints.get('start', 'unknown')}, "
+                    f"end={drag_endpoints.get('end', 'unknown')}."
+                )
+            lines.append(
+                "At least one drag endpoint sits in a dense neighborhood." + note
+            )
+            lines.append(
+                "Reply with `mouse` `action: \"confirm\"` to commit the drag, "
+                "or re-emit `drag` with corrected endpoints."
+            )
+        block = self._format_pixel_candidates_block(
+            candidates,
+            header="Nearby candidates (centers in [0,1000] space)",
+        )
+        if block:
+            lines.append(block)
+        return "\n".join(lines)
+
+    def _gate_pixel_click(
+        self,
+        action: MouseAction,
+        cursor: tuple[int, int],
+        gate: Dict[str, Any],
+    ) -> OpenBrowserObservation:
+        """Stash a pending click and return a confirmation preview observation."""
+        cx, cy = cursor
+        # Prefer the cached viewport; `or {}` also covers `"viewport": null`.
+        gate_viewport = gate.get("viewport") or {}
+        vw, vh = self._get_viewport() or (
+            int(gate_viewport.get("width") or 0),
+            int(gate_viewport.get("height") or 0),
+        )
+        hit = gate.get("hit") if isinstance(gate.get("hit"), dict) else None
+        neighborhood = gate.get("neighborhood") or []
+        candidates = self._serialize_pixel_candidates(neighborhood, vw, vh)
+        candidate_bboxes = [
+            c["bbox"] for c in neighborhood if isinstance(c.get("bbox"), dict)
+        ]
+        hit_bbox = hit.get("bbox") if hit else None
+        target_bbox = hit_bbox if isinstance(hit_bbox, dict) else None
+
+        preview_url = self._render_pixel_preview(
+            mode="pixel_hit" if hit else "pixel_miss",
+            x_css=cx,
+            y_css=cy,
+            target_bbox=target_bbox,
+            candidate_bboxes=candidate_bboxes,
+        )
+
+        message = self._build_pixel_gate_message(
+            kind="click",
+            verdict="dense",
+            hit=hit,
+            candidates=candidates,
+        )
+
+        self._set_pending_confirmation(
+            element_id="",
+            action_type="mouse_click_pixel",
+            full_html=(hit.get("html") if hit else "") or "",
+            extra_data={
+                "px": cx,
+                "py": cy,
+                "button": action.button,
+                "count": action.count,
+                "candidates": candidates,
+                "hit_selector": hit.get("selector") if hit else None,
+            },
+            screenshot_data_url=preview_url,
+        )
+
+        return OpenBrowserObservation(
+            success=True,
+            message=message,
+            screenshot_data_url=preview_url,
+            small_model=self._uses_small_model(),
+        )
+
+    def _gate_pixel_drag(
+        self,
+        action: MouseAction,
+        start_css: tuple[int, int],
+        end_css: tuple[int, int],
+        start_gate: Optional[Dict[str, Any]],
+        end_gate: Optional[Dict[str, Any]],
+        start_dense: bool,
+        end_dense: bool,
+    ) -> OpenBrowserObservation:
+        """Stash a pending drag and return a confirmation preview observation."""
+        sx, sy = start_css
+        ex, ey = end_css
+        viewport = self._get_viewport()
+        if viewport is None:
+            gate_viewport = (start_gate or end_gate or {}).get("viewport") or {}
+            viewport = (
+                int(gate_viewport.get("width") or 0),
+                int(gate_viewport.get("height") or 0),
+            )
+        vw, vh = viewport
+
+        # Feature the dense endpoint in the preview (the start wins when both
+        # are dense). NOTE(review): when the end is featured, `drag_end`
+        # carries the drag START, so any rendered arrow may point backwards.
+        focus_gate = start_gate if start_dense else end_gate
+        focus_x, focus_y = (sx, sy) if start_dense else (ex, ey)
+        drag_end_point = {"x": ex, "y": ey} if start_dense else {"x": sx, "y": sy}
+
+        focus_hit = (
+            focus_gate.get("hit")
+            if focus_gate and isinstance(focus_gate.get("hit"), dict)
+            else None
+        )
+        focus_neighborhood = (focus_gate or {}).get("neighborhood") or []
+        candidates = self._serialize_pixel_candidates(focus_neighborhood, vw, vh)
+        candidate_bboxes = [
+            c["bbox"]
+            for c in focus_neighborhood
+            if isinstance(c.get("bbox"), dict)
+        ]
+        target_bbox = (
+            focus_hit.get("bbox")
+            if focus_hit and isinstance(focus_hit.get("bbox"), dict)
+            else None
+        )
+
+        preview_url = self._render_pixel_preview(
+            mode="pixel_hit" if focus_hit else "pixel_miss",
+            x_css=focus_x,
+            y_css=focus_y,
+            target_bbox=target_bbox,
+            candidate_bboxes=candidate_bboxes,
+            drag_end=drag_end_point,
+        )
+
+        endpoints = {
+            "start": "dense" if start_dense else "sparse",
+            "end": "dense" if end_dense else "sparse",
+        }
+        message = self._build_pixel_gate_message(
+            kind="drag",
+            verdict="dense",
+            hit=focus_hit,
+            candidates=candidates,
+            drag_endpoints=endpoints,
+        )
+
+        self._set_pending_confirmation(
+            element_id="",
+            action_type="mouse_drag_pixel",
+            full_html=(focus_hit.get("html") if focus_hit else "") or "",
+            extra_data={
+                "sx": sx,
+                "sy": sy,
+                "ex": ex,
+                "ey": ey,
+                "button": action.button,
+                "steps": action.steps,
+                "candidates": candidates,
+                "endpoints": endpoints,
+            },
+            screenshot_data_url=preview_url,
+        )
+
+        return OpenBrowserObservation(
+            success=True,
+            message=message,
+            screenshot_data_url=preview_url,
+            small_model=self._uses_small_model(),
+        )
+
+    def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
+        """Commit a pending pixel click or drag stashed by the gate."""
+        pending = self._get_pending_confirmation()
+        if not pending:
+            return OpenBrowserObservation(
+                success=False,
+                error=(
+                    "No pending pixel action to confirm. Emit `move` + `click` "
+                    "or `drag` first; `confirm` is only valid right after a "
+                    "gated preview."
+                ),
+                small_model=self._uses_small_model(),
+            )
+
+        action_type = pending.get("action_type")
+        extra = pending.get("extra_data") or {}
+        # Stashed fields may be None (key present), so default with `or`.
+        try:
+            if action_type == "mouse_click_pixel":
+                button = extra.get("button") or "left"
+                count = int(extra.get("count") or 1)
+                command = MouseClickCommand(
+                    button=MouseButton(button),
+                    count=count,
+                    double=(count == 2),
+                    conversation_id=self.conversation_id,
+                )
+                result_dict = self._execute_command_sync(command)
+                self._clear_pending_confirmation()
+                message = (
+                    f"Confirmed click {button} at "
+                    f"({extra.get('px')}, {extra.get('py')}) (count={count})"
+                )
+                intercepted = self._extract_intercepted_form_control(result_dict)
+                if intercepted:
+                    message = self._format_intercepted_message(
+                        intercepted, button, count
+                    )
+                return self._build_observation_from_result(result_dict, message)
+
+            if action_type == "mouse_drag_pixel":
+                sx = extra.get("sx")
+                sy = extra.get("sy")
+                ex = extra.get("ex")
+                ey = extra.get("ey")
+                button = extra.get("button") or "left"
+                steps = int(extra.get("steps") or 10)
+                if None in (sx, sy, ex, ey):
+                    raise ValueError(
+                        "Pending drag is missing endpoint coordinates."
+                    )
+                command = MouseDragCommand(
+                    start_x=int(sx),
+                    start_y=int(sy),
+                    end_x=int(ex),
+                    end_y=int(ey),
+                    button=MouseButton(button),
+                    steps=steps,
+                    conversation_id=self.conversation_id,
+                )
+                result_dict = self._execute_command_sync(command)
+                self._clear_pending_confirmation()
+                self._cache_cursor(int(ex), int(ey))
+                return self._build_observation_from_result(
+                    result_dict, f"Confirmed drag from ({sx}, {sy}) to ({ex}, {ey})"
+                )
+
+            self._clear_pending_confirmation()
+            return OpenBrowserObservation(
+                success=False,
+                error=(
+                    f"Pending action type {action_type!r} is not a pixel "
+                    "action. `confirm` only commits gated `mouse` previews."
+                ),
+                small_model=self._uses_small_model(),
+            )
+        except Exception as e:
+            logger.error(
+                "Failed to commit pending pixel action: %s", e, exc_info=True
+            )
+            self._clear_pending_confirmation()
+            return OpenBrowserObservation(
+                success=False, error=str(e), small_model=self._uses_small_model()
+            )
+
     def _execute_mouse_action(
         self, action: MouseAction
     ) -> OpenBrowserObservation:
@@ -1052,21 +1641,36 @@ def _execute_mouse_action(
                     x=px, y=py, conversation_id=self.conversation_id
                 )
                 result_dict = self._execute_command_sync(command)
+                if px is not None and py is not None:
+                    self._cache_cursor(px, py)
                 return self._build_observation_from_result(
                     result_dict, f"Mouse moved to ({px}, {py})"
                 )
 
             if kind == "click":
-                # Click is in-place at the cursor's current position. Any
-                # x/y the model emitted is ignored on purpose — if it wants
-                # to click somewhere new it must `move` there first, so the
-                # visible cursor in the screenshot is the click point.
+                # Click is in-place at the cursor's current position. The
+                # gate runs against the cursor coord (the implicit click
+                # point), not action.x/y — the agent positions via `move`
+                # first, so any x/y on the action is informational.
                 if action.x is not None or action.y is not None:
                     logger.debug(
                         "Mouse click ignored x=%s, y=%s (click is in-place)",
                         action.x,
                         action.y,
                     )
+                cursor = self._get_cursor_or_center()
+                gate = (
+                    self._gate_pixel_target(cursor[0], cursor[1])
+                    if cursor is not None
+                    else None
+                )
+                if (
+                    gate
+                    and gate.get("verdict") == "dense"
+                    and cursor is not None
+                ):
+                    return self._gate_pixel_click(action, cursor, gate)
+
                 command = MouseClickCommand(
                     button=MouseButton(action.button),
                     count=action.count,
@@ -1078,12 +1682,6 @@ def _execute_mouse_action(
                     f"Clicked {action.button} at the cursor "
                     f"(count={action.count})"
                 )
-                # When the click landed on a native <select> or
-                # <input type=file>, the extension suppresses the OS-level
-                # popup and returns descriptor metadata. Surface it in the
-                # observation message so the agent knows to follow up with
-                # `select_option` or `upload_file` (the option list / file
-                # input metadata is invisible in the screenshot).
                 intercepted = self._extract_intercepted_form_control(result_dict)
                 if intercepted:
                     message = self._format_intercepted_message(
@@ -1101,6 +1699,26 @@ def _execute_mouse_action(
                     raise ValueError("mouse drag requires x, y, x2, y2")
                 sx, sy = self._denormalize_xy(action.x, action.y)
                 ex, ey = self._denormalize_xy(action.x2, action.y2)
+
+                start_gate = self._gate_pixel_target(sx, sy)
+                end_gate = self._gate_pixel_target(ex, ey)
+                start_dense = bool(
+                    start_gate and start_gate.get("verdict") == "dense"
+                )
+                end_dense = bool(
+                    end_gate and end_gate.get("verdict") == "dense"
+                )
+                if start_dense or end_dense:
+                    return self._gate_pixel_drag(
+                        action,
+                        (sx, sy),
+                        (ex, ey),
+                        start_gate,
+                        end_gate,
+                        start_dense,
+                        end_dense,
+                    )
+
                 command = MouseDragCommand(
                     start_x=sx,
                     start_y=sy,
@@ -1111,6 +1729,8 @@ def _execute_mouse_action(
                     conversation_id=self.conversation_id,
                 )
                 result_dict = self._execute_command_sync(command)
+                if ex is not None and ey is not None:
+                    self._cache_cursor(ex, ey)
                 return self._build_observation_from_result(
                     result_dict, f"Dragged from ({sx}, {sy}) to ({ex}, {ey})"
                 )
@@ -1132,10 +1752,16 @@ def _execute_mouse_action(
                     conversation_id=self.conversation_id
                 )
                 result_dict = self._execute_command_sync(command)
+                viewport = self._get_viewport()
+                if viewport is not None:
+                    self._cache_cursor(viewport[0] // 2, viewport[1] // 2)
                 return self._build_observation_from_result(
                     result_dict, "Reset cursor to viewport center"
                 )
 
+            if kind == "confirm":
+                return self._commit_pending_pixel_action()
+
             raise ValueError(f"Unknown mouse action: {kind}")
         except Exception as e:
             logger.error(f"Mouse action failed (kind={kind}): {e}", exc_info=True)
diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py
index d7b52df..9ac4fe1 100644
--- a/server/agent/tools/mouse_tool.py
+++ b/server/agent/tools/mouse_tool.py
@@ -31,7 +31,9 @@ def get_mouse_tool_description(conv_state=None) -> str:
     )
 
 
-MouseActionKind = Literal["move", "click", "drag", "scroll", "reset"]
+MouseActionKind = Literal[
+    "move", "click", "drag", "scroll", "reset", "confirm"
+]
 
 
 class MouseAction(OpenBrowserAction):
@@ -55,7 +57,9 @@ class MouseAction(OpenBrowserAction):
             "'drag' — press at (x, y), drag to (x2, y2), release. "
             "'scroll' — scroll at the cursor position by `amount` in "
             "`direction`. "
-            "'reset' — return the cursor to the viewport center."
+            "'reset' — return the cursor to the viewport center. "
+            "'confirm' — commit a pending click or drag that was previewed "
+            "as a zoomed crop in the previous response."
         )
     )
 
diff --git a/server/core/processor.py b/server/core/processor.py
index 30b2ff4..d22558f 100644
--- a/server/core/processor.py
+++ b/server/core/processor.py
@@ -39,6 +39,8 @@
     SetSliderValueCommand,
     UploadFileCommand,
     HighlightDropPreviewCommand,
+    AnalyzePixelTargetsCommand,
+    RenderPixelConfirmCommand,
 )
 from server.websocket.manager import ws_manager
 from server.core.config import config
@@ -286,6 +288,10 @@ async def execute(self, command: Command) -> CommandResponse:
                 return await self._execute_recording_control(command)
             elif isinstance(command, HighlightDropPreviewCommand):
                 return await self._execute_highlight_drop_preview(command)
+            elif isinstance(command, AnalyzePixelTargetsCommand):
+                return await self._execute_analyze_pixel_targets(command)
+            elif isinstance(command, RenderPixelConfirmCommand):
+                return await self._execute_render_pixel_confirm(command)
             elif isinstance(command, UploadFileCommand):
                 return await self._execute_upload_file(command)
             else:
@@ -503,6 +509,18 @@ async def _execute_highlight_drop_preview(
         """Highlight inner elements of a drop container for drag-and-drop 2PC"""
         return await self._send_prepared_command(command)
 
+    async def _execute_analyze_pixel_targets(
+        self, command: AnalyzePixelTargetsCommand
+    ) -> CommandResponse:
+        """Probe (x, y) for the hit element and nearby interactables."""
+        return await self._send_prepared_command(command)
+
+    async def _execute_render_pixel_confirm(
+        self, command: RenderPixelConfirmCommand
+    ) -> CommandResponse:
+        """Render a zoomed confirmation crop for a pending pixel action."""
+        return await self._send_prepared_command(command)
+
     async def _execute_upload_file(self, command: UploadFileCommand) -> CommandResponse:
         """Attach a local file to an <input type=file> via CDP.
 
diff --git a/server/models/commands.py b/server/models/commands.py
index d894ea0..cd2edd3 100644
--- a/server/models/commands.py
+++ b/server/models/commands.py
@@ -624,6 +624,80 @@ class HighlightSingleElementCommand(BaseCommand):
     )
 
 
+class AnalyzePixelTargetsCommand(BaseCommand):
+    """Probe the live viewport at a CSS-pixel coordinate to find what
+    interactable element (if any) lies under the point and which other
+    interactables sit within `radius` of it.
+
+    The extension reuses the highlight-detection engine and returns:
+      - hit: smallest interactable whose bbox contains (x, y), or None
+      - neighborhood: top-N interactables within `radius` CSS pixels of
+        the point, sorted by distance
+      - verdict: 'sparse' if neighborhood has 0–1 elements, 'dense' if 2+
+
+    The server uses the verdict to gate pixel mouse_click / mouse_drag:
+    only dense neighborhoods receive a confirmation preview round-trip.
+    """
+
+    type: Literal["analyze_pixel_targets"] = "analyze_pixel_targets"
+    x: int = Field(description="Click X in CSS pixels (viewport coord)")
+    y: int = Field(description="Click Y in CSS pixels (viewport coord)")
+    radius: int = Field(
+        default=30,
+        description="Neighborhood radius in CSS pixels around (x, y).",
+        ge=0,
+    )
+    candidate_limit: int = Field(
+        default=5,
+        description="Max number of nearby candidates to return.",
+        ge=1,
+        le=20,
+    )
+
+
+class RenderPixelConfirmCommand(BaseCommand):
+    """Produce a zoomed confirmation screenshot for a pending pixel action.
+
+    Two modes:
+      - 'pixel_hit': the click landed on `target_selector`; render a YELLOW
+        box around it and zoom-crop centered on the element.
+      - 'pixel_miss': the click landed in whitespace; render a red crosshair
+        at (x, y) plus thin grey outlines on `candidate_bboxes`. Zoom-crop
+        centered on the click point.
+
+    Returns a screenshot data URL only — server already holds the
+    structured candidate list from the prior analyze_pixel_targets call.
+    """
+
+    type: Literal["render_pixel_confirm"] = "render_pixel_confirm"
+    mode: Literal["pixel_hit", "pixel_miss"] = Field(
+        description="Visual mode: pixel_hit (yellow box) or pixel_miss (crosshair)."
+    )
+    x: int = Field(description="Click X in CSS pixels (viewport coord)")
+    y: int = Field(description="Click Y in CSS pixels (viewport coord)")
+    target_bbox: Optional[dict] = Field(
+        default=None,
+        description=(
+            "Bbox of the hit element {x, y, width, height} in CSS pixels. "
+            "Required for pixel_hit; ignored for pixel_miss."
+        ),
+    )
+    candidate_bboxes: Optional[List[dict]] = Field(
+        default=None,
+        description=(
+            "Bbox dicts {x, y, width, height} in CSS pixels for outlines "
+            "(used by pixel_miss)."
+        ),
+    )
+    drag_end: Optional[dict] = Field(
+        default=None,
+        description=(
+            "Optional {x, y} CSS-pixel point to draw an arrow toward (for "
+            "drag-endpoint previews)."
+        ),
+    )
+
+
 class HighlightDropPreviewCommand(BaseCommand):
     """Highlight inner elements of a drop container for drag-and-drop 2PC flow.
 
@@ -715,6 +789,8 @@ class TabsResponse(CommandResponse):
     SelectElementCommand,
     GetElementHtmlCommand | HighlightSingleElementCommand,
     HighlightDropPreviewCommand,
+    AnalyzePixelTargetsCommand,
+    RenderPixelConfirmCommand,
     RecordingControlCommand,
     DragAndDropElementCommand,
     SetSliderValueCommand,
@@ -756,6 +832,8 @@ def parse_command(data: dict) -> Command:
         "get_element_html": GetElementHtmlCommand,
         "highlight_single_element": HighlightSingleElementCommand,
         "highlight_drop_preview": HighlightDropPreviewCommand,
+        "analyze_pixel_targets": AnalyzePixelTargetsCommand,
+        "render_pixel_confirm": RenderPixelConfirmCommand,
         "recording_control": RecordingControlCommand,
         "drag_and_drop_element": DragAndDropElementCommand,
         "set_slider_value": SetSliderValueCommand,
diff --git a/uv.lock b/uv.lock
index 3b242f9..e443f38 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1678,8 +1678,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=962e93361757dcab26f7acab6b5756496e4ee4e8" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=962e93361757dcab26f7acab6b5756496e4ee4e8" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2224,7 +2224,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=962e93361757dcab26f7acab6b5756496e4ee4e8#962e93361757dcab26f7acab6b5756496e4ee4e8" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b#2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2244,7 +2244,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=962e93361757dcab26f7acab6b5756496e4ee4e8#962e93361757dcab26f7acab6b5756496e4ee4e8" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b#2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 0f4917d83001bfdd740d0fec3ba9e6f6df18e737 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 6 May 2026 15:29:46 +0800
Subject: [PATCH 07/14] chore(deps): bump agent-sdk pin to 50a52fad (pixel-only
 system prompts)

Picks up the rewritten system_prompt_large/small that drop highlight,
element_interaction, element_id, BLUE/YELLOW staging, and the
corner-badge label rule. The runtime hasn't exposed those tools for a
while, so the prompt residue was the last source of `highlight`-tool
hallucinations.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 4 ++--
 uv.lock        | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 490bf42..5eba3ec 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,5 +76,5 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
-openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
+openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "50a52fad63b96a48082146739ab40feafbc37423" }
+openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "50a52fad63b96a48082146739ab40feafbc37423" }
diff --git a/uv.lock b/uv.lock
index e443f38..8781ac2 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1678,8 +1678,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=50a52fad63b96a48082146739ab40feafbc37423" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=50a52fad63b96a48082146739ab40feafbc37423" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2224,7 +2224,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b#2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=50a52fad63b96a48082146739ab40feafbc37423#50a52fad63b96a48082146739ab40feafbc37423" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2244,7 +2244,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=2ea1956a1237187b08c1dbac7ebe9f699204a93b#2ea1956a1237187b08c1dbac7ebe9f699204a93b" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=50a52fad63b96a48082146739ab40feafbc37423#50a52fad63b96a48082146739ab40feafbc37423" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 169e0b0891a0b5eecb69c2f387aa5cba24eb6bc5 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 6 May 2026 15:30:03 +0800
Subject: [PATCH 08/14] feat(agent): combined click {x,y}, gated-preview DOM
 overlay, terser observation

- mouse `click` now accepts optional `x, y`. With both supplied the
  cursor moves there and clicks in one step; the gate runs against the
  new cursor. Without coords it still clicks at the cursor's current
  position for hover-then-click flows. Half-supplied coords are
  rejected so the agent re-emits cleanly.

- The dense-neighborhood gate now paints its overlay on the live page
  DOM in addition to the screenshot crop: a single absolute-positioned
  div per box (yellow solid for the hit, orange dashed for nearby
  candidates) lives inside one overlay container, so a human watching
  the browser sees what the agent sees and removing the container
  cleans everything up. Drawing as floating divs (not via `outline`
  on the element itself) avoids stacking with the page's own focus or
  hover ring, fixing the double-yellow rectangle on Xiaohongshu's
  favorite button.

- The new `clear_pixel_overlay` command fires before any non-confirm
  mouse action and at the top of `_commit_pending_pixel_action` so
  the overlay disappears whether the agent confirms or re-aims, and
  the post-commit screenshot is unhighlighted.

- Tool/observation prompts updated for the new shapes: `click {x, y}`
  shown as the canonical form; gated-preview observation is one line
  ("Click previewed on the yellow target. Confirm to commit, or
  re-emit `click` ...") with the verification step taught in the
  system prompt's gated-preview block instead of an in-page banner.

- Non-gated click message includes the actual click point in CSS
  pixels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/background/index.ts             |  33 +++-
 .../src/commands/pixel-confirm-render.ts      | 170 ++++++++++++++++++
 extension/src/types.ts                        |  11 ++
 server/agent/prompts/big_model/mouse_tool.j2  |  32 ++--
 .../agent/prompts/small_model/mouse_tool.j2   |  31 ++--
 server/agent/tools/browser_executor.py        | 129 ++++++++++---
 server/agent/tools/mouse_tool.py              |  20 ++-
 server/core/processor.py                      |   9 +
 server/models/commands.py                     |  36 ++++
 9 files changed, 396 insertions(+), 75 deletions(-)

diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 0f2ebe3..d8d3ab2 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -43,7 +43,10 @@ import {
 } from '../commands/single-highlight';
 import { highlightDropPreview } from '../commands/drop-preview-highlight';
 import { analyzePixelTargets } from '../commands/pixel-target-analyzer';
-import { renderPixelConfirm } from '../commands/pixel-confirm-render';
+import {
+  renderPixelConfirm,
+  clearPixelConfirmOverlay,
+} from '../commands/pixel-confirm-render';
 import { elementCache } from '../commands/element-cache';
 import { assignHashedElementIds } from '../commands/element-id';
 import { buildElementCacheMissMessage } from '../commands/element-cache';
@@ -1823,6 +1826,31 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         };
       }
 
+      case 'clear_pixel_overlay': {
+        if (!command.conversation_id) {
+          throw new Error(
+            'conversation_id is required for clear_pixel_overlay command (strict mode)',
+          );
+        }
+        const conversationId = command.conversation_id;
+        const activeTabId = tabManager.getCurrentActiveTabId(conversationId);
+        if (!activeTabId) {
+          // No active tab — nothing to clear, but treat as a no-op success.
+          return {
+            success: true,
+            message: 'clear_pixel_overlay (no active tab)',
+            timestamp: Date.now(),
+          };
+        }
+        await tabManager.ensureTabManaged(activeTabId, conversationId);
+        await clearPixelConfirmOverlay(activeTabId, conversationId);
+        return {
+          success: true,
+          message: 'clear_pixel_overlay',
+          timestamp: Date.now(),
+        };
+      }
+
       case 'render_pixel_confirm': {
         if (!command.conversation_id) {
           throw new Error(
@@ -1848,6 +1876,9 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
             y: command.y,
             target_bbox: command.target_bbox,
             candidate_bboxes: command.candidate_bboxes,
+            target_selector: command.target_selector,
+            candidate_selectors: command.candidate_selectors,
+            banner_kind: command.banner_kind,
             drag_end: command.drag_end,
           },
         );
diff --git a/extension/src/commands/pixel-confirm-render.ts b/extension/src/commands/pixel-confirm-render.ts
index 935f007..6cb22b9 100644
--- a/extension/src/commands/pixel-confirm-render.ts
+++ b/extension/src/commands/pixel-confirm-render.ts
@@ -15,6 +15,10 @@
  */
 
 import { captureScreenshot, compressIfNeeded } from './screenshot';
+import { executeJavaScript } from './javascript';
+
+const PIXEL_OVERLAY_ID = '__ob_pixel_confirm_overlay__';
+const OVERLAY_INJECTION_TIMEOUT_MS = 5000;
 
 const HIT_BORDER_COLOR = '#FFD400';
 const HIT_GLOW_COLOR = 'rgba(255, 212, 0, 0.7)';
@@ -53,6 +57,9 @@ export interface PixelConfirmRenderRequest {
   y: number; // CSS px
   target_bbox?: BBox; // CSS px (required for pixel_hit)
   candidate_bboxes?: BBox[]; // CSS px
+  target_selector?: string; // CSS selector for the hit element (DOM overlay)
+  candidate_selectors?: string[]; // CSS selectors for candidates (DOM overlay)
+  banner_kind?: 'click' | 'drag'; // preview kind; gates overlay injection (no banner is drawn)
   drag_end?: PointXY; // CSS px (optional second point for drag previews)
 }
 
@@ -252,6 +259,146 @@ function drawDragArrow(
   ctx.restore();
 }
 
+function buildPixelOverlayScript(request: PixelConfirmRenderRequest): string {
+  // Builds the source of a page-side IIFE that paints the confirm overlay:
+  // candidate outlines, the hit box, and an optional drag line. No banner
+  // div — a floating banner would overlap the neighboring candidates, which
+  // are exactly the alternatives the agent might want to re-aim at.
+  const payload = {
+    overlayId: PIXEL_OVERLAY_ID,
+    targetSelector: request.target_selector || null,
+    targetBbox: request.target_bbox || null,
+    candidateSelectors: request.candidate_selectors || [],
+    candidateBboxes: request.candidate_bboxes || [],
+    drag: request.drag_end
+      ? { from: { x: request.x, y: request.y }, to: request.drag_end }
+      : null,
+  };
+
+  return `
+    (() => {
+      const cfg = ${JSON.stringify(payload)};
+      const OVERLAY_ID = cfg.overlayId;
+
+      // Wipe any previous overlay container — every box we draw lives
+      // inside it, so removing the container is the entire cleanup.
+      const prev = document.getElementById(OVERLAY_ID);
+      if (prev) prev.remove();
+
+      const overlay = document.createElement('div');
+      overlay.id = OVERLAY_ID;
+      overlay.style.cssText =
+        'position:absolute;top:0;left:0;pointer-events:none;z-index:2147483647;';
+      document.documentElement.appendChild(overlay);
+
+      const sx = window.scrollX || window.pageXOffset || 0;
+      const sy = window.scrollY || window.pageYOffset || 0;
+
+      const resolveBbox = (selector, fallbackBbox) => {
+        if (selector) {
+          try {
+            const el = document.querySelector(selector);
+            if (el) {
+              const rect = el.getBoundingClientRect();
+              if (rect.width > 0 && rect.height > 0) {
+                return {
+                  x: rect.left,
+                  y: rect.top,
+                  width: rect.width,
+                  height: rect.height,
+                };
+              }
+            }
+          } catch (_) {}
+        }
+        return fallbackBbox || null;
+      };
+
+      const drawBox = (bbox, color, dashed, role) => {
+        if (!bbox || bbox.width <= 0 || bbox.height <= 0) return;
+        const div = document.createElement('div');
+        const borderWidth = dashed ? 2 : 3;
+        div.setAttribute('data-ob-role', role);
+        div.style.cssText =
+          'position:absolute;pointer-events:none;box-sizing:border-box;'
+          + 'left:' + (bbox.x + sx - borderWidth) + 'px;'
+          + 'top:' + (bbox.y + sy - borderWidth) + 'px;'
+          + 'width:' + (bbox.width + borderWidth * 2) + 'px;'
+          + 'height:' + (bbox.height + borderWidth * 2) + 'px;'
+          + 'border:' + borderWidth + 'px '
+          + (dashed ? 'dashed' : 'solid') + ' ' + color + ';'
+          + 'border-radius:3px;'
+          + 'background:transparent;';
+        overlay.appendChild(div);
+      };
+
+      // Candidates first so the hit box paints on top; walk the longer list
+      // so a candidate with only a bbox (no selector) is still outlined.
+      const candidateColor = '#FF6B00';
+      const candidateCount = Math.max(
+        cfg.candidateSelectors.length, cfg.candidateBboxes.length);
+      for (let i = 0; i < candidateCount; i++) {
+        const bbox = resolveBbox(cfg.candidateSelectors[i], cfg.candidateBboxes[i]);
+        drawBox(bbox, candidateColor, true, 'candidate');
+      }
+
+      const hitColor = '#FFD400';
+      const hitBbox = resolveBbox(cfg.targetSelector, cfg.targetBbox);
+      if (hitBbox) drawBox(hitBbox, hitColor, false, 'hit');
+
+      if (cfg.drag && cfg.drag.from && cfg.drag.to) {
+        // Plain rotated line between the two endpoints (no arrowhead drawn).
+        const arrow = document.createElement('div');
+        const dx = cfg.drag.to.x - cfg.drag.from.x;
+        const dy = cfg.drag.to.y - cfg.drag.from.y;
+        const len = Math.hypot(dx, dy);
+        const angle = Math.atan2(dy, dx);
+        arrow.style.cssText =
+          'position:absolute;pointer-events:none;'
+          + 'left:' + (cfg.drag.from.x + sx) + 'px;'
+          + 'top:' + (cfg.drag.from.y + sy - 1) + 'px;'
+          + 'width:' + Math.max(1, len) + 'px;'
+          + 'height:3px;'
+          + 'background:rgba(255,212,0,0.9);'
+          + 'transform-origin:0 50%;'
+          + 'transform:rotate(' + angle + 'rad);';
+        overlay.appendChild(arrow);
+      }
+
+      return { overlay: true };
+    })();
+  `;
+}
+
+function buildPixelOverlayCleanupScript(): string {
+  return `
+    (() => {
+      const OVERLAY_ID = ${JSON.stringify(PIXEL_OVERLAY_ID)};
+      const prev = document.getElementById(OVERLAY_ID);
+      if (prev) prev.remove();
+      return { cleared: true };
+    })();
+  `;
+}
+
+export async function clearPixelConfirmOverlay(
+  tabId: number,
+  conversationId: string,
+): Promise<void> {
+  try {
+    await executeJavaScript(
+      tabId,
+      conversationId,
+      buildPixelOverlayCleanupScript(),
+      true,
+      true,
+      OVERLAY_INJECTION_TIMEOUT_MS,
+    );
+  } catch (e) {
+    console.warn('[PixelConfirmRender] cleanup failed', e);
+  }
+}
+
 export async function renderPixelConfirm(
   tabId: number,
   conversationId: string,
@@ -264,6 +411,29 @@ export async function renderPixelConfirm(
     throw new Error('[PixelConfirmRender] createImageBitmap is not available');
   }
 
+  // Inject the same yellow / orange overlay onto the live page DOM so a
+  // human watching the browser sees what the agent sees. The screenshot
+  // captured below picks up these overlays naturally; canvas-side
+  // drawing further down is a fail-safe in case injection is blocked.
+  if (
+    request.target_selector ||
+    (request.candidate_selectors && request.candidate_selectors.length > 0) ||
+    request.banner_kind
+  ) {
+    try {
+      await executeJavaScript(
+        tabId,
+        conversationId,
+        buildPixelOverlayScript(request),
+        true,
+        true,
+        OVERLAY_INJECTION_TIMEOUT_MS,
+      );
+    } catch (e) {
+      console.warn('[PixelConfirmRender] DOM overlay injection failed', e);
+    }
+  }
+
   // Capture a clean shot — no cursor (we draw our own crosshair / box).
   const shot = await captureScreenshot(
     tabId,
diff --git a/extension/src/types.ts b/extension/src/types.ts
index 115db71..456eed8 100644
--- a/extension/src/types.ts
+++ b/extension/src/types.ts
@@ -323,10 +323,20 @@ export interface RenderPixelConfirmCommand extends BaseCommand {
   target_bbox?: { x: number; y: number; width: number; height: number };
   /** Bboxes for nearby candidate outlines (used by pixel_miss). */
   candidate_bboxes?: { x: number; y: number; width: number; height: number }[];
+  /** CSS selector for the hit element (drives in-page DOM overlay). */
+  target_selector?: string;
+  /** CSS selectors for nearby candidates (drive in-page DOM overlay). */
+  candidate_selectors?: string[];
+  /** Preview kind (click vs drag); the overlay script draws no in-page banner. */
+  banner_kind?: 'click' | 'drag';
   /** Optional drag end-point in CSS pixels (renders an arrow). */
   drag_end?: { x: number; y: number };
 }
 
+export interface ClearPixelOverlayCommand extends BaseCommand {
+  type: 'clear_pixel_overlay';
+}
+
 export interface RecordingControlCommand extends BaseCommand {
   type: 'recording_control';
   action: RecordingControlAction;
@@ -382,6 +392,7 @@ export type Command =
   | HighlightDropPreviewCommand
   | AnalyzePixelTargetsCommand
   | RenderPixelConfirmCommand
+  | ClearPixelOverlayCommand
   | RecordingControlCommand;
 
 export interface CommandResponse {
diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2
index fb08c31..7905449 100644
--- a/server/agent/prompts/big_model/mouse_tool.j2
+++ b/server/agent/prompts/big_model/mouse_tool.j2
@@ -15,7 +15,7 @@ Estimate from the screenshot. Aim for the visual center of your target. The syst
 
 A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot.
 
-`click` commits at the cursor's current position. To click a new target: `move` there → look at the next screenshot to confirm the red dot is on the target → `click`.
+`click {x, y}` moves the cursor to `(x, y)` and clicks there in one step. `click` with no coordinates clicks at the cursor's current position — use that after a `move` when you want a hover effect to fire first.
 
 ## Actions
 
@@ -27,19 +27,20 @@ Slide the cursor to a point. The cursor traces an eased path so hover effects (C
 ```
 
 ### click
-Click at the cursor's current position.
+Click on the page.
 
 ```json
+{ "action": "click", "x": 612, "y": 318 }
 { "action": "click" }
+{ "action": "click", "x": 612, "y": 318, "count": 2 }
 { "action": "click", "button": "right" }
-{ "action": "click", "count": 2 }
 ```
 
+- With `x, y`: cursor moves to `(x, y)` and clicks there. Default form for "click this thing".
+- Without `x, y`: clicks at the cursor's current position. Use after a `move` for hover-then-click flows, or after a gated preview when re-aiming would be wrong.
 - `button`: `"left"` (default), `"right"`, `"middle"`.
 - `count`: `1` (default), `2` for double-click, `3` for triple-click (text selection).
 
-Before calling `click`, look at the most recent screenshot and confirm the red dot is on top of the element you want to act on. If it isn't, `move` again first.
-
 ### drag
 Press at `(x, y)`, drag to `(x2, y2)`, release. One call.
 
@@ -77,31 +78,26 @@ Return the cursor to the viewport center.
 
 ## Confirmation previews
 
-When `click` or `drag` lands in an area with several interactable controls close together, the next observation is a zoomed crop showing exactly what your coordinate selected. Two cases:
-
-- **Yellow box** around an element: the click sits on it. Reply with `{"action": "confirm"}` to commit, or re-emit `move` + `click` (or `drag`) aimed at a corrected coordinate.
-- **No yellow box, only orange outlines**: the click sits between controls. Re-emit aimed at one of the candidate centers listed in the message.
+When `click` or `drag` lands in an area with several interactable controls close together, the next observation is a zoomed crop showing exactly what your coordinate selected. The same outlines are also painted onto the live page DOM, so the screenshot reflects what a human watching the browser would see.
 
-Orange dashed outlines mark nearby candidates. The message lists each one with its HTML and **center coordinates in [0, 1000] space**. Pick the center matching your goal:
+- A **yellow** outline marks the element the click would commit on.
+- **Orange dashed** outlines mark nearby candidates. The message lists each candidate's HTML and center coordinates in `[0, 1000]` space.
 
-```json
-{ "action": "move", "x": 612, "y": 318 }
-{ "action": "click" }
-```
+Check the yellow-highlighted element. If it matches what you wanted to click (or drag), reply `{"action": "confirm"}` to commit. If it does not — or if no yellow outline is shown, meaning the coordinate landed between controls — re-emit `click` (or `drag`) with one of the listed candidate centers as `x, y`.
 
 For a drag preview, the same rules apply at each endpoint. `confirm` commits the drag as previewed; otherwise re-emit `drag` with corrected `x, y, x2, y2`.
 
 ## Patterns
 
-- **Click a button**: `move` to the button → check the screenshot → `click`.
-- **Hover-reveal menu**: `move` over the trigger; the next screenshot shows the menu open.
+- **Click a button**: `click` with the button's center coordinates.
+- **Hover-reveal menu**: `move` over the trigger; the next screenshot shows the menu open. Then `click` (with or without coords) on the revealed item.
 - **Scroll to find something**: `scroll` direction `down`, then check the new screenshot. Repeat as needed.
 - **Drag a slider**: one `drag` from the handle's current position to the target position.
-- **Right-click for context menu**: `move` to the target → `click` with `button: "right"`.
+- **Right-click for context menu**: `click` with the target's coords and `button: "right"`.
 
 ## Notes
 
 - One action per turn. The next observation reflects the post-action state.
-- The cursor position persists across actions — it stays where you last left it until the next `move`.
+- The cursor position persists across actions — it stays where you last left it until the next `move` or coordinate-form `click`.
 - If a target isn't in the viewport, `scroll` to bring it in view before pointing at it.
 - If a confirm/prompt dialog opens, handle it with the dialog tool before the next mouse action.
diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2
index 461e04a..a88ef44 100644
--- a/server/agent/prompts/small_model/mouse_tool.j2
+++ b/server/agent/prompts/small_model/mouse_tool.j2
@@ -10,7 +10,7 @@ Move, click, drag, and scroll a virtual mouse cursor.
 
 A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot.
 
-`click` commits at the cursor's current position. To click a new target: `move` there → check the screenshot → `click`.
+`click {x, y}` moves the cursor to `(x, y)` and clicks in one step. `click` with no coords clicks at the cursor's current position — use that after a `move` when you want a hover effect to fire first.
 
 ## Actions
 
@@ -21,17 +21,18 @@ Slide the cursor to `(x, y)`.
 ```
 
 ### click
-Click at the cursor's current position.
+Click on the page.
 ```json
+{ "action": "click", "x": 612, "y": 318 }
 { "action": "click" }
-{ "action": "click", "count": 2 }
+{ "action": "click", "x": 612, "y": 318, "count": 2 }
 { "action": "click", "button": "right" }
 ```
+- With `x, y`: cursor moves to `(x, y)` and clicks. Default form.
+- Without `x, y`: clicks at the cursor's current position. Use after a `move` for hover-then-click flows.
 - `button`: `"left"` (default), `"right"`, `"middle"`.
 - `count`: 1 (default), 2 for double-click.
 
-Before clicking, verify in the screenshot that the red dot is on top of the target. If not, `move` again first.
-
 ### drag
 Press at `(x, y)`, drag to `(x2, y2)`, release.
 ```json
@@ -60,23 +61,17 @@ Only valid after a preview observation (zoomed crop with a yellow box or red cro
 
 ## Confirmation previews
 
-If `click` or `drag` falls in a crowded area, the next observation is a zoomed crop:
-
-- **Yellow box** = your action would commit on that element. Reply `{"action": "confirm"}` to commit, or `move` to a corrected target.
-- **No yellow box, only orange dashed outlines** = the click landed between controls. Re-emit aimed at one of the listed candidate centers in [0, 1000] space.
+If `click` or `drag` falls in a crowded area, the next observation is a zoomed crop. The same outlines are also painted on the live page DOM.
 
-Orange dashed outlines mark nearby candidates in either case.
+- A **yellow** outline marks the element the click would commit on.
+- **Orange dashed** outlines mark nearby candidates, listed in the message with HTML and center coordinates in `[0, 1000]` space.
 
-Example after a miss:
-```json
-{ "action": "move", "x": 612, "y": 318 }
-{ "action": "click" }
-```
+Check the yellow-highlighted element. If it matches your intent, reply `{"action": "confirm"}` to commit. Otherwise — including when no yellow outline is shown because the coordinate landed between controls — re-emit `click` (or `drag`) with one of the listed candidate centers as `x, y`.
 
 ## Patterns
 
-- **Click a button**: `move` → check screenshot → `click`.
-- **Hover**: `move` over the trigger; next screenshot shows the result.
+- **Click a button**: `click` with the button's center coordinates.
+- **Hover**: `move` over the trigger; next screenshot shows the result, then `click` to commit.
 - **Scroll to find**: `scroll` then check the new screenshot.
 - **Drag**: one `drag` with start and end coordinates.
-- **Type into a field**: `move` to the field → `click` to focus it → `keyboard type` the text.
+- **Type into a field**: `click {x, y}` on the field to focus it → `keyboard type` the text.
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 1131445..4fc2674 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -42,6 +42,7 @@
     HighlightDropPreviewCommand,
     AnalyzePixelTargetsCommand,
     RenderPixelConfirmCommand,
+    ClearPixelOverlayCommand,
     MouseMoveCommand,
     MouseClickCommand,
     MouseDragCommand,
@@ -1305,10 +1306,18 @@ def _render_pixel_preview(
         y_css: int,
         target_bbox: Optional[Dict[str, Any]],
         candidate_bboxes: Optional[list],
+        target_selector: Optional[str] = None,
+        candidate_selectors: Optional[list] = None,
+        banner_kind: Optional[str] = None,
         drag_end: Optional[Dict[str, Any]] = None,
     ) -> Optional[str]:
         """Ask the extension to render a confirmation crop.
 
+        Selectors and `banner_kind` enable the extension to draw the
+        same yellow/orange overlay directly on the live page DOM (so a
+        human watching the browser sees what the agent sees) before the
+        screenshot capture. The crop is always returned for the agent.
+
         Returns the data URL on success, or None on failure (caller should
         gracefully proceed without a preview rather than block).
         """
@@ -1319,6 +1328,9 @@ def _render_pixel_preview(
                 y=int(y_css),
                 target_bbox=target_bbox,
                 candidate_bboxes=candidate_bboxes,
+                target_selector=target_selector,
+                candidate_selectors=candidate_selectors,
+                banner_kind=banner_kind,
                 drag_end=drag_end,
                 conversation_id=self.conversation_id,
             )
@@ -1343,37 +1355,35 @@ def _build_pixel_gate_message(
         candidates: list,
         drag_endpoints: Optional[Dict[str, str]] = None,
     ) -> str:
-        """Compose the human-readable confirmation message for the agent."""
+        """Compose the human-readable confirmation message for the agent.
+
+        Kept terse on purpose: the zoomed crop already shows the yellow
+        target and orange neighbors visually, so the message contributes
+        only the candidate list (HTML + centers) and one-line guidance.
+        """
         lines: list[str] = []
         if kind == "click":
             if hit:
                 lines.append(
-                    "The yellow box marks the element `click` would commit. "
-                    "Orange dashed outlines mark nearby alternatives."
+                    "Click previewed on the yellow target. Confirm to "
+                    "commit, or re-emit `click` with one of the candidate "
+                    "centers below."
                 )
             else:
                 lines.append(
-                    "Orange dashed outlines mark nearby interactable "
-                    "alternatives near the cursor."
+                    "No element under the cursor — re-emit `click` with one "
+                    "of the candidate centers below."
                 )
-            lines.append(
-                "Reply with `mouse` `action: \"confirm\"` to commit, or "
-                "re-emit `move` + `click` aimed at one of the candidate "
-                "centers below."
-            )
         elif kind == "drag":
             note = ""
             if drag_endpoints:
                 note = (
-                    f" Start={drag_endpoints.get('start', 'unknown')}, "
-                    f"end={drag_endpoints.get('end', 'unknown')}."
+                    f" (start={drag_endpoints.get('start', 'unknown')}, "
+                    f"end={drag_endpoints.get('end', 'unknown')})"
                 )
             lines.append(
-                "At least one drag endpoint sits in a dense neighborhood." + note
-            )
-            lines.append(
-                "Reply with `mouse` `action: \"confirm\"` to commit the drag, "
-                "or re-emit `drag` with corrected endpoints."
+                "Drag previewed" + note + ". Confirm to commit, or re-emit "
+                "`drag` with corrected endpoints."
             )
         block = self._format_pixel_candidates_block(
             candidates,
@@ -1402,9 +1412,17 @@ def _gate_pixel_click(
         candidate_bboxes = [
             c["bbox"] for c in neighborhood if isinstance(c.get("bbox"), dict)
         ]
+        candidate_selectors = [
+            c.get("selector") for c in neighborhood
+            if isinstance(c.get("selector"), str)
+        ]
         target_bbox = (
             hit.get("bbox") if hit and isinstance(hit.get("bbox"), dict) else None
         )
+        target_selector = (
+            hit.get("selector") if hit and isinstance(hit.get("selector"), str)
+            else None
+        )
 
         preview_url = self._render_pixel_preview(
             mode="pixel_hit" if hit else "pixel_miss",
@@ -1412,6 +1430,9 @@ def _gate_pixel_click(
             y_css=cy,
             target_bbox=target_bbox,
             candidate_bboxes=candidate_bboxes,
+            target_selector=target_selector,
+            candidate_selectors=candidate_selectors,
+            banner_kind="click",
         )
 
         message = self._build_pixel_gate_message(
@@ -1484,11 +1505,20 @@ def _gate_pixel_drag(
             for c in focus_neighborhood
             if isinstance(c.get("bbox"), dict)
         ]
+        candidate_selectors = [
+            c.get("selector") for c in focus_neighborhood
+            if isinstance(c.get("selector"), str)
+        ]
         target_bbox = (
             focus_hit.get("bbox")
             if focus_hit and isinstance(focus_hit.get("bbox"), dict)
             else None
         )
+        target_selector = (
+            focus_hit.get("selector")
+            if focus_hit and isinstance(focus_hit.get("selector"), str)
+            else None
+        )
 
         preview_url = self._render_pixel_preview(
             mode="pixel_hit" if focus_hit else "pixel_miss",
@@ -1496,6 +1526,9 @@ def _gate_pixel_drag(
             y_css=focus_y,
             target_bbox=target_bbox,
             candidate_bboxes=candidate_bboxes,
+            target_selector=target_selector,
+            candidate_selectors=candidate_selectors,
+            banner_kind="drag",
             drag_end=drag_end_point,
         )
 
@@ -1549,6 +1582,15 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                 small_model=self._uses_small_model(),
             )
 
+        # Strip the in-page overlay before committing so the click commits on
+        # an unhighlighted page and the resulting screenshot is clean.
+        try:
+            self._execute_command_sync(
+                ClearPixelOverlayCommand(conversation_id=self.conversation_id)
+            )
+        except Exception as e:
+            logger.debug("clear_pixel_overlay before commit failed: %s", e)
+
         action_type = pending.get("action_type")
         extra = pending.get("extra_data") or {}
 
@@ -1632,6 +1674,20 @@ def _execute_mouse_action(
         kind = action.action
         logger.debug(f"DEBUG: _execute_mouse_action kind={kind}")
 
+        # Strip any DOM overlay left over from a previous gated preview so the
+        # live page matches the agent's new intent. `confirm` is skipped here:
+        # the commit path (`_commit_pending_pixel_action`) clears the overlay
+        # itself right before it commits the pending action.
+        if kind != "confirm":
+            try:
+                self._execute_command_sync(
+                    ClearPixelOverlayCommand(
+                        conversation_id=self.conversation_id
+                    )
+                )
+            except Exception as e:
+                logger.debug("clear_pixel_overlay best-effort failed: %s", e)
+
         try:
             if kind == "move":
                 if action.x is None or action.y is None:
@@ -1648,16 +1704,26 @@ def _execute_mouse_action(
                 )
 
             if kind == "click":
-                # Click is in-place at the cursor's current position. The
-                # gate runs against the cursor coord (the implicit click
-                # point), not action.x/y — the agent positions via `move`
-                # first, so any x/y on the action is informational.
-                if action.x is not None or action.y is not None:
-                    logger.debug(
-                        "Mouse click ignored x=%s, y=%s (click is in-place)",
-                        action.x,
-                        action.y,
+                # `click {x, y}` is move-then-click in one call. With both
+                # coordinates supplied, slide the cursor to (x, y), cache it,
+                # and run the gate against that fresh position. With no
+                # coordinates, click at the cursor's current position
+                # (hover-then-click flows). Mixing — e.g. only x — is
+                # rejected so the agent re-emits cleanly.
+                has_x = action.x is not None
+                has_y = action.y is not None
+                if has_x ^ has_y:
+                    raise ValueError(
+                        "mouse click with explicit coordinates needs both x and y"
                     )
+                if has_x and has_y:
+                    px, py = self._denormalize_xy(action.x, action.y)
+                    move_command = MouseMoveCommand(
+                        x=px, y=py, conversation_id=self.conversation_id
+                    )
+                    self._execute_command_sync(move_command)
+                    if px is not None and py is not None:
+                        self._cache_cursor(px, py)
                 cursor = self._get_cursor_or_center()
                 gate = (
                     self._gate_pixel_target(cursor[0], cursor[1])
@@ -1678,10 +1744,15 @@ def _execute_mouse_action(
                     conversation_id=self.conversation_id,
                 )
                 result_dict = self._execute_command_sync(command)
-                message = (
-                    f"Clicked {action.button} at the cursor "
-                    f"(count={action.count})"
+                cx, cy = cursor or (None, None)
+                where = (
+                    f"({cx}, {cy})" if cx is not None and cy is not None
+                    else "the cursor"
+                )
+                count_note = (
+                    f", count={action.count}" if action.count != 1 else ""
                 )
+                message = f"Clicked {action.button} at {where}{count_note}."
                 intercepted = self._extract_intercepted_form_control(result_dict)
                 if intercepted:
                     message = self._format_intercepted_message(
diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py
index 9ac4fe1..b3a5f60 100644
--- a/server/agent/tools/mouse_tool.py
+++ b/server/agent/tools/mouse_tool.py
@@ -49,11 +49,11 @@ class MouseAction(OpenBrowserAction):
             "What to do with the mouse. "
             "'move' — slide the cursor to (x, y). The cursor traces an eased "
             "path so hover effects fire naturally along the way. "
-            "'click' — click at the cursor's current position. To click a "
-            "different target, 'move' there first, then verify the red dot "
-            "is on the target in the next screenshot, then 'click'. Use "
-            "`count: 2` for double-click, `count: 3` for triple-click. "
-            "`button: 'right'` for context menus. "
+            "'click' — click on the page. Pass `x, y` to move the cursor "
+            "there and click in one step; omit `x, y` to click at the "
+            "cursor's current position (use this after a `move` for a "
+            "hover-then-click flow). `count: 2` double-clicks, `count: 3` "
+            "triple-clicks. `button: 'right'` opens the context menu. "
             "'drag' — press at (x, y), drag to (x2, y2), release. "
             "'scroll' — scroll at the cursor position by `amount` in "
             "`direction`. "
@@ -66,8 +66,9 @@ class MouseAction(OpenBrowserAction):
     x: Optional[int] = Field(
         default=None,
         description=(
-            "Target X in Qwen-VL [0, 1000] normalized space. Used by 'move' "
-            "and 'drag' (start)."
+            "Target X in [0, 1000] normalized space. For 'move' and 'click' "
+            "this is the destination; for 'drag' this is the start of the "
+            "drag."
         ),
         ge=0,
         le=1000,
@@ -75,8 +76,9 @@ class MouseAction(OpenBrowserAction):
     y: Optional[int] = Field(
         default=None,
         description=(
-            "Target Y in Qwen-VL [0, 1000] normalized space. Used by 'move' "
-            "and 'drag' (start)."
+            "Target Y in [0, 1000] normalized space. For 'move' and 'click' "
+            "this is the destination; for 'drag' this is the start of the "
+            "drag."
         ),
         ge=0,
         le=1000,
diff --git a/server/core/processor.py b/server/core/processor.py
index d22558f..a8e2e21 100644
--- a/server/core/processor.py
+++ b/server/core/processor.py
@@ -41,6 +41,7 @@
     HighlightDropPreviewCommand,
     AnalyzePixelTargetsCommand,
     RenderPixelConfirmCommand,
+    ClearPixelOverlayCommand,
 )
 from server.websocket.manager import ws_manager
 from server.core.config import config
@@ -292,6 +293,8 @@ async def execute(self, command: Command) -> CommandResponse:
                 return await self._execute_analyze_pixel_targets(command)
             elif isinstance(command, RenderPixelConfirmCommand):
                 return await self._execute_render_pixel_confirm(command)
+            elif isinstance(command, ClearPixelOverlayCommand):
+                return await self._execute_clear_pixel_overlay(command)
             elif isinstance(command, UploadFileCommand):
                 return await self._execute_upload_file(command)
             else:
@@ -521,6 +524,12 @@ async def _execute_render_pixel_confirm(
         """Render a zoomed confirmation crop for a pending pixel action."""
         return await self._send_prepared_command(command)
 
+    async def _execute_clear_pixel_overlay(
+        self, command: ClearPixelOverlayCommand
+    ) -> CommandResponse:
+        """Clear any pixel-confirmation overlay drawn on the page."""
+        return await self._send_prepared_command(command)
+
     async def _execute_upload_file(self, command: UploadFileCommand) -> CommandResponse:
         """Attach a local file to an <input type=file> via CDP.
 
diff --git a/server/models/commands.py b/server/models/commands.py
index cd2edd3..b0d93ac 100644
--- a/server/models/commands.py
+++ b/server/models/commands.py
@@ -689,6 +689,30 @@ class RenderPixelConfirmCommand(BaseCommand):
             "(used by pixel_miss)."
         ),
     )
+    target_selector: Optional[str] = Field(
+        default=None,
+        description=(
+            "CSS selector for the hit element. When provided, the extension "
+            "draws a yellow outline directly on the live page DOM in "
+            "addition to the canvas overlay."
+        ),
+    )
+    candidate_selectors: Optional[List[str]] = Field(
+        default=None,
+        description=(
+            "CSS selectors for nearby candidate elements. When provided, the "
+            "extension draws orange dashed outlines directly on the live "
+            "page DOM."
+        ),
+    )
+    banner_kind: Optional[Literal["click", "drag"]] = Field(
+        default=None,
+        description=(
+            "Banner kind for the in-page confirmation prompt — 'click' "
+            "renders 'Is this the element you wanted to click?', 'drag' "
+            "renders the drag equivalent. Omit to skip the banner."
+        ),
+    )
     drag_end: Optional[dict] = Field(
         default=None,
         description=(
@@ -698,6 +722,16 @@ class RenderPixelConfirmCommand(BaseCommand):
     )
 
 
+class ClearPixelOverlayCommand(BaseCommand):
+    """Remove any pixel-confirmation overlay currently drawn on the page.
+
+    Sent before each non-`confirm` mouse action and right before a pending
+    pixel action is committed, so no stale yellow/orange overlay lingers.
+    """
+
+    type: Literal["clear_pixel_overlay"] = "clear_pixel_overlay"
+
+
 class HighlightDropPreviewCommand(BaseCommand):
     """Highlight inner elements of a drop container for drag-and-drop 2PC flow.
 
@@ -791,6 +825,7 @@ class TabsResponse(CommandResponse):
     HighlightDropPreviewCommand,
     AnalyzePixelTargetsCommand,
     RenderPixelConfirmCommand,
+    ClearPixelOverlayCommand,
     RecordingControlCommand,
     DragAndDropElementCommand,
     SetSliderValueCommand,
@@ -834,6 +869,7 @@ def parse_command(data: dict) -> Command:
         "highlight_drop_preview": HighlightDropPreviewCommand,
         "analyze_pixel_targets": AnalyzePixelTargetsCommand,
         "render_pixel_confirm": RenderPixelConfirmCommand,
+        "clear_pixel_overlay": ClearPixelOverlayCommand,
         "recording_control": RecordingControlCommand,
         "drag_and_drop_element": DragAndDropElementCommand,
         "set_slider_value": SetSliderValueCommand,

From 9e682c0ef850767834e19f63988863bb7dea3ddb Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 6 May 2026 16:09:43 +0800
Subject: [PATCH 09/14] feat(agent): warn when a click produces no DOM change,
 surface nearby targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Arms a MutationObserver in `performMouseClick` before dispatch, reads
the count ~250 ms after the click, and returns `triggered_anything:
false` when no non-cursor mutations were recorded — i.e. the click did
nothing the page reacted to. The observer filters out the cursor
sprite (`#__ob_cursor__`) and the pixel-confirm overlay so neither
their post-click refreshes nor their teardown counts as page activity.
DOM mutations cover CSS-class flips, aria toggles, child appends, and
text changes, so a like-heart toggle that only swaps an SVG color
still registers as a real click.

When the server sees `triggered_anything: false`, the click
observation gets a one-line warning plus the same nearby-interactable
candidate block the dense-gate preview uses (descriptor + center in
[0,1000] space), so the agent has somewhere concrete to re-aim
instead of looping on the same dead pixel.

Validated on the Gmail Finance Follow-up eval test that previously
hit the 660 s wall-clock cap stuck in a click-Escape-click loop on
the reply-draft modal: with the warning in place the agent re-aims
on each no-op, finishes the workflow in 258 s, and scores 8.0/8.0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/commands/pixel-actions.ts | 151 +++++++++++++++++++++++-
 server/agent/tools/browser_executor.py  |  74 ++++++++++++
 2 files changed, 224 insertions(+), 1 deletion(-)

diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index 3672b82..a71beed 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -108,6 +108,117 @@ function clampToViewport(
   return { x: cx, y: cy, warning };
 }
 
+/**
+ * No-op click detection via MutationObserver.
+ *
+ * A snapshot/diff probe (URL, title, active-element, body-text length,
+ * dialog count) misses common interactions that only flip a CSS class
+ * or toggle an aria attribute — e.g. tapping a like-heart on Xiaohongshu
+ * changes the SVG color and `aria-pressed` but no body text, no URL.
+ * Counting DOM mutations during the click window catches those: any
+ * mutation outside the agent's own nodes means the click did *something*.
+ *
+ * Records for the injected cursor sprite and the pixel-confirm overlay
+ * are ignored: changes inside them (text nodes included) and childList
+ * records that only insert/remove them never count as page activity.
+ */
+const ARM_MUTATION_OBSERVER_SCRIPT = `
+  (() => {
+    try {
+      const w = window;
+      if (w.__ob_click_obs__) {
+        try { w.__ob_click_obs__.disconnect(); } catch (_) {}
+      }
+      w.__ob_click_mutations__ = 0;
+      // Agent-owned nodes: cursor sprite (virtual-cursor.ts) + confirm overlay.
+      const skipIds = new Set(['__ob_cursor__', '__ob_pixel_confirm_overlay__']);
+      // True when n or an ancestor is agent-owned; walks from text nodes
+      // too (characterData targets), which carry no id themselves.
+      const owned = (n) => {
+        for (let t = n; t; t = t.parentNode) {
+          if (t.nodeType === 1 && skipIds.has(t.id)) return true;
+        }
+        return false;
+      };
+      const obs = new MutationObserver((muts) => {
+        for (const m of muts) {
+          if (owned(m.target)) continue;
+          // childList records target the parent, so sprite/overlay
+          // (re)injection or removal must be judged by the moved nodes.
+          const moved = [...m.addedNodes, ...m.removedNodes];
+          if (moved.length > 0 && moved.every(owned)) continue;
+          w.__ob_click_mutations__++;
+        }
+      });
+      obs.observe(document.documentElement, {
+        childList: true, subtree: true, attributes: true, characterData: true,
+      });
+      w.__ob_click_obs__ = obs;
+      return { armed: true };
+    } catch (e) {
+      return { armed: false, error: String(e) };
+    }
+  })()
+`;
+
+const READ_MUTATION_OBSERVER_SCRIPT = `
+  (() => {
+    try {
+      const w = window;
+      const obs = w.__ob_click_obs__;
+      if (!obs) return { mutations: -1 };
+      try { obs.disconnect(); } catch (_) {}
+      const c = w.__ob_click_mutations__ || 0;
+      delete w.__ob_click_obs__;
+      delete w.__ob_click_mutations__;
+      return { mutations: c };
+    } catch (e) {
+      return { mutations: -1, error: String(e) };
+    }
+  })()
+`;
+
+/**
+ * Evaluates the arming script in the page. Resolves to true only when
+ * the page answers `armed: true`; a thrown evaluation is logged and
+ * reported as false.
+ */
+async function armClickMutationObserver(cdp: CdpCommander): Promise<boolean> {
+  type Reply = { result?: { value?: { armed?: boolean } } };
+  const req = {
+    expression: ARM_MUTATION_OBSERVER_SCRIPT,
+    returnByValue: true,
+  };
+  try {
+    const out = await cdp.sendCommand<Reply>('Runtime.evaluate', req, 4000, 0);
+    return out?.result?.value?.armed === true;
+  } catch (e) {
+    console.warn('[PixelActions] arm mutation observer failed', e);
+    return false;
+  }
+}
+
+/**
+ * Evaluates the read script in the page and returns the mutation count
+ * it reports, or -1 when evaluation throws or no numeric count arrives.
+ */
+async function readClickMutationCount(cdp: CdpCommander): Promise<number> {
+  type Reply = { result?: { value?: { mutations?: number } } };
+  const req = {
+    expression: READ_MUTATION_OBSERVER_SCRIPT,
+    returnByValue: true,
+  };
+  try {
+    const out = await cdp.sendCommand<Reply>('Runtime.evaluate', req, 4000, 0);
+    const count = out?.result?.value?.mutations;
+    if (typeof count === 'number') return count;
+  } catch (e) {
+    console.warn('[PixelActions] read mutation observer failed', e);
+  }
+  // Shared fallback for both the thrown and the non-numeric case.
+  return -1;
+}
+
 async function refreshCursor(
   cdp: CdpCommander,
   tabId: number,
@@ -219,6 +330,15 @@ export interface MouseClickResult {
   button: string;
   warning?: string;
   intercepted_form_control?: NativeFormControlHit;
+  /**
+   * False when a MutationObserver armed before dispatch saw zero
+   * non-cursor DOM mutations within ~250 ms after the click — i.e. the
+   * click did nothing the page reacted to. Surfaces as a warning in the
+   * agent's observation so it stops re-clicking a non-interactable spot.
+   * Undefined when the probe could not run (intercepted form control,
+   * arming failed, evaluator failure).
+   */
+  triggered_anything?: boolean;
 }
 
 // Hit-test the click point. If the cursor sits on a native <select> or
@@ -375,6 +495,11 @@ export async function performMouseClick(
   const buttons = button === 'left' ? 1 : button === 'right' ? 2 : 4;
   const safeCount = Math.max(1, Math.min(3, count | 0));
 
+  // Arm a MutationObserver right before dispatch so the post-click
+  // read can decide whether the click changed anything in the DOM.
+  // Cursor-sprite + pixel-overlay mutations are filtered out.
+  const observerArmed = await armClickMutationObserver(cdp);
+
   // CDP convention: emit one press/release pair per click and increment
   // `clickCount` (1, 2, 3) so Chrome interprets it as a single → double →
   // triple click sequence. Sending N pairs each with `clickCount:N` produces
@@ -409,7 +534,31 @@ export async function performMouseClick(
   }
 
   await refreshCursor(cdp, tabId, clamped.x, clamped.y);
-  return { x: clamped.x, y: clamped.y, button, warning: clamped.warning };
+
+  // Wait briefly for synchronous + short-async reactions (animation
+  // start, focus change, modal open, navigation kickoff) to land before
+  // we read the mutation count. 250 ms covers most React/Vue render
+  // passes without perceptibly slowing the action loop.
+  await new Promise((r) => setTimeout(r, 250));
+  let triggered: boolean | undefined;
+  if (observerArmed) {
+    const mutations = await readClickMutationCount(cdp);
+    if (mutations >= 0) {
+      // 0 mutations = click did nothing the page reacted to. Anything
+      // positive means the page changed something — class toggle, aria
+      // flip, child append, navigation start. Read failure (-1) leaves
+      // `triggered` undefined so the agent gets no warning.
+      triggered = mutations > 0;
+    }
+  }
+
+  return {
+    x: clamped.x,
+    y: clamped.y,
+    button,
+    warning: clamped.warning,
+    triggered_anything: triggered,
+  };
 }
 
 export async function performMouseDrag(
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 4fc2674..448632a 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1615,6 +1615,10 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                     message = self._format_intercepted_message(
                         intercepted, button, count
                     )
+                elif self._click_was_a_no_op(result_dict):
+                    message += self._format_no_op_warning_from_candidates(
+                        extra.get("candidates") or []
+                    )
                 return self._build_observation_from_result(result_dict, message)
 
             if action_type == "mouse_drag_pixel":
@@ -1758,6 +1762,8 @@ def _execute_mouse_action(
                     message = self._format_intercepted_message(
                         intercepted, action.button, action.count
                     )
+                elif self._click_was_a_no_op(result_dict):
+                    message += self._format_no_op_warning(gate)
                 return self._build_observation_from_result(result_dict, message)
 
             if kind == "drag":
@@ -1913,6 +1919,74 @@ def _execute_keyboard_action(
                 success=False, error=str(e), small_model=self._uses_small_model()
             )
 
+    def _format_no_op_warning(
+        self, gate: Optional[Dict[str, Any]]
+    ) -> str:
+        """Warning text for a click that committed but produced no DOM change.
+
+        When `gate` carries a neighborhood from the pixel-target probe, the
+        nearby interactable elements are listed so the agent has somewhere
+        concrete to re-aim at. Empty-space clicks otherwise leave the agent
+        with no signal beyond "it didn't work."
+        """
+        candidates: list = []
+        if gate:
+            neighborhood = gate.get("neighborhood") or []
+            viewport = gate.get("viewport") or {}
+            vw = int(viewport.get("width") or 0)
+            vh = int(viewport.get("height") or 0)
+            if not vw or not vh:
+                vp = self._get_viewport()
+                if vp is not None:
+                    vw, vh = vp
+            candidates = self._serialize_pixel_candidates(neighborhood, vw, vh)
+        return self._format_no_op_warning_from_candidates(candidates)
+
+    def _format_no_op_warning_from_candidates(
+        self, candidates: list
+    ) -> str:
+        """Render the no-op warning + candidate block from pre-serialized data."""
+        lines = [
+            "",
+            "⚠ The click produced no DOM change. The element may be "
+            "disabled, behind a modal, off-screen, or non-interactable. "
+            "Re-clicking the same spot will not help — re-aim at one of "
+            "the nearby interactable elements below, scroll the target "
+            "into view, or focus a different control.",
+        ]
+        block = self._format_pixel_candidates_block(
+            candidates,
+            header="Nearby interactable elements (centers in [0,1000] space)",
+        )
+        if block:
+            lines.append(block)
+        else:
+            lines[-1] = (
+                "⚠ The click produced no DOM change and no nearby "
+                "interactable element was detected. Scroll the target "
+                "into view or pick a different region of the screen."
+            )
+        return "\n".join(lines)
+
+    @staticmethod
+    def _click_was_a_no_op(result_dict: Optional[Dict[str, Any]]) -> bool:
+        """True iff the extension's post-click probe saw zero DOM mutations.
+
+        A MutationObserver is armed before the CDP click and read ~250ms
+        after — `triggered_anything: false` means the observer recorded
+        zero non-cursor mutations during that window, i.e. the click did
+        nothing the page reacted to. Any other shape (including a probe
+        failure) is treated as "did something" so we never warn falsely.
+        """
+        if not result_dict:
+            return False
+        data = result_dict.get("data")
+        if isinstance(data, dict) and "triggered_anything" in data:
+            return data.get("triggered_anything") is False
+        if "triggered_anything" in result_dict:
+            return result_dict.get("triggered_anything") is False
+        return False
+
     @staticmethod
     def _extract_intercepted_form_control(
         result_dict: Optional[Dict[str, Any]],

From 81c4927957ac768c3d1bfea1a690f257a3510808 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 6 May 2026 16:25:55 +0800
Subject: [PATCH 10/14] feat(agent): draw orange-dashed candidates on no-op
 clicks; clear overlays extension-side
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups to the no-op-click warning:

1. **Visualize candidates on the live page.** When `triggered_anything:
   false` comes back from the click probe, the server now also fires
   `render_pixel_confirm` (mode `pixel_miss`, no banner) so the same
   orange-dashed boxes the agent reads as text in its observation are
   painted onto the live page DOM. A human watching the browser sees
   exactly what alternatives the agent is being told to consider.
   Serialized candidates gained a `bbox_css` field so the confirm path
   can draw the overlay without re-running the gate.

2. **Tear overlays down extension-side.** Cleanup moved from the server
   (which only fired before mouse actions) into the extension's
   pixel-action dispatcher, so every incoming agent action — mouse,
   keyboard, select, upload — clears any leftover overlay before
   running. `_execute_mouse_action` and `_commit_pending_pixel_action`
   no longer send their own `clear_pixel_overlay` (the local cleanup
   covers both gated-preview and no-op-warning overlays uniformly).

Also fix a leak in the Gmail eval mock: the "Create a new label" input
had `placeholder="Finance/Board-Prep"` — the exact label the test
asks the agent to create — and no `autocomplete="off"`, so Chrome
also surfaced the value as a history suggestion. Replaced the
placeholder with a generic `e.g. Marketing/Q4-Launch` and disabled
autocomplete.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 eval/gmail/js/gmail.js                 |   2 +-
 extension/src/background/index.ts      |  12 +++
 server/agent/tools/browser_executor.py | 129 ++++++++++++++++++++-----
 3 files changed, 119 insertions(+), 24 deletions(-)

diff --git a/eval/gmail/js/gmail.js b/eval/gmail/js/gmail.js
index 415600b..eabaded 100644
--- a/eval/gmail/js/gmail.js
+++ b/eval/gmail/js/gmail.js
@@ -793,7 +793,7 @@ window.tracker = new AgentTracker("mail.google.com", "hard");
             </div>
             <div style="margin-top: 18px;">
               <label class="mock-kicker" for="new-label-name">Create a new label</label>
-              <input id="new-label-name" class="mock-input" data-role="new-label-name" value="${escapeHtml(modal.newLabelName || "")}" placeholder="Finance/Board-Prep">
+              <input id="new-label-name" class="mock-input" data-role="new-label-name" value="${escapeHtml(modal.newLabelName || "")}" placeholder="e.g. Marketing/Q4-Launch" autocomplete="off">
             </div>
           </div>
           <div class="mock-modal-footer">
diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index d8d3ab2..632504c 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -1918,6 +1918,18 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         await tabManager.ensureTabManaged(activeTabId, conversationId);
         tabManager.updateTabActivity(activeTabId, conversationId);
 
+        // Drop any pixel-confirm overlay left over from a previous gated
+        // preview or no-op warning the moment the next agent action lands.
+        // The server no longer sends its own clear_pixel_overlay before
+        // mouse actions (and never did for keyboard/select/upload), so
+        // clearing here keeps the overlay tied to "the agent's last
+        // decision turn" regardless of which action follows.
+        try {
+          await clearPixelConfirmOverlay(activeTabId, conversationId);
+        } catch (e) {
+          console.warn('[PixelAction] pre-action overlay clear failed', e);
+        }
+
         // eslint-disable-next-line @typescript-eslint/no-explicit-any
         let actionDetail: Record<string, any> = {};
         try {
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 448632a..1321f14 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -42,7 +42,6 @@
     HighlightDropPreviewCommand,
     AnalyzePixelTargetsCommand,
     RenderPixelConfirmCommand,
-    ClearPixelOverlayCommand,
     MouseMoveCommand,
     MouseClickCommand,
     MouseDragCommand,
@@ -1212,6 +1211,12 @@ def _serialize_pixel_candidates(
                     # HTML snippet for grounding without re-querying.
                     "html": html,
                     "selector": c.get("selector"),
+                    "bbox_css": {
+                        "x": int(bx),
+                        "y": int(by),
+                        "width": int(bw),
+                        "height": int(bh),
+                    },
                     "bbox_norm": {
                         "x": round(bx / vw * 1000),
                         "y": round(by / vh * 1000),
@@ -1582,14 +1587,9 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                 small_model=self._uses_small_model(),
             )
 
-        # Strip the in-page overlay before committing so the click commits on
-        # an unhighlighted page and the resulting screenshot is clean.
-        try:
-            self._execute_command_sync(
-                ClearPixelOverlayCommand(conversation_id=self.conversation_id)
-            )
-        except Exception as e:
-            logger.debug("clear_pixel_overlay before commit failed: %s", e)
+        # No explicit clear needed: the actual mouse_click / mouse_drag
+        # dispatched below routes through the extension's pixel-action
+        # case, which clears the gated-preview overlay at its top.
 
         action_type = pending.get("action_type")
         extra = pending.get("extra_data") or {}
@@ -1616,9 +1616,18 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                         intercepted, button, count
                     )
                 elif self._click_was_a_no_op(result_dict):
+                    serialized = extra.get("candidates") or []
                     message += self._format_no_op_warning_from_candidates(
-                        extra.get("candidates") or []
+                        serialized
                     )
+                    # Same overlay as the direct-click path so the live
+                    # page visually surfaces the candidates the agent was
+                    # given as alternatives.
+                    px, py = extra.get("px"), extra.get("py")
+                    if isinstance(px, int) and isinstance(py, int):
+                        self._draw_no_op_overlay_from_serialized(
+                            (px, py), serialized
+                        )
                 return self._build_observation_from_result(result_dict, message)
 
             if action_type == "mouse_drag_pixel":
@@ -1678,19 +1687,10 @@ def _execute_mouse_action(
         kind = action.action
         logger.debug(f"DEBUG: _execute_mouse_action kind={kind}")
 
-        # Strip any DOM overlay left over from a previous gated preview so the
-        # live page matches the agent's new intent. `confirm` keeps the overlay
-        # in place — the next observation after the click will redraw or the
-        # navigation will replace the page.
-        if kind != "confirm":
-            try:
-                self._execute_command_sync(
-                    ClearPixelOverlayCommand(
-                        conversation_id=self.conversation_id
-                    )
-                )
-            except Exception as e:
-                logger.debug("clear_pixel_overlay best-effort failed: %s", e)
+        # Overlay teardown lives in the extension's pixel-action dispatcher
+        # so every incoming agent action — mouse, keyboard, select, upload —
+        # clears any leftover overlay before running. No server-side clear
+        # needed here.
 
         try:
             if kind == "move":
@@ -1764,6 +1764,11 @@ def _execute_mouse_action(
                     )
                 elif self._click_was_a_no_op(result_dict):
                     message += self._format_no_op_warning(gate)
+                    # Draw orange-dashed candidates on the live page so a
+                    # human watching the browser sees what the agent is
+                    # told to re-aim at. Cleared extension-side when the
+                    # agent's next pixel action of any kind is dispatched.
+                    self._draw_no_op_overlay(cursor, gate)
                 return self._build_observation_from_result(result_dict, message)
 
             if kind == "drag":
@@ -1919,6 +1924,84 @@ def _execute_keyboard_action(
                 success=False, error=str(e), small_model=self._uses_small_model()
             )
 
+    def _draw_no_op_overlay(
+        self,
+        cursor: Optional[tuple[int, int]],
+        gate: Optional[Dict[str, Any]],
+    ) -> None:
+        """Inject orange-dashed candidate boxes onto the live page for the
+        no-op case. Mirrors the gated-preview overlay so a human watching
+        the browser sees the same alternatives the agent is told to re-aim
+        at. Best-effort: failures are logged and swallowed.
+        """
+        if cursor is None or not gate:
+            return
+        neighborhood = gate.get("neighborhood") or []
+        if not neighborhood:
+            return
+        candidate_selectors: list = []
+        candidate_bboxes: list = []
+        for c in neighborhood:
+            if not isinstance(c, dict):
+                continue
+            sel = c.get("selector")
+            bbox = c.get("bbox") if isinstance(c.get("bbox"), dict) else None
+            if isinstance(sel, str):
+                candidate_selectors.append(sel)
+            if bbox:
+                candidate_bboxes.append(bbox)
+        if not candidate_selectors and not candidate_bboxes:
+            return
+        try:
+            self._render_pixel_preview(
+                mode="pixel_miss",
+                x_css=cursor[0],
+                y_css=cursor[1],
+                target_bbox=None,
+                candidate_bboxes=candidate_bboxes,
+                target_selector=None,
+                candidate_selectors=candidate_selectors,
+                banner_kind=None,
+            )
+        except Exception as e:
+            logger.debug("no-op overlay render failed: %s", e)
+
+    def _draw_no_op_overlay_from_serialized(
+        self,
+        cursor: Optional[tuple[int, int]],
+        candidates: list,
+    ) -> None:
+        """Same as `_draw_no_op_overlay` but takes pre-serialized candidates
+        (the form stashed in `extra_data` for confirm-path commits)."""
+        if cursor is None or not candidates:
+            return
+        candidate_selectors: list = []
+        candidate_bboxes: list = []
+        for c in candidates:
+            if not isinstance(c, dict):
+                continue
+            sel = c.get("selector")
+            bbox = c.get("bbox_css")
+            if isinstance(sel, str):
+                candidate_selectors.append(sel)
+            if isinstance(bbox, dict):
+                candidate_bboxes.append(bbox)
+        if not candidate_selectors and not candidate_bboxes:
+            return
+        try:
+            self._render_pixel_preview(
+                mode="pixel_miss",
+                x_css=cursor[0],
+                y_css=cursor[1],
+                target_bbox=None,
+                candidate_bboxes=candidate_bboxes,
+                target_selector=None,
+                candidate_selectors=candidate_selectors,
+                banner_kind=None,
+            )
+        except Exception as e:
+            logger.debug("no-op overlay render failed: %s", e)
+
     def _format_no_op_warning(
         self, gate: Optional[Dict[str, Any]]
     ) -> str:

From f7d052b88d0943bd4cb9a1379b114cb4c3706383 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 7 May 2026 09:17:06 +0800
Subject: [PATCH 11/14] fix(agent): keyboard clear that actually clears + click
 no-op probe that doesn't false-alarm on focus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`keyboard clear` previously sent Ctrl+A + Backspace via two CDP key
events. On macOS Cmd is the select-all modifier; Ctrl+A no-ops, the
Backspace deletes one character, and the SUCCESS message lies. Replace
with a dedicated `keyboard_clear` command that runs JS against
`document.activeElement` (input/textarea value setter, contenteditable
textContent, role=textbox) and dispatches input+change so framework-
controlled widgets observe the reset. Reports SUCCESS only when the
field actually ended up empty; otherwise surfaces the focused element
descriptor so the agent can recover.

The post-click no-op probe only counted DOM mutations, so clicking into
an input fired a false "no DOM change" warning — :focus is a CSS state
that updates `document.activeElement` without mutating the tree. Extend
the probe to also capture activeElement, the scroll offsets (x and y),
and Selection state at arm time and diff them at read time. Trivial
body/null focus transitions are filtered so initial-load focus shifts
are not misreported as a real click effect (the opposite false signal).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/background/index.ts             |   8 +
 extension/src/commands/pixel-actions.ts       | 224 +++++++++++++++---
 extension/src/types.ts                        |   5 +
 .../agent/prompts/big_model/keyboard_tool.j2  |   3 +-
 .../prompts/small_model/keyboard_tool.j2      |   2 +-
 server/agent/tools/browser_executor.py        |  42 ++--
 server/models/commands.py                     |  14 ++
 7 files changed, 248 insertions(+), 50 deletions(-)

diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 632504c..24e5986 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -31,6 +31,7 @@ import {
   performMouseScroll,
   performKeyboardType,
   performKeyboardPress,
+  performKeyboardClear,
   performResetMouse,
   performSelectOption,
   performUploadFilePending,
@@ -1272,6 +1273,7 @@ function isHeavyBrowserCommand(data: any): boolean {
     case 'mouse_scroll':
     case 'keyboard_type':
     case 'keyboard_press':
+    case 'keyboard_clear':
     case 'reset_mouse':
       return true;
     case 'tab':
@@ -1900,6 +1902,7 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
       case 'mouse_scroll':
       case 'keyboard_type':
       case 'keyboard_press':
+      case 'keyboard_clear':
       case 'reset_mouse':
       case 'select_option':
       case 'upload_file_pending': {
@@ -1999,6 +2002,11 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
               actionDetail = r;
               break;
             }
+            case 'keyboard_clear': {
+              const r = await performKeyboardClear(activeTabId, conversationId);
+              actionDetail = r;
+              break;
+            }
             case 'reset_mouse': {
               const r = await performResetMouse(activeTabId, conversationId);
               actionDetail = r;
diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index a71beed..8a4c906 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -109,14 +109,23 @@ function clampToViewport(
 }
 
 /**
- * No-op click detection via MutationObserver.
+ * No-op click detection.
  *
- * A snapshot/diff probe (URL, title, active-element, body-text length,
- * dialog count) misses common interactions that only flip a CSS class
- * or toggle an aria attribute — e.g. tapping a like-heart on Xiaohongshu
- * changes the SVG color and `aria-pressed` but no body text, no URL.
- * Counting DOM mutations during the click window catches those: any
- * non-cursor-sprite mutation means the click did *something*.
+ * A snapshot/diff probe (URL, title, body-text length, dialog count)
+ * misses common interactions that only flip a CSS class or toggle an
+ * aria attribute — e.g. tapping a like-heart on Xiaohongshu changes the
+ * SVG color and `aria-pressed` but no body text, no URL. So a
+ * MutationObserver counts DOM mutations during the click window.
+ *
+ * But pure-focus / scroll / selection changes are NOT DOM mutations:
+ *   - clicking into an `<input>` only flips `:focus` (a CSS state) and
+ *     updates `document.activeElement`, neither of which mutates the
+ *     tree
+ *   - clicking an anchor link can scroll the document without mutating
+ *   - clicking inside text moves the caret / selection
+ * The probe also captures activeElement / scroll offsets / selection at
+ * arm time and compares at read time so any of those changes counts as
+ * "the click did something."
  *
  * Mutations on the agent's injected cursor sprite are filtered out so
  * `refreshCursor` running between the click and the read does not
@@ -154,6 +163,28 @@ const ARM_MUTATION_OBSERVER_SCRIPT = `
         characterData: true,
       });
       w.__ob_click_obs__ = obs;
+      // Capture before-state for focus / scroll / selection so the read
+      // can detect changes that don't trip the MutationObserver.
+      w.__ob_click_active_before__ = document.activeElement;
+      const se = document.scrollingElement || document.documentElement;
+      w.__ob_click_scroll_before__ = {
+        x: (se && se.scrollLeft) || w.scrollX || 0,
+        y: (se && se.scrollTop) || w.scrollY || 0,
+      };
+      try {
+        const sel = w.getSelection && w.getSelection();
+        w.__ob_click_selection_before__ = sel
+          ? {
+              anchorNode: sel.anchorNode,
+              anchorOffset: sel.anchorOffset,
+              focusNode: sel.focusNode,
+              focusOffset: sel.focusOffset,
+              isCollapsed: sel.isCollapsed,
+            }
+          : null;
+      } catch (_) {
+        w.__ob_click_selection_before__ = null;
+      }
       return { armed: true };
     } catch (e) {
       return { armed: false, error: String(e) };
@@ -168,10 +199,46 @@ const READ_MUTATION_OBSERVER_SCRIPT = `
       const obs = w.__ob_click_obs__;
       if (!obs) return { mutations: -1 };
       try { obs.disconnect(); } catch (_) {}
-      const c = w.__ob_click_mutations__ || 0;
+      const mutations = w.__ob_click_mutations__ || 0;
+      const beforeActive = w.__ob_click_active_before__ || null;
+      const beforeScroll = w.__ob_click_scroll_before__ || { x: 0, y: 0 };
+      const beforeSel = w.__ob_click_selection_before__ || null;
+      const afterActive = document.activeElement;
+      // Treat <body>/null transitions as a no-op (they fire spuriously
+      // during page lifecycle); a real focus change is between two
+      // distinct elements where at least one isn't body/html/null.
+      const trivial = (n) => !n || n === document.body || n === document.documentElement;
+      const activeChanged = afterActive !== beforeActive
+        && !(trivial(beforeActive) && trivial(afterActive));
+      const se = document.scrollingElement || document.documentElement;
+      const ax = (se && se.scrollLeft) || w.scrollX || 0;
+      const ay = (se && se.scrollTop) || w.scrollY || 0;
+      const scrollChanged = ax !== beforeScroll.x || ay !== beforeScroll.y;
+      let selectionChanged = false;
+      try {
+        const sel = w.getSelection && w.getSelection();
+        if (sel && beforeSel) {
+          selectionChanged =
+            sel.anchorNode !== beforeSel.anchorNode ||
+            sel.anchorOffset !== beforeSel.anchorOffset ||
+            sel.focusNode !== beforeSel.focusNode ||
+            sel.focusOffset !== beforeSel.focusOffset ||
+            sel.isCollapsed !== beforeSel.isCollapsed;
+        } else if (sel && !beforeSel) {
+          selectionChanged = !sel.isCollapsed || !!sel.anchorNode;
+        }
+      } catch (_) {}
       delete w.__ob_click_obs__;
       delete w.__ob_click_mutations__;
-      return { mutations: c };
+      delete w.__ob_click_active_before__;
+      delete w.__ob_click_scroll_before__;
+      delete w.__ob_click_selection_before__;
+      return {
+        mutations,
+        active_changed: activeChanged,
+        scroll_changed: scrollChanged,
+        selection_changed: selectionChanged,
+      };
     } catch (e) {
       return { mutations: -1, error: String(e) };
     }
@@ -198,10 +265,17 @@ async function armClickMutationObserver(cdp: CdpCommander): Promise<boolean> {
   }
 }
 
-async function readClickMutationCount(cdp: CdpCommander): Promise<number> {
+interface ClickEffectsProbe {
+  mutations: number;
+  active_changed?: boolean;
+  scroll_changed?: boolean;
+  selection_changed?: boolean;
+}
+
+async function readClickEffects(cdp: CdpCommander): Promise<ClickEffectsProbe> {
   try {
     const probe = await cdp.sendCommand<{
-      result?: { value?: { mutations?: number } };
+      result?: { value?: ClickEffectsProbe };
     }>(
       'Runtime.evaluate',
       {
@@ -211,11 +285,14 @@ async function readClickMutationCount(cdp: CdpCommander): Promise<number> {
       4000,
       0,
     );
-    const n = probe?.result?.value?.mutations;
-    return typeof n === 'number' ? n : -1;
+    const v = probe?.result?.value;
+    if (!v || typeof v.mutations !== 'number') {
+      return { mutations: -1 };
+    }
+    return v;
   } catch (e) {
     console.warn('[PixelActions] read mutation observer failed', e);
-    return -1;
+    return { mutations: -1 };
   }
 }
 
@@ -542,13 +619,22 @@ export async function performMouseClick(
   await new Promise((r) => setTimeout(r, 250));
   let triggered: boolean | undefined;
   if (observerArmed) {
-    const mutations = await readClickMutationCount(cdp);
-    if (mutations >= 0) {
-      // 0 mutations = click did nothing the page reacted to. Anything
-      // positive means the page changed something — class toggle, aria
-      // flip, child append, navigation start. Read failure (-1) leaves
+    const effects = await readClickEffects(cdp);
+    if (effects.mutations >= 0) {
+      // The click "did something" if any of these signals fired:
+      //   - DOM mutations (class/attribute/child/text changes)
+      //   - active element changed (focus moved into an input/button)
+      //   - page scrolled (anchor link, scroll-into-view handler)
+      //   - selection / caret moved (clicked into editable text)
+      // Pure-focus clicks on inputs flip `:focus` (a CSS state) and
+      // update activeElement without mutating the tree, so we'd false-
+      // alarm without the activeElement check. Read failure (-1) leaves
       // `triggered` undefined so the agent gets no warning.
-      triggered = mutations > 0;
+      triggered =
+        effects.mutations > 0 ||
+        effects.active_changed === true ||
+        effects.scroll_changed === true ||
+        effects.selection_changed === true;
     }
   }
 
@@ -952,19 +1038,99 @@ export async function performKeyboardPress(
 }
 
 /**
- * Clear the currently focused input by selecting all then deleting.
- * Convenience wrapper so the agent doesn't have to chain Ctrl+A →
- * Backspace as two separate `press` calls.
+ * Clear the currently focused input. JS-based: targets
+ * `document.activeElement`, sets value/textContent empty, dispatches
+ * `input` + `change` so framework-controlled widgets (React, Lit, Gmail
+ * search, etc.) actually observe the reset. Returns `cleared: true` only
+ * when the field ended up empty; otherwise reports `cleared: false` with
+ * a reason so the caller can surface a real failure to the agent.
+ *
+ * Avoids the keyboard-shortcut path (Ctrl+A + Backspace), which leaves
+ * the field uncleared on macOS — Cmd is the select-all modifier there,
+ * CDP `Input.dispatchKeyEvent` with Control doesn't trigger Chromium's
+ * select-all binding, and the Backspace then deletes one character.
  */
 export async function performKeyboardClear(
   tabId: number,
   conversationId: string,
-): Promise<{ cleared: true }> {
-  // Select all (Ctrl+A — works on macOS in browser inputs too).
-  await performKeyboardPress(tabId, conversationId, 'a', ['Control']);
-  await sleep(20);
-  await performKeyboardPress(tabId, conversationId, 'Backspace', undefined);
-  return { cleared: true };
+): Promise<{
+  cleared: boolean;
+  target?: string;
+  reason?: string;
+}> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const expr = `(() => {
+    const el = document.activeElement;
+    if (!el || el === document.body) {
+      return { cleared: false, reason: 'no element focused' };
+    }
+    const tag = (el.tagName || '').toLowerCase();
+    const describe = () => {
+      const id = el.id ? '#' + el.id : '';
+      const name = el.getAttribute && el.getAttribute('name')
+        ? '[name=' + el.getAttribute('name') + ']' : '';
+      const role = el.getAttribute && el.getAttribute('role')
+        ? '[role=' + el.getAttribute('role') + ']' : '';
+      return tag + id + name + role;
+    };
+    const fire = (target) => {
+      try {
+        target.dispatchEvent(new Event('input', { bubbles: true }));
+        target.dispatchEvent(new Event('change', { bubbles: true }));
+      } catch (_) {}
+    };
+    // <input> / <textarea>
+    if (tag === 'input' || tag === 'textarea') {
+      const proto = tag === 'input'
+        ? window.HTMLInputElement && window.HTMLInputElement.prototype
+        : window.HTMLTextAreaElement && window.HTMLTextAreaElement.prototype;
+      const desc = proto && Object.getOwnPropertyDescriptor(proto, 'value');
+      if (desc && desc.set) {
+        desc.set.call(el, '');
+      } else {
+        el.value = '';
+      }
+      fire(el);
+      return { cleared: el.value === '', target: describe() };
+    }
+    // contenteditable (Gmail search, rich editors)
+    if (el.isContentEditable) {
+      el.textContent = '';
+      fire(el);
+      return { cleared: (el.textContent || '') === '', target: describe() };
+    }
+    // role=textbox / role=searchbox / role=combobox custom widgets
+    const role = el.getAttribute && el.getAttribute('role');
+    if (role === 'textbox' || role === 'searchbox' || role === 'combobox') {
+      // Try value first, then textContent.
+      let cleared = false;
+      if ('value' in el) {
+        try { el.value = ''; cleared = el.value === ''; } catch (_) {}
+      }
+      if (!cleared) {
+        el.textContent = '';
+        cleared = (el.textContent || '') === '';
+      }
+      fire(el);
+      return { cleared, target: describe() };
+    }
+    return { cleared: false, reason: 'focused element is not editable: ' + describe() };
+  })()`;
+  const resp = await cdp.sendCommand<{
+    result?: { value?: { cleared?: boolean; target?: string; reason?: string } };
+  }>(
+    'Runtime.evaluate',
+    { expression: expr, returnByValue: true },
+    8000,
+    0,
+  );
+  const value = resp?.result?.value || {};
+  return {
+    cleared: !!value.cleared,
+    target: value.target,
+    reason: value.reason,
+  };
 }
 
 export async function performResetMouse(
diff --git a/extension/src/types.ts b/extension/src/types.ts
index 456eed8..edb656d 100644
--- a/extension/src/types.ts
+++ b/extension/src/types.ts
@@ -77,6 +77,10 @@ export interface KeyboardPressCommand extends BaseCommand {
   modifiers?: string[];
 }
 
+export interface KeyboardClearCommand extends BaseCommand {
+  type: 'keyboard_clear'; // discriminator; this command declares no fields of its own
+}
+
 export interface SelectOptionCommand extends BaseCommand {
   type: 'select_option';
   values: string[];
@@ -366,6 +370,7 @@ export type Command =
   | ResetMouseCommand
   | KeyboardTypeCommand
   | KeyboardPressCommand
+  | KeyboardClearCommand
   | SelectOptionCommand
   | UploadFilePendingCommand
   | ScreenshotCommand
diff --git a/server/agent/prompts/big_model/keyboard_tool.j2 b/server/agent/prompts/big_model/keyboard_tool.j2
index 3a88ecd..16b76c0 100644
--- a/server/agent/prompts/big_model/keyboard_tool.j2
+++ b/server/agent/prompts/big_model/keyboard_tool.j2
@@ -36,7 +36,7 @@ Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`, `ArrowD
 Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS).
 
 ### clear
-Select-all + delete the contents of the currently focused field. Use this to overwrite a field that already has text in it.
+Empty the currently focused field. Works on `<input>`, `<textarea>`, and contenteditable widgets (Gmail search, rich editors). The result reports whether the field actually ended up empty — if it didn't, click into the field first and try again.
 
 ```json
 { "action": "clear" }
@@ -48,7 +48,6 @@ Select-all + delete the contents of the currently focused field. Use this to ove
 - **Replace existing text**: `mouse move` to the field → `mouse click` → `keyboard clear` → `keyboard type`.
 - **Submit a form / trigger search**: `keyboard press` `key: "Enter"` after typing.
 - **Erase a single character**: `keyboard press` `key: "Backspace"`.
-- **Select all in a field**: `keyboard press` `key: "a"` with `modifiers: ["Control"]` (or `"Meta"` on macOS).
 - **Tab to next field**: `keyboard press` `key: "Tab"`.
 
 ## Notes
diff --git a/server/agent/prompts/small_model/keyboard_tool.j2 b/server/agent/prompts/small_model/keyboard_tool.j2
index 863654a..ea2484d 100644
--- a/server/agent/prompts/small_model/keyboard_tool.j2
+++ b/server/agent/prompts/small_model/keyboard_tool.j2
@@ -28,7 +28,7 @@ Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`/`Down`/`
 Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS).
 
 ### clear
-Select-all + delete the contents of the focused field. Use before overwriting a field that already has text.
+Empty the focused field. Works on `<input>`, `<textarea>`, and contenteditable widgets. The result tells you whether the field actually ended up empty — if not, click into the field first and try again.
 ```json
 { "action": "clear" }
 ```
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 1321f14..35a6b65 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -48,6 +48,7 @@
     MouseScrollCommand,
     KeyboardTypeCommand,
     KeyboardPressCommand,
+    KeyboardClearCommand,
     ResetMouseCommand,
     SelectOptionCommand,
     UploadFilePendingCommand,
@@ -1894,26 +1895,31 @@ def _execute_keyboard_action(
                 )
 
             if kind == "clear":
-                # Clear == select-all then Backspace. Two press commands
-                # so each fires its own event sequence on the focused
-                # element. Done at the wire level via two
-                # KeyboardPressCommands so behavior matches what the
-                # agent would have manually scripted.
-                first = KeyboardPressCommand(
-                    key="a",
-                    modifiers=["Control"],
+                # JS-based clear on document.activeElement: set value /
+                # textContent to empty and dispatch input + change events.
+                # Reports SUCCESS only when the focused element actually
+                # ended up empty. The previous select-all + Backspace path
+                # used Ctrl+A which silently no-ops on macOS where Cmd is
+                # the select-all modifier, leaving stale text in the field.
+                command = KeyboardClearCommand(
                     conversation_id=self.conversation_id,
                 )
-                self._execute_command_sync(first)
-                second = KeyboardPressCommand(
-                    key="Backspace",
-                    modifiers=[],
-                    conversation_id=self.conversation_id,
-                )
-                result_dict = self._execute_command_sync(second)
-                return self._build_observation_from_result(
-                    result_dict, "Cleared focused field (select-all + Backspace)"
-                )
+                result_dict = self._execute_command_sync(command)
+                detail = (result_dict or {}).get("data", {}) or {}
+                cleared = bool(detail.get("cleared"))
+                target = detail.get("target") or "focused field"
+                if cleared:
+                    msg = f"Cleared {target}"
+                else:
+                    reason = detail.get("reason") or "no editable element focused"
+                    msg = (
+                        f"Clear had no effect ({reason}). "
+                        "Click into the field first, then clear."
+                    )
+                obs = self._build_observation_from_result(result_dict, msg)
+                if not cleared:
+                    obs.success = False
+                return obs
 
             raise ValueError(f"Unknown keyboard action: {kind}")
         except Exception as e:
diff --git a/server/models/commands.py b/server/models/commands.py
index b0d93ac..d2bb911 100644
--- a/server/models/commands.py
+++ b/server/models/commands.py
@@ -178,6 +178,18 @@ class KeyboardPressCommand(BaseCommand):
     )
 
 
+class KeyboardClearCommand(BaseCommand):
+    """Clear the focused input, textarea, contenteditable, or textbox-role widget.
+
+    Implemented in the extension as a JS-based reset on `document.activeElement`
+    (set value/textContent empty + dispatch input/change), not a keyboard
+    shortcut — Ctrl+A select-all is unreliable on macOS where Cmd is the
+    select-all modifier, so the keyboard path leaves stale text behind.
+    """
+
+    type: Literal["keyboard_clear"] = "keyboard_clear"
+
+
 class SelectOptionCommand(BaseCommand):
     """Choose option(s) on a `<select>` previously focused by `mouse_click`.
 
@@ -805,6 +817,7 @@ class TabsResponse(CommandResponse):
     ResetMouseCommand,
     KeyboardTypeCommand,
     KeyboardPressCommand,
+    KeyboardClearCommand,
     SelectOptionCommand,
     UploadFilePendingCommand,
     ScreenshotCommand,
@@ -848,6 +861,7 @@ def parse_command(data: dict) -> Command:
         "reset_mouse": ResetMouseCommand,
         "keyboard_type": KeyboardTypeCommand,
         "keyboard_press": KeyboardPressCommand,
+        "keyboard_clear": KeyboardClearCommand,
         "select_option": SelectOptionCommand,
         "upload_file_pending": UploadFilePendingCommand,
         "screenshot": ScreenshotCommand,

From 4ce0f4156fa6e7282acd345986df39fdfe827852 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 7 May 2026 09:27:00 +0800
Subject: [PATCH 12/14] refactor(mouse): coordinate as [x,y] array; default
 action move; render only set fields
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous prompt taught coordinates with `{x, y}` set notation and
`(x, y)` tuple notation. Plus-tier models read those literally and
emitted `{"x": [519, 669], "y": [519, 669]}` arrays — 110 events across
50 conversations on the last full-eval, the biggest single error
cluster. Replace x/y/x2/y2 scalar fields with `coordinate`,
`start_coordinate`, `end_coordinate` arrays so the agent's natural
representation matches the schema instead of fighting it.

Make `action` default to "move" so a bare `{"coordinate":[x,y]}` works
when flash models forget the action key (14 events, 100% flash on the
prior eval). Tighten the prompt examples so every snippet is a complete
JSON envelope `{ "action": "...", "coordinate": [...] }` with no
ambient prose like `mouse click {x, y}` that small models picked up as
a tool-name pattern (`Tool 'click' not found` — 65 events, 100% flash).
Document right-click and double-click explicitly so agents stop
inventing `"action": "right_click"` / `"double_click"` literals.

Override OpenBrowserAction.visualize so persisted ActionEvent text only
contains fields the agent actually set (model_fields_set + action).
Stops `text:null, key:null, modifiers:[], kind:KeyboardAction` and
`start_coordinate:null, end_coordinate:null, button:left, count:1,
direction:down, amount:300, steps:10` from polluting the rendered
arguments shown to humans, the compiler agent, and any condense path.

Mirrors mouse-tool prompt edits in the agent-sdk system prompts
(system_prompt_{large,small}.j2) and the venv copy.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/agent/prompts/big_model/mouse_tool.j2  |  53 +++++-----
 .../agent/prompts/small_model/mouse_tool.j2   |  47 ++++----
 server/agent/tools/base.py                    |  29 +++++
 server/agent/tools/browser_executor.py        |  51 ++++-----
 server/agent/tools/mouse_tool.py              | 100 ++++++++++++------
 5 files changed, 174 insertions(+), 106 deletions(-)

diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2
index 7905449..6c496ee 100644
--- a/server/agent/prompts/big_model/mouse_tool.j2
+++ b/server/agent/prompts/big_model/mouse_tool.j2
@@ -1,54 +1,56 @@
 # Mouse Tool
 
-Drive a virtual mouse cursor: move, click, drag, scroll.
+Drive a virtual mouse cursor: move, click, drag, scroll. Every action is a single JSON object passed as the `mouse` tool's arguments.
 
 ## Coordinates
 
-`(x, y)` and `(x2, y2)` are integers in **[0, 1000]** normalized space:
+Coordinates are 2-element arrays `[x, y]` where `x` and `y` are integers in **[0, 1000]** normalized space:
 
-- `(0, 0)` = top-left of the viewport.
-- `(1000, 1000)` = bottom-right.
+- `[0, 0]` = top-left of the viewport.
+- `[1000, 1000]` = bottom-right.
 
-Estimate from the screenshot. Aim for the visual center of your target. The system rescales to real pixels.
+Estimate from the screenshot. Aim for the visual center of your target. The system rescales `[0, 1000]` to real pixels.
 
 ## The Cursor
 
 A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot.
 
-`click {x, y}` moves the cursor to `(x, y)` and clicks there in one step. `click` with no coordinates clicks at the cursor's current position — use that after a `move` when you want a hover effect to fire first.
+A `click` with `coordinate` moves the cursor there and clicks in one step. A `click` without `coordinate` clicks at the cursor's current position — use that after a `move` when you want a hover effect to fire first.
 
 ## Actions
 
+`action` defaults to `"move"` when omitted, so `{ "coordinate": [500, 320] }` slides the cursor to that point.
+
 ### move
-Slide the cursor to a point. The cursor traces an eased path so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way.
+Slide the cursor to `coordinate`. The cursor traces an eased path so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way.
 
 ```json
-{ "action": "move", "x": 500, "y": 320 }
+{ "action": "move", "coordinate": [500, 320] }
 ```
 
 ### click
 Click on the page.
 
 ```json
-{ "action": "click", "x": 612, "y": 318 }
+{ "action": "click", "coordinate": [612, 318] }
 { "action": "click" }
-{ "action": "click", "x": 612, "y": 318, "count": 2 }
-{ "action": "click", "button": "right" }
+{ "action": "click", "coordinate": [612, 318], "count": 2 }
+{ "action": "click", "coordinate": [612, 318], "button": "right" }
 ```
 
-- With `x, y`: cursor moves to `(x, y)` and clicks there. Default form for "click this thing".
-- Without `x, y`: clicks at the cursor's current position. Use after a `move` for hover-then-click flows, or after a gated preview when re-aiming would be wrong.
-- `button`: `"left"` (default), `"right"`, `"middle"`.
+- With `coordinate`: cursor moves there and clicks. Default form for "click this thing".
+- Without `coordinate`: clicks at the cursor's current position. Use after a `move` for hover-then-click flows, or after a gated preview when re-aiming would be wrong.
+- `button`: `"left"` (default), `"right"` (context menu), `"middle"`.
 - `count`: `1` (default), `2` for double-click, `3` for triple-click (text selection).
 
 ### drag
-Press at `(x, y)`, drag to `(x2, y2)`, release. One call.
+Press at `start_coordinate`, drag to `end_coordinate`, release. One call.
 
 ```json
-{ "action": "drag", "x": 200, "y": 400, "x2": 800, "y2": 400 }
+{ "action": "drag", "start_coordinate": [200, 400], "end_coordinate": [800, 400] }
 ```
 
-Use for sliders, kanban moves, marquee selection, drag-and-drop. `steps` (optional, default 10) controls the smoothness for DnD libraries that need many intermediate move events.
+Use for sliders, kanban moves, marquee selection, drag-and-drop. `steps` (optional, default 10) controls smoothness for DnD libraries that need many intermediate move events.
 
 ### confirm
 Commit a pending click or drag that was previewed in the previous response.
@@ -60,7 +62,7 @@ Commit a pending click or drag that was previewed in the previous response.
 Only valid right after a preview-style observation (zoomed crop with a yellow box or red crosshair). See **Confirmation previews** below.
 
 ### scroll
-Scroll at the cursor's current position by `amount` CSS pixels in `direction`.
+Scroll at the cursor's current position by `amount` CSS pixels in `direction`. `amount` is always positive — `direction` carries the sign.
 
 ```json
 { "action": "scroll", "direction": "down", "amount": 600 }
@@ -83,17 +85,18 @@ When `click` or `drag` lands in an area with several interactable controls close
 - A **yellow** outline marks the element the click would commit on.
 - **Orange dashed** outlines mark nearby candidates. The message lists each candidate's HTML and center coordinates in `[0, 1000]` space.
 
-Check the yellow-highlighted element. If it matches what you wanted to click (or drag), reply `{"action": "confirm"}` to commit. If it does not, re-emit `click` (or `drag`) with one of the listed candidate centers as `x, y`.
+Check the yellow-highlighted element. If it matches what you wanted to click (or drag), reply `{ "action": "confirm" }` to commit. If it does not, re-emit `click` (or `drag`) with one of the listed candidate centers as the `coordinate`.
 
-For a drag preview, the same rules apply at each endpoint. `confirm` commits the drag as previewed; otherwise re-emit `drag` with corrected `x, y, x2, y2`.
+For a drag preview, the same rules apply at each endpoint. `confirm` commits the drag as previewed; otherwise re-emit `drag` with corrected `start_coordinate` and `end_coordinate`.
 
 ## Patterns
 
-- **Click a button**: `click` with the button's center coordinates.
-- **Hover-reveal menu**: `move` over the trigger; the next screenshot shows the menu open. Then `click` (with or without coords) on the revealed item.
-- **Scroll to find something**: `scroll` direction `down`, then check the new screenshot. Repeat as needed.
-- **Drag a slider**: one `drag` from the handle's current position to the target position.
-- **Right-click for context menu**: `click` with the target's coords and `button: "right"`.
+- **Click a button**: `{ "action": "click", "coordinate": [x, y] }` with the button's center.
+- **Hover-reveal menu**: `{ "action": "move", "coordinate": [x, y] }` over the trigger; the next screenshot shows the menu open. Then `{ "action": "click", "coordinate": [x, y] }` with the revealed item's center. A bare `{ "action": "click" }` clicks wherever the cursor already sits (the trigger), not the item.
+- **Scroll to find something**: `{ "action": "scroll", "direction": "down", "amount": 600 }`, then check the new screenshot. Repeat as needed.
+- **Drag a slider**: one `drag` from the handle's current position to the target.
+- **Right-click for context menu**: `{ "action": "click", "coordinate": [x, y], "button": "right" }`.
+- **Double-click**: `{ "action": "click", "coordinate": [x, y], "count": 2 }`.
 
 ## Notes
 
diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2
index a88ef44..35f8fa1 100644
--- a/server/agent/prompts/small_model/mouse_tool.j2
+++ b/server/agent/prompts/small_model/mouse_tool.j2
@@ -1,48 +1,51 @@
 # Mouse Tool
 
-Move, click, drag, and scroll a virtual mouse cursor.
+Move, click, drag, and scroll a virtual mouse cursor. Every call is a single JSON object passed as the `mouse` tool's arguments.
 
 ## Coordinates
 
-`(x, y)` and `(x2, y2)` are integers in **[0, 1000]**: `(0, 0)` is viewport top-left, `(1000, 1000)` is bottom-right. Estimate from the screenshot.
+Coordinates are 2-element arrays `[x, y]` of integers in **[0, 1000]**: `[0, 0]` is viewport top-left, `[1000, 1000]` is bottom-right. Estimate from the screenshot.
 
 ## Cursor
 
 A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot.
 
-`click {x, y}` moves the cursor to `(x, y)` and clicks in one step. `click` with no coords clicks at the cursor's current position — use that after a `move` when you want a hover effect to fire first.
+A `click` with `coordinate` moves the cursor there and clicks in one step. A `click` without `coordinate` clicks at the cursor's current position — use that after a `move` when you want a hover effect to fire first.
 
 ## Actions
 
+`action` defaults to `"move"`, so `{ "coordinate": [500, 320] }` slides the cursor to that point.
+
 ### move
-Slide the cursor to `(x, y)`.
+Slide the cursor to `coordinate`.
 ```json
-{ "action": "move", "x": 500, "y": 320 }
+{ "action": "move", "coordinate": [500, 320] }
 ```
 
 ### click
 Click on the page.
 ```json
-{ "action": "click", "x": 612, "y": 318 }
+{ "action": "click", "coordinate": [612, 318] }
 { "action": "click" }
-{ "action": "click", "x": 612, "y": 318, "count": 2 }
-{ "action": "click", "button": "right" }
+{ "action": "click", "coordinate": [612, 318], "count": 2 }
+{ "action": "click", "coordinate": [612, 318], "button": "right" }
 ```
-- With `x, y`: cursor moves to `(x, y)` and clicks. Default form.
-- Without `x, y`: clicks at the cursor's current position. Use after a `move` for hover-then-click flows.
-- `button`: `"left"` (default), `"right"`, `"middle"`.
-- `count`: 1 (default), 2 for double-click.
+- With `coordinate`: cursor moves there and clicks. Default form.
+- Without `coordinate`: clicks at the cursor's current position. Use after a `move` for hover-then-click flows.
+- `button`: `"left"` (default), `"right"` (context menu), `"middle"`.
+- `count`: 1 (default), 2 for double-click, 3 for triple-click.
 
 ### drag
-Press at `(x, y)`, drag to `(x2, y2)`, release.
+Press at `start_coordinate`, drag to `end_coordinate`, release.
 ```json
-{ "action": "drag", "x": 200, "y": 400, "x2": 800, "y2": 400 }
+{ "action": "drag", "start_coordinate": [200, 400], "end_coordinate": [800, 400] }
 ```
 
 ### scroll
-Scroll at the cursor by `amount` CSS pixels. The wheel event lands on whatever container is under the cursor — to scroll inside a panel, sidebar, or modal, `move` over it first.
+Scroll at the cursor by `amount` CSS pixels. `amount` is always positive — `direction` carries the sign. To scroll inside a panel, sidebar, or modal, `move` over it first so the wheel event lands there.
 ```json
 { "action": "scroll", "direction": "down", "amount": 600 }
+{ "action": "scroll", "direction": "up", "amount": 300 }
 ```
 `direction`: `"down"`, `"up"`, `"left"`, `"right"`.
 
@@ -66,12 +69,14 @@ If `click` or `drag` falls in a crowded area, the next observation is a zoomed c
 - A **yellow** outline marks the element the click would commit on.
 - **Orange dashed** outlines mark nearby candidates, listed in the message with HTML and center coordinates in `[0, 1000]` space.
 
-Check the yellow-highlighted element. If it matches your intent, reply `{"action": "confirm"}` to commit. Otherwise, re-emit `click` (or `drag`) with one of the listed candidate centers as `x, y`.
+Check the yellow-highlighted element. If it matches your intent, reply `{ "action": "confirm" }` to commit. Otherwise, re-emit `click` with one of the listed candidate centers as the `coordinate` (for `drag`, use it as the corrected `start_coordinate` or `end_coordinate`).
 
 ## Patterns
 
-- **Click a button**: `click` with the button's center coordinates.
-- **Hover**: `move` over the trigger; next screenshot shows the result, then `click` to commit.
-- **Scroll to find**: `scroll` then check the new screenshot.
-- **Drag**: one `drag` with start and end coordinates.
-- **Type into a field**: `click {x, y}` on the field to focus it → `keyboard type` the text.
+- **Click a button**: `{ "action": "click", "coordinate": [x, y] }`.
+- **Hover**: `{ "action": "move", "coordinate": [x, y] }` over the trigger; the next screenshot shows the result, then `{ "action": "click" }` to commit.
+- **Scroll to find**: `{ "action": "scroll", "direction": "down", "amount": 600 }`, then check the new screenshot.
+- **Drag**: one `drag` with `start_coordinate` and `end_coordinate`.
+- **Right-click**: `{ "action": "click", "coordinate": [x, y], "button": "right" }`.
+- **Double-click**: `{ "action": "click", "coordinate": [x, y], "count": 2 }`.
+- **Type into a field**: `{ "action": "click", "coordinate": [x, y] }` to focus → `keyboard type` the text.
diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py
index 514ef90..9c16237 100644
--- a/server/agent/tools/base.py
+++ b/server/agent/tools/base.py
@@ -9,8 +9,10 @@
 from typing import Any, Dict, List, Optional
 
 from openhands.sdk import Action, ImageContent, Observation, TextContent
+from openhands.sdk.utils.visualize import display_dict
 from pydantic import Field
 from pydantic.json_schema import SkipJsonSchema
+from rich.text import Text
 
 
 def _format_display_id(el: Dict[str, Any]) -> str:
@@ -204,6 +206,33 @@ class OpenBrowserAction(Action):
         exclude=True,
     )
 
+    @property
+    def visualize(self) -> Text:
+        """Render the action, listing only fields that were explicitly set.
+
+        The base `Action.visualize` calls `model_dump()` which serializes
+        every field with its default — `text: null`, `key: null`,
+        `start_coordinate: null`, `count: 1`, etc. These pollute the
+        rendered ActionEvent text the SDK persists, condenses, and shows
+        to humans / the compiler agent. Returns a rich `Text` built from
+        `model_fields_set` plus `action` (added whenever the class declares
+        that field, so the verb is visible even when defaulted).
+        """
+        content = Text()
+        content.append("Action: ", style="bold")
+        content.append(self.__class__.__name__)
+        content.append("\n\n")
+        content.append("Arguments:", style="bold")
+        include = set(self.model_fields_set)
+        if "action" in self.__class__.model_fields:
+            include.add("action")
+        rendered = self.model_dump(include=include) if include else {}
+        # `kind` is the discriminator the SDK adds — agent never sees or
+        # sets it, so drop it defensively if it shows up in the dump.
+        rendered.pop("kind", None)
+        content.append(display_dict(rendered))
+        return content
+
 
 class OpenBrowserObservation(Observation):
     """Base observation returned by OpenBrowser tools after each action.
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 35a6b65..2398992 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1695,9 +1695,13 @@ def _execute_mouse_action(
 
         try:
             if kind == "move":
-                if action.x is None or action.y is None:
-                    raise ValueError("mouse move requires x and y")
-                px, py = self._denormalize_xy(action.x, action.y)
+                if not action.coordinate:
+                    raise ValueError(
+                        "mouse move requires `coordinate: [x, y]`"
+                    )
+                px, py = self._denormalize_xy(
+                    action.coordinate[0], action.coordinate[1]
+                )
                 command = MouseMoveCommand(
                     x=px, y=py, conversation_id=self.conversation_id
                 )
@@ -1709,20 +1713,15 @@ def _execute_mouse_action(
                 )
 
             if kind == "click":
-                # `click {x, y}` is move-then-click in one call. With both
-                # coordinates supplied, slide the cursor to (x, y), cache it,
-                # and run the gate against that fresh position. With no
-                # coordinates, click at the cursor's current position
-                # (hover-then-click flows). Mixing — e.g. only x — is
-                # rejected so the agent re-emits cleanly.
-                has_x = action.x is not None
-                has_y = action.y is not None
-                if has_x ^ has_y:
-                    raise ValueError(
-                        "mouse click with explicit coordinates needs both x and y"
+                # `click coordinate=[x, y]` is move-then-click in one call.
+                # With `coordinate` supplied, slide the cursor there, cache
+                # it, and run the gate against that fresh position. Without
+                # `coordinate`, click at the cursor's current position
+                # (hover-then-click flows).
+                if action.coordinate is not None:
+                    px, py = self._denormalize_xy(
+                        action.coordinate[0], action.coordinate[1]
                     )
-                if has_x and has_y:
-                    px, py = self._denormalize_xy(action.x, action.y)
                     move_command = MouseMoveCommand(
                         x=px, y=py, conversation_id=self.conversation_id
                     )
@@ -1773,15 +1772,17 @@ def _execute_mouse_action(
                 return self._build_observation_from_result(result_dict, message)
 
             if kind == "drag":
-                if (
-                    action.x is None
-                    or action.y is None
-                    or action.x2 is None
-                    or action.y2 is None
-                ):
-                    raise ValueError("mouse drag requires x, y, x2, y2")
-                sx, sy = self._denormalize_xy(action.x, action.y)
-                ex, ey = self._denormalize_xy(action.x2, action.y2)
+                if not action.start_coordinate or not action.end_coordinate:
+                    raise ValueError(
+                        "mouse drag requires `start_coordinate: [x, y]` "
+                        "and `end_coordinate: [x, y]`"
+                    )
+                sx, sy = self._denormalize_xy(
+                    action.start_coordinate[0], action.start_coordinate[1]
+                )
+                ex, ey = self._denormalize_xy(
+                    action.end_coordinate[0], action.end_coordinate[1]
+                )
 
                 start_gate = self._gate_pixel_target(sx, sy)
                 end_gate = self._gate_pixel_target(ex, ey)
diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py
index b3a5f60..a850727 100644
--- a/server/agent/tools/mouse_tool.py
+++ b/server/agent/tools/mouse_tool.py
@@ -8,14 +8,14 @@
 """
 
 from collections.abc import Sequence
-from typing import Literal, Optional
+from typing import List, Literal, Optional
 
 from openhands.sdk.tool import (
     ToolDefinition,
     ToolAnnotations,
     register_tool,
 )
-from pydantic import Field
+from pydantic import Field, field_validator
 
 from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation
 from server.agent.tools.prompt_context import get_prompt_render_context
@@ -36,65 +36,95 @@ def get_mouse_tool_description(conv_state=None) -> str:
 ]
 
 
+def _validate_coordinate_pair(v: Optional[List[int]]) -> Optional[List[int]]:
+    """Normalize a coordinate field to `[x, y]`, each an int in [0, 1000].
+
+    None (field omitted) passes through. Non-int items are coerced with
+    `int()`. Any failure raises one ValueError with a short message so the
+    agent can self-correct without parsing pydantic's multi-error output.
+    """
+    if v is None:
+        return v
+    if not isinstance(v, (list, tuple)) or len(v) != 2:
+        raise ValueError(
+            "coordinate must be a 2-element array like [x, y] in [0, 1000] space"
+        )
+    out: List[int] = []
+    for i, n in enumerate(v):
+        if isinstance(n, bool) or not isinstance(n, int):
+            try:
+                n = int(n)
+            except (TypeError, ValueError, OverflowError):  # int(inf) overflows
+                raise ValueError(
+                    f"coordinate[{i}] must be an integer in [0, 1000]"
+                ) from None
+        if n < 0 or n > 1000:
+            raise ValueError(
+                f"coordinate[{i}] = {n} is outside [0, 1000] normalized space"
+            )
+        out.append(n)
+    return out
+
+
 class MouseAction(OpenBrowserAction):
     """Move, click, drag, or scroll the virtual mouse cursor.
 
-    Coordinates `x, y, x2, y2` are integers in the Qwen-VL [0, 1000]
-    normalized space, with `(0, 0)` at the top-left of the viewport and
-    `(1000, 1000)` at the bottom-right.
+    Coordinates are 2-element `[x, y]` arrays in the Qwen-VL [0, 1000]
+    normalized space, with `[0, 0]` at the top-left of the viewport and
+    `[1000, 1000]` at the bottom-right.
     """
 
     action: MouseActionKind = Field(
+        default="move",
         description=(
-            "What to do with the mouse. "
-            "'move' — slide the cursor to (x, y). The cursor traces an eased "
-            "path so hover effects fire naturally along the way. "
-            "'click' — click on the page. Pass `x, y` to move the cursor "
-            "there and click in one step; omit `x, y` to click at the "
-            "cursor's current position (use this after a `move` for a "
+            "What to do with the mouse. Defaults to 'move' when omitted. "
+            "'move' — slide the cursor to `coordinate`. The cursor traces an "
+            "eased path so hover effects fire naturally along the way. "
+            "'click' — click on the page. Pass `coordinate` to move the "
+            "cursor there and click in one step; omit `coordinate` to click "
+            "at the cursor's current position (use this after a 'move' for a "
             "hover-then-click flow). `count: 2` double-clicks, `count: 3` "
-            "triple-clicks. `button: 'right'` opens the context menu. "
-            "'drag' — press at (x, y), drag to (x2, y2), release. "
+            "triple-clicks. `button: \"right\"` opens the context menu. "
+            "'drag' — press at `start_coordinate`, drag to `end_coordinate`, "
+            "release. "
             "'scroll' — scroll at the cursor position by `amount` in "
             "`direction`. "
             "'reset' — return the cursor to the viewport center. "
             "'confirm' — commit a pending click or drag that was previewed "
             "as a zoomed crop in the previous response."
-        )
+        ),
     )
 
-    x: Optional[int] = Field(
+    coordinate: Optional[List[int]] = Field(
         default=None,
         description=(
-            "Target X in [0, 1000] normalized space. For 'move' and 'click' "
-            "this is the destination; for 'drag' this is the start of the "
-            "drag."
+            "Target as `[x, y]` in [0, 1000] normalized space. Used by "
+            "'move' (destination), 'click' (where to click — omit to click "
+            "the cursor's current position), and 'scroll' (cursor position). "
+            "Example: `[405, 157]`."
         ),
-        ge=0,
-        le=1000,
     )
-    y: Optional[int] = Field(
+    start_coordinate: Optional[List[int]] = Field(
         default=None,
         description=(
-            "Target Y in [0, 1000] normalized space. For 'move' and 'click' "
-            "this is the destination; for 'drag' this is the start of the "
-            "drag."
+            "Drag start point as `[x, y]` in [0, 1000] normalized space. "
+            "Required for 'drag'. Example: `[200, 400]`."
         ),
-        ge=0,
-        le=1000,
     )
-    x2: Optional[int] = Field(
+    end_coordinate: Optional[List[int]] = Field(
         default=None,
-        description="Drag end X in [0, 1000]. Required for 'drag'.",
-        ge=0,
-        le=1000,
+        description=(
+            "Drag end point as `[x, y]` in [0, 1000] normalized space. "
+            "Required for 'drag'. Example: `[820, 540]`."
+        ),
     )
-    y2: Optional[int] = Field(
-        default=None,
-        description="Drag end Y in [0, 1000]. Required for 'drag'.",
-        ge=0,
-        le=1000,
+
+    @field_validator(
+        "coordinate", "start_coordinate", "end_coordinate", mode="before"
     )
+    @classmethod
+    def _check_coord(cls, v):
+        return _validate_coordinate_pair(v)
 
     button: Literal["left", "right", "middle"] = Field(
         default="left",

From 228cc86af93aa10cfaa9095f28799f87b1e4f1aa Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 7 May 2026 12:50:03 +0800
Subject: [PATCH 13/14] chore(deps): bump agent-sdk pin to 1ac8fff4
 (coordinate-as-array prompts)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Picks up agent-sdk commit that drops `{x,y}`/`(x,y)` set/tuple notation
from the system prompts in favor of explicit `coordinate: [x, y]` arrays
and drag's `start_coordinate`/`end_coordinate`. Pairs with the in-repo
MouseAction schema rewrite (4ce0f41).

Refresh evaluation_report.json with the post-fix run (20260507_092809):
255 → 36 tool-call validation errors (86% reduction); pattern E
`x:[a,b]` 110 → 1 events (99%). Aggregate pass rate 75.0% → 78.6% vs
main; net +35.7 task-score across the four `*-fast` aliases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 eval/evaluation_report.json | 1930 +++++++++++++++++------------------
 pyproject.toml              |    4 +-
 uv.lock                     |    8 +-
 3 files changed, 971 insertions(+), 971 deletions(-)

diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json
index b472db0..b10faf2 100644
--- a/eval/evaluation_report.json
+++ b/eval/evaluation_report.json
@@ -1,11 +1,11 @@
 {
   "evaluation": {
-    "timestamp": "2026-04-24 13:29:29",
-    "unix_timestamp": 1777008569.139304,
+    "timestamp": "2026-05-07 12:11:55",
+    "unix_timestamp": 1778127115.410769,
     "summary": {
       "total_tests": 140,
-      "passed_tests": 105,
-      "pass_rate": 75.0,
+      "passed_tests": 110,
+      "pass_rate": 78.57,
       "models_tested": [
         "dashscope/qwen3.5-plus",
         "dashscope/qwen3.6-plus",
@@ -15,51 +15,51 @@
     },
     "model_performance": {
       "dashscope/qwen3.5-plus": {
-        "pass_rate": 85.71,
-        "task_score": 274.3,
+        "pass_rate": 88.57,
+        "task_score": 281.9,
         "task_max_score": 304.8,
-        "efficiency_score": 15.4927,
-        "usage_score": 24.5326,
-        "composite_score": 0.743,
-        "avg_duration": 351.23,
-        "avg_cost": 0.454042,
-        "passed_count": 30,
+        "efficiency_score": 21.1601,
+        "usage_score": 27.902,
+        "composite_score": 0.8118,
+        "avg_duration": 252.32,
+        "avg_cost": 0.295084,
+        "passed_count": 31,
         "total_tests": 35
       },
       "dashscope/qwen3.6-plus": {
         "pass_rate": 74.29,
-        "task_score": 251.4,
+        "task_score": 262.4,
         "task_max_score": 304.8,
-        "efficiency_score": 17.2362,
-        "usage_score": 7.5517,
-        "composite_score": 0.5874,
-        "avg_duration": 315.08,
-        "avg_cost": 1.509188,
+        "efficiency_score": 21.6591,
+        "usage_score": 15.4985,
+        "composite_score": 0.658,
+        "avg_duration": 237.55,
+        "avg_cost": 0.933219,
         "passed_count": 26,
         "total_tests": 35
       },
       "dashscope/qwen3.5-flash": {
-        "pass_rate": 60.0,
-        "task_score": 232.2,
+        "pass_rate": 65.71,
+        "task_score": 248.7,
         "task_max_score": 304.8,
-        "efficiency_score": 19.0893,
-        "usage_score": 31.7588,
-        "composite_score": 0.6506,
-        "avg_duration": 286.05,
-        "avg_cost": 0.127535,
-        "passed_count": 21,
+        "efficiency_score": 21.3812,
+        "usage_score": 32.8972,
+        "composite_score": 0.7044,
+        "avg_duration": 257.51,
+        "avg_cost": 0.096369,
+        "passed_count": 23,
         "total_tests": 35
       },
       "dashscope/qwen3.6-flash": {
-        "pass_rate": 80.0,
-        "task_score": 274.5,
+        "pass_rate": 85.71,
+        "task_score": 274.3,
         "task_max_score": 304.8,
-        "efficiency_score": 21.4989,
-        "usage_score": 19.6902,
-        "composite_score": 0.7154,
-        "avg_duration": 235.99,
-        "avg_cost": 0.769821,
-        "passed_count": 28,
+        "efficiency_score": 23.4369,
+        "usage_score": 26.2986,
+        "composite_score": 0.7985,
+        "avg_duration": 207.89,
+        "avg_cost": 0.39813,
+        "passed_count": 30,
         "total_tests": 35
       }
     },
@@ -71,45 +71,45 @@
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.5198,
-            "usage_score": 0.7104,
-            "composite_score": 0.846,
-            "total_score": 7.23,
-            "duration": 144.07,
-            "cost": 0.173763
+            "efficiency_score": 0.7519,
+            "usage_score": 0.8734,
+            "composite_score": 0.9251,
+            "total_score": 7.63,
+            "duration": 74.44,
+            "cost": 0.075952
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.4912,
-            "usage_score": 0,
-            "composite_score": 0.6982,
-            "total_score": 6.49,
-            "duration": 152.64,
-            "cost": 0.74978
+            "efficiency_score": 0.7811,
+            "usage_score": 0.4455,
+            "composite_score": 0.8453,
+            "total_score": 7.23,
+            "duration": 65.67,
+            "cost": 0.332728
           },
           "dashscope/qwen3.5-flash": {
             "passed": false,
             "task_score": 4.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.7368,
-            "usage_score": 0.9585,
-            "composite_score": 0.3391,
-            "total_score": 5.7,
-            "duration": 78.95,
-            "cost": 0.024923
+            "efficiency_score": 0.7869,
+            "usage_score": 0.9621,
+            "composite_score": 0.3498,
+            "total_score": 5.75,
+            "duration": 63.92,
+            "cost": 0.02272
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.7143,
-            "usage_score": 0.6667,
-            "composite_score": 0.8762,
-            "total_score": 7.38,
-            "duration": 85.71,
-            "cost": 0.19998
+            "efficiency_score": 0.8312,
+            "usage_score": 0.8184,
+            "composite_score": 0.9299,
+            "total_score": 7.65,
+            "duration": 50.64,
+            "cost": 0.108959
           }
         }
       },
@@ -118,47 +118,47 @@
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
             "passed": true,
-            "task_score": 10.5,
+            "task_score": 9.0,
             "task_max_score": 10.5,
-            "efficiency_score": 0.3993,
-            "usage_score": 0.7181,
-            "composite_score": 0.8235,
-            "total_score": 11.62,
-            "duration": 324.38,
-            "cost": 0.422857
+            "efficiency_score": 0.5209,
+            "usage_score": 0.7535,
+            "composite_score": 0.8549,
+            "total_score": 10.27,
+            "duration": 258.7,
+            "cost": 0.369752
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 10.5,
             "task_max_score": 10.5,
-            "efficiency_score": 0.2953,
-            "usage_score": 0,
-            "composite_score": 0.6591,
-            "total_score": 10.8,
-            "duration": 380.52,
-            "cost": 1.72029
+            "efficiency_score": 0.5489,
+            "usage_score": 0.2868,
+            "composite_score": 0.7671,
+            "total_score": 11.34,
+            "duration": 243.57,
+            "cost": 1.069812
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 10.5,
+            "task_score": 9.5,
             "task_max_score": 10.5,
-            "efficiency_score": 0.4653,
-            "usage_score": 0.9296,
-            "composite_score": 0.879,
-            "total_score": 11.89,
-            "duration": 288.74,
-            "cost": 0.10561
+            "efficiency_score": 0.7282,
+            "usage_score": 0.9687,
+            "composite_score": 0.9394,
+            "total_score": 11.2,
+            "duration": 146.75,
+            "cost": 0.046948
           },
           "dashscope/qwen3.6-flash": {
-            "passed": true,
-            "task_score": 10.5,
+            "passed": false,
+            "task_score": 6.0,
             "task_max_score": 10.5,
-            "efficiency_score": 0.5915,
-            "usage_score": 0.6543,
-            "composite_score": 0.8492,
-            "total_score": 11.75,
-            "duration": 220.58,
-            "cost": 0.518601
+            "efficiency_score": 0,
+            "usage_score": 0.9832,
+            "composite_score": 0.1966,
+            "total_score": 6.98,
+            "duration": 540.0,
+            "cost": 0.025178
           }
         }
       },
@@ -169,45 +169,45 @@
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7417,
-            "usage_score": 0.8568,
-            "composite_score": 0.9197,
-            "total_score": 4.6,
-            "duration": 77.48,
-            "cost": 0.114521
+            "efficiency_score": 0.8373,
+            "usage_score": 0.9307,
+            "composite_score": 0.9536,
+            "total_score": 4.77,
+            "duration": 48.8,
+            "cost": 0.055405
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.8122,
-            "usage_score": 0.6829,
-            "composite_score": 0.899,
-            "total_score": 4.5,
-            "duration": 56.34,
-            "cost": 0.25366
+            "efficiency_score": 0.8388,
+            "usage_score": 0.7708,
+            "composite_score": 0.9219,
+            "total_score": 4.61,
+            "duration": 48.37,
+            "cost": 0.183396
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.8619,
-            "usage_score": 0.9734,
-            "composite_score": 0.9671,
-            "total_score": 4.84,
-            "duration": 41.42,
-            "cost": 0.02126
+            "efficiency_score": 0.8279,
+            "usage_score": 0.9777,
+            "composite_score": 0.9611,
+            "total_score": 4.81,
+            "duration": 51.64,
+            "cost": 0.017845
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.799,
-            "usage_score": 0.7669,
-            "composite_score": 0.9132,
-            "total_score": 4.57,
-            "duration": 60.31,
-            "cost": 0.186456
+            "efficiency_score": 0.9203,
+            "usage_score": 0.9277,
+            "composite_score": 0.9696,
+            "total_score": 4.85,
+            "duration": 23.91,
+            "cost": 0.057861
           }
         }
       },
@@ -218,45 +218,45 @@
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.5894,
-            "usage_score": 0.7983,
-            "composite_score": 0.8775,
-            "total_score": 10.39,
-            "duration": 287.45,
-            "cost": 0.403474
+            "efficiency_score": 0.7663,
+            "usage_score": 0.8964,
+            "composite_score": 0.9325,
+            "total_score": 10.66,
+            "duration": 163.61,
+            "cost": 0.207102
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.5886,
-            "usage_score": 0.2489,
-            "composite_score": 0.7675,
-            "total_score": 9.84,
-            "duration": 288.0,
-            "cost": 1.502212
+            "efficiency_score": 0.8109,
+            "usage_score": 0.6821,
+            "composite_score": 0.8986,
+            "total_score": 10.49,
+            "duration": 132.4,
+            "cost": 0.635748
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.6878,
-            "usage_score": 0.9516,
-            "composite_score": 0.9279,
-            "total_score": 10.64,
-            "duration": 218.54,
-            "cost": 0.096849
+            "efficiency_score": 0.0434,
+            "usage_score": 0.813,
+            "composite_score": 0.7713,
+            "total_score": 9.86,
+            "duration": 669.63,
+            "cost": 0.373979
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.7502,
-            "usage_score": 0.739,
-            "composite_score": 0.8978,
-            "total_score": 10.49,
-            "duration": 174.89,
-            "cost": 0.521971
+            "efficiency_score": 0.8552,
+            "usage_score": 0.8743,
+            "composite_score": 0.9459,
+            "total_score": 10.73,
+            "duration": 101.33,
+            "cost": 0.251393
           }
         }
       },
@@ -267,45 +267,45 @@
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.7283,
-            "usage_score": 0.8644,
-            "composite_score": 0.9186,
-            "total_score": 4.09,
-            "duration": 108.67,
-            "cost": 0.108459
+            "efficiency_score": 0.8623,
+            "usage_score": 0.9257,
+            "composite_score": 0.9576,
+            "total_score": 4.29,
+            "duration": 55.09,
+            "cost": 0.059466
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.7818,
-            "usage_score": 0.407,
-            "composite_score": 0.8378,
-            "total_score": 3.69,
-            "duration": 87.29,
-            "cost": 0.47437
+            "efficiency_score": 0.8726,
+            "usage_score": 0.6838,
+            "composite_score": 0.9113,
+            "total_score": 4.06,
+            "duration": 50.95,
+            "cost": 0.252924
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.8239,
-            "usage_score": 0.9722,
-            "composite_score": 0.9592,
-            "total_score": 4.3,
-            "duration": 70.44,
-            "cost": 0.022272
+            "efficiency_score": 0.9266,
+            "usage_score": 0.9874,
+            "composite_score": 0.9828,
+            "total_score": 4.41,
+            "duration": 29.35,
+            "cost": 0.010062
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.8676,
-            "usage_score": 0.8489,
-            "composite_score": 0.9433,
-            "total_score": 4.22,
-            "duration": 52.95,
-            "cost": 0.120876
+            "efficiency_score": 0.9113,
+            "usage_score": 0.903,
+            "composite_score": 0.9629,
+            "total_score": 4.31,
+            "duration": 35.47,
+            "cost": 0.077588
           }
         }
       },
@@ -313,48 +313,48 @@
         "name": "Gmail Finance Follow-up",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 5.5,
+            "passed": true,
+            "task_score": 8.0,
             "task_max_score": 8.0,
-            "efficiency_score": 0,
-            "usage_score": 0.2955,
-            "composite_score": 0.0591,
-            "total_score": 5.8,
-            "duration": 660.0,
-            "cost": 0.986231
+            "efficiency_score": 0.6936,
+            "usage_score": 0.8373,
+            "composite_score": 0.9062,
+            "total_score": 9.53,
+            "duration": 202.23,
+            "cost": 0.227799
           },
           "dashscope/qwen3.6-plus": {
             "passed": false,
-            "task_score": 4.5,
+            "task_score": 5.5,
             "task_max_score": 8.0,
-            "efficiency_score": 0.51,
-            "usage_score": 0.9466,
-            "composite_score": 0.2913,
-            "total_score": 5.96,
-            "duration": 323.4,
-            "cost": 0.074744
+            "efficiency_score": 0.6971,
+            "usage_score": 0.3727,
+            "composite_score": 0.214,
+            "total_score": 6.57,
+            "duration": 199.91,
+            "cost": 0.878164
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 4.5,
+            "passed": true,
+            "task_score": 8.0,
             "task_max_score": 8.0,
-            "efficiency_score": 0.3837,
-            "usage_score": 0.861,
-            "composite_score": 0.2489,
-            "total_score": 5.74,
-            "duration": 406.78,
-            "cost": 0.194652
+            "efficiency_score": 0.73,
+            "usage_score": 0.9504,
+            "composite_score": 0.9361,
+            "total_score": 9.68,
+            "duration": 178.18,
+            "cost": 0.069446
           },
           "dashscope/qwen3.6-flash": {
-            "passed": false,
-            "task_score": 6.0,
+            "passed": true,
+            "task_score": 8.0,
             "task_max_score": 8.0,
-            "efficiency_score": 0.6636,
-            "usage_score": 0.6235,
-            "composite_score": 0.2574,
-            "total_score": 7.29,
-            "duration": 222.01,
-            "cost": 0.527086
+            "efficiency_score": 0.7944,
+            "usage_score": 0.778,
+            "composite_score": 0.9145,
+            "total_score": 9.57,
+            "duration": 135.71,
+            "cost": 0.310808
           }
         }
       },
@@ -363,47 +363,47 @@
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
             "passed": true,
-            "task_score": 10.0,
+            "task_score": 8.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.3583,
-            "usage_score": 0.626,
-            "composite_score": 0.7969,
-            "total_score": 10.98,
-            "duration": 462.06,
-            "cost": 0.635744
+            "efficiency_score": 0.6089,
+            "usage_score": 0.7668,
+            "composite_score": 0.8751,
+            "total_score": 9.38,
+            "duration": 281.58,
+            "cost": 0.396458
           },
           "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 9.0,
+            "passed": false,
+            "task_score": 1.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.2028,
-            "usage_score": 0,
-            "composite_score": 0.6406,
-            "total_score": 9.2,
-            "duration": 573.96,
-            "cost": 3.022142
+            "efficiency_score": 0,
+            "usage_score": 0.9643,
+            "composite_score": 0.1929,
+            "total_score": 1.96,
+            "duration": 720.0,
+            "cost": 0.060764
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 10.0,
+            "task_score": 9.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.5475,
-            "usage_score": 0.9321,
-            "composite_score": 0.8959,
-            "total_score": 11.48,
-            "duration": 325.79,
-            "cost": 0.11543
+            "efficiency_score": 0.7594,
+            "usage_score": 0.9658,
+            "composite_score": 0.945,
+            "total_score": 10.73,
+            "duration": 173.23,
+            "cost": 0.058207
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 10.0,
+            "task_score": 8.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.4464,
-            "usage_score": 0.3569,
-            "composite_score": 0.7606,
-            "total_score": 10.8,
-            "duration": 398.61,
-            "cost": 1.09333
+            "efficiency_score": 0.7839,
+            "usage_score": 0.761,
+            "composite_score": 0.909,
+            "total_score": 9.54,
+            "duration": 155.57,
+            "cost": 0.406261
           }
         }
       },
@@ -411,48 +411,48 @@
         "name": "GitHub PR Review",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 9.0,
+            "passed": false,
+            "task_score": 5.6,
             "task_max_score": 9.0,
-            "efficiency_score": 0.4003,
-            "usage_score": 0.6638,
-            "composite_score": 0.8128,
-            "total_score": 10.06,
-            "duration": 431.79,
-            "cost": 0.571544
+            "efficiency_score": 0,
+            "usage_score": 0.9855,
+            "composite_score": 0.1971,
+            "total_score": 6.59,
+            "duration": 720.0,
+            "cost": 0.024722
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
-            "task_score": 9.0,
+            "task_score": 7.9,
             "task_max_score": 9.0,
-            "efficiency_score": 0.2815,
-            "usage_score": 0,
-            "composite_score": 0.6563,
-            "total_score": 9.28,
-            "duration": 517.35,
-            "cost": 2.367362
+            "efficiency_score": 0.6006,
+            "usage_score": 0.1583,
+            "composite_score": 0.7518,
+            "total_score": 8.66,
+            "duration": 287.55,
+            "cost": 1.430808
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 9.0,
+            "passed": false,
+            "task_score": 5.6,
             "task_max_score": 9.0,
-            "efficiency_score": 0.3765,
-            "usage_score": 0.8931,
-            "composite_score": 0.8539,
-            "total_score": 10.27,
-            "duration": 448.9,
-            "cost": 0.181648
+            "efficiency_score": 0.1552,
+            "usage_score": 0.8198,
+            "composite_score": 0.195,
+            "total_score": 6.58,
+            "duration": 608.24,
+            "cost": 0.306346
           },
           "dashscope/qwen3.6-flash": {
-            "passed": false,
-            "task_score": 5.7,
+            "passed": true,
+            "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.7148,
-            "usage_score": 0.7237,
-            "composite_score": 0.2877,
-            "total_score": 7.14,
-            "duration": 205.36,
-            "cost": 0.469627
+            "efficiency_score": 0.5592,
+            "usage_score": 0.4888,
+            "composite_score": 0.8096,
+            "total_score": 10.05,
+            "duration": 317.4,
+            "cost": 0.869086
           }
         }
       },
@@ -461,47 +461,47 @@
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
             "passed": true,
-            "task_score": 15.0,
+            "task_score": 13.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.4643,
-            "usage_score": 0.8091,
-            "composite_score": 0.8547,
-            "total_score": 16.27,
-            "duration": 321.39,
-            "cost": 0.381834
+            "efficiency_score": 0.4963,
+            "usage_score": 0.7822,
+            "composite_score": 0.8557,
+            "total_score": 14.28,
+            "duration": 302.23,
+            "cost": 0.435573
           },
           "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 15.0,
+            "passed": false,
+            "task_score": 11.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.3839,
-            "usage_score": 0.1466,
-            "composite_score": 0.7061,
-            "total_score": 15.53,
-            "duration": 369.69,
-            "cost": 1.706792
+            "efficiency_score": 0.5717,
+            "usage_score": 0.3799,
+            "composite_score": 0.1903,
+            "total_score": 11.95,
+            "duration": 256.97,
+            "cost": 1.24024
           },
           "dashscope/qwen3.5-flash": {
             "passed": false,
-            "task_score": 6.5,
+            "task_score": 10.5,
             "task_max_score": 15.0,
-            "efficiency_score": 0.459,
-            "usage_score": 0.902,
-            "composite_score": 0.2722,
-            "total_score": 7.86,
-            "duration": 324.6,
-            "cost": 0.19598
+            "efficiency_score": 0.4921,
+            "usage_score": 0.9396,
+            "composite_score": 0.2863,
+            "total_score": 11.93,
+            "duration": 304.74,
+            "cost": 0.12087
           },
           "dashscope/qwen3.6-flash": {
-            "passed": true,
-            "task_score": 15.0,
+            "passed": false,
+            "task_score": 9.5,
             "task_max_score": 15.0,
-            "efficiency_score": 0.5892,
-            "usage_score": 0.6753,
-            "composite_score": 0.8529,
-            "total_score": 16.26,
-            "duration": 246.49,
-            "cost": 0.649499
+            "efficiency_score": 0.3789,
+            "usage_score": 0.407,
+            "composite_score": 0.1572,
+            "total_score": 10.29,
+            "duration": 372.69,
+            "cost": 1.186083
           }
         }
       },
@@ -512,45 +512,45 @@
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.6348,
-            "usage_score": 0.7667,
-            "composite_score": 0.8803,
-            "total_score": 10.9,
-            "duration": 182.6,
-            "cost": 0.233308
+            "efficiency_score": 0.6558,
+            "usage_score": 0.7328,
+            "composite_score": 0.8777,
+            "total_score": 10.89,
+            "duration": 172.11,
+            "cost": 0.267171
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.6508,
-            "usage_score": 0.1267,
-            "composite_score": 0.7555,
-            "total_score": 10.28,
-            "duration": 174.6,
-            "cost": 0.873344
+            "efficiency_score": 0.6589,
+            "usage_score": 0.2235,
+            "composite_score": 0.7765,
+            "total_score": 10.38,
+            "duration": 170.53,
+            "cost": 0.776472
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 4.5,
+            "passed": true,
+            "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.544,
-            "usage_score": 0.8979,
-            "composite_score": 0.2884,
-            "total_score": 5.94,
-            "duration": 227.98,
-            "cost": 0.102127
+            "efficiency_score": 0.5344,
+            "usage_score": 0.9002,
+            "composite_score": 0.8869,
+            "total_score": 10.93,
+            "duration": 232.78,
+            "cost": 0.099769
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.7796,
-            "usage_score": 0.7119,
-            "composite_score": 0.8983,
-            "total_score": 10.99,
-            "duration": 110.18,
-            "cost": 0.288074
+            "efficiency_score": 0.4905,
+            "usage_score": 0.1818,
+            "composite_score": 0.7345,
+            "total_score": 10.17,
+            "duration": 254.76,
+            "cost": 0.818165
           }
         }
       },
@@ -559,47 +559,47 @@
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
             "passed": false,
-            "task_score": 4,
+            "task_score": 3,
             "task_max_score": 10,
-            "efficiency_score": 0.4322,
-            "usage_score": 0.4604,
-            "composite_score": 0.1785,
-            "total_score": 4.89,
-            "duration": 340.7,
-            "cost": 0.539591
+            "efficiency_score": 0.6762,
+            "usage_score": 0.7927,
+            "composite_score": 0.2938,
+            "total_score": 4.47,
+            "duration": 194.28,
+            "cost": 0.207348
           },
           "dashscope/qwen3.6-plus": {
             "passed": false,
-            "task_score": 5,
+            "task_score": 3,
             "task_max_score": 10,
-            "efficiency_score": 0,
-            "usage_score": 0.8828,
-            "composite_score": 0.1766,
-            "total_score": 5.88,
-            "duration": 600.0,
-            "cost": 0.117208
+            "efficiency_score": 0.2159,
+            "usage_score": 0,
+            "composite_score": 0.0432,
+            "total_score": 3.22,
+            "duration": 470.46,
+            "cost": 2.344964
           },
           "dashscope/qwen3.5-flash": {
             "passed": false,
-            "task_score": 3,
+            "task_score": 4,
             "task_max_score": 10,
             "efficiency_score": 0,
-            "usage_score": 0.9933,
-            "composite_score": 0.1987,
-            "total_score": 3.99,
+            "usage_score": 0.9954,
+            "composite_score": 0.1991,
+            "total_score": 5.0,
             "duration": 600.0,
-            "cost": 0.006699
+            "cost": 0.004625
           },
           "dashscope/qwen3.6-flash": {
             "passed": false,
-            "task_score": 5,
+            "task_score": 3,
             "task_max_score": 10,
             "efficiency_score": 0,
-            "usage_score": 0,
-            "composite_score": 0,
-            "total_score": 5.0,
+            "usage_score": 0.9745,
+            "composite_score": 0.1949,
+            "total_score": 3.97,
             "duration": 600.0,
-            "cost": 3.799832
+            "cost": 0.025519
           }
         }
       },
@@ -610,45 +610,45 @@
             "passed": true,
             "task_score": 12,
             "task_max_score": 12,
-            "efficiency_score": 0,
-            "usage_score": 0.9769,
-            "composite_score": 0.7954,
-            "total_score": 12.98,
-            "duration": 600.0,
-            "cost": 0.023114
+            "efficiency_score": 0.0086,
+            "usage_score": 0,
+            "composite_score": 0.6017,
+            "total_score": 12.01,
+            "duration": 594.83,
+            "cost": 1.17386
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 12,
             "task_max_score": 12,
-            "efficiency_score": 0.4066,
+            "efficiency_score": 0.2091,
             "usage_score": 0,
-            "composite_score": 0.6813,
-            "total_score": 12.41,
-            "duration": 356.04,
-            "cost": 3.03913
+            "composite_score": 0.6418,
+            "total_score": 12.21,
+            "duration": 474.55,
+            "cost": 2.420046
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 12,
+            "task_score": 11,
             "task_max_score": 12,
-            "efficiency_score": 0.198,
-            "usage_score": 0.2385,
-            "composite_score": 0.6873,
-            "total_score": 12.44,
-            "duration": 481.17,
-            "cost": 0.761452
+            "efficiency_score": 0.7108,
+            "usage_score": 0.9274,
+            "composite_score": 0.9276,
+            "total_score": 12.64,
+            "duration": 173.53,
+            "cost": 0.072578
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 11,
+            "task_score": 12,
             "task_max_score": 12,
-            "efficiency_score": 0.66,
-            "usage_score": 0,
-            "composite_score": 0.732,
-            "total_score": 11.66,
-            "duration": 204.0,
-            "cost": 1.345655
+            "efficiency_score": 0.7041,
+            "usage_score": 0.4385,
+            "composite_score": 0.8285,
+            "total_score": 13.14,
+            "duration": 177.55,
+            "cost": 0.561479
           }
         }
       },
@@ -659,45 +659,45 @@
             "passed": true,
             "task_score": 13.0,
             "task_max_score": 13.0,
-            "efficiency_score": 0.0364,
-            "usage_score": 0.4954,
-            "composite_score": 0.7064,
-            "total_score": 13.53,
-            "duration": 578.15,
-            "cost": 1.009128
+            "efficiency_score": 0.4275,
+            "usage_score": 0.7683,
+            "composite_score": 0.8391,
+            "total_score": 14.2,
+            "duration": 343.52,
+            "cost": 0.463489
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 13.0,
             "task_max_score": 13.0,
-            "efficiency_score": 0.0967,
-            "usage_score": 0,
-            "composite_score": 0.6193,
-            "total_score": 13.1,
-            "duration": 541.99,
-            "cost": 2.88689
+            "efficiency_score": 0.6046,
+            "usage_score": 0.4005,
+            "composite_score": 0.801,
+            "total_score": 14.01,
+            "duration": 237.25,
+            "cost": 1.199092
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 5.0,
+            "passed": true,
+            "task_score": 11.5,
             "task_max_score": 13.0,
-            "efficiency_score": 0.4638,
-            "usage_score": 0.9199,
-            "composite_score": 0.2767,
-            "total_score": 6.38,
-            "duration": 321.69,
-            "cost": 0.160237
+            "efficiency_score": 0.7038,
+            "usage_score": 0.9682,
+            "composite_score": 0.9344,
+            "total_score": 13.17,
+            "duration": 177.7,
+            "cost": 0.063611
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 13.0,
+            "task_score": 11.5,
             "task_max_score": 13.0,
-            "efficiency_score": 0.5427,
-            "usage_score": 0.632,
-            "composite_score": 0.8349,
-            "total_score": 14.17,
-            "duration": 274.37,
-            "cost": 0.735939
+            "efficiency_score": 0.33,
+            "usage_score": 0.3386,
+            "composite_score": 0.7337,
+            "total_score": 12.17,
+            "duration": 402.0,
+            "cost": 1.322876
           }
         }
       },
@@ -708,45 +708,45 @@
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.4775,
-            "usage_score": 0.6789,
-            "composite_score": 0.8313,
-            "total_score": 13.16,
-            "duration": 261.27,
-            "cost": 0.385295
+            "efficiency_score": 0.7972,
+            "usage_score": 0.8933,
+            "composite_score": 0.9381,
+            "total_score": 13.69,
+            "duration": 101.4,
+            "cost": 0.128081
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.5295,
-            "usage_score": 0.0059,
-            "composite_score": 0.7071,
-            "total_score": 12.54,
-            "duration": 235.23,
-            "cost": 1.192924
+            "efficiency_score": 0.7719,
+            "usage_score": 0.605,
+            "composite_score": 0.8754,
+            "total_score": 13.38,
+            "duration": 114.04,
+            "cost": 0.474032
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 5.5,
+            "passed": true,
+            "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.5687,
-            "usage_score": 0.9369,
-            "composite_score": 0.3011,
-            "total_score": 7.01,
-            "duration": 215.66,
-            "cost": 0.075755
+            "efficiency_score": 0.7945,
+            "usage_score": 0.9724,
+            "composite_score": 0.9534,
+            "total_score": 13.77,
+            "duration": 102.74,
+            "cost": 0.033113
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.6898,
-            "usage_score": 0.6924,
-            "composite_score": 0.8764,
-            "total_score": 13.38,
-            "duration": 155.11,
-            "cost": 0.369068
+            "efficiency_score": 0.867,
+            "usage_score": 0.8884,
+            "composite_score": 0.9511,
+            "total_score": 13.76,
+            "duration": 66.49,
+            "cost": 0.133872
           }
         }
       },
@@ -757,45 +757,45 @@
             "passed": true,
             "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.3467,
-            "usage_score": 0.5285,
-            "composite_score": 0.775,
-            "total_score": 10.88,
-            "duration": 640.27,
-            "cost": 0.990182
+            "efficiency_score": 0.5189,
+            "usage_score": 0.7048,
+            "composite_score": 0.8447,
+            "total_score": 11.22,
+            "duration": 471.52,
+            "cost": 0.619899
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.2931,
-            "usage_score": 0,
-            "composite_score": 0.6586,
-            "total_score": 10.29,
-            "duration": 692.8,
-            "cost": 3.856454
+            "efficiency_score": 0.6548,
+            "usage_score": 0.3107,
+            "composite_score": 0.7931,
+            "total_score": 10.97,
+            "duration": 338.34,
+            "cost": 1.44752
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 8.6,
+            "passed": false,
+            "task_score": 7.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.4762,
-            "usage_score": 0.8951,
-            "composite_score": 0.8743,
-            "total_score": 9.97,
-            "duration": 513.3,
-            "cost": 0.220264
+            "efficiency_score": 0,
+            "usage_score": 0.9983,
+            "composite_score": 0.1997,
+            "total_score": 8.0,
+            "duration": 980.0,
+            "cost": 0.003504
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.5981,
-            "usage_score": 0.427,
-            "composite_score": 0.805,
-            "total_score": 11.03,
-            "duration": 393.9,
-            "cost": 1.203249
+            "efficiency_score": 0.3288,
+            "usage_score": 0,
+            "composite_score": 0.6658,
+            "total_score": 10.33,
+            "duration": 657.75,
+            "cost": 2.32915
           }
         }
       },
@@ -806,45 +806,45 @@
             "passed": true,
             "task_score": 11.0,
             "task_max_score": 11.0,
-            "efficiency_score": 0.3758,
-            "usage_score": 0.5656,
-            "composite_score": 0.7883,
-            "total_score": 11.94,
-            "duration": 649.19,
-            "cost": 1.042656
+            "efficiency_score": 0.5918,
+            "usage_score": 0.7165,
+            "composite_score": 0.8617,
+            "total_score": 12.31,
+            "duration": 424.55,
+            "cost": 0.680458
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 11.0,
             "task_max_score": 11.0,
-            "efficiency_score": 0.2978,
-            "usage_score": 0,
-            "composite_score": 0.6596,
-            "total_score": 11.3,
-            "duration": 730.3,
-            "cost": 3.790736
+            "efficiency_score": 0.6647,
+            "usage_score": 0.2763,
+            "composite_score": 0.7882,
+            "total_score": 11.94,
+            "duration": 348.73,
+            "cost": 1.736828
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 11.0,
             "task_max_score": 11.0,
-            "efficiency_score": 0.5398,
-            "usage_score": 0.9191,
-            "composite_score": 0.8918,
-            "total_score": 12.46,
-            "duration": 478.62,
-            "cost": 0.194207
+            "efficiency_score": 0.7423,
+            "usage_score": 0.9593,
+            "composite_score": 0.9403,
+            "total_score": 12.7,
+            "duration": 268.02,
+            "cost": 0.09777
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 11.0,
             "task_max_score": 11.0,
-            "efficiency_score": 0.6066,
-            "usage_score": 0.5081,
-            "composite_score": 0.8229,
-            "total_score": 12.11,
-            "duration": 409.09,
-            "cost": 1.180595
+            "efficiency_score": 0.7793,
+            "usage_score": 0.7133,
+            "composite_score": 0.8985,
+            "total_score": 12.49,
+            "duration": 229.48,
+            "cost": 0.688161
           }
         }
       },
@@ -855,45 +855,45 @@
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.8464,
-            "usage_score": 0.8393,
-            "composite_score": 0.9371,
-            "total_score": 3.69,
-            "duration": 46.07,
-            "cost": 0.080343
+            "efficiency_score": 0.885,
+            "usage_score": 0.9293,
+            "composite_score": 0.9628,
+            "total_score": 3.81,
+            "duration": 34.51,
+            "cost": 0.035371
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.8487,
-            "usage_score": 0.598,
-            "composite_score": 0.8893,
-            "total_score": 3.45,
-            "duration": 45.39,
-            "cost": 0.201014
+            "efficiency_score": 0.8784,
+            "usage_score": 0.7339,
+            "composite_score": 0.9225,
+            "total_score": 3.61,
+            "duration": 36.47,
+            "cost": 0.13306
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.8854,
-            "usage_score": 0.977,
-            "composite_score": 0.9725,
-            "total_score": 3.86,
-            "duration": 34.37,
-            "cost": 0.011498
+            "efficiency_score": 0.9231,
+            "usage_score": 0.9845,
+            "composite_score": 0.9815,
+            "total_score": 3.91,
+            "duration": 23.08,
+            "cost": 0.007746
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.91,
-            "usage_score": 0.88,
-            "composite_score": 0.958,
-            "total_score": 3.79,
-            "duration": 26.99,
-            "cost": 0.060003
+            "efficiency_score": 0.936,
+            "usage_score": 0.9139,
+            "composite_score": 0.97,
+            "total_score": 3.85,
+            "duration": 19.2,
+            "cost": 0.04303
           }
         }
       },
@@ -901,48 +901,48 @@
         "name": "Gmail Inbox Cleanup",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 7.0,
+            "passed": false,
+            "task_score": 5.5,
             "task_max_score": 7.0,
-            "efficiency_score": 0.3149,
-            "usage_score": 0.4488,
-            "composite_score": 0.7527,
-            "total_score": 7.76,
-            "duration": 411.08,
-            "cost": 0.661413
+            "efficiency_score": 0,
+            "usage_score": 0.9793,
+            "composite_score": 0.1959,
+            "total_score": 6.48,
+            "duration": 600.0,
+            "cost": 0.024847
           },
           "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 7.0,
+            "passed": false,
+            "task_score": 4.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.2703,
-            "usage_score": 0,
-            "composite_score": 0.6541,
-            "total_score": 7.27,
-            "duration": 437.79,
-            "cost": 2.304078
+            "efficiency_score": 0.0752,
+            "usage_score": 0.9475,
+            "composite_score": 0.2045,
+            "total_score": 5.02,
+            "duration": 554.88,
+            "cost": 0.063044
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 7.0,
+            "passed": false,
+            "task_score": 2.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.1128,
-            "usage_score": 0.7381,
-            "composite_score": 0.7702,
-            "total_score": 7.85,
-            "duration": 532.32,
-            "cost": 0.314275
+            "efficiency_score": 0.6932,
+            "usage_score": 0.9416,
+            "composite_score": 0.327,
+            "total_score": 3.63,
+            "duration": 184.05,
+            "cost": 0.070105
           },
           "dashscope/qwen3.6-flash": {
             "passed": false,
-            "task_score": 4.0,
+            "task_score": 3.5,
             "task_max_score": 7.0,
-            "efficiency_score": 0.2905,
-            "usage_score": 0,
-            "composite_score": 0.0581,
-            "total_score": 4.29,
-            "duration": 425.71,
-            "cost": 1.520833
+            "efficiency_score": 0,
+            "usage_score": 0.9798,
+            "composite_score": 0.196,
+            "total_score": 4.48,
+            "duration": 600.0,
+            "cost": 0.024216
           }
         }
       },
@@ -953,45 +953,45 @@
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.5478,
-            "usage_score": 0.7201,
-            "composite_score": 0.8536,
-            "total_score": 6.27,
-            "duration": 180.87,
-            "cost": 0.279854
+            "efficiency_score": 0.6988,
+            "usage_score": 0.8315,
+            "composite_score": 0.9061,
+            "total_score": 6.53,
+            "duration": 120.48,
+            "cost": 0.16851
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.3453,
-            "usage_score": 0,
-            "composite_score": 0.6691,
-            "total_score": 5.35,
-            "duration": 261.89,
-            "cost": 1.409976
+            "efficiency_score": 0.62,
+            "usage_score": 0.3761,
+            "composite_score": 0.7992,
+            "total_score": 6.0,
+            "duration": 152.0,
+            "cost": 0.623928
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.6321,
-            "usage_score": 0.9111,
-            "composite_score": 0.9086,
-            "total_score": 6.54,
-            "duration": 147.18,
-            "cost": 0.088939
+            "efficiency_score": 0.8279,
+            "usage_score": 0.9774,
+            "composite_score": 0.961,
+            "total_score": 6.81,
+            "duration": 68.85,
+            "cost": 0.022635
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0,
-            "usage_score": 0,
-            "composite_score": 0.6,
-            "total_score": 5.0,
-            "duration": 400.0,
-            "cost": 2.483521
+            "efficiency_score": 0.7878,
+            "usage_score": 0.8014,
+            "composite_score": 0.9178,
+            "total_score": 6.59,
+            "duration": 84.87,
+            "cost": 0.198635
           }
         }
       },
@@ -1000,47 +1000,47 @@
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
             "passed": true,
-            "task_score": 12.0,
+            "task_score": 10.5,
             "task_max_score": 12.0,
-            "efficiency_score": 0.4577,
-            "usage_score": 0.7468,
-            "composite_score": 0.8409,
-            "total_score": 13.2,
-            "duration": 325.38,
-            "cost": 0.506384
+            "efficiency_score": 0.7348,
+            "usage_score": 0.8938,
+            "composite_score": 0.9257,
+            "total_score": 12.13,
+            "duration": 159.11,
+            "cost": 0.212449
           },
           "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 10.5,
+            "passed": false,
+            "task_score": 9.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.2862,
-            "usage_score": 0.0318,
-            "composite_score": 0.6636,
-            "total_score": 10.82,
-            "duration": 428.29,
-            "cost": 1.936474
+            "efficiency_score": 0.6785,
+            "usage_score": 0.5536,
+            "composite_score": 0.2464,
+            "total_score": 10.23,
+            "duration": 192.89,
+            "cost": 0.892756
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 10.5,
+            "passed": false,
+            "task_score": 8.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.6821,
-            "usage_score": 0.9637,
-            "composite_score": 0.9292,
-            "total_score": 12.15,
-            "duration": 190.73,
-            "cost": 0.072515
+            "efficiency_score": 0.4969,
+            "usage_score": 0.9316,
+            "composite_score": 0.2857,
+            "total_score": 9.43,
+            "duration": 301.89,
+            "cost": 0.136738
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 10.5,
             "task_max_score": 12.0,
-            "efficiency_score": 0.6666,
-            "usage_score": 0.7428,
-            "composite_score": 0.8819,
-            "total_score": 11.91,
-            "duration": 200.06,
-            "cost": 0.514314
+            "efficiency_score": 0.6685,
+            "usage_score": 0.5992,
+            "composite_score": 0.8535,
+            "total_score": 11.77,
+            "duration": 198.87,
+            "cost": 0.801651
           }
         }
       },
@@ -1051,45 +1051,45 @@
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.669,
-            "usage_score": 0.83,
-            "composite_score": 0.8998,
-            "total_score": 5.0,
-            "duration": 165.5,
-            "cost": 0.203975
+            "efficiency_score": 0.7859,
+            "usage_score": 0.905,
+            "composite_score": 0.9382,
+            "total_score": 5.19,
+            "duration": 107.07,
+            "cost": 0.113952
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.3874,
-            "usage_score": 0,
-            "composite_score": 0.6775,
-            "total_score": 3.89,
-            "duration": 306.32,
-            "cost": 1.640526
+            "efficiency_score": 0.7919,
+            "usage_score": 0.6121,
+            "composite_score": 0.8808,
+            "total_score": 4.9,
+            "duration": 104.07,
+            "cost": 0.465436
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.7505,
-            "usage_score": 0.9652,
-            "composite_score": 0.9431,
-            "total_score": 5.22,
-            "duration": 124.77,
-            "cost": 0.041761
+            "efficiency_score": 0.8477,
+            "usage_score": 0.9804,
+            "composite_score": 0.9656,
+            "total_score": 5.33,
+            "duration": 76.13,
+            "cost": 0.023503
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.7147,
-            "usage_score": 0.7104,
-            "composite_score": 0.885,
-            "total_score": 4.93,
-            "duration": 142.65,
-            "cost": 0.347546
+            "efficiency_score": 0.8288,
+            "usage_score": 0.8455,
+            "composite_score": 0.9349,
+            "total_score": 5.17,
+            "duration": 85.59,
+            "cost": 0.185456
           }
         }
       },
@@ -1097,48 +1097,48 @@
         "name": "StayBnB Book \u2014 Filters, Gallery & Two-Step Booking",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 4.5,
+            "passed": true,
+            "task_score": 15.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0,
-            "usage_score": 0.6572,
-            "composite_score": 0.1314,
-            "total_score": 5.16,
-            "duration": 600.0,
-            "cost": 0.685567
+            "efficiency_score": 0.5076,
+            "usage_score": 0.7975,
+            "composite_score": 0.861,
+            "total_score": 16.31,
+            "duration": 295.43,
+            "cost": 0.405032
           },
           "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 2.0,
+            "passed": true,
+            "task_score": 15.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0,
-            "usage_score": 0,
-            "composite_score": 0,
-            "total_score": 2.0,
-            "duration": 600.0,
-            "cost": 3.217476
+            "efficiency_score": 0.5737,
+            "usage_score": 0.483,
+            "composite_score": 0.8113,
+            "total_score": 16.06,
+            "duration": 255.79,
+            "cost": 1.034026
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 11.0,
+            "passed": true,
+            "task_score": 15.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.5078,
-            "usage_score": 0.9389,
-            "composite_score": 0.2893,
-            "total_score": 12.45,
-            "duration": 295.33,
-            "cost": 0.122194
+            "efficiency_score": 0.6828,
+            "usage_score": 0.9678,
+            "composite_score": 0.9301,
+            "total_score": 16.65,
+            "duration": 190.35,
+            "cost": 0.064467
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 13.0,
+            "task_score": 15.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.0766,
-            "usage_score": 0,
-            "composite_score": 0.6153,
-            "total_score": 13.08,
-            "duration": 554.04,
-            "cost": 2.141542
+            "efficiency_score": 0.7949,
+            "usage_score": 0.8624,
+            "composite_score": 0.9314,
+            "total_score": 16.66,
+            "duration": 123.08,
+            "cost": 0.275256
           }
         }
       },
@@ -1146,48 +1146,48 @@
         "name": "MapQuest Navigate \u2014 Autocomplete, Directions & Collapse",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 9.5,
+            "passed": false,
+            "task_score": 6.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.4877,
-            "usage_score": 0.7188,
-            "composite_score": 0.8413,
-            "total_score": 10.71,
-            "duration": 276.65,
-            "cost": 0.421804
+            "efficiency_score": 0.6945,
+            "usage_score": 0.8785,
+            "composite_score": 0.3146,
+            "total_score": 8.07,
+            "duration": 164.98,
+            "cost": 0.182317
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
-            "task_score": 9.5,
+            "task_score": 8.0,
             "task_max_score": 9.5,
-            "efficiency_score": 0.4738,
-            "usage_score": 0.114,
-            "composite_score": 0.7176,
-            "total_score": 10.09,
-            "duration": 284.12,
-            "cost": 1.329044
+            "efficiency_score": 0.6615,
+            "usage_score": 0.4869,
+            "composite_score": 0.8297,
+            "total_score": 9.15,
+            "duration": 182.79,
+            "cost": 0.769716
           },
           "dashscope/qwen3.5-flash": {
             "passed": false,
             "task_score": 5.0,
             "task_max_score": 9.5,
-            "efficiency_score": 0.6743,
-            "usage_score": 0.9557,
-            "composite_score": 0.326,
-            "total_score": 6.63,
-            "duration": 175.89,
-            "cost": 0.066523
+            "efficiency_score": 0.5191,
+            "usage_score": 0.9194,
+            "composite_score": 0.2877,
+            "total_score": 6.44,
+            "duration": 259.68,
+            "cost": 0.120943
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 9.5,
+            "task_score": 8.0,
             "task_max_score": 9.5,
-            "efficiency_score": 0.638,
-            "usage_score": 0.665,
-            "composite_score": 0.8606,
-            "total_score": 10.8,
-            "duration": 195.48,
-            "cost": 0.502484
+            "efficiency_score": 0.8246,
+            "usage_score": 0.8489,
+            "composite_score": 0.9347,
+            "total_score": 9.67,
+            "duration": 94.72,
+            "cost": 0.226632
           }
         }
       },
@@ -1198,45 +1198,45 @@
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.55,
-            "usage_score": 0.7714,
-            "composite_score": 0.8643,
-            "total_score": 10.32,
-            "duration": 297.0,
-            "cost": 0.342954
+            "efficiency_score": 0.6427,
+            "usage_score": 0.7972,
+            "composite_score": 0.888,
+            "total_score": 10.44,
+            "duration": 235.82,
+            "cost": 0.30418
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.5708,
-            "usage_score": 0.066,
-            "composite_score": 0.7274,
-            "total_score": 9.64,
-            "duration": 283.26,
-            "cost": 1.401068
+            "efficiency_score": 0.6182,
+            "usage_score": 0.2733,
+            "composite_score": 0.7783,
+            "total_score": 9.89,
+            "duration": 251.97,
+            "cost": 1.09007
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 9.0,
+            "task_score": 8.2,
             "task_max_score": 9.0,
-            "efficiency_score": 0.65,
-            "usage_score": 0.9565,
-            "composite_score": 0.9213,
-            "total_score": 10.61,
-            "duration": 230.98,
-            "cost": 0.065318
+            "efficiency_score": 0.7991,
+            "usage_score": 0.9717,
+            "composite_score": 0.9542,
+            "total_score": 9.97,
+            "duration": 132.6,
+            "cost": 0.042404
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.7214,
-            "usage_score": 0.7387,
-            "composite_score": 0.892,
+            "efficiency_score": 0.74,
+            "usage_score": 0.7243,
+            "composite_score": 0.8929,
             "total_score": 10.46,
-            "duration": 183.91,
-            "cost": 0.392011
+            "duration": 171.58,
+            "cost": 0.41355
           }
         }
       },
@@ -1247,45 +1247,45 @@
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.4354,
-            "usage_score": 0.7418,
-            "composite_score": 0.8354,
-            "total_score": 13.18,
-            "duration": 304.9,
-            "cost": 0.387372
+            "efficiency_score": 0.6836,
+            "usage_score": 0.8682,
+            "composite_score": 0.9104,
+            "total_score": 13.55,
+            "duration": 170.83,
+            "cost": 0.197702
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.5308,
-            "usage_score": 0.1223,
-            "composite_score": 0.7306,
-            "total_score": 12.65,
-            "duration": 253.36,
-            "cost": 1.316514
+            "efficiency_score": 0.3887,
+            "usage_score": 0.0881,
+            "composite_score": 0.6953,
+            "total_score": 12.48,
+            "duration": 330.12,
+            "cost": 1.367892
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 12.0,
+            "passed": false,
+            "task_score": 8.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.3917,
-            "usage_score": 0.9233,
-            "composite_score": 0.863,
-            "total_score": 13.32,
-            "duration": 328.48,
-            "cost": 0.114982
+            "efficiency_score": 0.5188,
+            "usage_score": 0.9429,
+            "composite_score": 0.2923,
+            "total_score": 9.46,
+            "duration": 259.83,
+            "cost": 0.08569
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 12.0,
+            "task_score": 10.5,
             "task_max_score": 12.0,
-            "efficiency_score": 0.4785,
-            "usage_score": 0.5683,
-            "composite_score": 0.8094,
-            "total_score": 13.05,
-            "duration": 281.59,
-            "cost": 0.647604
+            "efficiency_score": 0.6623,
+            "usage_score": 0.7383,
+            "composite_score": 0.8801,
+            "total_score": 11.9,
+            "duration": 182.37,
+            "cost": 0.392616
           }
         }
       },
@@ -1296,45 +1296,45 @@
             "passed": true,
             "task_score": 10.2,
             "task_max_score": 10.2,
-            "efficiency_score": 0.5724,
-            "usage_score": 0.724,
-            "composite_score": 0.8593,
-            "total_score": 11.5,
-            "duration": 299.3,
-            "cost": 0.44159
+            "efficiency_score": 0.7419,
+            "usage_score": 0.857,
+            "composite_score": 0.9198,
+            "total_score": 11.8,
+            "duration": 180.67,
+            "cost": 0.228795
           },
           "dashscope/qwen3.6-plus": {
             "passed": false,
-            "task_score": 5.0,
+            "task_score": 6.6,
             "task_max_score": 10.2,
-            "efficiency_score": 0.5455,
-            "usage_score": 0.0787,
-            "composite_score": 0.1248,
-            "total_score": 5.62,
-            "duration": 318.17,
-            "cost": 1.474142
+            "efficiency_score": 0.7862,
+            "usage_score": 0.6,
+            "composite_score": 0.2772,
+            "total_score": 7.99,
+            "duration": 149.69,
+            "cost": 0.639994
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 10.2,
             "task_max_score": 10.2,
-            "efficiency_score": 0.6671,
-            "usage_score": 0.9587,
-            "composite_score": 0.9252,
-            "total_score": 11.83,
-            "duration": 233.0,
-            "cost": 0.066143
+            "efficiency_score": 0.5111,
+            "usage_score": 0.9174,
+            "composite_score": 0.8857,
+            "total_score": 11.63,
+            "duration": 342.26,
+            "cost": 0.132226
           },
           "dashscope/qwen3.6-flash": {
-            "passed": false,
-            "task_score": 5.0,
+            "passed": true,
+            "task_score": 10.2,
             "task_max_score": 10.2,
-            "efficiency_score": 0.7593,
-            "usage_score": 0.742,
-            "composite_score": 0.3003,
-            "total_score": 6.5,
-            "duration": 168.47,
-            "cost": 0.412757
+            "efficiency_score": 0.8572,
+            "usage_score": 0.8712,
+            "composite_score": 0.9457,
+            "total_score": 11.93,
+            "duration": 99.95,
+            "cost": 0.206018
           }
         }
       },
@@ -1345,45 +1345,45 @@
             "passed": true,
             "task_score": 11.5,
             "task_max_score": 11.5,
-            "efficiency_score": 0,
-            "usage_score": 0.5974,
-            "composite_score": 0.7195,
-            "total_score": 12.1,
-            "duration": 540.0,
-            "cost": 0.603908
+            "efficiency_score": 0.6714,
+            "usage_score": 0.8696,
+            "composite_score": 0.9082,
+            "total_score": 13.04,
+            "duration": 177.42,
+            "cost": 0.195545
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 11.5,
             "task_max_score": 11.5,
-            "efficiency_score": 0.3247,
-            "usage_score": 0,
-            "composite_score": 0.6649,
-            "total_score": 11.82,
-            "duration": 364.66,
-            "cost": 1.920328
+            "efficiency_score": 0.6717,
+            "usage_score": 0.437,
+            "composite_score": 0.8217,
+            "total_score": 12.61,
+            "duration": 177.29,
+            "cost": 0.844436
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 8.0,
+            "passed": true,
+            "task_score": 11.5,
             "task_max_score": 11.5,
-            "efficiency_score": 0.4087,
-            "usage_score": 0.9047,
-            "composite_score": 0.2627,
-            "total_score": 9.31,
-            "duration": 319.33,
-            "cost": 0.142899
+            "efficiency_score": 0.572,
+            "usage_score": 0.9374,
+            "composite_score": 0.9019,
+            "total_score": 13.01,
+            "duration": 231.1,
+            "cost": 0.09388
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 11.5,
             "task_max_score": 11.5,
-            "efficiency_score": 0.5323,
-            "usage_score": 0.5423,
-            "composite_score": 0.8149,
-            "total_score": 12.57,
-            "duration": 252.55,
-            "cost": 0.686623
+            "efficiency_score": 0.8142,
+            "usage_score": 0.839,
+            "composite_score": 0.9306,
+            "total_score": 13.15,
+            "duration": 100.34,
+            "cost": 0.241561
           }
         }
       },
@@ -1391,48 +1391,48 @@
         "name": "Amazon Offer Disambiguation",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 7.0,
+            "passed": true,
+            "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.6603,
-            "usage_score": 0.7856,
-            "composite_score": 0.2892,
-            "total_score": 8.45,
-            "duration": 346.45,
-            "cost": 0.493124
+            "efficiency_score": 0.7894,
+            "usage_score": 0.8841,
+            "composite_score": 0.9347,
+            "total_score": 11.67,
+            "duration": 214.77,
+            "cost": 0.266625
           },
           "dashscope/qwen3.6-plus": {
             "passed": false,
-            "task_score": 6.2,
+            "task_score": 7.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.7479,
-            "usage_score": 0.4872,
-            "composite_score": 0.247,
-            "total_score": 7.44,
-            "duration": 257.14,
-            "cost": 1.17953
+            "efficiency_score": 0.8707,
+            "usage_score": 0.7579,
+            "composite_score": 0.3257,
+            "total_score": 8.63,
+            "duration": 131.92,
+            "cost": 0.556892
           },
           "dashscope/qwen3.5-flash": {
             "passed": false,
-            "task_score": 7.0,
+            "task_score": 6.2,
             "task_max_score": 10.0,
-            "efficiency_score": 0.7907,
-            "usage_score": 0.9741,
-            "composite_score": 0.353,
-            "total_score": 8.76,
-            "duration": 213.53,
-            "cost": 0.059478
+            "efficiency_score": 0.8995,
+            "usage_score": 0.9827,
+            "composite_score": 0.3764,
+            "total_score": 8.08,
+            "duration": 102.55,
+            "cost": 0.039861
           },
           "dashscope/qwen3.6-flash": {
-            "passed": false,
-            "task_score": 6.2,
+            "passed": true,
+            "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.8258,
-            "usage_score": 0.8236,
-            "composite_score": 0.3299,
-            "total_score": 7.85,
-            "duration": 177.73,
-            "cost": 0.405803
+            "efficiency_score": 0.903,
+            "usage_score": 0.9029,
+            "composite_score": 0.9612,
+            "total_score": 11.81,
+            "duration": 98.97,
+            "cost": 0.22334
           }
         }
       },
@@ -1443,45 +1443,45 @@
             "passed": true,
             "task_score": 6.6,
             "task_max_score": 6.6,
-            "efficiency_score": 0.3815,
-            "usage_score": 0.7203,
-            "composite_score": 0.8204,
-            "total_score": 7.7,
-            "duration": 383.46,
-            "cost": 0.363587
+            "efficiency_score": 0.6937,
+            "usage_score": 0.8318,
+            "composite_score": 0.9051,
+            "total_score": 8.13,
+            "duration": 189.89,
+            "cost": 0.218695
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 6.6,
             "task_max_score": 6.6,
-            "efficiency_score": 0.5794,
-            "usage_score": 0,
-            "composite_score": 0.7159,
-            "total_score": 7.18,
-            "duration": 260.74,
-            "cost": 1.416344
+            "efficiency_score": 0.7288,
+            "usage_score": 0.4106,
+            "composite_score": 0.8279,
+            "total_score": 7.74,
+            "duration": 168.15,
+            "cost": 0.766188
           },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 6.6,
+            "passed": false,
+            "task_score": 5.0,
             "task_max_score": 6.6,
-            "efficiency_score": 0.5448,
-            "usage_score": 0.9241,
-            "composite_score": 0.8938,
-            "total_score": 8.07,
-            "duration": 282.22,
-            "cost": 0.098646
+            "efficiency_score": 0.8099,
+            "usage_score": 0.9673,
+            "composite_score": 0.3555,
+            "total_score": 6.78,
+            "duration": 117.83,
+            "cost": 0.042507
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 6.6,
             "task_max_score": 6.6,
-            "efficiency_score": 0.6796,
-            "usage_score": 0.642,
-            "composite_score": 0.8643,
-            "total_score": 7.92,
-            "duration": 198.67,
-            "cost": 0.465433
+            "efficiency_score": 0.8357,
+            "usage_score": 0.8218,
+            "composite_score": 0.9315,
+            "total_score": 8.26,
+            "duration": 101.87,
+            "cost": 0.231676
           }
         }
       },
@@ -1492,45 +1492,45 @@
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7141,
-            "usage_score": 0.6034,
-            "composite_score": 0.8635,
-            "total_score": 4.32,
-            "duration": 171.55,
-            "cost": 0.198317
+            "efficiency_score": 0.7063,
+            "usage_score": 0.5322,
+            "composite_score": 0.8477,
+            "total_score": 4.24,
+            "duration": 176.2,
+            "cost": 0.233879
           },
           "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 2,
+            "passed": true,
+            "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7881,
+            "efficiency_score": 0.7583,
             "usage_score": 0,
-            "composite_score": 0.1576,
-            "total_score": 2.79,
-            "duration": 127.15,
-            "cost": 0.744218
+            "composite_score": 0.7517,
+            "total_score": 3.76,
+            "duration": 145.02,
+            "cost": 0.559202
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7996,
-            "usage_score": 0.918,
-            "composite_score": 0.9435,
-            "total_score": 4.72,
-            "duration": 120.27,
-            "cost": 0.040996
+            "efficiency_score": 0.8806,
+            "usage_score": 0.9508,
+            "composite_score": 0.9663,
+            "total_score": 4.83,
+            "duration": 71.64,
+            "cost": 0.024616
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.7892,
-            "usage_score": 0.3318,
-            "composite_score": 0.8242,
-            "total_score": 4.12,
-            "duration": 126.46,
-            "cost": 0.334076
+            "efficiency_score": 0.8964,
+            "usage_score": 0.7447,
+            "composite_score": 0.9282,
+            "total_score": 4.64,
+            "duration": 62.19,
+            "cost": 0.127642
           }
         }
       },
@@ -1539,47 +1539,47 @@
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
             "passed": true,
-            "task_score": 7.0,
+            "task_score": 6.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.4259,
-            "usage_score": 0.6482,
-            "composite_score": 0.8148,
-            "total_score": 8.07,
-            "duration": 344.45,
-            "cost": 0.527692
+            "efficiency_score": 0.6271,
+            "usage_score": 0.7832,
+            "composite_score": 0.882,
+            "total_score": 7.41,
+            "duration": 223.76,
+            "cost": 0.325231
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
-            "task_score": 7.0,
+            "task_score": 6.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.6641,
-            "usage_score": 0.2713,
-            "composite_score": 0.7871,
-            "total_score": 7.94,
-            "duration": 201.52,
-            "cost": 1.092992
+            "efficiency_score": 0.7771,
+            "usage_score": 0.6216,
+            "composite_score": 0.8797,
+            "total_score": 7.4,
+            "duration": 133.76,
+            "cost": 0.567618
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 5.0,
+            "passed": true,
+            "task_score": 7.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.5865,
-            "usage_score": 0.9251,
-            "composite_score": 0.3023,
-            "total_score": 6.51,
-            "duration": 248.12,
-            "cost": 0.1123
+            "efficiency_score": 0.7356,
+            "usage_score": 0.9589,
+            "composite_score": 0.9389,
+            "total_score": 8.69,
+            "duration": 158.65,
+            "cost": 0.061596
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 7.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.7813,
-            "usage_score": 0.7971,
-            "composite_score": 0.9157,
-            "total_score": 8.58,
-            "duration": 131.21,
-            "cost": 0.304347
+            "efficiency_score": 0.8542,
+            "usage_score": 0.8649,
+            "composite_score": 0.9438,
+            "total_score": 8.72,
+            "duration": 87.5,
+            "cost": 0.202584
           }
         }
       },
@@ -1590,45 +1590,45 @@
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.1944,
-            "usage_score": 0.3343,
-            "composite_score": 0.7057,
-            "total_score": 9.53,
-            "duration": 725.04,
-            "cost": 1.464523
+            "efficiency_score": 0.483,
+            "usage_score": 0.709,
+            "composite_score": 0.8384,
+            "total_score": 10.19,
+            "duration": 465.33,
+            "cost": 0.640286
           },
           "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 0.8,
+            "passed": true,
+            "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.8048,
-            "usage_score": 0.6023,
-            "composite_score": 0.2814,
-            "total_score": 2.21,
-            "duration": 175.7,
-            "cost": 0.874832
+            "efficiency_score": 0.6995,
+            "usage_score": 0.3796,
+            "composite_score": 0.8158,
+            "total_score": 10.08,
+            "duration": 270.41,
+            "cost": 1.364906
           },
           "dashscope/qwen3.5-flash": {
             "passed": false,
-            "task_score": 0.8,
+            "task_score": 0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.5804,
-            "usage_score": 0.9308,
-            "composite_score": 0.3022,
-            "total_score": 2.31,
-            "duration": 377.66,
-            "cost": 0.152193
+            "efficiency_score": 0,
+            "usage_score": 0.7188,
+            "composite_score": 0.1438,
+            "total_score": 0.72,
+            "duration": 900.0,
+            "cost": 0.618624
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.6364,
-            "usage_score": 0.5803,
-            "composite_score": 0.8433,
-            "total_score": 10.22,
-            "duration": 327.24,
-            "cost": 0.923247
+            "efficiency_score": 0.7449,
+            "usage_score": 0.7287,
+            "composite_score": 0.8947,
+            "total_score": 10.47,
+            "duration": 229.56,
+            "cost": 0.596822
           }
         }
       },
@@ -1639,45 +1639,45 @@
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.7334,
-            "usage_score": 0.8429,
-            "composite_score": 0.9153,
-            "total_score": 7.58,
-            "duration": 143.94,
-            "cost": 0.188553
+            "efficiency_score": 0.8189,
+            "usage_score": 0.9154,
+            "composite_score": 0.9469,
+            "total_score": 7.73,
+            "duration": 97.77,
+            "cost": 0.101474
           },
           "dashscope/qwen3.6-plus": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.7553,
-            "usage_score": 0.4473,
-            "composite_score": 0.8405,
-            "total_score": 7.2,
-            "duration": 132.12,
-            "cost": 0.663296
+            "efficiency_score": 0.796,
+            "usage_score": 0.5724,
+            "composite_score": 0.8737,
+            "total_score": 7.37,
+            "duration": 110.18,
+            "cost": 0.513078
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.8067,
-            "usage_score": 0.9772,
-            "composite_score": 0.9568,
-            "total_score": 7.78,
-            "duration": 104.38,
-            "cost": 0.027413
+            "efficiency_score": 0.8965,
+            "usage_score": 0.9847,
+            "composite_score": 0.9762,
+            "total_score": 7.88,
+            "duration": 55.92,
+            "cost": 0.018326
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.8015,
-            "usage_score": 0.8021,
-            "composite_score": 0.9207,
-            "total_score": 7.6,
-            "duration": 107.2,
-            "cost": 0.237437
+            "efficiency_score": 0.9125,
+            "usage_score": 0.9143,
+            "composite_score": 0.9654,
+            "total_score": 7.83,
+            "duration": 47.24,
+            "cost": 0.102852
           }
         }
       },
@@ -1689,44 +1689,44 @@
             "task_score": 7.5,
             "task_max_score": 7.5,
             "efficiency_score": 0,
-            "usage_score": 0.9875,
-            "composite_score": 0.7975,
-            "total_score": 8.49,
+            "usage_score": 0.3885,
+            "composite_score": 0.6777,
+            "total_score": 7.89,
             "duration": 660.0,
-            "cost": 0.018818
+            "cost": 0.917267
           },
           "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 0,
+            "passed": true,
+            "task_score": 7.5,
             "task_max_score": 7.5,
-            "efficiency_score": 0.9969,
-            "usage_score": 1.0,
-            "composite_score": 0.3994,
-            "total_score": 2.0,
-            "duration": 2.02,
-            "cost": null
+            "efficiency_score": 0,
+            "usage_score": 0,
+            "composite_score": 0.6,
+            "total_score": 7.5,
+            "duration": 660.0,
+            "cost": 3.334074
           },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 3.5,
+            "passed": true,
+            "task_score": 7.5,
             "task_max_score": 7.5,
-            "efficiency_score": 0.0325,
-            "usage_score": 0.797,
-            "composite_score": 0.1659,
-            "total_score": 4.33,
-            "duration": 638.54,
-            "cost": 0.304568
+            "efficiency_score": 0.0856,
+            "usage_score": 0.8027,
+            "composite_score": 0.7777,
+            "total_score": 8.39,
+            "duration": 603.52,
+            "cost": 0.295921
           },
           "dashscope/qwen3.6-flash": {
             "passed": false,
-            "task_score": 4.0,
+            "task_score": 5.5,
             "task_max_score": 7.5,
-            "efficiency_score": 0.5637,
-            "usage_score": 0.4845,
-            "composite_score": 0.2096,
-            "total_score": 5.05,
-            "duration": 287.98,
-            "cost": 0.773199
+            "efficiency_score": 0,
+            "usage_score": 0.9834,
+            "composite_score": 0.1967,
+            "total_score": 6.48,
+            "duration": 660.0,
+            "cost": 0.024974
           }
         }
       },
@@ -1734,48 +1734,48 @@
         "name": "GitHub Issue Triage Deep",
         "results_by_model": {
           "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 0,
+            "passed": true,
+            "task_score": 8.5,
             "task_max_score": 8.5,
-            "efficiency_score": 0.997,
-            "usage_score": 1.0,
-            "composite_score": 0.3994,
-            "total_score": 2.0,
-            "duration": 2.02,
-            "cost": null
+            "efficiency_score": 0.782,
+            "usage_score": 0.8912,
+            "composite_score": 0.9346,
+            "total_score": 10.17,
+            "duration": 148.26,
+            "cost": 0.163233
           },
           "dashscope/qwen3.6-plus": {
             "passed": false,
             "task_score": 4.8,
             "task_max_score": 8.5,
-            "efficiency_score": 0.6943,
-            "usage_score": 0.2855,
-            "composite_score": 0.196,
-            "total_score": 5.78,
-            "duration": 207.85,
-            "cost": 1.071704
+            "efficiency_score": 0.7832,
+            "usage_score": 0.6048,
+            "composite_score": 0.2776,
+            "total_score": 6.19,
+            "duration": 147.41,
+            "cost": 0.592826
           },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 8.5,
             "task_max_score": 8.5,
-            "efficiency_score": 0.4526,
-            "usage_score": 0.9455,
-            "composite_score": 0.8796,
-            "total_score": 9.9,
-            "duration": 372.21,
-            "cost": 0.081701
+            "efficiency_score": 0.7463,
+            "usage_score": 0.9535,
+            "composite_score": 0.94,
+            "total_score": 10.2,
+            "duration": 172.54,
+            "cost": 0.069738
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 8.5,
             "task_max_score": 8.5,
-            "efficiency_score": 0.6201,
-            "usage_score": 0.6126,
-            "composite_score": 0.8465,
-            "total_score": 9.73,
-            "duration": 258.3,
-            "cost": 0.581136
+            "efficiency_score": 0.8418,
+            "usage_score": 0.8376,
+            "composite_score": 0.9359,
+            "total_score": 10.18,
+            "duration": 107.59,
+            "cost": 0.243587
           }
         }
       }
diff --git a/pyproject.toml b/pyproject.toml
index 5eba3ec..d467cd4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,5 +76,5 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "50a52fad63b96a48082146739ab40feafbc37423" }
-openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "50a52fad63b96a48082146739ab40feafbc37423" }
+openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
+openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
diff --git a/uv.lock b/uv.lock
index 8781ac2..ae027e7 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1678,8 +1678,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=50a52fad63b96a48082146739ab40feafbc37423" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=50a52fad63b96a48082146739ab40feafbc37423" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2224,7 +2224,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=50a52fad63b96a48082146739ab40feafbc37423#50a52fad63b96a48082146739ab40feafbc37423" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282#1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2244,7 +2244,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=50a52fad63b96a48082146739ab40feafbc37423#50a52fad63b96a48082146739ab40feafbc37423" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282#1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 9dc3dff6d75fe5c20d12c6925af96d3b4270378d Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Thu, 7 May 2026 12:57:13 +0800
Subject: [PATCH 14/14] chore: pre-commit autoformat + refresh stale
 toolset/prompt tests

CI surfaced two issues:
1. Pre-commit (black + extension prettier) wanted to reformat several
   files touched by recent commits. Apply the formatter pass.
2. Three unit tests still asserted the pre-pixel-paradigm contract
   (`['tab', 'highlight', 'element_interaction', ...]` toolset; tab
   prompt mentioning `default highlight element_type:"any" page 1`).
   Update them to the current pixel-paradigm shape: `['tab', 'mouse',
   'keyboard', 'dialog', 'select_option', 'upload_file', ...]` toolset
   and the new "virtual cursor visible" contract from the rewritten
   tab prompt.

499 pytest pass / 4 skipped locally. No production code changed beyond
the formatter.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/background/index.ts             |  26 ++--
 extension/src/commands/pixel-actions.ts       |  47 ++++----
 .../src/commands/pixel-confirm-render.ts      |  31 +++--
 .../src/commands/pixel-target-analyzer.ts     |   8 +-
 extension/src/commands/virtual-cursor.ts      |   4 +-
 server/agent/api.py                           |   1 +
 server/agent/tools/browser_executor.py        | 114 ++++++------------
 server/agent/tools/mouse_tool.py              |  14 +--
 server/agent/tools/select_option_tool.py      |   8 +-
 server/agent/tools/upload_file_tool.py        |   8 +-
 server/core/processor.py                      |   4 +-
 .../tests/unit/test_agent_manager_process.py  |  16 ++-
 server/tests/unit/test_prompt_contracts.py    |   4 +-
 server/tests/unit/test_tab_tool.py            |   4 +-
 14 files changed, 119 insertions(+), 170 deletions(-)

diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index 24e5986..db444a3 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -1869,21 +1869,17 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         await tabManager.ensureTabManaged(activeTabId, conversationId);
         tabManager.updateTabActivity(activeTabId, conversationId);
 
-        const rendered = await renderPixelConfirm(
-          activeTabId,
-          conversationId,
-          {
-            mode: command.mode,
-            x: command.x,
-            y: command.y,
-            target_bbox: command.target_bbox,
-            candidate_bboxes: command.candidate_bboxes,
-            target_selector: command.target_selector,
-            candidate_selectors: command.candidate_selectors,
-            banner_kind: command.banner_kind,
-            drag_end: command.drag_end,
-          },
-        );
+        const rendered = await renderPixelConfirm(activeTabId, conversationId, {
+          mode: command.mode,
+          x: command.x,
+          y: command.y,
+          target_bbox: command.target_bbox,
+          candidate_bboxes: command.candidate_bboxes,
+          target_selector: command.target_selector,
+          candidate_selectors: command.candidate_selectors,
+          banner_kind: command.banner_kind,
+          drag_end: command.drag_end,
+        });
 
         return {
           success: true,
diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index 8a4c906..9afda56 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -79,12 +79,12 @@ async function getViewport(
       0,
     );
     const value = probe?.result?.value;
-    const w = typeof value?.width === 'number' && value.width > 0
-      ? value.width
-      : 1280;
-    const h = typeof value?.height === 'number' && value.height > 0
-      ? value.height
-      : 720;
+    const w =
+      typeof value?.width === 'number' && value.width > 0 ? value.width : 1280;
+    const h =
+      typeof value?.height === 'number' && value.height > 0
+        ? value.height
+        : 720;
     return { width: w, height: h };
   } catch {
     return { width: 1280, height: 720 };
@@ -515,10 +515,7 @@ async function detectNativeFormControl(
     );
     return res?.result?.value ?? null;
   } catch (err) {
-    console.warn(
-      '[PixelActions] Native form-control hit-test failed:',
-      err,
-    );
+    console.warn('[PixelActions] Native form-control hit-test failed:', err);
     return null;
   }
 }
@@ -788,7 +785,10 @@ export async function performMouseScroll(
 // a time — feels like a human typing and lets per-character JS handlers
 // (autocomplete, validation) react in order. Anything outside this map
 // (CJK, emoji, accented Latin, etc.) falls through to `Input.insertText`.
-const SHIFT_PUNCT: Record<string, { key: string; code: string; keyCode: number }> = {
+const SHIFT_PUNCT: Record<
+  string,
+  { key: string; code: string; keyCode: number }
+> = {
   '!': { key: '!', code: 'Digit1', keyCode: 49 },
   '@': { key: '@', code: 'Digit2', keyCode: 50 },
   '#': { key: '#', code: 'Digit3', keyCode: 51 },
@@ -811,7 +811,10 @@ const SHIFT_PUNCT: Record<string, { key: string; code: string; keyCode: number }
   '?': { key: '?', code: 'Slash', keyCode: 191 },
   '~': { key: '~', code: 'Backquote', keyCode: 192 },
 };
-const PLAIN_PUNCT: Record<string, { key: string; code: string; keyCode: number }> = {
+const PLAIN_PUNCT: Record<
+  string,
+  { key: string; code: string; keyCode: number }
+> = {
   '`': { key: '`', code: 'Backquote', keyCode: 192 },
   '-': { key: '-', code: 'Minus', keyCode: 189 },
   '=': { key: '=', code: 'Equal', keyCode: 187 },
@@ -1118,13 +1121,10 @@ export async function performKeyboardClear(
     return { cleared: false, reason: 'focused element is not editable: ' + describe() };
   })()`;
   const resp = await cdp.sendCommand<{
-    result?: { value?: { cleared?: boolean; target?: string; reason?: string } };
-  }>(
-    'Runtime.evaluate',
-    { expression: expr, returnByValue: true },
-    8000,
-    0,
-  );
+    result?: {
+      value?: { cleared?: boolean; target?: string; reason?: string };
+    };
+  }>('Runtime.evaluate', { expression: expr, returnByValue: true }, 8000, 0);
   const value = resp?.result?.value || {};
   return {
     cleared: !!value.cleared,
@@ -1234,12 +1234,9 @@ export async function performSelectOption(
     return { ok: true, selected: matched };
   })(${JSON.stringify(values)})`;
   try {
-    const r = await cdp.sendCommand<{ result?: { value?: SelectOptionResult } }>(
-      'Runtime.evaluate',
-      { expression: expr, returnByValue: true },
-      8000,
-      0,
-    );
+    const r = await cdp.sendCommand<{
+      result?: { value?: SelectOptionResult };
+    }>('Runtime.evaluate', { expression: expr, returnByValue: true }, 8000, 0);
     return r?.result?.value ?? { ok: false, error: 'no_result' };
   } catch (err) {
     return {
diff --git a/extension/src/commands/pixel-confirm-render.ts b/extension/src/commands/pixel-confirm-render.ts
index 6cb22b9..58cfd7d 100644
--- a/extension/src/commands/pixel-confirm-render.ts
+++ b/extension/src/commands/pixel-confirm-render.ts
@@ -98,9 +98,10 @@ function unionBbox(boxes: BBox[]): BBox {
   return { x: x1, y: y1, width: x2 - x1, height: y2 - y1 };
 }
 
-function chooseCropCenter(
-  request: PixelConfirmRenderRequest,
-): { center: PointXY; focusBbox: BBox } {
+function chooseCropCenter(request: PixelConfirmRenderRequest): {
+  center: PointXY;
+  focusBbox: BBox;
+} {
   if (request.mode === 'pixel_hit' && request.target_bbox) {
     const focus = request.drag_end
       ? unionBbox([
@@ -461,22 +462,18 @@ export async function renderPixelConfirm(
       : 0;
 
   const [, base64] = screenshotDataUrl.split(',');
-  const header = screenshotDataUrl.slice(
-    0,
-    screenshotDataUrl.indexOf(','),
+  const header = screenshotDataUrl.slice(0, screenshotDataUrl.indexOf(','));
+  const mimeType = header.substring(
+    header.indexOf(':') + 1,
+    header.indexOf(';'),
   );
-  const mimeType = header.substring(header.indexOf(':') + 1, header.indexOf(';'));
   const binary = atob(base64);
   const bytes = new Uint8Array(binary.length);
   for (let i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
-  const bitmap = await createImageBitmap(
-    new Blob([bytes], { type: mimeType }),
-  );
+  const bitmap = await createImageBitmap(new Blob([bytes], { type: mimeType }));
 
-  const actualScaleX =
-    viewportWidth > 0 ? bitmap.width / viewportWidth : 1;
-  const actualScaleY =
-    viewportHeight > 0 ? bitmap.height / viewportHeight : 1;
+  const actualScaleX = viewportWidth > 0 ? bitmap.width / viewportWidth : 1;
+  const actualScaleY = viewportHeight > 0 ? bitmap.height / viewportHeight : 1;
   const scale = (actualScaleX + actualScaleY) / 2 || 1;
 
   const crop = calculateCrop(bitmap.width, bitmap.height, scale, request);
@@ -549,12 +546,12 @@ export async function renderPixelConfirm(
   const compressed =
     typeof compressedRaw === 'string'
       ? compressedRaw
-      : (compressedRaw &&
+      : compressedRaw &&
           typeof compressedRaw === 'object' &&
           'imageData' in compressedRaw &&
           typeof compressedRaw.imageData === 'string'
-          ? compressedRaw.imageData
-          : dataUrl);
+        ? compressedRaw.imageData
+        : dataUrl;
 
   return {
     screenshot_data_url: compressed,
diff --git a/extension/src/commands/pixel-target-analyzer.ts b/extension/src/commands/pixel-target-analyzer.ts
index 8ed1dda..3bf6077 100644
--- a/extension/src/commands/pixel-target-analyzer.ts
+++ b/extension/src/commands/pixel-target-analyzer.ts
@@ -171,8 +171,12 @@ export async function analyzePixelTargets(
   // center is hundreds of pixels away — useless guidance for course
   // correction. The hit element (smallest containing) is reported on its
   // own; everything else must be a true outside-but-close neighbor.
-  const isOutside = (b: { x: number; y: number; width: number; height: number }) =>
-    x < b.x || x > b.x + b.width || y < b.y || y > b.y + b.height;
+  const isOutside = (b: {
+    x: number;
+    y: number;
+    width: number;
+    height: number;
+  }) => x < b.x || x > b.x + b.width || y < b.y || y > b.y + b.height;
 
   // Two distinct distance metrics, each with its own threshold:
   //
diff --git a/extension/src/commands/virtual-cursor.ts b/extension/src/commands/virtual-cursor.ts
index f62fd59..f83af45 100644
--- a/extension/src/commands/virtual-cursor.ts
+++ b/extension/src/commands/virtual-cursor.ts
@@ -190,9 +190,7 @@ export async function resolveCursorOrCenter(
       0,
     );
     const value = (probe as { result?: { value?: unknown } } | undefined)
-      ?.result?.value as
-      | { width?: number; height?: number }
-      | undefined;
+      ?.result?.value as { width?: number; height?: number } | undefined;
     const w =
       typeof value?.width === 'number' && value.width > 0 ? value.width : 1280;
     const h =
diff --git a/server/agent/api.py b/server/agent/api.py
index ebf6186..726bf80 100644
--- a/server/agent/api.py
+++ b/server/agent/api.py
@@ -508,6 +508,7 @@ def initialize_agent():
         from .tools.keyboard_tool import KeyboardTool
         from .tools.select_option_tool import SelectOptionTool  # noqa: F401
         from .tools.upload_file_tool import UploadFileTool  # noqa: F401
+
         # Imported for legacy tooling (routine recording) — not registered
         # for the live agent.
         from .tools.highlight_tool import HighlightTool  # noqa: F401
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 2398992..be42fe7 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -342,10 +342,9 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
             rejected_pixel_candidates: list = []
             if should_clear:
                 pending = self._get_pending_confirmation()
-                if (
-                    pending
-                    and pending.get("action_type")
-                    in ("mouse_click_pixel", "mouse_drag_pixel")
+                if pending and pending.get("action_type") in (
+                    "mouse_click_pixel",
+                    "mouse_drag_pixel",
                 ):
                     cands = (pending.get("extra_data") or {}).get("candidates")
                     if isinstance(cands, list):
@@ -379,10 +378,7 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation:
             # agent has them fresh when course-correcting. Skip if the new
             # action triggered its own pixel gate (its message already
             # contains a fresh candidates block).
-            if (
-                rejected_pixel_candidates
-                and getattr(obs, "success", False)
-            ):
+            if rejected_pixel_candidates and getattr(obs, "success", False):
                 new_pending = self._get_pending_confirmation()
                 new_is_pixel_gate = bool(
                     new_pending
@@ -1127,9 +1123,7 @@ def _denormalize_xy(
     PIXEL_GATE_RADIUS_CSS = 30
     PIXEL_GATE_CANDIDATE_LIMIT = 5
 
-    def _gate_pixel_target(
-        self, x_css: int, y_css: int
-    ) -> Optional[Dict[str, Any]]:
+    def _gate_pixel_target(self, x_css: int, y_css: int) -> Optional[Dict[str, Any]]:
         """Probe (x, y) for the hit element + nearby interactables.
 
         Returns the analysis dict from the extension on success, or None if
@@ -1275,9 +1269,7 @@ def _format_pixel_candidates_block(
             cx = cn.get("x")
             cy = cn.get("y")
             if cx is not None and cy is not None:
-                element_lines[0] = (
-                    f"{element_lines[0]}  → center=({cx}, {cy})"
-                )
+                element_lines[0] = f"{element_lines[0]}  → center=({cx}, {cy})"
             lines.extend(element_lines)
         return "\n".join(lines)
 
@@ -1419,14 +1411,16 @@ def _gate_pixel_click(
             c["bbox"] for c in neighborhood if isinstance(c.get("bbox"), dict)
         ]
         candidate_selectors = [
-            c.get("selector") for c in neighborhood
+            c.get("selector")
+            for c in neighborhood
             if isinstance(c.get("selector"), str)
         ]
         target_bbox = (
             hit.get("bbox") if hit and isinstance(hit.get("bbox"), dict) else None
         )
         target_selector = (
-            hit.get("selector") if hit and isinstance(hit.get("selector"), str)
+            hit.get("selector")
+            if hit and isinstance(hit.get("selector"), str)
             else None
         )
 
@@ -1507,12 +1501,11 @@ def _gate_pixel_drag(
         focus_neighborhood = (focus_gate or {}).get("neighborhood") or []
         candidates = self._serialize_pixel_candidates(focus_neighborhood, vw, vh)
         candidate_bboxes = [
-            c["bbox"]
-            for c in focus_neighborhood
-            if isinstance(c.get("bbox"), dict)
+            c["bbox"] for c in focus_neighborhood if isinstance(c.get("bbox"), dict)
         ]
         candidate_selectors = [
-            c.get("selector") for c in focus_neighborhood
+            c.get("selector")
+            for c in focus_neighborhood
             if isinstance(c.get("selector"), str)
         ]
         target_bbox = (
@@ -1618,17 +1611,13 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                     )
                 elif self._click_was_a_no_op(result_dict):
                     serialized = extra.get("candidates") or []
-                    message += self._format_no_op_warning_from_candidates(
-                        serialized
-                    )
+                    message += self._format_no_op_warning_from_candidates(serialized)
                     # Same overlay as the direct-click path so the live
                     # page visually surfaces the candidates the agent was
                     # given as alternatives.
                     px, py = extra.get("px"), extra.get("py")
                     if isinstance(px, int) and isinstance(py, int):
-                        self._draw_no_op_overlay_from_serialized(
-                            (px, py), serialized
-                        )
+                        self._draw_no_op_overlay_from_serialized((px, py), serialized)
                 return self._build_observation_from_result(result_dict, message)
 
             if action_type == "mouse_drag_pixel":
@@ -1639,9 +1628,7 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                 button = extra.get("button", "left")
                 steps = int(extra.get("steps", 10))
                 if None in (sx, sy, ex, ey):
-                    raise ValueError(
-                        "Pending drag is missing endpoint coordinates."
-                    )
+                    raise ValueError("Pending drag is missing endpoint coordinates.")
                 command = MouseDragCommand(
                     start_x=int(sx),
                     start_y=int(sy),
@@ -1668,17 +1655,13 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                 small_model=self._uses_small_model(),
             )
         except Exception as e:
-            logger.error(
-                "Failed to commit pending pixel action: %s", e, exc_info=True
-            )
+            logger.error("Failed to commit pending pixel action: %s", e, exc_info=True)
             self._clear_pending_confirmation()
             return OpenBrowserObservation(
                 success=False, error=str(e), small_model=self._uses_small_model()
             )
 
-    def _execute_mouse_action(
-        self, action: MouseAction
-    ) -> OpenBrowserObservation:
+    def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
         """Execute one mouse action (move/click/drag/scroll/reset).
 
         Coordinates from Qwen models are in [0, 1000] normalized space and are
@@ -1696,9 +1679,7 @@ def _execute_mouse_action(
         try:
             if kind == "move":
                 if not action.coordinate:
-                    raise ValueError(
-                        "mouse move requires `coordinate: [x, y]`"
-                    )
+                    raise ValueError("mouse move requires `coordinate: [x, y]`")
                 px, py = self._denormalize_xy(
                     action.coordinate[0], action.coordinate[1]
                 )
@@ -1734,11 +1715,7 @@ def _execute_mouse_action(
                     if cursor is not None
                     else None
                 )
-                if (
-                    gate
-                    and gate.get("verdict") == "dense"
-                    and cursor is not None
-                ):
+                if gate and gate.get("verdict") == "dense" and cursor is not None:
                     return self._gate_pixel_click(action, cursor, gate)
 
                 command = MouseClickCommand(
@@ -1750,12 +1727,11 @@ def _execute_mouse_action(
                 result_dict = self._execute_command_sync(command)
                 cx, cy = cursor or (None, None)
                 where = (
-                    f"({cx}, {cy})" if cx is not None and cy is not None
+                    f"({cx}, {cy})"
+                    if cx is not None and cy is not None
                     else "the cursor"
                 )
-                count_note = (
-                    f", count={action.count}" if action.count != 1 else ""
-                )
+                count_note = f", count={action.count}" if action.count != 1 else ""
                 message = f"Clicked {action.button} at {where}{count_note}."
                 intercepted = self._extract_intercepted_form_control(result_dict)
                 if intercepted:
@@ -1786,12 +1762,8 @@ def _execute_mouse_action(
 
                 start_gate = self._gate_pixel_target(sx, sy)
                 end_gate = self._gate_pixel_target(ex, ey)
-                start_dense = bool(
-                    start_gate and start_gate.get("verdict") == "dense"
-                )
-                end_dense = bool(
-                    end_gate and end_gate.get("verdict") == "dense"
-                )
+                start_dense = bool(start_gate and start_gate.get("verdict") == "dense")
+                end_dense = bool(end_gate and end_gate.get("verdict") == "dense")
                 if start_dense or end_dense:
                     return self._gate_pixel_drag(
                         action,
@@ -1832,9 +1804,7 @@ def _execute_mouse_action(
                 )
 
             if kind == "reset":
-                command = ResetMouseCommand(
-                    conversation_id=self.conversation_id
-                )
+                command = ResetMouseCommand(conversation_id=self.conversation_id)
                 result_dict = self._execute_command_sync(command)
                 viewport = self._get_viewport()
                 if viewport is not None:
@@ -1869,9 +1839,7 @@ def _execute_keyboard_action(
                 )
                 result_dict = self._execute_command_sync(command)
                 preview = (
-                    action.text
-                    if len(action.text) <= 32
-                    else action.text[:29] + "..."
+                    action.text if len(action.text) <= 32 else action.text[:29] + "..."
                 )
                 return self._build_observation_from_result(
                     result_dict, f"Typed text: {preview!r}"
@@ -1887,9 +1855,7 @@ def _execute_keyboard_action(
                 )
                 result_dict = self._execute_command_sync(command)
                 mod_text = (
-                    f" with {'+'.join(action.modifiers)}"
-                    if action.modifiers
-                    else ""
+                    f" with {'+'.join(action.modifiers)}" if action.modifiers else ""
                 )
                 return self._build_observation_from_result(
                     result_dict, f"Pressed {action.key}{mod_text}"
@@ -1924,9 +1890,7 @@ def _execute_keyboard_action(
 
             raise ValueError(f"Unknown keyboard action: {kind}")
         except Exception as e:
-            logger.error(
-                f"Keyboard action failed (kind={kind}): {e}", exc_info=True
-            )
+            logger.error(f"Keyboard action failed (kind={kind}): {e}", exc_info=True)
             return OpenBrowserObservation(
                 success=False, error=str(e), small_model=self._uses_small_model()
             )
@@ -2009,9 +1973,7 @@ def _draw_no_op_overlay_from_serialized(
         except Exception as e:
             logger.debug("no-op overlay render failed: %s", e)
 
-    def _format_no_op_warning(
-        self, gate: Optional[Dict[str, Any]]
-    ) -> str:
+    def _format_no_op_warning(self, gate: Optional[Dict[str, Any]]) -> str:
         """Warning text for a click that committed but produced no DOM change.
 
         When `gate` carries a neighborhood from the pixel-target probe, the
@@ -2032,9 +1994,7 @@ def _format_no_op_warning(
             candidates = self._serialize_pixel_candidates(neighborhood, vw, vh)
         return self._format_no_op_warning_from_candidates(candidates)
 
-    def _format_no_op_warning_from_candidates(
-        self, candidates: list
-    ) -> str:
+    def _format_no_op_warning_from_candidates(self, candidates: list) -> str:
         """Render the no-op warning + candidate block from pre-serialized data."""
         lines = [
             "",
@@ -2591,11 +2551,15 @@ def _build_observation_from_result(
                     if raw_vw is None or raw_vh is None:
                         meta = data.get("metadata")
                         if isinstance(meta, dict):
-                            raw_vw = raw_vw if raw_vw is not None else meta.get(
-                                "viewportWidth"
+                            raw_vw = (
+                                raw_vw
+                                if raw_vw is not None
+                                else meta.get("viewportWidth")
                             )
-                            raw_vh = raw_vh if raw_vh is not None else meta.get(
-                                "viewportHeight"
+                            raw_vh = (
+                                raw_vh
+                                if raw_vh is not None
+                                else meta.get("viewportHeight")
                             )
                     if isinstance(raw_vw, (int, float)) and isinstance(
                         raw_vh, (int, float)
diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py
index a850727..555598f 100644
--- a/server/agent/tools/mouse_tool.py
+++ b/server/agent/tools/mouse_tool.py
@@ -31,9 +31,7 @@ def get_mouse_tool_description(conv_state=None) -> str:
     )
 
 
-MouseActionKind = Literal[
-    "move", "click", "drag", "scroll", "reset", "confirm"
-]
+MouseActionKind = Literal["move", "click", "drag", "scroll", "reset", "confirm"]
 
 
 def _validate_coordinate_pair(v: Optional[List[int]]) -> Optional[List[int]]:
@@ -55,9 +53,7 @@ def _validate_coordinate_pair(v: Optional[List[int]]) -> Optional[List[int]]:
             try:
                 n = int(n)
             except (TypeError, ValueError):
-                raise ValueError(
-                    f"coordinate[{i}] must be an integer in [0, 1000]"
-                )
+                raise ValueError(f"coordinate[{i}] must be an integer in [0, 1000]")
         if n < 0 or n > 1000:
             raise ValueError(
                 f"coordinate[{i}] = {n} is outside [0, 1000] normalized space"
@@ -84,7 +80,7 @@ class MouseAction(OpenBrowserAction):
             "cursor there and click in one step; omit `coordinate` to click "
             "at the cursor's current position (use this after a 'move' for a "
             "hover-then-click flow). `count: 2` double-clicks, `count: 3` "
-            "triple-clicks. `button: \"right\"` opens the context menu. "
+            'triple-clicks. `button: "right"` opens the context menu. '
             "'drag' — press at `start_coordinate`, drag to `end_coordinate`, "
             "release. "
             "'scroll' — scroll at the cursor position by `amount` in "
@@ -119,9 +115,7 @@ class MouseAction(OpenBrowserAction):
         ),
     )
 
-    @field_validator(
-        "coordinate", "start_coordinate", "end_coordinate", mode="before"
-    )
+    @field_validator("coordinate", "start_coordinate", "end_coordinate", mode="before")
     @classmethod
     def _check_coord(cls, v):
         return _validate_coordinate_pair(v)
diff --git a/server/agent/tools/select_option_tool.py b/server/agent/tools/select_option_tool.py
index 4863df2..f356712 100644
--- a/server/agent/tools/select_option_tool.py
+++ b/server/agent/tools/select_option_tool.py
@@ -50,17 +50,13 @@ class SelectOptionAction(OpenBrowserAction):
     )
 
 
-class SelectOptionTool(
-    ToolDefinition[SelectOptionAction, OpenBrowserObservation]
-):
+class SelectOptionTool(ToolDefinition[SelectOptionAction, OpenBrowserObservation]):
     """Pick from a native `<select>` after clicking it."""
 
     name = "select_option"
 
     @classmethod
-    def create(
-        cls, conv_state, terminal_executor=None
-    ) -> Sequence["SelectOptionTool"]:
+    def create(cls, conv_state, terminal_executor=None) -> Sequence["SelectOptionTool"]:
         if terminal_executor is not None:
             executor = terminal_executor
         else:
diff --git a/server/agent/tools/upload_file_tool.py b/server/agent/tools/upload_file_tool.py
index 88f5e92..ff0b859 100644
--- a/server/agent/tools/upload_file_tool.py
+++ b/server/agent/tools/upload_file_tool.py
@@ -45,17 +45,13 @@ class UploadFileAction(OpenBrowserAction):
     )
 
 
-class UploadFileTool(
-    ToolDefinition[UploadFileAction, OpenBrowserObservation]
-):
+class UploadFileTool(ToolDefinition[UploadFileAction, OpenBrowserObservation]):
     """Upload file(s) to a native file input after clicking it."""
 
     name = "upload_file"
 
     @classmethod
-    def create(
-        cls, conv_state, terminal_executor=None
-    ) -> Sequence["UploadFileTool"]:
+    def create(cls, conv_state, terminal_executor=None) -> Sequence["UploadFileTool"]:
         if terminal_executor is not None:
             executor = terminal_executor
         else:
diff --git a/server/core/processor.py b/server/core/processor.py
index a8e2e21..d2897b6 100644
--- a/server/core/processor.py
+++ b/server/core/processor.py
@@ -318,9 +318,7 @@ async def _execute_mouse_click(self, command: MouseClickCommand) -> CommandRespo
         response = await self._send_prepared_command(command)
         return response
 
-    async def _execute_mouse_drag(
-        self, command: MouseDragCommand
-    ) -> CommandResponse:
+    async def _execute_mouse_drag(self, command: MouseDragCommand) -> CommandResponse:
         """Execute mouse drag command"""
         response = await self._send_prepared_command(command)
         return response
diff --git a/server/tests/unit/test_agent_manager_process.py b/server/tests/unit/test_agent_manager_process.py
index 232d181..2e4ddf4 100644
--- a/server/tests/unit/test_agent_manager_process.py
+++ b/server/tests/unit/test_agent_manager_process.py
@@ -66,7 +66,7 @@ def test_multi_process_mode_initializes_infrastructure(self) -> None:
         assert manager._ipc_router is not None
 
     def test_large_models_keep_core_browser_toolset(self) -> None:
-        """Large models should expose the four browser tools plus general tools."""
+        """Large models expose the pixel-paradigm browser tools plus general tools."""
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
             manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
@@ -83,9 +83,11 @@ def test_large_models_keep_core_browser_toolset(self) -> None:
 
         assert tool_names == [
             "tab",
-            "highlight",
-            "element_interaction",
+            "mouse",
+            "keyboard",
             "dialog",
+            "select_option",
+            "upload_file",
             "please_help_me",
             "terminal",
             "file_editor",
@@ -93,7 +95,7 @@ def test_large_models_keep_core_browser_toolset(self) -> None:
         ]
 
     def test_small_models_keep_the_same_browser_toolset(self) -> None:
-        """Small models should use the same four browser tools."""
+        """Small models use the same pixel-paradigm browser tools as large models."""
         with patch("server.agent.manager.llm_config_manager") as mock_llm_config:
             manager = OpenBrowserAgentManager()
             mock_llm_config.reload_config.return_value = MagicMock()
@@ -110,9 +112,11 @@ def test_small_models_keep_the_same_browser_toolset(self) -> None:
 
         assert tool_names == [
             "tab",
-            "highlight",
-            "element_interaction",
+            "mouse",
+            "keyboard",
             "dialog",
+            "select_option",
+            "upload_file",
             "please_help_me",
             "terminal",
             "file_editor",
diff --git a/server/tests/unit/test_prompt_contracts.py b/server/tests/unit/test_prompt_contracts.py
index 446c0a1..8698918 100644
--- a/server/tests/unit/test_prompt_contracts.py
+++ b/server/tests/unit/test_prompt_contracts.py
@@ -210,7 +210,9 @@ def test_tab_prompt_points_agents_to_tab_view_for_clean_screenshots(self) -> Non
 
         assert "tab view" in description
         assert "clean screenshot" in description.lower()
-        assert 'default `highlight` `element_type: "any"` page 1' in description
+        # Pixel paradigm: tab actions return a clean screenshot with the
+        # virtual cursor visible — no highlight overlay step required.
+        assert "virtual cursor visible" in description
 
     def test_element_interaction_prompt_requires_click_before_keyboard_input(
         self,
diff --git a/server/tests/unit/test_tab_tool.py b/server/tests/unit/test_tab_tool.py
index 8bd6396..4f2cd65 100644
--- a/server/tests/unit/test_tab_tool.py
+++ b/server/tests/unit/test_tab_tool.py
@@ -91,7 +91,9 @@ def test_description_documents_clean_screenshot_and_history_navigation(
 
         assert "tab view" in description
         assert "clean screenshot" in description.lower()
-        assert 'default `highlight` `element_type: "any"` page 1' in description
+        # Pixel paradigm: tab actions return a clean screenshot with the
+        # virtual cursor visible — replaces the prior highlight pagination.
+        assert "virtual cursor visible" in description
         assert "tab back" in description
         assert "tab forward" in description