From e1429fac5478f9e7d98da8d0229e07454d342e87 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Wed, 29 Apr 2026 20:03:16 +0800 Subject: [PATCH 01/14] feat(agent): pure pixel-level browser interaction with virtual cursor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the highlight + element_id paradigm with a human-like pixel control loop: the agent sees a clean screenshot with a visible cursor sprite and drives the page via a virtual mouse and keyboard. Same toolset for fresh tasks and routine replay; the legacy highlight / element-interaction modules stay on disk for non-agent flows but are no longer exposed. Tools (live agent surface): tab, mouse, keyboard, dialog. - mouse: move (eased lerp), click (in-place — must move first), drag, scroll, reset. Coordinates in Qwen-VL [0,1000] normalized space; the server denormalizes to CSS pixels via the captured viewport. - keyboard: type (one char at a time, real keydown/keypress/input events for ASCII printables, fallback insertText for CJK/emoji), press (named keys + modifiers; Enter/Tab/Space carry text so keypress fires and form-submit works), clear (Ctrl+A → Backspace). - tab: clean screenshots with the cursor in-frame on every action; refresh/view/back/forward auto-fill the active tab_id. Cursor sprite is a 36x36 white-and-black arrow with a red dot and pulsing red ring at the click point, injected via preCaptureScript so it lands in the captured frame even after navigation. Schema: extend MouseClickCommand with optional x/y (now ignored), add MouseDragCommand, drop le=1280/le=720 bounds on MouseMoveCommand, add live_mode flag to BaseCommand for extension routing. Pixel commands finally reach the wire — added MouseDragCommand routing in CommandProcessor.execute and case handlers in the extension switch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- extension/src/background/index.ts | 377 ++++++++-- extension/src/commands/pixel-actions.ts | 693 ++++++++++++++++++ extension/src/commands/tab-manager.ts | 6 + extension/src/commands/virtual-cursor.ts | 215 ++++++ extension/src/types.ts | 20 + server/agent/api.py | 18 +- server/agent/manager.py | 24 +- server/agent/prompts/big_model/dialog_tool.j2 | 4 +- .../agent/prompts/big_model/keyboard_tool.j2 | 56 ++ server/agent/prompts/big_model/mouse_tool.j2 | 84 +++ server/agent/prompts/big_model/tab_tool.j2 | 47 +- .../agent/prompts/small_model/dialog_tool.j2 | 2 +- .../prompts/small_model/keyboard_tool.j2 | 38 + .../agent/prompts/small_model/mouse_tool.j2 | 59 ++ server/agent/prompts/small_model/tab_tool.j2 | 22 +- server/agent/tools/base.py | 15 + server/agent/tools/browser_executor.py | 297 ++++++++ .../agent/tools/element_interaction_tool.py | 3 + server/agent/tools/highlight_tool.py | 3 + server/agent/tools/keyboard_tool.py | 107 +++ server/agent/tools/mouse_tool.py | 157 ++++ server/api/routes/commands.py | 10 +- server/core/processor.py | 35 +- server/models/commands.py | 68 +- 24 files changed, 2249 insertions(+), 111 deletions(-) create mode 100644 extension/src/commands/pixel-actions.ts create mode 100644 extension/src/commands/virtual-cursor.ts create mode 100644 server/agent/prompts/big_model/keyboard_tool.j2 create mode 100644 server/agent/prompts/big_model/mouse_tool.j2 create mode 100644 server/agent/prompts/small_model/keyboard_tool.j2 create mode 100644 server/agent/prompts/small_model/mouse_tool.j2 create mode 100644 server/agent/tools/keyboard_tool.py create mode 100644 server/agent/tools/mouse_tool.py diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 3cf9d86..280660c 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -19,6 +19,20 @@ import { tabManager } from '../commands/tab-manager'; import { javascript } from 
'../commands/javascript'; import { debuggerSessionManager } from '../commands/debugger-manager'; import { dialogManager } from '../commands/dialog'; +import { + buildCursorInjectScript, + resolveCursorOrCenter, + getCursorPosition, +} from '../commands/virtual-cursor'; +import { + performMouseMove, + performMouseClick, + performMouseDrag, + performMouseScroll, + performKeyboardType, + performKeyboardPress, + performResetMouse, +} from '../commands/pixel-actions'; import { clearScreenshotCache } from '../commands/computer'; import { @@ -688,6 +702,12 @@ interface ScreenshotPayload { screenshot?: string; dialog_auto_accepted?: unknown; dialog_auto_accepted_list?: unknown; + // Viewport metadata in CSS pixels — required by the live agent for + // denormalizing Qwen-VL [0,1000] coordinates to real pixels before + // dispatching CDP input events. + viewport_width?: number; + viewport_height?: number; + device_pixel_ratio?: number; } interface HighlightedPageStateData extends ScreenshotPayload { @@ -716,10 +736,16 @@ function buildScreenshotPayload( imageData?: string; dialog_auto_accepted?: unknown; dialog_auto_accepted_list?: unknown; + metadata?: { + viewportWidth?: number; + viewportHeight?: number; + devicePixelRatio?: number; + }; } | null | undefined, ): ScreenshotPayload { + const meta = screenshotResult?.metadata; return { screenshot: screenshotResult?.imageData, ...(screenshotResult?.dialog_auto_accepted @@ -732,6 +758,15 @@ function buildScreenshotPayload( dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list, } : {}), + ...(typeof meta?.viewportWidth === 'number' + ? { viewport_width: meta.viewportWidth } + : {}), + ...(typeof meta?.viewportHeight === 'number' + ? { viewport_height: meta.viewportHeight } + : {}), + ...(typeof meta?.devicePixelRatio === 'number' + ? 
{ device_pixel_ratio: meta.devicePixelRatio } + : {}), }; } @@ -1042,6 +1077,7 @@ async function captureHighlightedPageState( imageData: screenshotResult.imageData, dialog_auto_accepted: screenshotResult.dialog_auto_accepted, dialog_auto_accepted_list: screenshotResult.dialog_auto_accepted_list, + metadata: screenshotResult.metadata, }); console.log( `⏱️ [HighlightTrace] background compress ${Date.now() - compressStart}ms`, @@ -1073,6 +1109,44 @@ async function captureHighlightedPageState( throw new Error('Failed to produce a stable highlight screenshot'); } +/** + * Live-mode capture: a clean (no-highlight) screenshot with the virtual + * cursor injected via preCaptureScript. Used by the live pixel-only agent + * path on tab navigation, dialog handling, etc., in place of the highlight + * pipeline. Returns the same `ScreenshotPayload` shape that + * `captureDefaultHighlightedPageState` falls back to on failure, so callers + * don't need shape-aware branching. + */ +async function captureLiveCleanPageState(options: { + tabId: number; + conversationId: string; + logLabel: string; + waitForRender?: number; + captureOptions?: ScreenshotCaptureOptions; +}): Promise { + const { + tabId, + conversationId, + logLabel, + waitForRender = 350, + captureOptions = TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, + } = options; + const cursor = await resolveCursorOrCenter(tabId, conversationId); + const screenshotResult = await captureScreenshot( + tabId, + conversationId, + true, // includeCursor (no-op for CDP, kept for legacy callers) + 90, + false, + waitForRender, + captureOptions, + buildCursorInjectScript(cursor.x, cursor.y), + ); + const compressed = await compressScreenshotResult(screenshotResult); + console.log(`✅ [${logLabel}] Live clean screenshot captured`); + return buildScreenshotPayload(compressed); +} + async function captureDefaultHighlightedPageState(options: { tabId: number; conversationId: string; @@ -1185,6 +1259,13 @@ function isHeavyBrowserCommand(data: any): 
boolean { case 'select_element': case 'upload_file': case 'handle_dialog': + case 'mouse_move': + case 'mouse_click': + case 'mouse_drag': + case 'mouse_scroll': + case 'keyboard_type': + case 'keyboard_press': + case 'reset_mouse': return true; case 'tab': return ( @@ -1664,6 +1745,17 @@ async function handleCommand(command: Command): Promise { await tabManager.ensureTabManaged(activeTabId, conversationId); tabManager.updateTabActivity(activeTabId, conversationId); + // Resolve the virtual cursor position (defaults to viewport center on + // first call) and inject it into the page DOM via preCaptureScript so + // it appears in the captured frame. Live agents always see a cursor. + const cursorBeforeShot = + command.include_visual_mouse !== false + ? await resolveCursorOrCenter(activeTabId, conversationId) + : null; + const cursorPreCaptureScript = cursorBeforeShot + ? buildCursorInjectScript(cursorBeforeShot.x, cursorBeforeShot.y) + : undefined; + // Take screenshot in background (no tab activation) const screenshotResult = await captureScreenshot( activeTabId, @@ -1672,6 +1764,8 @@ async function handleCommand(command: Command): Promise { command.quality || 90, false, // resizeToPreset: false for WYSIWYG mode 0, // waitForRender + undefined, // capture options + cursorPreCaptureScript, ); const compressedScreenshotResult = await compressScreenshotResult(screenshotResult); @@ -1686,6 +1780,151 @@ async function handleCommand(command: Command): Promise { }; } + // ============== Pixel-level mouse / keyboard ============== + // The live agent uses these instead of the highlight + element-id flow. + // The server has already denormalized Qwen [0,1000] coords to CSS px. 
+ case 'mouse_move': + case 'mouse_click': + case 'mouse_drag': + case 'mouse_scroll': + case 'keyboard_type': + case 'keyboard_press': + case 'reset_mouse': { + if (!command.conversation_id) { + throw new Error( + `conversation_id is required for ${command.type} command (strict mode)`, + ); + } + const conversationId = command.conversation_id; + const activeTabId = tabManager.getCurrentActiveTabId(conversationId); + if (!activeTabId) { + throw new Error( + `No active tab found for conversation ${conversationId}. Use tab init first.`, + ); + } + await tabManager.ensureTabManaged(activeTabId, conversationId); + tabManager.updateTabActivity(activeTabId, conversationId); + + let actionDetail: Record = {}; + try { + switch (command.type) { + case 'mouse_move': { + const r = await performMouseMove( + activeTabId, + conversationId, + command.x, + command.y, + ); + actionDetail = r; + break; + } + case 'mouse_click': { + const r = await performMouseClick( + activeTabId, + conversationId, + command.x, + command.y, + command.button || 'left', + command.count || (command.double ? 
2 : 1), + ); + actionDetail = r; + break; + } + case 'mouse_drag': { + const r = await performMouseDrag( + activeTabId, + conversationId, + command.start_x, + command.start_y, + command.end_x, + command.end_y, + command.button || 'left', + command.steps || 10, + ); + actionDetail = r; + break; + } + case 'mouse_scroll': { + const r = await performMouseScroll( + activeTabId, + conversationId, + command.direction || 'down', + command.amount || 300, + ); + actionDetail = r; + break; + } + case 'keyboard_type': { + const r = await performKeyboardType( + activeTabId, + conversationId, + command.text || '', + ); + actionDetail = r; + break; + } + case 'keyboard_press': { + const r = await performKeyboardPress( + activeTabId, + conversationId, + command.key || '', + command.modifiers, + ); + actionDetail = r; + break; + } + case 'reset_mouse': { + const r = await performResetMouse(activeTabId, conversationId); + actionDetail = r; + break; + } + } + } catch (err) { + throw new Error( + `Pixel action ${command.type} failed: ${err instanceof Error ? err.message : String(err)}`, + ); + } + + // Capture a fresh post-action screenshot with the cursor visible. + // For actions that can navigate or trigger heavy re-render + // (`mouse_click`, `mouse_drag`, `keyboard_press` Enter), give the + // browser a brief settle window so the captured frame reflects the + // new state instead of a transitional DOM. Lighter actions + // (mouse_move, mouse_scroll, keyboard_type, reset_mouse) take 0. + const settleMs = + command.type === 'mouse_click' || + command.type === 'mouse_drag' || + command.type === 'keyboard_press' + ? 350 + : 0; + const cursorAfter = + getCursorPosition(activeTabId) ?? 
+ (await resolveCursorOrCenter(activeTabId, conversationId)); + const postScreenshotResult = await captureScreenshot( + activeTabId, + conversationId, + true, + 90, + false, + settleMs, + undefined, + buildCursorInjectScript(cursorAfter.x, cursorAfter.y), + ); + const compressedPost = + await compressScreenshotResult(postScreenshotResult); + + return { + success: true, + message: `Pixel action ${command.type} completed`, + data: { + ...(compressedPost || {}), + pixel_action: command.type, + ...actionDetail, + }, + timestamp: Date.now(), + }; + } + case 'tab': { // ✅ STRICT MODE: conversation_id is REQUIRED if (!command.conversation_id) { @@ -1713,13 +1952,20 @@ async function handleCommand(command: Command): Promise { // Set the newly created tab as active tabManager.setCurrentActiveTabId(conversationId, initResult.tabId); - // Capture screenshot after initialization - const initPageState = await captureDefaultHighlightedPageState({ - tabId: initResult.tabId, - conversationId, - logLabel: 'Tab Init', - primeWithRawScreenshot: true, - }); + // Capture screenshot after initialization. Live agent path + // returns clean+cursor; replay returns highlight inventory. + const initPageState = command.live_mode + ? await captureLiveCleanPageState({ + tabId: initResult.tabId, + conversationId, + logLabel: 'Tab Init', + }) + : await captureDefaultHighlightedPageState({ + tabId: initResult.tabId, + conversationId, + logLabel: 'Tab Init', + primeWithRawScreenshot: true, + }); return { success: true, @@ -1751,12 +1997,18 @@ async function handleCommand(command: Command): Promise { // Capture screenshot after opening const openPageState = openResult.tabId - ? await captureDefaultHighlightedPageState({ - tabId: openResult.tabId, - conversationId, - logLabel: 'Tab Open', - primeWithRawScreenshot: true, - }) + ? command.live_mode + ? 
await captureLiveCleanPageState({ + tabId: openResult.tabId, + conversationId, + logLabel: 'Tab Open', + }) + : await captureDefaultHighlightedPageState({ + tabId: openResult.tabId, + conversationId, + logLabel: 'Tab Open', + primeWithRawScreenshot: true, + }) : {}; return { @@ -1797,12 +2049,18 @@ async function handleCommand(command: Command): Promise { tabManager.setCurrentActiveTabId(conversationId, command.tab_id); // Capture screenshot after switching - const switchPageState = await captureDefaultHighlightedPageState({ - tabId: command.tab_id, - conversationId, - logLabel: 'Tab Switch', - primeWithRawScreenshot: true, - }); + const switchPageState = command.live_mode + ? await captureLiveCleanPageState({ + tabId: command.tab_id, + conversationId, + logLabel: 'Tab Switch', + }) + : await captureDefaultHighlightedPageState({ + tabId: command.tab_id, + conversationId, + logLabel: 'Tab Switch', + primeWithRawScreenshot: true, + }); return { success: true, @@ -1839,12 +2097,18 @@ async function handleCommand(command: Command): Promise { const refreshResult = await tabs.refreshTab(command.tab_id); // Capture screenshot after refresh - const refreshPageState = await captureDefaultHighlightedPageState({ - tabId: command.tab_id, - conversationId, - logLabel: 'Tab Refresh', - primeWithRawScreenshot: true, - }); + const refreshPageState = command.live_mode + ? await captureLiveCleanPageState({ + tabId: command.tab_id, + conversationId, + logLabel: 'Tab Refresh', + }) + : await captureDefaultHighlightedPageState({ + tabId: command.tab_id, + conversationId, + logLabel: 'Tab Refresh', + primeWithRawScreenshot: true, + }); return { success: true, @@ -1873,6 +2137,14 @@ async function handleCommand(command: Command): Promise { `👁️ [Tab View] Capturing screenshot for tab ${viewActiveTabId}, conversation: ${conversationId}`, ); + // Inject the virtual cursor before capture so live-mode screenshots + // always show the pointer. 
For replay/legacy callers, this is + // harmless — the cursor is just an extra DOM element below the + // highlight overlay's z-index. + const viewCursor = await resolveCursorOrCenter( + viewActiveTabId, + conversationId, + ); const viewScreenshotResult = await captureScreenshot( viewActiveTabId, conversationId, @@ -1881,6 +2153,7 @@ async function handleCommand(command: Command): Promise { false, 350, TAB_VIEW_SCREENSHOT_CAPTURE_OPTIONS, + buildCursorInjectScript(viewCursor.x, viewCursor.y), ); const compressedViewScreenshotResult = await compressScreenshotResult(viewScreenshotResult); @@ -1945,14 +2218,20 @@ async function handleCommand(command: Command): Promise { ? await tabs.goBack(targetTabId) : await tabs.goForward(targetTabId); - const navigationPageState = - await captureDefaultHighlightedPageState({ - tabId: targetTabId, - conversationId, - logLabel: - command.action === 'back' ? 'Tab Back' : 'Tab Forward', - primeWithRawScreenshot: true, - }); + const navigationPageState = command.live_mode + ? await captureLiveCleanPageState({ + tabId: targetTabId, + conversationId, + logLabel: + command.action === 'back' ? 'Tab Back' : 'Tab Forward', + }) + : await captureDefaultHighlightedPageState({ + tabId: targetTabId, + conversationId, + logLabel: + command.action === 'back' ? 'Tab Back' : 'Tab Forward', + primeWithRawScreenshot: true, + }); return { success: true, @@ -2213,11 +2492,17 @@ async function handleCommand(command: Command): Promise { console.log(`💬 [HandleDialog] Auto-accepting cascading alert`); await dialogManager.autoAcceptDialog(activeTabId); - const dialogPageState = await captureDefaultHighlightedPageState({ - tabId: activeTabId, - conversationId, - logLabel: 'HandleDialog', - }); + const dialogPageState = command.live_mode + ? 
await captureLiveCleanPageState({ + tabId: activeTabId, + conversationId, + logLabel: 'HandleDialog', + }) + : await captureDefaultHighlightedPageState({ + tabId: activeTabId, + conversationId, + logLabel: 'HandleDialog', + }); return { success: true, @@ -2254,11 +2539,17 @@ async function handleCommand(command: Command): Promise { }; } - const dialogPageState = await captureDefaultHighlightedPageState({ - tabId: activeTabId, - conversationId, - logLabel: 'HandleDialog', - }); + const dialogPageState = command.live_mode + ? await captureLiveCleanPageState({ + tabId: activeTabId, + conversationId, + logLabel: 'HandleDialog', + }) + : await captureDefaultHighlightedPageState({ + tabId: activeTabId, + conversationId, + logLabel: 'HandleDialog', + }); console.log( `✅ [HandleDialog] Dialog handling complete, screenshot captured`, diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts new file mode 100644 index 0000000..f00a150 --- /dev/null +++ b/extension/src/commands/pixel-actions.ts @@ -0,0 +1,693 @@ +/** + * Pixel-level mouse and keyboard dispatch via CDP. + * + * Used by the live agent path: the model emits Qwen-VL [0,1000] coordinates, + * the server denormalizes them to CSS pixels, and these helpers turn the CSS + * pixels into `Input.dispatchMouseEvent` / `Input.dispatchKeyEvent` calls. + * + * All entry points clamp coordinates to the live viewport before dispatch + * (defense-in-depth) and refresh the in-DOM virtual cursor on every call. + * + * Coordinates are CSS viewport pixels — same space CDP `Input.*` consumes. 
+ */ + +import { CdpCommander } from './cdp-commander'; +import { debuggerSessionManager } from './debugger-manager'; +import { dialogManager } from './dialog'; +import { + buildCursorInjectScript, + setCursorPosition, + getCursorPosition, + resolveCursorOrCenter, + buildViewportProbeScript, +} from './virtual-cursor'; + +/** + * Common pre-flight: ensure CDP debugger is attached AND dialog tracking is + * enabled for this tab. Without dialog tracking, a click that opens a + * confirm/prompt would block subsequent CDP calls without surfacing a + * `dialog_opened` state to the agent. + */ +async function attachWithDialogTracking( + tabId: number, + conversationId: string, +): Promise { + await debuggerSessionManager.attachDebugger(tabId, conversationId); + try { + await dialogManager.enableForTab(tabId); + } catch (err) { + console.warn( + `⚠️ [PixelActions] dialogManager.enableForTab failed on tab ${tabId}:`, + err, + ); + } +} + +const MODIFIER_BITS: Record = { + alt: 1, + control: 2, + ctrl: 2, + meta: 4, + cmd: 4, + command: 4, + shift: 8, +}; + +function modifiersBitmask(modifiers: string[] | undefined | null): number { + if (!modifiers || modifiers.length === 0) return 0; + let mask = 0; + for (const m of modifiers) { + const bit = MODIFIER_BITS[m.toLowerCase()]; + if (bit) mask |= bit; + } + return mask; +} + +async function getViewport( + cdp: CdpCommander, +): Promise<{ width: number; height: number }> { + try { + const probe = await cdp.sendCommand<{ + result?: { value?: { width?: number; height?: number } }; + }>( + 'Runtime.evaluate', + { + expression: buildViewportProbeScript(), + returnByValue: true, + }, + 8000, + 0, + ); + const value = probe?.result?.value; + const w = typeof value?.width === 'number' && value.width > 0 + ? value.width + : 1280; + const h = typeof value?.height === 'number' && value.height > 0 + ? 
value.height + : 720; + return { width: w, height: h }; + } catch { + return { width: 1280, height: 720 }; + } +} + +function clampToViewport( + x: number, + y: number, + vw: number, + vh: number, +): { x: number; y: number; warning?: string } { + let warning: string | undefined; + let cx = Math.round(x); + let cy = Math.round(y); + if (cx < 0 || cx > vw || cy < 0 || cy > vh) { + warning = `(${cx}, ${cy}) outside viewport ${vw}x${vh}; clamped`; + } + cx = Math.max(0, Math.min(vw, cx)); + cy = Math.max(0, Math.min(vh, cy)); + return { x: cx, y: cy, warning }; +} + +async function refreshCursor( + cdp: CdpCommander, + tabId: number, + x: number, + y: number, +): Promise { + setCursorPosition(tabId, x, y); + try { + await cdp.sendCommand( + 'Runtime.evaluate', + { + expression: buildCursorInjectScript(x, y), + returnByValue: true, + }, + 8000, + 0, + ); + } catch (err) { + console.warn( + `⚠️ [PixelActions] Cursor refresh failed on tab ${tabId}:`, + err, + ); + } +} + +// Cubic ease-in-out — slow start, fast middle, slow stop. Mimics a real +// human reach instead of teleporting between two points. +function easeInOut(t: number): number { + return t < 0.5 ? 4 * t * t * t : 1 - Math.pow(-2 * t + 2, 3) / 2; +} + +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +export async function performMouseMove( + tabId: number, + conversationId: string, + x: number, + y: number, +): Promise<{ x: number; y: number; warning?: string }> { + await attachWithDialogTracking(tabId, conversationId); + const cdp = new CdpCommander(tabId); + const { width: vw, height: vh } = await getViewport(cdp); + const target = clampToViewport(x, y, vw, vh); + + // Lerp the cursor from its last position to the target with eased + // intermediate `mouseMoved` events. 
This makes the move look like a + // real human stroke: hover/mouseenter events fire in order along the + // path, and live observers see the cursor sprite glide instead of + // teleporting. The CSS transition on the cursor div smooths the + // visible sprite; the CDP step-through smooths the input-event side. + const start = getCursorPosition(tabId) ?? { x: target.x, y: target.y }; + const dx = target.x - start.x; + const dy = target.y - start.y; + const distance = Math.sqrt(dx * dx + dy * dy); + const steps = Math.max(2, Math.min(30, Math.round(distance / 24))); + // Total move duration scales with distance (≈ 1.5 ms / px) and is + // capped so even cross-screen sweeps complete in well under a second. + const totalMs = Math.max(60, Math.min(450, distance * 1.5)); + const stepDelay = steps > 1 ? totalMs / (steps - 1) : 0; + + for (let i = 1; i <= steps; i++) { + const t = easeInOut(i / steps); + const ix = Math.round(start.x + dx * t); + const iy = Math.round(start.y + dy * t); + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mouseMoved', + x: ix, + y: iy, + button: 'none', + buttons: 0, + }, + 8000, + 0, + ); + if (stepDelay > 4 && i < steps) { + await sleep(stepDelay); + } + } + + await refreshCursor(cdp, tabId, target.x, target.y); + return target; +} + +export async function performMouseClick( + tabId: number, + conversationId: string, + // `click` is an in-place action. `_x` and `_y` are kept on the wire + // schema for compatibility but are intentionally ignored: if the + // agent wants to click somewhere new, it must `move` there first. + // This makes the cursor's visible position load-bearing — the click + // commits exactly where the agent (and any human observer) sees it. 
+ _x: number | undefined, + _y: number | undefined, + button: 'left' | 'right' | 'middle' = 'left', + count: number = 1, +): Promise<{ x: number; y: number; button: string; warning?: string }> { + await attachWithDialogTracking(tabId, conversationId); + const cdp = new CdpCommander(tabId); + const { width: vw, height: vh } = await getViewport(cdp); + + // Click happens at the cursor's last known position. On the very + // first action of a tab, fall back to viewport center — same default + // the cursor sprite uses on first inject. + const cursor = + getCursorPosition(tabId) ?? + (await resolveCursorOrCenter(tabId, conversationId)); + const clamped = clampToViewport(cursor.x, cursor.y, vw, vh); + + const cdpButton: 'left' | 'right' | 'middle' = button; + const buttons = button === 'left' ? 1 : button === 'right' ? 2 : 4; + const safeCount = Math.max(1, Math.min(3, count | 0)); + + // CDP convention: emit one press/release pair per click and increment + // `clickCount` (1, 2, 3) so Chrome interprets it as a single → double → + // triple click sequence. Sending N pairs each with `clickCount:N` produces + // N independent N-clicks, which is wrong for double-click semantics. 
+ for (let i = 1; i <= safeCount; i++) { + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mousePressed', + x: clamped.x, + y: clamped.y, + button: cdpButton, + buttons, + clickCount: i, + }, + 8000, + 0, + ); + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mouseReleased', + x: clamped.x, + y: clamped.y, + button: cdpButton, + buttons, + clickCount: i, + }, + 8000, + 0, + ); + } + + await refreshCursor(cdp, tabId, clamped.x, clamped.y); + return { x: clamped.x, y: clamped.y, button, warning: clamped.warning }; +} + +export async function performMouseDrag( + tabId: number, + conversationId: string, + startX: number, + startY: number, + endX: number, + endY: number, + button: 'left' | 'right' | 'middle' = 'left', + steps: number = 10, +): Promise<{ + start: { x: number; y: number }; + end: { x: number; y: number }; + warning?: string; +}> { + await attachWithDialogTracking(tabId, conversationId); + const cdp = new CdpCommander(tabId); + const { width: vw, height: vh } = await getViewport(cdp); + const start = clampToViewport(startX, startY, vw, vh); + const end = clampToViewport(endX, endY, vw, vh); + const safeSteps = Math.max(2, Math.min(40, steps | 0)); + const cdpButton: 'left' | 'right' | 'middle' = button; + const buttons = button === 'left' ? 1 : button === 'right' ? 
2 : 4; + const warning = start.warning || end.warning; + + // Pre-move to start + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mouseMoved', + x: start.x, + y: start.y, + button: 'none', + buttons: 0, + }, + 8000, + 0, + ); + // Press + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mousePressed', + x: start.x, + y: start.y, + button: cdpButton, + buttons, + clickCount: 1, + }, + 8000, + 0, + ); + // Lerp moves + for (let i = 1; i <= safeSteps; i++) { + const t = i / safeSteps; + const ix = Math.round(start.x + (end.x - start.x) * t); + const iy = Math.round(start.y + (end.y - start.y) * t); + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mouseMoved', + x: ix, + y: iy, + button: cdpButton, + buttons, + }, + 8000, + 0, + ); + } + // Release + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mouseReleased', + x: end.x, + y: end.y, + button: cdpButton, + buttons, + clickCount: 1, + }, + 8000, + 0, + ); + + await refreshCursor(cdp, tabId, end.x, end.y); + return { + start: { x: start.x, y: start.y }, + end: { x: end.x, y: end.y }, + warning, + }; +} + +export async function performMouseScroll( + tabId: number, + conversationId: string, + direction: 'up' | 'down' | 'left' | 'right', + amount: number, +): Promise<{ x: number; y: number; deltaX: number; deltaY: number }> { + await attachWithDialogTracking(tabId, conversationId); + const cdp = new CdpCommander(tabId); + const cursor = + getCursorPosition(tabId) ?? 
+ (await resolveCursorOrCenter(tabId, conversationId)); + const safeAmount = Math.max(1, Math.min(2000, amount | 0)); + let deltaX = 0; + let deltaY = 0; + switch (direction) { + case 'down': + deltaY = safeAmount; + break; + case 'up': + deltaY = -safeAmount; + break; + case 'right': + deltaX = safeAmount; + break; + case 'left': + deltaX = -safeAmount; + break; + } + await cdp.sendCommand( + 'Input.dispatchMouseEvent', + { + type: 'mouseWheel', + x: cursor.x, + y: cursor.y, + deltaX, + deltaY, + }, + 8000, + 0, + ); + await refreshCursor(cdp, tabId, cursor.x, cursor.y); + return { x: cursor.x, y: cursor.y, deltaX, deltaY }; +} + +// Per-character US-keyboard mapping for plain ASCII printables. Used by +// `performKeyboardType` to dispatch real keyDown/keyUp events one char at +// a time — feels like a human typing and lets per-character JS handlers +// (autocomplete, validation) react in order. Anything outside this map +// (CJK, emoji, accented Latin, etc.) falls through to `Input.insertText`. 
+// US-layout punctuation that needs Shift, keyed by the produced character.
+// `code` names the physical key and `keyCode` is that key's virtual key
+// code, which is why entries here share values with PLAIN_PUNCT below.
+const SHIFT_PUNCT: Record<
+  string,
+  { key: string; code: string; keyCode: number }
+> = {
+  '!': { key: '!', code: 'Digit1', keyCode: 49 },
+  '@': { key: '@', code: 'Digit2', keyCode: 50 },
+  '#': { key: '#', code: 'Digit3', keyCode: 51 },
+  $: { key: '$', code: 'Digit4', keyCode: 52 },
+  '%': { key: '%', code: 'Digit5', keyCode: 53 },
+  '^': { key: '^', code: 'Digit6', keyCode: 54 },
+  '&': { key: '&', code: 'Digit7', keyCode: 55 },
+  '*': { key: '*', code: 'Digit8', keyCode: 56 },
+  '(': { key: '(', code: 'Digit9', keyCode: 57 },
+  ')': { key: ')', code: 'Digit0', keyCode: 48 },
+  _: { key: '_', code: 'Minus', keyCode: 189 },
+  '+': { key: '+', code: 'Equal', keyCode: 187 },
+  '{': { key: '{', code: 'BracketLeft', keyCode: 219 },
+  '}': { key: '}', code: 'BracketRight', keyCode: 221 },
+  '|': { key: '|', code: 'Backslash', keyCode: 220 },
+  ':': { key: ':', code: 'Semicolon', keyCode: 186 },
+  '"': { key: '"', code: 'Quote', keyCode: 222 },
+  '<': { key: '<', code: 'Comma', keyCode: 188 },
+  '>': { key: '>', code: 'Period', keyCode: 190 },
+  '?': { key: '?', code: 'Slash', keyCode: 191 },
+  '~': { key: '~', code: 'Backquote', keyCode: 192 },
+};
+// US-layout punctuation typed without Shift, keyed by the produced character.
+const PLAIN_PUNCT: Record<
+  string,
+  { key: string; code: string; keyCode: number }
+> = {
+  '`': { key: '`', code: 'Backquote', keyCode: 192 },
+  '-': { key: '-', code: 'Minus', keyCode: 189 },
+  '=': { key: '=', code: 'Equal', keyCode: 187 },
+  '[': { key: '[', code: 'BracketLeft', keyCode: 219 },
+  ']': { key: ']', code: 'BracketRight', keyCode: 221 },
+  '\\': { key: '\\', code: 'Backslash', keyCode: 220 },
+  ';': { key: ';', code: 'Semicolon', keyCode: 186 },
+  "'": { key: "'", code: 'Quote', keyCode: 222 },
+  ',': { key: ',', code: 'Comma', keyCode: 188 },
+  '.': { key: '.', code: 'Period', keyCode: 190 },
+  '/': { key: '/', code: 'Slash', keyCode: 191 },
+};
+
+/**
+ * Map one printable ASCII character (0x20–0x7e) to the fields needed for a
+ * CDP key event, plus whether Shift must be held to produce it.
+ *
+ * Returns `null` for anything that is not exactly one UTF-16 code unit in
+ * that range (control characters, non-ASCII, multi-unit strings); callers
+ * use that as the signal to fall back to raw text insertion.
+ */
+function keyParamsForChar(
+  ch: string,
+): { key: string; code: string; keyCode: number; shift: boolean } | null {
+  if (ch.length !== 1) return null;
+  const code = ch.charCodeAt(0);
+  if (code > 0x7e || code < 0x20) return null;
+  if (ch >= 'a' && ch <= 'z') {
+    return {
+      key: ch,
+      code: `Key${ch.toUpperCase()}`,
+      keyCode: ch.toUpperCase().charCodeAt(0),
+      shift: false,
+    };
+  }
+  if (ch >= 'A' && ch <= 'Z') {
+    return {
+      key: ch,
+      code: `Key${ch}`,
+      keyCode: ch.charCodeAt(0),
+      shift: true,
+    };
+  }
+  if (ch >= '0' && ch <= '9') {
+    return {
+      key: ch,
+      code: `Digit${ch}`,
+      keyCode: ch.charCodeAt(0),
+      shift: false,
+    };
+  }
+  if (ch === ' ') return { key: ' ', code: 'Space', keyCode: 32, shift: false };
+  if (PLAIN_PUNCT[ch]) return { ...PLAIN_PUNCT[ch], shift: false };
+  if (SHIFT_PUNCT[ch]) return { ...SHIFT_PUNCT[ch], shift: true };
+  return null;
+}
+
+/**
+ * Type `text` into whatever currently has keyboard focus in the tab, one
+ * character at a time.
+ *
+ * Resolves with `text.length` (UTF-16 code units of the input string) once
+ * every character has been dispatched.
+ */
+export async function performKeyboardType(
+  tabId: number,
+  conversationId: string,
+  text: string,
+): Promise<{ length: number }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+
+  // Type one character at a time so the page sees real `keydown` →
+  // `keypress` → `input` → `keyup` events for each char. This matches
+  // what a human keyboard produces and lets per-char JS handlers
+  // (autocomplete dropdowns, live validation, debounced search) react
+  // in order. Small inter-char delays keep the cadence human-paced.
+  // Non-ASCII characters (CJK, emoji, accented Latin) fall back to
+  // `Input.insertText` because they don't have a clean US-keyboard
+  // representation.
+  const PER_CHAR_DELAY_MS = 28;
+  // Split by Unicode code point rather than UTF-16 code unit. Indexing the
+  // string directly would cut an emoji (a surrogate pair) into two lone
+  // surrogates and send each half to `Input.insertText` separately.
+  const chars = Array.from(text);
+  for (let i = 0; i < chars.length; i++) {
+    const ch = chars[i];
+    const params = keyParamsForChar(ch);
+    if (params) {
+      // 8 is the Shift bit of the CDP `modifiers` bitfield.
+      const modifiers = params.shift ? 8 : 0;
+      // keyDown with `text` fires keypress + input as well as keydown.
+      await cdp.sendCommand(
+        'Input.dispatchKeyEvent',
+        {
+          type: 'keyDown',
+          key: params.key,
+          code: params.code,
+          windowsVirtualKeyCode: params.keyCode,
+          text: params.key,
+          unmodifiedText: params.key,
+          modifiers,
+        },
+        8000,
+        0,
+      );
+      await cdp.sendCommand(
+        'Input.dispatchKeyEvent',
+        {
+          type: 'keyUp',
+          key: params.key,
+          code: params.code,
+          windowsVirtualKeyCode: params.keyCode,
+          modifiers,
+        },
+        8000,
+        0,
+      );
+    } else {
+      // Non-ASCII: insert as raw text. Fires `input` but not keydown.
+      await cdp.sendCommand('Input.insertText', { text: ch }, 8000, 0);
+    }
+    if (PER_CHAR_DELAY_MS > 0 && i < chars.length - 1) {
+      await sleep(PER_CHAR_DELAY_MS);
+    }
+  }
+  return { length: text.length };
+}
+
+// Named keys accepted by `press`, keyed by the lower-cased name. Includes
+// the `return` and `esc` aliases.
+const NAMED_KEY_MAP: Record<
+  string,
+  { key: string; code: string; keyCode?: number }
+> = {
+  enter: { key: 'Enter', code: 'Enter', keyCode: 13 },
+  return: { key: 'Enter', code: 'Enter', keyCode: 13 },
+  escape: { key: 'Escape', code: 'Escape', keyCode: 27 },
+  esc: { key: 'Escape', code: 'Escape', keyCode: 27 },
+  tab: { key: 'Tab', code: 'Tab', keyCode: 9 },
+  backspace: { key: 'Backspace', code: 'Backspace', keyCode: 8 },
+  delete: { key: 'Delete', code: 'Delete', keyCode: 46 },
+  arrowup: { key: 'ArrowUp', code: 'ArrowUp', keyCode: 38 },
+  arrowdown: { key: 'ArrowDown', code: 'ArrowDown', keyCode: 40 },
+  arrowleft: { key: 'ArrowLeft', code: 'ArrowLeft', keyCode: 37 },
+  arrowright: { key: 'ArrowRight', code: 'ArrowRight', keyCode: 39 },
+  pageup: { key: 'PageUp', code: 'PageUp', keyCode: 33 },
+  pagedown: { key: 'PageDown', code: 'PageDown', keyCode: 34 },
+  home: { key: 'Home', code: 'Home', keyCode: 36 },
+  end: { key: 'End', code: 'End', keyCode: 35 },
+  space: { key: ' ', code: 'Space', keyCode: 32 },
+};
+
+/**
+ * Resolve a key name supplied by the agent to CDP key-event fields.
+ *
+ * Lookup order: case-insensitive match in NAMED_KEY_MAP, then single ASCII
+ * letters and digits, then a verbatim pass-through (with no `keyCode`) for
+ * anything unrecognised.
+ */
+function resolveNamedKey(rawKey: string): {
+  key: string;
+  code: string;
+  keyCode?: number;
+} {
+  const direct = NAMED_KEY_MAP[rawKey.toLowerCase()];
+  if (direct) return direct;
+  if (rawKey.length === 1) {
+    const ch = rawKey;
+    if (ch >= 'a' && ch <= 'z') {
+      return {
+        key: ch,
+        code: `Key${ch.toUpperCase()}`,
+        keyCode: ch.toUpperCase().charCodeAt(0),
+      };
+    }
+    if (ch >= 'A' && ch <= 'Z') {
+      return { key: ch, code: `Key${ch}`, keyCode: ch.charCodeAt(0) };
+    }
+    if (ch >= '0' && ch <= '9') {
+      return {
+        key: ch,
+        code: `Digit${ch}`,
+        keyCode: ch.charCodeAt(0),
+      };
+    }
+  }
+  // Unknown: pass through verbatim.
+  return { key: rawKey, code: rawKey };
+}
+
+// Keys that produce a character — these need a `text` field on the
+// keyDown so Chrome fires `keypress` (which is what most form-submit
+// handlers and search shortcuts listen for). Without the text, keypress
+// never fires and pressing Enter looks like nothing happened.
+// NOTE(review): NAMED_KEY_MAP resolves "space" to `key: ' '`, so the
+// `Space` entry is never matched by a resolved key; an unmodified space
+// gets its text from the single-character branch in performKeyboardPress.
+const KEY_TEXT: Record<string, string> = {
+  Enter: '\r',
+  Tab: '\t',
+  Space: ' ',
+};
+
+/**
+ * Press and release a single key, optionally with modifiers.
+ *
+ * `rawKey` is resolved through `resolveNamedKey`; `modifiers` is turned
+ * into a bitmask by `modifiersBitmask`. Resolves with the resolved key name
+ * and the bitmask that was sent.
+ */
+export async function performKeyboardPress(
+  tabId: number,
+  conversationId: string,
+  rawKey: string,
+  modifiers: string[] | undefined,
+): Promise<{ key: string; modifiers: number }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const resolved = resolveNamedKey(rawKey);
+  const mod = modifiersBitmask(modifiers);
+
+  // For single printable characters with no modifiers, attach `text` so
+  // the keypress event fires. Pure shortcuts (Ctrl+A, Cmd+K) intentionally
+  // omit text — keypress shouldn't fire there.
+  let text: string | undefined;
+  if (KEY_TEXT[resolved.key]) {
+    text = KEY_TEXT[resolved.key];
+  } else if (resolved.key.length === 1 && mod === 0) {
+    text = resolved.key;
+  }
+
+  const downParams: Record<string, unknown> = {
+    type: 'keyDown',
+    key: resolved.key,
+    code: resolved.code,
+    modifiers: mod,
+  };
+  // Pass-through keys have no known virtual key code; omit the field then.
+  if (resolved.keyCode !== undefined) {
+    downParams.windowsVirtualKeyCode = resolved.keyCode;
+  }
+  if (text !== undefined) {
+    downParams.text = text;
+    downParams.unmodifiedText = text;
+  }
+
+  await cdp.sendCommand('Input.dispatchKeyEvent', downParams, 8000, 0);
+  await cdp.sendCommand(
+    'Input.dispatchKeyEvent',
+    {
+      type: 'keyUp',
+      key: resolved.key,
+      code: resolved.code,
+      ...(resolved.keyCode !== undefined
+        ? { windowsVirtualKeyCode: resolved.keyCode }
+        : {}),
+      modifiers: mod,
+    },
+    8000,
+    0,
+  );
+  return { key: resolved.key, modifiers: mod };
+}
+
+/**
+ * Clear the currently focused input by selecting all then deleting.
+ * Convenience wrapper so the agent doesn't have to chain Ctrl+A →
+ * Backspace as two separate `press` calls.
+ */
+export async function performKeyboardClear(
+  tabId: number,
+  conversationId: string,
+): Promise<{ cleared: true }> {
+  // Select all (Ctrl+A — works on macOS in browser inputs too).
+  // NOTE(review): confirm Ctrl+A really selects all on macOS when sent
+  // through CDP; if not, Meta+A or the `commands: ['selectAll']` field of
+  // Input.dispatchKeyEvent may be needed.
+  await performKeyboardPress(tabId, conversationId, 'a', ['Control']);
+  // Brief pause between the select-all and the delete.
+  await sleep(20);
+  await performKeyboardPress(tabId, conversationId, 'Backspace', undefined);
+  return { cleared: true };
+}
+
+/**
+ * Move the mouse to the centre of the viewport and redraw the virtual
+ * cursor there. Resolves with the centre point that was used.
+ */
+export async function performResetMouse(
+  tabId: number,
+  conversationId: string,
+): Promise<{ x: number; y: number }> {
+  await attachWithDialogTracking(tabId, conversationId);
+  const cdp = new CdpCommander(tabId);
+  const { width: vw, height: vh } = await getViewport(cdp);
+  const cx = Math.round(vw / 2);
+  const cy = Math.round(vh / 2);
+  // A plain move with no button held.
+  await cdp.sendCommand(
+    'Input.dispatchMouseEvent',
+    {
+      type: 'mouseMoved',
+      x: cx,
+      y: cy,
+      button: 'none',
+      buttons: 0,
+    },
+    8000,
+    0,
+  );
+  await refreshCursor(cdp, tabId, cx, cy);
+  return { x: cx, y: cy };
+}
diff --git a/extension/src/commands/tab-manager.ts b/extension/src/commands/tab-manager.ts
index 37e8b3b..1d866bc 100644
--- a/extension/src/commands/tab-manager.ts
+++ b/extension/src/commands/tab-manager.ts
@@ -4,6 +4,8 @@
  * Inspired by MANUS Chrome Plugin design
  */
 
+import { clearCursorPosition } from './virtual-cursor';
+
 // Tab group constants
 const TAB_GROUP_NAME = 'OpenBrowser';
 const TAB_GROUP_COLOR = 'grey' as chrome.tabGroups.Color;
@@ -855,6 +857,10 @@ export class TabManager {
   private setupListeners(): void {
     // Listen for tab removal
     chrome.tabs.onRemoved.addListener((tabId) => {
+      // Drop any cached virtual-cursor position for this tab so the entry
+      // doesn't outlive the tab.
+      clearCursorPosition(tabId);
+
       // Find which session this tab belongs to
       for (const [conversationId, session] of this.sessions.entries()) {
         if (session.managedTabs.has(tabId)) {
diff --git a/extension/src/commands/virtual-cursor.ts b/extension/src/commands/virtual-cursor.ts
new file mode 100644
index 0000000..f62fd59
--- /dev/null
+++ b/extension/src/commands/virtual-cursor.ts
@@ -0,0 +1,215 @@
+/**
+ * Virtual on-screen cursor — DOM overlay rendered into the page so the agent
+ * can see where the pointer is in every screenshot.
+ *
+ * Why an in-page DOM overlay instead of the native OS cursor: CDP
+ * `Page.captureScreenshot` does not include the OS cursor (see
+ * `screenshot.ts` includeCursor docstring). So the live agent path renders a
+ * 36×36 SVG arrow into the document with `position:fixed; pointer-events:none;
+ * z-index: 2147483646` (one below the highlight overlay's z-index).
+ *
+ * The cursor is **always** injected via `preCaptureScript` immediately before
+ * `Page.captureScreenshot` runs, so it appears fresh on whatever DOM exists at
+ * the moment of capture. This sidesteps races against navigation / async
+ * layout that a `chrome.webNavigation.onCommitted` listener would have.
+ *
+ * The hotspot (click point) is the upper-left tip of the arrow (pixel (2, 2)
+ * inside the 36×36 sprite), matching OS cursor convention. The agent's `(x,
+ * y)` coordinate aligns with the arrow's tip — the body extends down-right
+ * away from the target.
+ */
+
+import { CdpCommander } from './cdp-commander';
+import { debuggerSessionManager } from './debugger-manager';
+
+const CURSOR_OVERLAY_ID = '__ob_cursor_overlay__';
+const CURSOR_Z_INDEX = 2147483646;
+// Cursor sprite size in CSS pixels. Hotspot stays at (2, 2) inside the
+// sprite so the agent's `(x, y)` aligns with the upper-left tip of the arrow.
+const CURSOR_SIZE = 36;
+
+/**
+ * Build a JS source string that creates or updates the virtual cursor at
+ * (x, y) and returns viewport metadata. Designed to be passed as the
+ * `preCaptureScript` argument to `captureScreenshot` so the cursor lands in
+ * the captured image.
+ *
+ * Coordinates are CSS viewport pixels; they are rounded and negative values
+ * are raised to 0 before being embedded. The script is idempotent: it creates
+ * the overlay div once per page load and only repositions on subsequent
+ * calls. Position uses `transform: translate()` (cheaper than left/top
+ * reflow) and writes are batched in a single style assignment.
+ *
+ * The generated script evaluates to `{ ok: true, x, y, viewportWidth,
+ * viewportHeight, devicePixelRatio }`, or `{ ok: false, error }` if it throws.
+ */
+export function buildCursorInjectScript(x: number, y: number): string {
+  const safeX = Math.max(0, Math.round(x));
+  const safeY = Math.max(0, Math.round(y));
+  return `
+    (() => {
+      try {
+        const ID = ${JSON.stringify(CURSOR_OVERLAY_ID)};
+        const Z = ${CURSOR_Z_INDEX};
+        const SZ = ${CURSOR_SIZE};
+        let host = document.getElementById(ID);
+        if (!host) {
+          host = document.createElement('div');
+          host.id = ID;
+          host.setAttribute('data-ob-virtual-cursor', '1');
+          host.style.cssText = [
+            'position:fixed',
+            'top:0',
+            'left:0',
+            'width:' + SZ + 'px',
+            'height:' + SZ + 'px',
+            'pointer-events:none',
+            'z-index:' + Z,
+            'will-change:transform',
+            'contain:layout style paint',
+            // Smooth interpolation between consecutive position updates so
+            // the cursor visibly glides instead of teleporting when watched
+            // live. CDP screenshots capture whatever frame is current at
+            // capture time, so this also makes mid-animation captures less
+            // jarring during navigation.
+            'transition:transform 120ms cubic-bezier(.25,.46,.45,.94)',
+          ].join(';');
+          // Layered sprite:
+          // 1. A red ring + dot at the click hotspot (top-left, hotspot 2,2).
+          //    The ring pulses subtly so the agent can spot it even on busy
+          //    or low-contrast pages.
+          // 2. A white arrow with a thick black outline and strong drop
+          //    shadow on top. The arrow's tip aligns with the dot so the
+          //    intended click point is unambiguous in screenshots.
+          host.innerHTML = [
+            '',
+            '
', + '
', + '',
+            ' ',
+            '',
+          ].join('');
+          (document.documentElement || document.body || document).appendChild(host);
+        }
+        host.style.transform = 'translate(' + ${safeX} + 'px,' + ${safeY} + 'px)';
+        return {
+          ok: true,
+          x: ${safeX},
+          y: ${safeY},
+          viewportWidth: window.innerWidth,
+          viewportHeight: window.innerHeight,
+          devicePixelRatio: window.devicePixelRatio || 1,
+        };
+      } catch (err) {
+        return { ok: false, error: String(err) };
+      }
+    })()
+  `;
+}
+
+/**
+ * Build a JS source string that returns the current viewport size in CSS
+ * pixels. Used to place the cursor at the viewport center on first injection
+ * before any pixel action has been issued.
+ */
+export function buildViewportProbeScript(): string {
+  return `({ width: window.innerWidth, height: window.innerHeight })`;
+}
+
+/**
+ * Track the virtual cursor position per tab. The position is updated every
+ * time the agent issues a pixel action; the next screenshot's
+ * `preCaptureScript` reads from here.
+ *
+ * A tab has no entry until a position is stored, so `getCursorPosition()`
+ * returns `undefined` on first read — callers should resolve to viewport
+ * center via `resolveCursorOrCenter()` before injecting.
+ */
+const cursorByTab = new Map<number, { x: number; y: number }>();
+
+/** Store the cursor position for a tab, rounded and floored at 0 per axis. */
+export function setCursorPosition(tabId: number, x: number, y: number): void {
+  cursorByTab.set(tabId, {
+    x: Math.max(0, Math.round(x)),
+    y: Math.max(0, Math.round(y)),
+  });
+}
+
+/** Return the stored cursor position for a tab, or `undefined` if none. */
+export function getCursorPosition(
+  tabId: number,
+): { x: number; y: number } | undefined {
+  return cursorByTab.get(tabId);
+}
+
+/** Forget the stored cursor position for a tab. */
+export function clearCursorPosition(tabId: number): void {
+  cursorByTab.delete(tabId);
+}
+
+/**
+ * Resolve the cursor position for a tab, defaulting to viewport center on
+ * first call. Queries the page for `window.innerWidth/innerHeight` via CDP
+ * Runtime.evaluate so we use the real viewport even when the extension has
+ * no other source of truth yet.
+ *
+ * Whatever position is resolved (probed center or fallback) is stored for
+ * the tab, so later calls return it without probing again.
+ *
+ * Returns CSS pixel coordinates suitable for `buildCursorInjectScript`.
+ */
+export async function resolveCursorOrCenter(
+  tabId: number,
+  conversationId: string,
+): Promise<{ x: number; y: number }> {
+  const known = cursorByTab.get(tabId);
+  if (known) return known;
+  try {
+    await debuggerSessionManager.attachDebugger(tabId, conversationId);
+    const cdp = new CdpCommander(tabId);
+    const probe = await cdp.sendCommand(
+      'Runtime.evaluate',
+      {
+        expression: buildViewportProbeScript(),
+        returnByValue: true,
+      },
+      3000,
+      0,
+    );
+    const value = (probe as { result?: { value?: unknown } } | undefined)
+      ?.result?.value as
+      | { width?: number; height?: number }
+      | undefined;
+    // Fall back to 1280×720 per axis when the probe value is missing or
+    // not a positive number.
+    const w =
+      typeof value?.width === 'number' && value.width > 0 ? value.width : 1280;
+    const h =
+      typeof value?.height === 'number' && value.height > 0
+        ? value.height
+        : 720;
+    const center = { x: Math.round(w / 2), y: Math.round(h / 2) };
+    cursorByTab.set(tabId, center);
+    return center;
+  } catch (err) {
+    console.warn(
+      `⚠️ [VirtualCursor] resolveCursorOrCenter failed on tab ${tabId}:`,
+      err,
+    );
+    // Conservative default — the center of a typical 1280×720 viewport.
+    const fallback = { x: 640, y: 360 };
+    cursorByTab.set(tabId, fallback);
+    return fallback;
+  }
+}
diff --git a/extension/src/types.ts b/extension/src/types.ts
index 97a72e0..f0dc526 100644
--- a/extension/src/types.ts
+++ b/extension/src/types.ts
@@ -21,6 +21,10 @@ export interface BaseCommand {
   timestamp?: number;
   tab_id?: number;
   conversation_id?: string; // For multi-session support
+  // When true, the live agent path is active: skip highlight injection and
+  // return a clean screenshot with the virtual cursor. Default false keeps
+  // routine-replay's highlight + element-id behavior.
+  live_mode?: boolean;
 }
 
 export interface MouseMoveCommand extends BaseCommand {
@@ -35,6 +39,21 @@ export interface MouseClickCommand extends BaseCommand {
   button?: MouseButton;
   double?: boolean;
   count?: number;
+  // Optional CSS-pixel target.
When provided, the extension pre-moves the + // cursor to (x, y) before dispatching the click. When omitted, the click + // fires at the cursor's current position. + x?: number; + y?: number; +} + +export interface MouseDragCommand extends BaseCommand { + type: 'mouse_drag'; + start_x: number; + start_y: number; + end_x: number; + end_y: number; + button?: MouseButton; + steps?: number; } export interface MouseScrollCommand extends BaseCommand { @@ -294,6 +313,7 @@ export interface GroundedElementsResponse { export type Command = | MouseMoveCommand | MouseClickCommand + | MouseDragCommand | MouseScrollCommand | ResetMouseCommand | KeyboardTypeCommand diff --git a/server/agent/api.py b/server/agent/api.py index 53aa826..0e59450 100644 --- a/server/agent/api.py +++ b/server/agent/api.py @@ -497,14 +497,24 @@ def initialize_agent(): # Import the old OpenBrowserTool for backward compatibility # logger.info("OpenBrowserTool registered (deprecated, for backward compatibility)") - # Import new focused tools to ensure they're registered + # Tools exposed to the agent: tab, mouse, keyboard, dialog. The + # legacy highlight + element_interaction modules are imported only + # to keep them importable for non-agent flows; they are not in the + # agent's toolset. from .tools.tab_tool import TabTool - from .tools.highlight_tool import HighlightTool - from .tools.element_interaction_tool import ElementInteractionTool from .tools.dialog_tool import DialogTool + from .tools.mouse_tool import MouseTool + from .tools.keyboard_tool import KeyboardTool + # Imported for legacy tooling (routine recording) — not registered + # for the live agent. 
+ from .tools.highlight_tool import HighlightTool # noqa: F401 + from .tools.element_interaction_tool import ( # noqa: F401 + ElementInteractionTool, + ) logger.info( - "4 focused OpenBrowser tools registered: tab, highlight, element_interaction, dialog" + "4 OpenBrowser tools registered for the agent: " + "tab, mouse, keyboard, dialog" ) except Exception as e: diff --git a/server/agent/manager.py b/server/agent/manager.py index 7e87026..d59c726 100644 --- a/server/agent/manager.py +++ b/server/agent/manager.py @@ -93,10 +93,14 @@ def __init__(self, multi_process_mode: bool = False): else: logger.info("AgentManager initialized in single-process mode") + # The agent drives the browser like a human, with a virtual mouse, + # keyboard, and screenshots — same toolset for fresh tasks and routine + # replay. The legacy highlight + element_interaction tools live on + # disk for non-agent flows (recording tooling) but are not exposed. self.browser_tools = [ Tool(name="tab"), # Tab management - Tool(name="highlight"), # Element discovery with visual overlays - Tool(name="element_interaction"), # Click/input with 2PC, others direct + Tool(name="mouse"), # Virtual mouse: move/click/drag/scroll/reset + Tool(name="keyboard"), # Virtual keyboard: type/press Tool(name="dialog"), # Browser dialog handling ] self.general_tools = [ @@ -134,9 +138,17 @@ def _resolve_llm_settings( return model_to_use, base_url_to_use, selected_llm def _get_tools_for_model( - self, model: Optional[str] = None, model_alias: Optional[str] = None + self, + model: Optional[str] = None, + model_alias: Optional[str] = None, + mode: Optional[str] = None, ) -> list[Tool]: - """Return the tool list for a model tier.""" + """Return the tool list for a model tier. + + ``mode`` is accepted for back-compat but no longer changes the + toolset — every conversation, including routine replay, uses the + pixel paradigm. 
+ """ self._resolve_llm_settings(model=model, model_alias=model_alias) return list(self.browser_tools) + list(self.general_tools) @@ -328,7 +340,7 @@ def _create_conversation_in_process( # Create agent with tools agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) - tools = self._get_tools_for_model(model, model_alias) + tools = self._get_tools_for_model(model, model_alias, mode) tool_image_window = get_context_image_window( routine_replay=self._is_routine_replay_mode(mode) ) @@ -577,7 +589,7 @@ def get_or_create_conversation( # Create agent with tools agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) - tools = self._get_tools_for_model(model, model_alias) + tools = self._get_tools_for_model(model, model_alias, mode) tool_image_window = get_context_image_window( routine_replay=self._is_routine_replay_mode(mode) ) diff --git a/server/agent/prompts/big_model/dialog_tool.j2 b/server/agent/prompts/big_model/dialog_tool.j2 index a970f18..9b2f866 100644 --- a/server/agent/prompts/big_model/dialog_tool.j2 +++ b/server/agent/prompts/big_model/dialog_tool.j2 @@ -46,11 +46,11 @@ Handle the currently open dialog. - **Handle immediately**: Automation is blocked until you handle the dialog - **One at a time**: After handling, check if another dialog appeared (cascading dialogs) -- **Returns highlighted screenshot**: When dialog handling completes without another blocking dialog, you get the default `highlight` `element_type: "any"` page 1 screenshot of the resulting page state +- **Returns clean screenshot with cursor**: When dialog handling completes without another blocking dialog, you get a clean screenshot of the resulting page state with the virtual cursor visible. 
## Screenshot Behavior -The dialog tool returns the default `highlight` `element_type: "any"` page 1 screenshot after handling the dialog, showing the resulting page state with fresh interactive IDs. +The dialog tool returns a clean screenshot of the resulting page state after handling the dialog, with the virtual cursor visible. ## Error Handling diff --git a/server/agent/prompts/big_model/keyboard_tool.j2 b/server/agent/prompts/big_model/keyboard_tool.j2 new file mode 100644 index 0000000..a31437b --- /dev/null +++ b/server/agent/prompts/big_model/keyboard_tool.j2 @@ -0,0 +1,56 @@ +# Keyboard Tool + +Type text and press keys at the current focus. Click a field with `mouse` first to focus it; then use this tool. + +## Actions + +### type +Type literal text where the keyboard focus is right now. Characters are sent one at a time, with a small delay between each, so per-character handlers (autocomplete, live validation) react in order — just like a real keyboard. + +```json +{ "action": "type", "text": "hello world" } +``` + +`type` does not interpret special characters — newlines and tabs are inserted literally. For Enter / Tab / shortcuts, use `press`. + +### press +Press a single named key, optionally with modifiers. + +```json +{ "action": "press", "key": "Enter" } +{ "action": "press", "key": "Escape" } +{ "action": "press", "key": "Tab" } +{ "action": "press", "key": "Backspace" } +{ "action": "press", "key": "Delete" } +{ "action": "press", "key": "ArrowDown" } +{ "action": "press", "key": "a", "modifiers": ["Control"] } +{ "action": "press", "key": "Tab", "modifiers": ["Shift"] } +``` + +Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`, `ArrowDown`, `ArrowLeft`, `ArrowRight`, `PageUp`, `PageDown`, `Home`, `End`, single letters/digits. + +Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS). + +### clear +Convenience: select-all + delete the contents of the currently focused field. 
Use this when you want to overwrite a field that already has text in it. + +```json +{ "action": "clear" } +``` + +Equivalent to `press a` with `modifiers: ["Control"]` then `press Backspace`. + +## Patterns + +- **Fill an empty form field**: `mouse` `click` on the field → `keyboard` `type` the value. +- **Replace existing text in a field**: `mouse` `click` on the field → `keyboard` `clear` → `keyboard` `type` the new value. +- **Submit a form / trigger search**: `keyboard` `press` `key: "Enter"` after typing. +- **Erase a single character**: `keyboard` `press` `key: "Backspace"`. +- **Select all in a field**: `keyboard` `press` `key: "a"` with `modifiers: ["Control"]` (or `"Meta"` on macOS). +- **Tab to next field**: `keyboard` `press` `key: "Tab"`. + +## Notes + +- One action per turn. +- Typing or pressing goes to whatever has keyboard focus. If nothing is focused, nothing happens — `mouse click` on a field first. +- Don't type into the address bar via this tool; use the `tab` tool to navigate. diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2 new file mode 100644 index 0000000..bcd7181 --- /dev/null +++ b/server/agent/prompts/big_model/mouse_tool.j2 @@ -0,0 +1,84 @@ +# Mouse Tool + +Drive a virtual mouse cursor: move, click, drag, scroll. + +## Coordinates + +`(x, y)` and `(x2, y2)` are integers in the **[0, 1000] normalized space**: + +- `(0, 0)` = top-left of the viewport. +- `(1000, 1000)` = bottom-right. + +Estimate from the screenshot. Aim for the visual center of your target. The system rescales to real pixels. + +## The Cursor Is Load-Bearing + +A red dot with a pulsing red ring sits inside a white-and-black arrow. **The dot is the click point.** It appears in every screenshot on this page. + +`click` is **in-place**: it commits exactly where the dot is right now. It does not take a target coordinate. 
If you want to click somewhere new, **`move` there first**, then verify in the next screenshot that the red dot is on top of the intended target, then `click`. + +This is a hard rule. Skipping the move-first step and clicking will commit at whatever position the cursor was last left at — which is rarely what you want. + +## Actions + +### move +Slide the cursor to a point. The cursor traces an eased path, so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way. + +```json +{ "action": "move", "x": 500, "y": 320 } +``` + +### click +Click **where the cursor is now**. Does not take coordinates. + +```json +{ "action": "click" } +{ "action": "click", "button": "right" } +{ "action": "click", "count": 2 } +``` + +- `button`: `"left"` (default), `"right"`, `"middle"`. +- `count`: `1` (default), `2` for double-click, `3` for triple-click (text selection). + +Before calling `click`, look at the most recent screenshot and confirm the red dot is on top of the element you want to act on. If it is not, call `move` first. + +### drag +Press at `(x, y)`, drag to `(x2, y2)`, release. One call. + +```json +{ "action": "drag", "x": 200, "y": 400, "x2": 800, "y2": 400 } +``` + +Use for sliders, kanban moves, marquee selection, drag-and-drop. `steps` (optional, default 10) controls the smoothness for DnD libraries that need many intermediate move events. + +### scroll +Scroll at the cursor's current position by `amount` CSS pixels in `direction`. + +```json +{ "action": "scroll", "direction": "down", "amount": 600 } +{ "action": "scroll", "direction": "up", "amount": 300 } +``` + +`direction`: `"down"`, `"up"`, `"left"`, `"right"`. To scroll inside a specific panel/container, `move` over it first so the wheel event lands in the right scroll target. + +### reset +Return the cursor to the viewport center. + +```json +{ "action": "reset" } +``` + +## Patterns + +- **Click a button**: `move` to the button → check the screenshot → `click`. 
+- **Hover-reveal menu**: `move` over the trigger; the next screenshot shows the menu open. +- **Scroll to find something**: `scroll` direction `down`, then check the new screenshot. Repeat as needed. +- **Drag a slider**: one `drag` from the handle's current position to the target position. +- **Right-click for context menu**: `move` to the target → `click` with `button: "right"`. + +## Notes + +- One action per turn. The next observation reflects the post-action state. +- The cursor position persists across actions — the cursor remains where you last left it until you `move` it again. +- If a target isn't visible, `scroll` to bring it in view; don't try to click coordinates outside the viewport. +- If a confirm/prompt dialog opens, the next mouse action will fail — handle the dialog first. diff --git a/server/agent/prompts/big_model/tab_tool.j2 b/server/agent/prompts/big_model/tab_tool.j2 index ac7d8e0..374b17b 100644 --- a/server/agent/prompts/big_model/tab_tool.j2 +++ b/server/agent/prompts/big_model/tab_tool.j2 @@ -4,18 +4,17 @@ Manage browser tabs for the current conversation and establish the active page s ## Core Contract -- `tab init` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the loaded page -- `tab open` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the opened tab -- `tab switch` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the switched-to tab -- `tab refresh` - Returns the default `highlight` `element_type: "any"` page 1 screenshot of the refreshed page -- `tab back` - Returns the default `highlight` `element_type: "any"` page 1 screenshot after navigating back -- `tab forward` - Returns the default `highlight` `element_type: "any"` page 1 screenshot after navigating forward -- `tab list` - Returns tab list only -- `tab close` - Returns close result only -- `tab view` - Returns a clean screenshot without overlays -- If you need fresh `element_id`s after `tab 
view`, call `highlight`. +- `tab init` - Returns a clean screenshot of the loaded page with the virtual cursor visible. +- `tab open` - Returns a clean screenshot of the opened tab with the virtual cursor visible. +- `tab switch` - Returns a clean screenshot of the switched-to tab with the virtual cursor visible. +- `tab refresh` - Returns a clean screenshot of the refreshed page with the virtual cursor visible. +- `tab back` - Returns a clean screenshot after navigating back with the virtual cursor visible. +- `tab forward` - Returns a clean screenshot after navigating forward with the virtual cursor visible. +- `tab list` - Returns tab list only. +- `tab close` - Returns close result only. +- `tab view` - Returns a clean screenshot of the current tab with the virtual cursor visible. - Keep work in one active tab unless opening or switching tabs clearly improves the task. -- After navigation, use the returned observation first. If a likely target is already visible but clipped or cramped, use `scroll` before asking `highlight` for more pages. +- After navigation, use the returned observation first. If a likely target is already visible but clipped or cramped, scroll to recenter it before clicking. ## Commands @@ -26,7 +25,7 @@ Initialize a new browser session with its managed tab group. { "action": "init", "url": "https://example.com" } ``` -Start a task on a URL and get the default interactive observation for the loaded page. +Start a task on a URL and get a screenshot of the loaded page. ### tab open Open a new tab in the current session and switch to it. @@ -53,8 +52,6 @@ Switch to a specific tab in the session. { "action": "switch", "tab_id": 123 } ``` -Use the returned default observation before calling `highlight` again. - ### tab list List all tabs in the current session. @@ -71,16 +68,14 @@ Refresh the current active tab. { "action": "refresh", "tab_id": 123 } ``` -Use this to reload the page and get a fresh default observation. 
- ### tab view -Get a clean screenshot of the current active tab without element highlights. +Get a fresh clean screenshot of the current active tab without performing any action. ```json { "action": "view" } ``` -Use this when you need the raw page image. It does not refresh the interactive inventory by itself. +Use this when you want a current-state snapshot without moving the cursor or clicking. ### tab back Navigate back in the browser history (equivalent to clicking the browser's back button). @@ -89,30 +84,26 @@ Navigate back in the browser history (equivalent to clicking the browser's back { "action": "back" } ``` -Use the returned observation before deciding whether more discovery is needed. - ### tab forward -Navigate forward in the browser history (equivalent to clicking the browser's forward button). +Navigate forward in the browser history. ```json { "action": "forward" } ``` -Use the returned observation before deciding whether more discovery is needed. - ## Workflow Integration 1. **Start session**: `tab init https://example.com` 2. **Navigate to other pages**: `tab open https://other.com` -3. **Switch between tabs**: `tab switch` with appropriate tab_id -4. These tab actions already return the default `highlight` `element_type: "any"` page 1 result for the new page state, so you can use the returned `element_id`s immediately. -5. If the target is already partly visible after navigation, fix geometry with `scroll` before more discovery. -6. If page 1 misses the target and it is not already partly visible, continue with the highlight tool on `element_type: "any"` page 2, 3, and so on before changing strategy. +3. **Switch between tabs**: `tab switch` with the appropriate `tab_id`. +4. These tab actions already return a clean screenshot of the new page state with the virtual cursor visible, so you can act on it directly with `mouse` and `keyboard`. +5. If the target is already partly visible after navigation, scroll to recenter it before clicking. +6. 
If the target is not visible on the current view, scroll or navigate to find it; do not click coordinates outside the viewport. ## Notes - **Tab IDs**: Integer identifiers assigned by Chrome. These are returned in tab list responses and used in other commands. -- **Auto-resolution**: element_interaction tool actions (click, hover, scroll, swipe, keyboard_input) automatically use the conversation's active tab if `tab_id` is not provided. +- **Auto-resolution**: `mouse` and `keyboard` actions automatically use the conversation's active tab. - **Session persistence**: Tabs for a conversation stay grouped together until explicitly closed or the browser restarts. ## Error Handling diff --git a/server/agent/prompts/small_model/dialog_tool.j2 b/server/agent/prompts/small_model/dialog_tool.j2 index 5796ba9..6a2bdda 100644 --- a/server/agent/prompts/small_model/dialog_tool.j2 +++ b/server/agent/prompts/small_model/dialog_tool.j2 @@ -25,5 +25,5 @@ If a dialog is open, do not use other browser tools first. Browser execution is ## After Handling -- Check the returned default `highlight` `element_type: "any"` page 1 screenshot +- Check the returned clean screenshot of the resulting page state. - If another dialog appears, handle that next diff --git a/server/agent/prompts/small_model/keyboard_tool.j2 b/server/agent/prompts/small_model/keyboard_tool.j2 new file mode 100644 index 0000000..1c5cc49 --- /dev/null +++ b/server/agent/prompts/small_model/keyboard_tool.j2 @@ -0,0 +1,38 @@ +# Keyboard Tool + +Type text, press named keys, clear a field. Click a field with `mouse` first to focus it. + +## Actions + +### type +Type literal text, one character at a time. +```json +{ "action": "type", "text": "hello world" } +``` + +### press +Press a single named key, optionally with modifiers. 
+```json +{ "action": "press", "key": "Enter" } +{ "action": "press", "key": "Backspace" } +{ "action": "press", "key": "Tab" } +{ "action": "press", "key": "a", "modifiers": ["Control"] } +``` + +Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`/`Down`/`Left`/`Right`, `PageDown`, `Home`, `End`, single letters/digits. + +Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS). + +### clear +Select-all + delete the contents of the focused field. Use before overwriting a field that already has text. +```json +{ "action": "clear" } +``` + +## Patterns + +- **Fill an empty field**: `mouse click` → `keyboard type`. +- **Replace text in a field**: `mouse click` on it → `keyboard clear` → `keyboard type`. +- **Submit / search**: `keyboard press Enter`. +- **Erase one char**: `keyboard press Backspace`. +- **Tab to next field**: `keyboard press Tab`. diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2 new file mode 100644 index 0000000..0630dd2 --- /dev/null +++ b/server/agent/prompts/small_model/mouse_tool.j2 @@ -0,0 +1,59 @@ +# Mouse Tool + +Move, click, drag, and scroll a virtual mouse cursor. + +## Coordinates + +`(x, y)` and `(x2, y2)` are integers in **[0, 1000]**: `(0, 0)` is viewport top-left, `(1000, 1000)` is bottom-right. Estimate from the screenshot. + +## Cursor + +A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot. + +`click` is **in-place**: it commits where the dot is right now. It does not take coordinates. To click a new target: `move` there → check the screenshot → `click`. + +## Actions + +### move +Slide the cursor to `(x, y)`. +```json +{ "action": "move", "x": 500, "y": 320 } +``` + +### click +Click where the cursor is now. No coordinates. 
+```json +{ "action": "click" } +{ "action": "click", "count": 2 } +{ "action": "click", "button": "right" } +``` +- `button`: `"left"` (default), `"right"`, `"middle"`. +- `count`: 1 (default), 2 for double-click. + +Before clicking, verify in the screenshot that the red dot is on top of the target. If not, `move` first. + +### drag +Press at `(x, y)`, drag to `(x2, y2)`, release. +```json +{ "action": "drag", "x": 200, "y": 400, "x2": 800, "y2": 400 } +``` + +### scroll +Scroll at the cursor by `amount` CSS pixels. +```json +{ "action": "scroll", "direction": "down", "amount": 600 } +``` +`direction`: `"down"`, `"up"`, `"left"`, `"right"`. + +### reset +Return cursor to viewport center. +```json +{ "action": "reset" } +``` + +## Patterns + +- **Click a button**: `move` → check screenshot → `click`. +- **Hover**: `move` over the trigger; next screenshot shows the result. +- **Scroll to find**: `scroll` then check the new screenshot. +- **Drag**: one `drag` with start and end coordinates. diff --git a/server/agent/prompts/small_model/tab_tool.j2 b/server/agent/prompts/small_model/tab_tool.j2 index 8757d1c..db54eb5 100644 --- a/server/agent/prompts/small_model/tab_tool.j2 +++ b/server/agent/prompts/small_model/tab_tool.j2 @@ -6,12 +6,9 @@ Manage tabs for the current conversation. 1. Keep the workflow in one active tab unless a new tab is clearly necessary. 2. After navigation, look at the returned screenshot before the next action. -3. `tab init`, `tab open`, `tab switch`, `tab back`, `tab forward`, and `tab refresh` already return the default `highlight` `element_type: "any"` page 1 screenshot and IDs for the new page state. -4. Prefer `tab view` when you only need a clean screenshot. -5. If the target is already partly visible after navigation, scroll first to reposition it. -6. If `tab view` gave you only a clean screenshot and you need `element_id`s, call `highlight`. -7. 
If page 1 missed the target on the same unchanged page state, continue the same highlight mode before changing strategy. -8. If dense UI, a sidebar, a tab strip, or collision-aware label placement may have split nearby controls across pages, keep paginating that same mode before narrowing or switching strategies. +3. All `tab` actions return a clean screenshot of the new page state with the virtual cursor visible — act on it directly with `mouse` and `keyboard`. +4. If the target is already partly visible after navigation, scroll to recenter it before clicking. +5. If the target is not visible at all, scroll or navigate; do not click coordinates outside the viewport. ## Commands @@ -44,7 +41,7 @@ List tabs when you need to know available `tab_id`s. ``` ### tab view -Get a clean screenshot of the active tab. +Get a fresh screenshot of the active tab without acting. ```json { "action": "view" } @@ -74,10 +71,7 @@ Close an unused tab. ## Recommended Flow -- Start with `tab init` -- If you need a clean screenshot, use `tab view` -- Otherwise, use the returned `element_id`s from the default `highlight` `element_type: "any"` page 1 result -- If the target is already partly visible or clipped, use `scroll` before more discovery -- If the page state is unchanged and the target is still missing, your default next step is the next `any` page -- If highlight page 1 does not show the target and it is not already partly visible, continue `element_type: "any"` pagination before changing strategy -- If dense UI or collision-aware label placement may have split nearby controls across pages, keep paginating the same mode before narrowing or switching strategies +- Start with `tab init`. +- After init, the returned screenshot is your working state — use `mouse` and `keyboard` to act on it. +- If you need a fresh screenshot without acting, use `tab view`. +- If the target is partly visible or clipped, scroll to recenter it before clicking. 
diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py index 33b14a6..514ef90 100644 --- a/server/agent/tools/base.py +++ b/server/agent/tools/base.py @@ -287,6 +287,17 @@ class OpenBrowserObservation(Observation): default=None, description="Whether the active conversation uses the small-model profile.", ) + # Viewport dimensions in CSS pixels at the time of the most recent screenshot. + # Carried on the observation but deliberately not rendered into the LLM-facing + # content: the server denormalizes [0,1000] coordinates on the agent's behalf. + viewport_width: Optional[int] = Field( + default=None, + description="CSS-pixel viewport width at screenshot time (None if unknown).", + ) + viewport_height: Optional[int] = Field( + default=None, + description="CSS-pixel viewport height at screenshot time (None if unknown).", + ) def _pending_confirmation_llm_content( self, @@ -397,6 +408,10 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]: # Operation Status Section text_parts.append("## Operation Status") text_parts.append("") + # Viewport size is intentionally not surfaced to the agent — the + # server denormalizes [0,1000] coords to real pixels automatically, + # so the agent never needs to reason about page dimensions. The + # cached vw/vh on the executor still drives that conversion.
if not self.success: text_parts.append(f"**Status**: FAILED") text_parts.append(f"**Error**: {self.error}") diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index 81feb97..a8311b5 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -40,6 +40,15 @@ SetSliderValueCommand, UploadFileCommand, HighlightDropPreviewCommand, + MouseMoveCommand, + MouseClickCommand, + MouseDragCommand, + MouseScrollCommand, + KeyboardTypeCommand, + KeyboardPressCommand, + ResetMouseCommand, + MouseButton, + ScrollDirection, ) # Import action types for type checking @@ -47,6 +56,8 @@ from server.agent.tools.highlight_tool import BaseHighlightAction from server.agent.tools.element_interaction_tool import ElementInteractionAction from server.agent.tools.dialog_tool import DialogHandleAction +from server.agent.tools.mouse_tool import MouseAction +from server.agent.tools.keyboard_tool import KeyboardAction from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation from server.core.llm_config import llm_config_manager @@ -110,6 +121,11 @@ def __init__(self): # Used in routine-replay mode to auto-confirm clicks/selects/keyboard_input # when the target was just uniquely highlighted. self.last_highlight_elements: Dict[str, List[Dict[str, Any]]] = {} + # Most recent CSS-pixel viewport per conversation (vw, vh). Captured + # from screenshot responses; consumed by the pixel-interaction path to + # denormalize Qwen-VL [0,1000] coordinates before dispatching to the + # extension. None = no screenshot yet — caller must take one first. 
+ self.last_viewport_by_conv: Dict[str, tuple[int, int]] = {} def _uses_small_model(self) -> bool: """Whether the active conversation uses the small-model profile.""" @@ -137,6 +153,46 @@ def _uses_small_model(self) -> bool: return is_small_model(model_name) + def _cache_viewport(self, vw: int, vh: int) -> None: + """Cache the latest CSS-pixel viewport for the active conversation.""" + if not self.conversation_id: + return + if vw <= 0 or vh <= 0: + return + self.last_viewport_by_conv[str(self.conversation_id)] = (vw, vh) + + def _get_viewport(self) -> Optional[tuple[int, int]]: + """Return the latest cached CSS-pixel viewport, or None if unknown.""" + if not self.conversation_id: + return None + return self.last_viewport_by_conv.get(str(self.conversation_id)) + + def _is_qwen_model(self) -> bool: + """Whether the active conversation uses a Qwen vision model. + + Qwen-VL emits coordinates in the [0, 1000] normalized space, so the + server must denormalize before dispatching CDP input. Detection is + prefix-based on the canonical dashscope model id. 
+ """ + if not self.conversation_id: + return False + session = session_manager.get_session(str(self.conversation_id)) + if session is None: + return False + + model_name = session.metadata.get("model") + if not isinstance(model_name, str) or not model_name: + raw_alias = session.metadata.get("model_alias") + if isinstance(raw_alias, str) and raw_alias: + try: + model_name = llm_config_manager.get_llm_config(raw_alias).model + except ValueError: + model_name = None + + if not isinstance(model_name, str): + return False + return model_name.startswith("dashscope/qwen") or model_name.startswith("qwen") + def _is_routine_replay_mode(self) -> bool: """Whether the active conversation is running in routine-replay mode.""" if not self.conversation_id: @@ -244,6 +300,10 @@ def _execute_action_sync(self, action: Any) -> OpenBrowserObservation: return self._execute_element_interaction_action(action) elif isinstance(action, DialogHandleAction): return self._execute_dialog_action(action) + elif isinstance(action, MouseAction): + return self._execute_mouse_action(action) + elif isinstance(action, KeyboardAction): + return self._execute_keyboard_action(action) else: raise ValueError(f"Unknown action type: {type(action).__name__}") @@ -930,6 +990,213 @@ def _format_select_value_preview(value: Any) -> str: return f"[{joined}]" return f"'{value}'" + def _denormalize_xy( + self, x: Optional[int], y: Optional[int] + ) -> tuple[Optional[int], Optional[int]]: + """Convert Qwen-VL [0,1000] coords to CSS pixels using cached viewport. + + For Qwen models, the agent emits coordinates in [0,1000] normalized + space; the server rescales using the captured viewport size before + dispatching CDP input events. For non-Qwen models, coordinates pass + through unchanged. + + Returns the same `(x, y)` shape — `None` for any input that was None. 
+ """ + if x is None and y is None: + return (None, None) + if not self._is_qwen_model(): + return (x, y) + viewport = self._get_viewport() + if viewport is None: + # No screenshot has populated the viewport yet. Best we can do + # is interpret coords as already-CSS pixels and let the extension + # clamp; a warning marks this so we can audit if it happens. + logger.warning( + "Pixel action dispatched before any screenshot populated the " + "viewport cache; passing coordinates through without " + "denormalization (conversation_id=%s).", + self.conversation_id, + ) + return (x, y) + vw, vh = viewport + px = round(x * vw / 1000) if x is not None else None + py = round(y * vh / 1000) if y is not None else None + return (px, py) + + def _execute_mouse_action( + self, action: MouseAction + ) -> OpenBrowserObservation: + """Execute one mouse action (move/click/drag/scroll/reset). + + Coordinates from Qwen models are in [0, 1000] normalized space and are + denormalized to CSS pixels using the most recent viewport captured + from a screenshot. Non-Qwen models pass through. + """ + kind = action.action + logger.debug(f"DEBUG: _execute_mouse_action kind={kind}") + + try: + if kind == "move": + if action.x is None or action.y is None: + raise ValueError("mouse move requires x and y") + px, py = self._denormalize_xy(action.x, action.y) + command = MouseMoveCommand( + x=px, y=py, conversation_id=self.conversation_id + ) + result_dict = self._execute_command_sync(command) + return self._build_observation_from_result( + result_dict, f"Mouse moved to ({px}, {py})" + ) + + if kind == "click": + # Click is in-place at the cursor's current position. Any + # x/y the model emitted is ignored on purpose — if it wants + # to click somewhere new it must `move` there first, so the + # visible cursor in the screenshot is the click point. 
+ if action.x is not None or action.y is not None: + logger.debug( + "Mouse click ignored x=%s, y=%s (click is in-place)", + action.x, + action.y, + ) + command = MouseClickCommand( + button=MouseButton(action.button), + count=action.count, + double=(action.count == 2), + conversation_id=self.conversation_id, + ) + result_dict = self._execute_command_sync(command) + return self._build_observation_from_result( + result_dict, + f"Clicked {action.button} at the cursor " + f"(count={action.count})", + ) + + if kind == "drag": + if ( + action.x is None + or action.y is None + or action.x2 is None + or action.y2 is None + ): + raise ValueError("mouse drag requires x, y, x2, y2") + sx, sy = self._denormalize_xy(action.x, action.y) + ex, ey = self._denormalize_xy(action.x2, action.y2) + command = MouseDragCommand( + start_x=sx, + start_y=sy, + end_x=ex, + end_y=ey, + button=MouseButton(action.button), + steps=action.steps, + conversation_id=self.conversation_id, + ) + result_dict = self._execute_command_sync(command) + return self._build_observation_from_result( + result_dict, f"Dragged from ({sx}, {sy}) to ({ex}, {ey})" + ) + + if kind == "scroll": + command = MouseScrollCommand( + direction=ScrollDirection(action.direction), + amount=action.amount, + conversation_id=self.conversation_id, + ) + result_dict = self._execute_command_sync(command) + return self._build_observation_from_result( + result_dict, + f"Scrolled {action.direction} by {action.amount}px", + ) + + if kind == "reset": + command = ResetMouseCommand( + conversation_id=self.conversation_id + ) + result_dict = self._execute_command_sync(command) + return self._build_observation_from_result( + result_dict, "Reset cursor to viewport center" + ) + + raise ValueError(f"Unknown mouse action: {kind}") + except Exception as e: + logger.error(f"Mouse action failed (kind={kind}): {e}", exc_info=True) + return OpenBrowserObservation( + success=False, error=str(e), small_model=self._uses_small_model() + ) + + def 
_execute_keyboard_action( + self, action: KeyboardAction + ) -> OpenBrowserObservation: + """Execute one keyboard action (type/press/clear).""" + kind = action.action + logger.debug(f"DEBUG: _execute_keyboard_action kind={kind}") + + try: + if kind == "type": + if not action.text: + raise ValueError("keyboard type requires text") + command = KeyboardTypeCommand( + text=action.text, conversation_id=self.conversation_id + ) + result_dict = self._execute_command_sync(command) + preview = ( + action.text + if len(action.text) <= 32 + else action.text[:29] + "..." + ) + return self._build_observation_from_result( + result_dict, f"Typed text: {preview!r}" + ) + + if kind == "press": + if not action.key: + raise ValueError("keyboard press requires key") + command = KeyboardPressCommand( + key=action.key, + modifiers=list(action.modifiers or []), + conversation_id=self.conversation_id, + ) + result_dict = self._execute_command_sync(command) + mod_text = ( + f" with {'+'.join(action.modifiers)}" + if action.modifiers + else "" + ) + return self._build_observation_from_result( + result_dict, f"Pressed {action.key}{mod_text}" + ) + + if kind == "clear": + # Clear == select-all then Backspace. Two press commands + # so each fires its own event sequence on the focused + # element. Done at the wire level via two + # KeyboardPressCommands so behavior matches what the + # agent would have manually scripted. 
+ first = KeyboardPressCommand( + key="a", + modifiers=["Control"], + conversation_id=self.conversation_id, + ) + self._execute_command_sync(first) + second = KeyboardPressCommand( + key="Backspace", + modifiers=[], + conversation_id=self.conversation_id, + ) + result_dict = self._execute_command_sync(second) + return self._build_observation_from_result( + result_dict, "Cleared focused field (select-all + Backspace)" + ) + + raise ValueError(f"Unknown keyboard action: {kind}") + except Exception as e: + logger.error( + f"Keyboard action failed (kind={kind}): {e}", exc_info=True + ) + return OpenBrowserObservation( + success=False, error=str(e), small_model=self._uses_small_model() + ) + def _execute_dialog_action( self, action: DialogHandleAction ) -> OpenBrowserObservation: @@ -1246,6 +1513,26 @@ def _build_observation_from_result( f"DEBUG: Extracted screenshot from data['imageData'], length={len(screenshot_data_url) if screenshot_data_url else 0}" ) + # Capture viewport dims (CSS pixels) for Qwen [0,1000] + # → pixel denormalization. 
Two shapes: + # highlighted/buildScreenshotPayload → viewport_width/height + # raw screenshot → metadata.viewportWidth/Height + raw_vw = data.get("viewport_width") + raw_vh = data.get("viewport_height") + if raw_vw is None or raw_vh is None: + meta = data.get("metadata") + if isinstance(meta, dict): + raw_vw = raw_vw if raw_vw is not None else meta.get( + "viewportWidth" + ) + raw_vh = raw_vh if raw_vh is not None else meta.get( + "viewportHeight" + ) + if isinstance(raw_vw, (int, float)) and isinstance( + raw_vh, (int, float) + ): + self._cache_viewport(int(raw_vw), int(raw_vh)) + # Extract highlighted elements for highlight_elements action if highlighted_elements is None and "elements" in data: highlighted_elements = data["elements"] @@ -1355,6 +1642,7 @@ def _build_observation_from_result( pending_confirmation = self._get_pending_confirmation() # Build observation + cached_viewport = self._get_viewport() observation = OpenBrowserObservation( success=success, message=message, @@ -1378,6 +1666,8 @@ def _build_observation_from_result( scroll_warning=scroll_warning, pending_confirmation=pending_confirmation, small_model=self._uses_small_model(), + viewport_width=cached_viewport[0] if cached_viewport else None, + viewport_height=cached_viewport[1] if cached_viewport else None, ) return observation @@ -1393,6 +1683,13 @@ def _execute_command_sync(self, command) -> Any: if command.conversation_id is None: command.conversation_id = self.conversation_id + # The agent loop is pixel-only — every screenshot returned to the + # model should be clean and show the virtual cursor. Highlights + # are reserved for non-agent flows (recording tooling) that don't + # come through this executor. 
+ if hasattr(command, "live_mode"): + command.live_mode = True + # Convert command to dict using model_dump cmd_dict = command.model_dump() logger.info( diff --git a/server/agent/tools/element_interaction_tool.py b/server/agent/tools/element_interaction_tool.py index 28b4e3c..8ad3150 100644 --- a/server/agent/tools/element_interaction_tool.py +++ b/server/agent/tools/element_interaction_tool.py @@ -5,6 +5,9 @@ flow. Hover, scroll, and swipe execute directly. """ +# Legacy: kept for /ob-routines recording/replay; not exposed to the live agent +# (which now uses MouseTool and KeyboardTool for pure pixel-level mouse/keyboard control). + from collections.abc import Sequence from typing import List, Literal, Optional, Union diff --git a/server/agent/tools/highlight_tool.py b/server/agent/tools/highlight_tool.py index 773e809..6a2fa20 100644 --- a/server/agent/tools/highlight_tool.py +++ b/server/agent/tools/highlight_tool.py @@ -5,6 +5,9 @@ allowing the AI agent to see and interact with elements via labeled overlays. """ +# Legacy: kept for /ob-routines recording/replay; not exposed to the live agent +# (which now uses MouseTool and KeyboardTool for pure pixel-level mouse/keyboard control). + from collections.abc import Sequence from typing import Optional, List diff --git a/server/agent/tools/keyboard_tool.py b/server/agent/tools/keyboard_tool.py new file mode 100644 index 0000000..7852f61 --- /dev/null +++ b/server/agent/tools/keyboard_tool.py @@ -0,0 +1,107 @@ +""" +KeyboardTool - Type text and press named keys at the current focus. + +Use 'type' for plain text input (after clicking a field to focus it). Use +'press' for named keys (Enter, Escape, Tab, arrows) and shortcuts with +modifiers (Ctrl+A, Cmd+K).
+""" + +from collections.abc import Sequence +from typing import List, Literal, Optional + +from openhands.sdk.tool import ( + ToolDefinition, + ToolAnnotations, + register_tool, +) +from pydantic import Field + +from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation +from server.agent.tools.prompt_context import get_prompt_render_context +from server.agent.tools.prompt_loader import render_tool_prompt + + +def get_keyboard_tool_description(conv_state=None) -> str: + """Get the KeyboardTool description, rendered from Jinja2 template.""" + return render_tool_prompt( + "keyboard_tool.j2", + conv_state, + context=get_prompt_render_context(conv_state), + ) + + +KeyboardActionKind = Literal["type", "press", "clear"] + + +class KeyboardAction(OpenBrowserAction): + """Type text, press a named key, or clear the focused field.""" + + action: KeyboardActionKind = Field( + description=( + "'type' — type literal text one character at a time at the " + "current focus (click a field first to focus it). " + "'press' — press a single named key, optionally with modifiers. " + "Use this for Enter/Tab/Escape/Backspace/Delete/arrows and " + "shortcuts like Ctrl+A. " + "'clear' — convenience wrapper that selects all and deletes " + "the contents of the currently focused field (equivalent to " + "`press a` with `modifiers: ['Control']` then `press Backspace`)." + ) + ) + + text: Optional[str] = Field( + default=None, + description="Text to type for 'type' (max 1000 chars).", + max_length=1000, + ) + key: Optional[str] = Field( + default=None, + description=( + "Key name for 'press', e.g. 'Enter', 'Escape', 'Tab', " + "'Backspace', 'ArrowDown', 'PageDown'. Single letters/digits " + "also work ('a', '5')." + ), + max_length=50, + ) + modifiers: List[str] = Field( + default_factory=list, + description=( + "Modifier keys for 'press' (e.g. ['Control'], ['Shift', 'Alt']). " + "Use 'Meta' for Cmd on macOS." 
+ ), + ) + + +class KeyboardTool(ToolDefinition[KeyboardAction, OpenBrowserObservation]): + """Virtual keyboard — type text, press keys.""" + + name = "keyboard" + + @classmethod + def create(cls, conv_state, terminal_executor=None) -> Sequence["KeyboardTool"]: + if terminal_executor is not None: + executor = terminal_executor + else: + conversation_id = getattr(conv_state, "id", None) + from server.agent.tools.browser_executor import get_browser_executor + + executor = get_browser_executor(conversation_id) + + return [ + cls( + description=get_keyboard_tool_description(conv_state), + action_type=KeyboardAction, + observation_type=OpenBrowserObservation, + annotations=ToolAnnotations( + title="Keyboard", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + openWorldHint=True, + ), + executor=executor, + ) + ] + + +register_tool("keyboard", KeyboardTool.create) diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py new file mode 100644 index 0000000..d9139ee --- /dev/null +++ b/server/agent/tools/mouse_tool.py @@ -0,0 +1,157 @@ +""" +MouseTool - Move, click, drag, and scroll a virtual mouse cursor. + +The agent emits target coordinates in the Qwen-VL [0, 1000] normalized space +(0 = viewport top-left, 1000 = bottom-right). The server denormalizes against +the captured viewport before dispatching CDP input events. A small arrow +cursor is rendered into the page DOM and appears in every screenshot. 
+""" + +from collections.abc import Sequence +from typing import Literal, Optional + +from openhands.sdk.tool import ( + ToolDefinition, + ToolAnnotations, + register_tool, +) +from pydantic import Field + +from server.agent.tools.base import OpenBrowserAction, OpenBrowserObservation +from server.agent.tools.prompt_context import get_prompt_render_context +from server.agent.tools.prompt_loader import render_tool_prompt + + +def get_mouse_tool_description(conv_state=None) -> str: + """Get the MouseTool description, rendered from Jinja2 template.""" + return render_tool_prompt( + "mouse_tool.j2", + conv_state, + context=get_prompt_render_context(conv_state), + ) + + +MouseActionKind = Literal["move", "click", "drag", "scroll", "reset"] + + +class MouseAction(OpenBrowserAction): + """Move, click, drag, or scroll the virtual mouse cursor. + + Coordinates `x, y, x2, y2` are integers in the Qwen-VL [0, 1000] + normalized space, with `(0, 0)` at the top-left of the viewport and + `(1000, 1000)` at the bottom-right. + """ + + action: MouseActionKind = Field( + description=( + "What to do with the mouse. " + "'move' — slide the cursor to (x, y). The cursor traces an eased " + "path so hover effects fire naturally along the way. " + "'click' — click WHERE THE CURSOR IS NOW. This is an in-place " + "action: it does not accept a target coordinate. Move there " + "first, verify the cursor is on the intended target in the " + "screenshot, then click. Use `count: 2` for double-click, " + "`count: 3` for triple-click. `button: 'right'` for context " + "menus. " + "'drag' — press at (x, y), drag to (x2, y2), release. " + "'scroll' — scroll at the cursor position by `amount` in " + "`direction`. " + "'reset' — return the cursor to the viewport center." + ) + ) + + x: Optional[int] = Field( + default=None, + description=( + "Target X in Qwen-VL [0, 1000] normalized space. Required for " + "'move' and 'drag' (start). 
Ignored by 'click' — click is " + "in-place; move first if you need to retarget." + ), + ge=0, + le=1000, + ) + y: Optional[int] = Field( + default=None, + description=( + "Target Y in Qwen-VL [0, 1000] normalized space. Required for " + "'move' and 'drag' (start). Ignored by 'click'." + ), + ge=0, + le=1000, + ) + x2: Optional[int] = Field( + default=None, + description="Drag end X in [0, 1000]. Required for 'drag'.", + ge=0, + le=1000, + ) + y2: Optional[int] = Field( + default=None, + description="Drag end Y in [0, 1000]. Required for 'drag'.", + ge=0, + le=1000, + ) + + button: Literal["left", "right", "middle"] = Field( + default="left", + description="Mouse button for 'click' and 'drag'.", + ) + count: int = Field( + default=1, + ge=1, + le=3, + description="Click count for 'click' (1 = single, 2 = double, 3 = triple).", + ) + + direction: Literal["up", "down", "left", "right"] = Field( + default="down", + description="Scroll direction for 'scroll'.", + ) + amount: int = Field( + default=300, + ge=1, + le=2000, + description="Scroll amount in CSS pixels for 'scroll'.", + ) + + steps: int = Field( + default=10, + ge=2, + le=40, + description="Intermediate move steps for 'drag' (smoother for DnD libraries).", + ) + + +class MouseTool(ToolDefinition[MouseAction, OpenBrowserObservation]): + """Virtual mouse — move, click, drag, scroll.""" + + name = "mouse" + + @classmethod + def create(cls, conv_state, terminal_executor=None) -> Sequence["MouseTool"]: + if terminal_executor is not None: + executor = terminal_executor + else: + conversation_id = getattr(conv_state, "id", None) + from server.agent.tools.browser_executor import get_browser_executor + + executor = get_browser_executor(conversation_id) + + return [ + cls( + description=get_mouse_tool_description(conv_state), + action_type=MouseAction, + observation_type=OpenBrowserObservation, + annotations=ToolAnnotations( + title="Mouse", + readOnlyHint=False, + destructiveHint=False, + idempotentHint=False, + 
openWorldHint=True, + ), + executor=executor, + ) + ] + + +register_tool("mouse", MouseTool.create) diff --git a/server/api/routes/commands.py b/server/api/routes/commands.py index e1a3854..a8f4f47 100644 --- a/server/api/routes/commands.py +++ b/server/api/routes/commands.py @@ -96,7 +96,15 @@ async def execute_command(command_data: dict): @router.post("/mouse/move") async def mouse_move(x: int, y: int, browser_id: str, duration: float = 0.1): - """Move mouse to absolute position in preset coordinate system (0-1280, 0-720)""" + """Move mouse to an absolute CSS-pixel position in the live viewport. + + `x` and `y` are CSS pixels with `x, y >= 0`. The extension clamps to the + live viewport before dispatch, so callers don't need to know the exact + viewport size — but values should be within plausible browser dimensions + (e.g. up to 4K). The legacy 0-1280/0-720 cap is no longer enforced now + that the live agent path uses Qwen-VL [0,1000] coords with server-side + denormalization (see `BrowserExecutor._denormalize_xy`). 
+ """ command = { "type": "mouse_move", "x": x, diff --git a/server/core/processor.py b/server/core/processor.py index 1bb55b2..a16dc4f 100644 --- a/server/core/processor.py +++ b/server/core/processor.py @@ -11,6 +11,7 @@ parse_command, MouseMoveCommand, MouseClickCommand, + MouseDragCommand, MouseScrollCommand, ResetMouseCommand, KeyboardTypeCommand, @@ -132,6 +133,7 @@ def _prepare_command_dict(self, command: Command) -> dict: ScreenshotCommand, MouseMoveCommand, MouseClickCommand, + MouseDragCommand, MouseScrollCommand, ResetMouseCommand, KeyboardTypeCommand, @@ -166,12 +168,24 @@ def _prepare_command_dict(self, command: Command) -> dict: ): # Check command type to decide if we should fill tab_id if isinstance(command, TabCommand): - # For tab commands, only fill tab_id for certain actions - # init and open create new tabs - don't fill - # close and switch need specific tab_id - don't fill if not specified - # list gets all tabs - don't fill - # So generally don't auto-fill for TabCommand - pass + # init/open create new tabs; close/switch need an explicit + # tab_id; list gets all tabs. But refresh/view/back/forward + # operate on "the current tab" semantically, so fill in the + # active tab when the agent didn't bother to pass it. 
+ action_value = command_dict.get("action") or getattr( + command, "action", None + ) + action_name = ( + action_value.value + if hasattr(action_value, "value") + else action_value + ) + if action_name in {"refresh", "view", "back", "forward"}: + command_dict["tab_id"] = current_tab_id + logger.debug( + f"Auto-filled tab_id {current_tab_id} for " + f"tab.{action_name} in conversation {conversation_id}" + ) elif isinstance(command, GetTabsCommand): # GetTabsCommand gets all tabs, doesn't need tab_id pass @@ -214,6 +228,8 @@ async def execute(self, command: Command) -> CommandResponse: return await self._execute_mouse_move(command) elif isinstance(command, MouseClickCommand): return await self._execute_mouse_click(command) + elif isinstance(command, MouseDragCommand): + return await self._execute_mouse_drag(command) elif isinstance(command, MouseScrollCommand): return await self._execute_mouse_scroll(command) elif isinstance(command, KeyboardTypeCommand): @@ -285,6 +301,13 @@ async def _execute_mouse_click(self, command: MouseClickCommand) -> CommandRespo response = await self._send_prepared_command(command) return response + async def _execute_mouse_drag( + self, command: MouseDragCommand + ) -> CommandResponse: + """Execute mouse drag command""" + response = await self._send_prepared_command(command) + return response + async def _execute_mouse_scroll( self, command: MouseScrollCommand ) -> CommandResponse: diff --git a/server/models/commands.py b/server/models/commands.py index 0b4f7dd..5eb5af5 100644 --- a/server/models/commands.py +++ b/server/models/commands.py @@ -52,21 +52,37 @@ class BaseCommand(BaseModel): default=None, description="Browser UUID capability token for targeted routing", ) + # When True, the extension returns a clean (no-highlight) screenshot with + # the virtual cursor rendered, instead of a highlighted screenshot. 
Set by + # BrowserExecutor for live-agent-path conversations; False for routine + # replay (which still depends on the highlight + element-id inventory). + live_mode: bool = Field( + default=False, + description=( + "If True, the extension skips highlight injection and returns a " + "clean screenshot with the virtual cursor for the live pixel-only " + "agent path. Default False preserves the legacy highlight flow." + ), + ) class MouseMoveCommand(BaseCommand): - """Move mouse to absolute position in preset coordinate system""" + """Move mouse to an absolute CSS-pixel position in the live viewport. + + Coordinates are CSS pixels (post-denormalization). The extension clamps + out-of-range values to the live viewport before dispatch, so no upper + bound is enforced here — server-side denormalization is the single source + of truth on coordinate space. + """ type: Literal["mouse_move"] = "mouse_move" x: int = Field( - description="X coordinate in preset coordinate system (0 to 1280, left to right)", + description="X coordinate in CSS pixels from viewport left.", ge=0, - le=1280, ) y: int = Field( - description="Y coordinate in preset coordinate system (0 to 720, top to bottom)", + description="Y coordinate in CSS pixels from viewport top.", ge=0, - le=720, ) duration: Optional[float] = Field( default=0.1, @@ -77,12 +93,47 @@ class MouseMoveCommand(BaseCommand): class MouseClickCommand(BaseCommand): - """Click at current mouse position""" + """Click at the current cursor position, or at an explicit (x, y). + + When ``x`` and ``y`` are provided, the extension first dispatches a + ``mouseMoved`` to that position so the click registers there; when omitted, + the click fires at the most recent cursor position tracked per tab. 
+ """ type: Literal["mouse_click"] = "mouse_click" button: MouseButton = Field(default=MouseButton.LEFT) double: bool = Field(default=False, description="Double click if True") count: int = Field(default=1, ge=1, le=3, description="Number of clicks (1-3)") + x: Optional[int] = Field( + default=None, + description="Optional X in CSS pixels — pre-move cursor before clicking.", + ge=0, + ) + y: Optional[int] = Field( + default=None, + description="Optional Y in CSS pixels — pre-move cursor before clicking.", + ge=0, + ) + + +class MouseDragCommand(BaseCommand): + """Drag from (start_x, start_y) to (end_x, end_y) in CSS pixels. + + Sequence: mouseMoved → mousePressed → N lerped mouseMoved → mouseReleased. + """ + + type: Literal["mouse_drag"] = "mouse_drag" + start_x: int = Field(description="Drag start X in CSS pixels.", ge=0) + start_y: int = Field(description="Drag start Y in CSS pixels.", ge=0) + end_x: int = Field(description="Drag end X in CSS pixels.", ge=0) + end_y: int = Field(description="Drag end Y in CSS pixels.", ge=0) + button: MouseButton = Field(default=MouseButton.LEFT) + steps: int = Field( + default=10, + ge=2, + le=40, + description="Intermediate mouseMoved events between start and end.", + ) class MouseScrollCommand(BaseCommand): @@ -259,6 +310,9 @@ class GetAccessibilityTreeCommand(BaseCommand): ) +# Legacy element-id commands (HighlightElementsCommand through HighlightDropPreviewCommand): +# kept for /ob-routines recording/replay. The live agent uses pixel-level commands +# (MouseMoveCommand, MouseClickCommand, etc.) and never references element_id. 
class HighlightElementsCommand(BaseCommand): """Highlight interactive elements on the page for visual selection @@ -593,6 +647,7 @@ class TabsResponse(CommandResponse): Command = Union[ MouseMoveCommand, MouseClickCommand, + MouseDragCommand, MouseScrollCommand, ResetMouseCommand, KeyboardTypeCommand, @@ -630,6 +685,7 @@ def parse_command(data: dict) -> Command: command_map = { "mouse_move": MouseMoveCommand, "mouse_click": MouseClickCommand, + "mouse_drag": MouseDragCommand, "mouse_scroll": MouseScrollCommand, "reset_mouse": ResetMouseCommand, "keyboard_type": KeyboardTypeCommand, From 9780ba730f5762276af8c7f258cd046e473c0712 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Wed, 29 Apr 2026 20:38:31 +0800 Subject: [PATCH 02/14] prompts(agent): tighten mouse/keyboard wording, drop implementation leaks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the mouse and keyboard tool prompts and Field descriptions to describe only the canonical use, instead of warning the model away from shapes that the executor already silently handles. - mouse: drop "click does not take coordinates" / "MUST NOT supply x/y" language. Field descriptions now state the affirmative use only. Examples already only show the in-place form. - keyboard: lead with "focus an input first" as a positive rule, drop the per-character / "convenience wrapper" implementation notes. Add a small_model `type into a field` pattern that pairs with mouse. - mouse scroll (small_model): document that the wheel hits the container under the cursor — `move` first to scroll an inner panel. - mouse_tool.py: revert the validator that rejected x/y on click; the executor already drops them silently. The schema reflects only what the agent should learn to send. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../agent/prompts/big_model/keyboard_tool.j2 | 28 ++++++++++--------- server/agent/prompts/big_model/mouse_tool.j2 | 24 ++++++++-------- .../prompts/small_model/keyboard_tool.j2 | 12 +++++--- .../agent/prompts/small_model/mouse_tool.j2 | 9 +++--- server/agent/tools/keyboard_tool.py | 9 +++--- server/agent/tools/mouse_tool.py | 20 ++++++------- 6 files changed, 52 insertions(+), 50 deletions(-) diff --git a/server/agent/prompts/big_model/keyboard_tool.j2 b/server/agent/prompts/big_model/keyboard_tool.j2 index a31437b..3a88ecd 100644 --- a/server/agent/prompts/big_model/keyboard_tool.j2 +++ b/server/agent/prompts/big_model/keyboard_tool.j2 @@ -1,11 +1,15 @@ # Keyboard Tool -Type text and press keys at the current focus. Click a field with `mouse` first to focus it; then use this tool. +Type text and press keys at the current focus. + +## Focus an input first + +Keyboard events go to whatever element currently has focus. Before typing into a text field, search box, contenteditable, etc., **`move` the cursor over the field and `click` it to focus it.** Then call this tool. The same applies after navigating to a new page or opening a dialog — re-focus the input by clicking it before the next `type`. ## Actions ### type -Type literal text where the keyboard focus is right now. Characters are sent one at a time, with a small delay between each, so per-character handlers (autocomplete, live validation) react in order — just like a real keyboard. +Type literal text into the focused field. ```json { "action": "type", "text": "hello world" } @@ -32,25 +36,23 @@ Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`, `ArrowD Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS). ### clear -Convenience: select-all + delete the contents of the currently focused field. Use this when you want to overwrite a field that already has text in it. +Select-all + delete the contents of the currently focused field. 
Use this to overwrite a field that already has text in it. ```json { "action": "clear" } ``` -Equivalent to `press a` with `modifiers: ["Control"]` then `press Backspace`. - ## Patterns -- **Fill an empty form field**: `mouse` `click` on the field → `keyboard` `type` the value. -- **Replace existing text in a field**: `mouse` `click` on the field → `keyboard` `clear` → `keyboard` `type` the new value. -- **Submit a form / trigger search**: `keyboard` `press` `key: "Enter"` after typing. -- **Erase a single character**: `keyboard` `press` `key: "Backspace"`. -- **Select all in a field**: `keyboard` `press` `key: "a"` with `modifiers: ["Control"]` (or `"Meta"` on macOS). -- **Tab to next field**: `keyboard` `press` `key: "Tab"`. +- **Fill an empty form field**: `mouse move` to the field → `mouse click` → `keyboard type`. +- **Replace existing text**: `mouse move` to the field → `mouse click` → `keyboard clear` → `keyboard type`. +- **Submit a form / trigger search**: `keyboard press` `key: "Enter"` after typing. +- **Erase a single character**: `keyboard press` `key: "Backspace"`. +- **Select all in a field**: `keyboard press` `key: "a"` with `modifiers: ["Control"]` (or `"Meta"` on macOS). +- **Tab to next field**: `keyboard press` `key: "Tab"`. ## Notes - One action per turn. -- Typing or pressing goes to whatever has keyboard focus. If nothing is focused, nothing happens — `mouse click` on a field first. -- Don't type into the address bar via this tool; use the `tab` tool to navigate. +- Click an input area before typing into it so it has focus. +- For URL changes use the `tab` tool to navigate, not the keyboard. diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2 index bcd7181..4f17ee6 100644 --- a/server/agent/prompts/big_model/mouse_tool.j2 +++ b/server/agent/prompts/big_model/mouse_tool.j2 @@ -4,32 +4,30 @@ Drive a virtual mouse cursor: move, click, drag, scroll. 
## Coordinates -`(x, y)` and `(x2, y2)` are integers in the **[0, 1000] normalized space**: +`(x, y)` and `(x2, y2)` are integers in **[0, 1000]** normalized space: - `(0, 0)` = top-left of the viewport. - `(1000, 1000)` = bottom-right. Estimate from the screenshot. Aim for the visual center of your target. The system rescales to real pixels. -## The Cursor Is Load-Bearing +## The Cursor -A red dot with a pulsing red ring sits inside a white-and-black arrow. **The dot is the click point.** It appears in every screenshot on this page. +A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot. -`click` is **in-place**: it commits exactly where the dot is right now. It does not take a target coordinate. If you want to click somewhere new, **`move` there first**, then verify in the next screenshot that the red dot is on top of the intended target, then `click`. - -This is a hard rule. Skipping the move-first step and clicking will commit at whatever position the cursor was last left at — which is rarely what you want. +`click` commits at the cursor's current position. To click a new target: `move` there → look at the next screenshot to confirm the red dot is on the target → `click`. ## Actions ### move -Slide the cursor to a point. The cursor traces an eased path, so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way. +Slide the cursor to a point. The cursor traces an eased path so hover effects (CSS `:hover`, menus, tooltips) fire naturally along the way. ```json { "action": "move", "x": 500, "y": 320 } ``` ### click -Click **where the cursor is now**. Does not take coordinates. +Click at the cursor's current position. ```json { "action": "click" } @@ -40,7 +38,7 @@ Click **where the cursor is now**. Does not take coordinates. - `button`: `"left"` (default), `"right"`, `"middle"`. 
- `count`: `1` (default), `2` for double-click, `3` for triple-click (text selection). -Before calling `click`, look at the most recent screenshot and confirm the red dot is on top of the element you want to act on. If it is not, call `move` first. +Before calling `click`, look at the most recent screenshot and confirm the red dot is on top of the element you want to act on. If it isn't, `move` again first. ### drag Press at `(x, y)`, drag to `(x2, y2)`, release. One call. @@ -59,7 +57,7 @@ Scroll at the cursor's current position by `amount` CSS pixels in `direction`. { "action": "scroll", "direction": "up", "amount": 300 } ``` -`direction`: `"down"`, `"up"`, `"left"`, `"right"`. To scroll inside a specific panel/container, `move` over it first so the wheel event lands in the right scroll target. +`direction`: `"down"`, `"up"`, `"left"`, `"right"`. To scroll inside a specific panel, `move` over it first so the wheel event lands in the right scroll target. ### reset Return the cursor to the viewport center. @@ -79,6 +77,6 @@ Return the cursor to the viewport center. ## Notes - One action per turn. The next observation reflects the post-action state. -- The cursor position persists across actions — the cursor remains where you last left it until you `move` it again. -- If a target isn't visible, `scroll` to bring it in view; don't try to click coordinates outside the viewport. -- If a confirm/prompt dialog opens, the next mouse action will fail — handle the dialog first. +- The cursor position persists across actions — it stays where you last left it until the next `move`. +- If a target isn't in the viewport, `scroll` to bring it in view before pointing at it. +- If a confirm/prompt dialog opens, handle it with the dialog tool before the next mouse action. 
diff --git a/server/agent/prompts/small_model/keyboard_tool.j2 b/server/agent/prompts/small_model/keyboard_tool.j2 index 1c5cc49..863654a 100644 --- a/server/agent/prompts/small_model/keyboard_tool.j2 +++ b/server/agent/prompts/small_model/keyboard_tool.j2 @@ -1,11 +1,15 @@ # Keyboard Tool -Type text, press named keys, clear a field. Click a field with `mouse` first to focus it. +Type text and press named keys at the current focus. + +## Focus an input first + +Keyboard events go to the focused element. Before typing into a text box, search field, or any input area, **`mouse move` to the field and `mouse click` it** to focus it. Then call this tool. ## Actions ### type -Type literal text, one character at a time. +Type literal text into the focused field. ```json { "action": "type", "text": "hello world" } ``` @@ -31,8 +35,8 @@ Select-all + delete the contents of the focused field. Use before overwriting a ## Patterns -- **Fill an empty field**: `mouse click` → `keyboard type`. -- **Replace text in a field**: `mouse click` on it → `keyboard clear` → `keyboard type`. +- **Fill an empty field**: `mouse move` → `mouse click` → `keyboard type`. +- **Replace text in a field**: `mouse move` → `mouse click` → `keyboard clear` → `keyboard type`. - **Submit / search**: `keyboard press Enter`. - **Erase one char**: `keyboard press Backspace`. - **Tab to next field**: `keyboard press Tab`. diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2 index 0630dd2..c9cc716 100644 --- a/server/agent/prompts/small_model/mouse_tool.j2 +++ b/server/agent/prompts/small_model/mouse_tool.j2 @@ -10,7 +10,7 @@ Move, click, drag, and scroll a virtual mouse cursor. A red dot with a pulsing red ring sits inside a white-and-black arrow on the page. **The red dot is the click point.** It appears in every screenshot. -`click` is **in-place**: it commits where the dot is right now. It does not take coordinates. 
To click a new target: `move` there → check the screenshot → `click`. +`click` commits at the cursor's current position. To click a new target: `move` there → check the screenshot → `click`. ## Actions @@ -21,7 +21,7 @@ Slide the cursor to `(x, y)`. ``` ### click -Click where the cursor is now. No coordinates. +Click at the cursor's current position. ```json { "action": "click" } { "action": "click", "count": 2 } @@ -30,7 +30,7 @@ Click where the cursor is now. No coordinates. - `button`: `"left"` (default), `"right"`, `"middle"`. - `count`: 1 (default), 2 for double-click. -Before clicking, verify in the screenshot that the red dot is on top of the target. If not, `move` first. +Before clicking, verify in the screenshot that the red dot is on top of the target. If not, `move` again first. ### drag Press at `(x, y)`, drag to `(x2, y2)`, release. @@ -39,7 +39,7 @@ Press at `(x, y)`, drag to `(x2, y2)`, release. ``` ### scroll -Scroll at the cursor by `amount` CSS pixels. +Scroll at the cursor by `amount` CSS pixels. The wheel event lands on whatever container is under the cursor — to scroll inside a panel, sidebar, or modal, `move` over it first. ```json { "action": "scroll", "direction": "down", "amount": 600 } ``` @@ -57,3 +57,4 @@ Return cursor to viewport center. - **Hover**: `move` over the trigger; next screenshot shows the result. - **Scroll to find**: `scroll` then check the new screenshot. - **Drag**: one `drag` with start and end coordinates. +- **Type into a field**: `move` to the field → `click` to focus it → `keyboard type` the text. diff --git a/server/agent/tools/keyboard_tool.py b/server/agent/tools/keyboard_tool.py index 7852f61..a53bf00 100644 --- a/server/agent/tools/keyboard_tool.py +++ b/server/agent/tools/keyboard_tool.py @@ -38,14 +38,13 @@ class KeyboardAction(OpenBrowserAction): action: KeyboardActionKind = Field( description=( - "'type' — type literal text one character at a time at the " - "current focus (click a field first to focus it). 
" + "'type' — type literal text into the focused field (click the " + "field first to focus it). " "'press' — press a single named key, optionally with modifiers. " "Use this for Enter/Tab/Escape/Backspace/Delete/arrows and " "shortcuts like Ctrl+A. " - "'clear' — convenience wrapper that selects all and deletes " - "the contents of the currently focused field (equivalent to " - "`press a` with `modifiers: ['Control']` then `press Backspace`)." + "'clear' — select all and delete the contents of the focused " + "field, then leave it empty for a fresh `type`." ) ) diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py index d9139ee..d7b52df 100644 --- a/server/agent/tools/mouse_tool.py +++ b/server/agent/tools/mouse_tool.py @@ -47,12 +47,11 @@ class MouseAction(OpenBrowserAction): "What to do with the mouse. " "'move' — slide the cursor to (x, y). The cursor traces an eased " "path so hover effects fire naturally along the way. " - "'click' — click WHERE THE CURSOR IS NOW. This is an in-place " - "action: it does not accept a target coordinate. Move there " - "first, verify the cursor is on the intended target in the " - "screenshot, then click. Use `count: 2` for double-click, " - "`count: 3` for triple-click. `button: 'right'` for context " - "menus. " + "'click' — click at the cursor's current position. To click a " + "different target, 'move' there first, then verify the red dot " + "is on the target in the next screenshot, then 'click'. Use " + "`count: 2` for double-click, `count: 3` for triple-click. " + "`button: 'right'` for context menus. " "'drag' — press at (x, y), drag to (x2, y2), release. " "'scroll' — scroll at the cursor position by `amount` in " "`direction`. " @@ -63,9 +62,8 @@ class MouseAction(OpenBrowserAction): x: Optional[int] = Field( default=None, description=( - "Target X in Qwen-VL [0, 1000] normalized space. Required for " - "'move' and 'drag' (start). 
Ignored by 'click' — click is " - "in-place; move first if you need to retarget." + "Target X in Qwen-VL [0, 1000] normalized space. Used by 'move' " + "and 'drag' (start)." ), ge=0, le=1000, @@ -73,8 +71,8 @@ class MouseAction(OpenBrowserAction): y: Optional[int] = Field( default=None, description=( - "Target Y in Qwen-VL [0, 1000] normalized space. Required for " - "'move' and 'drag' (start). Ignored by 'click'." + "Target Y in Qwen-VL [0, 1000] normalized space. Used by 'move' " + "and 'drag' (start)." ), ge=0, le=1000, From eda0b73a41bfde7b3d5e448c1cc059665b31b7be Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Wed, 29 Apr 2026 20:43:38 +0800 Subject: [PATCH 03/14] chore(deps): bump agent-sdk pin to 37227545 (pixel-paradigm system prompts) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Picks up the rewrite of system_prompt_{large,small}.j2 in softpudding/agent-sdk@37227545 — drops element_id/highlight/replay language and aligns the top-level system prompts with this repo's mouse + keyboard tools. uv.lock regenerated by `uv sync`. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- pyproject.toml | 4 ++-- uv.lock | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 256f2fb..7789f92 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,5 +76,5 @@ override-dependencies = [ ] [tool.uv.sources] -openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "9b289cd393078641ea413dfd5f45d443dbb10b17" } -openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "9b289cd393078641ea413dfd5f45d443dbb10b17" } +openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "37227545d9d371423757ce47cddedd9521cc62d5" } +openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "37227545d9d371423757ce47cddedd9521cc62d5" } diff --git a/uv.lock b/uv.lock index a34b2f2..43f2cfc 100644 --- a/uv.lock +++ b/uv.lock @@ -1678,8 +1678,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=9b289cd393078641ea413dfd5f45d443dbb10b17" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=9b289cd393078641ea413dfd5f45d443dbb10b17" }, + { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=37227545d9d371423757ce47cddedd9521cc62d5" }, + { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=37227545d9d371423757ce47cddedd9521cc62d5" }, { name = "pillow", specifier = ">=10.0.0" }, { 
name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2224,7 +2224,7 @@ wheels = [ [[package]] name = "openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=9b289cd393078641ea413dfd5f45d443dbb10b17#9b289cd393078641ea413dfd5f45d443dbb10b17" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=37227545d9d371423757ce47cddedd9521cc62d5#37227545d9d371423757ce47cddedd9521cc62d5" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2244,7 +2244,7 @@ dependencies = [ [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=9b289cd393078641ea413dfd5f45d443dbb10b17#9b289cd393078641ea413dfd5f45d443dbb10b17" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=37227545d9d371423757ce47cddedd9521cc62d5#37227545d9d371423757ce47cddedd9521cc62d5" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" }, From 6608c323f58463e1dad44f36d7b803de2ed964ae Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Wed, 29 Apr 2026 21:55:02 +0800 Subject: [PATCH 04/14] feat(agent): unthrottle background tab + intercept native ` dropdowns and OS file pickers don't render into CDP screenshots, so a click on one left the agent blind. Now `mouse_click` hit-tests the cursor (walking through `