From 9b9eedb506bc1b5bce9e9a3d6b6803bac69d8242 Mon Sep 17 00:00:00 2001 From: Xiao Yang Date: Thu, 16 Apr 2026 21:38:54 +0800 Subject: [PATCH 01/12] Add image input for user prompts and upload_file browser action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Users can now attach images to a task (paste, drag-drop, or paperclip) and the agent can attach local files to file input controls via CDP DOM.setFileInputFiles, bypassing the native OS picker. - Frontend: paste/drop/picker → data URIs in POST body, up to 8 images at 10MB each; thumbnails with × remove. - Server: validates data URIs and size, builds multimodal Message (TextContent + ImageContent) in both in-process and multi-process dispatch paths. - Extension: new `uploadable` element type with a dedicated detection pass that surfaces display:none file inputs and anchors the overlay on the nearest visible label/button. New `upload_file` action on ElementInteractionTool resolves selector → CDP nodeId and calls DOM.setFileInputFiles. - Prompts + highlight tool updated for big-model and small-model to advertise the new element type and action. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- extension/src/background/index.ts | 40 ++- extension/src/commands/element-actions.ts | 114 +++++++ .../commands/highlight-detection.injected.js | 181 ++++++++++- extension/src/types.ts | 16 + frontend/index.html | 290 +++++++++++++++++- server/agent/api.py | 64 +++- .../big_model/element_interaction_tool.j2 | 18 +- .../agent/prompts/big_model/highlight_tool.j2 | 4 +- .../small_model/element_interaction_tool.j2 | 9 + .../prompts/small_model/highlight_tool.j2 | 4 +- server/agent/tools/browser_executor.py | 21 ++ .../agent/tools/element_interaction_tool.py | 13 +- server/agent/tools/highlight_tool.py | 2 +- server/api/routes/agent.py | 84 ++++- server/core/browser_executor_bundle.py | 18 +- server/core/process_manager.py | 1 + server/core/processor.py | 36 +++ server/models/commands.py | 26 +- .../tests/unit/test_agent_api_multiprocess.py | 1 + server/tests/unit/test_api_uuid.py | 5 +- 20 files changed, 916 insertions(+), 31 deletions(-) diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 155bc87..b35132c 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -37,6 +37,7 @@ import { performElementSwipe, performElementDragAndDrop, performElementSetSlider, + performElementUpload, performKeyboardInput, performElementSelect, replayHoverState, @@ -295,6 +296,7 @@ const IN_PAGE_HIGHLIGHT_COLORS: Record = selectable: { border: '#FF6B6B', bg: 'rgba(255,107,107,0.7)' }, draggable: { border: '#FF6600', bg: 'rgba(255,102,0,0.7)' }, droppable: { border: '#339966', bg: 'rgba(51,153,102,0.7)' }, + uploadable: { border: '#AA66FF', bg: 'rgba(170,102,255,0.7)' }, any: { border: '#00CCCC', bg: 'rgba(0,204,204,0.7)' }, }; @@ -307,7 +309,10 @@ function buildInPageHighlightScript(elements: InteractiveElement[]): string { IN_PAGE_HIGHLIGHT_COLORS[el.type] || IN_PAGE_HIGHLIGHT_COLORS.clickable; return { id: el.id, - selector: el.selector, + // The overlay script renders 
the box on `selector`. For uploadable + // file inputs that are display:none, the visible anchor's selector + // lets the overlay land on something the user can actually see. + selector: el.overlaySelector || el.selector, borderColor: colors.border, bgColor: colors.bg, labelPos: el.labelPosition || 'above', @@ -953,6 +958,7 @@ function isHeavyBrowserCommand(data: any): boolean { case 'set_slider_value': case 'keyboard_input': case 'select_element': + case 'upload_file': case 'handle_dialog': return true; case 'tab': @@ -2328,6 +2334,38 @@ async function handleCommand(command: Command): Promise { }; } + case 'upload_file': { + if (!command.conversation_id) + throw new Error('conversation_id required'); + const uploadTabId = command.tab_id; + if (uploadTabId === undefined || uploadTabId === null) + throw new Error('tab_id is required'); + if (!command.file_path || typeof command.file_path !== 'string') + throw new Error('file_path is required for upload_file'); + + const uploadResult = await performElementUpload( + command.conversation_id, + command.element_id, + uploadTabId, + command.file_path, + ); + const uploadPageState = await captureDefaultHighlightedPageState({ + tabId: uploadTabId, + conversationId: command.conversation_id, + logLabel: 'UploadFile', + }); + + return { + success: uploadResult.success, + data: { + ...uploadResult, + ...uploadPageState, + }, + error: uploadResult.error, + timestamp: Date.now(), + }; + } + case 'keyboard_input': { if (!command.conversation_id) throw new Error('conversation_id required'); diff --git a/extension/src/commands/element-actions.ts b/extension/src/commands/element-actions.ts index e8a1797..1561422 100644 --- a/extension/src/commands/element-actions.ts +++ b/extension/src/commands/element-actions.ts @@ -11,6 +11,7 @@ import type { ElementActionResult } from '../types'; * - Handles dialog events using the same pattern as javascript.ts */ +import { CdpCommander } from './cdp-commander'; import { 
buildElementCacheMissMessage, elementCache } from './element-cache'; import { executeJavaScript, type JavaScriptResult } from './javascript'; import { buildHitTestVisibilityHelpersScript } from '../utils/hit-test-visibility'; @@ -516,6 +517,15 @@ export interface HoverResult extends ElementActionResult { error?: string; } +/** + * Result type for file upload operation + */ +export interface UploadResult extends ElementActionResult { + uploaded: boolean; + staleElement?: boolean; + error?: string; +} + /** * Result type for element select operation */ @@ -4096,6 +4106,109 @@ export async function performElementSelect( return result; } +/** + * Attach a local file (by absolute path on the host) to an + * via CDP `DOM.setFileInputFiles`. This bypasses the native OS file picker β€” + * attempting to click the input would pop the picker in front of the user, + * which the agent cannot drive. + * + * The server validates the path before dispatching, so here we only need to + * resolve the cached selector to a CDP `nodeId` and invoke setFileInputFiles. 
+ */ +export async function performElementUpload( + conversationId: string, + elementId: string, + tabId: number, + filePath: string, +): Promise { + console.log( + `πŸ“Ž [ElementUpload] Uploading "${filePath}" to element ${elementId} on tab ${tabId}`, + ); + + const cachedElement = elementCache.getElementById( + conversationId, + tabId, + elementId, + ); + if (!cachedElement) { + console.log(`❌ [ElementUpload] Element ${elementId} not found in cache`); + return { + success: false, + ...buildResolvedElementResultFields(elementId, elementId), + uploaded: false, + staleElement: false, + error: buildElementCacheMissMessage({ + conversationId, + tabId, + elementId, + }), + }; + } + + const element = cachedElement.element; + const resolvedElementFields = buildResolvedElementResultFields( + cachedElement.requestedElementId, + cachedElement.resolvedElementId, + ); + const cdp = new CdpCommander(tabId); + + try { + // Resolve selector β†’ CDP nodeId. DOM.getDocument returns the document root + // node; DOM.querySelector is scoped to that root and accepts any CSS + // selector. A nodeId of 0 indicates no match (selector went stale). 
+ const doc = (await cdp.sendCommand('DOM.getDocument', { depth: 0 })) as { + root?: { nodeId: number }; + }; + if (!doc || !doc.root || typeof doc.root.nodeId !== 'number') { + return { + success: false, + ...resolvedElementFields, + uploaded: false, + error: 'CDP DOM.getDocument returned no root node', + }; + } + + const queryResult = (await cdp.sendCommand('DOM.querySelector', { + nodeId: doc.root.nodeId, + selector: element.selector, + })) as { nodeId?: number }; + + if (!queryResult || !queryResult.nodeId) { + return { + success: false, + ...resolvedElementFields, + uploaded: false, + staleElement: true, + error: `Selector "${element.selector}" no longer resolves to a DOM node (element became stale).`, + }; + } + + await cdp.sendCommand('DOM.setFileInputFiles', { + nodeId: queryResult.nodeId, + files: [filePath], + }); + + console.log( + `βœ… [ElementUpload] DOM.setFileInputFiles succeeded for ${elementId} (${filePath})`, + ); + + return { + success: true, + ...resolvedElementFields, + uploaded: true, + }; + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + console.error(`❌ [ElementUpload] failed: ${message}`); + return { + success: false, + ...resolvedElementFields, + uploaded: false, + error: message, + }; + } +} + /** * Export element actions module */ @@ -4105,4 +4218,5 @@ export const elementActions = { performElementScroll, performKeyboardInput, performElementSelect, + performElementUpload, }; diff --git a/extension/src/commands/highlight-detection.injected.js b/extension/src/commands/highlight-detection.injected.js index c800fff..ad31ef5 100644 --- a/extension/src/commands/highlight-detection.injected.js +++ b/extension/src/commands/highlight-detection.injected.js @@ -5,6 +5,7 @@ const HIGHLIGHT_TYPE_PRIORITY = { scrollable: 3, draggable: 4, droppable: 5, + uploadable: 6, }; const HIGHLIGHT_SIGNAL_SCORE = { @@ -16,6 +17,7 @@ const HIGHLIGHT_SIGNAL_SCORE = { inputable: 360, selectable: 340, scrollable: 220, + uploadable: 360, }; const POINTER_ROLE_SET = new Set([ @@ -812,6 +814,69 @@ function isSelectableCandidate(el) { return !isDisabledForDetection(el) && el.tagName.toLowerCase() === 'select'; } +/** + * Match regardless of visibility. File inputs are almost + * always hidden (display:none / size:0) behind a styled