diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index 155bc87..3b0f24b 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -37,6 +37,7 @@ import { performElementSwipe, performElementDragAndDrop, performElementSetSlider, + performElementUpload, performKeyboardInput, performElementSelect, replayHoverState, @@ -295,6 +296,7 @@ const IN_PAGE_HIGHLIGHT_COLORS: Record = selectable: { border: '#FF6B6B', bg: 'rgba(255,107,107,0.7)' }, draggable: { border: '#FF6600', bg: 'rgba(255,102,0,0.7)' }, droppable: { border: '#339966', bg: 'rgba(51,153,102,0.7)' }, + uploadable: { border: '#AA66FF', bg: 'rgba(170,102,255,0.7)' }, any: { border: '#00CCCC', bg: 'rgba(0,204,204,0.7)' }, }; @@ -307,7 +309,10 @@ function buildInPageHighlightScript(elements: InteractiveElement[]): string { IN_PAGE_HIGHLIGHT_COLORS[el.type] || IN_PAGE_HIGHLIGHT_COLORS.clickable; return { id: el.id, - selector: el.selector, + // The overlay script renders the box on `selector`. For uploadable + // file inputs that are display:none, the visible anchor's selector + // lets the overlay land on something the user can actually see. + selector: el.overlaySelector || el.selector, borderColor: colors.border, bgColor: colors.bg, labelPos: el.labelPosition || 'above', @@ -578,7 +583,11 @@ async function captureHighlightedPageState( : ''; const detectedViewport = detectionResult.result.value.viewport || {}; const layoutStability = detectionResult.result.value.layoutStability; + const inPagePerf = detectionResult.result.value._perf || {}; const highlightTraceStart = Date.now(); + let paginationMs = 0; + let screenshotMs = 0; + let consistencyMs = 0; const detectedViewportWidth = typeof detectedViewport.width === 'number' ? 
detectedViewport.width : 0; const detectedViewportHeight = @@ -651,8 +660,9 @@ async function captureHighlightedPageState( console.log( `πŸ“„ [${logLabel}] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`, ); + paginationMs = Date.now() - paginationBuildStart; console.log( - `⏱️ [HighlightTrace] background pagination build-pages=${Date.now() - paginationBuildStart}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, + `⏱️ [HighlightTrace] background pagination build-pages=${paginationMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, ); } @@ -697,9 +707,8 @@ async function captureHighlightedPageState( console.log( `πŸ“Έ [${logLabel}] Screenshot captured (with in-page highlights), size: ${screenshotResult.imageData.length} bytes`, ); - console.log( - `⏱️ [HighlightTrace] background screenshot ${Date.now() - screenshotStart}ms`, - ); + screenshotMs = Date.now() - screenshotStart; + console.log(`⏱️ [HighlightTrace] background screenshot ${screenshotMs}ms`); // Apply bboxes returned from the highlight injection script const preCaptureData = screenshotResult.preCaptureResult; @@ -761,8 +770,9 @@ async function captureHighlightedPageState( })), currentConsistencySamples, ); + consistencyMs = Date.now() - consistencyCheckStart; console.log( - `⏱️ [HighlightTrace] background consistency-check ${Date.now() - consistencyCheckStart}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, + `⏱️ [HighlightTrace] background consistency-check ${consistencyMs}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, 
missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, ); const repeatedDrift = isRepeatedHighlightDrift( highlightConsistency, @@ -836,6 +846,15 @@ async function captureHighlightedPageState( page: currentPage, pageState, readinessReasons, + _perf: { + scan_ms: + typeof inPagePerf.scan_ms === 'number' ? inPagePerf.scan_ms : 0, + scan_stats: inPagePerf.scan_stats || {}, + scan_times: inPagePerf.scan_times || {}, + pagination_ms: paginationMs, + screenshot_ms: screenshotMs, + consistency_ms: consistencyMs, + }, ...buildScreenshotPayload(compressedScreenshotResult), }; } @@ -953,6 +972,7 @@ function isHeavyBrowserCommand(data: any): boolean { case 'set_slider_value': case 'keyboard_input': case 'select_element': + case 'upload_file': case 'handle_dialog': return true; case 'tab': @@ -2328,6 +2348,38 @@ async function handleCommand(command: Command): Promise { }; } + case 'upload_file': { + if (!command.conversation_id) + throw new Error('conversation_id required'); + const uploadTabId = command.tab_id; + if (uploadTabId === undefined || uploadTabId === null) + throw new Error('tab_id is required'); + if (!command.file_path || typeof command.file_path !== 'string') + throw new Error('file_path is required for upload_file'); + + const uploadResult = await performElementUpload( + command.conversation_id, + command.element_id, + uploadTabId, + command.file_path, + ); + const uploadPageState = await captureDefaultHighlightedPageState({ + tabId: uploadTabId, + conversationId: command.conversation_id, + logLabel: 'UploadFile', + }); + + return { + success: uploadResult.success, + data: { + ...uploadResult, + ...uploadPageState, + }, + error: uploadResult.error, + timestamp: Date.now(), + }; + } + case 'keyboard_input': { if (!command.conversation_id) throw new Error('conversation_id 
/**
 * Result of a file upload performed by `performElementUpload`.
 *
 * Extends the shared `ElementActionResult` with upload-specific outcome flags.
 */
export interface UploadResult extends ElementActionResult {
  /** True only when the CDP `DOM.setFileInputFiles` call completed without throwing. */
  uploaded: boolean;
  /**
   * True when the cached selector no longer matched any DOM node.
   * Explicitly false when the element id was not found in the element cache;
   * other failure paths leave it unset.
   */
  staleElement?: boolean;
  /** Human-readable failure reason; absent on success. */
  error?: string;
}
/**
 * Attach a local file (by absolute path on the host) to an
 * `<input type="file">` element via CDP `DOM.setFileInputFiles`.
 *
 * This bypasses the native OS file picker: clicking the input would open the
 * picker in front of the user, which the agent cannot drive.
 *
 * Flow: look the element up in the element cache, resolve its cached CSS
 * selector to a CDP `nodeId` (`DOM.getDocument` + `DOM.querySelector`), then
 * call `DOM.setFileInputFiles` with the single path.
 *
 * NOTE(review): the original comment states that the server validates the
 * path before dispatching; this function only rejects a missing/empty path.
 *
 * @param conversationId Conversation whose element cache holds `elementId`.
 * @param elementId Id of the cached element to upload into.
 * @param tabId Tab the CDP commands are sent to.
 * @param filePath Path handed to `DOM.setFileInputFiles` as the only file.
 * @returns An `UploadResult`. Invalid arguments, a cache miss, a stale
 *   selector, and CDP errors are all reported as `success: false` with an
 *   `error` message rather than thrown.
 */
export async function performElementUpload(
  conversationId: string,
  elementId: string,
  tabId: number,
  filePath: string,
): Promise<UploadResult> {
  console.log(
    `📎 [ElementUpload] Uploading "${filePath}" to element ${elementId} on tab ${tabId}`,
  );

  // Reject unusable arguments up front. The `upload_file` command handler
  // checks `file_path` but not `element_id`, and this function is also
  // exported for direct use, so a missing id would otherwise surface as a
  // misleading cache-miss message.
  if (typeof elementId !== 'string' || elementId.length === 0) {
    console.log('❌ [ElementUpload] element_id is missing');
    return {
      success: false,
      ...buildResolvedElementResultFields(elementId, elementId),
      uploaded: false,
      staleElement: false,
      error: 'element_id is required for upload_file',
    };
  }
  if (typeof filePath !== 'string' || filePath.length === 0) {
    console.log('❌ [ElementUpload] file_path is missing');
    return {
      success: false,
      ...buildResolvedElementResultFields(elementId, elementId),
      uploaded: false,
      staleElement: false,
      error: 'file_path is required for upload_file',
    };
  }

  const cachedElement = elementCache.getElementById(
    conversationId,
    tabId,
    elementId,
  );
  if (!cachedElement) {
    console.log(`❌ [ElementUpload] Element ${elementId} not found in cache`);
    return {
      success: false,
      ...buildResolvedElementResultFields(elementId, elementId),
      uploaded: false,
      staleElement: false,
      error: buildElementCacheMissMessage({
        conversationId,
        tabId,
        elementId,
      }),
    };
  }

  const element = cachedElement.element;
  const resolvedElementFields = buildResolvedElementResultFields(
    cachedElement.requestedElementId,
    cachedElement.resolvedElementId,
  );
  const cdp = new CdpCommander(tabId);

  try {
    // Resolve selector -> CDP nodeId. DOM.getDocument returns the document
    // root node; DOM.querySelector is scoped to that root and accepts any CSS
    // selector. A nodeId of 0 indicates no match (selector went stale).
    const doc = (await cdp.sendCommand('DOM.getDocument', { depth: 0 })) as {
      root?: { nodeId: number };
    };
    if (!doc || !doc.root || typeof doc.root.nodeId !== 'number') {
      return {
        success: false,
        ...resolvedElementFields,
        uploaded: false,
        error: 'CDP DOM.getDocument returned no root node',
      };
    }

    const queryResult = (await cdp.sendCommand('DOM.querySelector', {
      nodeId: doc.root.nodeId,
      selector: element.selector,
    })) as { nodeId?: number };

    // Covers both a missing nodeId and the explicit "no match" value 0.
    if (!queryResult || !queryResult.nodeId) {
      return {
        success: false,
        ...resolvedElementFields,
        uploaded: false,
        staleElement: true,
        error: `Selector "${element.selector}" no longer resolves to a DOM node (element became stale).`,
      };
    }

    await cdp.sendCommand('DOM.setFileInputFiles', {
      nodeId: queryResult.nodeId,
      files: [filePath],
    });

    console.log(
      `✅ [ElementUpload] DOM.setFileInputFiles succeeded for ${elementId} (${filePath})`,
    );

    return {
      success: true,
      ...resolvedElementFields,
      uploaded: true,
    };
  } catch (error) {
    // Any CDP failure is converted into a structured failure result.
    const message = error instanceof Error ? error.message : String(error);
    console.error(`❌ [ElementUpload] failed: ${message}`);
    return {
      success: false,
      ...resolvedElementFields,
      uploaded: false,
      error: message,
    };
  }
}
// Tag names that isScanSkippableTag() reports as skippable.
// NOTE(review): presumably consulted by the candidate scan to skip elements
// that can never be interactive; the call site is outside this view.
const SCAN_NON_INTERACTIVE_TAGS = new Set([
  'script',
  'style',
  'link',
  'meta',
  'head',
  'title',
  'noscript',
  'br',
  'hr',
  'source',
  'track',
  'template',
  'param',
  'col',
  'colgroup',
]);

// Returns true when `el` has a tagName listed in SCAN_NON_INTERACTIVE_TAGS
// (compared lower-cased); false for null or tagName-less values.
function isScanSkippableTag(el) {
  if (!el || !el.tagName) return false;
  return SCAN_NON_INTERACTIVE_TAGS.has(el.tagName.toLowerCase());
}

// Per-scan memoization caches for pure-function classifiers that get hit many
// times for the same element during the resolve phase. They are created by
// withScanLayoutCache() at the start of a scan and reset to null when it
// ends, so nothing leaks outside it. WeakMap so any GC'd nodes drop out
// automatically.
let _scanSemanticSignalCache = null;
let _scanClickableCandidateCache = null;
let _scanBaseClickableSignalCache = null;
let _scanTextContentCache = null;
let _scanSearchTextCache = null;
let _scanExplicitAncestorCache = null;

/**
 * Run `fn` with layout reads memoized for the duration of the call.
 *
 * While `fn` runs, `getBoundingClientRect`, `window.getComputedStyle`
 * (without a pseudo-element argument) and `Document.prototype.elementsFromPoint`
 * are replaced by caching wrappers, and the `_scan*Cache` WeakMaps are live.
 * Everything is restored in a `finally`, also when `fn` throws.
 *
 * The original comment's rationale: within one synchronous Runtime.evaluate
 * task no page JS runs concurrently, so the values cannot change mid-scan.
 *
 * elementsFromPoint results are keyed per document by the coordinates rounded
 * to whole pixels, so two probes within the same rounded pixel share a result.
 *
 * @param fn Synchronous callback performing the scan.
 * @returns Whatever `fn` returns.
 */
function withScanLayoutCache(fn) {
  const rectCache = new WeakMap();
  const styleCache = new WeakMap();
  // document -> Map("x:y" -> element stack)
  const pointCacheByDocument = new WeakMap();
  // Undo callbacks, run in reverse order of patching.
  const undoStack = [];

  // Replace target[name] with makeReplacement(original) and remember how to
  // undo it. If the method was only inherited, undo deletes the own property
  // we created instead of assigning the original back, so no own property is
  // left shadowing the prototype chain after the scan.
  function patchMethod(target, name, makeReplacement) {
    const wasOwn = Object.prototype.hasOwnProperty.call(target, name);
    const original = target[name];
    target[name] = makeReplacement(original);
    undoStack.push(function undo() {
      if (wasOwn) {
        target[name] = original;
      } else {
        Reflect.deleteProperty(target, name);
      }
    });
  }

  // Caching wrapper bound to the specific original it replaces.
  function makeCachedRect(original) {
    return function cachedRect() {
      let rect = rectCache.get(this);
      if (rect === undefined) {
        rect = original.call(this);
        rectCache.set(this, rect);
      }
      return rect;
    };
  }

  try {
    // Patching happens inside the try so a failure part-way through still
    // unwinds the patches that were already applied.
    _scanSemanticSignalCache = new WeakMap();
    _scanClickableCandidateCache = new WeakMap();
    _scanBaseClickableSignalCache = new WeakMap();
    _scanTextContentCache = new WeakMap();
    _scanSearchTextCache = new WeakMap();
    _scanExplicitAncestorCache = new WeakMap();

    patchMethod(Element.prototype, 'getBoundingClientRect', makeCachedRect);

    // SVG elements normally inherit the (now patched) Element method. Only
    // patch the SVG prototype when it really defines its own override.
    if (
      typeof SVGGraphicsElement !== 'undefined' &&
      Object.prototype.hasOwnProperty.call(
        SVGGraphicsElement.prototype,
        'getBoundingClientRect',
      )
    ) {
      patchMethod(
        SVGGraphicsElement.prototype,
        'getBoundingClientRect',
        makeCachedRect,
      );
    }

    patchMethod(window, 'getComputedStyle', function (original) {
      return function cachedGetComputedStyle(el, pseudo) {
        // Pseudo-element styles are not cached.
        if (pseudo) return original.call(window, el, pseudo);
        let style = styleCache.get(el);
        if (style === undefined) {
          style = original.call(window, el);
          styleCache.set(el, style);
        }
        return style;
      };
    });

    // Patch Document.prototype rather than the document instance so we don't
    // leave an own-property shadowing the prototype after the scan finishes.
    if (
      typeof Document !== 'undefined' &&
      typeof Document.prototype.elementsFromPoint === 'function'
    ) {
      patchMethod(Document.prototype, 'elementsFromPoint', function (original) {
        return function cachedElementsFromPoint(x, y) {
          const key = Math.round(x) + ':' + Math.round(y);
          let perDocument = pointCacheByDocument.get(this);
          let stack = perDocument ? perDocument.get(key) : undefined;
          if (stack === undefined) {
            // Call the original first so an illegal receiver throws before
            // anything is stored.
            stack = original.call(this, x, y);
            if (!perDocument) {
              perDocument = new Map();
              pointCacheByDocument.set(this, perDocument);
            }
            perDocument.set(key, stack);
          }
          return stack;
        };
      });
    }

    return fn();
  } finally {
    while (undoStack.length > 0) {
      undoStack.pop()();
    }
    _scanSemanticSignalCache = null;
    _scanClickableCandidateCache = null;
    _scanBaseClickableSignalCache = null;
    _scanTextContentCache = null;
    _scanSearchTextCache = null;
    _scanExplicitAncestorCache = null;
  }
}
r); + return r; +} + +function getElementTextForDetectionImpl(el) { if (el instanceof HTMLInputElement) { const inputType = (el.type || '').toLowerCase(); if ( @@ -314,10 +449,22 @@ function getElementTextForDetection(el) { } } + // textContent on a deep node walks the entire subtree of text nodes β€” for + // a table row with hundreds of descendants this is expensive enough to + // dominate the resolve phase. Cache so each candidate pays at most once. return normalizeWhitespace(el.textContent || '', 240); } function getElementSearchText(el) { + if (_scanSearchTextCache && _scanSearchTextCache.has(el)) { + return _scanSearchTextCache.get(el); + } + const r = getElementSearchTextImpl(el); + if (_scanSearchTextCache) _scanSearchTextCache.set(el, r); + return r; +} + +function getElementSearchTextImpl(el) { const tokens = [ el.tagName.toLowerCase(), ...getAttributeTextTokens(el, [ @@ -478,6 +625,15 @@ function hasPointerCursor(el) { } function getBaseClickableSignal(el) { + if (_scanBaseClickableSignalCache && _scanBaseClickableSignalCache.has(el)) { + return _scanBaseClickableSignalCache.get(el); + } + const r = getBaseClickableSignalImpl(el); + if (_scanBaseClickableSignalCache) _scanBaseClickableSignalCache.set(el, r); + return r; +} + +function getBaseClickableSignalImpl(el) { const semanticSignal = getSemanticClickableSignal(el); if (semanticSignal) { return semanticSignal; @@ -571,6 +727,15 @@ function getControlAffinityScore(el) { } function getSemanticClickableSignal(el) { + if (_scanSemanticSignalCache && _scanSemanticSignalCache.has(el)) { + return _scanSemanticSignalCache.get(el); + } + const r = getSemanticClickableSignalImpl(el); + if (_scanSemanticSignalCache) _scanSemanticSignalCache.set(el, r); + return r; +} + +function getSemanticClickableSignalImpl(el) { const tag = el.tagName.toLowerCase(); const role = (el.getAttribute('role') || '').toLowerCase(); @@ -767,18 +932,30 @@ function countDirectClickableChildren(el) { } function 
// Reports whether any ancestor of `el` (starting at its parent and stopping
// before document.body) yields a 'semantic' or 'attribute' signal from
// getSemanticClickableSignal().
//
// Only the answer for `el` itself is memoized in _scanExplicitAncestorCache.
// Answers are deliberately not stored for the ancestors visited along the
// way; the full walk per unique element relies on getSemanticClickableSignal
// being cached upstream.
function hasExplicitClickableAncestor(el) {
  if (_scanExplicitAncestorCache && _scanExplicitAncestorCache.has(el)) {
    return _scanExplicitAncestorCache.get(el);
  }

  let found = false;
  for (
    let node = el.parentElement;
    node && node !== document.body;
    node = node.parentElement
  ) {
    const kind = getSemanticClickableSignal(node);
    if (kind === 'semantic' || kind === 'attribute') {
      found = true;
      break;
    }
  }

  if (_scanExplicitAncestorCache) {
    _scanExplicitAncestorCache.set(el, found);
  }
  return found;
}