diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts index b35132c..3b0f24b 100644 --- a/extension/src/background/index.ts +++ b/extension/src/background/index.ts @@ -583,7 +583,11 @@ async function captureHighlightedPageState( : ''; const detectedViewport = detectionResult.result.value.viewport || {}; const layoutStability = detectionResult.result.value.layoutStability; + const inPagePerf = detectionResult.result.value._perf || {}; const highlightTraceStart = Date.now(); + let paginationMs = 0; + let screenshotMs = 0; + let consistencyMs = 0; const detectedViewportWidth = typeof detectedViewport.width === 'number' ? detectedViewport.width : 0; const detectedViewportHeight = @@ -656,8 +660,9 @@ async function captureHighlightedPageState( console.log( `📄 [${logLabel}] Page ${page}/${totalPages}, showing ${paginatedElements.length} of ${filteredElements.length} elements`, ); + paginationMs = Date.now() - paginationBuildStart; console.log( - `⏱️ [HighlightTrace] background pagination build-pages=${Date.now() - paginationBuildStart}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, + `⏱️ [HighlightTrace] background pagination build-pages=${paginationMs}ms (page=${page}, viewport=${detectedViewportWidth}x${detectedViewportHeight})`, ); } @@ -702,9 +707,8 @@ async function captureHighlightedPageState( console.log( `📸 [${logLabel}] Screenshot captured (with in-page highlights), size: ${screenshotResult.imageData.length} bytes`, ); - console.log( - `⏱️ [HighlightTrace] background screenshot ${Date.now() - screenshotStart}ms`, - ); + screenshotMs = Date.now() - screenshotStart; + console.log(`⏱️ [HighlightTrace] background screenshot ${screenshotMs}ms`); // Apply bboxes returned from the highlight injection script const preCaptureData = screenshotResult.preCaptureResult; @@ -766,8 +770,9 @@ async function captureHighlightedPageState( })), currentConsistencySamples, ); + consistencyMs = Date.now() - 
consistencyCheckStart; console.log( - `⏱️ [HighlightTrace] background consistency-check ${Date.now() - consistencyCheckStart}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, + `⏱️ [HighlightTrace] background consistency-check ${consistencyMs}ms (checked=${highlightConsistency.checkedCount}, matched=${highlightConsistency.matchedCount}, missing=${highlightConsistency.missingCount}, shifted=${highlightConsistency.shiftedCount}, maxCenterShift=${highlightConsistency.maxCenterShift}, maxSizeDelta=${highlightConsistency.maxSizeDelta}, retry=${highlightConsistency.shouldRetry})`, ); const repeatedDrift = isRepeatedHighlightDrift( highlightConsistency, @@ -841,6 +846,15 @@ async function captureHighlightedPageState( page: currentPage, pageState, readinessReasons, + _perf: { + scan_ms: + typeof inPagePerf.scan_ms === 'number' ? inPagePerf.scan_ms : 0, + scan_stats: inPagePerf.scan_stats || {}, + scan_times: inPagePerf.scan_times || {}, + pagination_ms: paginationMs, + screenshot_ms: screenshotMs, + consistency_ms: consistencyMs, + }, ...buildScreenshotPayload(compressedScreenshotResult), }; } diff --git a/extension/src/commands/highlight-detection.injected.js b/extension/src/commands/highlight-detection.injected.js index 72f3e8e..4140016 100644 --- a/extension/src/commands/highlight-detection.injected.js +++ b/extension/src/commands/highlight-detection.injected.js @@ -77,6 +77,130 @@ function hasCallableMethod(value, methodNames) { ); } +// Layout reads (getBoundingClientRect, getComputedStyle) and elementsFromPoint +// are the single biggest cost in collectHighlightCandidates: every visibility +// predicate re-reads them for the same element. 
Within one synchronous +// Runtime.evaluate task no page JS runs concurrently, so the values cannot +// change mid-scan. We monkey-patch the prototypes for the duration of one +// scan, populate a per-element WeakMap, and restore originals at the end. +const SCAN_NON_INTERACTIVE_TAGS = new Set([ + 'script', + 'style', + 'link', + 'meta', + 'head', + 'title', + 'noscript', + 'br', + 'hr', + 'source', + 'track', + 'template', + 'param', + 'col', + 'colgroup', +]); + +function isScanSkippableTag(el) { + if (!el || !el.tagName) return false; + return SCAN_NON_INTERACTIVE_TAGS.has(el.tagName.toLowerCase()); +} + +// Per-scan memoization caches for pure-function classifiers that get hit many +// times for the same element during the resolve phase (each candidate walks +// up to 5 ancestors, each ancestor calls hasExplicitClickableAncestor which +// walks ALL ancestors, etc.). Reset at the start of each scan, leak nothing +// outside it. WeakMap so any GC'd nodes drop out automatically. +let _scanSemanticSignalCache = null; +let _scanClickableCandidateCache = null; +let _scanBaseClickableSignalCache = null; +let _scanTextContentCache = null; +let _scanSearchTextCache = null; +let _scanExplicitAncestorCache = null; + +function withScanLayoutCache(fn) { + const rectCache = new WeakMap(); + const styleCache = new WeakMap(); + // elementsFromPoint dedup keyed by rounded "x:y" + const efpCache = new Map(); + _scanSemanticSignalCache = new WeakMap(); + _scanClickableCandidateCache = new WeakMap(); + _scanBaseClickableSignalCache = new WeakMap(); + _scanTextContentCache = new WeakMap(); + _scanSearchTextCache = new WeakMap(); + _scanExplicitAncestorCache = new WeakMap(); + + const origElementRect = Element.prototype.getBoundingClientRect; + const SVGGraphicsProto = + typeof SVGGraphicsElement !== 'undefined' + ? 
SVGGraphicsElement.prototype + : null; + const origSVGRect = + SVGGraphicsProto && SVGGraphicsProto.getBoundingClientRect; + const origGetComputedStyle = window.getComputedStyle; + // Patch Document.prototype rather than the document instance so we don't + // leave an own-property shadowing the prototype after the scan finishes. + const DocumentProto = + typeof Document !== 'undefined' ? Document.prototype : null; + const origElementsFromPoint = + DocumentProto && DocumentProto.elementsFromPoint; + + function patchedRect() { + let r = rectCache.get(this); + if (r === undefined) { + r = origElementRect.call(this); + rectCache.set(this, r); + } + return r; + } + + Element.prototype.getBoundingClientRect = patchedRect; + if (SVGGraphicsProto && origSVGRect) { + SVGGraphicsProto.getBoundingClientRect = patchedRect; + } + + window.getComputedStyle = function (el, pseudo) { + if (pseudo) return origGetComputedStyle.call(window, el, pseudo); + let s = styleCache.get(el); + if (s === undefined) { + s = origGetComputedStyle.call(window, el); + styleCache.set(el, s); + } + return s; + }; + + if (DocumentProto && origElementsFromPoint) { + DocumentProto.elementsFromPoint = function (x, y) { + const key = Math.round(x) + ':' + Math.round(y); + let stack = efpCache.get(key); + if (stack === undefined) { + stack = origElementsFromPoint.call(this, x, y); + efpCache.set(key, stack); + } + return stack; + }; + } + + try { + return fn(); + } finally { + Element.prototype.getBoundingClientRect = origElementRect; + if (SVGGraphicsProto && origSVGRect) { + SVGGraphicsProto.getBoundingClientRect = origSVGRect; + } + window.getComputedStyle = origGetComputedStyle; + if (DocumentProto && origElementsFromPoint) { + DocumentProto.elementsFromPoint = origElementsFromPoint; + } + _scanSemanticSignalCache = null; + _scanClickableCandidateCache = null; + _scanBaseClickableSignalCache = null; + _scanTextContentCache = null; + _scanSearchTextCache = null; + _scanExplicitAncestorCache = null; + } 
+} + function createHighlightTrace() { const traceStart = performance.now(); @@ -305,6 +429,15 @@ function getSwipeMarkerText(el) { } function getElementTextForDetection(el) { + if (_scanTextContentCache && _scanTextContentCache.has(el)) { + return _scanTextContentCache.get(el); + } + const r = getElementTextForDetectionImpl(el); + if (_scanTextContentCache) _scanTextContentCache.set(el, r); + return r; +} + +function getElementTextForDetectionImpl(el) { if (el instanceof HTMLInputElement) { const inputType = (el.type || '').toLowerCase(); if ( @@ -316,10 +449,22 @@ function getElementTextForDetection(el) { } } + // textContent on a deep node walks the entire subtree of text nodes — for + // a table row with hundreds of descendants this is expensive enough to + // dominate the resolve phase. Cache so each candidate pays at most once. return normalizeWhitespace(el.textContent || '', 240); } function getElementSearchText(el) { + if (_scanSearchTextCache && _scanSearchTextCache.has(el)) { + return _scanSearchTextCache.get(el); + } + const r = getElementSearchTextImpl(el); + if (_scanSearchTextCache) _scanSearchTextCache.set(el, r); + return r; +} + +function getElementSearchTextImpl(el) { const tokens = [ el.tagName.toLowerCase(), ...getAttributeTextTokens(el, [ @@ -480,6 +625,15 @@ function hasPointerCursor(el) { } function getBaseClickableSignal(el) { + if (_scanBaseClickableSignalCache && _scanBaseClickableSignalCache.has(el)) { + return _scanBaseClickableSignalCache.get(el); + } + const r = getBaseClickableSignalImpl(el); + if (_scanBaseClickableSignalCache) _scanBaseClickableSignalCache.set(el, r); + return r; +} + +function getBaseClickableSignalImpl(el) { const semanticSignal = getSemanticClickableSignal(el); if (semanticSignal) { return semanticSignal; @@ -573,6 +727,15 @@ function getControlAffinityScore(el) { } function getSemanticClickableSignal(el) { + if (_scanSemanticSignalCache && _scanSemanticSignalCache.has(el)) { + return 
_scanSemanticSignalCache.get(el); + } + const r = getSemanticClickableSignalImpl(el); + if (_scanSemanticSignalCache) _scanSemanticSignalCache.set(el, r); + return r; +} + +function getSemanticClickableSignalImpl(el) { const tag = el.tagName.toLowerCase(); const role = (el.getAttribute('role') || '').toLowerCase(); @@ -769,18 +932,30 @@ function countDirectClickableChildren(el) { } function hasExplicitClickableAncestor(el) { + if (_scanExplicitAncestorCache && _scanExplicitAncestorCache.has(el)) { + return _scanExplicitAncestorCache.get(el); + } + // Per-call top-level memoization only. A previous version tried to + // walk-and-memoize each visited ancestor too, but that's incorrect — + // a node's own `hasExplicitClickableAncestor` is about ITS ancestors, + // not about its own signal, and it's also influenced by its own signal + // when answering the same question for *its* descendants. Doing the full + // walk per unique element (with getSemanticClickableSignal cached) is + // already cheap enough thanks to the upstream caches. 
let current = el.parentElement; - + let answer = false; while (current && current !== document.body) { const signal = getSemanticClickableSignal(current); if (signal === 'semantic' || signal === 'attribute') { - return true; + answer = true; + break; } - current = current.parentElement; } - - return false; + if (_scanExplicitAncestorCache) { + _scanExplicitAncestorCache.set(el, answer); + } + return answer; } function isInputableCandidate(el) { @@ -911,6 +1086,15 @@ function hasStructuredInteractiveDescendant(el) { } function isClickableCandidate(el) { + if (_scanClickableCandidateCache && _scanClickableCandidateCache.has(el)) { + return _scanClickableCandidateCache.get(el); + } + const r = isClickableCandidateImpl(el); + if (_scanClickableCandidateCache) _scanClickableCandidateCache.set(el, r); + return r; +} + +function isClickableCandidateImpl(el) { if (isDisabledForDetection(el)) { return null; } @@ -2473,6 +2657,12 @@ function collectUploadableCandidates(trace) { } function collectHighlightCandidates(config, trace, layoutStability) { + return withScanLayoutCache(() => + collectHighlightCandidatesImpl(config, trace, layoutStability), + ); +} + +function collectHighlightCandidatesImpl(config, trace, layoutStability) { const activeTopLayerRoot = getActiveTopLayerRoot(); const registry = new Map(); @@ -2519,6 +2709,27 @@ function collectHighlightCandidates(config, trace, layoutStability) { ); let scannedCount = 0; + // Per-phase reject counters and timings — gated behind the trace, helps + // identify where the scan budget is spent without per-element console spam. 
+ const phaseStats = { + tagSkip: 0, + notInViewport: 0, + notVisible: 0, + scrollParentClipped: 0, + notInActiveTopLayer: 0, + hitTestOccluded: 0, + notResolvable: 0, + matched: 0, + }; + const phaseTimes = { + tag: 0, + viewport: 0, + visible: 0, + scrollParent: 0, + topLayer: 0, + hitTest: 0, + resolve: 0, + }; for (const element of allElements) { scannedCount += 1; @@ -2529,34 +2740,65 @@ function collectHighlightCandidates(config, trace, layoutStability) { ); } - if (!isElementInViewportForDetection(element)) { + let t = performance.now(); + if (isScanSkippableTag(element)) { + phaseStats.tagSkip += 1; + phaseTimes.tag += performance.now() - t; continue; } + phaseTimes.tag += performance.now() - t; - if (!isElementVisibleForDetection(element)) { + t = performance.now(); + const inViewport = isElementInViewportForDetection(element); + phaseTimes.viewport += performance.now() - t; + if (!inViewport) { + phaseStats.notInViewport += 1; continue; } - if (!isElementVisibleInScrollParent(element)) { + t = performance.now(); + const visible = isElementVisibleForDetection(element); + phaseTimes.visible += performance.now() - t; + if (!visible) { + phaseStats.notVisible += 1; + continue; + } + + t = performance.now(); + const scrollOk = isElementVisibleInScrollParent(element); + phaseTimes.scrollParent += performance.now() - t; + if (!scrollOk) { + phaseStats.scrollParentClipped += 1; continue; } - if (!isElementInActiveTopLayer(element, activeTopLayerRoot)) { + t = performance.now(); + const topLayerOk = isElementInActiveTopLayer(element, activeTopLayerRoot); + phaseTimes.topLayer += performance.now() - t; + if (!topLayerOk) { + phaseStats.notInActiveTopLayer += 1; continue; } + t = performance.now(); const hitTestVisibility = getElementHitTestVisibility(element); + phaseTimes.hitTest += performance.now() - t; if (!hitTestVisibility.visible) { + phaseStats.hitTestOccluded += 1; continue; } + t = performance.now(); const resolvedCandidate = resolveElementCandidate( 
element, config.elementType, ); + phaseTimes.resolve += performance.now() - t; if (!resolvedCandidate) { + phaseStats.notResolvable += 1; continue; } + phaseStats.matched += 1; const candidate = { element: resolvedCandidate.element, @@ -2605,14 +2847,20 @@ function collectHighlightCandidates(config, trace, layoutStability) { return element; }); + const roundedTimes = {}; + for (const k of Object.keys(phaseTimes)) { + roundedTimes[k] = Math.round(phaseTimes[k]); + } trace( 'scan:done', - `processed=${scannedCount} matched=${elements.length} counts=${JSON.stringify(counts)}`, + `processed=${scannedCount} matched=${elements.length} counts=${JSON.stringify(counts)} reject=${JSON.stringify(phaseStats)} ms=${JSON.stringify(roundedTimes)}`, ); return { elements, counts, + _scan_stats: phaseStats, + _scan_times: roundedTimes, }; } @@ -2625,11 +2873,10 @@ async function runOpenBrowserHighlightDetection(config) { const layoutStability = evaluateReadinessSnapshot(trace); - const { elements, counts } = collectHighlightCandidates( - config, - trace, - layoutStability, - ); + const scanStart = performance.now(); + const scanResult = collectHighlightCandidates(config, trace, layoutStability); + const { elements, counts } = scanResult; + const scanMs = Math.round(performance.now() - scanStart); trace('return', `elements=${elements.length}`); return { @@ -2641,5 +2888,10 @@ async function runOpenBrowserHighlightDetection(config) { width: window.innerWidth, height: window.innerHeight, }, + _perf: { + scan_ms: scanMs, + scan_stats: scanResult._scan_stats || {}, + scan_times: scanResult._scan_times || {}, + }, }; } diff --git a/extension/src/utils/collision-detection.ts b/extension/src/utils/collision-detection.ts index 054abc2..a409c64 100644 --- a/extension/src/utils/collision-detection.ts +++ b/extension/src/utils/collision-detection.ts @@ -36,6 +36,108 @@ interface RemainingCandidate { element: InteractiveElement; } +// Coarse spatial grid used to skip O(N) scans of `selected` and 
`remaining` +// when checking collisions. Cell size is a heuristic — large enough that most +// label rects touch only a couple of cells, small enough that a typical +// query returns far fewer than the full set. +const SPATIAL_INDEX_CELL_PX = 96; + +class SelectedSpatialIndex { + private cells = new Map(); + + add(element: InteractiveElement): void { + const labelBBox = getLabelBBox( + element.bbox, + element.labelPosition ?? 'above', + element.id, + ); + const union = unionBBox(element.bbox, labelBBox); + this.forEachCell(union, (key) => { + let bucket = this.cells.get(key); + if (!bucket) { + bucket = []; + this.cells.set(key, bucket); + } + // Avoid duplicate registration when a single element straddles cells we + // visit out of order — the per-call dedup Set in queryNear handles dup + // results across cells. + if (bucket[bucket.length - 1] !== element) { + bucket.push(element); + } + }); + } + + // Returns elements whose registered union-rect lies in any cell touched by + // the query rect (inflated by clearance on each side). Includes elements + // whose registration cells are *adjacent* to the query rect — see + // `queryNear` callers, which already inflate the query rect with clearance. + queryNear(query: BBox): InteractiveElement[] { + const seen = new Set(); + const out: InteractiveElement[] = []; + this.forEachCell(query, (key) => { + const bucket = this.cells.get(key); + if (!bucket) return; + for (const el of bucket) { + if (!seen.has(el)) { + seen.add(el); + out.push(el); + } + } + }); + return out; + } + + private forEachCell(rect: BBox, fn: (key: number) => void): void { + // Real bboxes from getBoundingClientRect are always finite, but synthetic + // test inputs or future callers might pass NaN/Infinity. Without this + // guard Math.floor would yield NaN, the loop would skip, and we'd + // silently drop a registration — masking real collisions. 
+ if ( + !Number.isFinite(rect.x) || + !Number.isFinite(rect.y) || + !Number.isFinite(rect.width) || + !Number.isFinite(rect.height) + ) { + // Single sentinel cell so the registration is still discoverable. + fn(Number.MIN_SAFE_INTEGER); + return; + } + const minCx = Math.floor(rect.x / SPATIAL_INDEX_CELL_PX); + const maxCx = Math.floor( + (rect.x + Math.max(0, rect.width)) / SPATIAL_INDEX_CELL_PX, + ); + const minCy = Math.floor(rect.y / SPATIAL_INDEX_CELL_PX); + const maxCy = Math.floor( + (rect.y + Math.max(0, rect.height)) / SPATIAL_INDEX_CELL_PX, + ); + for (let cy = minCy; cy <= maxCy; cy++) { + for (let cx = minCx; cx <= maxCx; cx++) { + // Cantor-pair-ish key: cy gets the high bits, cx the low bits. + // Negative coords are uncommon for label rects but still encode safely + // because Math.floor preserves order under shift. + fn(cy * 100000 + cx); + } + } + } +} + +function unionBBox(a: BBox, b: BBox): BBox { + const x = Math.min(a.x, b.x); + const y = Math.min(a.y, b.y); + const xMax = Math.max(a.x + a.width, b.x + b.width); + const yMax = Math.max(a.y + a.height, b.y + b.height); + return { x, y, width: xMax - x, height: yMax - y }; +} + +function inflateBBox(rect: BBox, padding: number): BBox { + return { + x: rect.x - padding, + y: rect.y - padding, + width: rect.width + 2 * padding, + height: rect.height + 2 * padding, + }; +} + interface PlacementEvaluation { position: LabelPosition; blockedCandidateCount: number; @@ -302,12 +404,14 @@ function buildCollisionFreePages( while (remaining.length > 0) { const selected: InteractiveElement[] = []; + const selectedIndex = new SelectedSpatialIndex(); let pageRemaining = remaining; while (pageRemaining.length > 0) { const nextSelection = chooseNextCandidate( pageRemaining, selected, + selectedIndex, viewportWidth, viewportHeight, ); @@ -316,10 +420,12 @@ function buildCollisionFreePages( break; } - selected.push({ + const placed: InteractiveElement = { ...nextSelection.candidate.element, labelPosition: 
nextSelection.position, - }); + }; + selected.push(placed); + selectedIndex.add(placed); pageRemaining = pageRemaining.filter( (candidate) => candidate.sourceIndex !== nextSelection.candidate.sourceIndex, @@ -347,14 +453,16 @@ function tryBuildUniformPositionPage( viewportHeight?: number, ): InteractiveElement[] | null { const selected: InteractiveElement[] = []; + const index = new SelectedSpatialIndex(); for (const element of elements) { + const nearby = nearbySelectedFor(element, position, element.id, index); if ( !isPlacementFeasible( element, element.id, position, - selected, + nearby, viewportWidth, viewportHeight, ) @@ -362,10 +470,12 @@ function tryBuildUniformPositionPage( return null; } - selected.push({ + const placed: InteractiveElement = { ...element, labelPosition: position, - }); + }; + selected.push(placed); + index.add(placed); } return selected; @@ -374,6 +484,7 @@ function tryBuildUniformPositionPage( function chooseNextCandidate( remaining: RemainingCandidate[], selected: InteractiveElement[], + selectedIndex: SelectedSpatialIndex, viewportWidth?: number, viewportHeight?: number, ): (PlacementEvaluation & { candidate: RemainingCandidate }) | null { @@ -388,6 +499,7 @@ function chooseNextCandidate( candidate.element, candidate.element.id, selected, + selectedIndex, viewportWidth, viewportHeight, ); @@ -415,6 +527,7 @@ function chooseNextCandidate( constrainedCandidate.feasiblePositions, remaining, selected, + selectedIndex, viewportWidth, viewportHeight, ), @@ -426,6 +539,7 @@ function chooseLeastBlockingPlacement( feasiblePositions: LabelPosition[], remaining: RemainingCandidate[], selected: InteractiveElement[], + selectedIndex: SelectedSpatialIndex, viewportWidth?: number, viewportHeight?: number, ): PlacementEvaluation { @@ -435,31 +549,109 @@ function chooseLeastBlockingPlacement( ); let bestPlacement: PlacementEvaluation | null = null; - for (const position of feasiblePositions) { - const hypotheticalSelected = [ - ...selected, - { - 
...candidate.element, - labelPosition: position, - }, - ]; - let blockedCandidateCount = 0; - let totalFutureOptions = 0; - - futureCandidates.forEach((candidate) => { - const futureOptions = getFeasiblePositions( - candidate.element, - candidate.element.id, - hypotheticalSelected, - viewportWidth, - viewportHeight, + // Pre-compute each future candidate's baseline feasible positions against + // the current `selected` set. When we test a hypothetical placement of + // `candidate@position`, only future candidates whose bbox/label is + // geometrically near that placement can have their feasibility change. The + // rest keep their baseline feasibility — saving the O(|future|×4×|selected|) + // recomputation per position. + interface FutureBaseline { + candidate: RemainingCandidate; + elementUnion: BBox; // bbox ∪ all four label rects + feasibleCount: number; + totalLength: number; + } + const futureBaselines: FutureBaseline[] = futureCandidates.map((fc) => { + const baseline = getFeasiblePositions( + fc.element, + fc.element.id, + selected, + selectedIndex, + viewportWidth, + viewportHeight, + ); + let union = fc.element.bbox; + for (const pos of POSITION_PRIORITY) { + union = unionBBox( + union, + getLabelBBox(fc.element.bbox, pos, fc.element.id), ); + } + return { + candidate: fc, + elementUnion: union, + feasibleCount: baseline.length, + totalLength: baseline.length, + }; + }); + + const baselineBlockedCount = futureBaselines.reduce( + (acc, fb) => (fb.feasibleCount === 0 ? 
acc + 1 : acc), + 0, + ); + const baselineTotalOptions = futureBaselines.reduce( + (acc, fb) => acc + fb.totalLength, + 0, + ); + + for (const position of feasiblePositions) { + const hypotheticalElement: InteractiveElement = { + ...candidate.element, + labelPosition: position, + }; + const hypotheticalLabelBBox = getLabelBBox( + candidate.element.bbox, + position, + candidate.element.id, + ); + // Influence rect: anything whose elementUnion does NOT intersect this + // (inflated by clearance) cannot be affected by adding the hypothetical + // candidate. We only need to recompute for future candidates inside it. + const influenceRect = inflateBBox( + unionBBox(candidate.element.bbox, hypotheticalLabelBBox), + VISUAL_LABEL_CLEARANCE_PX, + ); - if (futureOptions.length === 0) { + let blockedCandidateCount = baselineBlockedCount; + let totalFutureOptions = baselineTotalOptions; + + for (const fb of futureBaselines) { + if (!bboxesIntersect(fb.elementUnion, influenceRect)) { + continue; + } + // Feasibility can change for this future candidate. Re-test against + // the spatially-near selected set plus the hypothetical candidate. + let updatedFeasibleLen = 0; + for (const pos of POSITION_PRIORITY) { + const nearby = nearbySelectedFor( + fb.candidate.element, + pos, + fb.candidate.element.id, + selectedIndex, + [hypotheticalElement], + ); + if ( + isPlacementFeasible( + fb.candidate.element, + fb.candidate.element.id, + pos, + nearby, + viewportWidth, + viewportHeight, + ) + ) { + updatedFeasibleLen++; + } + } + + // Adjust baseline aggregates for the delta on this single future. 
+ if (fb.feasibleCount === 0 && updatedFeasibleLen > 0) { + blockedCandidateCount--; + } else if (fb.feasibleCount > 0 && updatedFeasibleLen === 0) { blockedCandidateCount++; } - totalFutureOptions += futureOptions.length; - }); + totalFutureOptions += updatedFeasibleLen - fb.totalLength; + } if ( !bestPlacement || @@ -492,18 +684,22 @@ function getFeasiblePositions( element: InteractiveElement, labelText: string, selected: InteractiveElement[], + selectedIndex: SelectedSpatialIndex | null, viewportWidth?: number, viewportHeight?: number, ): LabelPosition[] { const feasiblePositions: LabelPosition[] = []; for (const position of POSITION_PRIORITY) { + const nearby = selectedIndex + ? nearbySelectedFor(element, position, labelText, selectedIndex) + : selected; if ( isPlacementFeasible( element, labelText, position, - selected, + nearby, viewportWidth, viewportHeight, ) @@ -515,6 +711,28 @@ function getFeasiblePositions( return feasiblePositions; } +// Returns the subset of `selected` that could plausibly collide with the +// candidate placement. The query rect is the union of the candidate's bbox +// and its label rect for the requested position, inflated by the visible +// clearance threshold. Optional `extras` are appended (e.g. a hypothetical +// candidate not yet inserted into the index). 
+function nearbySelectedFor( + element: InteractiveElement, + position: LabelPosition, + labelText: string, + index: SelectedSpatialIndex, + extras: InteractiveElement[] = [], +): InteractiveElement[] { + const labelBBox = getLabelBBox(element.bbox, position, labelText); + const query = inflateBBox( + unionBBox(element.bbox, labelBBox), + VISUAL_LABEL_CLEARANCE_PX, + ); + const near = index.queryNear(query); + if (extras.length === 0) return near; + return near.concat(extras); +} + function isPlacementFeasible( element: InteractiveElement, labelText: string, diff --git a/extension/vite.config.ts b/extension/vite.config.ts index bf660fc..3ebf0bd 100644 --- a/extension/vite.config.ts +++ b/extension/vite.config.ts @@ -122,16 +122,18 @@ const devReloadPlugin = () => { return; } - // Otherwise wait for the extension to connect (up to 10s) + // Otherwise wait for the extension to connect (up to 40s — covers a + // full chrome.alarms keepalive cycle when the MV3 service worker has + // been terminated by Chrome). console.log( '🔄 [DevReload] Build complete — waiting for extension to connect...', ); const timeout = setTimeout(() => { console.warn( - '🔄 [DevReload] No extension connected within 10s. Reload the extension manually once, then future `npm run dev` runs will auto-reload.', + '🔄 [DevReload] No extension connected within 40s. 
Reload the extension manually once, then future `npm run dev` runs will auto-reload.', ); process.exit(0); - }, 10_000); + }, 40_000); // Check periodically if a client has connected const poll = setInterval(() => { diff --git a/pyproject.toml b/pyproject.toml index dd933be..69ae578 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,5 +76,5 @@ override-dependencies = [ ] [tool.uv.sources] -openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "764fb87256d7bc20b3eccf82c8a4d241e6740d63" } -openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "764fb87256d7bc20b3eccf82c8a4d241e6740d63" } +openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "bd4cb296355c3d03dd411883e78527b1915fa8c4" } +openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "bd4cb296355c3d03dd411883e78527b1915fa8c4" } diff --git a/server/agent/context_image_window.py b/server/agent/context_image_window.py index c2da913..39f09cc 100644 --- a/server/agent/context_image_window.py +++ b/server/agent/context_image_window.py @@ -12,9 +12,16 @@ DEFAULT_CONTEXT_IMAGE_WINDOW = 3 -def get_context_image_window() -> int | None: +ROUTINE_REPLAY_CONTEXT_IMAGE_WINDOW = 1 + + +def get_context_image_window(routine_replay: bool = False) -> int | None: """Return the tool-image window passed to the SDK Agent. + Routine-replay conversations use a fixed window of 1: the SOP already + spells out each step, so a single most-recent screenshot is enough to + ground the next action and three-frame history would only pad context. + The default is to keep only the latest screenshot-bearing tool message. 
Environment variable semantics: - `-1`: disable SDK filtering entirely (`None`) @@ -22,6 +29,9 @@ def get_context_image_window() -> int | None: - `N >= 1`: keep the latest N screenshot-bearing tool messages """ + if routine_replay: + return ROUTINE_REPLAY_CONTEXT_IMAGE_WINDOW + raw_value = os.getenv(ENV_CONTEXT_IMAGE_WINDOW) if raw_value is None or raw_value.strip() == "": return DEFAULT_CONTEXT_IMAGE_WINDOW diff --git a/server/agent/manager.py b/server/agent/manager.py index cef99c0..7e87026 100644 --- a/server/agent/manager.py +++ b/server/agent/manager.py @@ -329,7 +329,9 @@ def _create_conversation_in_process( agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) - tool_image_window = get_context_image_window() + tool_image_window = get_context_image_window( + routine_replay=self._is_routine_replay_mode(mode) + ) condenser_llm = llm_instance.model_copy(update={"usage_id": "condenser"}) agent = Agent( llm=llm_instance, @@ -576,7 +578,9 @@ def get_or_create_conversation( agent_context = self._build_agent_context() llm_instance = self._create_llm_from_config(model, base_url, model_alias) tools = self._get_tools_for_model(model, model_alias) - tool_image_window = get_context_image_window() + tool_image_window = get_context_image_window( + routine_replay=self._is_routine_replay_mode(mode) + ) condenser_llm = llm_instance.model_copy(update={"usage_id": "condenser"}) agent = Agent( llm=llm_instance, diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py index 26b3f35..81feb97 100644 --- a/server/agent/tools/browser_executor.py +++ b/server/agent/tools/browser_executor.py @@ -105,6 +105,11 @@ def __init__(self): self.conversation_id = None # Pending confirmations per conversation for 2PC actions. self.pending_confirmations: Dict[str, Dict[str, Any]] = {} + # Most recent highlight result per conversation. 
Keyed by conversation_id, + # value is the list of element dicts returned by the last highlight call. + # Used in routine-replay mode to auto-confirm clicks/selects/keyboard_input + # when the target was just uniquely highlighted. + self.last_highlight_elements: Dict[str, List[Dict[str, Any]]] = {} def _uses_small_model(self) -> bool: """Whether the active conversation uses the small-model profile.""" @@ -132,6 +137,38 @@ def _uses_small_model(self) -> bool: return is_small_model(model_name) + def _is_routine_replay_mode(self) -> bool: + """Whether the active conversation is running in routine-replay mode.""" + if not self.conversation_id: + return False + + session = session_manager.get_session(str(self.conversation_id)) + if session is None: + return False + + return session.metadata.get("mode") == "routine_replay" + + def _auto_confirm_target_id(self, requested_element_id: str) -> str | None: + """Return the resolved element id if auto-confirm applies, else None. + + In routine-replay mode, when the most recent highlight call in this + conversation returned exactly one element whose id matches the one the + agent is now targeting, we can skip the two-phase confirmation round + trip: the routine SOP's precise keywords already disambiguated the + target, so a confirmation prompt adds latency without adding safety. 
+ """ + if not self._is_routine_replay_mode(): + return None + if not self.conversation_id or not requested_element_id: + return None + recent = self.last_highlight_elements.get(self.conversation_id) + if not recent or len(recent) != 1: + return None + only_id = recent[0].get("id") + if not only_id or only_id != requested_element_id: + return None + return only_id + def __call__( self, action: OpenBrowserAction, conversation ) -> OpenBrowserObservation: @@ -333,6 +370,8 @@ def _execute_highlight_action( # Extract elements and pagination info elements = result_dict.get("data", {}).get("elements", []) total_elements = result_dict.get("data", {}).get("totalElements", 0) + if self.conversation_id: + self.last_highlight_elements[self.conversation_id] = list(elements) element_label = self._format_highlight_element_label( element_type=element_type, count=len(elements) ) @@ -366,6 +405,22 @@ def _execute_element_interaction_action( if action_type == "click": if not action.element_id: raise ValueError("click requires element_id parameter") + auto_id = self._auto_confirm_target_id(action.element_id) + if auto_id: + command = ClickElementCommand( + element_id=auto_id, + conversation_id=self.conversation_id, + tab_id=action.tab_id, + ) + result_dict = self._execute_command_sync(command) + if not result_dict or not result_dict.get("success"): + ext_error = self._extract_result_error(result_dict) + raise RuntimeError(f"Failed to click element: {ext_error}") + return self._build_observation_from_result( + result_dict, + f"Auto-confirmed and clicked element: {auto_id}", + element_id=auto_id, + ) element_preview = self._get_element_full_html(action.element_id, "click") full_html = element_preview[0] screenshot = element_preview[1] @@ -572,6 +627,23 @@ def _execute_element_interaction_action( raise ValueError("keyboard_input requires element_id parameter") if not action.text: raise ValueError("keyboard_input requires text parameter") + auto_id = 
self._auto_confirm_target_id(action.element_id) + if auto_id: + command = KeyboardInputCommand( + element_id=auto_id, + text=action.text, + conversation_id=self.conversation_id, + tab_id=action.tab_id, + ) + result_dict = self._execute_command_sync(command) + if not result_dict or not result_dict.get("success"): + ext_error = self._extract_result_error(result_dict) + raise RuntimeError(f"Failed to input text: {ext_error}") + return self._build_observation_from_result( + result_dict, + f"Auto-confirmed and input text to element: {auto_id}", + element_id=auto_id, + ) element_preview = self._get_element_full_html( action.element_id, "keyboard_input" ) @@ -622,6 +694,25 @@ def _execute_element_interaction_action( raise ValueError("select requires element_id parameter") if action.value is None: raise ValueError("select requires value parameter") + auto_id = self._auto_confirm_target_id(action.element_id) + if auto_id: + command = SelectElementCommand( + element_id=auto_id, + value=action.value, + conversation_id=self.conversation_id, + tab_id=action.tab_id, + ) + result_dict = self._execute_command_sync(command) + if not result_dict or not result_dict.get("success"): + ext_error = self._extract_result_error(result_dict) + raise RuntimeError(f"Failed to select option: {ext_error}") + value_preview = self._format_select_value_preview(action.value) + return self._build_observation_from_result( + result_dict, + f"Auto-confirmed and selected option {value_preview} in element: " + f"{auto_id}", + element_id=auto_id, + ) element_preview = self._get_element_full_html(action.element_id, "select") full_html = element_preview[0] screenshot = element_preview[1] diff --git a/skill/claude/ob-routines/SKILL.md b/skill/claude/ob-routines/SKILL.md new file mode 100644 index 0000000..589bd0e --- /dev/null +++ b/skill/claude/ob-routines/SKILL.md @@ -0,0 +1,250 @@ +--- +name: ob-routines +description: Record, compile, and replay Browser Routines — saved, named browser workflows. 
(Alias for openbrowser-routines.) Supports subcommands: "list [query]" to list/search routines, "new" to record a new routine, "execute <name>" to replay a saved routine. Use when the user says "list routines", "record a routine", "replay X", "execute X", or "/ob-routines <subcommand>". +--- + +# Browser Routines + +Browser Routines are named, compiled workflows captured from real Chrome sessions. +The pipeline has four stages: **record → compile → name → replay**. + +## Subcommand dispatch + +When invoked with arguments, act immediately — do not ask the user what they want: + +| Invocation | Action | +|---|---| +| `/ob-routines` | Show available routines and ask what to do | +| `/ob-routines list [query]` | Run `list_routines.py [query]` and display results | +| `/ob-routines new` | Ask **only** for the one-line goal/intention, then start recording immediately (see "Before recording" below) | +| `/ob-routines execute <name>` | Run `replay.py <name>` immediately | + +--- + +## Your role during compilation + +You are a **bridge and quality gate**, not the compiler. The Compiler Agent does +the reasoning; you ensure it did its job correctly before finalizing. + +### Bridge duties +1. Run `compile.py` in a tmux pane (mandatory — see below). +2. Watch for `[compiler:question]` — relay it to the user, send their answer back. +3. Watch for `[compiler:stalled]` — show the agent's message, optionally prompt a follow-up. +4. At `[compiler:name_prompt]` — help the user pick a short slug. + +### Quality gate (run before every finalize) + +After the compiler reports `status=review`, read the compiled routine markdown +and check **both** of the following before calling `/compile/finalize`: + +#### Gate 1 — Intent clarity +Did the compiler understand *why* the user performed each action, not just *what* +they clicked?
Red flags: +- Steps that say "click X" with no explanation of goal or condition +- A position-based selection from a sorted/filtered list without asking whether + to replay by position or by identity (e.g. "upvote the top 3 posts" — top 3 + today vs. the same 3 posts always?) +- A value (date, search query, ticker, ID) that will obviously change between + runs, not parameterized + +If any red flag is present and the compiler did NOT ask about it: relay the +ambiguity to the user yourself, get their answer, then send it via +`POST /recordings/{id}/compile/answer` so the compiler can revise. + +#### Gate 2 — Delivery goal for read-only workflows + +A workflow is **read-only** if it has no form submission, no purchase, no +send/post/create/delete action — the user only navigated, read, filtered, or +inspected. For read-only workflows, ask: does the compiled routine end with a +delivery step (a `file_editor` write, a `terminal` command, or an explicit +instruction to report results in chat)? + +**If the routine is read-only AND has no delivery step, the compiler made an +error.** Do not finalize. Instead: + +1. Tell the user: "This routine reads data but doesn't capture results anywhere. + How do you want results delivered on replay?" + - (a) Summary shown in chat (brief / structured table / full details?) + - (b) Written to a local file (path + format: plain text, Markdown, CSV, JSON?) + - (c) Both +2. Get their answer. +3. Send it to the compiler via `POST /recordings/{id}/compile/answer` — the + compiler will revise the routine to include the delivery step. +4. Wait for the next `status=review`, then re-run both gates. + +> **Why this matters:** A routine that just clicks through pages is useless on +> replay — OpenBrowser will navigate and stop with no output. The delivery step +> is what makes the routine meaningful. + +--- + +## Preconditions + +**First time?** Complete the full setup in `skill/claude/open-browser/references/setup.md` +before using this skill. 
That guide covers: loading the Chrome extension, connecting +it to the server, and obtaining a valid `OPENBROWSER_CHROME_UUID`. Without that, +recording and replay will fail immediately. + +For subsequent uses, confirm: +- OpenBrowser server at `http://127.0.0.1:8765` +- Chrome extension connected +- `OPENBROWSER_CHROME_UUID` set (or passed via `--chrome-uuid`) + +Quick check: +```bash +python3 skill/claude/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" +``` + +Start the server if needed: +```bash +cd /Users/yangxiao/git/OpenBrowser && uv run local-chrome-server serve +``` + +Scripts path: `skill/claude/ob-routines/scripts/` (run from repo root). + +--- + +## List & search routines + +```bash +python3 skill/claude/ob-routines/scripts/list_routines.py +python3 skill/claude/ob-routines/scripts/list_routines.py "login" +python3 skill/claude/ob-routines/scripts/list_routines.py --recordings +``` + +--- + +## Record a routine + +### Before recording — DO NOT interrogate the user + +The whole point of record → compile is that the browser actions are **observed**, +and the Compiler Agent asks clarifying questions *after* it has seen them. + +Ask the user **only** for a short goal/intention (one line). Do **NOT** ask: +- which site or URL to start from +- which tool/screener to use +- how to define filter terms ("what's high-value?", "what's significant?") +- which parameters should vary between runs + +All of that is the compiler's job during Gate 1. Pre-record interrogation +defeats the pipeline and wastes the user's time. If the user's goal is vague +("find good stocks"), that's fine — start recording. The compiler will ask. + +### Step 1 — start recording +```bash +python3 skill/claude/ob-routines/scripts/start_recording.py \ + --chrome-uuid "$OPENBROWSER_CHROME_UUID" \ + --name "xiaohongshu-messages" \ + --intent "check messages on Xiaohongshu" +``` + +Prints `[recording:started] `. 
**Save this ID.** + +Tell the user: **"Perform your actions in the browser window, then come back and say done."** +Do NOT proceed until the user confirms. + +### Step 2 — stop recording +```bash +python3 skill/claude/ob-routines/scripts/stop_recording.py <recording_id> +``` + +--- + +## Compile to a routine — MANDATORY: tmux interactive session + +**compile.py uses `input()` for Q&A, the gate check, and the name prompt. It MUST run in an +interactive shell. Never invoke it directly via the Bash tool — it will block +and then be killed, losing the compiler session.** + +### Launch in tmux +```bash +tmux new-window -n "compile" \ + "cd /Users/yangxiao/git/OpenBrowser && python3 skill/claude/ob-routines/scripts/compile.py <recording_id> ; echo '[compile-done]'" +``` + +### Monitor output +```bash +tmux capture-pane -t "compile" -p +``` + +### Send an answer +```bash +tmux send-keys -t "compile" "the answer" Enter +``` + +### Markers to watch for + +| Marker | Your action | +|---|---| +| `[compiler:thought]` / `[compiler:action]` | Relay as progress to user | +| `[compiler:question] <text>` | Relay to user, wait for answer, send via `tmux send-keys` | +| `[compiler:stalled] <message>` | Show message, ask user for follow-up | +| `[compiler:complete] goal=… steps=N` | Compilation reached review state | +| `[compiler:routine_draft]` | Full routine markdown printed for inspection | +| `[compiler:gate_check]` | **Run both quality gates here.** Send feedback or press Enter | +| `[compiler:name_prompt]` | Gates passed — help user pick slug | +| `[compiler:saved]` | Done — report name and id | + +### Quality gate checkpoint +When `[compiler:gate_check]` appears in the pane, compile.py is explicitly +paused waiting for your review of `[compiler:routine_draft]`.
Run Gate 1 and Gate 2: + +- **Gates pass** → send an empty Enter: `tmux send-keys -t "compile" "" Enter` +- **Gate fails** → send corrective feedback: + `tmux send-keys -t "compile" "Please add a delivery step: summarise results in chat as a structured list of tickers with metrics." Enter` + +compile.py forwards non-empty input back to the compiler, streams the revision, +and loops back to another `[compiler:gate_check]`. Only an empty Enter advances +to `[compiler:name_prompt]`. + +**Never send gate feedback at the `[compiler:name_prompt]` stage** — that input +goes directly to the routine name field, not the compiler. + +--- + +## Replay a routine + +```bash +python3 skill/claude/ob-routines/scripts/replay.py "routine-name" \ + --chrome-uuid "$OPENBROWSER_CHROME_UUID" + +# List without replaying +python3 skill/claude/ob-routines/scripts/replay.py --list +``` + +Name matching: exact → ID → prefix → substring. + +--- + +## Full example workflow + +``` +1. /ob-routines new → ask user what to record +2. start_recording → [recording:started] abc123 +3. (user records in browser, says "done") +4. stop_recording abc123 → [recording:events] 21 events +5. tmux new-window "compile.py abc123" +6. monitor pane → relay questions → send answers +7. [compiler:complete] → run Gate 1 + Gate 2 + Gate 2 fails: routine is read-only, no delivery step + → ask user: chat summary, file, or both? + → send answer via tmux send-keys + → wait for next [compiler:complete] +8. Gates pass → [compiler:name_prompt] → user picks slug +9. [compiler:saved] name='…' id=… +10.
/ob-routines execute <name> → streams [action] … [complete] +``` + +--- + +## Failure handling + +- **Server unreachable**: `uv run local-chrome-server serve` +- **Browser UUID invalid**: reconnect Chrome extension, get fresh UUID +- **0 events captured**: browser disconnected; re-record +- **tmux not found**: `brew install tmux` +- **tmux window conflict**: check `tmux list-windows`, use a unique `-n` name +- **Compiler session expired** (pane exited before finalize): call + `POST /recordings/{id}/compile` again to restart — session is fresh +- **Relay stuck**: `[observation:error]` lines in SSE stream; relay to user diff --git a/skill/claude/ob-routines/scripts/compile.py b/skill/claude/ob-routines/scripts/compile.py new file mode 100644 index 0000000..5f56b6e --- /dev/null +++ b/skill/claude/ob-routines/scripts/compile.py @@ -0,0 +1,397 @@ +#!/usr/bin/env python3 +"""Compile a stopped recording into a named Browser Routine. + +Starts the Compiler Agent, streams its SSE output, and acts as a bridge +between the agent and the user: + - Agent reasoning and tool calls are printed to stdout as they arrive. + - When the compiler agent asks a clarification question (status=asking), + this script prints the question and reads the user's answer from stdin, + then resumes compilation via /compile/answer. + - When the agent stalls (status=stalled), the agent's last message is + shown and the user can send a follow-up. + - When compilation completes (status=review), the script shows the draft and waits + for gate feedback; on an empty Enter it asks for a name, then calls /compile/finalize. + +The outer agent (Claude Code / Codex) should relay the printed questions to +the user and feed their responses back via stdin — it should NOT try to +re-implement compiler logic.
+ +Example: + python3 compile.py abc123-recording-id + python3 compile.py abc123-recording-id --model-alias fast +""" + +from __future__ import annotations + +import argparse +import json +import sys +from urllib.error import HTTPError, URLError +from urllib.request import Request, urlopen + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 15, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +# --------------------------------------------------------------------------- +# SSE event formatting (same conventions as send_task.py) +# --------------------------------------------------------------------------- + + +def _format_compiler_event(event_type: str, data: dict) -> None: + """Print one SSE event from the compiler agent stream.""" + if event_type == "error": + print(f"[compiler:error] {data.get('error', data)}", flush=True) + return + + if event_type != "agent_event": + # Pass-through for unknown top-level event types + print(f"[{event_type}] {json.dumps(data, ensure_ascii=False)}", flush=True) + return + + data_type = data.get("type", "unknown") + + if data_type == "SystemPromptEvent": + text_len = len(data.get("text", "")) + print( + f"[compiler:system_prompt] suppressed ({text_len} chars)", + flush=True, + ) + return + + if data_type == "ThoughtEvent": + thought = data.get("thought", data.get("content", "")) + print(f"[compiler:thought] {thought}", flush=True) + return + + if data_type == "ActionEvent": + action = data.get("action", {}) + if isinstance(action, dict): + 
action_name = action.get("action", "unknown") + if action_name == "ask_user": + question = action.get("question", "") + print(f"[compiler:ask_user] {question}", flush=True) + else: + # FileEditorTool, TraceViewerTool, SubmitWorkflowTool, etc. + extras = { + k: v for k, v in action.items() if k != "action" and v is not None + } + suffix = ( + (" " + json.dumps(extras, ensure_ascii=False)) if extras else "" + ) + print(f"[compiler:action] {action_name}{suffix}", flush=True) + else: + print(f"[compiler:action] {action}", flush=True) + return + + if data_type == "ObservationEvent": + success = data.get("success", False) + message = data.get("message", "") + state = "ok" if success else "error" + print(f"[compiler:observation:{state}] {message}", flush=True) + return + + if data_type == "MessageEvent": + role = data.get("role", "unknown") + text = data.get("text", "") + print(f"[compiler:message:{role}] {text}", flush=True) + return + + if data_type == "ErrorEvent": + print(f"[compiler:error] {data.get('error', 'unknown error')}", flush=True) + return + + print( + f"[compiler:agent_event:{data_type}] {json.dumps(data, ensure_ascii=False)}", + flush=True, + ) + + +# --------------------------------------------------------------------------- +# SSE streaming +# --------------------------------------------------------------------------- + + +def _stream_sse(url: str, body: dict) -> dict | None: + """POST to url with body, stream SSE events, return the final complete result. + + Returns the ``result`` dict from the complete event, or None on error. 
+ """ + req = Request( + url, + data=json.dumps(body).encode("utf-8"), + headers={ + "Content-Type": "application/json", + "Accept": "text/event-stream", + }, + method="POST", + ) + + complete_result: dict | None = None + sse_event: str | None = None + sse_data: str | None = None + + try: + with urlopen(req, timeout=None) as response: + for raw_line in response: + line = raw_line.decode("utf-8").rstrip("\n") + if not line: + if sse_event and sse_data is not None: + try: + parsed = json.loads(sse_data) + except json.JSONDecodeError: + parsed = {"raw": sse_data} + + if sse_event == "complete": + complete_result = parsed.get("result", parsed) + else: + _format_compiler_event(sse_event, parsed) + + sse_event = None + sse_data = None + continue + + if line.startswith("event:"): + sse_event = line[6:].strip() + elif line.startswith("data:"): + sse_data = line[5:].lstrip() + + except HTTPError as exc: + body_text = exc.read().decode("utf-8", errors="replace") + print( + f"[compiler:http_error] {exc.code} {exc.reason}: {body_text}", + file=sys.stderr, + ) + return None + + return complete_result + + +# --------------------------------------------------------------------------- +# Compile loop +# --------------------------------------------------------------------------- + + +def compile_recording(base_url: str, recording_id: str, model_alias: str | None) -> int: + """Run the compile → Q&A → finalize flow. 
Returns exit code.""" + print(f"[compiler:start] recording={recording_id}", flush=True) + + # ── Phase 1: initial compile ────────────────────────────────────────── + compile_body: dict = {} + if model_alias: + compile_body["model_alias"] = model_alias + + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile", + body=compile_body, + ) + if result is None: + return 1 + + # ── Phase 2: Q&A loop ───────────────────────────────────────────────── + while True: + status = result.get("status") + + if status == "asking": + question = result.get("question", "") + print(f"\n[compiler:question] {question}", flush=True) + print( + "[compiler:waiting_for_answer] Type your answer and press Enter:", + flush=True, + ) + try: + answer = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Compilation cancelled.", flush=True) + return 130 + + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile/answer", + body={"answer": answer}, + ) + if result is None: + return 1 + + elif status == "stalled": + # Agent replied in prose instead of calling ask_user. + # Show the message and let the user send a follow-up. + message = result.get("message", "") + if message: + print(f"\n[compiler:stalled] {message}", flush=True) + print( + "[compiler:waiting_for_follow_up] Agent stalled — send a follow-up " + "(or press Enter to continue without one):", + flush=True, + ) + try: + follow_up = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Compilation cancelled.", flush=True) + return 130 + + if not follow_up: + follow_up = "Please continue." + + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile/answer", + body={"answer": follow_up}, + ) + if result is None: + return 1 + + elif status == "review": + # Compilation done — show the draft and pause for quality gate + # before proceeding to the name prompt. 
The outer agent (Claude + # Code / Codex) reads the routine here and may send corrective + # feedback (e.g. missing delivery step) via the gate prompt. + # Only an empty Enter moves forward to naming. + goal = result.get("goal", "") + step_count = result.get("step_count", "?") + routine_markdown = result.get("routine_markdown", "") + print( + f"\n[compiler:complete] goal={goal!r} steps={step_count}", flush=True + ) + if routine_markdown: + print(f"[compiler:routine_draft]\n{routine_markdown}", flush=True) + print( + "\n[compiler:gate_check] Review the routine above.\n" + "Press Enter to proceed to naming, or type feedback to send back to the compiler:", + flush=True, + ) + try: + gate_input = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Compilation cancelled.", flush=True) + return 130 + + if gate_input: + # Outer agent has feedback — send it back to the compiler + result = _stream_sse( + f"{base_url}/recordings/{recording_id}/compile/answer", + body={"answer": gate_input}, + ) + if result is None: + return 1 + # Loop back to handle the next status + continue + + # Gate passed — proceed to naming + break + + else: + print( + f"[compiler:unexpected_status] {status} — result: {result}", + file=sys.stderr, + ) + return 1 + + # ── Phase 3: name the routine and finalize ──────────────────────────── + goal = result.get("goal", "") + step_count = result.get("step_count", "?") + + # Suggest a slug derived from the goal + suggested = _slugify(goal) if goal else "my-routine" + print( + f"\n[compiler:name_prompt] Suggested name: {suggested!r}\n" + f"Accept (press Enter) or type a new name:", + flush=True, + ) + try: + chosen_name = input().strip() + except (EOFError, KeyboardInterrupt): + print("\n[compiler:interrupted] Finalization cancelled.", flush=True) + return 130 + + if not chosen_name: + chosen_name = suggested + + # ── Phase 4: finalize ───────────────────────────────────────────────── + try: + finalize_result = 
request_json( + f"{base_url}/recordings/{recording_id}/compile/finalize", + method="POST", + body={"name": chosen_name}, + ) + except HTTPError as exc: + body_text = exc.read().decode("utf-8", errors="replace") + print(f"[compiler:finalize_error] {exc.code}: {body_text}", file=sys.stderr) + return 1 + except Exception as exc: + print(f"[compiler:finalize_error] {exc}", file=sys.stderr) + return 1 + + routine = finalize_result.get("routine", {}) + routine_id = routine.get("routine_id", "?") + name = routine.get("name", chosen_name) + steps = routine.get("step_count", "?") + + print(f"[compiler:saved] name={name!r} id={routine_id} steps={steps}", flush=True) + print( + f"\nRoutine saved. To replay it, run:\n\n" f" python3 replay.py {name!r}\n", + flush=True, + ) + return 0 + + +def _slugify(text: str) -> str: + """Turn a goal string into a short, lowercase, hyphenated slug.""" + import re + + # Lowercase, keep only alnum and spaces, collapse and replace with hyphens + slug = re.sub(r"[^\w\s]", "", text.lower()) + slug = re.sub(r"\s+", "-", slug.strip()) + # Truncate to 40 chars, trim trailing hyphens + slug = slug[:40].rstrip("-") + return slug or "routine" + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Compile a stopped recording into a named Browser Routine", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("recording_id", help="Recording ID from stop_recording.py") + parser.add_argument( + "--model-alias", + help="LLM model alias to use for compilation (uses server default if omitted)", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + try: + return compile_recording(args.url, args.recording_id, args.model_alias) + except 
URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except KeyboardInterrupt: + print("Interrupted.", file=sys.stderr) + return 130 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/list_routines.py b/skill/claude/ob-routines/scripts/list_routines.py new file mode 100644 index 0000000..a1ab1e7 --- /dev/null +++ b/skill/claude/ob-routines/scripts/list_routines.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""List saved routines and/or stopped recordings. + +Routines are named, compiled browser workflows ready to replay. +Recordings are raw captured traces that may not yet be compiled. + +Examples: + python3 list_routines.py # list all routines + python3 list_routines.py login # filter by name/goal substring + python3 list_routines.py --recordings # list stopped recordings instead + python3 list_routines.py --recordings login # filter recordings by name +""" + +from __future__ import annotations + +import argparse +import json +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + + +def request_json(url: str, *, timeout: int = 10) -> dict: + req = Request(url, headers={"Accept": "application/json"}) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +def list_routines(base_url: str, query: str | None) -> int: + try: + data = request_json(f"{base_url}/routines") + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + + items = data.get("routines", []) + if query: + q = query.lower() + items = [ + r for r in items if q in r["name"].lower() or q in r.get("goal", "").lower() + ] + + if not items: + suffix = f" matching {query!r}" if query else "" + print(f"No routines found{suffix}.") + return 0 + + print(f"{'NAME':<30} {'STEPS':>5} {'GOAL'}") + print("-" * 72) + for r in items: + name = r["name"] + steps = r.get("step_count", "?") + goal = 
r.get("goal", "") + routine_id = r["routine_id"] + print(f"{name:<30} {steps:>5} {goal}") + print(f" id={routine_id}") + return 0 + + +def list_recordings(base_url: str, query: str | None) -> int: + try: + data = request_json(f"{base_url}/recordings?status=stopped") + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + + items = data.get("recordings", []) + if query: + q = query.lower() + items = [r for r in items if q in (r.get("name") or "").lower()] + + if not items: + suffix = f" matching {query!r}" if query else "" + print(f"No stopped recordings found{suffix}.") + return 0 + + print(f"{'NAME':<30} {'EVENTS':>6} {'RECORDING ID'}") + print("-" * 72) + for r in items: + name = r.get("name") or "(unnamed)" + events = r.get("event_count", "?") + recording_id = r["recording_id"] + compiled = "(compiled)" if (r.get("metadata") or {}).get("routine_id") else "" + print(f"{name:<30} {events:>6} {recording_id} {compiled}") + return 0 + + +def main() -> int: + parser = argparse.ArgumentParser( + description="List saved routines or stopped recordings", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "query", + nargs="?", + help="Filter by name or goal substring (case-insensitive)", + ) + parser.add_argument( + "--recordings", + action="store_true", + help="List stopped recordings instead of compiled routines", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + if args.recordings: + return list_recordings(args.url, args.query) + return list_routines(args.url, args.query) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/replay.py b/skill/claude/ob-routines/scripts/replay.py new file mode 100644 index 0000000..8b61d7b --- /dev/null +++ b/skill/claude/ob-routines/scripts/replay.py @@ -0,0 +1,348 @@ +#!/usr/bin/env python3 
+"""Execute a saved Browser Routine in Chrome. + +Looks up the routine by exact name, routine ID, name prefix, or substring (case-insensitive), +creates an agent conversation in routine_replay mode, sends the routine +markdown as the task, and streams execution output. + +Examples: + python3 replay.py "techforum-upvote" --chrome-uuid "$OPENBROWSER_CHROME_UUID" + python3 replay.py login # prefix match + python3 replay.py --list # list all available routines +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + +# --------------------------------------------------------------------------- +# HTTP helpers +# --------------------------------------------------------------------------- + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 10, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +# --------------------------------------------------------------------------- +# Routine lookup +# --------------------------------------------------------------------------- + + +def find_routine(base_url: str, query: str) -> dict | None: + """Return a single routine matching query by exact name, then routine_id, then name prefix, then name/goal substring.""" + data = request_json(f"{base_url}/routines") + routines = data.get("routines", []) + if not routines: + return None + + q = query.lower() + + # 1. Exact name match + for r in routines: + if r["name"].lower() == q: + return r + + # 2. Exact routine_id match + for r in routines: + if r["routine_id"].lower() == q: + return r + + # 3.
Prefix match on name + prefix = [r for r in routines if r["name"].lower().startswith(q)] + if len(prefix) == 1: + return prefix[0] + if len(prefix) > 1: + print("[replay:ambiguous] Multiple routines match that prefix:", flush=True) + for r in prefix: + print(f" {r['name']} (id={r['routine_id']})", flush=True) + print("Provide a more specific name or the full routine_id.", flush=True) + return None + + # 4. Substring match on name or goal + sub = [ + r for r in routines if q in r["name"].lower() or q in r.get("goal", "").lower() + ] + if len(sub) == 1: + return sub[0] + if len(sub) > 1: + print("[replay:ambiguous] Multiple routines match that substring:", flush=True) + for r in sub: + print(f" {r['name']} (id={r['routine_id']})", flush=True) + print("Provide a more specific name or the full routine_id.", flush=True) + return None + + return None + + +# --------------------------------------------------------------------------- +# SSE streaming (same conventions as send_task.py) +# --------------------------------------------------------------------------- + + +def _format_event(event_type: str, data: dict) -> None: + if event_type == "complete": + print(f"[complete] {data.get('message', '')}", flush=True) + return + + if event_type == "usage_metrics": + metrics = data.get("metrics", {}) + model_name = metrics.get("model_name", "unknown") + cost = metrics.get("accumulated_cost", 0) + token_usage = metrics.get("accumulated_token_usage", {}) + total_tokens = token_usage.get("total_tokens", 0) + if total_tokens == 0: + total_tokens = ( + token_usage.get("prompt_tokens", 0) + + token_usage.get("completion_tokens", 0) + + token_usage.get("reasoning_tokens", 0) + ) + print( + f"[usage] model={model_name} cost_rmb={cost:.6f} tokens={total_tokens}", + flush=True, + ) + return + + if event_type != "agent_event": + print(f"[{event_type}] {json.dumps(data, ensure_ascii=False)}", flush=True) + return + + data_type = data.get("type", "unknown") + + if data_type == 
"SystemPromptEvent": + text_len = len(data.get("text", "")) + print( + f"[system_prompt] suppressed ({text_len} chars)", + flush=True, + ) + return + + if data_type == "MessageEvent": + role = data.get("role", "unknown") + text = data.get("text", "") + print(f"[message:{role}] {text}", flush=True) + return + + if data_type == "ThoughtEvent": + thought = data.get("thought", data.get("content", "")) + print(f"[thought] {thought}", flush=True) + return + + if data_type == "ActionEvent": + action = data.get("action", {}) + if isinstance(action, dict): + action_name = action.get("action", "unknown") + element_id = action.get("element_id") + url = action.get("url") + text = action.get("text") + extras = [] + if element_id: + extras.append(f"element_id={element_id}") + if url: + extras.append(f"url={url}") + if text: + extras.append(f"text={text!r}") + suffix = (" " + " ".join(extras)) if extras else "" + print(f"[action] {action_name}{suffix}", flush=True) + else: + print(f"[action] {action}", flush=True) + return + + if data_type == "ObservationEvent": + success = data.get("success", False) + message = data.get("message", "") + state = "ok" if success else "error" + print(f"[observation:{state}] {message}", flush=True) + return + + if data_type == "ErrorEvent": + print(f"[error] {data.get('error', 'unknown error')}", flush=True) + return + + print( + f"[agent_event:{data_type}] {json.dumps(data, ensure_ascii=False)}", + flush=True, + ) + + +def stream_replay( + base_url: str, + conversation_id: str, + task: str, + cwd: str, + chrome_uuid: str, +) -> None: + req = Request( + f"{base_url}/agent/conversations/{conversation_id}/messages", + data=json.dumps( + { + "text": task, + "cwd": cwd, + "browser_id": chrome_uuid, + } + ).encode("utf-8"), + headers={ + "Content-Type": "application/json", + "Accept": "text/event-stream", + }, + method="POST", + ) + + with urlopen(req, timeout=None) as response: + sse_event: str | None = None + sse_data: str | None = None + for raw_line 
in response: + line = raw_line.decode("utf-8").rstrip("\n") + if not line: + if sse_event and sse_data is not None: + try: + _format_event(sse_event, json.loads(sse_data)) + except json.JSONDecodeError: + print(f"[{sse_event}] {sse_data}", flush=True) + sse_event = None + sse_data = None + continue + + if line.startswith("event:"): + sse_event = line[6:].strip() + elif line.startswith("data:"): + sse_data = line[5:].lstrip() + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Replay a saved Browser Routine in Chrome", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "routine", + nargs="?", + help="Routine name, ID, or prefix to replay", + ) + parser.add_argument( + "--chrome-uuid", + default=os.environ.get("OPENBROWSER_CHROME_UUID"), + help="Browser UUID capability token (or set OPENBROWSER_CHROME_UUID)", + ) + parser.add_argument( + "--cwd", + default=".", + help="Working directory passed to the agent", + ) + parser.add_argument( + "--list", + action="store_true", + help="List available routines and exit", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + try: + if args.list or not args.routine: + data = request_json(f"{args.url}/routines") + routines = data.get("routines", []) + if not routines: + print("No routines saved yet.") + return 0 + print(f"{'NAME':<30} {'STEPS':>5} GOAL") + print("-" * 72) + for r in routines: + print( + f"{r['name']:<30} {r.get('step_count', '?'):>5} {r.get('goal', '')}" + ) + return 0 + + if not args.chrome_uuid: + print( + "Browser UUID is required. 
Set OPENBROWSER_CHROME_UUID or pass --chrome-uuid.", + file=sys.stderr, + ) + return 2 + + # ── Find the routine ────────────────────────────────────────────── + routine = find_routine(args.url, args.routine) + if routine is None: + print( + f"[replay:not_found] No routine found matching {args.routine!r}. " + "Run with --list to see available routines.", + file=sys.stderr, + ) + return 1 + + name = routine["name"] + routine_id = routine["routine_id"] + goal = routine.get("goal", "") + routine_markdown = routine.get("routine_markdown", "") + + print(f"[replay:routine] {name} id={routine_id}", flush=True) + if goal: + print(f"[replay:goal] {goal}", flush=True) + + # ── Validate browser UUID ───────────────────────────────────────── + browser_status = request_json(f"{args.url}/browsers/{args.chrome_uuid}/valid") + if not browser_status.get("valid", False): + msg = browser_status.get("message", "browser UUID is not valid") + print(f"Browser UUID validation failed: {msg}", file=sys.stderr) + return 1 + + # ── Create conversation in routine_replay mode ──────────────────── + conv_result = request_json( + f"{args.url}/agent/conversations", + method="POST", + body={ + "cwd": args.cwd, + "browser_id": args.chrome_uuid, + "mode": "routine_replay", + }, + ) + conversation_id = conv_result["conversation_id"] + print(f"[replay:conversation] {conversation_id}", flush=True) + + # ── Send routine markdown as the task ──────────────────────────── + stream_replay( + args.url, + conversation_id, + routine_markdown, + args.cwd, + args.chrome_uuid, + ) + return 0 + + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except KeyboardInterrupt: + print("Interrupted.", file=sys.stderr) + return 130 + except Exception as exc: + print(f"Replay failed: {exc}", file=sys.stderr) + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/start_recording.py 
b/skill/claude/ob-routines/scripts/start_recording.py new file mode 100644 index 0000000..a34fad2 --- /dev/null +++ b/skill/claude/ob-routines/scripts/start_recording.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +"""Start a new browser recording session. + +The server sends a command to the Chrome extension which opens a dedicated +recording window. After this script exits, the user performs their actions +in that browser window. When done, they return to the terminal and run +stop_recording.py with the printed recording_id. + +Example: + python3 start_recording.py \\ + --chrome-uuid "$OPENBROWSER_CHROME_UUID" \\ + --name "Gmail compose flow" \\ + --intent "draft a new email to a contact and send it" +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 10, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Start a new recording session in Chrome", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument( + "--chrome-uuid", + default=os.environ.get("OPENBROWSER_CHROME_UUID"), + help="Browser UUID capability token (or set OPENBROWSER_CHROME_UUID)", + ) + parser.add_argument( + "--name", + help="Human-readable name for this recording session", + ) + parser.add_argument( + "--intent", + help="Short description of what you intend to record (guides compilation later)", + ) + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server 
URL", + ) + args = parser.parse_args() + + if not args.chrome_uuid: + print( + "Browser UUID is required. Set OPENBROWSER_CHROME_UUID or pass --chrome-uuid.", + file=sys.stderr, + ) + return 2 + + try: + # Validate browser connectivity first + browser_status = request_json(f"{args.url}/browsers/{args.chrome_uuid}/valid") + if not browser_status.get("valid", False): + msg = browser_status.get("message", "browser UUID is not valid") + print(f"Browser UUID validation failed: {msg}", file=sys.stderr) + return 1 + + # Create and start recording + payload: dict = {"browser_id": args.chrome_uuid} + if args.name: + payload["name"] = args.name + + result = request_json(f"{args.url}/recordings", method="POST", body=payload) + if not result.get("success"): + print(f"Failed to create recording: {result}", file=sys.stderr) + return 1 + + recording = result["recording"] + recording_id = recording["recording_id"] + + # Save intent note if provided + if args.intent: + request_json( + f"{args.url}/recordings/{recording_id}/intent-note", + method="POST", + body={"intent_note": args.intent}, + ) + + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except Exception as exc: + print(f"Failed to start recording: {exc}", file=sys.stderr) + return 1 + + name_display = f" ({args.name})" if args.name else "" + print(f"[recording:started] {recording_id}{name_display}", flush=True) + if args.intent: + print(f"[recording:intent] {args.intent}", flush=True) + print( + "\nA recording window has opened in Chrome.\n" + "Perform your actions in the browser, then return here and run:\n\n" + f" python3 stop_recording.py {recording_id}\n", + flush=True, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/ob-routines/scripts/stop_recording.py b/skill/claude/ob-routines/scripts/stop_recording.py new file mode 100644 index 0000000..6d91656 --- /dev/null +++ b/skill/claude/ob-routines/scripts/stop_recording.py 
@@ -0,0 +1,90 @@ +#!/usr/bin/env python3 +"""Stop an active recording session. + +Sends a stop command to the Chrome extension, which closes the recording +window and flushes the event buffer. Prints the final event count so the +agent knows how much was captured before kicking off compilation. + +Example: + python3 stop_recording.py abc123-recording-id +""" + +from __future__ import annotations + +import argparse +import json +import sys +from urllib.error import URLError +from urllib.request import Request, urlopen + + +def request_json( + url: str, + *, + method: str = "GET", + body: dict | None = None, + timeout: int = 15, +) -> dict: + headers = {"Content-Type": "application/json", "Accept": "application/json"} + data = None if body is None else json.dumps(body).encode("utf-8") + req = Request(url, data=data, headers=headers, method=method) + with urlopen(req, timeout=timeout) as r: + return json.loads(r.read().decode("utf-8")) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Stop an active recording session", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + parser.add_argument("recording_id", help="Recording ID from start_recording.py") + parser.add_argument( + "--url", + default="http://127.0.0.1:8765", + help="OpenBrowser server URL", + ) + args = parser.parse_args() + + try: + result = request_json( + f"{args.url}/recordings/{args.recording_id}/stop", + method="POST", + body={}, + ) + except URLError as exc: + print(f"Cannot reach OpenBrowser server: {exc}", file=sys.stderr) + return 1 + except Exception as exc: + print(f"Failed to stop recording: {exc}", file=sys.stderr) + return 1 + + if not result.get("success"): + print(f"Stop failed: {result}", file=sys.stderr) + return 1 + + recording = result.get("recording") or {} + event_count = recording.get("event_count", "?") + name = recording.get("name") or "" + stop_reason = result.get("stop_reason", "") + + display = f" ({name})" if name else "" + 
print(f"[recording:stopped] {args.recording_id}{display}", flush=True) + print(f"[recording:events] {event_count} events captured", flush=True) + if stop_reason == "browser_disconnected": + print( + "[recording:warning] Browser was disconnected — recording marked stopped " + "locally. Event capture may be incomplete.", + flush=True, + ) + + print( + f"\nRecording stopped. To compile this recording into a routine, run:\n\n" + f" python3 compile.py {args.recording_id}\n", + flush=True, + ) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/skill/claude/open-browser/SKILL.md b/skill/claude/open-browser/SKILL.md index 1130574..3b7478c 100644 --- a/skill/claude/open-browser/SKILL.md +++ b/skill/claude/open-browser/SKILL.md @@ -37,7 +37,7 @@ Before sending a browser task, confirm all of the following: Run this first: ```bash -python3 skill/claude/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" +python3 ~/.claude/skills/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" ``` If readiness fails, read [references/setup.md](references/setup.md) or @@ -72,7 +72,7 @@ Code, because the SSE stream becomes part of your conversation context without any extra plumbing: ```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Open https://example.com and report the page title" \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" ``` @@ -115,7 +115,7 @@ encoded, and sent as data URIs — no upload endpoint or static server is required. Limit: 10 MB per image, up to 8 images per message. ```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Open the local dashboard and tell me which section looks different from this screenshot." 
\ --image /tmp/reference.png \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" @@ -141,7 +141,7 @@ keeps its prior screenshots and observations), reuse the conversation ID from the previous run: ```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Now click the 'Sign in' button you just identified" \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" \ --conversation-id 1b32b26a-1a7e-4b6c-9599-139fc6b9c89b @@ -153,14 +153,16 @@ report a value it already saw. ## Working Directory -Run commands from the OpenBrowser repo root so the relative script -paths resolve cleanly. +The skill's scripts live at `~/.claude/skills/open-browser/` so they +work from any project's current working directory. The OpenBrowser +server itself must still be started from the repo root +(`uv run local-chrome-server serve` in `~/git/OpenBrowser`). Use `--cwd` when the browser task should operate with context from another workspace: ```bash -python3 skill/claude/open-browser/scripts/send_task.py \ +python3 ~/.claude/skills/open-browser/scripts/send_task.py \ "Open the local app and verify the login flow" \ --cwd /absolute/path/to/project \ --chrome-uuid "$OPENBROWSER_CHROME_UUID" diff --git a/skill/claude/open-browser/references/setup.md b/skill/claude/open-browser/references/setup.md index 5abbc1c..477596e 100644 --- a/skill/claude/open-browser/references/setup.md +++ b/skill/claude/open-browser/references/setup.md @@ -45,7 +45,7 @@ drive the browser that registered it. 
## Quick verification ```bash -python3 skill/claude/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" +python3 ~/.claude/skills/open-browser/scripts/check_status.py --chrome-uuid "$OPENBROWSER_CHROME_UUID" ``` Expected outcome: diff --git a/skill/claude/open-browser/scripts/check_status.py b/skill/claude/open-browser/scripts/check_status.py index c218162..8752bf7 100644 --- a/skill/claude/open-browser/scripts/check_status.py +++ b/skill/claude/open-browser/scripts/check_status.py @@ -136,7 +136,7 @@ def main() -> int: print("Ready for browser automation.") return 0 - print("Not ready. See skill/claude/open-browser/references/setup.md if needed.") + print("Not ready. See ~/.claude/skills/open-browser/references/setup.md if needed.") return 1 diff --git a/uv.lock b/uv.lock index 418acbe..36f3fc5 100644 --- a/uv.lock +++ b/uv.lock @@ -1678,8 +1678,8 @@ requires-dist = [ { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=2eb7db59461e9117b1e3e0519616b39f1497c0f9" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" }, { name = "numpy", specifier = ">=1.24.0" }, - { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63" }, - { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63" }, + { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4" }, + { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4" }, { name = "pillow", specifier = ">=10.0.0" }, { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" }, { name = "pydantic", specifier = ">=2.5.0" }, @@ -2224,7 +2224,7 @@ wheels = [ [[package]] name = 
"openhands-sdk" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63#764fb87256d7bc20b3eccf82c8a4d241e6740d63" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4#bd4cb296355c3d03dd411883e78527b1915fa8c4" } dependencies = [ { name = "agent-client-protocol" }, { name = "deprecation" }, @@ -2244,7 +2244,7 @@ dependencies = [ [[package]] name = "openhands-tools" version = "1.12.0" -source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=764fb87256d7bc20b3eccf82c8a4d241e6740d63#764fb87256d7bc20b3eccf82c8a4d241e6740d63" } +source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=bd4cb296355c3d03dd411883e78527b1915fa8c4#bd4cb296355c3d03dd411883e78527b1915fa8c4" } dependencies = [ { name = "bashlex" }, { name = "binaryornot" },