From a89e46ced9af05488848fc9937bbfda4c3693010 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Sat, 9 May 2026 13:39:08 +0800
Subject: [PATCH 01/12] fix(frontend): surface reasoning_content /
 thinking_blocks on event cards

When an assistant turn has no tool call and empty content but non-empty
reasoning (e.g. qwen-flash thinking-only responses), the timeline showed
a mystery empty "AGENT / Role: assistant" card with no clue why. The
SSE-payload whitelist in normalizeFrontendEvent was dropping the fields
even after the visualizer added them.

Carry reasoning_content and thinking_blocks through the visualizer for
both MessageEvent and ActionEvent, pass them through the normalizer,
and render a collapsed grey Reasoning/Thinking expander on the card.

Also bump agent-sdk pin to 3799d1cf so qwen3-coder-style XML tool calls
that arrive in reasoning_content get recovered into structured tool
calls instead of stalling the agent loop on empty messages.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 frontend/index.html        | 37 ++++++++++++++++++++++++++++++++++++-
 pyproject.toml             |  4 ++--
 server/agent/visualizer.py | 18 ++++++++++++++++++
 uv.lock                    |  8 ++++----
 4 files changed, 60 insertions(+), 7 deletions(-)
diff --git a/frontend/index.html b/frontend/index.html
index 0959910..45ea3f7 100644
--- a/frontend/index.html
+++ b/frontend/index.html
@@ -416,6 +416,29 @@
             opacity: 0.7;
         }
 
+        .event-reasoning {
+            color: #888888;
+            font-size: 11px;
+            font-family: monospace;
+            margin: 4px 0 6px 0;
+            border-left: 2px solid #444444;
+            padding-left: 8px;
+        }
+
+        .event-reasoning summary {
+            cursor: pointer;
+            color: #aaaaaa;
+            opacity: 0.85;
+            user-select: none;
+        }
+
+        .event-reasoning pre {
+            margin: 4px 0 0 0;
+            white-space: pre-wrap;
+            word-break: break-word;
+            color: #888888;
+        }
+
         .prompt-line {
             color: #cccccc;
         }
@@ -6017,7 +6040,9 @@ <h1>Sisyphus</h1>
                 tool_name: data.tool_name || fallback.tool_name || null,
                 tool_call_id: data.tool_call_id || fallback.tool_call_id || null,
                 help_request: data.help_request || fallback.help_request || null,
-                awaiting_user_help: Boolean(data.awaiting_user_help || fallback.awaiting_user_help)
+                awaiting_user_help: Boolean(data.awaiting_user_help || fallback.awaiting_user_help),
+                reasoning_content: data.reasoning_content || fallback.reasoning_content || null,
+                thinking_blocks: data.thinking_blocks || fallback.thinking_blocks || null
             };
         }
 
@@ -11177,6 +11202,16 @@ <h1>Sisyphus</h1>
                     metadataHtml += `<div class="event-meta"><small>Sender: ${escapeHtml(event.sender)}</small></div>`;
                 }
             }
+            // Reasoning / thinking — surfaced for both MessageEvent and ActionEvent
+            if (event.reasoning_content) {
+                metadataHtml += `<details class="event-reasoning"><summary>Reasoning</summary><pre>${escapeHtml(event.reasoning_content)}</pre></details>`;
+            }
+            if (event.thinking_blocks && event.thinking_blocks.length > 0) {
+                const tbText = event.thinking_blocks
+                    .map(tb => tb.thinking || tb.text || JSON.stringify(tb))
+                    .join('\n\n');
+                metadataHtml += `<details class="event-reasoning"><summary>Thinking</summary><pre>${escapeHtml(tbText)}</pre></details>`;
+            }
             
             // Create HTML
             eventLine.innerHTML = `
diff --git a/pyproject.toml b/pyproject.toml
index d467cd4..b9cb737 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -76,5 +76,5 @@ override-dependencies = [
 ]
 
 [tool.uv.sources]
-openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
-openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
+openhands-sdk = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-sdk", rev = "3799d1cf2af72f8ce21a4942ffab67ffe208b551" }
+openhands-tools = { git = "https://github.com/softpudding/agent-sdk.git", subdirectory = "openhands-tools", rev = "3799d1cf2af72f8ce21a4942ffab67ffe208b551" }
diff --git a/server/agent/visualizer.py b/server/agent/visualizer.py
index 545d9ef..b452032 100644
--- a/server/agent/visualizer.py
+++ b/server/agent/visualizer.py
@@ -106,6 +106,14 @@ def on_event(self, event: Event) -> None:
                     sse_data["action"] = str(event.action)
                 if event.summary:
                     sse_data["summary"] = str(event.summary)
+                rc = getattr(event, "reasoning_content", None)
+                if rc:
+                    sse_data["reasoning_content"] = rc
+                tbs = getattr(event, "thinking_blocks", None)
+                if tbs:
+                    sse_data["thinking_blocks"] = [
+                        tb.model_dump() for tb in tbs
+                    ]
                 if event.tool_name == PLEASE_HELP_ME_TOOL_NAME and event.action:
                     help_request = getattr(event.action, "message", None)
                     if isinstance(help_request, str) and help_request.strip():
@@ -144,6 +152,16 @@ def on_event(self, event: Event) -> None:
             elif isinstance(event, MessageEvent):
                 # MessageEvent has llm_message with role information
                 sse_data["role"] = event.llm_message.role
+                # Surface reasoning so it's visible in the frontend even when
+                # `content` is empty (e.g. qwen-flash thinking-only responses).
+                rc = getattr(event.llm_message, "reasoning_content", None)
+                if rc:
+                    sse_data["reasoning_content"] = rc
+                tbs = getattr(event.llm_message, "thinking_blocks", None)
+                if tbs:
+                    sse_data["thinking_blocks"] = [
+                        tb.model_dump() for tb in tbs
+                    ]
                 # Also include activated_skills if present
                 if event.activated_skills:
                     sse_data["activated_skills"] = event.activated_skills
diff --git a/uv.lock b/uv.lock
index ae027e7..dc71d98 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1678,8 +1678,8 @@ requires-dist = [
     { name = "litellm", git = "https://github.com/softpudding/litellm.git?rev=363075400d97a5252fd2eb60c4f8d44bb529057c" },
     { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.7.0" },
     { name = "numpy", specifier = ">=1.24.0" },
-    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" },
-    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" },
+    { name = "openhands-sdk", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=3799d1cf2af72f8ce21a4942ffab67ffe208b551" },
+    { name = "openhands-tools", git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=3799d1cf2af72f8ce21a4942ffab67ffe208b551" },
     { name = "pillow", specifier = ">=10.0.0" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = ">=4.0.0" },
     { name = "pydantic", specifier = ">=2.5.0" },
@@ -2224,7 +2224,7 @@ wheels = [
 [[package]]
 name = "openhands-sdk"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282#1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-sdk&rev=3799d1cf2af72f8ce21a4942ffab67ffe208b551#3799d1cf2af72f8ce21a4942ffab67ffe208b551" }
 dependencies = [
     { name = "agent-client-protocol" },
     { name = "deprecation" },
@@ -2244,7 +2244,7 @@ dependencies = [
 [[package]]
 name = "openhands-tools"
 version = "1.12.0"
-source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=1ac8fff47e78cc5cc65b5261859f3b2ec01ff282#1ac8fff47e78cc5cc65b5261859f3b2ec01ff282" }
+source = { git = "https://github.com/softpudding/agent-sdk.git?subdirectory=openhands-tools&rev=3799d1cf2af72f8ce21a4942ffab67ffe208b551#3799d1cf2af72f8ce21a4942ffab67ffe208b551" }
 dependencies = [
     { name = "bashlex" },
     { name = "binaryornot" },

From 2470ebf9aa01dc830ac719f720847c21bb5070cc Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Sat, 9 May 2026 15:58:09 +0800
Subject: [PATCH 02/12] refactor(pixel-confirm): drop the zoom-crop, render
 preview at original size
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The confirmation preview used to crop and rescale the viewport around
the click target. That gave the agent a screenshot whose coordinate
system did not match every other screenshot in the conversation, so a
"retarget" reply could carry coordinates picked in zoom-crop space and
land on the wrong pixel in the real viewport.

Return the full viewport screenshot instead, with the existing yellow
target box and orange candidate outlines drawn on the live DOM (which
the screenshot picks up naturally) plus a canvas-side fail-safe in
device-pixel space. The agent now confirms the marked element, or
emits a fresh coordinate from the same coordinate system it sees
everywhere else — extension-side detection no longer constrains the
retarget, since fresh-pixel estimates remain valid.

Update both small- and big-model mouse_tool.j2 prompts to describe the
preview in affirmative terms (no "zoomed crop" wording) and to invite
either a candidate-center retarget or a fresh-pixel estimate.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../src/commands/pixel-confirm-render.ts      | 165 ++----------------
 server/agent/prompts/big_model/mouse_tool.j2  |   6 +-
 .../agent/prompts/small_model/mouse_tool.j2   |   6 +-
 server/agent/tools/browser_executor.py        |   7 +-
 4 files changed, 27 insertions(+), 157 deletions(-)

diff --git a/extension/src/commands/pixel-confirm-render.ts b/extension/src/commands/pixel-confirm-render.ts
index 58cfd7d..0413c60 100644
--- a/extension/src/commands/pixel-confirm-render.ts
+++ b/extension/src/commands/pixel-confirm-render.ts
@@ -1,17 +1,19 @@
 /**
  * Pixel-Confirmation Render Module
  *
- * Produces a zoomed confirmation screenshot for a pending pixel mouse action
- * (click or drag). Two visual modes:
+ * Produces a confirmation screenshot for a pending pixel mouse action
+ * (click or drag) at the page's original viewport size, so the agent's
+ * coordinate system matches what it sees in every other screenshot. Two
+ * visual modes:
  *
- *   - 'pixel_hit'  → YELLOW box around the hit element + zoom-crop centered on it.
- *   - 'pixel_miss' → red crosshair at the click coord + thin grey outlines on
- *                    nearby candidate elements + zoom-crop centered on the click.
+ *   - 'pixel_hit'  → YELLOW box around the hit element.
+ *   - 'pixel_miss' → orange dashed outlines on nearby candidate elements;
+ *                    no crosshair (the candidates already tell the agent
+ *                    where to re-aim).
  *
- * Both modes capture a fresh viewport screenshot (no virtual cursor — we draw
- * our own crosshair / box so the cursor sprite would be redundant) and return
- * a base64 PNG data URL keyed under `screenshot_data_url` to match the shape
- * used by other 2PC previews.
+ * Both modes capture a fresh viewport screenshot (no virtual cursor) and
+ * return a base64 PNG data URL keyed under `screenshot_data_url` to match
+ * the shape used by other 2PC previews.
  */
 
 import { captureScreenshot, compressIfNeeded } from './screenshot';
@@ -33,12 +35,6 @@ const DRAG_LINE_COLOR = 'rgba(255, 212, 0, 0.85)';
 const DRAG_LINE_WIDTH = 3;
 const DRAG_ARROW_HEAD = 14;
 
-const BASE_CONTEXT_PADDING_X = 96;
-const BASE_CONTEXT_PADDING_Y = 112;
-const BASE_MIN_CROP_WIDTH = 520;
-const BASE_MIN_CROP_HEIGHT = 320;
-const MIN_CROP_RATIO = 0.58;
-
 interface BBox {
   x: number;
   y: number;
@@ -67,11 +63,6 @@ export interface PixelConfirmRenderResult {
   screenshot_data_url: string;
   viewport: { width: number; height: number };
   scale: number;
-  crop: BBox;
-}
-
-function clamp(value: number, min: number, max: number): number {
-  return Math.max(min, Math.min(max, value));
 }
 
 function expandBbox(b: BBox, padding: number): BBox {
@@ -83,115 +74,6 @@ function expandBbox(b: BBox, padding: number): BBox {
   };
 }
 
-function unionBbox(boxes: BBox[]): BBox {
-  if (boxes.length === 0) return { x: 0, y: 0, width: 0, height: 0 };
-  let x1 = Infinity;
-  let y1 = Infinity;
-  let x2 = -Infinity;
-  let y2 = -Infinity;
-  for (const b of boxes) {
-    x1 = Math.min(x1, b.x);
-    y1 = Math.min(y1, b.y);
-    x2 = Math.max(x2, b.x + b.width);
-    y2 = Math.max(y2, b.y + b.height);
-  }
-  return { x: x1, y: y1, width: x2 - x1, height: y2 - y1 };
-}
-
-function chooseCropCenter(request: PixelConfirmRenderRequest): {
-  center: PointXY;
-  focusBbox: BBox;
-} {
-  if (request.mode === 'pixel_hit' && request.target_bbox) {
-    const focus = request.drag_end
-      ? unionBbox([
-          request.target_bbox,
-          {
-            x: request.drag_end.x,
-            y: request.drag_end.y,
-            width: 1,
-            height: 1,
-          },
-        ])
-      : request.target_bbox;
-    return {
-      center: {
-        x: focus.x + focus.width / 2,
-        y: focus.y + focus.height / 2,
-      },
-      focusBbox: focus,
-    };
-  }
-  // pixel_miss or hit without bbox → center on the click point.
-  const focus: BBox = request.drag_end
-    ? unionBbox([
-        { x: request.x, y: request.y, width: 1, height: 1 },
-        {
-          x: request.drag_end.x,
-          y: request.drag_end.y,
-          width: 1,
-          height: 1,
-        },
-      ])
-    : { x: request.x - 1, y: request.y - 1, width: 2, height: 2 };
-  return {
-    center: {
-      x: focus.x + focus.width / 2,
-      y: focus.y + focus.height / 2,
-    },
-    focusBbox: focus,
-  };
-}
-
-function calculateCrop(
-  imageWidth: number,
-  imageHeight: number,
-  scale: number,
-  request: PixelConfirmRenderRequest,
-): BBox {
-  const { focusBbox } = chooseCropCenter(request);
-
-  const focusDevice = {
-    x: focusBbox.x * scale,
-    y: focusBbox.y * scale,
-    width: Math.max(1, focusBbox.width * scale),
-    height: Math.max(1, focusBbox.height * scale),
-  };
-
-  const contextX = BASE_CONTEXT_PADDING_X * scale;
-  const contextY = BASE_CONTEXT_PADDING_Y * scale;
-  const minCropW = Math.min(
-    imageWidth,
-    Math.max(BASE_MIN_CROP_WIDTH * scale, imageWidth * MIN_CROP_RATIO),
-  );
-  const minCropH = Math.min(
-    imageHeight,
-    Math.max(BASE_MIN_CROP_HEIGHT * scale, imageHeight * MIN_CROP_RATIO),
-  );
-
-  const desiredW = Math.max(minCropW, focusDevice.width + contextX * 2);
-  const desiredH = Math.max(minCropH, focusDevice.height + contextY * 2);
-
-  const cropW = Math.min(imageWidth, Math.round(desiredW));
-  const cropH = Math.min(imageHeight, Math.round(desiredH));
-
-  const centerX = focusDevice.x + focusDevice.width / 2;
-  const centerY = focusDevice.y + focusDevice.height / 2;
-
-  const cropX = clamp(
-    Math.round(centerX - cropW / 2),
-    0,
-    Math.max(0, imageWidth - cropW),
-  );
-  const cropY = clamp(
-    Math.round(centerY - cropH / 2),
-    0,
-    Math.max(0, imageHeight - cropH),
-  );
-
-  return { x: cropX, y: cropY, width: cropW, height: cropH };
-}
-
 function drawCandidateOutline(
   ctx: OffscreenCanvasRenderingContext2D,
   rect: BBox,
@@ -476,38 +358,26 @@ export async function renderPixelConfirm(
   const actualScaleY = viewportHeight > 0 ? bitmap.height / viewportHeight : 1;
   const scale = (actualScaleX + actualScaleY) / 2 || 1;
 
-  const crop = calculateCrop(bitmap.width, bitmap.height, scale, request);
-
-  const canvas = new OffscreenCanvas(crop.width, crop.height);
+  const canvas = new OffscreenCanvas(bitmap.width, bitmap.height);
   const ctx = canvas.getContext('2d');
   if (!ctx) {
     bitmap.close();
     throw new Error('[PixelConfirmRender] Failed to acquire 2d context');
   }
 
-  ctx.drawImage(
-    bitmap,
-    crop.x,
-    crop.y,
-    crop.width,
-    crop.height,
-    0,
-    0,
-    crop.width,
-    crop.height,
-  );
+  ctx.drawImage(bitmap, 0, 0);
   bitmap.close();
 
   const toDeviceRect = (b: BBox): BBox => ({
-    x: Math.round(b.x * scale - crop.x),
-    y: Math.round(b.y * scale - crop.y),
+    x: Math.round(b.x * scale),
+    y: Math.round(b.y * scale),
     width: Math.max(1, Math.round(b.width * scale)),
     height: Math.max(1, Math.round(b.height * scale)),
   });
 
   const toDevicePoint = (p: PointXY): PointXY => ({
-    x: Math.round(p.x * scale - crop.x),
-    y: Math.round(p.y * scale - crop.y),
+    x: Math.round(p.x * scale),
+    y: Math.round(p.y * scale),
   });
 
   // Candidate outlines first (so the hit box / crosshair sits on top).
@@ -557,6 +427,5 @@ export async function renderPixelConfirm(
     screenshot_data_url: compressed,
     viewport: { width: viewportWidth, height: viewportHeight },
     scale,
-    crop,
   };
 }
diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2
index 6c496ee..b471608 100644
--- a/server/agent/prompts/big_model/mouse_tool.j2
+++ b/server/agent/prompts/big_model/mouse_tool.j2
@@ -59,7 +59,7 @@ Commit a pending click or drag that was previewed in the previous response.
 { "action": "confirm" }
 ```
 
-Only valid right after a preview-style observation (zoomed crop with a yellow box or red crosshair). See **Confirmation previews** below.
+Use this right after a confirmation preview. See **Confirmation previews** below.
 
 ### scroll
 Scroll at the cursor's current position by `amount` CSS pixels in `direction`. `amount` is always positive — `direction` carries the sign.
@@ -80,12 +80,12 @@ Return the cursor to the viewport center.
 
 ## Confirmation previews
 
-When `click` or `drag` lands in an area with several interactable controls close together, the next observation is a zoomed crop showing exactly what your coordinate selected. The same outlines are also painted onto the live page DOM, so the screenshot reflects what a human watching the browser would see.
+When `click` or `drag` lands in an area with several interactable controls close together, the next observation is the page at its normal size with the target marked. The outlines are painted onto the live page DOM, so the screenshot reflects what a human watching the browser would see.
 
 - A **yellow** outline marks the element the click would commit on.
 - **Orange dashed** outlines mark nearby candidates. The message lists each candidate's HTML and center coordinates in `[0, 1000]` space.
 
-Check the yellow-highlighted element. If it matches what you wanted to click (or drag), reply `{ "action": "confirm" }` to commit. If it does not, re-emit `click` (or `drag`) with one of the listed candidate centers as the `coordinate`.
+If the yellow-highlighted element matches what you wanted to click (or drag), reply `{ "action": "confirm" }` to commit. To retarget, emit `click` (or `drag`) again with a new `coordinate` — pick a center from the candidate list, or estimate a fresh pixel from the screenshot.
 
 For a drag preview, the same rules apply at each endpoint. `confirm` commits the drag as previewed; otherwise re-emit `drag` with corrected `start_coordinate` and `end_coordinate`.
 
diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2
index 35f8fa1..36312f2 100644
--- a/server/agent/prompts/small_model/mouse_tool.j2
+++ b/server/agent/prompts/small_model/mouse_tool.j2
@@ -60,16 +60,16 @@ Commit a previewed click or drag.
 ```json
 { "action": "confirm" }
 ```
-Only valid after a preview observation (zoomed crop with a yellow box or red crosshair). See **Confirmation previews** below.
+Use this right after a confirmation preview. See **Confirmation previews** below.
 
 ## Confirmation previews
 
-If `click` or `drag` falls in a crowded area, the next observation is a zoomed crop. The same outlines are also painted on the live page DOM.
+When `click` or `drag` falls in a crowded area, the next observation is the page at its normal size with the target marked.
 
 - A **yellow** outline marks the element the click would commit on.
 - **Orange dashed** outlines mark nearby candidates, listed in the message with HTML and center coordinates in `[0, 1000]` space.
 
-Check the yellow-highlighted element. If it matches your intent, reply `{ "action": "confirm" }` to commit. Otherwise, re-emit `click` (or `drag`) with one of the listed candidate centers as the `coordinate`.
+If the yellow-highlighted element matches your intent, reply `{ "action": "confirm" }` to commit. To retarget, emit `click` (or `drag`) again with a new `coordinate` — pick a center from the candidate list, or estimate a fresh pixel from the screenshot.
 
 ## Patterns
 
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index be42fe7..b66fc64 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1355,9 +1355,10 @@ def _build_pixel_gate_message(
     ) -> str:
         """Compose the human-readable confirmation message for the agent.
 
-        Kept terse on purpose: the zoomed crop already shows the yellow
-        target and orange neighbors visually, so the message contributes
-        only the candidate list (HTML + centers) and one-line guidance.
+        Kept terse on purpose: the preview screenshot already shows the
+        yellow target and orange neighbors visually at original size, so
+        the message contributes only the candidate list (HTML + centers)
+        and one-line guidance.
         """
         lines: list[str] = []
         if kind == "click":

From 6de8677738ee735f247075fb6b369fad9fc92580 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 18:39:13 +0800
Subject: [PATCH 03/12] fix(eval): drive move picker drills down; runner
 supports --tests subset

The drive eval's Move-items dialog rendered every folder as a single flat
scrollable list, forcing the agent to scan ~30 path-labelled rows to find
a known nested target. Replace with a real drill-down: breadcrumb header
at the top (clickable segments navigate up) and a list of direct child
folders for the current location (click to drill in). The current
breadcrumb tail is the destination, so 'Move items' commits to wherever
you've navigated to.

Add a --tests flag to evaluate_browser_agent.py that filters the
all-tests scheduler to a named subset, so rerunning a handful of
failing cases doesn't burn the whole benchmark slot.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 eval/drive/css/drive.css       | 55 ++++++++++++++++++++++++++++++----
 eval/drive/js/drive.js         | 42 +++++++++++++++++++-------
 eval/evaluate_browser_agent.py | 21 +++++++++++++
 3 files changed, 102 insertions(+), 16 deletions(-)

diff --git a/eval/drive/css/drive.css b/eval/drive/css/drive.css
index 05e3f54..a508ff4 100644
--- a/eval/drive/css/drive.css
+++ b/eval/drive/css/drive.css
@@ -196,12 +196,56 @@
 }
 
 .drive-destination-picker {
-  max-height: 320px;
-  overflow: auto;
   border: 1px solid var(--mock-border);
   border-radius: 18px;
-  padding: 12px;
   background: rgba(15, 23, 34, 0.03);
+  display: flex;
+  flex-direction: column;
+  overflow: hidden;
+}
+
+.drive-destination-breadcrumb {
+  display: flex;
+  flex-wrap: wrap;
+  align-items: center;
+  gap: 4px;
+  padding: 10px 14px;
+  border-bottom: 1px solid var(--mock-border);
+  background: rgba(15, 23, 34, 0.04);
+}
+
+.drive-destination-crumb {
+  background: transparent;
+  border: 0;
+  padding: 4px 8px;
+  border-radius: 8px;
+  color: var(--mock-text);
+  font-size: 13px;
+  cursor: pointer;
+}
+
+.drive-destination-crumb:hover {
+  background: rgba(15, 23, 34, 0.06);
+}
+
+.drive-destination-crumb.active {
+  background: rgba(15, 157, 88, 0.12);
+  color: rgba(15, 157, 88, 0.95);
+  font-weight: 600;
+}
+
+.drive-destination-sep {
+  color: rgba(15, 23, 34, 0.35);
+  font-size: 13px;
+}
+
+.drive-destination-list {
+  max-height: 280px;
+  overflow: auto;
+  padding: 8px;
+  display: flex;
+  flex-direction: column;
+  gap: 2px;
 }
 
 .drive-destination-item {
@@ -216,9 +260,8 @@
   color: var(--mock-text);
 }
 
-.drive-destination-item.active {
-  border-color: rgba(15, 157, 88, 0.24);
-  background: rgba(15, 157, 88, 0.08);
+.drive-destination-item:hover {
+  background: rgba(15, 23, 34, 0.05);
 }
 
 .drive-upload-option {
diff --git a/eval/drive/js/drive.js b/eval/drive/js/drive.js
index f1b34a5..8b4e896 100644
--- a/eval/drive/js/drive.js
+++ b/eval/drive/js/drive.js
@@ -370,24 +370,46 @@ window.tracker = new AgentTracker("drive.google.com", "hard");
     `;
   }
 
-  function getAllDestinationFolders(state) {
-    return state.items.filter((item) => item.type === "folder");
+  function getChildFolders(state, parentId) {
+    return state.items.filter(
+      (item) => item.type === "folder" && item.parentId === parentId && item.section === "my-drive",
+    );
   }
 
   function renderDestinationPicker(state, activeDestinationId) {
-    return `
-      <div class="drive-destination-picker">
-        ${getAllDestinationFolders(state)
+    const trail = getFolderPath(state, activeDestinationId);
+    const children = getChildFolders(state, activeDestinationId || null);
+
+    const breadcrumb = `
+      <div class="drive-destination-breadcrumb">
+        <button class="drive-destination-crumb ${!activeDestinationId ? "active" : ""}" data-action="set-modal-destination" data-destination-id="">My Drive</button>
+        ${trail
+          .map((folder, idx) => {
+            const isLast = idx === trail.length - 1;
+            return `<span class="drive-destination-sep">/</span><button class="drive-destination-crumb ${isLast ? "active" : ""}" data-action="set-modal-destination" data-destination-id="${folder.id}">${escapeHtml(folder.name)}</button>`;
+          })
+          .join("")}
+      </div>
+    `;
+
+    const list = children.length
+      ? children
           .map((folder) => {
-            const path = getFolderPath(state, folder.parentId).map((item) => item.name).join(" / ");
+            const hasSub = getChildFolders(state, folder.id).length > 0;
             return `
-              <button class="drive-destination-item ${activeDestinationId === folder.id ? "active" : ""}" data-action="set-modal-destination" data-destination-id="${folder.id}">
+              <button class="drive-destination-item" data-action="set-modal-destination" data-destination-id="${folder.id}">
                 <span>${escapeHtml(folder.name)}</span>
-                <span class="mock-subtle">${escapeHtml(path || folder.section)}</span>
+                <span class="mock-subtle">${hasSub ? "Open ▸" : "Select"}</span>
               </button>
             `;
           })
-          .join("")}
+          .join("")
+      : `<p class="mock-subtle" style="padding: 12px;">No subfolders. The current folder is selected as the destination.</p>`;
+
+    return `
+      <div class="drive-destination-picker">
+        ${breadcrumb}
+        <div class="drive-destination-list">${list}</div>
       </div>
     `;
   }
@@ -439,7 +461,7 @@ window.tracker = new AgentTracker("drive.google.com", "hard");
               ${renderDestinationPicker(state, modal.destinationId || null)}
             </div>
             <div class="mock-modal-footer">
-              <span class="mock-subtle">The destination picker is intentionally scrollable and nested.</span>
+              <span class="mock-subtle">Click a folder to drill in. Click a breadcrumb segment to navigate up.</span>
               <button class="mock-btn" data-action="${modal.type === "move" ? "commit-move" : "commit-shortcut"}">
                 ${modal.type === "move" ? "Move items" : "Create shortcut"}
               </button>
diff --git a/eval/evaluate_browser_agent.py b/eval/evaluate_browser_agent.py
index 2924a64..8884f7f 100644
--- a/eval/evaluate_browser_agent.py
+++ b/eval/evaluate_browser_agent.py
@@ -2485,6 +2485,7 @@ def run_all(
         manual: bool = False,
         parallel: int = 1,
         single_model_parallel: int = 1,
+        tests_filter: Optional[List[str]] = None,
     ):
         """Run all test cases for specified LLM targets."""
         if not self.ensure_services(skip_services=skip_services, manual=manual):
@@ -2505,6 +2506,19 @@ def run_all(
             logger.warning("No test cases found")
             return False
 
+        if tests_filter:
+            wanted = set(tests_filter)
+            test_cases = [tc for tc in test_cases if tc.id in wanted]
+            missing = wanted - {tc.id for tc in test_cases}
+            if missing:
+                logger.error(f"Tests not found: {sorted(missing)}")
+                return False
+            logger.info(
+                "Filtered to %d test(s): %s",
+                len(test_cases),
+                [tc.id for tc in test_cases],
+            )
+
         scheduled_results = self._run_scheduled_jobs(
             test_cases=test_cases,
             targets=targets,
@@ -3223,6 +3237,12 @@ def main():
         ),
     )
     parser.add_argument("--test", help="Run specific test by ID")
+    parser.add_argument(
+        "--tests",
+        nargs="+",
+        help="Run a subset of tests by ID (space-separated). Routed through the "
+        "all-tests scheduler so --parallel / --single-model-parallel apply.",
+    )
     parser.add_argument(
         "--repair-output",
         help="Repair one saved evaluation output directory using persisted usage data.",
@@ -3502,6 +3522,7 @@ def main():
                     manual=False,
                     parallel=args.parallel,
                     single_model_parallel=args.single_model_parallel,
+                    tests_filter=args.tests,
                 )
                 if not success:
                     sys.exit(1)

From d3b0271669b94cf3ab3e77acd6fa88471c4c194d Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 18:39:27 +0800
Subject: [PATCH 04/12] fix(keyboard): reliable select-all + clear on macOS

Several keyboard paths failed on macOS so the agent couldn't replace
text in a focused field, which broke rename flows across the eval:

- Agents reached for Ctrl+A from training, which on macOS is "go to
  start of line" (Emacs binding), not select-all. Remap
  Control+<shortcut-key> -> Meta+<shortcut-key> on the host side so the
  intent lands correctly, and surface the swap in the observation
  message so the agent learns what actually fired. Add a `literal: true`
  field on KeyboardAction to bypass the remap when the agent really
  wants the raw Control combination.

- CDP `Input.dispatchKeyEvent` doesn't trigger Chromium's built-in
  select-all accelerator even when given Meta+a (the comment on the
  existing `clear` action already noted this). Add an
  `ensureSelectAllOnActive` JS fallback that runs after the key event
  for any Meta+a or Control+a press and forces the visual selection on
  the focused input/textarea/contenteditable, so a following `type`
  replaces instead of appending.

- The `clear` action was silently dead in production: the extension and
  command model existed, but the server processor's dispatch had no
  branch for KeyboardClearCommand, so every clear invocation raised
  "Unknown command type". Add the missing dispatch.

- Replace a frozen-Observation mutation in the clear branch
  (`obs.success = False`) with `model_copy(update=...)`, the pydantic-v2
  idiom for deriving a modified copy of a frozen model (frozen models
  reject attribute assignment).

Update the small- and big-model keyboard prompts to teach the macOS
shape affirmatively (Meta+a/c/v/x/z as the command shortcuts) and to
mention the auto-translation and `literal` escape hatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/commands/pixel-actions.ts       | 55 ++++++++++++++++++
 .../agent/prompts/big_model/keyboard_tool.j2  |  2 +
 .../prompts/small_model/keyboard_tool.j2      |  2 +
 server/agent/tools/browser_executor.py        | 56 ++++++++++++++++---
 server/agent/tools/keyboard_tool.py           |  8 +++
 server/core/processor.py                      | 10 ++++
 6 files changed, 124 insertions(+), 9 deletions(-)

diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index 9afda56..cf9e080 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -987,6 +987,48 @@ const KEY_TEXT: Record<string, string> = {
   Space: ' ',
 };
 
+// Selection-all on the focused element via JS. Chromium's accelerator
+// bindings for Meta+A / Control+A are not triggered by CDP
+// `Input.dispatchKeyEvent`, so the dispatched key combination fires
+// keyboard listeners but leaves the field unselected. After the key
+// event, we run a small JS snippet that calls `.select()` on inputs /
+// textareas and `Selection.selectNodeContents` on contenteditables, so
+// the visual selection actually exists for a following `type` or
+// `Backspace`.
+async function ensureSelectAllOnActive(cdp: CdpCommander): Promise<void> {
+  const expr = `(() => {
+    const el = document.activeElement;
+    if (!el || el === document.body) return false;
+    const tag = (el.tagName || '').toLowerCase();
+    try {
+      if (tag === 'input' || tag === 'textarea') {
+        if (typeof el.select === 'function') { el.select(); return true; }
+      }
+      if (el.isContentEditable) {
+        const sel = window.getSelection && window.getSelection();
+        if (sel && document.createRange) {
+          const range = document.createRange();
+          range.selectNodeContents(el);
+          sel.removeAllRanges();
+          sel.addRange(range);
+          return true;
+        }
+      }
+    } catch (_) {}
+    return false;
+  })()`;
+  try {
+    await cdp.sendCommand(
+      'Runtime.evaluate',
+      { expression: expr, returnByValue: true },
+      4000,
+      0,
+    );
+  } catch (_) {
+    // best-effort; the CDP key event still fired
+  }
+}
+
 export async function performKeyboardPress(
   tabId: number,
   conversationId: string,
@@ -1037,6 +1079,19 @@ export async function performKeyboardPress(
     8000,
     0,
   );
+
+  // Select-all accelerator (Meta+A / Control+A) doesn't fire via CDP key
+  // events. After listeners have observed the keydown/keyup pair, force
+  // the visual selection so a subsequent `type` or `Backspace` replaces
+  // the contents instead of appending to them.
+  const isSelectAll =
+    resolved.key.toLowerCase() === 'a' &&
+    (mod & 0x4 || mod & 0x2) &&
+    !(mod & 0x1);
+  if (isSelectAll) {
+    await ensureSelectAllOnActive(cdp);
+  }
+
   return { key: resolved.key, modifiers: mod };
 }
 
diff --git a/server/agent/prompts/big_model/keyboard_tool.j2 b/server/agent/prompts/big_model/keyboard_tool.j2
index 16b76c0..748e0ee 100644
--- a/server/agent/prompts/big_model/keyboard_tool.j2
+++ b/server/agent/prompts/big_model/keyboard_tool.j2
@@ -35,6 +35,8 @@ Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`, `ArrowD
 
 Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS).
 
+On macOS, use `Meta` for command shortcuts: `Meta+a` selects all, `Meta+c`/`v`/`x` are copy/paste/cut, `Meta+z` is undo, `Meta+f` opens find. If you pass `Control` with one of these shortcut keys (`a`, `c`, `v`, `x`, `z`, `y`, `s`, `f`, `g`, `p`, `n`, `t`, `w`, `r`, `l`), the server translates it to `Meta` and the observation reports the actual press. Set `literal: true` on the action to send the exact `Control` combination without translation — useful when you need the literal `Control` binding (e.g. Emacs-style cursor moves).
+
 ### clear
 Empty the currently focused field. Works on `<input>`, `<textarea>`, and contenteditable widgets (Gmail search, rich editors). The result reports whether the field actually ended up empty — if it didn't, click into the field first and try again.
 
diff --git a/server/agent/prompts/small_model/keyboard_tool.j2 b/server/agent/prompts/small_model/keyboard_tool.j2
index ea2484d..c3eee46 100644
--- a/server/agent/prompts/small_model/keyboard_tool.j2
+++ b/server/agent/prompts/small_model/keyboard_tool.j2
@@ -27,6 +27,8 @@ Common keys: `Enter`, `Escape`, `Tab`, `Backspace`, `Delete`, `ArrowUp`/`Down`/`
 
 Modifiers: `Control`, `Shift`, `Alt`, `Meta` (Cmd on macOS).
 
+On macOS, use `Meta` for command shortcuts: `Meta+a` selects all, `Meta+c`/`v`/`x` are copy/paste/cut, `Meta+z` is undo. If you pass `Control` with one of these shortcut keys (`a`, `c`, `v`, `x`, `z`, `y`, `s`, `f`, `g`, `p`, `n`, `t`, `w`, `r`, `l`), the server translates it to `Meta` and the observation reports the actual press. Set `literal: true` to send the exact `Control` combination without translation.
+
 ### clear
 Empty the focused field. Works on `<input>`, `<textarea>`, and contenteditable widgets. The result tells you whether the field actually ended up empty — if not, click into the field first and try again.
 ```json
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index b66fc64..567a27b 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -13,8 +13,9 @@
 
 import asyncio
 import logging
+import sys
 import threading
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from openhands.sdk.tool import ToolExecutor
 import requests
@@ -1824,6 +1825,36 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                 success=False, error=str(e), small_model=self._uses_small_model()
             )
 
+    # Keys where Control on Windows/Linux maps to Cmd (Meta) on macOS.
+    # Excludes navigation/cursor combos (arrows, Home/End) since those have
+    # different semantics on macOS that can't be remapped 1:1.
+    _MAC_REMAP_KEYS = {
+        "a", "c", "v", "x", "z", "y", "s", "f", "g",
+        "p", "n", "t", "w", "r", "l", "+", "-", "0",
+    }
+
+    @classmethod
+    def _maybe_remap_for_host(
+        cls, key: str, modifiers: List[str]
+    ) -> Tuple[List[str], Optional[str]]:
+        """Translate Control+<shortcut-key> → Meta+<shortcut-key> on macOS.
+
+        Returns (modifiers, note). `note` is a short human-readable string
+        describing what swapped, suitable for appending to the observation
+        message so the agent learns the actual keys that hit the page.
+        """
+        if sys.platform != "darwin" or not modifiers or not key:
+            return modifiers, None
+        if key.lower() not in cls._MAC_REMAP_KEYS:
+            return modifiers, None
+        if "Control" not in modifiers:
+            return modifiers, None
+        new_mods = ["Meta" if m == "Control" else m for m in modifiers]
+        before = "+".join(modifiers + [key])
+        after = "+".join(new_mods + [key])
+        note = f"(remapped {before} to {after} on macOS)"
+        return new_mods, note
+
     def _execute_keyboard_action(
         self, action: KeyboardAction
     ) -> OpenBrowserObservation:
@@ -1849,18 +1880,25 @@ def _execute_keyboard_action(
             if kind == "press":
                 if not action.key:
                     raise ValueError("keyboard press requires key")
+                modifiers = list(action.modifiers or [])
+                remap_note: Optional[str] = None
+                if not action.literal:
+                    modifiers, remap_note = self._maybe_remap_for_host(
+                        action.key, modifiers
+                    )
+                if remap_note:
+                    logger.info("Keyboard press %s", remap_note)
                 command = KeyboardPressCommand(
                     key=action.key,
-                    modifiers=list(action.modifiers or []),
+                    modifiers=modifiers,
                     conversation_id=self.conversation_id,
                 )
                 result_dict = self._execute_command_sync(command)
-                mod_text = (
-                    f" with {'+'.join(action.modifiers)}" if action.modifiers else ""
-                )
-                return self._build_observation_from_result(
-                    result_dict, f"Pressed {action.key}{mod_text}"
-                )
+                mod_text = f" with {'+'.join(modifiers)}" if modifiers else ""
+                msg = f"Pressed {action.key}{mod_text}"
+                if remap_note:
+                    msg = f"{msg} {remap_note}"
+                return self._build_observation_from_result(result_dict, msg)
 
             if kind == "clear":
                 # JS-based clear on document.activeElement: set value /
@@ -1886,7 +1924,7 @@ def _execute_keyboard_action(
                     )
                 obs = self._build_observation_from_result(result_dict, msg)
                 if not cleared:
-                    obs.success = False
+                    obs = obs.model_copy(update={"success": False})
                 return obs
 
             raise ValueError(f"Unknown keyboard action: {kind}")
diff --git a/server/agent/tools/keyboard_tool.py b/server/agent/tools/keyboard_tool.py
index a53bf00..b7ddd4e 100644
--- a/server/agent/tools/keyboard_tool.py
+++ b/server/agent/tools/keyboard_tool.py
@@ -69,6 +69,14 @@ class KeyboardAction(OpenBrowserAction):
             "Use 'Meta' for Cmd on macOS."
         ),
     )
+    literal: bool = Field(
+        default=False,
+        description=(
+            "Skip OS-aware modifier translation and send the exact "
+            "key+modifiers as written. Set true when you need the literal "
+            "Control behavior (e.g. an Emacs-style cursor binding on macOS)."
+        ),
+    )
 
 
 class KeyboardTool(ToolDefinition[KeyboardAction, OpenBrowserObservation]):
diff --git a/server/core/processor.py b/server/core/processor.py
index d2897b6..50052cf 100644
--- a/server/core/processor.py
+++ b/server/core/processor.py
@@ -16,6 +16,7 @@
     ResetMouseCommand,
     KeyboardTypeCommand,
     KeyboardPressCommand,
+    KeyboardClearCommand,
     SelectOptionCommand,
     UploadFilePendingCommand,
     ScreenshotCommand,
@@ -243,6 +244,8 @@ async def execute(self, command: Command) -> CommandResponse:
                 return await self._execute_keyboard_type(command)
             elif isinstance(command, KeyboardPressCommand):
                 return await self._execute_keyboard_press(command)
+            elif isinstance(command, KeyboardClearCommand):
+                return await self._execute_keyboard_clear(command)
             elif isinstance(command, SelectOptionCommand):
                 return await self._execute_select_option(command)
             elif isinstance(command, UploadFilePendingCommand):
@@ -344,6 +347,13 @@ async def _execute_keyboard_press(
         response = await self._send_prepared_command(command)
         return response
 
+    async def _execute_keyboard_clear(
+        self, command: KeyboardClearCommand
+    ) -> CommandResponse:
+        """Execute keyboard clear command — JS-based reset of the focused field."""
+        response = await self._send_prepared_command(command)
+        return response
+
     async def _execute_select_option(
         self, command: SelectOptionCommand
     ) -> CommandResponse:

From a9b20a0eacc135fe719bc38018722aa84498077a Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 18:58:00 +0800
Subject: [PATCH 05/12] fix(pixel): align Qwen action coords; warn on keyboard
 no-focus; tiered no-op radius

- Echo click/move/drag/confirm observation coordinates in the agent's own
  space (Qwen [0,1000]) so the value matches what the model emitted.
- Refuse keyboard type when nothing editable is focused and surface a
  warning analogous to the click no-op message; annotate a successful
  type with a short descriptor of the focused field (tag, #id, name,
  role).
- When a click no-op produces no nearby interactables at 30px, re-probe
  at 100px then 300px and tag each hint with its distance, so the agent
  has somewhere concrete to re-aim at whenever an interactable exists
  within 300px.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/commands/pixel-actions.ts | 70 +++++++++++++++++-
 server/agent/tools/browser_executor.py  | 94 ++++++++++++++++++++++---
 2 files changed, 154 insertions(+), 10 deletions(-)

diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index cf9e080..02d0e2a 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -868,10 +868,76 @@ export async function performKeyboardType(
   tabId: number,
   conversationId: string,
   text: string,
-): Promise<{ length: number }> {
+): Promise<{
+  length: number;
+  typed: boolean;
+  target?: string;
+  reason?: string;
+}> {
   await attachWithDialogTracking(tabId, conversationId);
   const cdp = new CdpCommander(tabId);
 
+  // Refuse to dispatch when nothing editable is focused. Without this
+  // the CDP keystrokes still fire (against the body / a focused button
+  // / nothing), the page never receives an `input` event, and the agent
+  // sees a generic "Typed text" success — wasting a turn and producing
+  // a stale screenshot. Mirrors the focused-element check used by
+  // `keyboard clear`.
+  const focusProbeExpr = `(() => {
+    const el = document.activeElement;
+    if (!el || el === document.body) {
+      return { editable: false, reason: 'no element focused' };
+    }
+    const tag = (el.tagName || '').toLowerCase();
+    const role = el.getAttribute && el.getAttribute('role');
+    const describe = () => {
+      const id = el.id ? '#' + el.id : '';
+      const name = el.getAttribute && el.getAttribute('name')
+        ? '[name=' + el.getAttribute('name') + ']' : '';
+      const r = role ? '[role=' + role + ']' : '';
+      return tag + id + name + r;
+    };
+    if (tag === 'input') {
+      const t = (el.getAttribute('type') || 'text').toLowerCase();
+      const nonText = new Set([
+        'button', 'submit', 'reset', 'checkbox', 'radio', 'file', 'image',
+        'hidden', 'range', 'color',
+      ]);
+      if (nonText.has(t)) {
+        return {
+          editable: false,
+          target: describe(),
+          reason: 'focused <input type=' + t + '> does not accept typed text',
+        };
+      }
+      return { editable: true, target: describe() };
+    }
+    if (tag === 'textarea') return { editable: true, target: describe() };
+    if (el.isContentEditable) return { editable: true, target: describe() };
+    if (role === 'textbox' || role === 'searchbox' || role === 'combobox') {
+      return { editable: true, target: describe() };
+    }
+    return {
+      editable: false,
+      target: describe(),
+      reason: 'focused element is not an editable field',
+    };
+  })()`;
+  const focusResp = await cdp.sendCommand<{
+    result?: {
+      value?: { editable?: boolean; target?: string; reason?: string };
+    };
+  }>('Runtime.evaluate', { expression: focusProbeExpr, returnByValue: true }, 8000, 0);
+  const focus = focusResp?.result?.value || {};
+  if (!focus.editable) {
+    return {
+      length: 0,
+      typed: false,
+      target: focus.target,
+      reason: focus.reason || 'no editable element focused',
+    };
+  }
+
   // Type one character at a time so the page sees real `keydown` →
   // `keypress` → `input` → `keyup` events for each char. This matches
   // what a human keyboard produces and lets per-char JS handlers
@@ -921,7 +987,7 @@ export async function performKeyboardType(
       await sleep(PER_CHAR_DELAY_MS);
     }
   }
-  return { length: text.length };
+  return { length: text.length, typed: true, target: focus.target };
 }
 
 const NAMED_KEY_MAP: Record<
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 567a27b..80b79bf 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1119,12 +1119,41 @@ def _denormalize_xy(
         py = round(y * vh / 1000) if y is not None else None
         return (px, py)
 
+    def _format_action_xy(
+        self, x_css: Optional[int], y_css: Optional[int]
+    ) -> str:
+        """Render an (x, y) pair in the coordinate space the agent uses.
+
+        For Qwen models the agent emits and reads coordinates in [0, 1000]
+        normalized space, so the observation must echo back the same space
+        — otherwise the agent sees a CSS-pixel value it can't reconcile with
+        the input it just sent. Non-Qwen models work in CSS pixels and get
+        the values unchanged.
+        """
+        if x_css is None or y_css is None:
+            return "(?, ?)"
+        if self._is_qwen_model():
+            viewport = self._get_viewport()
+            if viewport is not None:
+                vw, vh = viewport
+                if vw > 0 and vh > 0:
+                    nx = round(x_css / vw * 1000)
+                    ny = round(y_css / vh * 1000)
+                    return f"({nx}, {ny})"
+        return f"({int(x_css)}, {int(y_css)})"
+
     # ========== Pixel-action density gate ==========
 
     PIXEL_GATE_RADIUS_CSS = 30
     PIXEL_GATE_CANDIDATE_LIMIT = 5
+    PIXEL_GATE_FALLBACK_RADII_CSS = (30, 100, 300)
 
-    def _gate_pixel_target(self, x_css: int, y_css: int) -> Optional[Dict[str, Any]]:
+    def _gate_pixel_target(
+        self,
+        x_css: int,
+        y_css: int,
+        radius: Optional[int] = None,
+    ) -> Optional[Dict[str, Any]]:
         """Probe (x, y) for the hit element + nearby interactables.
 
         Returns the analysis dict from the extension on success, or None if
@@ -1136,7 +1165,7 @@ def _gate_pixel_target(self, x_css: int, y_css: int) -> Optional[Dict[str, Any]]
             cmd = AnalyzePixelTargetsCommand(
                 x=int(x_css),
                 y=int(y_css),
-                radius=self.PIXEL_GATE_RADIUS_CSS,
+                radius=int(radius) if radius is not None else self.PIXEL_GATE_RADIUS_CSS,
                 candidate_limit=self.PIXEL_GATE_CANDIDATE_LIMIT,
                 conversation_id=self.conversation_id,
             )
@@ -1162,6 +1191,32 @@ def _gate_pixel_target(self, x_css: int, y_css: int) -> Optional[Dict[str, Any]]
             return None
         return data
 
+    def _expand_gate_for_warning(
+        self,
+        cursor: Optional[tuple[int, int]],
+        initial_gate: Optional[Dict[str, Any]],
+    ) -> Optional[Dict[str, Any]]:
+        """Re-probe with progressively larger radii so the no-op warning
+        always carries at least one interactable hint when the page has any.
+
+        The pre-click density gate uses a tight 30px radius (sized for the
+        dense/sparse verdict). After a click no-op, a completely empty
+        neighborhood is unhelpful — the agent learns nothing about where
+        the nearest clickable target actually is. Expand to 100, then 300,
+        and return the first probe that surfaces candidates.
+        """
+        if cursor is None:
+            return initial_gate
+        if initial_gate and (initial_gate.get("neighborhood") or []):
+            return initial_gate
+        for radius in self.PIXEL_GATE_FALLBACK_RADII_CSS:
+            if radius == self.PIXEL_GATE_RADIUS_CSS and initial_gate is not None:
+                continue
+            probe = self._gate_pixel_target(cursor[0], cursor[1], radius=radius)
+            if probe and (probe.get("neighborhood") or []):
+                return probe
+        return initial_gate
+
     def _serialize_pixel_candidates(
         self,
         candidates: list,
@@ -1271,6 +1326,9 @@ def _format_pixel_candidates_block(
             cy = cn.get("y")
             if cx is not None and cy is not None:
                 element_lines[0] = f"{element_lines[0]}  → center=({cx}, {cy})"
+            dist = c.get("distance_css")
+            if isinstance(dist, (int, float)):
+                element_lines[0] = f"{element_lines[0]}  · dist={int(round(dist))}px"
             lines.extend(element_lines)
         return "\n".join(lines)
 
@@ -1604,7 +1662,8 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                 self._clear_pending_confirmation()
                 message = (
                     f"Confirmed click {button} at "
-                    f"({extra.get('px')}, {extra.get('py')}) (count={count})"
+                    f"{self._format_action_xy(extra.get('px'), extra.get('py'))} "
+                    f"(count={count})"
                 )
                 intercepted = self._extract_intercepted_form_control(result_dict)
                 if intercepted:
@@ -1644,7 +1703,9 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                 self._clear_pending_confirmation()
                 self._cache_cursor(int(ex), int(ey))
                 return self._build_observation_from_result(
-                    result_dict, f"Confirmed drag from ({sx}, {sy}) to ({ex}, {ey})"
+                    result_dict,
+                    f"Confirmed drag from {self._format_action_xy(sx, sy)} "
+                    f"to {self._format_action_xy(ex, ey)}",
                 )
 
             self._clear_pending_confirmation()
@@ -1692,7 +1753,7 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                 if px is not None and py is not None:
                     self._cache_cursor(px, py)
                 return self._build_observation_from_result(
-                    result_dict, f"Mouse moved to ({px}, {py})"
+                    result_dict, f"Mouse moved to {self._format_action_xy(px, py)}"
                 )
 
             if kind == "click":
@@ -1729,7 +1790,7 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                 result_dict = self._execute_command_sync(command)
                 cx, cy = cursor or (None, None)
                 where = (
-                    f"({cx}, {cy})"
+                    self._format_action_xy(cx, cy)
                     if cx is not None and cy is not None
                     else "the cursor"
                 )
@@ -1741,6 +1802,7 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                         intercepted, action.button, action.count
                     )
                 elif self._click_was_a_no_op(result_dict):
+                    gate = self._expand_gate_for_warning(cursor, gate)
                     message += self._format_no_op_warning(gate)
                     # Draw orange-dashed candidates on the live page so a
                     # human watching the browser sees what the agent is
@@ -1789,8 +1851,10 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                 result_dict = self._execute_command_sync(command)
                 if ex is not None and ey is not None:
                     self._cache_cursor(ex, ey)
+                start_str = self._format_action_xy(sx, sy)
+                end_str = self._format_action_xy(ex, ey)
                 return self._build_observation_from_result(
-                    result_dict, f"Dragged from ({sx}, {sy}) to ({ex}, {ey})"
+                    result_dict, f"Dragged from {start_str} to {end_str}"
                 )
 
             if kind == "scroll":
@@ -1873,8 +1937,22 @@ def _execute_keyboard_action(
                 preview = (
                     action.text if len(action.text) <= 32 else action.text[:29] + "..."
                 )
+                detail = (result_dict or {}).get("data", {}) or {}
+                # Older extension builds don't emit `typed`; treat missing as
+                # success so we don't false-warn during a rolling upgrade.
+                typed = detail.get("typed")
+                if typed is False:
+                    reason = detail.get("reason") or "no editable element focused"
+                    msg = (
+                        f"Type had no effect ({reason}). Click into the "
+                        f"target input field first, then type."
+                    )
+                    obs = self._build_observation_from_result(result_dict, msg)
+                    return obs.model_copy(update={"success": False})
+                target = detail.get("target")
+                target_note = f" into {target}" if isinstance(target, str) and target else ""
                 return self._build_observation_from_result(
-                    result_dict, f"Typed text: {preview!r}"
+                    result_dict, f"Typed text: {preview!r}{target_note}"
                 )
 
             if kind == "press":

From d80133158b75ccaa161fca832dc1093b56eb26bd Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 21:15:23 +0800
Subject: [PATCH 06/12] fix(scroll/gate/render): post-scroll settle, gate
 target, surface FAILED message
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Mouse scroll now actively polls window.scrollX/scrollY until the
  position stabilizes before screenshotting; a single 1000px wheel on a
  page with `scroll-behavior: smooth` animates 500-900ms and was
  returning a mid-animation blank viewport. Polling is capped at 1.5s
  and is followed by a 200ms paint settle.
- Gate preview now includes the previewed target element's structured
  identity (Target: <div> "4" → center=(584, 257)) on its own line,
  formatted like the entries of the candidate list, so the agent can
  read what the yellow box is before deciding whether to confirm or
  re-aim.
- Observation renderer at base.py:444 now emits `**Action**: {message}`
  on FAILED observations too, and omits the `**Error**` line when there
  is no error; previously the keyboard no-focus warning ("Click into
  the target input field first…") was dropped because the failure
  renderer only emitted Status + Error, and the agent saw a meaningless
  `Status: FAILED, Error: None`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/background/index.ts       |  9 +++-
 extension/src/commands/pixel-actions.ts | 56 +++++++++++++++++++++++++
 server/agent/tools/base.py              |  5 ++-
 server/agent/tools/browser_executor.py  | 44 +++++++++++++++++++
 4 files changed, 112 insertions(+), 2 deletions(-)

diff --git a/extension/src/background/index.ts b/extension/src/background/index.ts
index db444a3..9a54566 100644
--- a/extension/src/background/index.ts
+++ b/extension/src/background/index.ts
@@ -2056,7 +2056,14 @@ async function handleCommand(command: Command): Promise<CommandResponse> {
         // somewhere mid-glide. `reset_mouse` jumps to viewport center via
         // the same sprite path and needs the same wait.
         let settleMs = 0;
-        if (
+        if (command.type === 'mouse_scroll') {
+          // `performMouseScroll` actively polls `window.scrollY` until it
+          // stabilizes before returning, so the smooth-scroll animation is
+          // already done by the time we get here. A small additional wait
+          // lets any post-settle paint / IntersectionObserver-triggered
+          // content (lazy images, virtualized rows) reach the screen.
+          settleMs = 200;
+        } else if (
           command.type === 'mouse_click' ||
           command.type === 'mouse_drag' ||
           command.type === 'keyboard_press' ||
diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index 02d0e2a..03bc151 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -776,10 +776,66 @@ export async function performMouseScroll(
     8000,
     0,
   );
+
+  // Wait for the scroll position to stop changing. Real pages frequently
+  // set `html { scroll-behavior: smooth }`, which turns the wheel event
+  // into a multi-frame animation lasting 500–900ms for a 1000px delta;
+  // capturing the screenshot during the animation shows the page mid-
+  // glide (blank destination region, lazy content not yet hydrated) and
+  // makes the agent think the scroll did nothing.
+  //
+  // Active polling beats a fixed sleep: short scrolls return in ~150ms,
+  // long smooth-scrolls get the full window, and we don't pay 1.2s on
+  // every scroll regardless of size.
+  await waitForScrollSettle(cdp);
+
   await refreshCursor(cdp, tabId, cursor.x, cursor.y);
   return { x: cursor.x, y: cursor.y, deltaX, deltaY };
 }
 
+async function waitForScrollSettle(
+  cdp: CdpCommander,
+  pollIntervalMs: number = 80,
+  maxWaitMs: number = 1500,
+  stableSamples: number = 2,
+): Promise<void> {
+  const readScroll = async (): Promise<[number, number] | null> => {
+    try {
+      const resp = await cdp.sendCommand<{
+        result?: { value?: { x?: number; y?: number } };
+      }>(
+        'Runtime.evaluate',
+        {
+          expression: '({x: window.scrollX, y: window.scrollY})',
+          returnByValue: true,
+        },
+        4000,
+        0,
+      );
+      const v = resp?.result?.value;
+      if (!v || typeof v.x !== 'number' || typeof v.y !== 'number') return null;
+      return [v.x, v.y];
+    } catch {
+      return null;
+    }
+  };
+
+  const start = Date.now();
+  let last = await readScroll();
+  let stable = 0;
+  while (Date.now() - start < maxWaitMs) {
+    await sleep(pollIntervalMs);
+    const cur = await readScroll();
+    if (cur && last && cur[0] === last[0] && cur[1] === last[1]) {
+      stable += 1;
+      if (stable >= stableSamples) return;
+    } else {
+      stable = 0;
+    }
+    last = cur;
+  }
+}
+
 // Per-character US-keyboard mapping for plain ASCII printables. Used by
 // `performKeyboardType` to dispatch real keyDown/keyUp events one char at
 // a time — feels like a human typing and lets per-character JS handlers
diff --git a/server/agent/tools/base.py b/server/agent/tools/base.py
index 9c16237..736c238 100644
--- a/server/agent/tools/base.py
+++ b/server/agent/tools/base.py
@@ -443,7 +443,10 @@ def to_llm_content(self) -> Sequence[TextContent | ImageContent]:
         # cached vw/vh on the executor still drives that conversion.
         if not self.success:
             text_parts.append(f"**Status**: FAILED")
-            text_parts.append(f"**Error**: {self.error}")
+            if self.error:
+                text_parts.append(f"**Error**: {self.error}")
+            if self.message:
+                text_parts.append(f"**Action**: {self.message}")
         else:
             text_parts.append(f"**Status**: SUCCESS")
             # For JavaScript operations, show minimal confirmation
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 80b79bf..bc67761 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1404,6 +1404,41 @@ def _render_pixel_preview(
             return url
         return None
 
+    def _format_pixel_target_line(
+        self,
+        hit: Optional[Dict[str, Any]],
+        vw: int,
+        vh: int,
+    ) -> str:
+        """Render the previewed-target element on a single line.
+
+        Same descriptor-first shape as `_format_pixel_candidates_block`,
+        labeled `Target` instead of a numeric id. Lets the agent read what
+        the yellow rectangle actually is before deciding whether to confirm
+        or re-aim.
+        """
+        if not hit or vw <= 0 or vh <= 0:
+            return ""
+        from server.agent.tools.base import _format_highlighted_element_lines
+
+        serialized = self._serialize_pixel_candidates([hit], vw, vh)
+        if not serialized:
+            return ""
+        el = serialized[0]
+        element_lines = _format_highlighted_element_lines("Target", el)
+        if not element_lines:
+            return ""
+        if element_lines[0].rstrip().endswith(">"):
+            snippet = self._html_snippet_for_candidate(el)
+            if snippet:
+                element_lines[0] = f"{element_lines[0]} · {snippet}"
+        cn = el.get("center_norm") or {}
+        cx = cn.get("x")
+        cy = cn.get("y")
+        if cx is not None and cy is not None:
+            element_lines[0] = f"{element_lines[0]}  → center=({cx}, {cy})"
+        return "\n".join(element_lines)
+
     def _build_pixel_gate_message(
         self,
         kind: str,
@@ -1411,6 +1446,7 @@ def _build_pixel_gate_message(
         hit: Optional[Dict[str, Any]],
         candidates: list,
         drag_endpoints: Optional[Dict[str, str]] = None,
+        target_line: str = "",
     ) -> str:
         """Compose the human-readable confirmation message for the agent.
 
@@ -1427,6 +1463,8 @@ def _build_pixel_gate_message(
                     "commit, or re-emit `click` with one of the candidate "
                     "centers below."
                 )
+                if target_line:
+                    lines.append(target_line)
             else:
                 lines.append(
                     "No element under the cursor — re-emit `click` with one "
@@ -1443,6 +1481,8 @@ def _build_pixel_gate_message(
                 "Drag previewed" + note + ". Confirm to commit, or re-emit "
                 "`drag` with corrected endpoints."
             )
+            if target_line:
+                lines.append(target_line)
         block = self._format_pixel_candidates_block(
             candidates,
             header="Nearby candidates (centers in [0,1000] space)",
@@ -1495,11 +1535,13 @@ def _gate_pixel_click(
             banner_kind="click",
         )
 
+        target_line = self._format_pixel_target_line(hit, vw, vh)
         message = self._build_pixel_gate_message(
             kind="click",
             verdict="dense",
             hit=hit,
             candidates=candidates,
+            target_line=target_line,
         )
 
         self._set_pending_confirmation(
@@ -1595,12 +1637,14 @@ def _gate_pixel_drag(
             "start": "dense" if start_dense else "sparse",
             "end": "dense" if end_dense else "sparse",
         }
+        target_line = self._format_pixel_target_line(focus_hit, vw, vh)
         message = self._build_pixel_gate_message(
             kind="drag",
             verdict="dense",
             hit=focus_hit,
             candidates=candidates,
             drag_endpoints=endpoints,
+            target_line=target_line,
         )
 
         self._set_pending_confirmation(

From afa8005b3d16c8d4c068219ee57a7ce9c10e84a1 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 21:37:25 +0800
Subject: [PATCH 07/12] fix(scroll): align Qwen scroll amount with normalized
 coord space
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Qwen models emit clicks/moves in [0, 1000] normalized space, but scroll
amount was still being treated as raw CSS pixels — so a value of 500
meant "viewport center" for a click but "500 actual pixels" for a
scroll, an inconsistency the agent had to mentally compensate for.

- Denormalize the scroll amount against the axis-relevant viewport
  dimension before dispatching the wheel event (vh for up/down, vw
  for left/right). Non-Qwen models pass through unchanged.
- Echo the agent's input amount back in the observation message
  ("Scrolled down by 500", no "px") so action and observation use the
  same space.
- Update the mouse tool prompts and the `amount` field description
  to teach the [0, 1000] semantic: 1000 ≈ one full viewport.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/agent/prompts/big_model/mouse_tool.j2  |  2 +-
 .../agent/prompts/small_model/mouse_tool.j2   |  2 +-
 server/agent/tools/browser_executor.py        | 38 ++++++++++++++++++-
 server/agent/tools/mouse_tool.py              |  6 ++-
 4 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/server/agent/prompts/big_model/mouse_tool.j2 b/server/agent/prompts/big_model/mouse_tool.j2
index b471608..6801880 100644
--- a/server/agent/prompts/big_model/mouse_tool.j2
+++ b/server/agent/prompts/big_model/mouse_tool.j2
@@ -62,7 +62,7 @@ Commit a pending click or drag that was previewed in the previous response.
 Use this right after a confirmation preview. See **Confirmation previews** below.
 
 ### scroll
-Scroll at the cursor's current position by `amount` CSS pixels in `direction`. `amount` is always positive — `direction` carries the sign.
+Scroll at the cursor's current position by `amount` in the same [0, 1000] space as coordinates: `amount: 1000` is one full viewport in the chosen direction, `amount: 500` is half. `amount` is always positive — `direction` carries the sign.
 
 ```json
 { "action": "scroll", "direction": "down", "amount": 600 }
diff --git a/server/agent/prompts/small_model/mouse_tool.j2 b/server/agent/prompts/small_model/mouse_tool.j2
index 36312f2..e7d56b3 100644
--- a/server/agent/prompts/small_model/mouse_tool.j2
+++ b/server/agent/prompts/small_model/mouse_tool.j2
@@ -42,7 +42,7 @@ Press at `start_coordinate`, drag to `end_coordinate`, release.
 ```
 
 ### scroll
-Scroll at the cursor by `amount` CSS pixels. `amount` is always positive — `direction` carries the sign. To scroll inside a panel, sidebar, or modal, `move` over it first so the wheel event lands there.
+Scroll at the cursor by `amount` in the same [0, 1000] space as coordinates: `amount: 1000` is one full viewport in the chosen direction, `amount: 500` is half. `amount` is always positive — `direction` carries the sign. To scroll inside a panel, sidebar, or modal, `move` over it first so the wheel event lands there.
 ```json
 { "action": "scroll", "direction": "down", "amount": 600 }
 { "action": "scroll", "direction": "up", "amount": 300 }
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index bc67761..d3fef51 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1119,6 +1119,31 @@ def _denormalize_xy(
         py = round(y * vh / 1000) if y is not None else None
         return (px, py)
 
+    def _denormalize_scroll_amount(
+        self, amount: int, direction: str
+    ) -> int:
+        """Convert a Qwen-normalized scroll amount to CSS pixels.
+
+        Qwen emits scroll deltas in [0, 1000] (same space as click coords),
+        so `amount=800, direction=down` means "scroll 80% of the viewport
+        height down." Vertical scrolls scale against viewport height;
+        horizontal against width. Non-Qwen models already pass CSS pixels.
+        """
+        try:
+            amt = int(amount)
+        except (TypeError, ValueError):
+            return amount
+        if amt <= 0 or not self._is_qwen_model():
+            return amt
+        viewport = self._get_viewport()
+        if viewport is None:
+            return amt
+        vw, vh = viewport
+        axis = vh if direction in ("up", "down") else vw
+        if axis <= 0:
+            return amt
+        return max(1, round(amt * axis / 1000))
+
     def _format_action_xy(
         self, x_css: Optional[int], y_css: Optional[int]
     ) -> str:
@@ -1902,15 +1927,24 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                 )
 
             if kind == "scroll":
+                # Qwen emits scroll amounts in the same [0,1000] normalized
+                # space it uses for click/move coords — `amount: 800` means
+                # "scroll 80% of the viewport," not 800 CSS pixels. Convert
+                # against the axis-relevant viewport dimension before the
+                # extension dispatches the wheel event. Non-Qwen models pass
+                # through unchanged (amount is already CSS pixels).
+                amount_css = self._denormalize_scroll_amount(
+                    action.amount, action.direction
+                )
                 command = MouseScrollCommand(
                     direction=ScrollDirection(action.direction),
-                    amount=action.amount,
+                    amount=amount_css,
                     conversation_id=self.conversation_id,
                 )
                 result_dict = self._execute_command_sync(command)
                 return self._build_observation_from_result(
                     result_dict,
-                    f"Scrolled {action.direction} by {action.amount}px",
+                    f"Scrolled {action.direction} by {action.amount}",
                 )
 
             if kind == "reset":
diff --git a/server/agent/tools/mouse_tool.py b/server/agent/tools/mouse_tool.py
index 555598f..1fdd6cb 100644
--- a/server/agent/tools/mouse_tool.py
+++ b/server/agent/tools/mouse_tool.py
@@ -139,7 +139,11 @@ def _check_coord(cls, v):
         default=300,
         ge=1,
         le=2000,
-        description="Scroll amount in CSS pixels for 'scroll'.",
+        description=(
+            "Scroll distance for 'scroll', in the same [0, 1000] space as "
+            "coordinates: 1000 is one full viewport in the chosen direction, "
+            "500 is half."
+        ),
     )
 
     steps: int = Field(

From f219e37a2b257f8273546a0e851c38fabfd679ef Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 22:06:08 +0800
Subject: [PATCH 08/12] fix(scroll): keep agent surface fully in normalized
 space
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The agent's scroll amount is normalized [0, 1000]; the server
denormalizes to CSS pixels before constructing MouseScrollCommand.
MouseScrollCommand.amount was capped at le=1000, so on a 1080-tall
viewport a full-viewport scroll (agent amount 1000) denormalized to
1080 and failed Pydantic validation — the resulting error message
("Input should be <= 1000 [input_value=1080]") leaked the CSS-pixel
number back into the agent's observation, breaking the
"agent only ever sees [0, 1000]" contract.

Raise the wire-type cap to 20000 (way beyond any real viewport) so the
denormalization path can never overflow into a validation error. The
field is documented as internal — agents talk to mouse.scroll which
stays in [0, 1000].

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 server/models/commands.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/server/models/commands.py b/server/models/commands.py
index d2bb911..0d69be0 100644
--- a/server/models/commands.py
+++ b/server/models/commands.py
@@ -137,13 +137,17 @@ class MouseDragCommand(BaseCommand):
 
 
 class MouseScrollCommand(BaseCommand):
-    """Scroll at current mouse position"""
+    """Scroll at current mouse position. Internal wire type — `amount` is
+    in CSS pixels here, but the agent never sees this field directly; the
+    agent's `mouse.scroll` tool takes a normalized [0, 1000] amount and
+    the server denormalizes before constructing this command. The upper
+    bound is intentionally permissive so denormalization on tall viewports
+    (4K, vertical monitors) never overflows back into a validation error
+    that would leak pixel numbers into agent observations."""
 
     type: Literal["mouse_scroll"] = "mouse_scroll"
     direction: ScrollDirection = Field(default=ScrollDirection.DOWN)
-    amount: int = Field(
-        default=100, ge=1, le=1000, description="Scroll amount in pixels"
-    )
+    amount: int = Field(default=100, ge=1, le=20000)
 
 
 class ResetMouseCommand(BaseCommand):

From a63032c200dfb221c5a0145fb4f18733975dfda0 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 22:27:09 +0800
Subject: [PATCH 09/12] fix(scroll): warn agent when scroll produced no
 viewport movement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scrolling at the bottom of a page, at an edge in the unscrollable
direction, or on a non-scrollable region dispatches the wheel event
successfully but doesn't move the viewport. The previous observation
said "Scrolled down by N" regardless — the agent saw an apparent
success and assumed progress had been made, often looping its scroll
without checking.

- performMouseScroll now snapshots scrollX/Y before the wheel event
  and re-reads after waitForScrollSettle returns. If the position
  didn't change, it probes documentElement scrollHeight/innerHeight
  to label the edge ("already at the top/bottom/left/right of the
  page") so the agent gets a specific hint, not a generic stall.
- Server scroll dispatch reads detail.moved; on false it surfaces a
  warning ("Scroll <direction> by N had no effect — <reason>. Try a
  different region (move the cursor over the inner scroll area first),
  the opposite direction, or a different navigation.")
- Missing `moved` field is treated as success so older builds during
  a rolling upgrade don't false-warn.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/commands/pixel-actions.ts | 157 ++++++++++++++++++++----
 server/agent/tools/browser_executor.py  |  22 +++-
 2 files changed, 150 insertions(+), 29 deletions(-)

diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index 03bc151..c459607 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -741,7 +741,14 @@ export async function performMouseScroll(
   conversationId: string,
   direction: 'up' | 'down' | 'left' | 'right',
   amount: number,
-): Promise<{ x: number; y: number; deltaX: number; deltaY: number }> {
+): Promise<{
+  x: number;
+  y: number;
+  deltaX: number;
+  deltaY: number;
+  moved: boolean;
+  reason?: string;
+}> {
   await attachWithDialogTracking(tabId, conversationId);
   const cdp = new CdpCommander(tabId);
   const cursor =
@@ -764,6 +771,14 @@ export async function performMouseScroll(
       deltaX = -safeAmount;
       break;
   }
+
+  // Capture pre-scroll position so we can tell whether the wheel event
+  // moved the viewport at all. Scrolling at the bottom of a page, on a
+  // non-scrollable container, or in a direction the page can't move
+  // dispatches successfully but produces no visible change — the agent
+  // would otherwise see "Scrolled down by N" and assume progress.
+  const before = await readScroll(cdp);
+
   await cdp.sendCommand(
     'Input.dispatchMouseEvent',
     {
@@ -789,8 +804,121 @@ export async function performMouseScroll(
   // every scroll regardless of size.
   await waitForScrollSettle(cdp);
 
+  const after = await readScroll(cdp);
+  const movedX =
+    before && after ? Math.abs(after[0] - before[0]) >= 1 : true;
+  const movedY =
+    before && after ? Math.abs(after[1] - before[1]) >= 1 : true;
+  const moved = movedX || movedY;
+
+  let reason: string | undefined;
+  if (!moved && before && after) {
+    if (direction === 'down' || direction === 'up') {
+      // Detect end-of-page so the agent gets a specific hint, not just
+      // a generic no-op message.
+      const atEdge = await detectVerticalEdge(cdp, direction);
+      if (atEdge === 'top') {
+        reason = 'already at the top of the page';
+      } else if (atEdge === 'bottom') {
+        reason = 'already at the bottom of the page';
+      } else {
+        reason = 'the wheel event did not move the viewport';
+      }
+    } else {
+      const atEdge = await detectHorizontalEdge(cdp, direction);
+      if (atEdge === 'left') {
+        reason = 'already at the left edge';
+      } else if (atEdge === 'right') {
+        reason = 'already at the right edge';
+      } else {
+        reason = 'the wheel event did not move the viewport';
+      }
+    }
+  }
+
   await refreshCursor(cdp, tabId, cursor.x, cursor.y);
-  return { x: cursor.x, y: cursor.y, deltaX, deltaY };
+  return { x: cursor.x, y: cursor.y, deltaX, deltaY, moved, reason };
+}
+
+async function readScroll(
+  cdp: CdpCommander,
+): Promise<[number, number] | null> {
+  try {
+    const resp = await cdp.sendCommand<{
+      result?: { value?: { x?: number; y?: number } };
+    }>(
+      'Runtime.evaluate',
+      {
+        expression: '({x: window.scrollX, y: window.scrollY})',
+        returnByValue: true,
+      },
+      4000,
+      0,
+    );
+    const v = resp?.result?.value;
+    if (!v || typeof v.x !== 'number' || typeof v.y !== 'number') return null;
+    return [v.x, v.y];
+  } catch {
+    return null;
+  }
+}
+
+async function detectVerticalEdge(
+  cdp: CdpCommander,
+  direction: 'up' | 'down',
+): Promise<'top' | 'bottom' | null> {
+  try {
+    const resp = await cdp.sendCommand<{
+      result?: {
+        value?: { y: number; max: number };
+      };
+    }>(
+      'Runtime.evaluate',
+      {
+        expression:
+          '({y: window.scrollY, max: Math.max(0, document.documentElement.scrollHeight - window.innerHeight)})',
+        returnByValue: true,
+      },
+      4000,
+      0,
+    );
+    const v = resp?.result?.value;
+    if (!v) return null;
+    if (direction === 'up' && v.y <= 1) return 'top';
+    if (direction === 'down' && v.y >= v.max - 1) return 'bottom';
+    return null;
+  } catch {
+    return null;
+  }
+}
+
+async function detectHorizontalEdge(
+  cdp: CdpCommander,
+  direction: 'left' | 'right',
+): Promise<'left' | 'right' | null> {
+  try {
+    const resp = await cdp.sendCommand<{
+      result?: {
+        value?: { x: number; max: number };
+      };
+    }>(
+      'Runtime.evaluate',
+      {
+        expression:
+          '({x: window.scrollX, max: Math.max(0, document.documentElement.scrollWidth - window.innerWidth)})',
+        returnByValue: true,
+      },
+      4000,
+      0,
+    );
+    const v = resp?.result?.value;
+    if (!v) return null;
+    if (direction === 'left' && v.x <= 1) return 'left';
+    if (direction === 'right' && v.x >= v.max - 1) return 'right';
+    return null;
+  } catch {
+    return null;
+  }
 }
 
 async function waitForScrollSettle(
@@ -799,33 +927,12 @@ async function waitForScrollSettle(
   maxWaitMs: number = 1500,
   stableSamples: number = 2,
 ): Promise<void> {
-  const readScroll = async (): Promise<[number, number] | null> => {
-    try {
-      const resp = await cdp.sendCommand<{
-        result?: { value?: { x?: number; y?: number } };
-      }>(
-        'Runtime.evaluate',
-        {
-          expression: '({x: window.scrollX, y: window.scrollY})',
-          returnByValue: true,
-        },
-        4000,
-        0,
-      );
-      const v = resp?.result?.value;
-      if (!v || typeof v.x !== 'number' || typeof v.y !== 'number') return null;
-      return [v.x, v.y];
-    } catch {
-      return null;
-    }
-  };
-
   const start = Date.now();
-  let last = await readScroll();
+  let last = await readScroll(cdp);
   let stable = 0;
   while (Date.now() - start < maxWaitMs) {
     await sleep(pollIntervalMs);
-    const cur = await readScroll();
+    const cur = await readScroll(cdp);
     if (cur && last && cur[0] === last[0] && cur[1] === last[1]) {
       stable += 1;
       if (stable >= stableSamples) return;
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index d3fef51..179e0b8 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1942,10 +1942,24 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                     conversation_id=self.conversation_id,
                 )
                 result_dict = self._execute_command_sync(command)
-                return self._build_observation_from_result(
-                    result_dict,
-                    f"Scrolled {action.direction} by {action.amount}",
-                )
+                detail = (result_dict or {}).get("data", {}) or {}
+                # Older extension builds don't report `moved`; treat missing
+                # as success so we don't false-warn during a rolling upgrade.
+                moved = detail.get("moved")
+                if moved is False:
+                    reason = (
+                        detail.get("reason")
+                        or "the wheel event did not move the viewport"
+                    )
+                    msg = (
+                        f"Scroll {action.direction} by {action.amount} had "
+                        f"no effect — {reason}. Try a different region (move "
+                        f"the cursor over the inner scroll area first), the "
+                        f"opposite direction, or a different navigation."
+                    )
+                else:
+                    msg = f"Scrolled {action.direction} by {action.amount}"
+                return self._build_observation_from_result(result_dict, msg)
 
             if kind == "reset":
                 command = ResetMouseCommand(conversation_id=self.conversation_id)

From 81ca85d0ecbed2bea705379a6104cfe4d0906ae3 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Tue, 12 May 2026 22:55:32 +0800
Subject: [PATCH 10/12] fix(click no-op): surface candidate overlay in
 screenshot + tighten detection

Close the design gap on empty-space clicks: the agent now both reads a
"no DOM change" warning with nearby-element coordinates AND sees the
orange-dashed candidate outlines in the screenshot it observes.

- _draw_no_op_overlay now returns the post-overlay screenshot URL.
  _overlay_screenshot_into_result swaps it into result_dict["data"][
  "screenshot"], so the agent's observation image shows the highlighted
  candidates instead of the pre-overlay snapshot.
- Tighten the MutationObserver used by performMouseClick to skip
  idempotent attribute mutations (oldValue === newValue). Without this,
  a doc-level click handler doing `el.style.display = 'none'` on every
  click would be reported as a DOM change on every click.
- Tighten the selection-change probe: count "selection changed" only
  when a range is selected or the caret landed in a real editable
  context (input/textarea/contenteditable). A click on body whitespace
  resolves the caret to some text node but is not an agent-meaningful
  state change.
- Surface mutations/active_changed/scroll_changed/selection_changed
  from performMouseClick alongside triggered_anything; useful for
  future debugging without re-instrumenting.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/commands/pixel-actions.ts | 56 +++++++++++++++----
 server/agent/tools/browser_executor.py  | 72 +++++++++++++++++++++----
 2 files changed, 108 insertions(+), 20 deletions(-)

diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index c459607..b834b46 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -147,6 +147,25 @@ const ARM_MUTATION_OBSERVER_SCRIPT = `
       const skipIds = new Set([cursorId, overlayId]);
       const obs = new MutationObserver((muts) => {
         for (const m of muts) {
+          // Idempotent style/attribute assignments (e.g. a doc-level click
+          // handler that does \`sortMenu.style.display = 'none'\` on every
+          // click when it's already none) still produce MutationRecords —
+          // the browser fires the record on every setter, regardless of
+          // whether the value actually changed. Skip those so the agent
+          // doesn't see a false "click did something" signal on pages
+          // with such handlers.
+          if (m.type === 'attributes') {
+            let oldVal = m.oldValue;
+            let newVal = null;
+            try {
+              newVal = m.target.getAttribute
+                ? m.target.getAttribute(m.attributeName)
+                : null;
+            } catch (_) { newVal = null; }
+            if ((oldVal == null ? '' : oldVal) === (newVal == null ? '' : newVal)) {
+              continue;
+            }
+          }
           let t = m.target;
           let skip = false;
           while (t && t.nodeType === 1) {
@@ -160,6 +179,7 @@ const ARM_MUTATION_OBSERVER_SCRIPT = `
         childList: true,
         subtree: true,
         attributes: true,
+        attributeOldValue: true,
         characterData: true,
       });
       w.__ob_click_obs__ = obs;
@@ -217,15 +237,27 @@ const READ_MUTATION_OBSERVER_SCRIPT = `
       let selectionChanged = false;
       try {
         const sel = w.getSelection && w.getSelection();
-        if (sel && beforeSel) {
-          selectionChanged =
-            sel.anchorNode !== beforeSel.anchorNode ||
-            sel.anchorOffset !== beforeSel.anchorOffset ||
-            sel.focusNode !== beforeSel.focusNode ||
-            sel.focusOffset !== beforeSel.focusOffset ||
-            sel.isCollapsed !== beforeSel.isCollapsed;
-        } else if (sel && !beforeSel) {
-          selectionChanged = !sel.isCollapsed || !!sel.anchorNode;
+        // The only "selection change" worth flagging is one that signals
+        // a real interaction: caret landed inside an editable field
+        // (<input>, <textarea>, or contenteditable), or the user dragged
+        // out a non-collapsed range. A click in empty space resolves the
+        // caret to a text node somewhere in the page, but that's a
+        // browser default — no agent-meaningful state changed.
+        const editableContext = (node) => {
+          if (!node) return false;
+          let el = node.nodeType === 1 ? node : node.parentElement;
+          while (el) {
+            if (el.isContentEditable) return true;
+            const tag = (el.tagName || '').toLowerCase();
+            if (tag === 'input' || tag === 'textarea') return true;
+            el = el.parentElement;
+          }
+          return false;
+        };
+        if (sel) {
+          const rangeSelected = !sel.isCollapsed && !!sel.anchorNode;
+          const caretInEditable = sel.isCollapsed && editableContext(sel.anchorNode);
+          selectionChanged = rangeSelected || caretInEditable;
         }
       } catch (_) {}
       delete w.__ob_click_obs__;
@@ -615,8 +647,10 @@ export async function performMouseClick(
   // passes without perceptibly slowing the action loop.
   await new Promise((r) => setTimeout(r, 250));
   let triggered: boolean | undefined;
+  let effectsOut: ClickEffectsProbe | null = null;
   if (observerArmed) {
     const effects = await readClickEffects(cdp);
+    effectsOut = effects;
     if (effects.mutations >= 0) {
       // The click "did something" if any of these signals fired:
       //   - DOM mutations (class/attribute/child/text changes)
@@ -641,6 +675,10 @@ export async function performMouseClick(
     button,
     warning: clamped.warning,
     triggered_anything: triggered,
+    mutations: effectsOut?.mutations,
+    active_changed: effectsOut?.active_changed,
+    scroll_changed: effectsOut?.scroll_changed,
+    selection_changed: effectsOut?.selection_changed,
   };
 }
 
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index 179e0b8..bbb1c1c 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1747,7 +1747,10 @@ def _commit_pending_pixel_action(self) -> OpenBrowserObservation:
                     # given as alternatives.
                     px, py = extra.get("px"), extra.get("py")
                     if isinstance(px, int) and isinstance(py, int):
-                        self._draw_no_op_overlay_from_serialized((px, py), serialized)
+                        overlay_url = self._draw_no_op_overlay_from_serialized(
+                            (px, py), serialized
+                        )
+                        self._overlay_screenshot_into_result(result_dict, overlay_url)
                 return self._build_observation_from_result(result_dict, message)
 
             if action_type == "mouse_drag_pixel":
@@ -1877,7 +1880,11 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
                     # human watching the browser sees what the agent is
                     # told to re-aim at. Cleared on the agent's next
                     # mouse action via clear_pixel_overlay.
-                    self._draw_no_op_overlay(cursor, gate)
+                    overlay_url = self._draw_no_op_overlay(cursor, gate)
+                    # Swap the agent's screenshot for the post-overlay one
+                    # so the candidate highlights show up in the image the
+                    # model reads — text coords plus a visual cue.
+                    self._overlay_screenshot_into_result(result_dict, overlay_url)
                 return self._build_observation_from_result(result_dict, message)
 
             if kind == "drag":
@@ -2108,17 +2115,22 @@ def _draw_no_op_overlay(
         self,
         cursor: Optional[tuple[int, int]],
         gate: Optional[Dict[str, Any]],
-    ) -> None:
+    ) -> Optional[str]:
         """Inject orange-dashed candidate boxes onto the live page for the
         no-op case. Mirrors the gated-preview overlay so a human watching
         the browser sees the same alternatives the agent is told to re-aim
         at. Best-effort: failures are logged and swallowed.
+
+        Returns the data URL of the post-overlay screenshot when the
+        extension produced one, so the caller can swap it into the agent's
+        observation; the agent then sees the highlighted candidates
+        visually, not just as a list of coordinates.
         """
         if cursor is None or not gate:
-            return
+            return None
         neighborhood = gate.get("neighborhood") or []
         if not neighborhood:
-            return
+            return None
         candidate_selectors: list = []
         candidate_bboxes: list = []
         for c in neighborhood:
@@ -2131,9 +2143,9 @@ def _draw_no_op_overlay(
             if bbox:
                 candidate_bboxes.append(bbox)
         if not candidate_selectors and not candidate_bboxes:
-            return
+            return None
         try:
-            self._render_pixel_preview(
+            return self._render_pixel_preview(
                 mode="pixel_miss",
                 x_css=cursor[0],
                 y_css=cursor[1],
@@ -2145,16 +2157,17 @@ def _draw_no_op_overlay(
             )
         except Exception as e:
             logger.debug("no-op overlay render failed: %s", e)
+            return None
 
     def _draw_no_op_overlay_from_serialized(
         self,
         cursor: Optional[tuple[int, int]],
         candidates: list,
-    ) -> None:
+    ) -> Optional[str]:
         """Same as `_draw_no_op_overlay` but takes pre-serialized candidates
         (the form stashed in `extra_data` for confirm-path commits)."""
         if cursor is None or not candidates:
-            return
+            return None
         candidate_selectors: list = []
         candidate_bboxes: list = []
         for c in candidates:
@@ -2167,9 +2180,9 @@ def _draw_no_op_overlay_from_serialized(
             if isinstance(bbox, dict):
                 candidate_bboxes.append(bbox)
         if not candidate_selectors and not candidate_bboxes:
-            return
+            return None
         try:
-            self._render_pixel_preview(
+            return self._render_pixel_preview(
                 mode="pixel_miss",
                 x_css=cursor[0],
                 y_css=cursor[1],
@@ -2181,6 +2194,7 @@ def _draw_no_op_overlay_from_serialized(
             )
         except Exception as e:
             logger.debug("no-op overlay render failed: %s", e)
+            return None
 
     def _format_no_op_warning(self, gate: Optional[Dict[str, Any]]) -> str:
         """Warning text for a click that committed but produced no DOM change.
@@ -2227,6 +2241,42 @@ def _format_no_op_warning_from_candidates(self, candidates: list) -> str:
             )
         return "\n".join(lines)
 
+    @staticmethod
+    def _overlay_screenshot_into_result(
+        result_dict: Optional[Dict[str, Any]],
+        overlay_url: Optional[str],
+    ) -> None:
+        """Swap, in place, the post-click screenshot in `result_dict["data"]`
+        for one that already has the no-op candidate overlay painted on it.
+
+        The original click screenshot was captured inside the extension
+        dispatcher before the server-side overlay decision ran, so the agent
+        would otherwise see no visual hint of the highlighted candidates —
+        only their text coordinates. `_render_pixel_preview` repaints the live
+        page with the orange-dashed boxes and returns a fresh capture; swapping
+        it in puts the visual cue into the image the model reads. No-op unless
+        `overlay_url` is a non-empty `data:` URL string (the render yields none
+        on zero candidates or extension error) and `data` is a dict.
+        """
+        if not overlay_url or not isinstance(overlay_url, str):
+            return
+        if not overlay_url.startswith("data:"):
+            return
+        if not isinstance(result_dict, dict):
+            return
+        data = result_dict.get("data")
+        if not isinstance(data, dict):
+            return
+        # `_build_observation_from_result` reads `data["screenshot"]` first,
+        # then `data["imageData"]` as a fallback. Replace whichever key
+        # was originally set so the new image wins.
+        if "screenshot" in data:
+            data["screenshot"] = overlay_url
+        elif "imageData" in data:
+            data["imageData"] = overlay_url
+        else:
+            data["screenshot"] = overlay_url
+
     @staticmethod
     def _click_was_a_no_op(result_dict: Optional[Dict[str, Any]]) -> bool:
         """True iff the extension's post-click probe saw zero DOM mutations.

From f550ebe658edaa56576a4c334d39b42ca1956f58 Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 13 May 2026 08:18:21 +0800
Subject: [PATCH 11/12] eval: refresh full-run report (flash models only)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Drop dashscope/qwen3.5-plus from evaluation_report.json — the plus
slot was contaminated by a DashScope hourly-quota exhaustion mid-run
(20 of 35 second-alias tests died with LLMRateLimitError). Keeping
only the two flash models gives a clean comparison: qwen3.5-flash
82.9% (29/35), qwen3.6-flash 91.4% (32/35), aggregate 87.1% (61/70).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 eval/evaluation_report.json | 1750 ++++++++++-------------------------
 1 file changed, 477 insertions(+), 1273 deletions(-)

diff --git a/eval/evaluation_report.json b/eval/evaluation_report.json
index b10faf2..edd035b 100644
--- a/eval/evaluation_report.json
+++ b/eval/evaluation_report.json
@@ -1,65 +1,39 @@
 {
   "evaluation": {
-    "timestamp": "2026-05-07 12:11:55",
-    "unix_timestamp": 1778127115.410769,
+    "timestamp": "2026-05-13 03:52:46",
+    "unix_timestamp": 1778615566.1621199,
     "summary": {
-      "total_tests": 140,
-      "passed_tests": 110,
-      "pass_rate": 78.57,
+      "total_tests": 70,
+      "passed_tests": 61,
+      "pass_rate": 87.14,
       "models_tested": [
-        "dashscope/qwen3.5-plus",
-        "dashscope/qwen3.6-plus",
         "dashscope/qwen3.5-flash",
         "dashscope/qwen3.6-flash"
       ]
     },
     "model_performance": {
-      "dashscope/qwen3.5-plus": {
-        "pass_rate": 88.57,
-        "task_score": 281.9,
-        "task_max_score": 304.8,
-        "efficiency_score": 21.1601,
-        "usage_score": 27.902,
-        "composite_score": 0.8118,
-        "avg_duration": 252.32,
-        "avg_cost": 0.295084,
-        "passed_count": 31,
-        "total_tests": 35
-      },
-      "dashscope/qwen3.6-plus": {
-        "pass_rate": 74.29,
-        "task_score": 262.4,
-        "task_max_score": 304.8,
-        "efficiency_score": 21.6591,
-        "usage_score": 15.4985,
-        "composite_score": 0.658,
-        "avg_duration": 237.55,
-        "avg_cost": 0.933219,
-        "passed_count": 26,
-        "total_tests": 35
-      },
       "dashscope/qwen3.5-flash": {
-        "pass_rate": 65.71,
-        "task_score": 248.7,
+        "pass_rate": 82.86,
+        "task_score": 265.6,
         "task_max_score": 304.8,
-        "efficiency_score": 21.3812,
-        "usage_score": 32.8972,
-        "composite_score": 0.7044,
-        "avg_duration": 257.51,
-        "avg_cost": 0.096369,
-        "passed_count": 23,
+        "efficiency_score": 24.1137,
+        "usage_score": 33.7357,
+        "composite_score": 0.8277,
+        "avg_duration": 197.1,
+        "avg_cost": 0.05228,
+        "passed_count": 29,
         "total_tests": 35
       },
       "dashscope/qwen3.6-flash": {
-        "pass_rate": 85.71,
-        "task_score": 274.3,
+        "pass_rate": 91.43,
+        "task_score": 289.8,
         "task_max_score": 304.8,
-        "efficiency_score": 23.4369,
-        "usage_score": 26.2986,
-        "composite_score": 0.7985,
-        "avg_duration": 207.89,
-        "avg_cost": 0.39813,
-        "passed_count": 30,
+        "efficiency_score": 26.4271,
+        "usage_score": 26.5251,
+        "composite_score": 0.8512,
+        "avg_duration": 153.69,
+        "avg_cost": 0.362941,
+        "passed_count": 32,
         "total_tests": 35
       }
     },
@@ -67,1715 +41,945 @@
       "bluebook_simple": {
         "name": "BlueBook Search And Like Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 6.0,
-            "task_max_score": 6.0,
-            "efficiency_score": 0.7519,
-            "usage_score": 0.8734,
-            "composite_score": 0.9251,
-            "total_score": 7.63,
-            "duration": 74.44,
-            "cost": 0.075952
-          },
-          "dashscope/qwen3.6-plus": {
+          "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.7811,
-            "usage_score": 0.4455,
-            "composite_score": 0.8453,
-            "total_score": 7.23,
-            "duration": 65.67,
-            "cost": 0.332728
-          },
-          "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 4.0,
-            "task_max_score": 6.0,
-            "efficiency_score": 0.7869,
-            "usage_score": 0.9621,
-            "composite_score": 0.3498,
-            "total_score": 5.75,
-            "duration": 63.92,
-            "cost": 0.02272
+            "efficiency_score": 0.8598,
+            "usage_score": 0.9688,
+            "composite_score": 0.9657,
+            "total_score": 7.83,
+            "duration": 42.07,
+            "cost": 0.018699
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.8312,
-            "usage_score": 0.8184,
-            "composite_score": 0.9299,
+            "efficiency_score": 0.8433,
+            "usage_score": 0.8094,
+            "composite_score": 0.9305,
             "total_score": 7.65,
-            "duration": 50.64,
-            "cost": 0.108959
+            "duration": 47.0,
+            "cost": 0.114372
           }
         }
       },
       "staybnb_search": {
-        "name": "StayBnB Search \u2014 Segmented Pill, Calendar & Guest Stepper",
+        "name": "StayBnB Search — Segmented Pill, Calendar & Guest Stepper",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 9.0,
-            "task_max_score": 10.5,
-            "efficiency_score": 0.5209,
-            "usage_score": 0.7535,
-            "composite_score": 0.8549,
-            "total_score": 10.27,
-            "duration": 258.7,
-            "cost": 0.369752
-          },
-          "dashscope/qwen3.6-plus": {
+          "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 10.5,
             "task_max_score": 10.5,
-            "efficiency_score": 0.5489,
-            "usage_score": 0.2868,
-            "composite_score": 0.7671,
-            "total_score": 11.34,
-            "duration": 243.57,
-            "cost": 1.069812
+            "efficiency_score": 0.5176,
+            "usage_score": 0.9193,
+            "composite_score": 0.8874,
+            "total_score": 11.94,
+            "duration": 260.49,
+            "cost": 0.121005
           },
-          "dashscope/qwen3.5-flash": {
+          "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 10.5,
-            "efficiency_score": 0.7282,
-            "usage_score": 0.9687,
-            "composite_score": 0.9394,
-            "total_score": 11.2,
-            "duration": 146.75,
-            "cost": 0.046948
-          },
-          "dashscope/qwen3.6-flash": {
-            "passed": false,
-            "task_score": 6.0,
-            "task_max_score": 10.5,
-            "efficiency_score": 0,
-            "usage_score": 0.9832,
-            "composite_score": 0.1966,
-            "total_score": 6.98,
-            "duration": 540.0,
-            "cost": 0.025178
+            "efficiency_score": 0.7912,
+            "usage_score": 0.8396,
+            "composite_score": 0.9262,
+            "total_score": 11.13,
+            "duration": 112.78,
+            "cost": 0.240549
           }
         }
       },
       "finviz_simple": {
         "name": "Finviz Simple Screener Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 3,
-            "task_max_score": 3,
-            "efficiency_score": 0.8373,
-            "usage_score": 0.9307,
-            "composite_score": 0.9536,
-            "total_score": 4.77,
-            "duration": 48.8,
-            "cost": 0.055405
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 3,
-            "task_max_score": 3,
-            "efficiency_score": 0.8388,
-            "usage_score": 0.7708,
-            "composite_score": 0.9219,
-            "total_score": 4.61,
-            "duration": 48.37,
-            "cost": 0.183396
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.8279,
-            "usage_score": 0.9777,
-            "composite_score": 0.9611,
-            "total_score": 4.81,
-            "duration": 51.64,
-            "cost": 0.017845
+            "efficiency_score": 0.9159,
+            "usage_score": 0.9852,
+            "composite_score": 0.9802,
+            "total_score": 4.9,
+            "duration": 25.23,
+            "cost": 0.011877
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.9203,
-            "usage_score": 0.9277,
-            "composite_score": 0.9696,
-            "total_score": 4.85,
-            "duration": 23.91,
-            "cost": 0.057861
+            "efficiency_score": 0.8975,
+            "usage_score": 0.9125,
+            "composite_score": 0.962,
+            "total_score": 4.81,
+            "duration": 30.75,
+            "cost": 0.070007
           }
         }
       },
       "cloudstack_interactive": {
         "name": "CloudStack DAS Interactive Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 9.0,
-            "task_max_score": 9.0,
-            "efficiency_score": 0.7663,
-            "usage_score": 0.8964,
-            "composite_score": 0.9325,
-            "total_score": 10.66,
-            "duration": 163.61,
-            "cost": 0.207102
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 9.0,
-            "task_max_score": 9.0,
-            "efficiency_score": 0.8109,
-            "usage_score": 0.6821,
-            "composite_score": 0.8986,
-            "total_score": 10.49,
-            "duration": 132.4,
-            "cost": 0.635748
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 9.0,
+            "task_score": 7.5,
             "task_max_score": 9.0,
-            "efficiency_score": 0.0434,
-            "usage_score": 0.813,
-            "composite_score": 0.7713,
-            "total_score": 9.86,
-            "duration": 669.63,
-            "cost": 0.373979
+            "efficiency_score": 0.8433,
+            "usage_score": 0.9798,
+            "composite_score": 0.9646,
+            "total_score": 9.32,
+            "duration": 109.69,
+            "cost": 0.040421
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.8552,
-            "usage_score": 0.8743,
-            "composite_score": 0.9459,
-            "total_score": 10.73,
-            "duration": 101.33,
-            "cost": 0.251393
+            "efficiency_score": 0.7222,
+            "usage_score": 0.7803,
+            "composite_score": 0.9005,
+            "total_score": 10.5,
+            "duration": 194.46,
+            "cost": 0.439468
           }
         }
       },
       "gbr": {
         "name": "GBR Search Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 2.5,
-            "task_max_score": 2.5,
-            "efficiency_score": 0.8623,
-            "usage_score": 0.9257,
-            "composite_score": 0.9576,
-            "total_score": 4.29,
-            "duration": 55.09,
-            "cost": 0.059466
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 2.5,
-            "task_max_score": 2.5,
-            "efficiency_score": 0.8726,
-            "usage_score": 0.6838,
-            "composite_score": 0.9113,
-            "total_score": 4.06,
-            "duration": 50.95,
-            "cost": 0.252924
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.9266,
-            "usage_score": 0.9874,
-            "composite_score": 0.9828,
-            "total_score": 4.41,
-            "duration": 29.35,
-            "cost": 0.010062
+            "efficiency_score": 0.9379,
+            "usage_score": 0.9834,
+            "composite_score": 0.9843,
+            "total_score": 4.42,
+            "duration": 24.85,
+            "cost": 0.013273
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 2.5,
             "task_max_score": 2.5,
-            "efficiency_score": 0.9113,
-            "usage_score": 0.903,
-            "composite_score": 0.9629,
+            "efficiency_score": 0.9105,
+            "usage_score": 0.9003,
+            "composite_score": 0.9622,
             "total_score": 4.31,
-            "duration": 35.47,
-            "cost": 0.077588
+            "duration": 35.78,
+            "cost": 0.079771
           }
         }
       },
       "gmail_exec_followup": {
         "name": "Gmail Finance Follow-up",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 8.0,
-            "task_max_score": 8.0,
-            "efficiency_score": 0.6936,
-            "usage_score": 0.8373,
-            "composite_score": 0.9062,
-            "total_score": 9.53,
-            "duration": 202.23,
-            "cost": 0.227799
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 5.5,
-            "task_max_score": 8.0,
-            "efficiency_score": 0.6971,
-            "usage_score": 0.3727,
-            "composite_score": 0.214,
-            "total_score": 6.57,
-            "duration": 199.91,
-            "cost": 0.878164
-          },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 8.0,
+            "passed": false,
+            "task_score": 4.5,
             "task_max_score": 8.0,
-            "efficiency_score": 0.73,
-            "usage_score": 0.9504,
-            "composite_score": 0.9361,
-            "total_score": 9.68,
-            "duration": 178.18,
-            "cost": 0.069446
+            "efficiency_score": 0.7598,
+            "usage_score": 0.9487,
+            "composite_score": 0.3417,
+            "total_score": 6.21,
+            "duration": 158.55,
+            "cost": 0.071834
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 8.0,
             "task_max_score": 8.0,
-            "efficiency_score": 0.7944,
-            "usage_score": 0.778,
-            "composite_score": 0.9145,
-            "total_score": 9.57,
-            "duration": 135.71,
-            "cost": 0.310808
+            "efficiency_score": 0.6672,
+            "usage_score": 0.6519,
+            "composite_score": 0.8638,
+            "total_score": 9.32,
+            "duration": 219.65,
+            "cost": 0.487299
           }
         }
       },
       "booking_compare_and_book": {
         "name": "Booking Compare And Book",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 8.0,
-            "task_max_score": 10.0,
-            "efficiency_score": 0.6089,
-            "usage_score": 0.7668,
-            "composite_score": 0.8751,
-            "total_score": 9.38,
-            "duration": 281.58,
-            "cost": 0.396458
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 1.0,
-            "task_max_score": 10.0,
-            "efficiency_score": 0,
-            "usage_score": 0.9643,
-            "composite_score": 0.1929,
-            "total_score": 1.96,
-            "duration": 720.0,
-            "cost": 0.060764
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 9.0,
+            "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.7594,
-            "usage_score": 0.9658,
-            "composite_score": 0.945,
-            "total_score": 10.73,
-            "duration": 173.23,
-            "cost": 0.058207
+            "efficiency_score": 0.7825,
+            "usage_score": 0.9606,
+            "composite_score": 0.9486,
+            "total_score": 11.74,
+            "duration": 156.62,
+            "cost": 0.066981
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 8.0,
+            "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.7839,
-            "usage_score": 0.761,
-            "composite_score": 0.909,
-            "total_score": 9.54,
-            "duration": 155.57,
-            "cost": 0.406261
+            "efficiency_score": 0.6872,
+            "usage_score": 0.6318,
+            "composite_score": 0.8638,
+            "total_score": 11.32,
+            "duration": 225.18,
+            "cost": 0.625883
           }
         }
       },
       "github_pr_review": {
         "name": "GitHub PR Review",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 5.6,
-            "task_max_score": 9.0,
-            "efficiency_score": 0,
-            "usage_score": 0.9855,
-            "composite_score": 0.1971,
-            "total_score": 6.59,
-            "duration": 720.0,
-            "cost": 0.024722
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 7.9,
-            "task_max_score": 9.0,
-            "efficiency_score": 0.6006,
-            "usage_score": 0.1583,
-            "composite_score": 0.7518,
-            "total_score": 8.66,
-            "duration": 287.55,
-            "cost": 1.430808
-          },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 5.6,
+            "passed": true,
+            "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.1552,
-            "usage_score": 0.8198,
-            "composite_score": 0.195,
-            "total_score": 6.58,
-            "duration": 608.24,
-            "cost": 0.306346
+            "efficiency_score": 0.7899,
+            "usage_score": 0.9672,
+            "composite_score": 0.9514,
+            "total_score": 10.76,
+            "duration": 151.24,
+            "cost": 0.055686
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.5592,
-            "usage_score": 0.4888,
-            "composite_score": 0.8096,
-            "total_score": 10.05,
-            "duration": 317.4,
-            "cost": 0.869086
+            "efficiency_score": 0.7376,
+            "usage_score": 0.7357,
+            "composite_score": 0.8946,
+            "total_score": 10.47,
+            "duration": 188.95,
+            "cost": 0.44937
           }
         }
       },
       "vidhub_comment": {
-        "name": "VidHub Comment \u2014 Description, Nested Replies & Volume Slider",
+        "name": "VidHub Comment — Description, Nested Replies & Volume Slider",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 13.0,
-            "task_max_score": 15.0,
-            "efficiency_score": 0.4963,
-            "usage_score": 0.7822,
-            "composite_score": 0.8557,
-            "total_score": 14.28,
-            "duration": 302.23,
-            "cost": 0.435573
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 11.0,
-            "task_max_score": 15.0,
-            "efficiency_score": 0.5717,
-            "usage_score": 0.3799,
-            "composite_score": 0.1903,
-            "total_score": 11.95,
-            "duration": 256.97,
-            "cost": 1.24024
-          },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 10.5,
+            "passed": true,
+            "task_score": 15.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.4921,
-            "usage_score": 0.9396,
-            "composite_score": 0.2863,
-            "total_score": 11.93,
-            "duration": 304.74,
-            "cost": 0.12087
+            "efficiency_score": 0.6764,
+            "usage_score": 0.9555,
+            "composite_score": 0.9264,
+            "total_score": 16.63,
+            "duration": 194.17,
+            "cost": 0.08896
           },
           "dashscope/qwen3.6-flash": {
-            "passed": false,
-            "task_score": 9.5,
+            "passed": true,
+            "task_score": 13.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.3789,
-            "usage_score": 0.407,
-            "composite_score": 0.1572,
-            "total_score": 10.29,
-            "duration": 372.69,
-            "cost": 1.186083
+            "efficiency_score": 0.6575,
+            "usage_score": 0.7483,
+            "composite_score": 0.8812,
+            "total_score": 14.41,
+            "duration": 205.49,
+            "cost": 0.503464
           }
         }
       },
       "techforum_reply": {
         "name": "TechForum Comment Reply Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 9.5,
-            "task_max_score": 9.5,
-            "efficiency_score": 0.6558,
-            "usage_score": 0.7328,
-            "composite_score": 0.8777,
-            "total_score": 10.89,
-            "duration": 172.11,
-            "cost": 0.267171
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 9.5,
-            "task_max_score": 9.5,
-            "efficiency_score": 0.6589,
-            "usage_score": 0.2235,
-            "composite_score": 0.7765,
-            "total_score": 10.38,
-            "duration": 170.53,
-            "cost": 0.776472
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.5344,
-            "usage_score": 0.9002,
-            "composite_score": 0.8869,
-            "total_score": 10.93,
-            "duration": 232.78,
-            "cost": 0.099769
+            "efficiency_score": 0.7047,
+            "usage_score": 0.9397,
+            "composite_score": 0.9289,
+            "total_score": 11.14,
+            "duration": 147.66,
+            "cost": 0.060331
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.4905,
-            "usage_score": 0.1818,
-            "composite_score": 0.7345,
-            "total_score": 10.17,
-            "duration": 254.76,
-            "cost": 0.818165
+            "efficiency_score": 0.8418,
+            "usage_score": 0.8146,
+            "composite_score": 0.9313,
+            "total_score": 11.16,
+            "duration": 79.12,
+            "cost": 0.185433
           }
         }
       },
       "replay_techforum_upvote": {
         "name": "Replay: TechForum search + upvote AI agent posts",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 3,
-            "task_max_score": 10,
-            "efficiency_score": 0.6762,
-            "usage_score": 0.7927,
-            "composite_score": 0.2938,
-            "total_score": 4.47,
-            "duration": 194.28,
-            "cost": 0.207348
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 3,
-            "task_max_score": 10,
-            "efficiency_score": 0.2159,
-            "usage_score": 0,
-            "composite_score": 0.0432,
-            "total_score": 3.22,
-            "duration": 470.46,
-            "cost": 2.344964
-          },
           "dashscope/qwen3.5-flash": {
             "passed": false,
             "task_score": 4,
             "task_max_score": 10,
-            "efficiency_score": 0,
-            "usage_score": 0.9954,
-            "composite_score": 0.1991,
-            "total_score": 5.0,
-            "duration": 600.0,
-            "cost": 0.004625
+            "efficiency_score": 0.8635,
+            "usage_score": 0.9712,
+            "composite_score": 0.3669,
+            "total_score": 5.83,
+            "duration": 81.9,
+            "cost": 0.028833
           },
           "dashscope/qwen3.6-flash": {
             "passed": false,
-            "task_score": 3,
+            "task_score": 4,
             "task_max_score": 10,
-            "efficiency_score": 0,
-            "usage_score": 0.9745,
-            "composite_score": 0.1949,
-            "total_score": 3.97,
-            "duration": 600.0,
-            "cost": 0.025519
+            "efficiency_score": 0.4107,
+            "usage_score": 0,
+            "composite_score": 0.0821,
+            "total_score": 4.41,
+            "duration": 353.6,
+            "cost": 1.165961
           }
         }
       },
       "replay_finviz_filter_simple": {
         "name": "Replay: Finviz multi-filter screening routine",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 12,
-            "task_max_score": 12,
-            "efficiency_score": 0.0086,
-            "usage_score": 0,
-            "composite_score": 0.6017,
-            "total_score": 12.01,
-            "duration": 594.83,
-            "cost": 1.17386
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 12,
-            "task_max_score": 12,
-            "efficiency_score": 0.2091,
-            "usage_score": 0,
-            "composite_score": 0.6418,
-            "total_score": 12.21,
-            "duration": 474.55,
-            "cost": 2.420046
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 11,
             "task_max_score": 12,
-            "efficiency_score": 0.7108,
-            "usage_score": 0.9274,
-            "composite_score": 0.9276,
-            "total_score": 12.64,
-            "duration": 173.53,
-            "cost": 0.072578
+            "efficiency_score": 0.7031,
+            "usage_score": 0.9059,
+            "composite_score": 0.9218,
+            "total_score": 12.61,
+            "duration": 178.14,
+            "cost": 0.094109
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 12,
             "task_max_score": 12,
-            "efficiency_score": 0.7041,
-            "usage_score": 0.4385,
-            "composite_score": 0.8285,
-            "total_score": 13.14,
-            "duration": 177.55,
-            "cost": 0.561479
+            "efficiency_score": 0.6907,
+            "usage_score": 0.4586,
+            "composite_score": 0.8299,
+            "total_score": 13.15,
+            "duration": 185.57,
+            "cost": 0.541425
           }
         }
       },
       "taskflow_full_workflow": {
-        "name": "TaskFlow Full Workflow \u2014 Create, Label, Drag & Filter",
+        "name": "TaskFlow Full Workflow — Create, Label, Drag & Filter",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 13.0,
-            "task_max_score": 13.0,
-            "efficiency_score": 0.4275,
-            "usage_score": 0.7683,
-            "composite_score": 0.8391,
-            "total_score": 14.2,
-            "duration": 343.52,
-            "cost": 0.463489
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 13.0,
-            "task_max_score": 13.0,
-            "efficiency_score": 0.6046,
-            "usage_score": 0.4005,
-            "composite_score": 0.801,
-            "total_score": 14.01,
-            "duration": 237.25,
-            "cost": 1.199092
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 11.5,
+            "task_score": 13.0,
             "task_max_score": 13.0,
-            "efficiency_score": 0.7038,
-            "usage_score": 0.9682,
-            "composite_score": 0.9344,
-            "total_score": 13.17,
-            "duration": 177.7,
-            "cost": 0.063611
+            "efficiency_score": 0.6588,
+            "usage_score": 0.9488,
+            "composite_score": 0.9215,
+            "total_score": 14.61,
+            "duration": 204.71,
+            "cost": 0.102386
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 11.5,
+            "task_score": 13.0,
             "task_max_score": 13.0,
-            "efficiency_score": 0.33,
-            "usage_score": 0.3386,
-            "composite_score": 0.7337,
-            "total_score": 12.17,
-            "duration": 402.0,
-            "cost": 1.322876
+            "efficiency_score": 0.7434,
+            "usage_score": 0.782,
+            "composite_score": 0.9051,
+            "total_score": 14.53,
+            "duration": 153.95,
+            "cost": 0.435922
           }
         }
       },
       "bluebook_complex": {
         "name": "BlueBook Multi-Image Reply Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 12.0,
-            "task_max_score": 12.0,
-            "efficiency_score": 0.7972,
-            "usage_score": 0.8933,
-            "composite_score": 0.9381,
-            "total_score": 13.69,
-            "duration": 101.4,
-            "cost": 0.128081
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 12.0,
-            "task_max_score": 12.0,
-            "efficiency_score": 0.7719,
-            "usage_score": 0.605,
-            "composite_score": 0.8754,
-            "total_score": 13.38,
-            "duration": 114.04,
-            "cost": 0.474032
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.7945,
-            "usage_score": 0.9724,
-            "composite_score": 0.9534,
-            "total_score": 13.77,
-            "duration": 102.74,
-            "cost": 0.033113
+            "efficiency_score": 0.8681,
+            "usage_score": 0.9776,
+            "composite_score": 0.9691,
+            "total_score": 13.85,
+            "duration": 65.96,
+            "cost": 0.026936
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.867,
-            "usage_score": 0.8884,
-            "composite_score": 0.9511,
-            "total_score": 13.76,
-            "duration": 66.49,
-            "cost": 0.133872
+            "efficiency_score": 0.8306,
+            "usage_score": 0.8463,
+            "composite_score": 0.9354,
+            "total_score": 13.68,
+            "duration": 84.7,
+            "cost": 0.184394
           }
         }
       },
       "drive_bulk_release_assets": {
         "name": "Drive Bulk Release Assets",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 10.0,
-            "task_max_score": 10.0,
-            "efficiency_score": 0.5189,
-            "usage_score": 0.7048,
-            "composite_score": 0.8447,
-            "total_score": 11.22,
-            "duration": 471.52,
-            "cost": 0.619899
-          },
-          "dashscope/qwen3.6-plus": {
+          "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.6548,
-            "usage_score": 0.3107,
-            "composite_score": 0.7931,
-            "total_score": 10.97,
-            "duration": 338.34,
-            "cost": 1.44752
-          },
-          "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 7.0,
-            "task_max_score": 10.0,
-            "efficiency_score": 0,
-            "usage_score": 0.9983,
-            "composite_score": 0.1997,
-            "total_score": 8.0,
-            "duration": 980.0,
-            "cost": 0.003504
+            "efficiency_score": 0.7852,
+            "usage_score": 0.9626,
+            "composite_score": 0.9496,
+            "total_score": 11.75,
+            "duration": 210.48,
+            "cost": 0.078479
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.3288,
-            "usage_score": 0,
-            "composite_score": 0.6658,
-            "total_score": 10.33,
-            "duration": 657.75,
-            "cost": 2.32915
+            "efficiency_score": 0.7639,
+            "usage_score": 0.7164,
+            "composite_score": 0.8961,
+            "total_score": 11.48,
+            "duration": 231.4,
+            "cost": 0.595505
           }
         }
       },
       "booking_family_trip_edgecase": {
         "name": "Booking Family Trip Edge Case",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 11.0,
-            "task_max_score": 11.0,
-            "efficiency_score": 0.5918,
-            "usage_score": 0.7165,
-            "composite_score": 0.8617,
-            "total_score": 12.31,
-            "duration": 424.55,
-            "cost": 0.680458
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 11.0,
-            "task_max_score": 11.0,
-            "efficiency_score": 0.6647,
-            "usage_score": 0.2763,
-            "composite_score": 0.7882,
-            "total_score": 11.94,
-            "duration": 348.73,
-            "cost": 1.736828
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 11.0,
             "task_max_score": 11.0,
-            "efficiency_score": 0.7423,
-            "usage_score": 0.9593,
-            "composite_score": 0.9403,
-            "total_score": 12.7,
-            "duration": 268.02,
-            "cost": 0.09777
+            "efficiency_score": 0.7878,
+            "usage_score": 0.957,
+            "composite_score": 0.949,
+            "total_score": 12.74,
+            "duration": 220.68,
+            "cost": 0.10327
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 11.0,
             "task_max_score": 11.0,
-            "efficiency_score": 0.7793,
-            "usage_score": 0.7133,
-            "composite_score": 0.8985,
-            "total_score": 12.49,
-            "duration": 229.48,
-            "cost": 0.688161
+            "efficiency_score": 0.718,
+            "usage_score": 0.6472,
+            "composite_score": 0.873,
+            "total_score": 12.37,
+            "duration": 293.29,
+            "cost": 0.846601
           }
         }
       },
       "techforum": {
         "name": "TechForum Upvote Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 2,
-            "task_max_score": 2,
-            "efficiency_score": 0.885,
-            "usage_score": 0.9293,
-            "composite_score": 0.9628,
-            "total_score": 3.81,
-            "duration": 34.51,
-            "cost": 0.035371
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 2,
-            "task_max_score": 2,
-            "efficiency_score": 0.8784,
-            "usage_score": 0.7339,
-            "composite_score": 0.9225,
-            "total_score": 3.61,
-            "duration": 36.47,
-            "cost": 0.13306
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.9231,
-            "usage_score": 0.9845,
-            "composite_score": 0.9815,
-            "total_score": 3.91,
-            "duration": 23.08,
-            "cost": 0.007746
+            "efficiency_score": 0.9371,
+            "usage_score": 0.984,
+            "composite_score": 0.9842,
+            "total_score": 3.92,
+            "duration": 18.86,
+            "cost": 0.008013
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 2,
             "task_max_score": 2,
-            "efficiency_score": 0.936,
-            "usage_score": 0.9139,
-            "composite_score": 0.97,
-            "total_score": 3.85,
-            "duration": 19.2,
-            "cost": 0.04303
+            "efficiency_score": 0.9269,
+            "usage_score": 0.9162,
+            "composite_score": 0.9686,
+            "total_score": 3.84,
+            "duration": 21.92,
+            "cost": 0.041878
           }
         }
       },
       "gmail_inbox_cleanup": {
         "name": "Gmail Inbox Cleanup",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 5.5,
-            "task_max_score": 7.0,
-            "efficiency_score": 0,
-            "usage_score": 0.9793,
-            "composite_score": 0.1959,
-            "total_score": 6.48,
-            "duration": 600.0,
-            "cost": 0.024847
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 4.0,
-            "task_max_score": 7.0,
-            "efficiency_score": 0.0752,
-            "usage_score": 0.9475,
-            "composite_score": 0.2045,
-            "total_score": 5.02,
-            "duration": 554.88,
-            "cost": 0.063044
-          },
           "dashscope/qwen3.5-flash": {
             "passed": false,
             "task_score": 2.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.6932,
-            "usage_score": 0.9416,
-            "composite_score": 0.327,
-            "total_score": 3.63,
-            "duration": 184.05,
-            "cost": 0.070105
+            "efficiency_score": 0,
+            "usage_score": 0.9955,
+            "composite_score": 0.1991,
+            "total_score": 3.0,
+            "duration": 600.0,
+            "cost": 0.005435
           },
           "dashscope/qwen3.6-flash": {
             "passed": false,
-            "task_score": 3.5,
+            "task_score": 5.5,
             "task_max_score": 7.0,
             "efficiency_score": 0,
-            "usage_score": 0.9798,
-            "composite_score": 0.196,
-            "total_score": 4.48,
+            "usage_score": 0.9794,
+            "composite_score": 0.1959,
+            "total_score": 6.48,
             "duration": 600.0,
-            "cost": 0.024216
+            "cost": 0.024718
           }
         }
       },
       "finviz_complex": {
         "name": "Finviz Multi-Filter Screener Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 5.0,
-            "task_max_score": 5.0,
-            "efficiency_score": 0.6988,
-            "usage_score": 0.8315,
-            "composite_score": 0.9061,
-            "total_score": 6.53,
-            "duration": 120.48,
-            "cost": 0.16851
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 5.0,
-            "task_max_score": 5.0,
-            "efficiency_score": 0.62,
-            "usage_score": 0.3761,
-            "composite_score": 0.7992,
-            "total_score": 6.0,
-            "duration": 152.0,
-            "cost": 0.623928
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.8279,
-            "usage_score": 0.9774,
-            "composite_score": 0.961,
-            "total_score": 6.81,
-            "duration": 68.85,
-            "cost": 0.022635
+            "efficiency_score": 0.6824,
+            "usage_score": 0.9485,
+            "composite_score": 0.9262,
+            "total_score": 6.63,
+            "duration": 127.05,
+            "cost": 0.05147
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 5.0,
             "task_max_score": 5.0,
-            "efficiency_score": 0.7878,
-            "usage_score": 0.8014,
-            "composite_score": 0.9178,
-            "total_score": 6.59,
-            "duration": 84.87,
-            "cost": 0.198635
+            "efficiency_score": 0.8404,
+            "usage_score": 0.8403,
+            "composite_score": 0.9361,
+            "total_score": 6.68,
+            "duration": 63.83,
+            "cost": 0.159746
           }
         }
       },
       "mapquest_nearby_pins": {
-        "name": "MapQuest Nearby Pins \u2014 Scroll Chips, Ambiguous Pins & Directions",
+        "name": "MapQuest Nearby Pins — Scroll Chips, Ambiguous Pins & Directions",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 10.5,
-            "task_max_score": 12.0,
-            "efficiency_score": 0.7348,
-            "usage_score": 0.8938,
-            "composite_score": 0.9257,
-            "total_score": 12.13,
-            "duration": 159.11,
-            "cost": 0.212449
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 9.0,
-            "task_max_score": 12.0,
-            "efficiency_score": 0.6785,
-            "usage_score": 0.5536,
-            "composite_score": 0.2464,
-            "total_score": 10.23,
-            "duration": 192.89,
-            "cost": 0.892756
-          },
           "dashscope/qwen3.5-flash": {
             "passed": false,
-            "task_score": 8.0,
+            "task_score": 2.5,
             "task_max_score": 12.0,
-            "efficiency_score": 0.4969,
-            "usage_score": 0.9316,
-            "composite_score": 0.2857,
-            "total_score": 9.43,
-            "duration": 301.89,
-            "cost": 0.136738
+            "efficiency_score": 0,
+            "usage_score": 0.9973,
+            "composite_score": 0.1995,
+            "total_score": 3.5,
+            "duration": 600.0,
+            "cost": 0.005353
           },
           "dashscope/qwen3.6-flash": {
-            "passed": true,
-            "task_score": 10.5,
+            "passed": false,
+            "task_score": 7.5,
             "task_max_score": 12.0,
-            "efficiency_score": 0.6685,
-            "usage_score": 0.5992,
-            "composite_score": 0.8535,
-            "total_score": 11.77,
-            "duration": 198.87,
-            "cost": 0.801651
+            "efficiency_score": 0.4053,
+            "usage_score": 0.4453,
+            "composite_score": 0.1701,
+            "total_score": 8.35,
+            "duration": 356.84,
+            "cost": 1.109344
           }
         }
       },
       "cloudstack": {
         "name": "CloudStack DAS Agent Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 3.5,
-            "task_max_score": 3.5,
-            "efficiency_score": 0.7859,
-            "usage_score": 0.905,
-            "composite_score": 0.9382,
-            "total_score": 5.19,
-            "duration": 107.07,
-            "cost": 0.113952
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 3.5,
-            "task_max_score": 3.5,
-            "efficiency_score": 0.7919,
-            "usage_score": 0.6121,
-            "composite_score": 0.8808,
-            "total_score": 4.9,
-            "duration": 104.07,
-            "cost": 0.465436
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.8477,
-            "usage_score": 0.9804,
-            "composite_score": 0.9656,
-            "total_score": 5.33,
-            "duration": 76.13,
-            "cost": 0.023503
+            "efficiency_score": 0.8532,
+            "usage_score": 0.9704,
+            "composite_score": 0.9647,
+            "total_score": 5.32,
+            "duration": 73.41,
+            "cost": 0.035572
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 3.5,
             "task_max_score": 3.5,
-            "efficiency_score": 0.8288,
-            "usage_score": 0.8455,
-            "composite_score": 0.9349,
-            "total_score": 5.17,
-            "duration": 85.59,
-            "cost": 0.185456
+            "efficiency_score": 0.9033,
+            "usage_score": 0.9058,
+            "composite_score": 0.9618,
+            "total_score": 5.31,
+            "duration": 48.36,
+            "cost": 0.113003
           }
         }
       },
       "staybnb_book": {
-        "name": "StayBnB Book \u2014 Filters, Gallery & Two-Step Booking",
+        "name": "StayBnB Book — Filters, Gallery & Two-Step Booking",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 15.0,
-            "task_max_score": 15.0,
-            "efficiency_score": 0.5076,
-            "usage_score": 0.7975,
-            "composite_score": 0.861,
-            "total_score": 16.31,
-            "duration": 295.43,
-            "cost": 0.405032
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 15.0,
-            "task_max_score": 15.0,
-            "efficiency_score": 0.5737,
-            "usage_score": 0.483,
-            "composite_score": 0.8113,
-            "total_score": 16.06,
-            "duration": 255.79,
-            "cost": 1.034026
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 15.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.6828,
-            "usage_score": 0.9678,
-            "composite_score": 0.9301,
-            "total_score": 16.65,
-            "duration": 190.35,
-            "cost": 0.064467
+            "efficiency_score": 0.7576,
+            "usage_score": 0.9705,
+            "composite_score": 0.9456,
+            "total_score": 16.73,
+            "duration": 145.42,
+            "cost": 0.058979
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 15.0,
             "task_max_score": 15.0,
-            "efficiency_score": 0.7949,
-            "usage_score": 0.8624,
-            "composite_score": 0.9314,
-            "total_score": 16.66,
-            "duration": 123.08,
-            "cost": 0.275256
+            "efficiency_score": 0.654,
+            "usage_score": 0.7325,
+            "composite_score": 0.8773,
+            "total_score": 16.39,
+            "duration": 207.61,
+            "cost": 0.535072
           }
         }
       },
       "mapquest_navigate": {
-        "name": "MapQuest Navigate \u2014 Autocomplete, Directions & Collapse",
+        "name": "MapQuest Navigate — Autocomplete, Directions & Collapse",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": false,
-            "task_score": 6.5,
-            "task_max_score": 9.5,
-            "efficiency_score": 0.6945,
-            "usage_score": 0.8785,
-            "composite_score": 0.3146,
-            "total_score": 8.07,
-            "duration": 164.98,
-            "cost": 0.182317
-          },
-          "dashscope/qwen3.6-plus": {
+          "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 8.0,
             "task_max_score": 9.5,
-            "efficiency_score": 0.6615,
-            "usage_score": 0.4869,
-            "composite_score": 0.8297,
-            "total_score": 9.15,
-            "duration": 182.79,
-            "cost": 0.769716
-          },
-          "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 5.0,
-            "task_max_score": 9.5,
-            "efficiency_score": 0.5191,
-            "usage_score": 0.9194,
-            "composite_score": 0.2877,
-            "total_score": 6.44,
-            "duration": 259.68,
-            "cost": 0.120943
+            "efficiency_score": 0.8254,
+            "usage_score": 0.9748,
+            "composite_score": 0.96,
+            "total_score": 9.8,
+            "duration": 94.3,
+            "cost": 0.03777
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 8.0,
+            "task_score": 9.5,
             "task_max_score": 9.5,
-            "efficiency_score": 0.8246,
-            "usage_score": 0.8489,
-            "composite_score": 0.9347,
-            "total_score": 9.67,
-            "duration": 94.72,
-            "cost": 0.226632
+            "efficiency_score": 0.8469,
+            "usage_score": 0.8554,
+            "composite_score": 0.9405,
+            "total_score": 11.2,
+            "duration": 82.69,
+            "cost": 0.216923
           }
         }
       },
       "booking_room_selection": {
         "name": "Booking Room Selection",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 9.0,
-            "task_max_score": 9.0,
-            "efficiency_score": 0.6427,
-            "usage_score": 0.7972,
-            "composite_score": 0.888,
-            "total_score": 10.44,
-            "duration": 235.82,
-            "cost": 0.30418
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 9.0,
-            "task_max_score": 9.0,
-            "efficiency_score": 0.6182,
-            "usage_score": 0.2733,
-            "composite_score": 0.7783,
-            "total_score": 9.89,
-            "duration": 251.97,
-            "cost": 1.09007
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
-            "task_score": 8.2,
+            "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.7991,
-            "usage_score": 0.9717,
-            "composite_score": 0.9542,
-            "total_score": 9.97,
-            "duration": 132.6,
-            "cost": 0.042404
+            "efficiency_score": 0.7605,
+            "usage_score": 0.9581,
+            "composite_score": 0.9437,
+            "total_score": 10.72,
+            "duration": 158.06,
+            "cost": 0.062918
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.74,
-            "usage_score": 0.7243,
-            "composite_score": 0.8929,
-            "total_score": 10.46,
-            "duration": 171.58,
-            "cost": 0.41355
+            "efficiency_score": 0.827,
+            "usage_score": 0.8141,
+            "composite_score": 0.9282,
+            "total_score": 10.64,
+            "duration": 114.2,
+            "cost": 0.278777
           }
         }
       },
       "vidhub_player": {
-        "name": "VidHub Player \u2014 Search, Auto-Hide Controls & Nested Settings",
+        "name": "VidHub Player — Search, Auto-Hide Controls & Nested Settings",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 12.0,
-            "task_max_score": 12.0,
-            "efficiency_score": 0.6836,
-            "usage_score": 0.8682,
-            "composite_score": 0.9104,
-            "total_score": 13.55,
-            "duration": 170.83,
-            "cost": 0.197702
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 12.0,
-            "task_max_score": 12.0,
-            "efficiency_score": 0.3887,
-            "usage_score": 0.0881,
-            "composite_score": 0.6953,
-            "total_score": 12.48,
-            "duration": 330.12,
-            "cost": 1.367892
-          },
           "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 8.0,
+            "passed": true,
+            "task_score": 10.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.5188,
-            "usage_score": 0.9429,
-            "composite_score": 0.2923,
-            "total_score": 9.46,
-            "duration": 259.83,
-            "cost": 0.08569
+            "efficiency_score": 0.7332,
+            "usage_score": 0.9576,
+            "composite_score": 0.9382,
+            "total_score": 11.69,
+            "duration": 144.08,
+            "cost": 0.063571
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
-            "task_score": 10.5,
+            "task_score": 12.0,
             "task_max_score": 12.0,
-            "efficiency_score": 0.6623,
-            "usage_score": 0.7383,
-            "composite_score": 0.8801,
-            "total_score": 11.9,
-            "duration": 182.37,
-            "cost": 0.392616
+            "efficiency_score": 0.7269,
+            "usage_score": 0.7465,
+            "composite_score": 0.8947,
+            "total_score": 13.47,
+            "duration": 147.47,
+            "cost": 0.38029
           }
         }
       },
       "amazon_variant_checkout": {
         "name": "Amazon Variant Checkout",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 10.2,
-            "task_max_score": 10.2,
-            "efficiency_score": 0.7419,
-            "usage_score": 0.857,
-            "composite_score": 0.9198,
-            "total_score": 11.8,
-            "duration": 180.67,
-            "cost": 0.228795
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 6.6,
-            "task_max_score": 10.2,
-            "efficiency_score": 0.7862,
-            "usage_score": 0.6,
-            "composite_score": 0.2772,
-            "total_score": 7.99,
-            "duration": 149.69,
-            "cost": 0.639994
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 10.2,
             "task_max_score": 10.2,
-            "efficiency_score": 0.5111,
-            "usage_score": 0.9174,
-            "composite_score": 0.8857,
-            "total_score": 11.63,
-            "duration": 342.26,
-            "cost": 0.132226
+            "efficiency_score": 0.8465,
+            "usage_score": 0.9728,
+            "composite_score": 0.9639,
+            "total_score": 12.02,
+            "duration": 107.42,
+            "cost": 0.043447
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 10.2,
             "task_max_score": 10.2,
-            "efficiency_score": 0.8572,
-            "usage_score": 0.8712,
-            "composite_score": 0.9457,
-            "total_score": 11.93,
-            "duration": 99.95,
-            "cost": 0.206018
+            "efficiency_score": 0.8596,
+            "usage_score": 0.8528,
+            "composite_score": 0.9425,
+            "total_score": 11.91,
+            "duration": 98.25,
+            "cost": 0.235514
           }
         }
       },
       "taskflow_drag_and_edit": {
-        "name": "TaskFlow Drag & Edit \u2014 DnD, Checklist & Hover Quick-Edit",
+        "name": "TaskFlow Drag & Edit — DnD, Checklist & Hover Quick-Edit",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 11.5,
-            "task_max_score": 11.5,
-            "efficiency_score": 0.6714,
-            "usage_score": 0.8696,
-            "composite_score": 0.9082,
-            "total_score": 13.04,
-            "duration": 177.42,
-            "cost": 0.195545
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 11.5,
-            "task_max_score": 11.5,
-            "efficiency_score": 0.6717,
-            "usage_score": 0.437,
-            "composite_score": 0.8217,
-            "total_score": 12.61,
-            "duration": 177.29,
-            "cost": 0.844436
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 11.5,
             "task_max_score": 11.5,
-            "efficiency_score": 0.572,
-            "usage_score": 0.9374,
-            "composite_score": 0.9019,
-            "total_score": 13.01,
-            "duration": 231.1,
-            "cost": 0.09388
+            "efficiency_score": 0.7671,
+            "usage_score": 0.9622,
+            "composite_score": 0.9459,
+            "total_score": 13.23,
+            "duration": 125.76,
+            "cost": 0.056685
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 11.5,
             "task_max_score": 11.5,
-            "efficiency_score": 0.8142,
-            "usage_score": 0.839,
-            "composite_score": 0.9306,
+            "efficiency_score": 0.8248,
+            "usage_score": 0.8226,
+            "composite_score": 0.9295,
             "total_score": 13.15,
-            "duration": 100.34,
-            "cost": 0.241561
+            "duration": 94.62,
+            "cost": 0.266026
           }
         }
       },
       "amazon_offer_disambiguation": {
         "name": "Amazon Offer Disambiguation",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
+          "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.7894,
-            "usage_score": 0.8841,
-            "composite_score": 0.9347,
-            "total_score": 11.67,
-            "duration": 214.77,
-            "cost": 0.266625
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 7.0,
-            "task_max_score": 10.0,
-            "efficiency_score": 0.8707,
-            "usage_score": 0.7579,
-            "composite_score": 0.3257,
-            "total_score": 8.63,
-            "duration": 131.92,
-            "cost": 0.556892
-          },
-          "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 6.2,
-            "task_max_score": 10.0,
-            "efficiency_score": 0.8995,
-            "usage_score": 0.9827,
-            "composite_score": 0.3764,
-            "total_score": 8.08,
-            "duration": 102.55,
-            "cost": 0.039861
+            "efficiency_score": 0.9011,
+            "usage_score": 0.9825,
+            "composite_score": 0.9767,
+            "total_score": 11.88,
+            "duration": 100.86,
+            "cost": 0.040161
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 10.0,
             "task_max_score": 10.0,
-            "efficiency_score": 0.903,
-            "usage_score": 0.9029,
-            "composite_score": 0.9612,
-            "total_score": 11.81,
-            "duration": 98.97,
-            "cost": 0.22334
+            "efficiency_score": 0.9014,
+            "usage_score": 0.894,
+            "composite_score": 0.9591,
+            "total_score": 11.8,
+            "duration": 100.56,
+            "cost": 0.243812
           }
         }
       },
       "drive_permission_cleanup": {
         "name": "Drive Permission Cleanup",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 6.6,
-            "task_max_score": 6.6,
-            "efficiency_score": 0.6937,
-            "usage_score": 0.8318,
-            "composite_score": 0.9051,
-            "total_score": 8.13,
-            "duration": 189.89,
-            "cost": 0.218695
-          },
-          "dashscope/qwen3.6-plus": {
+          "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 6.6,
             "task_max_score": 6.6,
-            "efficiency_score": 0.7288,
-            "usage_score": 0.4106,
-            "composite_score": 0.8279,
-            "total_score": 7.74,
-            "duration": 168.15,
-            "cost": 0.766188
-          },
-          "dashscope/qwen3.5-flash": {
-            "passed": false,
-            "task_score": 5.0,
-            "task_max_score": 6.6,
-            "efficiency_score": 0.8099,
-            "usage_score": 0.9673,
-            "composite_score": 0.3555,
-            "total_score": 6.78,
-            "duration": 117.83,
-            "cost": 0.042507
+            "efficiency_score": 0.7347,
+            "usage_score": 0.9479,
+            "composite_score": 0.9365,
+            "total_score": 8.28,
+            "duration": 164.5,
+            "cost": 0.067764
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 6.6,
             "task_max_score": 6.6,
-            "efficiency_score": 0.8357,
-            "usage_score": 0.8218,
-            "composite_score": 0.9315,
-            "total_score": 8.26,
-            "duration": 101.87,
-            "cost": 0.231676
+            "efficiency_score": 0.836,
+            "usage_score": 0.7984,
+            "composite_score": 0.9269,
+            "total_score": 8.23,
+            "duration": 101.69,
+            "cost": 0.262092
           }
         }
       },
       "dataflow": {
         "name": "DataFlow Visual Challenge Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 3,
-            "task_max_score": 3,
-            "efficiency_score": 0.7063,
-            "usage_score": 0.5322,
-            "composite_score": 0.8477,
-            "total_score": 4.24,
-            "duration": 176.2,
-            "cost": 0.233879
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 3,
-            "task_max_score": 3,
-            "efficiency_score": 0.7583,
-            "usage_score": 0,
-            "composite_score": 0.7517,
-            "total_score": 3.76,
-            "duration": 145.02,
-            "cost": 0.559202
-          },
           "dashscope/qwen3.5-flash": {
-            "passed": true,
-            "task_score": 3,
+            "passed": false,
+            "task_score": 2,
             "task_max_score": 3,
-            "efficiency_score": 0.8806,
-            "usage_score": 0.9508,
-            "composite_score": 0.9663,
-            "total_score": 4.83,
-            "duration": 71.64,
-            "cost": 0.024616
+            "efficiency_score": 0,
+            "usage_score": 0.9889,
+            "composite_score": 0.1978,
+            "total_score": 2.99,
+            "duration": 600.0,
+            "cost": 0.005542
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 3,
             "task_max_score": 3,
-            "efficiency_score": 0.8964,
-            "usage_score": 0.7447,
-            "composite_score": 0.9282,
-            "total_score": 4.64,
-            "duration": 62.19,
-            "cost": 0.127642
+            "efficiency_score": 0.9112,
+            "usage_score": 0.7561,
+            "composite_score": 0.9335,
+            "total_score": 4.67,
+            "duration": 53.29,
+            "cost": 0.12196
           }
         }
       },
       "gbr_detailed": {
         "name": "GBR Detailed Search & Read Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 6.0,
-            "task_max_score": 7.0,
-            "efficiency_score": 0.6271,
-            "usage_score": 0.7832,
-            "composite_score": 0.882,
-            "total_score": 7.41,
-            "duration": 223.76,
-            "cost": 0.325231
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 6.0,
-            "task_max_score": 7.0,
-            "efficiency_score": 0.7771,
-            "usage_score": 0.6216,
-            "composite_score": 0.8797,
-            "total_score": 7.4,
-            "duration": 133.76,
-            "cost": 0.567618
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 7.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.7356,
-            "usage_score": 0.9589,
-            "composite_score": 0.9389,
-            "total_score": 8.69,
-            "duration": 158.65,
-            "cost": 0.061596
+            "efficiency_score": 0.4522,
+            "usage_score": 0.9044,
+            "composite_score": 0.8713,
+            "total_score": 8.36,
+            "duration": 328.67,
+            "cost": 0.143411
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 7.0,
             "task_max_score": 7.0,
-            "efficiency_score": 0.8542,
-            "usage_score": 0.8649,
-            "composite_score": 0.9438,
-            "total_score": 8.72,
-            "duration": 87.5,
-            "cost": 0.202584
+            "efficiency_score": 0.8233,
+            "usage_score": 0.8031,
+            "composite_score": 0.9253,
+            "total_score": 8.63,
+            "duration": 106.04,
+            "cost": 0.295286
           }
         }
       },
       "gmail_vendor_escalation": {
         "name": "Gmail Vendor Escalation",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 9.0,
-            "task_max_score": 9.0,
-            "efficiency_score": 0.483,
-            "usage_score": 0.709,
-            "composite_score": 0.8384,
-            "total_score": 10.19,
-            "duration": 465.33,
-            "cost": 0.640286
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 9.0,
-            "task_max_score": 9.0,
-            "efficiency_score": 0.6995,
-            "usage_score": 0.3796,
-            "composite_score": 0.8158,
-            "total_score": 10.08,
-            "duration": 270.41,
-            "cost": 1.364906
-          },
           "dashscope/qwen3.5-flash": {
             "passed": false,
-            "task_score": 0,
+            "task_score": 0.8,
             "task_max_score": 9.0,
             "efficiency_score": 0,
-            "usage_score": 0.7188,
-            "composite_score": 0.1438,
-            "total_score": 0.72,
+            "usage_score": 0.9975,
+            "composite_score": 0.1995,
+            "total_score": 1.8,
             "duration": 900.0,
-            "cost": 0.618624
+            "cost": 0.00559
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 9.0,
             "task_max_score": 9.0,
-            "efficiency_score": 0.7449,
-            "usage_score": 0.7287,
-            "composite_score": 0.8947,
-            "total_score": 10.47,
-            "duration": 229.56,
-            "cost": 0.596822
+            "efficiency_score": 0.8367,
+            "usage_score": 0.828,
+            "composite_score": 0.9329,
+            "total_score": 10.66,
+            "duration": 146.94,
+            "cost": 0.37847
           }
         }
       },
       "northstar_add_bag": {
         "name": "Northstar Fit Guide + Add To Bag Test",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 6.0,
-            "task_max_score": 6.0,
-            "efficiency_score": 0.8189,
-            "usage_score": 0.9154,
-            "composite_score": 0.9469,
-            "total_score": 7.73,
-            "duration": 97.77,
-            "cost": 0.101474
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 6.0,
-            "task_max_score": 6.0,
-            "efficiency_score": 0.796,
-            "usage_score": 0.5724,
-            "composite_score": 0.8737,
-            "total_score": 7.37,
-            "duration": 110.18,
-            "cost": 0.513078
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.8965,
-            "usage_score": 0.9847,
-            "composite_score": 0.9762,
-            "total_score": 7.88,
-            "duration": 55.92,
-            "cost": 0.018326
+            "efficiency_score": 0.8544,
+            "usage_score": 0.9746,
+            "composite_score": 0.9658,
+            "total_score": 7.83,
+            "duration": 78.65,
+            "cost": 0.030482
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 6.0,
             "task_max_score": 6.0,
-            "efficiency_score": 0.9125,
-            "usage_score": 0.9143,
-            "composite_score": 0.9654,
-            "total_score": 7.83,
-            "duration": 47.24,
-            "cost": 0.102852
+            "efficiency_score": 0.8969,
+            "usage_score": 0.8797,
+            "composite_score": 0.9553,
+            "total_score": 7.78,
+            "duration": 55.68,
+            "cost": 0.144374
           }
         }
       },
       "drive_project_reorg": {
         "name": "Drive Project Reorg",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 7.5,
-            "task_max_score": 7.5,
-            "efficiency_score": 0,
-            "usage_score": 0.3885,
-            "composite_score": 0.6777,
-            "total_score": 7.89,
-            "duration": 660.0,
-            "cost": 0.917267
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": true,
-            "task_score": 7.5,
-            "task_max_score": 7.5,
-            "efficiency_score": 0,
-            "usage_score": 0,
-            "composite_score": 0.6,
-            "total_score": 7.5,
-            "duration": 660.0,
-            "cost": 3.334074
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 7.5,
             "task_max_score": 7.5,
-            "efficiency_score": 0.0856,
-            "usage_score": 0.8027,
-            "composite_score": 0.7777,
-            "total_score": 8.39,
-            "duration": 603.52,
-            "cost": 0.295921
+            "efficiency_score": 0.7869,
+            "usage_score": 0.9642,
+            "composite_score": 0.9502,
+            "total_score": 9.25,
+            "duration": 140.67,
+            "cost": 0.053702
           },
           "dashscope/qwen3.6-flash": {
-            "passed": false,
-            "task_score": 5.5,
+            "passed": true,
+            "task_score": 7.5,
             "task_max_score": 7.5,
-            "efficiency_score": 0,
-            "usage_score": 0.9834,
-            "composite_score": 0.1967,
-            "total_score": 6.48,
-            "duration": 660.0,
-            "cost": 0.024974
+            "efficiency_score": 0.6471,
+            "usage_score": 0.5584,
+            "composite_score": 0.8411,
+            "total_score": 8.71,
+            "duration": 232.89,
+            "cost": 0.662465
           }
         }
       },
       "github_issue_triage_deep": {
         "name": "GitHub Issue Triage Deep",
         "results_by_model": {
-          "dashscope/qwen3.5-plus": {
-            "passed": true,
-            "task_score": 8.5,
-            "task_max_score": 8.5,
-            "efficiency_score": 0.782,
-            "usage_score": 0.8912,
-            "composite_score": 0.9346,
-            "total_score": 10.17,
-            "duration": 148.26,
-            "cost": 0.163233
-          },
-          "dashscope/qwen3.6-plus": {
-            "passed": false,
-            "task_score": 4.8,
-            "task_max_score": 8.5,
-            "efficiency_score": 0.7832,
-            "usage_score": 0.6048,
-            "composite_score": 0.2776,
-            "total_score": 6.19,
-            "duration": 147.41,
-            "cost": 0.592826
-          },
           "dashscope/qwen3.5-flash": {
             "passed": true,
             "task_score": 8.5,
             "task_max_score": 8.5,
-            "efficiency_score": 0.7463,
-            "usage_score": 0.9535,
-            "composite_score": 0.94,
-            "total_score": 10.2,
-            "duration": 172.54,
-            "cost": 0.069738
+            "efficiency_score": 0.7672,
+            "usage_score": 0.9528,
+            "composite_score": 0.944,
+            "total_score": 10.22,
+            "duration": 158.29,
+            "cost": 0.070864
           },
           "dashscope/qwen3.6-flash": {
             "passed": true,
             "task_score": 8.5,
             "task_max_score": 8.5,
-            "efficiency_score": 0.8418,
-            "usage_score": 0.8376,
-            "composite_score": 0.9359,
-            "total_score": 10.18,
-            "duration": 107.59,
-            "cost": 0.243587
+            "efficiency_score": 0.8461,
+            "usage_score": 0.8215,
+            "composite_score": 0.9335,
+            "total_score": 10.17,
+            "duration": 104.67,
+            "cost": 0.26775
           }
         }
       }

From ca0e764af79b073b6bbfc0d4a43d2d3ab5aef57a Mon Sep 17 00:00:00 2001
From: Xiao Yang <yangxiao1098@gmail.com>
Date: Wed, 13 May 2026 08:21:16 +0800
Subject: [PATCH 12/12] style: apply pre-commit formatting (black + prettier)

Apply the formatting produced by the pre-commit hooks (black, prettier)
to code touched by the scroll/click/keyboard fixes earlier in this
stack. No behavior changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 extension/src/commands/pixel-actions.ts | 17 ++++++------
 server/agent/tools/browser_executor.py  | 36 ++++++++++++++++++-------
 server/agent/visualizer.py              |  8 ++----
 3 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/extension/src/commands/pixel-actions.ts b/extension/src/commands/pixel-actions.ts
index b834b46..e6edcef 100644
--- a/extension/src/commands/pixel-actions.ts
+++ b/extension/src/commands/pixel-actions.ts
@@ -843,10 +843,8 @@ export async function performMouseScroll(
   await waitForScrollSettle(cdp);
 
   const after = await readScroll(cdp);
-  const movedX =
-    before && after ? Math.abs(after[0] - before[0]) >= 1 : true;
-  const movedY =
-    before && after ? Math.abs(after[1] - before[1]) >= 1 : true;
+  const movedX = before && after ? Math.abs(after[0] - before[0]) >= 1 : true;
+  const movedY = before && after ? Math.abs(after[1] - before[1]) >= 1 : true;
   const moved = movedX || movedY;
 
   let reason: string | undefined;
@@ -878,9 +876,7 @@ export async function performMouseScroll(
   return { x: cursor.x, y: cursor.y, deltaX, deltaY, moved, reason };
 }
 
-async function readScroll(
-  cdp: CdpCommander,
-): Promise<[number, number] | null> {
+async function readScroll(cdp: CdpCommander): Promise<[number, number] | null> {
   try {
     const resp = await cdp.sendCommand<{
       result?: { value?: { x?: number; y?: number } };
@@ -1128,7 +1124,12 @@ export async function performKeyboardType(
     result?: {
       value?: { editable?: boolean; target?: string; reason?: string };
     };
-  }>('Runtime.evaluate', { expression: focusProbeExpr, returnByValue: true }, 8000, 0);
+  }>(
+    'Runtime.evaluate',
+    { expression: focusProbeExpr, returnByValue: true },
+    8000,
+    0,
+  );
   const focus = focusResp?.result?.value || {};
   if (!focus.editable) {
     return {
diff --git a/server/agent/tools/browser_executor.py b/server/agent/tools/browser_executor.py
index bbb1c1c..72b17ed 100644
--- a/server/agent/tools/browser_executor.py
+++ b/server/agent/tools/browser_executor.py
@@ -1119,9 +1119,7 @@ def _denormalize_xy(
         py = round(y * vh / 1000) if y is not None else None
         return (px, py)
 
-    def _denormalize_scroll_amount(
-        self, amount: int, direction: str
-    ) -> int:
+    def _denormalize_scroll_amount(self, amount: int, direction: str) -> int:
         """Convert a Qwen-normalized scroll amount to CSS pixels.
 
         Qwen emits scroll deltas in [0, 1000] (same space as click coords),
@@ -1144,9 +1142,7 @@ def _denormalize_scroll_amount(
             return amt
         return max(1, round(amt * axis / 1000))
 
-    def _format_action_xy(
-        self, x_css: Optional[int], y_css: Optional[int]
-    ) -> str:
+    def _format_action_xy(self, x_css: Optional[int], y_css: Optional[int]) -> str:
         """Render an (x, y) pair in the coordinate space the agent uses.
 
         For Qwen models the agent emits and reads coordinates in [0, 1000]
@@ -1190,7 +1186,9 @@ def _gate_pixel_target(
             cmd = AnalyzePixelTargetsCommand(
                 x=int(x_css),
                 y=int(y_css),
-                radius=int(radius) if radius is not None else self.PIXEL_GATE_RADIUS_CSS,
+                radius=(
+                    int(radius) if radius is not None else self.PIXEL_GATE_RADIUS_CSS
+                ),
                 candidate_limit=self.PIXEL_GATE_CANDIDATE_LIMIT,
                 conversation_id=self.conversation_id,
             )
@@ -1992,8 +1990,24 @@ def _execute_mouse_action(self, action: MouseAction) -> OpenBrowserObservation:
     # Excludes navigation/cursor combos (arrows, Home/End) since those have
     # different semantics on macOS that can't be remapped 1:1.
     _MAC_REMAP_KEYS = {
-        "a", "c", "v", "x", "z", "y", "s", "f", "g",
-        "p", "n", "t", "w", "r", "l", "+", "-", "0",
+        "a",
+        "c",
+        "v",
+        "x",
+        "z",
+        "y",
+        "s",
+        "f",
+        "g",
+        "p",
+        "n",
+        "t",
+        "w",
+        "r",
+        "l",
+        "+",
+        "-",
+        "0",
     }
 
     @classmethod
@@ -2049,7 +2063,9 @@ def _execute_keyboard_action(
                     obs = self._build_observation_from_result(result_dict, msg)
                     return obs.model_copy(update={"success": False})
                 target = detail.get("target")
-                target_note = f" into {target}" if isinstance(target, str) and target else ""
+                target_note = (
+                    f" into {target}" if isinstance(target, str) and target else ""
+                )
                 return self._build_observation_from_result(
                     result_dict, f"Typed text: {preview!r}{target_note}"
                 )
diff --git a/server/agent/visualizer.py b/server/agent/visualizer.py
index b452032..2010142 100644
--- a/server/agent/visualizer.py
+++ b/server/agent/visualizer.py
@@ -111,9 +111,7 @@ def on_event(self, event: Event) -> None:
                     sse_data["reasoning_content"] = rc
                 tbs = getattr(event, "thinking_blocks", None)
                 if tbs:
-                    sse_data["thinking_blocks"] = [
-                        tb.model_dump() for tb in tbs
-                    ]
+                    sse_data["thinking_blocks"] = [tb.model_dump() for tb in tbs]
                 if event.tool_name == PLEASE_HELP_ME_TOOL_NAME and event.action:
                     help_request = getattr(event.action, "message", None)
                     if isinstance(help_request, str) and help_request.strip():
@@ -159,9 +157,7 @@ def on_event(self, event: Event) -> None:
                     sse_data["reasoning_content"] = rc
                 tbs = getattr(event.llm_message, "thinking_blocks", None)
                 if tbs:
-                    sse_data["thinking_blocks"] = [
-                        tb.model_dump() for tb in tbs
-                    ]
+                    sse_data["thinking_blocks"] = [tb.model_dump() for tb in tbs]
                 # Also include activated_skills if present
                 if event.activated_skills:
                     sse_data["activated_skills"] = event.activated_skills