From 6e74140ad3e10bec962b2f64cedf39a7b26b6667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 31 May 2026 10:21:00 +0200 Subject: [PATCH 1/2] perf(ios): skip backend re-read in get text for non-editable elements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit readTextForNode dispatched a coordinate 'read' to the iOS XCUITest runner for every get text, where readTextAt() enumerates the full element tree (allElementsBoundByIndex) — ~20x slower than the snapshot already captured to resolve the node. That re-read only recovers fuller text for editable/expandable inputs (textField/searchField/textView/…); for all other element types the freshly-captured snapshot node text is authoritative. Return the snapshot node text directly for non-editable nodes with non-empty readable text, skipping the round-trip. Measured on iPhone 17 sim: get text on a labeled control drops from ~25s to ~0.3s steady-state. Editable inputs keep the backend re-read (live value can exceed the snapshot). 
--- .../__tests__/interaction-read.test.ts | 57 +++++++++++++++++++ src/daemon/handlers/interaction-read.ts | 11 ++++ src/utils/text-surface.ts | 7 ++- 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 src/daemon/handlers/__tests__/interaction-read.test.ts diff --git a/src/daemon/handlers/__tests__/interaction-read.test.ts b/src/daemon/handlers/__tests__/interaction-read.test.ts new file mode 100644 index 000000000..bf871dccc --- /dev/null +++ b/src/daemon/handlers/__tests__/interaction-read.test.ts @@ -0,0 +1,57 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; +import type { SnapshotNode } from '../../../utils/snapshot.ts'; + +vi.mock('../../../core/dispatch.ts', async (importOriginal) => { + const actual = await importOriginal(); + return { + ...actual, + dispatchCommand: vi.fn(async () => ({ text: 'backend-text' })), + }; +}); + +import { dispatchCommand } from '../../../core/dispatch.ts'; +import { readTextForNode } from '../interaction-read.ts'; + +const mockDispatch = vi.mocked(dispatchCommand); + +function node(overrides: Partial<SnapshotNode>): SnapshotNode { + return { + ref: 'e1', + index: 0, + rect: { x: 0, y: 0, width: 100, height: 40 }, + ...overrides, + } as SnapshotNode; +} + +const baseParams = { + device: { platform: 'ios' } as never, + flags: undefined, + contextFromFlags: () => ({}) as never, +}; + +describe('readTextForNode', () => { + beforeEach(() => mockDispatch.mockClear()); + + it('returns snapshot text without a backend read for non-editable nodes', async () => { + const text = await readTextForNode({ ...baseParams, node: node({ type: 'button', label: 'General' }) }); + expect(text).toBe('General'); + expect(mockDispatch).not.toHaveBeenCalled(); + }); + + it('still re-reads via the backend for editable text inputs (live value may exceed snapshot)', async () => { + const text = await readTextForNode({ ...baseParams, node: node({ type: 'textfield', value: 'snap' }) }); + expect(mockDispatch).toHaveBeenCalledOnce(); + 
expect(text).toBe('backend-text'); + }); + + it('re-reads when the snapshot node has no readable text', async () => { + await readTextForNode({ ...baseParams, node: node({ type: 'other' }) }); + expect(mockDispatch).toHaveBeenCalledOnce(); + }); + + it('returns snapshot text without a backend read when the node has no resolvable center', async () => { + const text = await readTextForNode({ ...baseParams, node: node({ type: 'button', label: 'General', rect: undefined }) }); + expect(text).toBe('General'); + expect(mockDispatch).not.toHaveBeenCalled(); + }); +}); diff --git a/src/daemon/handlers/interaction-read.ts b/src/daemon/handlers/interaction-read.ts index 409ec887d..8592c7ce8 100644 --- a/src/daemon/handlers/interaction-read.ts +++ b/src/daemon/handlers/interaction-read.ts @@ -3,6 +3,7 @@ import { emitDiagnostic } from '../../utils/diagnostics.ts'; import { extractNodeReadText } from '../snapshot-processing.ts'; import type { SessionState } from '../types.ts'; import type { SnapshotNode } from '../../utils/snapshot.ts'; +import { prefersValueForReadableText } from '../../utils/text-surface.ts'; import type { ContextFromFlags } from './interaction-common.ts'; import { resolveRectCenter } from './interaction-targeting.ts'; @@ -22,6 +23,16 @@ export async function readTextForNode(params: { return fallbackText; } + // The backend `read` re-resolves the element at a point, which on iOS XCUITest enumerates + // the full element tree (allElementsBoundByIndex) and is ~20x slower than the snapshot we + // already captured to resolve this node. That re-read only recovers fuller text for + // editable/expandable inputs (textField/searchField/textView/…), where the live value can + // exceed the snapshot. For every other element type the snapshot node text is authoritative, + // so return it directly and skip the expensive round-trip. + if (fallbackText && !prefersValueForReadableText(node.type ?? 
'')) { + return fallbackText; + } + try { const rawData = await dispatchCommand( device, diff --git a/src/utils/text-surface.ts b/src/utils/text-surface.ts index 6ea12d0e3..03208c784 100644 --- a/src/utils/text-surface.ts +++ b/src/utils/text-surface.ts @@ -85,7 +85,12 @@ export function normalizeType(type: string): string { return normalized; } -function prefersValueForReadableText(type: string): boolean { +/** + * Editable / expandable text-bearing element types whose live on-screen value can exceed + * the captured snapshot text. For these the readable text prefers `value`, and a backend + * (e.g. iOS XCUITest) re-read at the element can recover fuller text than the snapshot node. + */ +export function prefersValueForReadableText(type: string): boolean { const normalized = normalizeType(type); return ( normalized.includes('textfield') || From 6f7104e07e8e82f6b37274a36af1242d7e0d261e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Sun, 31 May 2026 11:22:23 +0200 Subject: [PATCH 2/2] fix(ios): gate get text snapshot-text fast-path to iOS only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review (P1): the fast-path skipped the backend read on Android/Linux/macOS too, but those backends read value-first (macOS helper: AXValue→title→description; Linux similar) whereas snapshot readable text is label-first for non-editables — so skipping their read changed get text output. Restrict the optimization to the iOS XCUITest path (the slow allElementsBoundByIndex re-read it targets). Adds a test asserting non-iOS platforms still dispatch the backend read. 
--- .../handlers/__tests__/interaction-read.test.ts | 13 +++++++++++++ src/daemon/handlers/interaction-read.ts | 16 +++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/daemon/handlers/__tests__/interaction-read.test.ts b/src/daemon/handlers/__tests__/interaction-read.test.ts index bf871dccc..07e8ce3f5 100644 --- a/src/daemon/handlers/__tests__/interaction-read.test.ts +++ b/src/daemon/handlers/__tests__/interaction-read.test.ts @@ -54,4 +54,17 @@ describe('readTextForNode', () => { expect(text).toBe('General'); expect(mockDispatch).not.toHaveBeenCalled(); }); + + it('does NOT skip the backend read on non-iOS platforms (value-first read semantics differ)', async () => { + for (const platform of ['android', 'macos', 'linux'] as const) { + mockDispatch.mockClear(); + const text = await readTextForNode({ + ...baseParams, + device: { platform } as never, + node: node({ type: 'button', label: 'General' }), + }); + expect(mockDispatch).toHaveBeenCalledOnce(); + expect(text).toBe('backend-text'); + } + }); }); diff --git a/src/daemon/handlers/interaction-read.ts b/src/daemon/handlers/interaction-read.ts index 8592c7ce8..cc416757f 100644 --- a/src/daemon/handlers/interaction-read.ts +++ b/src/daemon/handlers/interaction-read.ts @@ -23,13 +23,19 @@ export async function readTextForNode(params: { return fallbackText; } - // The backend `read` re-resolves the element at a point, which on iOS XCUITest enumerates - // the full element tree (allElementsBoundByIndex) and is ~20x slower than the snapshot we + // iOS only: the XCUITest backend `read` re-resolves the element at a point by enumerating + // the full element tree (allElementsBoundByIndex), which is ~20x slower than the snapshot we // already captured to resolve this node. That re-read only recovers fuller text for // editable/expandable inputs (textField/searchField/textView/…), where the live value can - // exceed the snapshot. 
For every other element type the snapshot node text is authoritative, - // so return it directly and skip the expensive round-trip. - if (fallbackText && !prefersValueForReadableText(node.type ?? '')) { + // exceed the snapshot; for every other element type the snapshot node text is authoritative. + // Restricted to iOS because other backends read differently — macOS helper and Linux reads + // are value-first (AXValue/title/description), unlike the label-first snapshot readable text, + // so skipping their backend read would change the returned text. + if ( + device.platform === 'ios' && + fallbackText && + !prefersValueForReadableText(node.type ?? '') + ) { return fallbackText; }