diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts
index ba16431c..d2a0b7d8 100644
--- a/examples/pdf-server/server.ts
+++ b/examples/pdf-server/server.ts
@@ -305,6 +305,50 @@ function waitForSaveData(
   });
 }
 
+/**
+ * In-flight get_viewer_state requests, keyed by request ID. Each value is the
+ * settle callback created by waitForViewerState; submit_viewer_state calls it
+ * with the viewer's JSON snapshot (a string) or with an Error.
+ */
+const pendingStateRequests = new Map<string, (v: string | Error) => void>();
+
+/**
+ * Wait for the viewer to report its current state (page, zoom, selection, …)
+ * as a JSON string. Follows the same timeout/abort pattern as waitForSaveData.
+ *
+ * Resolves with the snapshot once it is submitted for `requestId`. Rejects if
+ * `signal` is already aborted or aborts while waiting, or if nothing arrives
+ * within GET_PAGES_TIMEOUT_MS. Settling clears the timer, detaches the abort
+ * listener and removes the pending entry.
+ */
+function waitForViewerState(
+  requestId: string,
+  signal?: AbortSignal,
+): Promise<string> {
+  return new Promise<string>((resolve, reject) => {
+    // An already-aborted signal never dispatches "abort" again, so a listener
+    // added now would never run; fail fast instead of waiting for the timeout.
+    if (signal?.aborted) {
+      reject(new Error("interact request cancelled"));
+      return;
+    }
+    const settle = (v: string | Error) => {
+      clearTimeout(timer);
+      signal?.removeEventListener("abort", onAbort);
+      pendingStateRequests.delete(requestId);
+      if (v instanceof Error) reject(v);
+      else resolve(v);
+    };
+    const onAbort = () => settle(new Error("interact request cancelled"));
+    const timer = setTimeout(
+      () => settle(new Error("Timeout waiting for viewer state")),
+      GET_PAGES_TIMEOUT_MS,
+    );
+    signal?.addEventListener("abort", onAbort);
+    pendingStateRequests.set(requestId, settle);
+  });
+}
+
 interface QueueEntry {
   commands: PdfCommand[];
   /** Timestamp of the most recent enqueue or dequeue */
@@ -1350,7 +1394,8 @@ Returns a viewUUID in structuredContent. Pass it to \`interact\`:
 - add_annotations, update_annotations, remove_annotations, highlight_text
 - fill_form (fill PDF form fields)
 - navigate, search, find, search_navigate, zoom
-- get_text, get_screenshot (extract content)
+- get_text, get_screenshot, get_viewer_state (extract content / read selection & current page)
+- save_as (write annotated PDF to disk)
 
 Accepts local files (use list_pdfs), client MCP root directories, or any HTTPS URL.
 Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before display.`,
@@ -1650,6 +1695,7 @@ URL: ${normalized}`,
             "fill_form",
             "get_text",
             "get_screenshot",
+            "get_viewer_state",
             "save_as",
           ])
           .describe("Action to perform"),
@@ -2238,6 +2284,28 @@ URL: ${normalized}`,
             );
           }
         }
+        case "get_viewer_state": {
+          // Queue the request for the viewer, then block until it answers via
+          // the app-only submit_viewer_state tool (or the wait fails).
+          const requestId = randomUUID();
+          enqueueCommand(uuid, { type: "get_viewer_state", requestId });
+          let state: string;
+          try {
+            await ensureViewerIsPolling(uuid);
+            state = await waitForViewerState(requestId, signal);
+          } catch (err) {
+            return {
+              content: [
+                {
+                  type: "text",
+                  text: `Error: ${err instanceof Error ? err.message : String(err)}`,
+                },
+              ],
+              isError: true,
+            };
+          }
+          return { content: [{ type: "text", text: state }] };
+        }
         default:
           return {
             content: [{ type: "text", text: `Unknown action: ${action}` }],
@@ -2295,6 +2363,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
 **TEXT/SCREENSHOTS**:
 • get_text: extract text from pages. Optional \`page\` for single page, or \`intervals\` for ranges [{start?,end?}]. Max 20 pages.
 • get_screenshot: capture a single page as PNG image. Requires \`page\`.
+• get_viewer_state: snapshot of the live viewer — JSON {currentPage, pageCount, zoom, displayMode, selectedAnnotationIds, selection:{text,contextBefore,contextAfter,boundingRect}|null}. Use this to read what the user has selected or which page they're on.
 
 **FORMS** — fill_form: fill fields with \`fields\` array of {name, value}.
 
@@ -2320,6 +2389,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
             "fill_form",
             "get_text",
             "get_screenshot",
+            "get_viewer_state",
             "save_as",
           ])
           .optional()
@@ -2603,6 +2673,51 @@ Example — add a signature image and a stamp, then screenshot to verify:
     },
   );
 
+  // Tool: submit_viewer_state (app-only) - Viewer reports its live state
+  registerAppTool(
+    server,
+    "submit_viewer_state",
+    {
+      title: "Submit Viewer State",
+      description:
+        "Submit a viewer-state snapshot for a get_viewer_state request (used by viewer). The model should NOT call this tool directly.",
+      inputSchema: {
+        requestId: z
+          .string()
+          .describe("The request ID from the get_viewer_state command"),
+        state: z
+          .string()
+          .optional()
+          .describe("JSON-encoded viewer state snapshot"),
+        error: z
+          .string()
+          .optional()
+          .describe("Error message if the viewer failed to read state"),
+      },
+      _meta: { ui: { visibility: ["app"] } },
+    },
+    async ({ requestId, state, error }): Promise<CallToolResult> => {
+      const settle = pendingStateRequests.get(requestId);
+      // No entry: the wait already settled (timeout, abort, or an earlier
+      // submission) or this ID was never issued.
+      if (!settle) {
+        return {
+          content: [
+            { type: "text", text: `No pending request for ${requestId}` },
+          ],
+          isError: true,
+        };
+      }
+      // An explicit error, or a missing/empty state, rejects the waiter.
+      if (error || !state) {
+        settle(new Error(error || "Viewer returned no state"));
+      } else {
+        settle(state);
+      }
+      return { content: [{ type: "text", text: "Submitted" }] };
+    },
+  );
+
   // Tool: poll_pdf_commands (app-only) - Poll for pending commands
   registerAppTool(
     server,
diff --git a/examples/pdf-server/src/commands.ts b/examples/pdf-server/src/commands.ts
index c469ebd6..6d27edf9 100644
--- a/examples/pdf-server/src/commands.ts
+++ b/examples/pdf-server/src/commands.ts
@@ -66,4 +66,5 @@ export type PdfCommand =
       getScreenshots: boolean;
     }
   | { type: "save_as"; requestId: string }
+  | { type: "get_viewer_state"; requestId: string }
   | { type: "file_changed"; mtimeMs: number };
diff --git a/examples/pdf-server/src/mcp-app.ts b/examples/pdf-server/src/mcp-app.ts
index deec2e7f..fe30b14f 100644
--- a/examples/pdf-server/src/mcp-app.ts
+++ b/examples/pdf-server/src/mcp-app.ts
@@ -2428,6 +2428,88 @@ async function renderPageOffscreen(pageNum: number): Promise<string> {
   return dataUrl.split(",")[1];
 }
 
+/**
+ * Snapshot the live viewer for `interact({action:"get_viewer_state"})`.
+ *
+ * Selection is read from `window.getSelection()` at call time — no caching;
+ * if the user navigated away or nothing is selected, `selection` is `null`.
+ * `boundingRect` is in model coords (PDF points, origin top-left, y-down) so
+ * it can be fed straight back into `add_annotations`.
+ *
+ * @param requestId ID from the get_viewer_state command; sent back with the
+ *   snapshot through the `submit_viewer_state` server tool.
+ */
+async function handleGetViewerState(requestId: string): Promise<void> {
+  const CONTEXT_CHARS = 200;
+
+  let selection: {
+    text: string;
+    contextBefore: string;
+    contextAfter: string;
+    boundingRect: { x: number; y: number; width: number; height: number };
+  } | null = null;
+
+  const sel = window.getSelection();
+  const selectedText = sel?.toString().replace(/\s+/g, " ").trim();
+  if (sel && selectedText && sel.rangeCount > 0) {
+    // Only treat it as a PDF selection if it lives inside the text layer of
+    // the rendered page (not the toolbar, search box, etc.).
+    const range = sel.getRangeAt(0);
+    const anchor =
+      range.commonAncestorContainer.nodeType === Node.ELEMENT_NODE
+        ? (range.commonAncestorContainer as Element)
+        : range.commonAncestorContainer.parentElement;
+    if (anchor && textLayerEl.contains(anchor)) {
+      // Context: locate selection in the page's extracted text and slice
+      // ±CONTEXT_CHARS around it. Falls back to empty strings if fuzzy
+      // match fails (still return text + rect — they're the load-bearing
+      // bits).
+      const pageText = pageTextCache.get(currentPage) ?? "";
+      const loc = findSelectionInText(pageText, selectedText);
+      const contextBefore = loc
+        ? pageText.slice(Math.max(0, loc.start - CONTEXT_CHARS), loc.start)
+        : "";
+      const contextAfter = loc
+        ? pageText.slice(loc.end, loc.end + CONTEXT_CHARS)
+        : "";
+
+      // Single bounding box, page-relative model coords. getBoundingClientRect
+      // is viewport-relative; subtract the page-wrapper origin then divide by
+      // scale → PDF points (top-left origin, y-down — matches the coord
+      // system documented in the interact tool description).
+      const r = range.getBoundingClientRect();
+      const origin = pageWrapperEl.getBoundingClientRect();
+      const round = (n: number) => Math.round(n * 100) / 100;
+      selection = {
+        text: selectedText,
+        contextBefore,
+        contextAfter,
+        boundingRect: {
+          x: round((r.left - origin.left) / scale),
+          y: round((r.top - origin.top) / scale),
+          width: round(r.width / scale),
+          height: round(r.height / scale),
+        },
+      };
+    }
+  }
+
+  const state = {
+    currentPage,
+    pageCount: totalPages,
+    zoom: Math.round(scale * 100),
+    displayMode: currentDisplayMode,
+    selectedAnnotationIds: [...selectedAnnotationIds],
+    selection,
+  };
+
+  // Hand the snapshot to the server as a pretty-printed JSON string.
+  await app.callServerTool({
+    name: "submit_viewer_state",
+    arguments: { requestId, state: JSON.stringify(state, null, 2) },
+  });
+}
+
 async function handleGetPages(cmd: {
   requestId: string;
   intervals: Array<{ start?: number; end?: number }>;
@@ -4678,6 +4760,25 @@ async function processCommands(commands: PdfCommand[]): Promise<void> {
             .catch(() => {});
         }
         break;
+      case "get_viewer_state":
+        // Same await-before-next-poll discipline as get_pages/save_as.
+        try {
+          await handleGetViewerState(cmd.requestId);
+        } catch (err) {
+          log.error("get_viewer_state failed — submitting error:", err);
+          // Best effort: if this report fails too, the failure is dropped and
+          // the server-side wait is left to time out.
+          await app
+            .callServerTool({
+              name: "submit_viewer_state",
+              arguments: {
+                requestId: cmd.requestId,
+                error: err instanceof Error ? err.message : String(err),
+              },
+            })
+            .catch(() => {});
+        }
+        break;
       case "file_changed": {
         // Skip our own save_pdf echo: either save is still in flight, or the
         // event's mtime matches what save_pdf just returned.
diff --git a/playwright.config.ts b/playwright.config.ts
index 8485b000..6f6cb65b 100644
--- a/playwright.config.ts
+++ b/playwright.config.ts
@@ -30,6 +30,8 @@ export default defineConfig({
         ...devices["Desktop Chrome"],
         // Use default Chromium everywhere for consistent screenshot rendering
         // Run `npm run test:e2e:docker` locally for CI-identical results
+        // Opt-in override: set PW_CHANNEL to run a specific browser channel.
+        ...(process.env.PW_CHANNEL ? { channel: process.env.PW_CHANNEL } : {}),
       },
     },
   ],
diff --git a/tests/e2e/pdf-annotations.spec.ts b/tests/e2e/pdf-annotations.spec.ts
index 1b8077bb..af67bb79 100644
--- a/tests/e2e/pdf-annotations.spec.ts
+++ b/tests/e2e/pdf-annotations.spec.ts
@@ -314,3 +314,96 @@ test.describe("PDF Server - Annotations", () => {
     );
   });
 });
+
+/**
+ * Read the most recent interact result text from the basic-host UI.
+ * Waits for the result-panel count to reach `expectedCount` first —
+ * `callInteract` doesn't block, so `.last()` would otherwise race to the
+ * previous (display_pdf) panel.
+ */
+async function readLastToolResult(
+  page: Page,
+  expectedCount: number,
+): Promise<string> {
+  const panels = page.locator('text="📤 Tool Result"');
+  await expect(panels).toHaveCount(expectedCount, { timeout: 30000 });
+  await panels.last().click();
+  const pre = page.locator("pre").last();
+  await expect(pre).toBeVisible({ timeout: 5000 });
+  return (await pre.textContent()) ?? "";
+}
+
+/**
+ * Unwrap basic-host's `CallToolResult` JSON to the first text block.
+ * Throws if there is no text block or its text is missing or empty.
+ */
+function unwrapTextResult(raw: string): string {
+  const parsed = JSON.parse(raw) as {
+    content?: { type: string; text?: string }[];
+  };
+  const block = parsed.content?.find((c) => c.type === "text");
+  if (!block?.text) throw new Error(`No text block in: ${raw.slice(0, 200)}`);
+  return block.text;
+}
+
+test.describe("PDF Server - get_viewer_state", () => {
+  test("returns page/zoom/mode and selection:null when nothing is selected", async ({
+    page,
+  }) => {
+    await loadPdfServer(page);
+    await waitForPdfCanvas(page);
+
+    const viewUUID = await extractViewUUID(page);
+
+    await callInteract(page, { viewUUID, action: "get_viewer_state" });
+    const raw = await readLastToolResult(page, 2);
+    const state = JSON.parse(unwrapTextResult(raw));
+
+    expect(state.currentPage).toBe(1);
+    expect(state.pageCount).toBeGreaterThan(1);
+    expect(typeof state.zoom).toBe("number");
+    expect(state.displayMode).toBe("inline");
+    expect(state.selection).toBeNull();
+    expect(Array.isArray(state.selectedAnnotationIds)).toBe(true);
+  });
+
+  test("returns selected text and bounding rect when text-layer text is selected", async ({
+    page,
+  }) => {
+    await loadPdfServer(page);
+    await waitForPdfCanvas(page);
+
+    const viewUUID = await extractViewUUID(page);
+    const app = getAppFrame(page);
+
+    // Programmatically select the contents of the first text-layer span.
+    const selectedText = await app
+      .locator("#text-layer span")
+      .first()
+      .evaluate((span) => {
+        const range = span.ownerDocument.createRange();
+        range.selectNodeContents(span);
+        const sel = span.ownerDocument.defaultView!.getSelection()!;
+        sel.removeAllRanges();
+        sel.addRange(range);
+        return sel.toString().replace(/\s+/g, " ").trim();
+      });
+    expect(selectedText.length).toBeGreaterThan(0);
+
+    await callInteract(page, { viewUUID, action: "get_viewer_state" });
+    const raw = await readLastToolResult(page, 2);
+    const state = JSON.parse(unwrapTextResult(raw));
+
+    expect(state.currentPage).toBe(1);
+    expect(state.selection).not.toBeNull();
+    expect(state.selection.text).toContain(selectedText);
+    expect(state.selection.boundingRect).toEqual(
+      expect.objectContaining({
+        x: expect.any(Number),
+        y: expect.any(Number),
+        width: expect.any(Number),
+        height: expect.any(Number),
+      }),
+    );
+  });
+});