From 1836155a154191557fb14a96ad84f28d77c943c9 Mon Sep 17 00:00:00 2001 From: Olivier Chafik Date: Thu, 2 Apr 2026 03:23:40 -0400 Subject: [PATCH 1/2] feat(pdf-server): get_viewer_state interact action New interact action that returns a JSON snapshot of the live viewer: {currentPage, pageCount, zoom, displayMode, selectedAnnotationIds, selection: {text, contextBefore, contextAfter, boundingRect} | null}. The viewer already pushes selection passively via setModelContext as tags, but not all hosts surface model-context. This gives the model an explicit pull. selection.boundingRect is a single bbox in PDF points (top-left origin, y-down) so it can be fed straight back into add_annotations. selection is null when nothing is selected or the selection is outside the text-layer. Wiring: new PdfCommand variant -> processCommands case -> handleGetViewerState -> submit_viewer_state (new app-only tool, mirrors submit_save_data) -> waitForViewerState -> text content block. Also fills a gap in the display_pdf description: it listed interact actions but was missing save_as; added that and get_viewer_state. e2e: two tests covering selection:null and a programmatic text-layer selection. --- examples/pdf-server/server.ts | 95 ++++++++++++++++++++++++++++- examples/pdf-server/src/commands.ts | 1 + examples/pdf-server/src/mcp-app.ts | 95 +++++++++++++++++++++++++++++ tests/e2e/pdf-annotations.spec.ts | 68 +++++++++++++++++++++ 4 files changed, 258 insertions(+), 1 deletion(-) diff --git a/examples/pdf-server/server.ts b/examples/pdf-server/server.ts index ba16431c..d2a0b7d8 100644 --- a/examples/pdf-server/server.ts +++ b/examples/pdf-server/server.ts @@ -305,6 +305,33 @@ function waitForSaveData( }); } +const pendingStateRequests = new Map void>(); + +/** + * Wait for the viewer to report its current state (page, zoom, selection, …) + * as a JSON string. Same timeout/abort semantics as waitForSaveData. 
+ */ +function waitForViewerState( + requestId: string, + signal?: AbortSignal, +): Promise { + return new Promise((resolve, reject) => { + const settle = (v: string | Error) => { + clearTimeout(timer); + signal?.removeEventListener("abort", onAbort); + pendingStateRequests.delete(requestId); + v instanceof Error ? reject(v) : resolve(v); + }; + const onAbort = () => settle(new Error("interact request cancelled")); + const timer = setTimeout( + () => settle(new Error("Timeout waiting for viewer state")), + GET_PAGES_TIMEOUT_MS, + ); + signal?.addEventListener("abort", onAbort); + pendingStateRequests.set(requestId, settle); + }); +} + interface QueueEntry { commands: PdfCommand[]; /** Timestamp of the most recent enqueue or dequeue */ @@ -1350,7 +1377,8 @@ Returns a viewUUID in structuredContent. Pass it to \`interact\`: - add_annotations, update_annotations, remove_annotations, highlight_text - fill_form (fill PDF form fields) - navigate, search, find, search_navigate, zoom -- get_text, get_screenshot (extract content) +- get_text, get_screenshot, get_viewer_state (extract content / read selection & current page) +- save_as (write annotated PDF to disk) Accepts local files (use list_pdfs), client MCP root directories, or any HTTPS URL. Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before display.`, @@ -1650,6 +1678,7 @@ URL: ${normalized}`, "fill_form", "get_text", "get_screenshot", + "get_viewer_state", "save_as", ]) .describe("Action to perform"), @@ -2238,6 +2267,26 @@ URL: ${normalized}`, ); } } + case "get_viewer_state": { + const requestId = randomUUID(); + enqueueCommand(uuid, { type: "get_viewer_state", requestId }); + let state: string; + try { + await ensureViewerIsPolling(uuid); + state = await waitForViewerState(requestId, signal); + } catch (err) { + return { + content: [ + { + type: "text", + text: `Error: ${err instanceof Error ? 
err.message : String(err)}`, + }, + ], + isError: true, + }; + } + return { content: [{ type: "text", text: state }] }; + } default: return { content: [{ type: "text", text: `Unknown action: ${action}` }], @@ -2295,6 +2344,7 @@ Example — add a signature image and a stamp, then screenshot to verify: **TEXT/SCREENSHOTS**: • get_text: extract text from pages. Optional \`page\` for single page, or \`intervals\` for ranges [{start?,end?}]. Max 20 pages. • get_screenshot: capture a single page as PNG image. Requires \`page\`. +• get_viewer_state: snapshot of the live viewer — JSON {currentPage, pageCount, zoom, displayMode, selectedAnnotationIds, selection:{text,contextBefore,contextAfter,boundingRect}|null}. Use this to read what the user has selected or which page they're on. **FORMS** — fill_form: fill fields with \`fields\` array of {name, value}. @@ -2320,6 +2370,7 @@ Example — add a signature image and a stamp, then screenshot to verify: "fill_form", "get_text", "get_screenshot", + "get_viewer_state", "save_as", ]) .optional() @@ -2603,6 +2654,48 @@ Example — add a signature image and a stamp, then screenshot to verify: }, ); + // Tool: submit_viewer_state (app-only) - Viewer reports its live state + registerAppTool( + server, + "submit_viewer_state", + { + title: "Submit Viewer State", + description: + "Submit a viewer-state snapshot for a get_viewer_state request (used by viewer). 
The model should NOT call this tool directly.", + inputSchema: { + requestId: z + .string() + .describe("The request ID from the get_viewer_state command"), + state: z + .string() + .optional() + .describe("JSON-encoded viewer state snapshot"), + error: z + .string() + .optional() + .describe("Error message if the viewer failed to read state"), + }, + _meta: { ui: { visibility: ["app"] } }, + }, + async ({ requestId, state, error }): Promise => { + const settle = pendingStateRequests.get(requestId); + if (!settle) { + return { + content: [ + { type: "text", text: `No pending request for ${requestId}` }, + ], + isError: true, + }; + } + if (error || !state) { + settle(new Error(error || "Viewer returned no state")); + } else { + settle(state); + } + return { content: [{ type: "text", text: "Submitted" }] }; + }, + ); + // Tool: poll_pdf_commands (app-only) - Poll for pending commands registerAppTool( server, diff --git a/examples/pdf-server/src/commands.ts b/examples/pdf-server/src/commands.ts index c469ebd6..6d27edf9 100644 --- a/examples/pdf-server/src/commands.ts +++ b/examples/pdf-server/src/commands.ts @@ -66,4 +66,5 @@ export type PdfCommand = getScreenshots: boolean; } | { type: "save_as"; requestId: string } + | { type: "get_viewer_state"; requestId: string } | { type: "file_changed"; mtimeMs: number }; diff --git a/examples/pdf-server/src/mcp-app.ts b/examples/pdf-server/src/mcp-app.ts index deec2e7f..fe30b14f 100644 --- a/examples/pdf-server/src/mcp-app.ts +++ b/examples/pdf-server/src/mcp-app.ts @@ -2428,6 +2428,84 @@ async function renderPageOffscreen(pageNum: number): Promise { return dataUrl.split(",")[1]; } +/** + * Snapshot the live viewer for `interact({action:"get_viewer_state"})`. + * + * Selection is read from `window.getSelection()` at call time — no caching; + * if the user navigated away or nothing is selected, `selection` is `null`. 
+ * `boundingRect` is in model coords (PDF points, origin top-left, y-down) so + * it can be fed straight back into `add_annotations`. + */ +async function handleGetViewerState(requestId: string): Promise { + const CONTEXT_CHARS = 200; + + let selection: { + text: string; + contextBefore: string; + contextAfter: string; + boundingRect: { x: number; y: number; width: number; height: number }; + } | null = null; + + const sel = window.getSelection(); + const selectedText = sel?.toString().replace(/\s+/g, " ").trim(); + if (sel && selectedText && sel.rangeCount > 0) { + // Only treat it as a PDF selection if it lives inside the text layer of + // the rendered page (not the toolbar, search box, etc.). + const range = sel.getRangeAt(0); + const anchor = + range.commonAncestorContainer.nodeType === Node.ELEMENT_NODE + ? (range.commonAncestorContainer as Element) + : range.commonAncestorContainer.parentElement; + if (anchor && textLayerEl.contains(anchor)) { + // Context: locate selection in the page's extracted text and slice + // ±CONTEXT_CHARS around it. Falls back to empty strings if fuzzy + // match fails (still return text + rect — they're the load-bearing + // bits). + const pageText = pageTextCache.get(currentPage) ?? ""; + const loc = findSelectionInText(pageText, selectedText); + const contextBefore = loc + ? pageText.slice(Math.max(0, loc.start - CONTEXT_CHARS), loc.start) + : ""; + const contextAfter = loc + ? pageText.slice(loc.end, loc.end + CONTEXT_CHARS) + : ""; + + // Single bounding box, page-relative model coords. getBoundingClientRect + // is viewport-relative; subtract the page-wrapper origin then divide by + // scale → PDF points (top-left origin, y-down — matches the coord + // system documented in the interact tool description). 
+ const r = range.getBoundingClientRect(); + const origin = pageWrapperEl.getBoundingClientRect(); + const round = (n: number) => Math.round(n * 100) / 100; + selection = { + text: selectedText, + contextBefore, + contextAfter, + boundingRect: { + x: round((r.left - origin.left) / scale), + y: round((r.top - origin.top) / scale), + width: round(r.width / scale), + height: round(r.height / scale), + }, + }; + } + } + + const state = { + currentPage, + pageCount: totalPages, + zoom: Math.round(scale * 100), + displayMode: currentDisplayMode, + selectedAnnotationIds: [...selectedAnnotationIds], + selection, + }; + + await app.callServerTool({ + name: "submit_viewer_state", + arguments: { requestId, state: JSON.stringify(state, null, 2) }, + }); +} + async function handleGetPages(cmd: { requestId: string; intervals: Array<{ start?: number; end?: number }>; @@ -4678,6 +4756,23 @@ async function processCommands(commands: PdfCommand[]): Promise { .catch(() => {}); } break; + case "get_viewer_state": + // Same await-before-next-poll discipline as get_pages/save_as. + try { + await handleGetViewerState(cmd.requestId); + } catch (err) { + log.error("get_viewer_state failed — submitting error:", err); + await app + .callServerTool({ + name: "submit_viewer_state", + arguments: { + requestId: cmd.requestId, + error: err instanceof Error ? err.message : String(err), + }, + }) + .catch(() => {}); + } + break; case "file_changed": { // Skip our own save_pdf echo: either save is still in flight, or the // event's mtime matches what save_pdf just returned. diff --git a/tests/e2e/pdf-annotations.spec.ts b/tests/e2e/pdf-annotations.spec.ts index 1b8077bb..d080fa50 100644 --- a/tests/e2e/pdf-annotations.spec.ts +++ b/tests/e2e/pdf-annotations.spec.ts @@ -314,3 +314,71 @@ test.describe("PDF Server - Annotations", () => { ); }); }); + +/** + * Read the most recent interact result text from the basic-host UI. + * Expands the latest "📤 Tool Result" panel and returns the
 text.
+ */
+async function readLastToolResult(page: Page): Promise {
+  const panel = page.locator('text="📤 Tool Result"').last();
+  await expect(panel).toBeVisible({ timeout: 30000 });
+  await panel.click();
+  const pre = page.locator("pre").last();
+  await expect(pre).toBeVisible({ timeout: 5000 });
+  return (await pre.textContent()) ?? "";
+}
+
+test.describe("PDF Server - get_viewer_state", () => {
+  test("returns page/zoom/mode and selection:null when nothing is selected", async ({
+    page,
+  }) => {
+    await loadPdfServer(page);
+    await waitForPdfCanvas(page);
+
+    const viewUUID = await extractViewUUID(page);
+
+    await callInteract(page, { viewUUID, action: "get_viewer_state" });
+    const result = await readLastToolResult(page);
+
+    // Basic-host renders text content blocks as JSON-ish; the viewer's reply
+    // is a JSON object — assert key fields without being brittle on
+    // surrounding chrome.
+    expect(result).toMatch(/"currentPage"\s*:\s*1/);
+    expect(result).toMatch(/"pageCount"\s*:\s*\d+/);
+    expect(result).toMatch(/"zoom"\s*:\s*\d+/);
+    expect(result).toMatch(/"displayMode"\s*:\s*"inline"/);
+    expect(result).toMatch(/"selection"\s*:\s*null/);
+  });
+
+  test("returns selected text and bounding rect when text-layer text is selected", async ({
+    page,
+  }) => {
+    await loadPdfServer(page);
+    await waitForPdfCanvas(page);
+
+    const viewUUID = await extractViewUUID(page);
+    const app = getAppFrame(page);
+
+    // Programmatically select the contents of the first text-layer span.
+    const selectedText = await app
+      .locator("#text-layer span")
+      .first()
+      .evaluate((span) => {
+        const range = span.ownerDocument.createRange();
+        range.selectNodeContents(span);
+        const sel = span.ownerDocument.defaultView!.getSelection()!;
+        sel.removeAllRanges();
+        sel.addRange(range);
+        return sel.toString().replace(/\s+/g, " ").trim();
+      });
+    expect(selectedText.length).toBeGreaterThan(0);
+
+    await callInteract(page, { viewUUID, action: "get_viewer_state" });
+    const result = await readLastToolResult(page);
+
+    expect(result).toMatch(/"selection"\s*:\s*\{/);
+    expect(result).toContain(JSON.stringify(selectedText).slice(1, -1));
+    expect(result).toMatch(/"boundingRect"\s*:\s*\{/);
+    expect(result).toMatch(/"currentPage"\s*:\s*1/);
+  });
+});

From 0717483d9bf08a8642dc95e5ae033cb3662237be Mon Sep 17 00:00:00 2001
From: Olivier Chafik 
Date: Thu, 2 Apr 2026 03:46:40 -0400
Subject: [PATCH 2/2] test(pdf-server): fix get_viewer_state e2e race +
 assertions

readLastToolResult clicked .last() before the interact result panel
existed (callInteract doesn't block), so it expanded the display_pdf
panel instead. Wait for the expected panel count first.

Also: basic-host renders the full CallToolResult JSON, with the state
double-escaped inside content[0].text. Parse instead of regex-matching.

playwright.config.ts: honor PW_CHANNEL env to use system Chrome locally
when the bundled chromium_headless_shell is broken.
---
 playwright.config.ts              |  1 +
 tests/e2e/pdf-annotations.spec.ts | 64 +++++++++++++++++++++----------
 2 files changed, 44 insertions(+), 21 deletions(-)

diff --git a/playwright.config.ts b/playwright.config.ts
index 8485b000..6f6cb65b 100644
--- a/playwright.config.ts
+++ b/playwright.config.ts
@@ -30,6 +30,7 @@ export default defineConfig({
         ...devices["Desktop Chrome"],
         // Use default Chromium everywhere for consistent screenshot rendering
         // Run `npm run test:e2e:docker` locally for CI-identical results
+        ...(process.env.PW_CHANNEL ? { channel: process.env.PW_CHANNEL } : {}),
       },
     },
   ],
diff --git a/tests/e2e/pdf-annotations.spec.ts b/tests/e2e/pdf-annotations.spec.ts
index d080fa50..af67bb79 100644
--- a/tests/e2e/pdf-annotations.spec.ts
+++ b/tests/e2e/pdf-annotations.spec.ts
@@ -317,17 +317,32 @@ test.describe("PDF Server - Annotations", () => {
 
 /**
  * Read the most recent interact result text from the basic-host UI.
- * Expands the latest "📤 Tool Result" panel and returns the <pre> text.
+ * Waits for the result-panel count to reach `expectedCount` first —
+ * `callInteract` doesn't block, so `.last()` would otherwise race to the
+ * previous (display_pdf) panel.
  */
-async function readLastToolResult(page: Page): Promise {
-  const panel = page.locator('text="📤 Tool Result"').last();
-  await expect(panel).toBeVisible({ timeout: 30000 });
-  await panel.click();
+async function readLastToolResult(
+  page: Page,
+  expectedCount: number,
+): Promise {
+  const panels = page.locator('text="📤 Tool Result"');
+  await expect(panels).toHaveCount(expectedCount, { timeout: 30000 });
+  await panels.last().click();
   const pre = page.locator("pre").last();
   await expect(pre).toBeVisible({ timeout: 5000 });
   return (await pre.textContent()) ?? "";
 }
 
+/** Unwrap basic-host's `CallToolResult` JSON to its first text block; throws if none has non-empty text. */
+function unwrapTextResult(raw: string): string {
+  const parsed = JSON.parse(raw) as {
+    content?: { type: string; text?: string }[];
+  };
+  const block = parsed.content?.find((c) => c.type === "text");
+  if (!block?.text) throw new Error(`No text block in: ${raw.slice(0, 200)}`);
+  return block.text;
+}
+
 test.describe("PDF Server - get_viewer_state", () => {
   test("returns page/zoom/mode and selection:null when nothing is selected", async ({
     page,
@@ -338,16 +353,15 @@ test.describe("PDF Server - get_viewer_state", () => {
     const viewUUID = await extractViewUUID(page);
 
     await callInteract(page, { viewUUID, action: "get_viewer_state" });
-    const result = await readLastToolResult(page);
-
-    // Basic-host renders text content blocks as JSON-ish; the viewer's reply
-    // is a JSON object — assert key fields without being brittle on
-    // surrounding chrome.
-    expect(result).toMatch(/"currentPage"\s*:\s*1/);
-    expect(result).toMatch(/"pageCount"\s*:\s*\d+/);
-    expect(result).toMatch(/"zoom"\s*:\s*\d+/);
-    expect(result).toMatch(/"displayMode"\s*:\s*"inline"/);
-    expect(result).toMatch(/"selection"\s*:\s*null/);
+    const raw = await readLastToolResult(page, 2);
+    const state = JSON.parse(unwrapTextResult(raw));
+
+    expect(state.currentPage).toBe(1);
+    expect(state.pageCount).toBeGreaterThan(1);
+    expect(typeof state.zoom).toBe("number");
+    expect(state.displayMode).toBe("inline");
+    expect(state.selection).toBeNull();
+    expect(Array.isArray(state.selectedAnnotationIds)).toBe(true);
   });
 
   test("returns selected text and bounding rect when text-layer text is selected", async ({
@@ -374,11 +388,19 @@ test.describe("PDF Server - get_viewer_state", () => {
     expect(selectedText.length).toBeGreaterThan(0);
 
     await callInteract(page, { viewUUID, action: "get_viewer_state" });
-    const result = await readLastToolResult(page);
-
-    expect(result).toMatch(/"selection"\s*:\s*\{/);
-    expect(result).toContain(JSON.stringify(selectedText).slice(1, -1));
-    expect(result).toMatch(/"boundingRect"\s*:\s*\{/);
-    expect(result).toMatch(/"currentPage"\s*:\s*1/);
+    const raw = await readLastToolResult(page, 2);
+    const state = JSON.parse(unwrapTextResult(raw));
+
+    expect(state.currentPage).toBe(1);
+    expect(state.selection).not.toBeNull();
+    expect(state.selection.text).toContain(selectedText);
+    expect(state.selection.boundingRect).toEqual(
+      expect.objectContaining({
+        x: expect.any(Number),
+        y: expect.any(Number),
+        width: expect.any(Number),
+        height: expect.any(Number),
+      }),
+    );
   });
 });