Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 94 additions & 1 deletion examples/pdf-server/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,33 @@ function waitForSaveData(
});
}

/**
 * Settle callbacks for in-flight `get_viewer_state` requests, keyed by
 * request ID. Calling an entry with a string resolves the matching
 * `waitForViewerState` promise; calling it with an Error rejects it.
 */
const pendingStateRequests = new Map<string, (v: string | Error) => void>();

/**
 * Wait for the viewer to report its current state (page, zoom, selection, …)
 * as a JSON string.
 *
 * @param requestId ID under which the settle callback is registered in
 *   `pendingStateRequests` until the request settles.
 * @param signal Optional abort signal; aborting rejects the promise.
 * @returns The JSON-encoded state passed to the settle callback.
 * @throws Rejects with "interact request cancelled" when `signal` is (or
 *   becomes) aborted, and with "Timeout waiting for viewer state" after
 *   `GET_PAGES_TIMEOUT_MS` without a reply.
 */
function waitForViewerState(
  requestId: string,
  signal?: AbortSignal,
): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    // An "abort" listener added to an already-aborted signal never fires, so
    // without this check a request cancelled before we got here would sit
    // until the timeout. Reject up front and register nothing.
    if (signal?.aborted) {
      reject(new Error("interact request cancelled"));
      return;
    }

    // Single exit point: whichever of reply / abort / timeout happens first
    // tears down the other two before settling the promise.
    const settle = (v: string | Error) => {
      clearTimeout(timer);
      signal?.removeEventListener("abort", onAbort);
      pendingStateRequests.delete(requestId);
      if (v instanceof Error) {
        reject(v);
      } else {
        resolve(v);
      }
    };
    const onAbort = () => settle(new Error("interact request cancelled"));
    const timer = setTimeout(
      () => settle(new Error("Timeout waiting for viewer state")),
      GET_PAGES_TIMEOUT_MS,
    );
    signal?.addEventListener("abort", onAbort);
    pendingStateRequests.set(requestId, settle);
  });
}

interface QueueEntry {
commands: PdfCommand[];
/** Timestamp of the most recent enqueue or dequeue */
Expand Down Expand Up @@ -1350,7 +1377,8 @@ Returns a viewUUID in structuredContent. Pass it to \`interact\`:
- add_annotations, update_annotations, remove_annotations, highlight_text
- fill_form (fill PDF form fields)
- navigate, search, find, search_navigate, zoom
- get_text, get_screenshot (extract content)
- get_text, get_screenshot, get_viewer_state (extract content / read selection & current page)
- save_as (write annotated PDF to disk)

Accepts local files (use list_pdfs), client MCP root directories, or any HTTPS URL.
Set \`elicit_form_inputs\` to true to prompt the user to fill form fields before display.`,
Expand Down Expand Up @@ -1650,6 +1678,7 @@ URL: ${normalized}`,
"fill_form",
"get_text",
"get_screenshot",
"get_viewer_state",
"save_as",
])
.describe("Action to perform"),
Expand Down Expand Up @@ -2238,6 +2267,26 @@ URL: ${normalized}`,
);
}
}
case "get_viewer_state": {
// Ask the viewer for a state snapshot and relay it to the caller as text.
// The request ID is what waitForViewerState registers its settle callback
// under, so the viewer's reply can be matched to this call.
const requestId = randomUUID();
// NOTE(review): the command is enqueued before ensureViewerIsPolling runs;
// if that call throws, the command is not removed here — confirm intended.
enqueueCommand(uuid, { type: "get_viewer_state", requestId });
let state: string;
try {
await ensureViewerIsPolling(uuid);
state = await waitForViewerState(requestId, signal);
} catch (err) {
// Any failure from either await above is reported as an error result
// rather than thrown.
return {
content: [
{
type: "text",
text: `Error: ${err instanceof Error ? err.message : String(err)}`,
},
],
isError: true,
};
}
// The state string is passed through unchanged as the single text block.
return { content: [{ type: "text", text: state }] };
}
default:
return {
content: [{ type: "text", text: `Unknown action: ${action}` }],
Expand Down Expand Up @@ -2295,6 +2344,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
**TEXT/SCREENSHOTS**:
• get_text: extract text from pages. Optional \`page\` for single page, or \`intervals\` for ranges [{start?,end?}]. Max 20 pages.
• get_screenshot: capture a single page as PNG image. Requires \`page\`.
• get_viewer_state: snapshot of the live viewer — JSON {currentPage, pageCount, zoom, displayMode, selectedAnnotationIds, selection:{text,contextBefore,contextAfter,boundingRect}|null}. Use this to read what the user has selected or which page they're on.

**FORMS** — fill_form: fill fields with \`fields\` array of {name, value}.

Expand All @@ -2320,6 +2370,7 @@ Example — add a signature image and a stamp, then screenshot to verify:
"fill_form",
"get_text",
"get_screenshot",
"get_viewer_state",
"save_as",
])
.optional()
Expand Down Expand Up @@ -2603,6 +2654,48 @@ Example — add a signature image and a stamp, then screenshot to verify:
},
);

// Tool: submit_viewer_state (app-only) — reply channel for get_viewer_state.
// The viewer calls this with the request ID it was handed plus either a
// JSON-encoded state snapshot or an error message; the handler looks up the
// settle callback stored in pendingStateRequests and invokes it.
registerAppTool(
  server,
  "submit_viewer_state",
  {
    title: "Submit Viewer State",
    description:
      "Submit a viewer-state snapshot for a get_viewer_state request (used by viewer). The model should NOT call this tool directly.",
    inputSchema: {
      requestId: z
        .string()
        .describe("The request ID from the get_viewer_state command"),
      state: z
        .string()
        .optional()
        .describe("JSON-encoded viewer state snapshot"),
      error: z
        .string()
        .optional()
        .describe("Error message if the viewer failed to read state"),
    },
    _meta: { ui: { visibility: ["app"] } },
  },
  async ({ requestId, state, error }): Promise<CallToolResult> => {
    const resolvePending = pendingStateRequests.get(requestId);

    // No callback registered under this ID: report it back to the viewer.
    if (resolvePending === undefined) {
      const message = `No pending request for ${requestId}`;
      return { content: [{ type: "text", text: message }], isError: true };
    }

    // A reported error wins over any state; a missing/empty state is also
    // treated as a failure.
    const outcome: string | Error =
      !error && state
        ? state
        : new Error(error || "Viewer returned no state");
    resolvePending(outcome);

    return { content: [{ type: "text", text: "Submitted" }] };
  },
);

// Tool: poll_pdf_commands (app-only) - Poll for pending commands
registerAppTool(
server,
Expand Down
1 change: 1 addition & 0 deletions examples/pdf-server/src/commands.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,5 @@ export type PdfCommand =
getScreenshots: boolean;
}
| { type: "save_as"; requestId: string }
| { type: "get_viewer_state"; requestId: string }
| { type: "file_changed"; mtimeMs: number };
95 changes: 95 additions & 0 deletions examples/pdf-server/src/mcp-app.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2428,6 +2428,84 @@ async function renderPageOffscreen(pageNum: number): Promise<string> {
return dataUrl.split(",")[1];
}

/**
 * Build a snapshot of the live viewer and submit it for
 * `interact({action:"get_viewer_state"})`.
 *
 * The text selection is read from `window.getSelection()` at call time (no
 * caching). `selection` is `null` when nothing is selected or when the
 * selection does not sit inside the text layer. `boundingRect` is expressed
 * relative to the page wrapper and divided by the current scale, i.e. model
 * coords (PDF points, origin top-left, y-down), so it can be fed straight
 * back into `add_annotations`.
 *
 * @param requestId ID of the get_viewer_state command being answered.
 */
async function handleGetViewerState(requestId: string): Promise<void> {
  const CONTEXT_CHARS = 200;

  type SelectionSnapshot = {
    text: string;
    contextBefore: string;
    contextAfter: string;
    boundingRect: { x: number; y: number; width: number; height: number };
  };

  // Round to two decimal places.
  const toHundredths = (n: number) => Math.round(n * 100) / 100;

  /** Describe the current text-layer selection, or null if there is none. */
  const captureSelection = (): SelectionSnapshot | null => {
    const domSelection = window.getSelection();
    const text = domSelection?.toString().replace(/\s+/g, " ").trim();
    if (!domSelection || !text || domSelection.rangeCount <= 0) {
      return null;
    }

    // Only count it as a PDF selection when it lives inside the text layer
    // of the rendered page (not the toolbar, search box, etc.).
    const range = domSelection.getRangeAt(0);
    const container = range.commonAncestorContainer;
    const host =
      container.nodeType === Node.ELEMENT_NODE
        ? (container as Element)
        : container.parentElement;
    if (!host || !textLayerEl.contains(host)) {
      return null;
    }

    // Context: find the selection in the page's extracted text and take up
    // to CONTEXT_CHARS on either side. If it cannot be located, both context
    // strings stay empty while text and rect are still returned.
    const pageText = pageTextCache.get(currentPage) ?? "";
    const match = findSelectionInText(pageText, text);
    let contextBefore = "";
    let contextAfter = "";
    if (match) {
      const from = Math.max(0, match.start - CONTEXT_CHARS);
      contextBefore = pageText.slice(from, match.start);
      contextAfter = pageText.slice(match.end, match.end + CONTEXT_CHARS);
    }

    // One bounding box: viewport-relative rect minus the page-wrapper
    // origin, then divided by scale.
    const selectionRect = range.getBoundingClientRect();
    const pageRect = pageWrapperEl.getBoundingClientRect();
    return {
      text,
      contextBefore,
      contextAfter,
      boundingRect: {
        x: toHundredths((selectionRect.left - pageRect.left) / scale),
        y: toHundredths((selectionRect.top - pageRect.top) / scale),
        width: toHundredths(selectionRect.width / scale),
        height: toHundredths(selectionRect.height / scale),
      },
    };
  };

  const selection = captureSelection();

  const snapshot = {
    currentPage,
    pageCount: totalPages,
    zoom: Math.round(scale * 100),
    displayMode: currentDisplayMode,
    selectedAnnotationIds: [...selectedAnnotationIds],
    selection,
  };
  const encoded = JSON.stringify(snapshot, null, 2);

  await app.callServerTool({
    name: "submit_viewer_state",
    arguments: { requestId, state: encoded },
  });
}

async function handleGetPages(cmd: {
requestId: string;
intervals: Array<{ start?: number; end?: number }>;
Expand Down Expand Up @@ -4678,6 +4756,23 @@ async function processCommands(commands: PdfCommand[]): Promise<void> {
.catch(() => {});
}
break;
case "get_viewer_state":
// Same await-before-next-poll discipline as get_pages/save_as.
// The handler is awaited so the snapshot is submitted before this case
// finishes.
try {
await handleGetViewerState(cmd.requestId);
} catch (err) {
// Building or submitting the snapshot failed: send the error message
// under the same request ID instead.
log.error("get_viewer_state failed — submitting error:", err);
await app
.callServerTool({
name: "submit_viewer_state",
arguments: {
requestId: cmd.requestId,
error: err instanceof Error ? err.message : String(err),
},
})
// Best-effort: a failure while reporting the error is ignored.
.catch(() => {});
}
break;
case "file_changed": {
// Skip our own save_pdf echo: either save is still in flight, or the
// event's mtime matches what save_pdf just returned.
Expand Down
1 change: 1 addition & 0 deletions playwright.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ export default defineConfig({
...devices["Desktop Chrome"],
// Use default Chromium everywhere for consistent screenshot rendering
// Run `npm run test:e2e:docker` locally for CI-identical results
...(process.env.PW_CHANNEL ? { channel: process.env.PW_CHANNEL } : {}),
},
},
],
Expand Down
90 changes: 90 additions & 0 deletions tests/e2e/pdf-annotations.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -314,3 +314,93 @@ test.describe("PDF Server - Annotations", () => {
);
});
});

/**
 * Return the text of the newest interact result shown in the basic-host UI.
 *
 * `callInteract` doesn't block, so this first waits until exactly
 * `expectedCount` result panels exist; otherwise `.last()` could still point
 * at the previous (display_pdf) panel. It then clicks the last panel and
 * reads the last `<pre>` on the page.
 *
 * @param page Playwright page hosting the basic-host UI.
 * @param expectedCount Number of result panels to wait for.
 * @returns The `<pre>` text content, or "" when it has none.
 */
async function readLastToolResult(
  page: Page,
  expectedCount: number,
): Promise<string> {
  const resultHeaders = page.locator('text="📤 Tool Result"');
  await expect(resultHeaders).toHaveCount(expectedCount, { timeout: 30000 });
  await resultHeaders.last().click();

  const newestPre = page.locator("pre").last();
  await expect(newestPre).toBeVisible({ timeout: 5000 });

  const body = await newestPre.textContent();
  return body ?? "";
}

/**
 * Unwrap basic-host's `CallToolResult` JSON to the first text block.
 *
 * The parsed payload is treated as untrusted: anything that is not an object
 * with a `content` array, and any entry that is not an object, is skipped
 * rather than dereferenced, so malformed payloads fail with the descriptive
 * error below instead of an opaque TypeError.
 *
 * @param raw JSON text of a `CallToolResult`.
 * @returns The `text` of the first entry whose `type` is "text".
 * @throws SyntaxError if `raw` is not valid JSON.
 * @throws Error ("No text block in: …") if there is no such entry or its
 *   `text` is missing, empty, or not a string.
 */
function unwrapTextResult(raw: string): string {
  const isRecord = (v: unknown): v is Record<string, unknown> =>
    typeof v === "object" && v !== null;

  const parsed: unknown = JSON.parse(raw);
  const content = isRecord(parsed) ? parsed.content : undefined;
  const blocks: unknown[] = Array.isArray(content) ? content : [];

  // First entry tagged as text; its text is validated separately so a
  // text-typed block without usable text still produces the error below.
  const block = blocks.find((c) => isRecord(c) && c.type === "text");
  const text = isRecord(block) ? block.text : undefined;

  if (typeof text !== "string" || text === "") {
    throw new Error(`No text block in: ${raw.slice(0, 200)}`);
  }
  return text;
}

// End-to-end coverage for interact({action: "get_viewer_state"}): one case
// with no selection, one with a programmatic text-layer selection.
test.describe("PDF Server - get_viewer_state", () => {
  test("returns page/zoom/mode and selection:null when nothing is selected", async ({
    page,
  }) => {
    await loadPdfServer(page);
    await waitForPdfCanvas(page);

    const viewUUID = await extractViewUUID(page);

    // The second result panel (after display_pdf) carries the snapshot.
    await callInteract(page, { viewUUID, action: "get_viewer_state" });
    const resultJson = await readLastToolResult(page, 2);
    const snapshot = JSON.parse(unwrapTextResult(resultJson));

    expect(snapshot.currentPage).toBe(1);
    expect(snapshot.pageCount).toBeGreaterThan(1);
    expect(typeof snapshot.zoom).toBe("number");
    expect(snapshot.displayMode).toBe("inline");
    expect(snapshot.selection).toBeNull();
    expect(Array.isArray(snapshot.selectedAnnotationIds)).toBe(true);
  });

  test("returns selected text and bounding rect when text-layer text is selected", async ({
    page,
  }) => {
    await loadPdfServer(page);
    await waitForPdfCanvas(page);

    const viewUUID = await extractViewUUID(page);
    const appFrame = getAppFrame(page);

    // Select the contents of the first text-layer span from inside the
    // frame and return the whitespace-normalized selection text.
    const firstSpan = appFrame.locator("#text-layer span").first();
    const selectedText = await firstSpan.evaluate((el) => {
      const doc = el.ownerDocument;
      const spanRange = doc.createRange();
      spanRange.selectNodeContents(el);
      const domSelection = doc.defaultView!.getSelection()!;
      domSelection.removeAllRanges();
      domSelection.addRange(spanRange);
      return domSelection.toString().replace(/\s+/g, " ").trim();
    });
    expect(selectedText.length).toBeGreaterThan(0);

    await callInteract(page, { viewUUID, action: "get_viewer_state" });
    const resultJson = await readLastToolResult(page, 2);
    const snapshot = JSON.parse(unwrapTextResult(resultJson));

    expect(snapshot.currentPage).toBe(1);
    expect(snapshot.selection).not.toBeNull();
    expect(snapshot.selection.text).toContain(selectedText);

    // All four rect fields must be present and numeric.
    const numericRect = expect.objectContaining({
      x: expect.any(Number),
      y: expect.any(Number),
      width: expect.any(Number),
      height: expect.any(Number),
    });
    expect(snapshot.selection.boundingRect).toEqual(numericRect);
  });
});
Loading