Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .claude/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"Bash(pnpm test:*)",
"Bash(pnpm format:*)",
"Bash(pnpm --filter:*)",
"Bash(gh api *)"
"Bash(gh api:*)"
]
}
}
13 changes: 11 additions & 2 deletions apps/browser-scraper/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { Logger } from "@repo/logger";
import { validateSourceUrl } from "@repo/url-validator";
import { z } from "zod";

import { PlaywrightService } from "./services/playwright";
import { classifyBrowserError, PlaywrightService } from "./services/playwright";

type Env = { BROWSER: BrowserWorker };

Expand Down Expand Up @@ -47,9 +47,18 @@ const handle = async (request: Request, env: Env): Promise<Response> => {
);
return Response.json(result);
} catch (error) {
const errorMessage =
error instanceof Error ? error.message : "Unknown error";
if (classifyBrowserError(error) === "browser_unavailable") {
logger.warn("browser unavailable", { url: parsed.url, errorMessage });
return Response.json(
{ ok: false, error: "Browser unavailable" },
{ status: 503 }
);
}
logger.error("unexpected render failure", {
url: parsed.url,
errorMessage: error instanceof Error ? error.message : "Unknown error",
errorMessage,
});
return Response.json(
{ ok: false, error: "Internal render failure" },
Expand Down
30 changes: 29 additions & 1 deletion apps/browser-scraper/src/services/playwright.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@ vi.mock("@cloudflare/playwright", () => ({
}));

const { launch } = await import("@cloudflare/playwright");
const { PlaywrightService } = await import("./playwright");
const { classifyBrowserError, PlaywrightService } =
await import("./playwright");

type GotoFn = () => Promise<{
status: () => number;
Expand Down Expand Up @@ -115,3 +116,30 @@ describe("PlaywrightService.render", () => {
expect(result).toEqual({ ok: false, error: "HTTP 404", status: 404 });
});
});

// Exercises classifyBrowserError with Error instances and with non-Error
// throwables, checking which classification string comes back.
describe("classifyBrowserError", () => {
  // One single-element argument tuple per case; each message is expected to
  // be classified as "browser_unavailable".
  const unavailableMessages = [
    ["browserType.connectOverCDP: Timeout 30000ms exceeded."],
    ["Timeout 30000ms exceeded"],
    ["WebSocket error: SessionID: abc [object ErrorEvent]"],
    ["Target closed"],
    ["Connection closed while reading from the driver"],
  ];

  it.each(unavailableMessages)(
    "classifies %s as browser_unavailable",
    (message) => {
      const kind = classifyBrowserError(new Error(message));
      expect(kind).toBe("browser_unavailable");
    }
  );

  it("classifies unknown errors as browser_internal", () => {
    const unrelated = new Error("Something else broke");
    expect(classifyBrowserError(unrelated)).toBe("browser_internal");
  });

  it("handles non-Error throwables", () => {
    // A bare string is passed as-is rather than wrapped in an Error.
    const plainText = "plain string";
    // An object that is not an Error but supplies its own toString().
    const stringifiable = { toString: () => "WebSocket boom" };

    expect(classifyBrowserError(plainText)).toBe("browser_internal");
    expect(classifyBrowserError(stringifiable)).toBe("browser_unavailable");
  });
});
30 changes: 28 additions & 2 deletions apps/browser-scraper/src/services/playwright.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,27 @@ import { validateSourceUrl } from "@repo/url-validator";
const NAV_TIMEOUT_MS = 15_000;
const MAX_HTML_CHARS = 2 * 1024 * 1024;

/** Classification returned by {@link classifyBrowserError}. */
type BrowserErrorKind = "browser_unavailable" | "browser_internal";

/**
 * Substrings that mark an error message as "browser unavailable".
 * Matching is a plain, case-sensitive substring test.
 * NOTE(review): presumably these are messages seen when the remote browser
 * session cannot be reached or drops — confirm against real failures.
 */
const BROWSER_UNAVAILABLE_PATTERNS = [
  "connectOverCDP",
  "Timeout 30000ms exceeded",
  "WebSocket",
  "ErrorEvent",
  "Target closed",
  "Connection closed",
] as const;

/**
 * Classifies a thrown value by inspecting its message text.
 *
 * @param error - Any thrown value. For `Error` instances the `message` is
 *   inspected; anything else is converted with `String(error)`.
 * @returns `"browser_unavailable"` when the text contains at least one entry
 *   of `BROWSER_UNAVAILABLE_PATTERNS`, otherwise `"browser_internal"`.
 */
const classifyBrowserError = (error: unknown): BrowserErrorKind => {
  const text = error instanceof Error ? error.message : String(error);
  const looksUnavailable = BROWSER_UNAVAILABLE_PATTERNS.some((needle) =>
    text.includes(needle)
  );
  return looksUnavailable ? "browser_unavailable" : "browser_internal";
};

type RenderSuccess = {
ok: true;
html: string;
Expand Down Expand Up @@ -98,5 +119,10 @@ class PlaywrightService {
}
}

export { MAX_HTML_CHARS, NAV_TIMEOUT_MS, PlaywrightService };
export type { RenderError, RenderResult, RenderSuccess };
export {
classifyBrowserError,
MAX_HTML_CHARS,
NAV_TIMEOUT_MS,
PlaywrightService,
};
export type { BrowserErrorKind, RenderError, RenderResult, RenderSuccess };
2 changes: 1 addition & 1 deletion apps/operator/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
"@repo/logger": "workspace:*",
"@repo/url-validator": "workspace:*",
"drizzle-orm": "0.45.2",
"hono": "4.12.12",
"hono": "4.12.18",
"node-html-markdown": "2.0.0",
"openai": "6.34.0",
"zod": "4.3.6"
Expand Down
10 changes: 5 additions & 5 deletions apps/operator/src/services/scrape.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -259,18 +259,18 @@ describe("scrapeUrl", () => {
}
});

it("returns error when response body exceeds 2MB", async () => {
const largeBody = "x".repeat(3 * 1024 * 1024);
it("truncates response bodies exceeding 2MB and marks truncated", async () => {
const largeBody = `<p>${"x".repeat(3 * 1024 * 1024)}</p>`;
vi.stubGlobal(
"fetch",
vi.fn().mockResolvedValueOnce(createMockResponse(largeBody))
);

const result = await scrapeUrl("https://example.com");

expect(result.ok).toBe(false);
if (!result.ok) {
expect(result.error).toBe("Response exceeds 2MB size limit");
expect(result.ok).toBe(true);
if (result.ok) {
expect(result.truncated).toBe(true);
}
});
});
Expand Down
20 changes: 11 additions & 9 deletions apps/operator/src/services/scrape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -61,29 +61,34 @@ const concatChunks = (chunks: Uint8Array[], totalBytes: number): Uint8Array => {

const readBodyWithLimit = async (
body: ReadableStream<Uint8Array>
): Promise<{ bytes: Uint8Array } | { error: string }> => {
): Promise<{ bytes: Uint8Array; truncated: boolean }> => {
const reader = body.getReader();
const chunks: Uint8Array[] = [];
let totalBytes = 0;
let truncated = false;

try {
for (;;) {
const { done, value } = await reader.read();
if (done) {
break;
}
totalBytes += value.byteLength;
if (totalBytes > MAX_BODY_BYTES) {
if (totalBytes + value.byteLength > MAX_BODY_BYTES) {
const remaining = MAX_BODY_BYTES - totalBytes;
chunks.push(value.subarray(0, remaining));
totalBytes += remaining;
truncated = true;
await reader.cancel();
return { error: "Response exceeds 2MB size limit" };
break;
}
chunks.push(value);
totalBytes += value.byteLength;
}
} finally {
reader.releaseLock();
}

return { bytes: concatChunks(chunks, totalBytes) };
return { bytes: concatChunks(chunks, totalBytes), truncated };
};

const collapseWhitespace = (text: string): string =>
Expand Down Expand Up @@ -189,14 +194,11 @@ const fetchViaNative = async (url: string): Promise<FetchedContent> => {
const bodyResult = await readBodyWithLimit(
response.body as ReadableStream<Uint8Array>
);
if ("error" in bodyResult) {
return { ok: false, error: bodyResult.error };
}

const raw = new TextDecoder().decode(bodyResult.bytes);
const contentType = response.headers.get("content-type") ?? "";

return { ok: true, raw, contentType };
return { ok: true, raw, contentType, truncated: bodyResult.truncated };
};

const fetchViaBrowserScraper = async (
Expand Down
7 changes: 7 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,12 @@
"@repo/prettier": "workspace:*",
"prettier": "3.8.2",
"prettier-plugin-tailwindcss": "0.7.2"
},
"pnpm": {
"overrides": {
"esbuild": "0.25.0",
"postcss": "8.5.10",
"brace-expansion": "5.0.6"
}
}
}
Loading
Loading