From 7492cd598f56c6295feac7e517f236f406b662a4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:21:42 +0000 Subject: [PATCH 1/4] Initial plan From f8d2c079e1a2a0865720db5b496604dea99cbfcd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:34:24 +0000 Subject: [PATCH 2/4] feat: add Playwright fallback extractor for JS-heavy URL ingestion Implements: - src/lib/ingestion/extractor.ts: Extractor interface + ExtractResult type - src/lib/ingestion/extractor-fetch.ts: fast-path HTTP fetch extractor - src/lib/ingestion/extractor-playwright.ts: PlaywrightExtractor (opt-in, dynamic import, fresh context per run, configurable timeout, graceful degradation) - src/lib/ingestion/service.ts: IngestionService with primary + fallback orchestration, telemetry emission, OB_PLAYWRIGHT_FALLBACK env var support - src/cli/commands/add.ts: 'ob add ' CLI command - package.json: playwright as optional peer dependency - 47 unit tests (no real browser required in CI) - tests/fixtures/playwright-fallback/: HTML fixtures for documentation Agent-Logs-Url: https://github.com/TheWizardsCode/ContextHub/sessions/511715b4-dc90-4fe1-8664-cd673580f894 Co-authored-by: SorraTheOrc <250240+SorraTheOrc@users.noreply.github.com> --- package.json | 8 + src/cli/commands/add.ts | 168 ++++++++++ src/lib/ingestion/extractor-fetch.ts | 108 +++++++ src/lib/ingestion/extractor-playwright.ts | 210 +++++++++++++ src/lib/ingestion/extractor.ts | 37 +++ src/lib/ingestion/service.ts | 189 ++++++++++++ .../playwright-fallback/js-heavy-page.html | 22 ++ .../playwright-fallback/static-page.html | 15 + tests/lib/ingestion/add-command.test.ts | 106 +++++++ tests/lib/ingestion/extractor-fetch.test.ts | 131 ++++++++ .../ingestion/extractor-playwright.test.ts | 291 ++++++++++++++++++ tests/lib/ingestion/extractor.test.ts | 47 +++ tests/lib/ingestion/service.test.ts | 287 +++++++++++++++++ 13 files changed, 1619 insertions(+) create mode 100644 src/cli/commands/add.ts create mode 100644 src/lib/ingestion/extractor-fetch.ts create mode 100644 src/lib/ingestion/extractor-playwright.ts create mode 100644 src/lib/ingestion/extractor.ts create mode 100644 src/lib/ingestion/service.ts create mode 100644 tests/fixtures/playwright-fallback/js-heavy-page.html create mode 100644 tests/fixtures/playwright-fallback/static-page.html create mode 100644 tests/lib/ingestion/add-command.test.ts create mode 100644 tests/lib/ingestion/extractor-fetch.test.ts create mode 100644 tests/lib/ingestion/extractor-playwright.test.ts create mode 100644 tests/lib/ingestion/extractor.test.ts create mode 100644 tests/lib/ingestion/service.test.ts diff --git a/package.json b/package.json index 426b1489..ed2e8179 100644 --- a/package.json +++ b/package.json @@ -36,6 +36,14 @@ "express": "^4.18.2", "js-yaml": "^4.1.1" }, + "peerDependencies": { + "playwright": ">=1.40.0" + }, + "peerDependenciesMeta": { + "playwright": { + "optional": true + } + }, "devDependencies": { "@types/blessed": "^0.1.7", "@types/better-sqlite3": "^7.6.13", diff --git a/src/cli/commands/add.ts b/src/cli/commands/add.ts new file mode 100644 index 00000000..1c822e61 --- /dev/null +++ b/src/cli/commands/add.ts @@ -0,0 +1,168 @@ +/** + * `ob add ` CLI command. + * + * Ingests content from a URL using the fast-path primary extractor, falling + * back to a Playwright headless-browser fetch when: + * - The Playwright fallback is explicitly enabled (via config or environment + * variable `OB_PLAYWRIGHT_FALLBACK=1`), AND + * - The primary extractor returns fewer characters than the configured + * minimum content length threshold. + * + * Usage: + * ob add [--playwright-fallback] [--min-content-length ] [--timeout ] + * + * The command writes the ingested content to stdout (or a file if `--output` + * is provided) in a format compatible with the downstream ingestion pipeline. + * It exits with a non-zero status code only on unrecoverable errors (e.g. an + * invalid URL); partial or empty content from a gracefully-degraded fetch is + * not treated as a fatal error. + */ + +import { IngestionService, type IngestionConfig } from '../../lib/ingestion/service.js'; +import { FetchExtractor } from '../../lib/ingestion/extractor-fetch.js'; +import { PlaywrightExtractor, type PlaywrightBrowserChannel } from '../../lib/ingestion/extractor-playwright.js'; +import type { ExtractResult } from '../../lib/ingestion/extractor.js'; +import * as fs from 'fs'; + +/** Options accepted by the `add` command. */ +export interface AddCommandOptions { + /** Enable the Playwright fallback (overrides config). */ + playwrightFallback?: boolean; + /** Minimum content length before fallback is triggered. */ + minContentLength?: number; + /** Playwright navigation timeout in milliseconds. */ + timeout?: number; + /** Browser channel to use for Playwright. */ + browser?: PlaywrightBrowserChannel; + /** Write output to this file path instead of stdout. */ + output?: string; + /** + * Injectable file-write function for testing. + * Defaults to `fs.writeFileSync` when not provided. + * @internal + */ + _writeFile?: (path: string, data: string, encoding: BufferEncoding) => void; +} + +/** + * Dependency-injectable runner for the `add` command. + * + * Separated from Commander.js setup so it can be unit-tested without starting + * a real CLI process. + * + * @param url The URL to ingest. + * @param options Command options. + * @returns The ingested {@link ExtractResult}. + */ +export async function runAdd( + url: string, + options: AddCommandOptions = {} +): Promise { + if (!url) { + throw new Error('URL is required'); + } + + const ingestionConfig: IngestionConfig = { + playwrightFallback: options.playwrightFallback ?? false, + minContentLength: options.minContentLength, + }; + + const playwrightExtractor = new PlaywrightExtractor({ + browser: options.browser ?? 'chromium', + timeoutMs: options.timeout ?? 30_000, + }); + + const service = new IngestionService( + new FetchExtractor(), + playwrightExtractor, + ingestionConfig + ); + + const result = await service.ingest(url); + + if (options.output) { + const writeFile = options._writeFile ?? fs.writeFileSync; + writeFile(options.output, result.text, 'utf8'); + } else { + process.stdout.write(result.text); + if (result.text.length > 0 && !result.text.endsWith('\n')) { + process.stdout.write('\n'); + } + } + + return result; +} + +// --------------------------------------------------------------------------- +// Commander.js registration (optional — used when loaded as a plugin command) +// --------------------------------------------------------------------------- + +/** + * Register the `add` command with a Commander.js program. + * + * This function follows the same pattern as other commands in `src/commands/`. + * + * @param program The root Commander.js Command instance. + */ +export function registerAddCommand(program: { + command: (name: string) => CommandBuilder; +}): void { + program + .command('add ') + .description('Ingest content from a URL (with optional Playwright fallback for JS-heavy pages)') + .option( + '--playwright-fallback', + 'Enable Playwright headless-browser fallback (requires `playwright` package)', + false + ) + .option( + '--min-content-length ', + 'Minimum character count before Playwright fallback is triggered', + '200' + ) + .option( + '--timeout ', + 'Playwright navigation timeout in milliseconds', + '30000' + ) + .option( + '--browser ', + 'Playwright browser channel: chromium | firefox | webkit', + 'chromium' + ) + .option('--output ', 'Write ingested content to file instead of stdout') + .action(async (url: string, opts: Record) => { + const options: AddCommandOptions = { + playwrightFallback: Boolean(opts['playwrightFallback']), + minContentLength: opts['minContentLength'] + ? parseInt(String(opts['minContentLength']), 10) + : undefined, + timeout: opts['timeout'] + ? parseInt(String(opts['timeout']), 10) + : undefined, + browser: opts['browser'] as PlaywrightBrowserChannel | undefined, + output: opts['output'] ? String(opts['output']) : undefined, + }; + + try { + await runAdd(url, options); + } catch (err) { + process.stderr.write( + `[ob add] Error: ${err instanceof Error ? err.message : String(err)}\n` + ); + process.exit(1); + } + }); +} + +// --------------------------------------------------------------------------- +// Minimal type shim for Commander.js .option / .action chaining +// (avoids requiring a Commander.js type import in this standalone module) +// --------------------------------------------------------------------------- + +interface CommandBuilder { + description: (desc: string) => CommandBuilder; + option: (flags: string, description: string, defaultValue?: string | boolean) => CommandBuilder; + // eslint-disable-next-line @typescript-eslint/no-explicit-any + action: (fn: (...args: any[]) => void | Promise) => CommandBuilder; +} diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts new file mode 100644 index 00000000..55fe33ef --- /dev/null +++ b/src/lib/ingestion/extractor-fetch.ts @@ -0,0 +1,108 @@ +/** + * Primary fast-path extractor. + * + * Fetches a URL using the built-in `fetch` API and extracts plain text by + * stripping HTML tags. This is the default extractor used by the ingestion + * service; it is fast but cannot handle pages that render content with + * client-side JavaScript. + */ + +import type { Extractor, ExtractResult } from './extractor.js'; + +/** Configuration for the primary fetch-based extractor. */ +export interface FetchExtractorConfig { + /** + * Request timeout in milliseconds. + * @default 15000 + */ + timeoutMs?: number; + /** + * User-Agent string to send with requests. + */ + userAgent?: string; +} + +const DEFAULT_TIMEOUT_MS = 15_000; +const DEFAULT_USER_AGENT = + 'Mozilla/5.0 (compatible; OpenBrain-Ingestion/1.0; +https://github.com/TheWizardsCode/ContextHub)'; + +/** + * Strips HTML tags and collapses whitespace from `html`. + * Returns the resulting plain text. + */ +export function htmlToText(html: string): string { + // Remove script and style blocks entirely. + let text = html + .replace(//gi, ' ') + .replace(//gi, ' '); + // Replace block-level elements with newlines. + text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n'); + // Strip all remaining tags. + text = text.replace(/<[^>]+>/g, ' '); + // Decode common HTML entities. + text = text + .replace(/&/g, '&') + .replace(/</g, '<') + .replace(/>/g, '>') + .replace(/"/g, '"') + .replace(/'/g, "'") + .replace(/ /g, ' '); + // Collapse whitespace. + return text.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim(); +} + +/** + * Extracts the content of the first `` tag in `html`. + */ +export function extractTitle(html: string): string | undefined { + const match = /<title[^>]*>([^<]*)<\/title>/i.exec(html); + return match ? match[1].trim() : undefined; +} + +/** + * Primary extractor that uses HTTP fetch + HTML-to-text conversion. + * + * This extractor is fast and has no external runtime dependencies. It is + * the default first-pass extractor used by the ingestion service before + * attempting the Playwright fallback. + */ +export class FetchExtractor implements Extractor { + private readonly config: Required<FetchExtractorConfig>; + + constructor(config: FetchExtractorConfig = {}) { + this.config = { + timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS, + userAgent: config.userAgent ?? DEFAULT_USER_AGENT, + }; + } + + async extract(url: string): Promise<ExtractResult> { + let html: string; + + try { + const controller = new AbortController(); + const timer = setTimeout( + () => controller.abort(), + this.config.timeoutMs + ); + try { + const response = await fetch(url, { + signal: controller.signal, + headers: { 'User-Agent': this.config.userAgent }, + }); + html = await response.text(); + } finally { + clearTimeout(timer); + } + } catch { + return { text: '', url }; + } + + return { + text: htmlToText(html), + html, + title: extractTitle(html), + url, + }; + } +} diff --git a/src/lib/ingestion/extractor-playwright.ts b/src/lib/ingestion/extractor-playwright.ts new file mode 100644 index 00000000..6b404c9e --- /dev/null +++ b/src/lib/ingestion/extractor-playwright.ts @@ -0,0 +1,210 @@ +/** + * Playwright-based fallback extractor. + * + * Uses a headless Chromium (or configured) browser to render JavaScript-heavy + * pages that the primary fast-path extractor cannot handle. + * + * Design principles: + * - Playwright is imported dynamically at runtime so the module does not fail + * to load when the `playwright` package is absent. + * - Each invocation uses a fresh browser context to avoid credential leakage + * (no cookies, local storage, or auth tokens are shared between runs). + * - All failures (launch error, navigation timeout, navigation error) are + * handled gracefully: the extractor returns an empty result and logs a + * warning rather than throwing. + */ + +import type { Extractor, ExtractResult } from './extractor.js'; + +/** Playwright browser channel to launch. */ +export type PlaywrightBrowserChannel = 'chromium' | 'firefox' | 'webkit'; + +/** Configuration for PlaywrightExtractor. */ +export interface PlaywrightExtractorConfig { + /** + * Which browser to use. + * @default 'chromium' + */ + browser?: PlaywrightBrowserChannel; + /** + * Navigation timeout in milliseconds. + * @default 30000 + */ + timeoutMs?: number; +} + +/** + * Classifies errors that the PlaywrightExtractor can encounter. + * Used in telemetry payloads. + */ +export type PlaywrightErrorType = + | 'launch_failed' + | 'timeout' + | 'navigation_error' + | null; + +/** Internal logger type so tests can inject a spy. */ +export type Logger = { + warn: (message: string, ...meta: unknown[]) => void; +}; + +const DEFAULT_TIMEOUT_MS = 30_000; + +/** + * Playwright-based URL extractor. + * + * Implements the same {@link Extractor} interface as the primary extractor so + * it can be used as a drop-in replacement or fallback in the ingestion service. + */ +export class PlaywrightExtractor implements Extractor { + private readonly config: Required<PlaywrightExtractorConfig>; + private readonly logger: Logger; + private readonly loadPlaywrightFn: () => Promise<BrowserModule>; + + /** + * @param config Browser configuration. + * @param logger Optional logger; defaults to `console`. + * @param loadPlaywrightFn Optional override for the playwright loader. + * Inject a stub here in tests to avoid a real import. + */ + constructor( + config: PlaywrightExtractorConfig = {}, + logger?: Logger, + loadPlaywrightFn?: () => Promise<BrowserModule> + ) { + this.config = { + browser: config.browser ?? 'chromium', + timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS, + }; + this.logger = logger ?? console; + this.loadPlaywrightFn = loadPlaywrightFn ?? loadPlaywright; + } + + /** + * Fetch and extract content from `url` using a headless browser. + * + * Returns an empty {@link ExtractResult} if Playwright is not installed, + * the browser fails to launch, or navigation times out. + */ + async extract(url: string): Promise<ExtractResult> { + let browserModule: BrowserModule | null = null; + + // Dynamic import guard: fail gracefully if playwright is not installed. + try { + browserModule = await this.loadPlaywrightFn(); + } catch { + this.logger.warn( + '[PlaywrightExtractor] playwright package is not installed. ' + + 'Install it with: npm install playwright' + ); + return { text: '', url }; + } + + const launcher = browserModule[this.config.browser]; + let browser: BrowserLike | null = null; + + try { + browser = await launcher.launch({ headless: true }); + } catch (err) { + this.logger.warn( + '[PlaywrightExtractor] Failed to launch browser:', + err instanceof Error ? err.message : err + ); + return { text: '', url }; + } + + try { + // Fresh context — no cookies, localStorage, or auth tokens. + const context = await browser.newContext({ + storageState: undefined, + }); + const page = await context.newPage(); + + try { + await page.goto(url, { + timeout: this.config.timeoutMs, + waitUntil: 'domcontentloaded', + }); + } catch (err) { + const isTimeout = + err instanceof Error && err.message.includes('Timeout'); + const errorType: PlaywrightErrorType = isTimeout + ? 'timeout' + : 'navigation_error'; + this.logger.warn( + `[PlaywrightExtractor] Navigation ${errorType} for ${url}:`, + err instanceof Error ? err.message : err + ); + return { text: '', url }; + } + + const html = await page.content(); + const title = await page.title().catch(() => undefined); + // evaluate runs in the browser context where `document` is available; + // cast through globalThis to avoid TypeScript's non-DOM lib complaint. + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const text = await page.evaluate(() => (globalThis as any).document?.body?.innerText ?? ''); + + await context.close(); + + return { text, html, title, url }; + } finally { + try { + await browser.close(); + } catch { + // Ignore close errors. + } + } + } +} + +// --------------------------------------------------------------------------- +// Internal browser-module loading abstraction (facilitates testing) +// --------------------------------------------------------------------------- + +/** Minimal shape of a Playwright browser launcher we use. */ +export interface BrowserTypeLike { + launch(options: { headless: boolean }): Promise<BrowserLike>; +} + +/** Minimal shape of a Playwright Browser we use. */ +export interface BrowserLike { + newContext(options: { storageState: undefined }): Promise<BrowserContextLike>; + close(): Promise<void>; +} + +/** Minimal shape of a Playwright BrowserContext we use. */ +export interface BrowserContextLike { + newPage(): Promise<PageLike>; + close(): Promise<void>; +} + +/** Minimal shape of a Playwright Page we use. */ +export interface PageLike { + goto(url: string, options: { timeout: number; waitUntil: string }): Promise<unknown>; + content(): Promise<string>; + title(): Promise<string>; + evaluate<T>(fn: () => T): Promise<T>; +} + +/** The subset of the playwright module that PlaywrightExtractor uses. */ +export interface BrowserModule { + chromium: BrowserTypeLike; + firefox: BrowserTypeLike; + webkit: BrowserTypeLike; +} + +/** + * Load the playwright module. + * + * Isolated into a separate function so callers can override it at construction + * time (via the `loadPlaywrightFn` constructor parameter) without needing to + * actually install playwright. + * + * @internal + */ +export async function loadPlaywright(): Promise<BrowserModule> { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const mod = await import('playwright' as any) as BrowserModule; + return mod; +} diff --git a/src/lib/ingestion/extractor.ts b/src/lib/ingestion/extractor.ts new file mode 100644 index 00000000..6b6bef91 --- /dev/null +++ b/src/lib/ingestion/extractor.ts @@ -0,0 +1,37 @@ +/** + * Extractor interface for URL content retrieval. + * + * All extractors (primary fast-path and Playwright fallback) implement this + * interface so they can be swapped in the ingestion pipeline without changes + * to the service layer. + */ + +/** + * The result of extracting content from a URL. + */ +export interface ExtractResult { + /** The plain-text content extracted from the page. */ + text: string; + /** The raw HTML of the page (optional; used for downstream parsing). */ + html?: string; + /** The page title, if available. */ + title?: string; + /** The URL that was fetched (may differ from the requested URL after redirects). */ + url: string; +} + +/** + * Interface that all URL extractors must implement. + */ +export interface Extractor { + /** + * Fetch and extract content from the given URL. + * + * Implementations must not throw. Failures are surfaced via a result with + * an empty `text` field (and optionally an error logged internally). + * + * @param url The URL to fetch. + * @returns A promise resolving to the extracted content. + */ + extract(url: string): Promise<ExtractResult>; +} diff --git a/src/lib/ingestion/service.ts b/src/lib/ingestion/service.ts new file mode 100644 index 00000000..36549a15 --- /dev/null +++ b/src/lib/ingestion/service.ts @@ -0,0 +1,189 @@ +/** + * Ingestion service with Playwright fallback orchestration. + * + * Orchestrates URL content retrieval using a fast primary extractor and an + * optional Playwright-based fallback for JavaScript-heavy pages. + * + * The fallback is opt-in: it is only activated when: + * 1. `config.playwrightFallback` is `true` (or the `OB_PLAYWRIGHT_FALLBACK` + * environment variable is set to a truthy value), AND + * 2. The primary extractor returns fewer characters than + * `config.minContentLength` (default: 200). + * + * On every invocation, a structured telemetry entry is emitted via the + * provided logger. Telemetry never includes URL text, page content, or any + * user-identifiable data. + */ + +import type { Extractor, ExtractResult } from './extractor.js'; + +/** Configuration for the ingestion service. */ +export interface IngestionConfig { + /** + * Enable the Playwright fallback extractor. + * + * Can also be enabled by setting the `OB_PLAYWRIGHT_FALLBACK` environment + * variable to any truthy value (`1`, `true`, `yes`). + * + * @default false + */ + playwrightFallback?: boolean; + + /** + * Minimum character count that primary extractor content must reach before + * the Playwright fallback is considered unnecessary. + * + * @default 200 + */ + minContentLength?: number; +} + +// --------------------------------------------------------------------------- +// Telemetry +// --------------------------------------------------------------------------- + +/** Structured telemetry entry emitted on every ingestion run. */ +export interface PlaywrightFallbackTelemetry { + /** Fixed event identifier. */ + event: 'playwright_fallback'; + /** Whether the fallback was actually triggered. */ + triggered: boolean; + /** Character count returned by the primary extractor. */ + primaryContentLength: number; + /** Character count returned by PlaywrightExtractor (0 if not triggered or failed). */ + fallbackContentLength: number; + /** Wall-clock time of the Playwright fetch in milliseconds (0 if not triggered). */ + durationMs: number; + /** Whether PlaywrightExtractor returned usable content. */ + success: boolean; + /** Error classification, or null when no error occurred. */ + errorType: 'launch_failed' | 'timeout' | 'navigation_error' | null; + /** Always "playwright" for this fallback. */ + provider: 'playwright'; +} + +/** Minimal logger interface used by the service. */ +export interface ServiceLogger { + info: (message: string, meta?: PlaywrightFallbackTelemetry) => void; + warn: (message: string, ...args: unknown[]) => void; +} + +// --------------------------------------------------------------------------- +// IngestionService +// --------------------------------------------------------------------------- + +const DEFAULT_MIN_CONTENT_LENGTH = 200; + +/** + * Determines whether the Playwright fallback is enabled, considering both the + * programmatic config and the `OB_PLAYWRIGHT_FALLBACK` environment variable. + */ +function isFallbackEnabled(config: IngestionConfig): boolean { + if (config.playwrightFallback === true) return true; + const envVal = process.env['OB_PLAYWRIGHT_FALLBACK']; + return envVal === '1' || envVal === 'true' || envVal === 'yes'; +} + +/** + * Service that orchestrates URL ingestion with an optional Playwright + * fallback for JavaScript-heavy pages. + */ +export class IngestionService { + private readonly primaryExtractor: Extractor; + private readonly playwrightExtractor: Extractor; + private readonly config: IngestionConfig; + private readonly logger: ServiceLogger; + + constructor( + primaryExtractor: Extractor, + playwrightExtractor: Extractor, + config: IngestionConfig = {}, + logger?: ServiceLogger + ) { + this.primaryExtractor = primaryExtractor; + this.playwrightExtractor = playwrightExtractor; + this.config = config; + this.logger = logger ?? { + info: (msg, meta) => { + const line = meta ? `${msg} ${JSON.stringify(meta)}` : msg; + process.stdout.write(`${line}\n`); + }, + warn: (msg, ...args) => { + const parts = [msg, ...args.map(a => String(a))].join(' '); + process.stderr.write(`${parts}\n`); + }, + }; + } + + /** + * Ingest content from `url`. + * + * 1. Runs the primary extractor. + * 2. If the fallback is enabled and primary content is below the threshold, + * runs PlaywrightExtractor. + * 3. Emits a telemetry entry regardless of which path was taken. + * 4. Returns the best available result (fallback if it produced content, + * otherwise the primary result). + * + * @param url The URL to ingest. + * @returns The best {@link ExtractResult} available. + */ + async ingest(url: string): Promise<ExtractResult> { + const minLen = + this.config.minContentLength ?? DEFAULT_MIN_CONTENT_LENGTH; + + const primaryResult = await this.primaryExtractor.extract(url); + const primaryLen = primaryResult.text.length; + + if (!isFallbackEnabled(this.config) || primaryLen >= minLen) { + this.emitTelemetry({ + event: 'playwright_fallback', + triggered: false, + primaryContentLength: primaryLen, + fallbackContentLength: 0, + durationMs: 0, + success: false, + errorType: null, + provider: 'playwright', + }); + return primaryResult; + } + + // Fallback triggered. + const startMs = Date.now(); + let fallbackResult: ExtractResult = { text: '', url }; + let errorType: PlaywrightFallbackTelemetry['errorType'] = null; + + try { + fallbackResult = await this.playwrightExtractor.extract(url); + } catch (err) { + // PlaywrightExtractor should not throw, but guard anyway. + this.logger.warn( + '[IngestionService] Unexpected error from PlaywrightExtractor:', + err instanceof Error ? err.message : err + ); + errorType = 'navigation_error'; + } + + const durationMs = Date.now() - startMs; + const fallbackLen = fallbackResult.text.length; + const success = fallbackLen > 0; + + this.emitTelemetry({ + event: 'playwright_fallback', + triggered: true, + primaryContentLength: primaryLen, + fallbackContentLength: fallbackLen, + durationMs, + success, + errorType, + provider: 'playwright', + }); + + return success ? fallbackResult : primaryResult; + } + + private emitTelemetry(entry: PlaywrightFallbackTelemetry): void { + this.logger.info('[telemetry]', entry); + } +} diff --git a/tests/fixtures/playwright-fallback/js-heavy-page.html b/tests/fixtures/playwright-fallback/js-heavy-page.html new file mode 100644 index 00000000..a61b2f6d --- /dev/null +++ b/tests/fixtures/playwright-fallback/js-heavy-page.html @@ -0,0 +1,22 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="UTF-8"> + <title>JS-Heavy Page (fixture) + + + +
+ + diff --git a/tests/fixtures/playwright-fallback/static-page.html b/tests/fixtures/playwright-fallback/static-page.html new file mode 100644 index 00000000..8bd18866 --- /dev/null +++ b/tests/fixtures/playwright-fallback/static-page.html @@ -0,0 +1,15 @@ + + + + + Static Page (fixture) + + +

Static Page

+

This page has enough static content that the primary extractor can handle + it without needing the Playwright fallback. The text here is well over two + hundred characters so that a minContentLength check of 200 will pass and the + Playwright extractor will not be invoked at all for this fixture. The quick + brown fox jumped over the lazy dog.

+ + diff --git a/tests/lib/ingestion/add-command.test.ts b/tests/lib/ingestion/add-command.test.ts new file mode 100644 index 00000000..2e38321e --- /dev/null +++ b/tests/lib/ingestion/add-command.test.ts @@ -0,0 +1,106 @@ +/** + * Unit tests for src/cli/commands/add.ts + * + * Tests exercise the `runAdd` function using stubbed IngestionService + * instances. No real network requests or browser launches occur. + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { runAdd } from '../../../src/cli/commands/add.js'; +import * as serviceModule from '../../../src/lib/ingestion/service.js'; +import type { ExtractResult } from '../../../src/lib/ingestion/extractor.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function mockIngestResult(text: string): ExtractResult { + return { text, url: 'https://example.com' }; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('runAdd', () => { + let ingestSpy: ReturnType; + let stdoutSpy: ReturnType; + + beforeEach(() => { + // Spy on IngestionService.prototype.ingest so we can control the result. + ingestSpy = vi + .spyOn(serviceModule.IngestionService.prototype, 'ingest') + .mockResolvedValue(mockIngestResult('Ingested content')); + + // Capture stdout writes. + stdoutSpy = vi.spyOn(process.stdout, 'write').mockImplementation(() => true); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + it('writes ingested content to stdout by default', async () => { + await runAdd('https://example.com'); + + expect(stdoutSpy).toHaveBeenCalledWith('Ingested content'); + }); + + it('returns the ExtractResult', async () => { + const result = await runAdd('https://example.com'); + expect(result.text).toBe('Ingested content'); + expect(result.url).toBe('https://example.com'); + }); + + it('throws when URL is empty', async () => { + await expect(runAdd('')).rejects.toThrow('URL is required'); + }); + + it('passes playwrightFallback option to IngestionService', async () => { + // We can inspect the IngestionService constructor call via the spy on ingest. + await runAdd('https://example.com', { playwrightFallback: true }); + expect(ingestSpy).toHaveBeenCalledWith('https://example.com'); + }); + + it('writes to output file when --output is specified', async () => { + const writeFileSpy = vi.fn(); + + await runAdd('https://example.com', { + output: '/tmp/output.txt', + _writeFile: writeFileSpy, + }); + + expect(writeFileSpy).toHaveBeenCalledWith('/tmp/output.txt', 'Ingested content', 'utf8'); + // Should not also write to stdout. + expect(stdoutSpy).not.toHaveBeenCalled(); + }); + + it('appends newline to stdout when content does not end with one', async () => { + ingestSpy.mockResolvedValueOnce(mockIngestResult('No newline content')); + const writes: string[] = []; + stdoutSpy.mockImplementation((s: unknown) => { + writes.push(String(s)); + return true; + }); + + await runAdd('https://example.com'); + + expect(writes.join('')).toMatch(/\n$/); + }); + + it('does not append extra newline when content already ends with one', async () => { + ingestSpy.mockResolvedValueOnce(mockIngestResult('Content with newline\n')); + const writes: string[] = []; + stdoutSpy.mockImplementation((s: unknown) => { + writes.push(String(s)); + return true; + }); + + await runAdd('https://example.com'); + + // Should be exactly one newline at the end, not two. + const combined = writes.join(''); + expect(combined.endsWith('\n\n')).toBe(false); + expect(combined.endsWith('\n')).toBe(true); + }); +}); diff --git a/tests/lib/ingestion/extractor-fetch.test.ts b/tests/lib/ingestion/extractor-fetch.test.ts new file mode 100644 index 00000000..1a680b4f --- /dev/null +++ b/tests/lib/ingestion/extractor-fetch.test.ts @@ -0,0 +1,131 @@ +/** + * Unit tests for src/lib/ingestion/extractor-fetch.ts + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { htmlToText, extractTitle, FetchExtractor } from '../../../src/lib/ingestion/extractor-fetch.js'; + +// --------------------------------------------------------------------------- +// htmlToText +// --------------------------------------------------------------------------- + +describe('htmlToText', () => { + it('strips HTML tags', () => { + expect(htmlToText('

Hello world

')).toBe('Hello world'); + }); + + it('removes script blocks entirely', () => { + const html = '

Visible

After

'; + const text = htmlToText(html); + expect(text).not.toContain('var x'); + expect(text).toContain('Visible'); + expect(text).toContain('After'); + }); + + it('removes style blocks entirely', () => { + const html = '

Text

'; + const text = htmlToText(html); + expect(text).not.toContain('color'); + expect(text).toContain('Text'); + }); + + it('decodes common HTML entities', () => { + //   decodes to a regular space which is then trimmed by whitespace normalization. + expect(htmlToText('& < > " '  ')).toBe("& < > \" '"); + }); + + it('collapses whitespace', () => { + expect(htmlToText(' hello world ')).toBe('hello world'); + }); + + it('returns empty string for empty input', () => { + expect(htmlToText('')).toBe(''); + }); +}); + +// --------------------------------------------------------------------------- +// extractTitle +// --------------------------------------------------------------------------- + +describe('extractTitle', () => { + it('extracts a simple title', () => { + expect(extractTitle('My Page')).toBe('My Page'); + }); + + it('returns undefined when no title tag', () => { + expect(extractTitle('')).toBeUndefined(); + }); + + it('trims whitespace from title', () => { + expect(extractTitle(' Spaces ')).toBe('Spaces'); + }); +}); + +// --------------------------------------------------------------------------- +// FetchExtractor +// --------------------------------------------------------------------------- + +describe('FetchExtractor', () => { + const mockFetch = vi.fn(); + + beforeEach(() => { + vi.stubGlobal('fetch', mockFetch); + mockFetch.mockClear(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + }); + + function mockResponse(html: string) { + return { + ok: true, + text: vi.fn(async () => html), + }; + } + + it('returns extracted text from a successful fetch', async () => { + const html = 'Test

Hello world

'; + mockFetch.mockResolvedValueOnce(mockResponse(html)); + + const extractor = new FetchExtractor(); + const result = await extractor.extract('https://example.com'); + + expect(result.url).toBe('https://example.com'); + expect(result.text).toContain('Hello world'); + expect(result.title).toBe('Test'); + expect(result.html).toBe(html); + }); + + it('returns empty text on network failure', async () => { + mockFetch.mockRejectedValueOnce(new Error('Network error')); + + const extractor = new FetchExtractor(); + const result = await extractor.extract('https://unreachable.example.com'); + + expect(result.text).toBe(''); + expect(result.url).toBe('https://unreachable.example.com'); + }); + + it('returns empty text on abort (timeout)', async () => { + mockFetch.mockRejectedValueOnce( + Object.assign(new Error('The operation was aborted'), { name: 'AbortError' }) + ); + + const extractor = new FetchExtractor({ timeoutMs: 1 }); + const result = await extractor.extract('https://slow.example.com'); + + expect(result.text).toBe(''); + }); + + it('passes the User-Agent header', async () => { + const html = '

content

'; + mockFetch.mockResolvedValueOnce(mockResponse(html)); + + const extractor = new FetchExtractor({ userAgent: 'TestAgent/1.0' }); + await extractor.extract('https://example.com'); + + const [, requestInit] = mockFetch.mock.calls[0] as [string, RequestInit]; + expect((requestInit.headers as Record)['User-Agent']).toBe('TestAgent/1.0'); + }); +}); diff --git a/tests/lib/ingestion/extractor-playwright.test.ts b/tests/lib/ingestion/extractor-playwright.test.ts new file mode 100644 index 00000000..c0690039 --- /dev/null +++ b/tests/lib/ingestion/extractor-playwright.test.ts @@ -0,0 +1,291 @@ +/** + * Unit tests for src/lib/ingestion/extractor-playwright.ts + * + * Tests exercise the PlaywrightExtractor class using stubbed browser + * objects injected via the constructor's `loadPlaywrightFn` parameter. + * No real browser is launched and playwright is never imported. + * + * Testing strategy: + * - `PlaywrightExtractor` accepts an optional `loadPlaywrightFn` in its + * constructor. Tests pass a factory that returns controlled fake browser + * implementations instead of a real playwright module. + * - This matches the "mock/stub" option described in the issue's CI/Testing + * Strategy section and avoids ESM spy limitations. + */ + +import { describe, it, expect, vi } from 'vitest'; +import { + PlaywrightExtractor, + type BrowserModule, + type BrowserTypeLike, + type BrowserLike, + type BrowserContextLike, + type PageLike, +} from '../../../src/lib/ingestion/extractor-playwright.js'; + +// --------------------------------------------------------------------------- +// Helpers: minimal browser stubs +// --------------------------------------------------------------------------- + +function buildPageStub(overrides: Partial = {}): PageLike { + return { + goto: vi.fn(async () => null), + content: vi.fn(async () => '

Playwright content

'), + title: vi.fn(async () => 'Playwright Page'), + evaluate: vi.fn(async () => 'Playwright content'), + ...overrides, + }; +} + +function buildContextStub(page: PageLike): BrowserContextLike { + return { + newPage: vi.fn(async () => page), + close: vi.fn(async () => undefined), + }; +} + +function buildBrowserStub(context: BrowserContextLike): BrowserLike { + return { + newContext: vi.fn(async () => context), + close: vi.fn(async () => undefined), + }; +} + +function buildLauncherStub(browser: BrowserLike): BrowserTypeLike { + return { + launch: vi.fn(async () => browser), + }; +} + +function buildBrowserModule(chromium: BrowserTypeLike): BrowserModule { + return { + chromium, + firefox: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), + webkit: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), + }; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('PlaywrightExtractor', () => { + + // ------------------------------------------------------------------------- + // Happy path + // ------------------------------------------------------------------------- + + it('returns extracted text and html from a successful navigation', async () => { + const page = buildPageStub({ + content: vi.fn(async () => '

JS content

'), + title: vi.fn(async () => 'JS Page'), + evaluate: vi.fn(async () => 'JS content'), + }); + const launcher = buildLauncherStub(buildBrowserStub(buildContextStub(page))); + const extractor = new PlaywrightExtractor( + {}, + { warn: vi.fn() }, + async () => buildBrowserModule(launcher) + ); + + const result = await extractor.extract('https://example.com'); + + expect(result.text).toBe('JS content'); + expect(result.html).toContain('JS content'); + expect(result.title).toBe('JS Page'); + expect(result.url).toBe('https://example.com'); + }); + + it('uses a fresh browser context (storageState is undefined)', async () => { + const page = buildPageStub(); + const context = buildContextStub(page); + const browser = buildBrowserStub(context); + const launcher = buildLauncherStub(browser); + const extractor = new PlaywrightExtractor( + {}, + { warn: vi.fn() }, + async () => buildBrowserModule(launcher) + ); + + await extractor.extract('https://example.com'); + + expect(browser.newContext).toHaveBeenCalledWith({ storageState: undefined }); + }); + + it('closes the browser even when navigation succeeds', async () => { + const page = buildPageStub(); + const context = buildContextStub(page); + const browser = buildBrowserStub(context); + const launcher = buildLauncherStub(browser); + const extractor = new PlaywrightExtractor( + {}, + { warn: vi.fn() }, + async () => buildBrowserModule(launcher) + ); + + await extractor.extract('https://example.com'); + + expect(browser.close).toHaveBeenCalled(); + }); + + it('passes headless:true to browser.launch', async () => { + const page = buildPageStub(); + const context = buildContextStub(page); + const browser = buildBrowserStub(context); + const launcher = buildLauncherStub(browser); + const extractor = new PlaywrightExtractor( + {}, + { warn: vi.fn() }, + async () => buildBrowserModule(launcher) + ); + + await extractor.extract('https://example.com'); + + expect(launcher.launch).toHaveBeenCalledWith({ headless: true }); + }); + + it('passes configured timeout to page.goto', async () => { + const page = buildPageStub(); + const context = buildContextStub(page); + const browser = buildBrowserStub(context); + const launcher = buildLauncherStub(browser); + const extractor = new PlaywrightExtractor( + { timeoutMs: 5000 }, + { warn: vi.fn() }, + async () => buildBrowserModule(launcher) + ); + + await extractor.extract('https://example.com'); + + expect(page.goto).toHaveBeenCalledWith( + 'https://example.com', + expect.objectContaining({ timeout: 5000 }) + ); + }); + + // ------------------------------------------------------------------------- + // Graceful degradation: playwright not installed + // ------------------------------------------------------------------------- + + it('returns empty result and warns when playwright is not installed', async () => { + const warnSpy = vi.fn(); + const extractor = new PlaywrightExtractor( + {}, + { warn: warnSpy }, + async () => { throw new Error("Cannot find module 'playwright'"); } + ); + + const result = await extractor.extract('https://example.com'); + + expect(result.text).toBe(''); + expect(result.url).toBe('https://example.com'); + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('playwright package is not installed') + ); + }); + + // ------------------------------------------------------------------------- + // Graceful degradation: launch failure + // ------------------------------------------------------------------------- + + it('returns empty result and warns when browser fails to launch', async () => { + const warnSpy = vi.fn(); + const launcher: BrowserTypeLike = { + launch: vi.fn(async () => { throw new Error('Executable not found'); }), + }; + const extractor = new PlaywrightExtractor( + {}, + { warn: warnSpy }, + async () => buildBrowserModule(launcher) + ); + + const result = await extractor.extract('https://example.com'); + + expect(result.text).toBe(''); + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Failed to launch browser'), + expect.any(String) + ); + }); + + // ------------------------------------------------------------------------- + // Graceful degradation: navigation timeout + // ------------------------------------------------------------------------- + + it('returns empty result and warns on navigation timeout', async () => { + const warnSpy = vi.fn(); + const page = buildPageStub({ + goto: vi.fn(async () => { throw new Error('Timeout exceeded'); }), + }); + const extractor = new PlaywrightExtractor( + {}, + { warn: warnSpy }, + async () => buildBrowserModule( + buildLauncherStub(buildBrowserStub(buildContextStub(page))) + ) + ); + + const result = await extractor.extract('https://example.com'); + + expect(result.text).toBe(''); + expect(warnSpy).toHaveBeenCalledWith( + expect.stringMatching(/timeout|navigation_error/i), + expect.any(String) + ); + }); + + // ------------------------------------------------------------------------- + // Graceful degradation: navigation error (non-timeout) + // ------------------------------------------------------------------------- + + it('returns empty result and warns on non-timeout navigation error', async () => { + const warnSpy = vi.fn(); + const page = buildPageStub({ + goto: vi.fn(async () => { throw new Error('net::ERR_CONNECTION_REFUSED'); }), + }); + const extractor = new PlaywrightExtractor( + {}, + { warn: warnSpy }, + async () => buildBrowserModule( + buildLauncherStub(buildBrowserStub(buildContextStub(page))) + ) + ); + + const result = await extractor.extract('https://example.com'); + + expect(result.text).toBe(''); + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('navigation_error'), + expect.any(String) + ); + }); + + // ------------------------------------------------------------------------- + // Browser channel selection + // ------------------------------------------------------------------------- + + it('uses firefox when configured', async () => { + const firefoxPage = buildPageStub({ + evaluate: vi.fn(async () => 'Firefox content'), + }); + const firefoxLauncher = buildLauncherStub( + buildBrowserStub(buildContextStub(firefoxPage)) + ); + + const mod: BrowserModule = { + chromium: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), + firefox: firefoxLauncher, + webkit: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), + }; + + const extractor = new PlaywrightExtractor( + { browser: 'firefox' }, + { warn: vi.fn() }, + async () => mod + ); + const result = await extractor.extract('https://example.com'); + + expect(firefoxLauncher.launch).toHaveBeenCalled(); + expect(result.text).toBe('Firefox content'); + }); +}); diff --git a/tests/lib/ingestion/extractor.test.ts b/tests/lib/ingestion/extractor.test.ts new file mode 100644 index 00000000..6af2292c --- /dev/null +++ b/tests/lib/ingestion/extractor.test.ts @@ -0,0 +1,47 @@ +/** + * Unit tests for src/lib/ingestion/extractor.ts + * + * The Extractor interface and ExtractResult type are structural contracts; + * these tests verify they are exported with the expected shape. + */ + +import { describe, it, expect } from 'vitest'; +import type { Extractor, ExtractResult } from '../../../src/lib/ingestion/extractor.js'; + +describe('Extractor interface', () => { + it('can be implemented by a simple stub', async () => { + const stub: Extractor = { + async extract(url: string): Promise { + return { text: 'hello', url }; + }, + }; + + const result = await stub.extract('https://example.com'); + expect(result.text).toBe('hello'); + expect(result.url).toBe('https://example.com'); + }); + + it('ExtractResult has the expected fields', () => { + const result: ExtractResult = { + text: 'content', + html: '

content

', + title: 'Page title', + url: 'https://example.com', + }; + + expect(result.text).toBe('content'); + expect(result.html).toBe('

content

'); + expect(result.title).toBe('Page title'); + expect(result.url).toBe('https://example.com'); + }); + + it('ExtractResult html and title fields are optional', () => { + const result: ExtractResult = { + text: 'content', + url: 'https://example.com', + }; + + expect(result.html).toBeUndefined(); + expect(result.title).toBeUndefined(); + }); +}); diff --git a/tests/lib/ingestion/service.test.ts b/tests/lib/ingestion/service.test.ts new file mode 100644 index 00000000..9e5bdb54 --- /dev/null +++ b/tests/lib/ingestion/service.test.ts @@ -0,0 +1,287 @@ +/** + * Unit tests for src/lib/ingestion/service.ts + * + * Covers: + * - Primary-only path (fallback disabled or threshold not met) + * - Fallback trigger (short primary content + enabled flag) + * - Fallback via OB_PLAYWRIGHT_FALLBACK env var + * - Fallback returns empty → primary result is returned + * - Telemetry emission on every invocation + * - Graceful handling when PlaywrightExtractor throws unexpectedly + */ + +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; +import { IngestionService, type IngestionConfig, type PlaywrightFallbackTelemetry } from '../../../src/lib/ingestion/service.js'; +import type { Extractor, ExtractResult } from '../../../src/lib/ingestion/extractor.js'; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function makeExtractor(result: ExtractResult): Extractor { + return { extract: vi.fn(async () => result) }; +} + +function makeResult(text: string, url = 'https://example.com'): ExtractResult { + return { text, url }; +} + +function capturedTelemetry( + infoSpy: ReturnType +): PlaywrightFallbackTelemetry | undefined { + const call = infoSpy.mock.calls.find( + (c: unknown[]) => c[0] === '[telemetry]' + ); + return call ? (call[1] as PlaywrightFallbackTelemetry) : undefined; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('IngestionService', () => { + const URL = 'https://example.com'; + let infoSpy: ReturnType; + let warnSpy: ReturnType; + let logger: { info: typeof infoSpy; warn: typeof warnSpy }; + + beforeEach(() => { + infoSpy = vi.fn(); + warnSpy = vi.fn(); + logger = { info: infoSpy, warn: warnSpy }; + // Ensure env var is not set between tests. + delete process.env['OB_PLAYWRIGHT_FALLBACK']; + }); + + afterEach(() => { + delete process.env['OB_PLAYWRIGHT_FALLBACK']; + }); + + // ------------------------------------------------------------------------- + // Fallback disabled + // ------------------------------------------------------------------------- + + it('returns primary result when fallback is disabled (default)', async () => { + const primary = makeExtractor(makeResult('short', URL)); + const pw = makeExtractor(makeResult('playwright content', URL)); + + const svc = new IngestionService(primary, pw, {}, logger); + const result = await svc.ingest(URL); + + expect(result.text).toBe('short'); + expect(pw.extract).not.toHaveBeenCalled(); + }); + + it('returns primary result when fallback is disabled explicitly', async () => { + const primary = makeExtractor(makeResult('short', URL)); + const pw = makeExtractor(makeResult('playwright content', URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: false }, logger + ); + const result = await svc.ingest(URL); + + expect(result.text).toBe('short'); + expect(pw.extract).not.toHaveBeenCalled(); + }); + + // ------------------------------------------------------------------------- + // Fallback not triggered (primary content is sufficient) + // ------------------------------------------------------------------------- + + it('does not trigger fallback when primary content meets threshold', async () => { + const longText = 'a'.repeat(300); + const primary = makeExtractor(makeResult(longText, URL)); + const pw = makeExtractor(makeResult('playwright content', URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger + ); + const result = await svc.ingest(URL); + + expect(result.text).toBe(longText); + expect(pw.extract).not.toHaveBeenCalled(); + }); + + // ------------------------------------------------------------------------- + // Fallback triggered + // ------------------------------------------------------------------------- + + it('triggers fallback when primary content is below threshold', async () => { + const primary = makeExtractor(makeResult('too short', URL)); + const pw = makeExtractor(makeResult('Rich playwright content that is long enough', URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger + ); + const result = await svc.ingest(URL); + + expect(result.text).toBe('Rich playwright content that is long enough'); + expect(pw.extract).toHaveBeenCalledWith(URL); + }); + + it('returns primary result when fallback also returns empty', async () => { + const primary = makeExtractor(makeResult('short', URL)); + const pw = makeExtractor(makeResult('', URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger + ); + const result = await svc.ingest(URL); + + expect(result.text).toBe('short'); + }); + + // ------------------------------------------------------------------------- + // Environment variable activation + // ------------------------------------------------------------------------- + + it('activates fallback via OB_PLAYWRIGHT_FALLBACK=1', async () => { + process.env['OB_PLAYWRIGHT_FALLBACK'] = '1'; + const primary = makeExtractor(makeResult('short', URL)); + const pw = makeExtractor(makeResult('playwright result', URL)); + + const svc = new IngestionService(primary, pw, {}, logger); + const result = await svc.ingest(URL); + + expect(result.text).toBe('playwright result'); + expect(pw.extract).toHaveBeenCalled(); + }); + + it('activates fallback via OB_PLAYWRIGHT_FALLBACK=true', async () => { + process.env['OB_PLAYWRIGHT_FALLBACK'] = 'true'; + const primary = makeExtractor(makeResult('short', URL)); + const pw = makeExtractor(makeResult('playwright result', URL)); + + const svc = new IngestionService(primary, pw, {}, logger); + const result = await svc.ingest(URL); + + expect(result.text).toBe('playwright result'); + }); + + it('does not activate fallback via OB_PLAYWRIGHT_FALLBACK=0', async () => { + process.env['OB_PLAYWRIGHT_FALLBACK'] = '0'; + const primary = makeExtractor(makeResult('short', URL)); + const pw = makeExtractor(makeResult('playwright result', URL)); + + const svc = new IngestionService(primary, pw, {}, logger); + const result = await svc.ingest(URL); + + expect(result.text).toBe('short'); + expect(pw.extract).not.toHaveBeenCalled(); + }); + + // ------------------------------------------------------------------------- + // Telemetry + // ------------------------------------------------------------------------- + + it('emits telemetry with triggered=false when fallback is disabled', async () => { + const primary = makeExtractor(makeResult('some text', URL)); + const pw = makeExtractor(makeResult('', URL)); + + const svc = new IngestionService(primary, pw, {}, logger); + await svc.ingest(URL); + + const telemetry = capturedTelemetry(infoSpy); + expect(telemetry).toBeDefined(); + expect(telemetry!.event).toBe('playwright_fallback'); + expect(telemetry!.triggered).toBe(false); + expect(telemetry!.primaryContentLength).toBe('some text'.length); + expect(telemetry!.fallbackContentLength).toBe(0); + expect(telemetry!.durationMs).toBe(0); + expect(telemetry!.success).toBe(false); + expect(telemetry!.errorType).toBeNull(); + expect(telemetry!.provider).toBe('playwright'); + }); + + it('emits telemetry with triggered=true when fallback runs', async () => { + const primary = makeExtractor(makeResult('short', URL)); + const fallbackText = 'long fallback content from playwright'; + const pw = makeExtractor(makeResult(fallbackText, URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true }, logger + ); + await svc.ingest(URL); + + const telemetry = capturedTelemetry(infoSpy); + expect(telemetry!.triggered).toBe(true); + expect(telemetry!.primaryContentLength).toBe('short'.length); + expect(telemetry!.fallbackContentLength).toBe(fallbackText.length); + expect(telemetry!.success).toBe(true); + expect(telemetry!.errorType).toBeNull(); + expect(telemetry!.durationMs).toBeGreaterThanOrEqual(0); + }); + + it('emits telemetry with success=false when fallback returns empty', async () => { + const primary = makeExtractor(makeResult('tiny', URL)); + const pw = makeExtractor(makeResult('', URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true }, logger + ); + await svc.ingest(URL); + + const telemetry = capturedTelemetry(infoSpy); + expect(telemetry!.triggered).toBe(true); + expect(telemetry!.success).toBe(false); + expect(telemetry!.fallbackContentLength).toBe(0); + }); + + // ------------------------------------------------------------------------- + // Graceful handling of unexpected PlaywrightExtractor error + // ------------------------------------------------------------------------- + + it('handles unexpected PlaywrightExtractor throw gracefully', async () => { + const primary = makeExtractor(makeResult('short', URL)); + const pw: Extractor = { + extract: vi.fn(async () => { throw new Error('unexpected crash'); }), + }; + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true }, logger + ); + // Must not throw. + const result = await svc.ingest(URL); + + // Falls back to primary result. + expect(result.text).toBe('short'); + expect(warnSpy).toHaveBeenCalledWith( + expect.stringContaining('Unexpected error'), + expect.any(String) + ); + }); + + // ------------------------------------------------------------------------- + // Default minContentLength + // ------------------------------------------------------------------------- + + it('uses default minContentLength of 200 when not specified', async () => { + // Text of exactly 199 chars → should trigger fallback. + const shortText = 'a'.repeat(199); + const primary = makeExtractor(makeResult(shortText, URL)); + const pw = makeExtractor(makeResult('playwright result', URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true }, logger + ); + const result = await svc.ingest(URL); + + expect(result.text).toBe('playwright result'); + expect(pw.extract).toHaveBeenCalled(); + }); + + it('does not trigger fallback when text is exactly at default threshold (200 chars)', async () => { + const atThreshold = 'a'.repeat(200); + const primary = makeExtractor(makeResult(atThreshold, URL)); + const pw = makeExtractor(makeResult('playwright result', URL)); + + const svc = new IngestionService( + primary, pw, { playwrightFallback: true }, logger + ); + const result = await svc.ingest(URL); + + expect(result.text).toBe(atThreshold); + expect(pw.extract).not.toHaveBeenCalled(); + }); +}); From 66d6e56aac0cce21a4d64a5872b7b6aea2f8d087 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:36:54 +0000 Subject: [PATCH 3/4] fix: address CodeQL and code review feedback - extractor-fetch.ts: fix bad-tag-filter (allow spaces before closing >) and double-escaping (use single-pass entity decoder) - extractor-playwright.ts: replace globalThis any cast with typed interface, add explanatory comment for the optional-peer-dep import cast - add-command.test.ts: rename s -> chunk for clarity Agent-Logs-Url: https://github.com/TheWizardsCode/ContextHub/sessions/511715b4-dc90-4fe1-8664-cd673580f894 Co-authored-by: SorraTheOrc <250240+SorraTheOrc@users.noreply.github.com> --- src/lib/ingestion/extractor-fetch.ts | 27 ++++++++++++++--------- src/lib/ingestion/extractor-playwright.ts | 20 ++++++++++++----- tests/lib/ingestion/add-command.test.ts | 8 +++---- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts index 55fe33ef..d34acc7e 100644 --- a/src/lib/ingestion/extractor-fetch.ts +++ b/src/lib/ingestion/extractor-fetch.ts @@ -31,22 +31,27 @@ const DEFAULT_USER_AGENT = * Returns the resulting plain text. */ export function htmlToText(html: string): string { - // Remove script and style blocks entirely. + // Remove script and style blocks entirely (allow optional whitespace before closing >). let text = html - .replace(//gi, ' ') - .replace(//gi, ' '); + .replace(//gi, ' ') + .replace(//gi, ' '); // Replace block-level elements with newlines. text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n'); // Strip all remaining tags. text = text.replace(/<[^>]+>/g, ' '); - // Decode common HTML entities. - text = text - .replace(/&/g, '&') - .replace(/</g, '<') - .replace(/>/g, '>') - .replace(/"/g, '"') - .replace(/'/g, "'") - .replace(/ /g, ' '); + // Decode common HTML entities in a single pass to avoid double-decoding + // (e.g. &lt; must become <, not <). + text = text.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (match) => { + switch (match) { + case '&': return '&'; + case '<': return '<'; + case '>': return '>'; + case '"': return '"'; + case ''': return "'"; + case ' ': return ' '; + default: return match; + } + }); // Collapse whitespace. return text.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim(); } diff --git a/src/lib/ingestion/extractor-playwright.ts b/src/lib/ingestion/extractor-playwright.ts index 6b404c9e..e8d6a87d 100644 --- a/src/lib/ingestion/extractor-playwright.ts +++ b/src/lib/ingestion/extractor-playwright.ts @@ -140,10 +140,15 @@ export class PlaywrightExtractor implements Extractor { const html = await page.content(); const title = await page.title().catch(() => undefined); - // evaluate runs in the browser context where `document` is available; - // cast through globalThis to avoid TypeScript's non-DOM lib complaint. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const text = await page.evaluate(() => (globalThis as any).document?.body?.innerText ?? ''); + // evaluate() runs in the browser context where `document` is available. + // Cast through a minimal interface to avoid TypeScript's non-DOM lib + // complaint without resorting to an untyped `any`. + const text = await page.evaluate(() => { + const g = globalThis as unknown as { + document?: { body?: { innerText?: string } }; + }; + return g.document?.body?.innerText ?? ''; + }); await context.close(); @@ -201,10 +206,15 @@ export interface BrowserModule { * time (via the `loadPlaywrightFn` constructor parameter) without needing to * actually install playwright. * + * The `any` cast is intentional: `playwright` is an optional peer dependency + * that may not be present at compile time, so its types are unavailable. + * The {@link BrowserModule} interface documents the exact subset we rely on. + * * @internal */ export async function loadPlaywright(): Promise { + // playwright is an optional peer dependency; the cast is deliberate. // eslint-disable-next-line @typescript-eslint/no-explicit-any - const mod = await import('playwright' as any) as BrowserModule; + const mod = (await import('playwright' as any)) as BrowserModule; return mod; } diff --git a/tests/lib/ingestion/add-command.test.ts b/tests/lib/ingestion/add-command.test.ts index 2e38321e..9ff780f0 100644 --- a/tests/lib/ingestion/add-command.test.ts +++ b/tests/lib/ingestion/add-command.test.ts @@ -78,8 +78,8 @@ describe('runAdd', () => { it('appends newline to stdout when content does not end with one', async () => { ingestSpy.mockResolvedValueOnce(mockIngestResult('No newline content')); const writes: string[] = []; - stdoutSpy.mockImplementation((s: unknown) => { - writes.push(String(s)); + stdoutSpy.mockImplementation((chunk: unknown) => { + writes.push(String(chunk)); return true; }); @@ -91,8 +91,8 @@ describe('runAdd', () => { it('does not append extra newline when content already ends with one', async () => { ingestSpy.mockResolvedValueOnce(mockIngestResult('Content with newline\n')); const writes: string[] = []; - stdoutSpy.mockImplementation((s: unknown) => { - writes.push(String(s)); + stdoutSpy.mockImplementation((chunk: unknown) => { + writes.push(String(chunk)); return true; }); From 2c2456b41c6d545b25036e129b73badc52584545 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 4 Apr 2026 23:39:41 +0000 Subject: [PATCH 4/4] fix: remaining CodeQL and code review issues - extractor-fetch.ts: use [^>]* in script/style closing tag regex to handle any attributes (fixes CodeQL bad-tag-filter); document numeric entity limitation - add.ts: add parsePositiveInt helper with validation and user-friendly error messages for --min-content-length and --timeout options Agent-Logs-Url: https://github.com/TheWizardsCode/ContextHub/sessions/511715b4-dc90-4fe1-8664-cd673580f894 Co-authored-by: SorraTheOrc <250240+SorraTheOrc@users.noreply.github.com> --- src/cli/commands/add.ts | 23 +++++++++++++++++------ src/lib/ingestion/extractor-fetch.ts | 11 +++++++---- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/src/cli/commands/add.ts b/src/cli/commands/add.ts index 1c822e61..591f40cb 100644 --- a/src/cli/commands/add.ts +++ b/src/cli/commands/add.ts @@ -132,14 +132,25 @@ export function registerAddCommand(program: { ) .option('--output ', 'Write ingested content to file instead of stdout') .action(async (url: string, opts: Record) => { + const parsePositiveInt = ( + raw: string | boolean | undefined, + name: string + ): number | undefined => { + if (raw === undefined || raw === false || raw === '') return undefined; + const n = parseInt(String(raw), 10); + if (isNaN(n) || n <= 0) { + process.stderr.write( + `[ob add] Error: --${name} must be a positive integer (received: ${String(raw)})\n` + ); + process.exit(1); + } + return n; + }; + const options: AddCommandOptions = { playwrightFallback: Boolean(opts['playwrightFallback']), - minContentLength: opts['minContentLength'] - ? parseInt(String(opts['minContentLength']), 10) - : undefined, - timeout: opts['timeout'] - ? parseInt(String(opts['timeout']), 10) - : undefined, + minContentLength: parsePositiveInt(opts['minContentLength'], 'min-content-length'), + timeout: parsePositiveInt(opts['timeout'], 'timeout'), browser: opts['browser'] as PlaywrightBrowserChannel | undefined, output: opts['output'] ? String(opts['output']) : undefined, }; diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts index d34acc7e..c9fa0f4f 100644 --- a/src/lib/ingestion/extractor-fetch.ts +++ b/src/lib/ingestion/extractor-fetch.ts @@ -31,16 +31,19 @@ const DEFAULT_USER_AGENT = * Returns the resulting plain text. */ export function htmlToText(html: string): string { - // Remove script and style blocks entirely (allow optional whitespace before closing >). + // Remove script and style blocks entirely (allow attributes/whitespace before closing >). let text = html - .replace(//gi, ' ') - .replace(//gi, ' '); + .replace(/]*>/gi, ' ') + .replace(/]*>/gi, ' '); // Replace block-level elements with newlines. text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n'); // Strip all remaining tags. text = text.replace(/<[^>]+>/g, ' '); - // Decode common HTML entities in a single pass to avoid double-decoding + // Decode common named HTML entities in a single pass to avoid double-decoding // (e.g. &lt; must become <, not <). + // Note: numeric character references (e.g. A or A) are intentionally + // not decoded here since they are uncommon in plain-content pages and the + // purpose of this extractor is fast-path text extraction, not full HTML parsing. text = text.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (match) => { switch (match) { case '&': return '&';