From 7492cd598f56c6295feac7e517f236f406b662a4 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:21:42 +0000
Subject: [PATCH 1/4] Initial plan


From f8d2c079e1a2a0865720db5b496604dea99cbfcd Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:34:24 +0000
Subject: [PATCH 2/4] feat: add Playwright fallback extractor for JS-heavy URL
 ingestion

Implements:
- src/lib/ingestion/extractor.ts: Extractor interface + ExtractResult type
- src/lib/ingestion/extractor-fetch.ts: fast-path HTTP fetch extractor
- src/lib/ingestion/extractor-playwright.ts: PlaywrightExtractor (opt-in,
  dynamic import, fresh context per run, configurable timeout, graceful
  degradation)
- src/lib/ingestion/service.ts: IngestionService with primary + fallback
  orchestration, telemetry emission, OB_PLAYWRIGHT_FALLBACK env var support
- src/cli/commands/add.ts: 'ob add <url>' CLI command
- package.json: playwright as optional peer dependency
- 47 unit tests (no real browser required in CI)
- tests/fixtures/playwright-fallback/: HTML fixtures for documentation

Agent-Logs-Url: https://github.com/TheWizardsCode/ContextHub/sessions/511715b4-dc90-4fe1-8664-cd673580f894

Co-authored-by: SorraTheOrc <250240+SorraTheOrc@users.noreply.github.com>
---
 package.json                                  |   8 +
 src/cli/commands/add.ts                       | 168 ++++++++++
 src/lib/ingestion/extractor-fetch.ts          | 108 +++++++
 src/lib/ingestion/extractor-playwright.ts     | 210 +++++++++++++
 src/lib/ingestion/extractor.ts                |  37 +++
 src/lib/ingestion/service.ts                  | 189 ++++++++++++
 .../playwright-fallback/js-heavy-page.html    |  22 ++
 .../playwright-fallback/static-page.html      |  15 +
 tests/lib/ingestion/add-command.test.ts       | 106 +++++++
 tests/lib/ingestion/extractor-fetch.test.ts   | 131 ++++++++
 .../ingestion/extractor-playwright.test.ts    | 291 ++++++++++++++++++
 tests/lib/ingestion/extractor.test.ts         |  47 +++
 tests/lib/ingestion/service.test.ts           | 287 +++++++++++++++++
 13 files changed, 1619 insertions(+)
 create mode 100644 src/cli/commands/add.ts
 create mode 100644 src/lib/ingestion/extractor-fetch.ts
 create mode 100644 src/lib/ingestion/extractor-playwright.ts
 create mode 100644 src/lib/ingestion/extractor.ts
 create mode 100644 src/lib/ingestion/service.ts
 create mode 100644 tests/fixtures/playwright-fallback/js-heavy-page.html
 create mode 100644 tests/fixtures/playwright-fallback/static-page.html
 create mode 100644 tests/lib/ingestion/add-command.test.ts
 create mode 100644 tests/lib/ingestion/extractor-fetch.test.ts
 create mode 100644 tests/lib/ingestion/extractor-playwright.test.ts
 create mode 100644 tests/lib/ingestion/extractor.test.ts
 create mode 100644 tests/lib/ingestion/service.test.ts
diff --git a/package.json b/package.json
index 426b1489..ed2e8179 100644
--- a/package.json
+++ b/package.json
@@ -36,6 +36,14 @@
     "express": "^4.18.2",
     "js-yaml": "^4.1.1"
   },
+  "peerDependencies": {
+    "playwright": ">=1.40.0"
+  },
+  "peerDependenciesMeta": {
+    "playwright": {
+      "optional": true
+    }
+  },
   "devDependencies": {
     "@types/blessed": "^0.1.7",
     "@types/better-sqlite3": "^7.6.13",
diff --git a/src/cli/commands/add.ts b/src/cli/commands/add.ts
new file mode 100644
index 00000000..1c822e61
--- /dev/null
+++ b/src/cli/commands/add.ts
@@ -0,0 +1,168 @@
+/**
+ * `ob add <url>` CLI command.
+ *
+ * Ingests content from a URL using the fast-path primary extractor, falling
+ * back to a Playwright headless-browser fetch when:
+ *   - The Playwright fallback is explicitly enabled (via config or environment
+ *     variable `OB_PLAYWRIGHT_FALLBACK=1`), AND
+ *   - The primary extractor returns fewer characters than the configured
+ *     minimum content length threshold.
+ *
+ * Usage:
+ *   ob add <url> [--playwright-fallback] [--min-content-length <n>] [--timeout <ms>]
+ *
+ * The command writes the ingested content to stdout (or a file if `--output`
+ * is provided) in a format compatible with the downstream ingestion pipeline.
+ * It exits with a non-zero status code only on unrecoverable errors (e.g. an
+ * invalid URL); partial or empty content from a gracefully-degraded fetch is
+ * not treated as a fatal error.
+ */
+
+import { IngestionService, type IngestionConfig } from '../../lib/ingestion/service.js';
+import { FetchExtractor } from '../../lib/ingestion/extractor-fetch.js';
+import { PlaywrightExtractor, type PlaywrightBrowserChannel } from '../../lib/ingestion/extractor-playwright.js';
+import type { ExtractResult } from '../../lib/ingestion/extractor.js';
+import * as fs from 'fs';
+
+/** Options accepted by the `add` command. */
+export interface AddCommandOptions {
+  /** Enable the Playwright fallback (overrides config). */
+  playwrightFallback?: boolean;
+  /** Minimum content length before fallback is triggered. */
+  minContentLength?: number;
+  /** Playwright navigation timeout in milliseconds. */
+  timeout?: number;
+  /** Browser channel to use for Playwright. */
+  browser?: PlaywrightBrowserChannel;
+  /** Write output to this file path instead of stdout. */
+  output?: string;
+  /**
+   * Injectable file-write function for testing.
+   * Defaults to `fs.writeFileSync` when not provided.
+   * @internal
+   */
+  _writeFile?: (path: string, data: string, encoding: BufferEncoding) => void;
+}
+
+/**
+ * Dependency-injectable runner for the `add` command.
+ *
+ * Separated from Commander.js setup so it can be unit-tested without starting
+ * a real CLI process.
+ *
+ * @param url The URL to ingest.
+ * @param options Command options.
+ * @returns The ingested {@link ExtractResult}.
+ */
+export async function runAdd(
+  url: string,
+  options: AddCommandOptions = {}
+): Promise<ExtractResult> {
+  if (!url) {
+    throw new Error('URL is required');
+  }
+
+  const ingestionConfig: IngestionConfig = {
+    playwrightFallback: options.playwrightFallback ?? false,
+    minContentLength: options.minContentLength,
+  };
+
+  const playwrightExtractor = new PlaywrightExtractor({
+    browser: options.browser ?? 'chromium',
+    timeoutMs: options.timeout ?? 30_000,
+  });
+
+  const service = new IngestionService(
+    new FetchExtractor(),
+    playwrightExtractor,
+    ingestionConfig
+  );
+
+  const result = await service.ingest(url);
+
+  if (options.output) {
+    const writeFile = options._writeFile ?? fs.writeFileSync;
+    writeFile(options.output, result.text, 'utf8');
+  } else {
+    process.stdout.write(result.text);
+    if (result.text.length > 0 && !result.text.endsWith('\n')) {
+      process.stdout.write('\n');
+    }
+  }
+
+  return result;
+}
+
+// ---------------------------------------------------------------------------
+// Commander.js registration (optional — used when loaded as a plugin command)
+// ---------------------------------------------------------------------------
+
+/**
+ * Register the `add` command with a Commander.js program.
+ *
+ * This function follows the same pattern as other commands in `src/commands/`.
+ *
+ * @param program The root Commander.js Command instance.
+ */
+export function registerAddCommand(program: {
+  command: (name: string) => CommandBuilder;
+}): void {
+  program
+    .command('add <url>')
+    .description('Ingest content from a URL (with optional Playwright fallback for JS-heavy pages)')
+    .option(
+      '--playwright-fallback',
+      'Enable Playwright headless-browser fallback (requires `playwright` package)',
+      false
+    )
+    .option(
+      '--min-content-length <n>',
+      'Minimum character count before Playwright fallback is triggered',
+      '200'
+    )
+    .option(
+      '--timeout <ms>',
+      'Playwright navigation timeout in milliseconds',
+      '30000'
+    )
+    .option(
+      '--browser <channel>',
+      'Playwright browser channel: chromium | firefox | webkit',
+      'chromium'
+    )
+    .option('--output <file>', 'Write ingested content to file instead of stdout')
+    .action(async (url: string, opts: Record<string, string | boolean>) => {
+      const options: AddCommandOptions = {
+        playwrightFallback: Boolean(opts['playwrightFallback']),
+        minContentLength: opts['minContentLength']
+          ? parseInt(String(opts['minContentLength']), 10)
+          : undefined,
+        timeout: opts['timeout']
+          ? parseInt(String(opts['timeout']), 10)
+          : undefined,
+        browser: opts['browser'] as PlaywrightBrowserChannel | undefined,
+        output: opts['output'] ? String(opts['output']) : undefined,
+      };
+
+      try {
+        await runAdd(url, options);
+      } catch (err) {
+        process.stderr.write(
+          `[ob add] Error: ${err instanceof Error ? err.message : String(err)}\n`
+        );
+        process.exit(1);
+      }
+    });
+}
+
+// ---------------------------------------------------------------------------
+// Minimal type shim for Commander.js .option / .action chaining
+// (avoids requiring a Commander.js type import in this standalone module)
+// ---------------------------------------------------------------------------
+
+interface CommandBuilder {
+  description: (desc: string) => CommandBuilder;
+  option: (flags: string, description: string, defaultValue?: string | boolean) => CommandBuilder;
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  action: (fn: (...args: any[]) => void | Promise<void>) => CommandBuilder;
+}
diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts
new file mode 100644
index 00000000..55fe33ef
--- /dev/null
+++ b/src/lib/ingestion/extractor-fetch.ts
@@ -0,0 +1,108 @@
+/**
+ * Primary fast-path extractor.
+ *
+ * Fetches a URL using the built-in `fetch` API and extracts plain text by
+ * stripping HTML tags.  This is the default extractor used by the ingestion
+ * service; it is fast but cannot handle pages that render content with
+ * client-side JavaScript.
+ */
+
+import type { Extractor, ExtractResult } from './extractor.js';
+
+/** Configuration for the primary fetch-based extractor. */
+export interface FetchExtractorConfig {
+  /**
+   * Request timeout in milliseconds.
+   * @default 15000
+   */
+  timeoutMs?: number;
+  /**
+   * User-Agent string to send with requests.
+   */
+  userAgent?: string;
+}
+
+const DEFAULT_TIMEOUT_MS = 15_000;
+const DEFAULT_USER_AGENT =
+  'Mozilla/5.0 (compatible; OpenBrain-Ingestion/1.0; +https://github.com/TheWizardsCode/ContextHub)';
+
+/**
+ * Strips HTML tags and collapses whitespace from `html`.
+ * Returns the resulting plain text.
+ */
+export function htmlToText(html: string): string {
+  // Remove script and style blocks entirely.
+  let text = html
+    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
+    .replace(/<style[\s\S]*?<\/style>/gi, ' ');
+  // Replace block-level elements with newlines.
+  text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n');
+  // Strip all remaining tags.
+  text = text.replace(/<[^>]+>/g, ' ');
+  // Decode common HTML entities.
+  text = text
+    .replace(/&amp;/g, '&')
+    .replace(/&lt;/g, '<')
+    .replace(/&gt;/g, '>')
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&nbsp;/g, ' ');
+  // Collapse whitespace.
+  return text.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim();
+}
+
+/**
+ * Extracts the content of the first `<title>` tag in `html`.
+ */
+export function extractTitle(html: string): string | undefined {
+  const match = /<title[^>]*>([^<]*)<\/title>/i.exec(html);
+  return match ? match[1].trim() : undefined;
+}
+
+/**
+ * Primary extractor that uses HTTP fetch + HTML-to-text conversion.
+ *
+ * This extractor is fast and has no external runtime dependencies.  It is
+ * the default first-pass extractor used by the ingestion service before
+ * attempting the Playwright fallback.
+ */
+export class FetchExtractor implements Extractor {
+  private readonly config: Required<FetchExtractorConfig>;
+
+  constructor(config: FetchExtractorConfig = {}) {
+    this.config = {
+      timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS,
+      userAgent: config.userAgent ?? DEFAULT_USER_AGENT,
+    };
+  }
+
+  async extract(url: string): Promise<ExtractResult> {
+    let html: string;
+
+    try {
+      const controller = new AbortController();
+      const timer = setTimeout(
+        () => controller.abort(),
+        this.config.timeoutMs
+      );
+      try {
+        const response = await fetch(url, {
+          signal: controller.signal,
+          headers: { 'User-Agent': this.config.userAgent },
+        });
+        html = await response.text();
+      } finally {
+        clearTimeout(timer);
+      }
+    } catch {
+      return { text: '', url };
+    }
+
+    return {
+      text: htmlToText(html),
+      html,
+      title: extractTitle(html),
+      url,
+    };
+  }
+}
diff --git a/src/lib/ingestion/extractor-playwright.ts b/src/lib/ingestion/extractor-playwright.ts
new file mode 100644
index 00000000..6b404c9e
--- /dev/null
+++ b/src/lib/ingestion/extractor-playwright.ts
@@ -0,0 +1,210 @@
+/**
+ * Playwright-based fallback extractor.
+ *
+ * Uses a headless Chromium (or configured) browser to render JavaScript-heavy
+ * pages that the primary fast-path extractor cannot handle.
+ *
+ * Design principles:
+ * - Playwright is imported dynamically at runtime so the module does not fail
+ *   to load when the `playwright` package is absent.
+ * - Each invocation uses a fresh browser context to avoid credential leakage
+ *   (no cookies, local storage, or auth tokens are shared between runs).
+ * - All failures (launch error, navigation timeout, navigation error) are
+ *   handled gracefully: the extractor returns an empty result and logs a
+ *   warning rather than throwing.
+ */
+
+import type { Extractor, ExtractResult } from './extractor.js';
+
+/** Playwright browser channel to launch. */
+export type PlaywrightBrowserChannel = 'chromium' | 'firefox' | 'webkit';
+
+/** Configuration for PlaywrightExtractor. */
+export interface PlaywrightExtractorConfig {
+  /**
+   * Which browser to use.
+   * @default 'chromium'
+   */
+  browser?: PlaywrightBrowserChannel;
+  /**
+   * Navigation timeout in milliseconds.
+   * @default 30000
+   */
+  timeoutMs?: number;
+}
+
+/**
+ * Classifies errors that the PlaywrightExtractor can encounter.
+ * Used in telemetry payloads.
+ */
+export type PlaywrightErrorType =
+  | 'launch_failed'
+  | 'timeout'
+  | 'navigation_error'
+  | null;
+
+/** Internal logger type so tests can inject a spy. */
+export type Logger = {
+  warn: (message: string, ...meta: unknown[]) => void;
+};
+
+const DEFAULT_TIMEOUT_MS = 30_000;
+
+/**
+ * Playwright-based URL extractor.
+ *
+ * Implements the same {@link Extractor} interface as the primary extractor so
+ * it can be used as a drop-in replacement or fallback in the ingestion service.
+ */
+export class PlaywrightExtractor implements Extractor {
+  private readonly config: Required<PlaywrightExtractorConfig>;
+  private readonly logger: Logger;
+  private readonly loadPlaywrightFn: () => Promise<BrowserModule>;
+
+  /**
+   * @param config         Browser configuration.
+   * @param logger         Optional logger; defaults to `console`.
+   * @param loadPlaywrightFn  Optional override for the playwright loader.
+   *                       Inject a stub here in tests to avoid a real import.
+   */
+  constructor(
+    config: PlaywrightExtractorConfig = {},
+    logger?: Logger,
+    loadPlaywrightFn?: () => Promise<BrowserModule>
+  ) {
+    this.config = {
+      browser: config.browser ?? 'chromium',
+      timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS,
+    };
+    this.logger = logger ?? console;
+    this.loadPlaywrightFn = loadPlaywrightFn ?? loadPlaywright;
+  }
+
+  /**
+   * Fetch and extract content from `url` using a headless browser.
+   *
+   * Returns an empty {@link ExtractResult} if Playwright is not installed,
+   * the browser fails to launch, or navigation times out.
+   */
+  async extract(url: string): Promise<ExtractResult> {
+    let browserModule: BrowserModule | null = null;
+
+    // Dynamic import guard: fail gracefully if playwright is not installed.
+    try {
+      browserModule = await this.loadPlaywrightFn();
+    } catch {
+      this.logger.warn(
+        '[PlaywrightExtractor] playwright package is not installed. ' +
+          'Install it with: npm install playwright'
+      );
+      return { text: '', url };
+    }
+
+    const launcher = browserModule[this.config.browser];
+    let browser: BrowserLike | null = null;
+
+    try {
+      browser = await launcher.launch({ headless: true });
+    } catch (err) {
+      this.logger.warn(
+        '[PlaywrightExtractor] Failed to launch browser:',
+        err instanceof Error ? err.message : err
+      );
+      return { text: '', url };
+    }
+
+    try {
+      // Fresh context — no cookies, localStorage, or auth tokens.
+      const context = await browser.newContext({
+        storageState: undefined,
+      });
+      const page = await context.newPage();
+
+      try {
+        await page.goto(url, {
+          timeout: this.config.timeoutMs,
+          waitUntil: 'domcontentloaded',
+        });
+      } catch (err) {
+        const isTimeout =
+          err instanceof Error && err.message.includes('Timeout');
+        const errorType: PlaywrightErrorType = isTimeout
+          ? 'timeout'
+          : 'navigation_error';
+        this.logger.warn(
+          `[PlaywrightExtractor] Navigation ${errorType} for ${url}:`,
+          err instanceof Error ? err.message : err
+        );
+        return { text: '', url };
+      }
+
+      const html = await page.content();
+      const title = await page.title().catch(() => undefined);
+      // evaluate runs in the browser context where `document` is available;
+      // cast through globalThis to avoid TypeScript's non-DOM lib complaint.
+      // eslint-disable-next-line @typescript-eslint/no-explicit-any
+      const text = await page.evaluate(() => (globalThis as any).document?.body?.innerText ?? '');
+
+      await context.close();
+
+      return { text, html, title, url };
+    } finally {
+      try {
+        await browser.close();
+      } catch {
+        // Ignore close errors.
+      }
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Internal browser-module loading abstraction (facilitates testing)
+// ---------------------------------------------------------------------------
+
+/** Minimal shape of a Playwright browser launcher we use. */
+export interface BrowserTypeLike {
+  launch(options: { headless: boolean }): Promise<BrowserLike>;
+}
+
+/** Minimal shape of a Playwright Browser we use. */
+export interface BrowserLike {
+  newContext(options: { storageState: undefined }): Promise<BrowserContextLike>;
+  close(): Promise<void>;
+}
+
+/** Minimal shape of a Playwright BrowserContext we use. */
+export interface BrowserContextLike {
+  newPage(): Promise<PageLike>;
+  close(): Promise<void>;
+}
+
+/** Minimal shape of a Playwright Page we use. */
+export interface PageLike {
+  goto(url: string, options: { timeout: number; waitUntil: string }): Promise<unknown>;
+  content(): Promise<string>;
+  title(): Promise<string>;
+  evaluate<T>(fn: () => T): Promise<T>;
+}
+
+/** The subset of the playwright module that PlaywrightExtractor uses. */
+export interface BrowserModule {
+  chromium: BrowserTypeLike;
+  firefox: BrowserTypeLike;
+  webkit: BrowserTypeLike;
+}
+
+/**
+ * Load the playwright module.
+ *
+ * Isolated into a separate function so callers can override it at construction
+ * time (via the `loadPlaywrightFn` constructor parameter) without needing to
+ * actually install playwright.
+ *
+ * @internal
+ */
+export async function loadPlaywright(): Promise<BrowserModule> {
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const mod = await import('playwright' as any) as BrowserModule;
+  return mod;
+}
diff --git a/src/lib/ingestion/extractor.ts b/src/lib/ingestion/extractor.ts
new file mode 100644
index 00000000..6b6bef91
--- /dev/null
+++ b/src/lib/ingestion/extractor.ts
@@ -0,0 +1,37 @@
+/**
+ * Extractor interface for URL content retrieval.
+ *
+ * All extractors (primary fast-path and Playwright fallback) implement this
+ * interface so they can be swapped in the ingestion pipeline without changes
+ * to the service layer.
+ */
+
+/**
+ * The result of extracting content from a URL.
+ */
+export interface ExtractResult {
+  /** The plain-text content extracted from the page. */
+  text: string;
+  /** The raw HTML of the page (optional; used for downstream parsing). */
+  html?: string;
+  /** The page title, if available. */
+  title?: string;
+  /** The URL that was fetched (may differ from the requested URL after redirects). */
+  url: string;
+}
+
+/**
+ * Interface that all URL extractors must implement.
+ */
+export interface Extractor {
+  /**
+   * Fetch and extract content from the given URL.
+   *
+   * Implementations must not throw. Failures are surfaced via a result with
+   * an empty `text` field (and optionally an error logged internally).
+   *
+   * @param url The URL to fetch.
+   * @returns A promise resolving to the extracted content.
+   */
+  extract(url: string): Promise<ExtractResult>;
+}
diff --git a/src/lib/ingestion/service.ts b/src/lib/ingestion/service.ts
new file mode 100644
index 00000000..36549a15
--- /dev/null
+++ b/src/lib/ingestion/service.ts
@@ -0,0 +1,189 @@
+/**
+ * Ingestion service with Playwright fallback orchestration.
+ *
+ * Orchestrates URL content retrieval using a fast primary extractor and an
+ * optional Playwright-based fallback for JavaScript-heavy pages.
+ *
+ * The fallback is opt-in: it is only activated when:
+ *   1. `config.playwrightFallback` is `true` (or the `OB_PLAYWRIGHT_FALLBACK`
+ *      environment variable is set to a truthy value), AND
+ *   2. The primary extractor returns fewer characters than
+ *      `config.minContentLength` (default: 200).
+ *
+ * On every invocation, a structured telemetry entry is emitted via the
+ * provided logger.  Telemetry never includes URL text, page content, or any
+ * user-identifiable data.
+ */
+
+import type { Extractor, ExtractResult } from './extractor.js';
+
+/** Configuration for the ingestion service. */
+export interface IngestionConfig {
+  /**
+   * Enable the Playwright fallback extractor.
+   *
+   * Can also be enabled by setting the `OB_PLAYWRIGHT_FALLBACK` environment
+   * variable to any truthy value (`1`, `true`, `yes`).
+   *
+   * @default false
+   */
+  playwrightFallback?: boolean;
+
+  /**
+   * Minimum character count that primary extractor content must reach before
+   * the Playwright fallback is considered unnecessary.
+   *
+   * @default 200
+   */
+  minContentLength?: number;
+}
+
+// ---------------------------------------------------------------------------
+// Telemetry
+// ---------------------------------------------------------------------------
+
+/** Structured telemetry entry emitted on every ingestion run. */
+export interface PlaywrightFallbackTelemetry {
+  /** Fixed event identifier. */
+  event: 'playwright_fallback';
+  /** Whether the fallback was actually triggered. */
+  triggered: boolean;
+  /** Character count returned by the primary extractor. */
+  primaryContentLength: number;
+  /** Character count returned by PlaywrightExtractor (0 if not triggered or failed). */
+  fallbackContentLength: number;
+  /** Wall-clock time of the Playwright fetch in milliseconds (0 if not triggered). */
+  durationMs: number;
+  /** Whether PlaywrightExtractor returned usable content. */
+  success: boolean;
+  /** Error classification, or null when no error occurred. */
+  errorType: 'launch_failed' | 'timeout' | 'navigation_error' | null;
+  /** Always "playwright" for this fallback. */
+  provider: 'playwright';
+}
+
+/** Minimal logger interface used by the service. */
+export interface ServiceLogger {
+  info: (message: string, meta?: PlaywrightFallbackTelemetry) => void;
+  warn: (message: string, ...args: unknown[]) => void;
+}
+
+// ---------------------------------------------------------------------------
+// IngestionService
+// ---------------------------------------------------------------------------
+
+const DEFAULT_MIN_CONTENT_LENGTH = 200;
+
+/**
+ * Determines whether the Playwright fallback is enabled, considering both the
+ * programmatic config and the `OB_PLAYWRIGHT_FALLBACK` environment variable.
+ */
+function isFallbackEnabled(config: IngestionConfig): boolean {
+  if (config.playwrightFallback === true) return true;
+  const envVal = process.env['OB_PLAYWRIGHT_FALLBACK'];
+  return envVal === '1' || envVal === 'true' || envVal === 'yes';
+}
+
+/**
+ * Service that orchestrates URL ingestion with an optional Playwright
+ * fallback for JavaScript-heavy pages.
+ */
+export class IngestionService {
+  private readonly primaryExtractor: Extractor;
+  private readonly playwrightExtractor: Extractor;
+  private readonly config: IngestionConfig;
+  private readonly logger: ServiceLogger;
+
+  constructor(
+    primaryExtractor: Extractor,
+    playwrightExtractor: Extractor,
+    config: IngestionConfig = {},
+    logger?: ServiceLogger
+  ) {
+    this.primaryExtractor = primaryExtractor;
+    this.playwrightExtractor = playwrightExtractor;
+    this.config = config;
+    this.logger = logger ?? {
+      info: (msg, meta) => {
+        const line = meta ? `${msg} ${JSON.stringify(meta)}` : msg;
+        process.stdout.write(`${line}\n`);
+      },
+      warn: (msg, ...args) => {
+        const parts = [msg, ...args.map(a => String(a))].join(' ');
+        process.stderr.write(`${parts}\n`);
+      },
+    };
+  }
+
+  /**
+   * Ingest content from `url`.
+   *
+   * 1. Runs the primary extractor.
+   * 2. If the fallback is enabled and primary content is below the threshold,
+   *    runs PlaywrightExtractor.
+   * 3. Emits a telemetry entry regardless of which path was taken.
+   * 4. Returns the best available result (fallback if it produced content,
+   *    otherwise the primary result).
+   *
+   * @param url The URL to ingest.
+   * @returns The best {@link ExtractResult} available.
+   */
+  async ingest(url: string): Promise<ExtractResult> {
+    const minLen =
+      this.config.minContentLength ?? DEFAULT_MIN_CONTENT_LENGTH;
+
+    const primaryResult = await this.primaryExtractor.extract(url);
+    const primaryLen = primaryResult.text.length;
+
+    if (!isFallbackEnabled(this.config) || primaryLen >= minLen) {
+      this.emitTelemetry({
+        event: 'playwright_fallback',
+        triggered: false,
+        primaryContentLength: primaryLen,
+        fallbackContentLength: 0,
+        durationMs: 0,
+        success: false,
+        errorType: null,
+        provider: 'playwright',
+      });
+      return primaryResult;
+    }
+
+    // Fallback triggered.
+    const startMs = Date.now();
+    let fallbackResult: ExtractResult = { text: '', url };
+    let errorType: PlaywrightFallbackTelemetry['errorType'] = null;
+
+    try {
+      fallbackResult = await this.playwrightExtractor.extract(url);
+    } catch (err) {
+      // PlaywrightExtractor should not throw, but guard anyway.
+      this.logger.warn(
+        '[IngestionService] Unexpected error from PlaywrightExtractor:',
+        err instanceof Error ? err.message : err
+      );
+      errorType = 'navigation_error';
+    }
+
+    const durationMs = Date.now() - startMs;
+    const fallbackLen = fallbackResult.text.length;
+    const success = fallbackLen > 0;
+
+    this.emitTelemetry({
+      event: 'playwright_fallback',
+      triggered: true,
+      primaryContentLength: primaryLen,
+      fallbackContentLength: fallbackLen,
+      durationMs,
+      success,
+      errorType,
+      provider: 'playwright',
+    });
+
+    return success ? fallbackResult : primaryResult;
+  }
+
+  private emitTelemetry(entry: PlaywrightFallbackTelemetry): void {
+    this.logger.info('[telemetry]', entry);
+  }
+}
diff --git a/tests/fixtures/playwright-fallback/js-heavy-page.html b/tests/fixtures/playwright-fallback/js-heavy-page.html
new file mode 100644
index 00000000..a61b2f6d
--- /dev/null
+++ b/tests/fixtures/playwright-fallback/js-heavy-page.html
@@ -0,0 +1,22 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>JS-Heavy Page (fixture)</title>
+  <script>
+    // Simulate a JavaScript-rendered page.
+    // The initial HTML has no meaningful content; it is added via JS.
+    document.addEventListener('DOMContentLoaded', function () {
+      document.getElementById('app').innerHTML =
+        '<h1>Article Title</h1>' +
+        '<p>This is the main article content rendered by JavaScript. ' +
+        'It contains more than two hundred characters of useful text so that ' +
+        'downstream content-length checks can pass without issue. ' +
+        'The quick brown fox jumped over the lazy dog.</p>';
+    });
+  </script>
+</head>
+<body>
+  <div id="app"><!-- content inserted by JavaScript --></div>
+</body>
+</html>
diff --git a/tests/fixtures/playwright-fallback/static-page.html b/tests/fixtures/playwright-fallback/static-page.html
new file mode 100644
index 00000000..8bd18866
--- /dev/null
+++ b/tests/fixtures/playwright-fallback/static-page.html
@@ -0,0 +1,15 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <title>Static Page (fixture)</title>
+</head>
+<body>
+  <h1>Static Page</h1>
+  <p>This page has enough static content that the primary extractor can handle
+  it without needing the Playwright fallback. The text here is well over two
+  hundred characters so that a minContentLength check of 200 will pass and the
+  Playwright extractor will not be invoked at all for this fixture. The quick
+  brown fox jumped over the lazy dog.</p>
+</body>
+</html>
diff --git a/tests/lib/ingestion/add-command.test.ts b/tests/lib/ingestion/add-command.test.ts
new file mode 100644
index 00000000..2e38321e
--- /dev/null
+++ b/tests/lib/ingestion/add-command.test.ts
@@ -0,0 +1,106 @@
+/**
+ * Unit tests for src/cli/commands/add.ts
+ *
+ * Tests exercise the `runAdd` function using stubbed IngestionService
+ * instances.  No real network requests or browser launches occur.
+ */
+
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { runAdd } from '../../../src/cli/commands/add.js';
+import * as serviceModule from '../../../src/lib/ingestion/service.js';
+import type { ExtractResult } from '../../../src/lib/ingestion/extractor.js';
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function mockIngestResult(text: string): ExtractResult {
+  return { text, url: 'https://example.com' };
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+describe('runAdd', () => {
+  let ingestSpy: ReturnType<typeof vi.spyOn>;
+  let stdoutSpy: ReturnType<typeof vi.spyOn>;
+
+  beforeEach(() => {
+    // Spy on IngestionService.prototype.ingest so we can control the result.
+    ingestSpy = vi
+      .spyOn(serviceModule.IngestionService.prototype, 'ingest')
+      .mockResolvedValue(mockIngestResult('Ingested content'));
+
+    // Capture stdout writes.
+    stdoutSpy = vi.spyOn(process.stdout, 'write').mockImplementation(() => true);
+  });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  it('writes ingested content to stdout by default', async () => {
+    await runAdd('https://example.com');
+
+    expect(stdoutSpy).toHaveBeenCalledWith('Ingested content');
+  });
+
+  it('returns the ExtractResult', async () => {
+    const result = await runAdd('https://example.com');
+    expect(result.text).toBe('Ingested content');
+    expect(result.url).toBe('https://example.com');
+  });
+
+  it('throws when URL is empty', async () => {
+    await expect(runAdd('')).rejects.toThrow('URL is required');
+  });
+
+  it('passes playwrightFallback option to IngestionService', async () => {
+    // We can inspect the IngestionService constructor call via the spy on ingest.
+    await runAdd('https://example.com', { playwrightFallback: true });
+    expect(ingestSpy).toHaveBeenCalledWith('https://example.com');
+  });
+
+  it('writes to output file when --output is specified', async () => {
+    const writeFileSpy = vi.fn();
+
+    await runAdd('https://example.com', {
+      output: '/tmp/output.txt',
+      _writeFile: writeFileSpy,
+    });
+
+    expect(writeFileSpy).toHaveBeenCalledWith('/tmp/output.txt', 'Ingested content', 'utf8');
+    // Should not also write to stdout.
+    expect(stdoutSpy).not.toHaveBeenCalled();
+  });
+
+  it('appends newline to stdout when content does not end with one', async () => {
+    ingestSpy.mockResolvedValueOnce(mockIngestResult('No newline content'));
+    const writes: string[] = [];
+    stdoutSpy.mockImplementation((s: unknown) => {
+      writes.push(String(s));
+      return true;
+    });
+
+    await runAdd('https://example.com');
+
+    expect(writes.join('')).toMatch(/\n$/);
+  });
+
+  it('does not append extra newline when content already ends with one', async () => {
+    ingestSpy.mockResolvedValueOnce(mockIngestResult('Content with newline\n'));
+    const writes: string[] = [];
+    stdoutSpy.mockImplementation((s: unknown) => {
+      writes.push(String(s));
+      return true;
+    });
+
+    await runAdd('https://example.com');
+
+    // Should be exactly one newline at the end, not two.
+    const combined = writes.join('');
+    expect(combined.endsWith('\n\n')).toBe(false);
+    expect(combined.endsWith('\n')).toBe(true);
+  });
+});
diff --git a/tests/lib/ingestion/extractor-fetch.test.ts b/tests/lib/ingestion/extractor-fetch.test.ts
new file mode 100644
index 00000000..1a680b4f
--- /dev/null
+++ b/tests/lib/ingestion/extractor-fetch.test.ts
@@ -0,0 +1,131 @@
+/**
+ * Unit tests for src/lib/ingestion/extractor-fetch.ts
+ */
+
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { htmlToText, extractTitle, FetchExtractor } from '../../../src/lib/ingestion/extractor-fetch.js';
+
+// ---------------------------------------------------------------------------
+// htmlToText
+// ---------------------------------------------------------------------------
+
+describe('htmlToText', () => {
+  it('strips HTML tags', () => {
+    expect(htmlToText('<p>Hello <strong>world</strong></p>')).toBe('Hello world');
+  });
+
+  it('removes script blocks entirely', () => {
+    const html = '<p>Visible</p><script>var x = 1;</script><p>After</p>';
+    const text = htmlToText(html);
+    expect(text).not.toContain('var x');
+    expect(text).toContain('Visible');
+    expect(text).toContain('After');
+  });
+
+  it('removes style blocks entirely', () => {
+    const html = '<p>Text</p><style>body { color: red; }</style>';
+    const text = htmlToText(html);
+    expect(text).not.toContain('color');
+    expect(text).toContain('Text');
+  });
+
+  it('decodes common HTML entities', () => {
+    // &nbsp; decodes to a regular space which is then trimmed by whitespace normalization.
+    expect(htmlToText('&amp; &lt; &gt; &quot; &#39; &nbsp;')).toBe("& < > \" '");
+  });
+
+  it('collapses whitespace', () => {
+    expect(htmlToText('  hello   world  ')).toBe('hello world');
+  });
+
+  it('returns empty string for empty input', () => {
+    expect(htmlToText('')).toBe('');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// extractTitle
+// ---------------------------------------------------------------------------
+
+describe('extractTitle', () => {
+  it('extracts a simple title', () => {
+    expect(extractTitle('<html><head><title>My Page</title></head></html>')).toBe('My Page');
+  });
+
+  it('returns undefined when no title tag', () => {
+    expect(extractTitle('<html><head></head></html>')).toBeUndefined();
+  });
+
+  it('trims whitespace from title', () => {
+    expect(extractTitle('<title>  Spaces  </title>')).toBe('Spaces');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// FetchExtractor
+// ---------------------------------------------------------------------------
+
+describe('FetchExtractor', () => {
+  const mockFetch = vi.fn();
+
+  beforeEach(() => {
+    vi.stubGlobal('fetch', mockFetch);
+    mockFetch.mockClear();
+  });
+
+  afterEach(() => {
+    vi.restoreAllMocks();
+  });
+
+  function mockResponse(html: string) {
+    return {
+      ok: true,
+      text: vi.fn(async () => html),
+    };
+  }
+
+  it('returns extracted text from a successful fetch', async () => {
+    const html = '<html><head><title>Test</title></head><body><p>Hello world</p></body></html>';
+    mockFetch.mockResolvedValueOnce(mockResponse(html));
+
+    const extractor = new FetchExtractor();
+    const result = await extractor.extract('https://example.com');
+
+    expect(result.url).toBe('https://example.com');
+    expect(result.text).toContain('Hello world');
+    expect(result.title).toBe('Test');
+    expect(result.html).toBe(html);
+  });
+
+  it('returns empty text on network failure', async () => {
+    mockFetch.mockRejectedValueOnce(new Error('Network error'));
+
+    const extractor = new FetchExtractor();
+    const result = await extractor.extract('https://unreachable.example.com');
+
+    expect(result.text).toBe('');
+    expect(result.url).toBe('https://unreachable.example.com');
+  });
+
+  it('returns empty text on abort (timeout)', async () => {
+    mockFetch.mockRejectedValueOnce(
+      Object.assign(new Error('The operation was aborted'), { name: 'AbortError' })
+    );
+
+    const extractor = new FetchExtractor({ timeoutMs: 1 });
+    const result = await extractor.extract('https://slow.example.com');
+
+    expect(result.text).toBe('');
+  });
+
+  it('passes the User-Agent header', async () => {
+    const html = '<p>content</p>';
+    mockFetch.mockResolvedValueOnce(mockResponse(html));
+
+    const extractor = new FetchExtractor({ userAgent: 'TestAgent/1.0' });
+    await extractor.extract('https://example.com');
+
+    const [, requestInit] = mockFetch.mock.calls[0] as [string, RequestInit];
+    expect((requestInit.headers as Record<string, string>)['User-Agent']).toBe('TestAgent/1.0');
+  });
+});
diff --git a/tests/lib/ingestion/extractor-playwright.test.ts b/tests/lib/ingestion/extractor-playwright.test.ts
new file mode 100644
index 00000000..c0690039
--- /dev/null
+++ b/tests/lib/ingestion/extractor-playwright.test.ts
@@ -0,0 +1,291 @@
+/**
+ * Unit tests for src/lib/ingestion/extractor-playwright.ts
+ *
+ * Tests exercise the PlaywrightExtractor class using stubbed browser
+ * objects injected via the constructor's `loadPlaywrightFn` parameter.
+ * No real browser is launched and playwright is never imported.
+ *
+ * Testing strategy:
+ * - `PlaywrightExtractor` accepts an optional `loadPlaywrightFn` in its
+ *   constructor.  Tests pass a factory that returns controlled fake browser
+ *   implementations instead of a real playwright module.
+ * - This matches the "mock/stub" option described in the issue's CI/Testing
+ *   Strategy section and avoids ESM spy limitations.
+ */
+
+import { describe, it, expect, vi } from 'vitest';
+import {
+  PlaywrightExtractor,
+  type BrowserModule,
+  type BrowserTypeLike,
+  type BrowserLike,
+  type BrowserContextLike,
+  type PageLike,
+} from '../../../src/lib/ingestion/extractor-playwright.js';
+
+// ---------------------------------------------------------------------------
+// Helpers: minimal browser stubs
+// ---------------------------------------------------------------------------
+
+function buildPageStub(overrides: Partial<PageLike> = {}): PageLike {
+  return {
+    goto: vi.fn(async () => null),
+    content: vi.fn(async () => '<html><body><p>Playwright content</p></body></html>'),
+    title: vi.fn(async () => 'Playwright Page'),
+    evaluate: vi.fn(async () => 'Playwright content'),
+    ...overrides,
+  };
+}
+
+function buildContextStub(page: PageLike): BrowserContextLike {
+  return {
+    newPage: vi.fn(async () => page),
+    close: vi.fn(async () => undefined),
+  };
+}
+
+function buildBrowserStub(context: BrowserContextLike): BrowserLike {
+  return {
+    newContext: vi.fn(async () => context),
+    close: vi.fn(async () => undefined),
+  };
+}
+
+function buildLauncherStub(browser: BrowserLike): BrowserTypeLike {
+  return {
+    launch: vi.fn(async () => browser),
+  };
+}
+
+function buildBrowserModule(chromium: BrowserTypeLike): BrowserModule {
+  return {
+    chromium,
+    firefox: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))),
+    webkit: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))),
+  };
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+describe('PlaywrightExtractor', () => {
+
+  // -------------------------------------------------------------------------
+  // Happy path
+  // -------------------------------------------------------------------------
+
+  it('returns extracted text and html from a successful navigation', async () => {
+    const page = buildPageStub({
+      content: vi.fn(async () => '<html><body><p>JS content</p></body></html>'),
+      title: vi.fn(async () => 'JS Page'),
+      evaluate: vi.fn(async () => 'JS content'),
+    });
+    const launcher = buildLauncherStub(buildBrowserStub(buildContextStub(page)));
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: vi.fn() },
+      async () => buildBrowserModule(launcher)
+    );
+
+    const result = await extractor.extract('https://example.com');
+
+    expect(result.text).toBe('JS content');
+    expect(result.html).toContain('JS content');
+    expect(result.title).toBe('JS Page');
+    expect(result.url).toBe('https://example.com');
+  });
+
+  it('uses a fresh browser context (storageState is undefined)', async () => {
+    const page = buildPageStub();
+    const context = buildContextStub(page);
+    const browser = buildBrowserStub(context);
+    const launcher = buildLauncherStub(browser);
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: vi.fn() },
+      async () => buildBrowserModule(launcher)
+    );
+
+    await extractor.extract('https://example.com');
+
+    expect(browser.newContext).toHaveBeenCalledWith({ storageState: undefined });
+  });
+
+  it('closes the browser even when navigation succeeds', async () => {
+    const page = buildPageStub();
+    const context = buildContextStub(page);
+    const browser = buildBrowserStub(context);
+    const launcher = buildLauncherStub(browser);
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: vi.fn() },
+      async () => buildBrowserModule(launcher)
+    );
+
+    await extractor.extract('https://example.com');
+
+    expect(browser.close).toHaveBeenCalled();
+  });
+
+  it('passes headless:true to browser.launch', async () => {
+    const page = buildPageStub();
+    const context = buildContextStub(page);
+    const browser = buildBrowserStub(context);
+    const launcher = buildLauncherStub(browser);
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: vi.fn() },
+      async () => buildBrowserModule(launcher)
+    );
+
+    await extractor.extract('https://example.com');
+
+    expect(launcher.launch).toHaveBeenCalledWith({ headless: true });
+  });
+
+  it('passes configured timeout to page.goto', async () => {
+    const page = buildPageStub();
+    const context = buildContextStub(page);
+    const browser = buildBrowserStub(context);
+    const launcher = buildLauncherStub(browser);
+    const extractor = new PlaywrightExtractor(
+      { timeoutMs: 5000 },
+      { warn: vi.fn() },
+      async () => buildBrowserModule(launcher)
+    );
+
+    await extractor.extract('https://example.com');
+
+    expect(page.goto).toHaveBeenCalledWith(
+      'https://example.com',
+      expect.objectContaining({ timeout: 5000 })
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Graceful degradation: playwright not installed
+  // -------------------------------------------------------------------------
+
+  it('returns empty result and warns when playwright is not installed', async () => {
+    const warnSpy = vi.fn();
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: warnSpy },
+      async () => { throw new Error("Cannot find module 'playwright'"); }
+    );
+
+    const result = await extractor.extract('https://example.com');
+
+    expect(result.text).toBe('');
+    expect(result.url).toBe('https://example.com');
+    expect(warnSpy).toHaveBeenCalledWith(
+      expect.stringContaining('playwright package is not installed')
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Graceful degradation: launch failure
+  // -------------------------------------------------------------------------
+
+  it('returns empty result and warns when browser fails to launch', async () => {
+    const warnSpy = vi.fn();
+    const launcher: BrowserTypeLike = {
+      launch: vi.fn(async () => { throw new Error('Executable not found'); }),
+    };
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: warnSpy },
+      async () => buildBrowserModule(launcher)
+    );
+
+    const result = await extractor.extract('https://example.com');
+
+    expect(result.text).toBe('');
+    expect(warnSpy).toHaveBeenCalledWith(
+      expect.stringContaining('Failed to launch browser'),
+      expect.any(String)
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Graceful degradation: navigation timeout
+  // -------------------------------------------------------------------------
+
+  it('returns empty result and warns on navigation timeout', async () => {
+    const warnSpy = vi.fn();
+    const page = buildPageStub({
+      goto: vi.fn(async () => { throw new Error('Timeout exceeded'); }),
+    });
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: warnSpy },
+      async () => buildBrowserModule(
+        buildLauncherStub(buildBrowserStub(buildContextStub(page)))
+      )
+    );
+
+    const result = await extractor.extract('https://example.com');
+
+    expect(result.text).toBe('');
+    expect(warnSpy).toHaveBeenCalledWith(
+      expect.stringMatching(/timeout|navigation_error/i),
+      expect.any(String)
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Graceful degradation: navigation error (non-timeout)
+  // -------------------------------------------------------------------------
+
+  it('returns empty result and warns on non-timeout navigation error', async () => {
+    const warnSpy = vi.fn();
+    const page = buildPageStub({
+      goto: vi.fn(async () => { throw new Error('net::ERR_CONNECTION_REFUSED'); }),
+    });
+    const extractor = new PlaywrightExtractor(
+      {},
+      { warn: warnSpy },
+      async () => buildBrowserModule(
+        buildLauncherStub(buildBrowserStub(buildContextStub(page)))
+      )
+    );
+
+    const result = await extractor.extract('https://example.com');
+
+    expect(result.text).toBe('');
+    expect(warnSpy).toHaveBeenCalledWith(
+      expect.stringContaining('navigation_error'),
+      expect.any(String)
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Browser channel selection
+  // -------------------------------------------------------------------------
+
+  it('uses firefox when configured', async () => {
+    const firefoxPage = buildPageStub({
+      evaluate: vi.fn(async () => 'Firefox content'),
+    });
+    const firefoxLauncher = buildLauncherStub(
+      buildBrowserStub(buildContextStub(firefoxPage))
+    );
+
+    const mod: BrowserModule = {
+      chromium: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))),
+      firefox: firefoxLauncher,
+      webkit: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))),
+    };
+
+    const extractor = new PlaywrightExtractor(
+      { browser: 'firefox' },
+      { warn: vi.fn() },
+      async () => mod
+    );
+    const result = await extractor.extract('https://example.com');
+
+    expect(firefoxLauncher.launch).toHaveBeenCalled();
+    expect(result.text).toBe('Firefox content');
+  });
+});
diff --git a/tests/lib/ingestion/extractor.test.ts b/tests/lib/ingestion/extractor.test.ts
new file mode 100644
index 00000000..6af2292c
--- /dev/null
+++ b/tests/lib/ingestion/extractor.test.ts
@@ -0,0 +1,47 @@
+/**
+ * Unit tests for src/lib/ingestion/extractor.ts
+ *
+ * The Extractor interface and ExtractResult type are structural contracts;
+ * these tests verify they are exported with the expected shape.
+ */
+
+import { describe, it, expect } from 'vitest';
+import type { Extractor, ExtractResult } from '../../../src/lib/ingestion/extractor.js';
+
+describe('Extractor interface', () => {
+  it('can be implemented by a simple stub', async () => {
+    const stub: Extractor = {
+      async extract(url: string): Promise<ExtractResult> {
+        return { text: 'hello', url };
+      },
+    };
+
+    const result = await stub.extract('https://example.com');
+    expect(result.text).toBe('hello');
+    expect(result.url).toBe('https://example.com');
+  });
+
+  it('ExtractResult has the expected fields', () => {
+    const result: ExtractResult = {
+      text: 'content',
+      html: '<p>content</p>',
+      title: 'Page title',
+      url: 'https://example.com',
+    };
+
+    expect(result.text).toBe('content');
+    expect(result.html).toBe('<p>content</p>');
+    expect(result.title).toBe('Page title');
+    expect(result.url).toBe('https://example.com');
+  });
+
+  it('ExtractResult html and title fields are optional', () => {
+    const result: ExtractResult = {
+      text: 'content',
+      url: 'https://example.com',
+    };
+
+    expect(result.html).toBeUndefined();
+    expect(result.title).toBeUndefined();
+  });
+});
diff --git a/tests/lib/ingestion/service.test.ts b/tests/lib/ingestion/service.test.ts
new file mode 100644
index 00000000..9e5bdb54
--- /dev/null
+++ b/tests/lib/ingestion/service.test.ts
@@ -0,0 +1,287 @@
+/**
+ * Unit tests for src/lib/ingestion/service.ts
+ *
+ * Covers:
+ * - Primary-only path (fallback disabled or threshold not met)
+ * - Fallback trigger (short primary content + enabled flag)
+ * - Fallback via OB_PLAYWRIGHT_FALLBACK env var
+ * - Fallback returns empty → primary result is returned
+ * - Telemetry emission on every invocation
+ * - Graceful handling when PlaywrightExtractor throws unexpectedly
+ */
+
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+import { IngestionService, type IngestionConfig, type PlaywrightFallbackTelemetry } from '../../../src/lib/ingestion/service.js';
+import type { Extractor, ExtractResult } from '../../../src/lib/ingestion/extractor.js';
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+function makeExtractor(result: ExtractResult): Extractor {
+  return { extract: vi.fn(async () => result) };
+}
+
+function makeResult(text: string, url = 'https://example.com'): ExtractResult {
+  return { text, url };
+}
+
+function capturedTelemetry(
+  infoSpy: ReturnType<typeof vi.fn>
+): PlaywrightFallbackTelemetry | undefined {
+  const call = infoSpy.mock.calls.find(
+    (c: unknown[]) => c[0] === '[telemetry]'
+  );
+  return call ? (call[1] as PlaywrightFallbackTelemetry) : undefined;
+}
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+describe('IngestionService', () => {
+  const URL = 'https://example.com';
+  let infoSpy: ReturnType<typeof vi.fn>;
+  let warnSpy: ReturnType<typeof vi.fn>;
+  let logger: { info: typeof infoSpy; warn: typeof warnSpy };
+
+  beforeEach(() => {
+    infoSpy = vi.fn();
+    warnSpy = vi.fn();
+    logger = { info: infoSpy, warn: warnSpy };
+    // Ensure env var is not set between tests.
+    delete process.env['OB_PLAYWRIGHT_FALLBACK'];
+  });
+
+  afterEach(() => {
+    delete process.env['OB_PLAYWRIGHT_FALLBACK'];
+  });
+
+  // -------------------------------------------------------------------------
+  // Fallback disabled
+  // -------------------------------------------------------------------------
+
+  it('returns primary result when fallback is disabled (default)', async () => {
+    const primary = makeExtractor(makeResult('short', URL));
+    const pw = makeExtractor(makeResult('playwright content', URL));
+
+    const svc = new IngestionService(primary, pw, {}, logger);
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('short');
+    expect(pw.extract).not.toHaveBeenCalled();
+  });
+
+  it('returns primary result when fallback is disabled explicitly', async () => {
+    const primary = makeExtractor(makeResult('short', URL));
+    const pw = makeExtractor(makeResult('playwright content', URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: false }, logger
+    );
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('short');
+    expect(pw.extract).not.toHaveBeenCalled();
+  });
+
+  // -------------------------------------------------------------------------
+  // Fallback not triggered (primary content is sufficient)
+  // -------------------------------------------------------------------------
+
+  it('does not trigger fallback when primary content meets threshold', async () => {
+    const longText = 'a'.repeat(300);
+    const primary = makeExtractor(makeResult(longText, URL));
+    const pw = makeExtractor(makeResult('playwright content', URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger
+    );
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe(longText);
+    expect(pw.extract).not.toHaveBeenCalled();
+  });
+
+  // -------------------------------------------------------------------------
+  // Fallback triggered
+  // -------------------------------------------------------------------------
+
+  it('triggers fallback when primary content is below threshold', async () => {
+    const primary = makeExtractor(makeResult('too short', URL));
+    const pw = makeExtractor(makeResult('Rich playwright content that is long enough', URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger
+    );
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('Rich playwright content that is long enough');
+    expect(pw.extract).toHaveBeenCalledWith(URL);
+  });
+
+  it('returns primary result when fallback also returns empty', async () => {
+    const primary = makeExtractor(makeResult('short', URL));
+    const pw = makeExtractor(makeResult('', URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger
+    );
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('short');
+  });
+
+  // -------------------------------------------------------------------------
+  // Environment variable activation
+  // -------------------------------------------------------------------------
+
+  it('activates fallback via OB_PLAYWRIGHT_FALLBACK=1', async () => {
+    process.env['OB_PLAYWRIGHT_FALLBACK'] = '1';
+    const primary = makeExtractor(makeResult('short', URL));
+    const pw = makeExtractor(makeResult('playwright result', URL));
+
+    const svc = new IngestionService(primary, pw, {}, logger);
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('playwright result');
+    expect(pw.extract).toHaveBeenCalled();
+  });
+
+  it('activates fallback via OB_PLAYWRIGHT_FALLBACK=true', async () => {
+    process.env['OB_PLAYWRIGHT_FALLBACK'] = 'true';
+    const primary = makeExtractor(makeResult('short', URL));
+    const pw = makeExtractor(makeResult('playwright result', URL));
+
+    const svc = new IngestionService(primary, pw, {}, logger);
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('playwright result');
+  });
+
+  it('does not activate fallback via OB_PLAYWRIGHT_FALLBACK=0', async () => {
+    process.env['OB_PLAYWRIGHT_FALLBACK'] = '0';
+    const primary = makeExtractor(makeResult('short', URL));
+    const pw = makeExtractor(makeResult('playwright result', URL));
+
+    const svc = new IngestionService(primary, pw, {}, logger);
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('short');
+    expect(pw.extract).not.toHaveBeenCalled();
+  });
+
+  // -------------------------------------------------------------------------
+  // Telemetry
+  // -------------------------------------------------------------------------
+
+  it('emits telemetry with triggered=false when fallback is disabled', async () => {
+    const primary = makeExtractor(makeResult('some text', URL));
+    const pw = makeExtractor(makeResult('', URL));
+
+    const svc = new IngestionService(primary, pw, {}, logger);
+    await svc.ingest(URL);
+
+    const telemetry = capturedTelemetry(infoSpy);
+    expect(telemetry).toBeDefined();
+    expect(telemetry!.event).toBe('playwright_fallback');
+    expect(telemetry!.triggered).toBe(false);
+    expect(telemetry!.primaryContentLength).toBe('some text'.length);
+    expect(telemetry!.fallbackContentLength).toBe(0);
+    expect(telemetry!.durationMs).toBe(0);
+    expect(telemetry!.success).toBe(false);
+    expect(telemetry!.errorType).toBeNull();
+    expect(telemetry!.provider).toBe('playwright');
+  });
+
+  it('emits telemetry with triggered=true when fallback runs', async () => {
+    const primary = makeExtractor(makeResult('short', URL));
+    const fallbackText = 'long fallback content from playwright';
+    const pw = makeExtractor(makeResult(fallbackText, URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true }, logger
+    );
+    await svc.ingest(URL);
+
+    const telemetry = capturedTelemetry(infoSpy);
+    expect(telemetry!.triggered).toBe(true);
+    expect(telemetry!.primaryContentLength).toBe('short'.length);
+    expect(telemetry!.fallbackContentLength).toBe(fallbackText.length);
+    expect(telemetry!.success).toBe(true);
+    expect(telemetry!.errorType).toBeNull();
+    expect(telemetry!.durationMs).toBeGreaterThanOrEqual(0);
+  });
+
+  it('emits telemetry with success=false when fallback returns empty', async () => {
+    const primary = makeExtractor(makeResult('tiny', URL));
+    const pw = makeExtractor(makeResult('', URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true }, logger
+    );
+    await svc.ingest(URL);
+
+    const telemetry = capturedTelemetry(infoSpy);
+    expect(telemetry!.triggered).toBe(true);
+    expect(telemetry!.success).toBe(false);
+    expect(telemetry!.fallbackContentLength).toBe(0);
+  });
+
+  // -------------------------------------------------------------------------
+  // Graceful handling of unexpected PlaywrightExtractor error
+  // -------------------------------------------------------------------------
+
+  it('handles unexpected PlaywrightExtractor throw gracefully', async () => {
+    const primary = makeExtractor(makeResult('short', URL));
+    const pw: Extractor = {
+      extract: vi.fn(async () => { throw new Error('unexpected crash'); }),
+    };
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true }, logger
+    );
+    // Must not throw.
+    const result = await svc.ingest(URL);
+
+    // Falls back to primary result.
+    expect(result.text).toBe('short');
+    expect(warnSpy).toHaveBeenCalledWith(
+      expect.stringContaining('Unexpected error'),
+      expect.any(String)
+    );
+  });
+
+  // -------------------------------------------------------------------------
+  // Default minContentLength
+  // -------------------------------------------------------------------------
+
+  it('uses default minContentLength of 200 when not specified', async () => {
+    // Text of exactly 199 chars → should trigger fallback.
+    const shortText = 'a'.repeat(199);
+    const primary = makeExtractor(makeResult(shortText, URL));
+    const pw = makeExtractor(makeResult('playwright result', URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true }, logger
+    );
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe('playwright result');
+    expect(pw.extract).toHaveBeenCalled();
+  });
+
+  it('does not trigger fallback when text is exactly at default threshold (200 chars)', async () => {
+    const atThreshold = 'a'.repeat(200);
+    const primary = makeExtractor(makeResult(atThreshold, URL));
+    const pw = makeExtractor(makeResult('playwright result', URL));
+
+    const svc = new IngestionService(
+      primary, pw, { playwrightFallback: true }, logger
+    );
+    const result = await svc.ingest(URL);
+
+    expect(result.text).toBe(atThreshold);
+    expect(pw.extract).not.toHaveBeenCalled();
+  });
+});

From 66d6e56aac0cce21a4d64a5872b7b6aea2f8d087 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:36:54 +0000
Subject: [PATCH 3/4] fix: address CodeQL and code review feedback

- extractor-fetch.ts: fix bad-tag-filter (allow spaces before closing >) and
  double-escaping (use single-pass entity decoder)
- extractor-playwright.ts: replace globalThis any cast with typed interface,
  add explanatory comment for the optional-peer-dep import cast
- add-command.test.ts: rename s -> chunk for clarity

Agent-Logs-Url: https://github.com/TheWizardsCode/ContextHub/sessions/511715b4-dc90-4fe1-8664-cd673580f894

Co-authored-by: SorraTheOrc <250240+SorraTheOrc@users.noreply.github.com>
---
 src/lib/ingestion/extractor-fetch.ts      | 27 ++++++++++++++---------
 src/lib/ingestion/extractor-playwright.ts | 20 ++++++++++++-----
 tests/lib/ingestion/add-command.test.ts   |  8 +++----
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts
index 55fe33ef..d34acc7e 100644
--- a/src/lib/ingestion/extractor-fetch.ts
+++ b/src/lib/ingestion/extractor-fetch.ts
@@ -31,22 +31,27 @@ const DEFAULT_USER_AGENT =
  * Returns the resulting plain text.
  */
 export function htmlToText(html: string): string {
-  // Remove script and style blocks entirely.
+  // Remove script and style blocks entirely (allow optional whitespace before closing >).
   let text = html
-    .replace(/<script[\s\S]*?<\/script>/gi, ' ')
-    .replace(/<style[\s\S]*?<\/style>/gi, ' ');
+    .replace(/<script[\s\S]*?<\/script\s*>/gi, ' ')
+    .replace(/<style[\s\S]*?<\/style\s*>/gi, ' ');
   // Replace block-level elements with newlines.
   text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n');
   // Strip all remaining tags.
   text = text.replace(/<[^>]+>/g, ' ');
-  // Decode common HTML entities.
-  text = text
-    .replace(/&amp;/g, '&')
-    .replace(/&lt;/g, '<')
-    .replace(/&gt;/g, '>')
-    .replace(/&quot;/g, '"')
-    .replace(/&#39;/g, "'")
-    .replace(/&nbsp;/g, ' ');
+  // Decode common HTML entities in a single pass to avoid double-decoding
+  // (e.g. &amp;lt; must become &lt;, not <).
+  text = text.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (match) => {
+    switch (match) {
+      case '&amp;':  return '&';
+      case '&lt;':   return '<';
+      case '&gt;':   return '>';
+      case '&quot;': return '"';
+      case '&#39;':  return "'";
+      case '&nbsp;': return ' ';
+      default:       return match;
+    }
+  });
   // Collapse whitespace.
   return text.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim();
 }
diff --git a/src/lib/ingestion/extractor-playwright.ts b/src/lib/ingestion/extractor-playwright.ts
index 6b404c9e..e8d6a87d 100644
--- a/src/lib/ingestion/extractor-playwright.ts
+++ b/src/lib/ingestion/extractor-playwright.ts
@@ -140,10 +140,15 @@ export class PlaywrightExtractor implements Extractor {
 
       const html = await page.content();
       const title = await page.title().catch(() => undefined);
-      // evaluate runs in the browser context where `document` is available;
-      // cast through globalThis to avoid TypeScript's non-DOM lib complaint.
-      // eslint-disable-next-line @typescript-eslint/no-explicit-any
-      const text = await page.evaluate(() => (globalThis as any).document?.body?.innerText ?? '');
+      // evaluate() runs in the browser context where `document` is available.
+      // Cast through a minimal interface to avoid TypeScript's non-DOM lib
+      // complaint without resorting to an untyped `any`.
+      const text = await page.evaluate(() => {
+        const g = globalThis as unknown as {
+          document?: { body?: { innerText?: string } };
+        };
+        return g.document?.body?.innerText ?? '';
+      });
 
       await context.close();
 
@@ -201,10 +206,15 @@ export interface BrowserModule {
  * time (via the `loadPlaywrightFn` constructor parameter) without needing to
  * actually install playwright.
  *
+ * The `any` cast is intentional: `playwright` is an optional peer dependency
+ * that may not be present at compile time, so its types are unavailable.
+ * The {@link BrowserModule} interface documents the exact subset we rely on.
+ *
  * @internal
  */
 export async function loadPlaywright(): Promise<BrowserModule> {
+  // playwright is an optional peer dependency; the cast is deliberate.
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
-  const mod = await import('playwright' as any) as BrowserModule;
+  const mod = (await import('playwright' as any)) as BrowserModule;
   return mod;
 }
diff --git a/tests/lib/ingestion/add-command.test.ts b/tests/lib/ingestion/add-command.test.ts
index 2e38321e..9ff780f0 100644
--- a/tests/lib/ingestion/add-command.test.ts
+++ b/tests/lib/ingestion/add-command.test.ts
@@ -78,8 +78,8 @@ describe('runAdd', () => {
   it('appends newline to stdout when content does not end with one', async () => {
     ingestSpy.mockResolvedValueOnce(mockIngestResult('No newline content'));
     const writes: string[] = [];
-    stdoutSpy.mockImplementation((s: unknown) => {
-      writes.push(String(s));
+    stdoutSpy.mockImplementation((chunk: unknown) => {
+      writes.push(String(chunk));
       return true;
     });
 
@@ -91,8 +91,8 @@ describe('runAdd', () => {
   it('does not append extra newline when content already ends with one', async () => {
     ingestSpy.mockResolvedValueOnce(mockIngestResult('Content with newline\n'));
     const writes: string[] = [];
-    stdoutSpy.mockImplementation((s: unknown) => {
-      writes.push(String(s));
+    stdoutSpy.mockImplementation((chunk: unknown) => {
+      writes.push(String(chunk));
       return true;
     });
 

From 2c2456b41c6d545b25036e129b73badc52584545 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Sat, 4 Apr 2026 23:39:41 +0000
Subject: [PATCH 4/4] fix: remaining CodeQL and code review issues

- extractor-fetch.ts: use [^>]* in script/style closing tag regex to handle
  any attributes (fixes CodeQL bad-tag-filter); document numeric entity
  limitation
- add.ts: add parsePositiveInt helper with validation and user-friendly
  error messages for --min-content-length and --timeout options

Agent-Logs-Url: https://github.com/TheWizardsCode/ContextHub/sessions/511715b4-dc90-4fe1-8664-cd673580f894

Co-authored-by: SorraTheOrc <250240+SorraTheOrc@users.noreply.github.com>
---
 src/cli/commands/add.ts              | 23 +++++++++++++++++------
 src/lib/ingestion/extractor-fetch.ts | 11 +++++++----
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/src/cli/commands/add.ts b/src/cli/commands/add.ts
index 1c822e61..591f40cb 100644
--- a/src/cli/commands/add.ts
+++ b/src/cli/commands/add.ts
@@ -132,14 +132,25 @@ export function registerAddCommand(program: {
     )
     .option('--output <file>', 'Write ingested content to file instead of stdout')
     .action(async (url: string, opts: Record<string, string | boolean>) => {
+      const parsePositiveInt = (
+        raw: string | boolean | undefined,
+        name: string
+      ): number | undefined => {
+        if (raw === undefined || raw === false || raw === '') return undefined;
+        const n = parseInt(String(raw), 10);
+        if (isNaN(n) || n <= 0) {
+          process.stderr.write(
+            `[ob add] Error: --${name} must be a positive integer (received: ${String(raw)})\n`
+          );
+          process.exit(1);
+        }
+        return n;
+      };
+
       const options: AddCommandOptions = {
         playwrightFallback: Boolean(opts['playwrightFallback']),
-        minContentLength: opts['minContentLength']
-          ? parseInt(String(opts['minContentLength']), 10)
-          : undefined,
-        timeout: opts['timeout']
-          ? parseInt(String(opts['timeout']), 10)
-          : undefined,
+        minContentLength: parsePositiveInt(opts['minContentLength'], 'min-content-length'),
+        timeout: parsePositiveInt(opts['timeout'], 'timeout'),
         browser: opts['browser'] as PlaywrightBrowserChannel | undefined,
         output: opts['output'] ? String(opts['output']) : undefined,
       };
diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts
index d34acc7e..c9fa0f4f 100644
--- a/src/lib/ingestion/extractor-fetch.ts
+++ b/src/lib/ingestion/extractor-fetch.ts
@@ -31,16 +31,19 @@ const DEFAULT_USER_AGENT =
  * Returns the resulting plain text.
  */
 export function htmlToText(html: string): string {
-  // Remove script and style blocks entirely (allow optional whitespace before closing >).
+  // Remove script and style blocks entirely (allow attributes/whitespace before closing >).
   let text = html
-    .replace(/<script[\s\S]*?<\/script\s*>/gi, ' ')
-    .replace(/<style[\s\S]*?<\/style\s*>/gi, ' ');
+    .replace(/<script[\s\S]*?<\/script[^>]*>/gi, ' ')
+    .replace(/<style[\s\S]*?<\/style[^>]*>/gi, ' ');
   // Replace block-level elements with newlines.
   text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n');
   // Strip all remaining tags.
   text = text.replace(/<[^>]+>/g, ' ');
-  // Decode common HTML entities in a single pass to avoid double-decoding
+  // Decode common named HTML entities in a single pass to avoid double-decoding
   // (e.g. &amp;lt; must become &lt;, not <).
+  // Note: numeric character references (e.g. &#65; or &#x41;) are intentionally
+  // not decoded here since they are uncommon in plain-content pages and the
+  // purpose of this extractor is fast-path text extraction, not full HTML parsing.
   text = text.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (match) => {
     switch (match) {
       case '&amp;':  return '&';