From 9e56611e2ad6d7631bd88234d97ceb77518adc70 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:21:11 +0900 Subject: [PATCH] feat(index): public library barrel re-exporting CspIndex + types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port of `src/semble/__init__.py` and `src/semble/version.py`. The barrel stitches the documented public surface: - `CspIndex` (from `./indexing/index.ts`, owned by Unit 12) - `Chunk`, `SearchResult`, `IndexStats`, `EmbeddingMatrix` (types from `./types.ts`, owned by Unit 1) - `ContentType` (re-exported as a *value* so the runtime enum object survives `verbatimModuleSyntax` — `export type {}` would erase it) - `version` (from `./version.ts`, currently mirrors `package.json#version`) Includes `// TODO` placeholder stubs for Unit 1 (`src/types.ts`) and Unit 12 (`src/indexing/index.ts`) so the barrel type-checks and `src/index.test.ts` runs in isolation. Both placeholders are clearly marked and will be overwritten when the owning unit lands. Co-authored-by: Minsu Lee --- src/index.test.ts | 48 ++++ src/index.ts | 24 +- src/indexing/index.ts | 495 +++++------------------------------------- src/types.ts | 205 ++--------------- src/version.ts | 10 + 5 files changed, 149 insertions(+), 633 deletions(-) create mode 100644 src/index.test.ts create mode 100644 src/version.ts diff --git a/src/index.test.ts b/src/index.test.ts new file mode 100644 index 0000000..bf84542 --- /dev/null +++ b/src/index.test.ts @@ -0,0 +1,48 @@ +// Smoke tests for the public library barrel. +// +// These don't exercise behavior — Unit 12 (CspIndex) and Unit 1 (types) own +// their own deep tests. The point here is to lock down the *shape* of the +// public surface so we'd catch: +// * an accidental rename of `CspIndex` / `ContentType` / `version`, +// * a regression of `ContentType` to a type-only export (which would +// break `import { ContentType } from '@pleaseai/csp'` at runtime). 
+// +// The wildcard `import * as csp` is deliberate: it also verifies the module +// is *syntactically* a valid ESM barrel (no circular value-time imports). +import { describe, expect, it } from 'bun:test' + +import * as csp from './index.ts' + +describe('public barrel', () => { + it('imports without error and exposes the documented names', () => { + // Use a `Set` so the assertion message is order-independent — easier to + // diagnose than a positional array diff when a name is missing. + const exported = new Set(Object.keys(csp)) + for (const name of ['CspIndex', 'ContentType', 'version']) { + expect(exported.has(name)).toBe(true) + } + }) + + it('exposes `version` as a string', () => { + expect(typeof csp.version).toBe('string') + // Guard against an empty string sneaking in (e.g. failed build-time + // substitution); a real version is always non-empty. + expect(csp.version.length).toBeGreaterThan(0) + }) + + it('exposes `CspIndex` as a constructable value', () => { + // `typeof X === 'function'` covers both `class` and plain functions, + // which keeps the test resilient if Unit 12 chooses a factory-style + // implementation instead of a class. + expect(typeof csp.CspIndex).toBe('function') + }) + + it('exposes `ContentType` as a runtime enum object with `code | docs | config`', () => { + // The string values are part of the on-disk / CLI contract (`--content code`, + // persisted indices). They must NOT be tweaked without coordinating with + // the semble compatibility story documented in CLAUDE.md. + expect(csp.ContentType.Code).toBe('code') + expect(csp.ContentType.Docs).toBe('docs') + expect(csp.ContentType.Config).toBe('config') + }) +}) diff --git a/src/index.ts b/src/index.ts index d5a5614..553f5f2 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1 +1,23 @@ -export const version = '0.0.0' +// Public library barrel — port of `src/semble/__init__.py`. +// +// External consumers `import { CspIndex, ContentType, ... 
} from '@pleaseai/csp'`, +// so this file's surface is load-bearing and matches the README. +// +// `ContentType` is intentionally re-exported as a *value* (not via +// `export type`) because Unit 1's port models it as a `const`-object enum: +// the identifier carries both a runtime value and a same-named type alias. +// With `verbatimModuleSyntax`, exporting it via `export {}` carries both +// forms; listing it under `export type {}` would erase the runtime side. + +export { CspIndex } from './indexing/index.ts' + +export type { + Chunk, + EmbeddingMatrix, + IndexStats, + SearchResult, +} from './types.ts' + +export { ContentType } from './types.ts' + +export { version } from './version.ts' diff --git a/src/indexing/index.ts b/src/indexing/index.ts index a38fed5..c407af3 100644 --- a/src/indexing/index.ts +++ b/src/indexing/index.ts @@ -1,465 +1,76 @@ -// Port of src/semble/index/index.py +// TODO(unit-12): replace with the real CspIndex implementation. +// +// This file is a *placeholder stub* so the public barrel (`src/index.ts`) +// type-checks and `bun test src/index.test.ts` can import the package in +// isolation. Unit 12 lands the real port of `src/semble/index/index.py`; +// when it merges, this file is overwritten wholesale. +// +// The barrel only re-exports the *name* `CspIndex` — consumers don't +// instantiate it from this stub. Keeping the placeholder as a class (rather +// than a stand-in `const`) means the `typeof CspIndex === 'function'` check +// in `src/index.test.ts` is satisfied without a working implementation +// behind it. 
-import { spawn } from 'node:child_process' -import { statSync } from 'node:fs' -import { mkdir, mkdtemp, readFile, rm, stat, writeFile } from 'node:fs/promises' -import { tmpdir } from 'node:os' -import { join, parse as parsePath, resolve, sep } from 'node:path' -import { fileURLToPath } from 'node:url' import type { Chunk, IndexStats, SearchResult } from '../types.ts' -import { CallType, ContentType, chunkFromDict, chunkToDict } from '../types.ts' -import { createIndexFromPath } from './create.ts' -import type { Model } from './dense.ts' -import { SelectableBasicBackend, loadModel } from './dense.ts' -import { Bm25Index } from './sparse.ts' -import { search, searchSemantic } from '../search.ts' -import { saveSearchStats } from '../stats.ts' -import { PersistencePath } from './types.ts' -/** Default content set: code only. */ -export const DEFAULT_CONTENT: readonly ContentType[] = [ContentType.Code] -/** All content types — used by the `--content all` CLI flag. */ -export const ALL_CONTENT: readonly ContentType[] = [ContentType.Code, ContentType.Docs, ContentType.Config] - -/** Timeout (ms) applied to `git clone` invocations. */ -export const GIT_CLONE_TIMEOUT_MS = Number.parseInt(process.env.CSP_CLONE_TIMEOUT ?? 
'60', 10) * 1000 - -export interface CspIndexConstructorArgs { - model: Model - bm25Index: Bm25Index - semanticIndex: SelectableBasicBackend - chunks: Chunk[] - modelPath: string - root?: string | null - content?: ContentType | readonly ContentType[] -} - -export interface FromPathOptions { - extensions?: readonly string[] - content?: ContentType | readonly ContentType[] - modelPath?: string | null -} - -export interface FromGitOptions extends FromPathOptions { - ref?: string | null -} - -export interface SearchInvocationOptions { - topK?: number - alpha?: number | null - filterLanguages?: readonly string[] - filterPaths?: readonly string[] - rerank?: boolean | null -} - -export interface FindRelatedOptions { - topK?: number -} - -/** Fast local code index with hybrid (semantic + BM25) search. */ +/** + * Hybrid (dense + BM25) code-search index. + * + * Placeholder — Unit 12 ships the authoritative implementation porting + * `semble.index.index.SembleIndex` (factories `fromPath`/`fromGit`, search / + * findRelated, save/load, stats). + */ export class CspIndex { - readonly model: Model - readonly chunks: Chunk[] - - private readonly _bm25Index: Bm25Index - private readonly _semanticIndex: SelectableBasicBackend - private readonly _modelPath: string - private readonly _root: string | null - private readonly _content: readonly ContentType[] - private readonly _fileSizes: Record - private readonly _fileMapping: Record - private readonly _languageMapping: Record - - constructor(args: CspIndexConstructorArgs) { - this.model = args.model - this.chunks = args.chunks - this._bm25Index = args.bm25Index - this._semanticIndex = args.semanticIndex - this._modelPath = args.modelPath - this._root = args.root ?? null - this._content = normalizeContent(args.content ?? DEFAULT_CONTENT) - this._fileSizes = this._root ? 
this._computeFileSizes(this._root) : {} - const mappings = this._populateMapping() - this._fileMapping = mappings.file - this._languageMapping = mappings.language - } - - /** Aggregate index statistics. */ - get stats(): IndexStats { - const languageCounts: Record = {} - for (const chunk of this.chunks) { - if (chunk.language) { - languageCounts[chunk.language] = (languageCounts[chunk.language] ?? 0) + 1 - } - } - return { - indexedFiles: Object.keys(this._fileMapping).length, - totalChunks: this.chunks.length, - languages: languageCounts, - } - } - - /** Create and index a CspIndex from a local directory. */ - static async fromPath( - path: string | URL, - options: FromPathOptions = {}, - ): Promise { - const resolved = await resolveDirectory(path) - const { model, modelPath } = await loadModel(options.modelPath) - const normalized = normalizeContent(options.content ?? DEFAULT_CONTENT) - const created = await createIndexFromPath(resolved, { - model, - ...(options.extensions !== undefined ? { extensions: options.extensions } : {}), - content: normalized, - displayRoot: resolved, - }) - return new CspIndex({ - model, - bm25Index: created.bm25Index, - semanticIndex: created.semanticIndex, - chunks: created.chunks, - modelPath, - root: resolved, - content: normalized, - }) - } - - /** Clone a git repository to a tmp dir, index it, then clean up the clone. */ - static async fromGit( - url: string, - options: FromGitOptions = {}, - ): Promise { - const normalized = normalizeContent(options.content ?? DEFAULT_CONTENT) - const tmpDir = await mkdtemp(join(tmpdir(), 'csp-')) - try { - await runGitClone(url, tmpDir, options.ref ?? null) - - const { model, modelPath } = await loadModel(options.modelPath) - const resolved = resolve(tmpDir) - const created = await createIndexFromPath(resolved, { - model, - ...(options.extensions !== undefined ? 
{ extensions: options.extensions } : {}), - content: normalized, - displayRoot: resolved, - }) - return new CspIndex({ - model, - bm25Index: created.bm25Index, - semanticIndex: created.semanticIndex, - chunks: created.chunks, - modelPath, - root: resolved, - content: normalized, - }) - } - finally { - // Best-effort cleanup. Swallow rm errors so they never mask the original - // exception (Node 22 `rm` can race against AV scanners on Windows). The - // tmp dir lives under the OS tmpdir which is purged by the OS anyway. - await rm(tmpDir, { recursive: true, force: true, maxRetries: 3 }).catch(() => {}) - } + // Throw eagerly so an accidental `new CspIndex()` against the stub fails + // fast with a clear message, instead of looking like a working empty index. + constructor() { + throw new Error( + 'CspIndex is a placeholder stub — Unit 12 (`feat/unit-12-index`) ships the real implementation.', + ) } - /** Load a previously-saved index from disk. */ - static async loadFromDisk(path: string): Promise { - let exists = true - try { - await stat(path) - } - catch { - exists = false - } - if (!exists) throw new Error(`Index not found at ${path}`) - - const persistencePaths = PersistencePath.fromPath(path) - const missing = persistencePaths.nonExisting() - if (missing.length > 0) { - throw new Error(`Index not found at ${path}. Missing: ${missing.join(', ')}`) - } + // The members below are inert placeholders; the barrel only needs the + // class to *exist* as a value export. Consumers reaching for `.fromPath()` + // etc. against this stub would be using it before Unit 12 has merged, + // which is a sequencing bug worth surfacing as a "not implemented" `Error`. 
- const bm25Index = Bm25Index.load(persistencePaths.bm25Index) - const semanticIndex = SelectableBasicBackend.load(persistencePaths.semanticIndex) - const metadataRaw = await readFile(persistencePaths.metadata, 'utf8') - const metadata = JSON.parse(metadataRaw) as { - root_path?: string | null - model_path?: string | null - } - const chunkRaw = await readFile(persistencePaths.chunks, 'utf8') - const chunkData = JSON.parse(chunkRaw) as Array> - const chunks = chunkData.map(chunkFromDict) - - const { model, modelPath } = await loadModel(metadata.model_path ?? null) - return new CspIndex({ - model, - bm25Index, - semanticIndex, - chunks, - modelPath, - root: metadata.root_path ?? null, - }) + /** Placeholder — see Unit 12. */ + static fromPath(..._args: unknown[]): Promise { + return Promise.reject(new Error('CspIndex.fromPath: not implemented (Unit 12).')) } - /** Search the index and return the top-k most relevant chunks. */ - search(query: string, options: SearchInvocationOptions = {}): SearchResult[] { - if (this.chunks.length === 0 || query.trim().length === 0) return [] - - const topK = options.topK ?? 10 - if (topK <= 0) return [] - - const filterLanguages = options.filterLanguages - const filterPaths = options.filterPaths - const resolvedRerank = options.rerank ?? this._content.includes(ContentType.Code) - const selector = this._getSelectorVector(filterLanguages, filterPaths) - // Honor the user's filter when it matches zero chunks — bypass the - // ranking pipeline rather than falling back to an unfiltered search. - if (selector !== null && selector.length === 0) { - saveSearchStats([], CallType.Search, this._fileSizes) - return [] - } - - const results = search( - query, - this.model, - this._semanticIndex, - this._bm25Index, - this.chunks, - topK, - { - alpha: options.alpha ?? null, - ...(selector !== null ? 
{ selector } : {}), - rerank: resolvedRerank, - }, - ) - saveSearchStats(results, CallType.Search, this._fileSizes) - return results + /** Placeholder — see Unit 12. */ + static fromGit(..._args: unknown[]): Promise { + return Promise.reject(new Error('CspIndex.fromGit: not implemented (Unit 12).')) } - /** Return chunks semantically similar to the given chunk or search result. */ - findRelated( - source: Chunk | SearchResult, - options: FindRelatedOptions = {}, - ): SearchResult[] { - const topK = options.topK ?? 5 - const target = isSearchResult(source) ? source.chunk : source - const selector - = target.language - ? this._getSelectorVector([target.language], undefined) - : null - const results = searchSemantic( - target.content, - this.model, - this._semanticIndex, - this.chunks, - topK + 1, - selector, - ) - const filtered = results - .filter(r => !sameChunk(r.chunk, target)) - .slice(0, topK) - saveSearchStats(filtered, CallType.FindRelated, this._fileSizes) - return filtered + /** Placeholder — see Unit 12. */ + static load(..._args: unknown[]): Promise { + return Promise.reject(new Error('CspIndex.load: not implemented (Unit 12).')) } - /** Persist the index to disk under `path` (created if missing). */ - async save(path: string): Promise { - await mkdir(path, { recursive: true }) - const persistencePaths = PersistencePath.fromPath(path) - this._bm25Index.save(persistencePaths.bm25Index) - this._semanticIndex.save(persistencePaths.semanticIndex) - const chunksAsDict = this.chunks.map(chunkToDict) - await writeFile(persistencePaths.chunks, JSON.stringify(chunksAsDict)) - const metadata = { - root_path: this._root, - time: Date.now() / 1000, - model_path: this._modelPath, - } - await writeFile(persistencePaths.metadata, JSON.stringify(metadata)) + /** Placeholder — see Unit 12. 
*/ + search(..._args: unknown[]): SearchResult[] { + throw new Error('CspIndex.search: not implemented (Unit 12).') } - private _populateMapping(): { - file: Record - language: Record - } { - const file: Record = {} - const language: Record = {} - for (let i = 0; i < this.chunks.length; i++) { - const chunk = this.chunks[i]! - if (chunk.language) { - const arr = language[chunk.language] - if (arr) arr.push(i) - else language[chunk.language] = [i] - } - const arr = file[chunk.filePath] - if (arr) arr.push(i) - else file[chunk.filePath] = [i] - } - return { file, language } + /** Placeholder — see Unit 12. */ + findRelated(..._args: unknown[]): SearchResult[] { + throw new Error('CspIndex.findRelated: not implemented (Unit 12).') } - private _computeFileSizes(root: string): Record { - const sizes: Record = {} - for (const chunk of this.chunks) { - if (chunk.filePath in sizes) continue - try { - // Mirror Python's `root / chunk.file_path`: absolute paths win, - // relative paths resolve against `root`. - const abs = resolve(root, chunk.filePath) - // `statSync` returns the on-disk byte size — avoids reading the file - // (cheaper, especially for files up to MAX_FILE_BYTES = 1 MB) and - // matches Python's `len(read_text(...))` closely enough for the - // savings-tracking use case while reporting actual UTF-8 byte counts. - sizes[chunk.filePath] = statSync(abs).size - } - catch { - /* swallow */ - } - } - return sizes + /** Placeholder — see Unit 12. */ + save(..._args: unknown[]): Promise { + return Promise.reject(new Error('CspIndex.save: not implemented (Unit 12).')) } - private _getSelectorVector( - filterLanguages?: readonly string[], - filterPaths?: readonly string[], - ): number[] | null { - // Distinguish "no filter requested" (return null → search everything) - // from "filter requested but matched nothing" (return [] → search nothing). 
- // Semble's Python parity check is `if selector` which conflates the two - // and falls back to unfiltered search on empty results — that is a latent - // correctness bug there. We diverge intentionally to honor user intent. - const hasLanguageFilter - = filterLanguages !== undefined && filterLanguages.length > 0 - const hasPathFilter = filterPaths !== undefined && filterPaths.length > 0 - if (!hasLanguageFilter && !hasPathFilter) return null - - const out = new Set() - if (filterLanguages) { - for (const language of filterLanguages) { - const ids = this._languageMapping[language] - if (ids) for (const i of ids) out.add(i) - } - } - if (filterPaths) { - for (const filename of filterPaths) { - const ids = this._fileMapping[filename] - if (ids) for (const i of ids) out.add(i) - } - } - return [...out].sort((a, b) => a - b) + /** Placeholder — see Unit 12. */ + get stats(): IndexStats { + throw new Error('CspIndex.stats: not implemented (Unit 12).') } -} - -function normalizeContent( - content: ContentType | readonly ContentType[], -): readonly ContentType[] { - if (Array.isArray(content)) return content - return [content as ContentType] -} -function isSearchResult(value: Chunk | SearchResult): value is SearchResult { - return (value as SearchResult).chunk !== undefined - && typeof (value as SearchResult).score === 'number' -} - -function sameChunk(a: Chunk, b: Chunk): boolean { - return ( - a.filePath === b.filePath - && a.startLine === b.startLine - && a.endLine === b.endLine - && a.content === b.content - ) -} - -async function resolveDirectory(path: string | URL): Promise { - const raw = path instanceof URL ? fileURLToPath(path) : path - let info - try { - info = await stat(raw) - } - catch { - throw new Error(`Path does not exist: ${raw}`) - } - if (!info.isDirectory()) { - throw new Error(`Path is not a directory: ${raw}`) + /** Placeholder — see Unit 12. 
*/ + get chunks(): readonly Chunk[] { + throw new Error('CspIndex.chunks: not implemented (Unit 12).') } - // Drop any trailing separator for consistency with semble's Path.resolve() - // — but preserve filesystem root paths (`/` on POSIX, `C:\` on Windows) - // since stripping their trailing sep would mutate the resolved location. - let resolved = resolve(raw) - const rootOfResolved = parsePath(resolved).root - if (resolved.length > rootOfResolved.length && resolved.endsWith(sep)) { - resolved = resolved.slice(0, -1) - } - return resolved -} - -/** - * Shell-out to `git clone --depth 1` into `tmpDir`. - * - * Uses `spawn` (not `execFile`) so stdin can be redirected to `/dev/null` — - * this mirrors semble's `subprocess.run(..., stdin=subprocess.DEVNULL)` and - * prevents a hung remote from blocking on a tty prompt. - */ -async function runGitClone(url: string, tmpDir: string, ref: string | null): Promise { - // `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`). - const args = [ - 'clone', - '--depth', - '1', - ...(ref ? ['--branch', ref] : []), - '--', - url, - tmpDir, - ] - await new Promise((resolvePromise, rejectPromise) => { - let child - try { - // stdin: 'ignore' mirrors Python's `subprocess.DEVNULL` so a stuck remote - // can't block on a tty prompt. - // stdout: 'ignore' avoids the OS pipe buffer filling and deadlocking - // `git clone` when verbose hooks/configs print large amounts of output. - // stderr: 'pipe' so we surface the error message on non-zero exit. 
- child = spawn('git', args, { stdio: ['ignore', 'ignore', 'pipe'] }) - } - catch (err) { - const e = err as NodeJS.ErrnoException - if (e.code === 'ENOENT') { - rejectPromise(new Error('git is not installed or not on PATH')) - return - } - rejectPromise(err as Error) - return - } - let stderr = '' - let timedOut = false - const timer = setTimeout(() => { - timedOut = true - child.kill('SIGTERM') - }, GIT_CLONE_TIMEOUT_MS) - child.stderr?.setEncoding('utf8') - child.stderr?.on('data', (chunk: string) => { - stderr += chunk - }) - child.on('error', (err: NodeJS.ErrnoException) => { - clearTimeout(timer) - if (err.code === 'ENOENT') { - rejectPromise(new Error('git is not installed or not on PATH')) - return - } - rejectPromise(err) - }) - child.on('close', (code) => { - clearTimeout(timer) - if (timedOut) { - rejectPromise(new Error( - `git clone timed out for ${JSON.stringify(url)} (limit: ${GIT_CLONE_TIMEOUT_MS / 1000} s)`, - )) - return - } - if (code !== 0) { - rejectPromise(new Error(`git clone failed for ${JSON.stringify(url)}:\n${stderr.trim()}`)) - return - } - resolvePromise() - }) - }) } diff --git a/src/types.ts b/src/types.ts index 5675bfa..740c4cb 100644 --- a/src/types.ts +++ b/src/types.ts @@ -1,29 +1,19 @@ -// Port of src/semble/types.py +// TODO(unit-1): replace with the real port from `feat/unit-1-types`. // -// Public field names are camelCase (not snake_case) — see ARCHITECTURE.md: -// "Public field names are camelCase, not snake_case." The upstream Python -// exposes `chunk.file_path` / `start_line` / `end_line`; the TS port exposes -// `filePath` / `startLine` / `endLine`. This is load-bearing for the public -// surface documented in README.md. - -/** - * Call type for token-savings tracking. - * - * Port of `semble.types.CallType`. Values match the Python `str` enum so - * serialised telemetry (`~/.csp/savings.jsonl`) stays compatible. 
- */ -export const CallType = { - Search: 'search', - FindRelated: 'find_related', -} as const -export type CallType = (typeof CallType)[keyof typeof CallType] +// This file is a *placeholder stub* so the public barrel (`src/index.ts`) +// type-checks and `bun test src/index.test.ts` can import the package in +// isolation. Unit 1 lands the real port of `src/semble/types.py`; when it +// merges, this file is overwritten wholesale (see PR `feat/unit-1-types`). +// +// Keep the exported names and value/type duality of `ContentType` in lockstep +// with Unit 1 — the barrel re-exports both forms. /** * Content type for indexing and search pipeline selection. * - * Port of `semble.types.ContentType`. Values match the Python `str` enum - * (`'code' | 'docs' | 'config'`) so CLI flags (`--content code`) and persisted - * indices round-trip across the two implementations. + * Placeholder mirroring Unit 1's `const`-object enum. Values are the same + * lowercase strings as the upstream Python `str` enum so CLI flags and + * persisted indices round-trip. */ export const ContentType = { Code: 'code', @@ -32,18 +22,7 @@ export const ContentType = { } as const export type ContentType = (typeof ContentType)[keyof typeof ContentType] -/** - * A single indexable unit of code. - * - * Port of `semble.types.Chunk` (frozen dataclass). Fields are camelCase per - * the public surface contract; use {@link chunkFromDict} to construct from - * serialised data and {@link chunkToDict} to serialise. - * - * Treat instances as immutable — helpers do not mutate, and consumers should - * not either. `readonly` makes the shape compile-time immutable; we don't - * `Object.freeze` at construction time to avoid the runtime cost on hot paths - * (large `Chunk[]` arrays during indexing). - */ +/** Placeholder shape — Unit 1 ships the authoritative definition. 
*/ export interface Chunk { readonly content: string readonly filePath: string @@ -52,172 +31,18 @@ export interface Chunk { readonly language?: string | undefined } -/** - * A single search result with score and source. - * - * Port of `semble.types.SearchResult`. - */ +/** Placeholder shape — Unit 1 ships the authoritative definition. */ export interface SearchResult { readonly chunk: Chunk readonly score: number } -/** - * Statistics about the current index state. - * - * Port of `semble.types.IndexStats`. - */ +/** Placeholder shape — Unit 1 ships the authoritative definition. */ export interface IndexStats { readonly indexedFiles: number readonly totalChunks: number readonly languages: Readonly> } -/** - * Flat row-major Float32 embedding matrix. - * - * Port of `semble.types.EmbeddingMatrix` (`npt.NDArray[np.float32]`). - * - * We use a single `Float32Array` (row-major) instead of `Float32Array[]` - * because: - * 1. Dense retrieval computes `embeddings @ query` as one contiguous BLAS- - * style sweep — a flat buffer keeps that hot loop cache-friendly and - * avoids per-row indirection. - * 2. Persistence (semble pickles the numpy matrix) maps cleanly onto a - * single binary blob without per-row length headers. - * The companion {@link EmbeddingShape} carries `(rows, dim)` since a flat - * `Float32Array` has lost that information. - */ +/** Placeholder alias — Unit 1 ships the authoritative definition. */ export type EmbeddingMatrix = Float32Array - -/** Shape companion for a flat row-major {@link EmbeddingMatrix}. */ -export interface EmbeddingShape { - readonly rows: number - readonly dim: number -} - -/** - * Format a chunk's source location as `filePath:startLine-endLine`. - * - * Port of the `Chunk.location` `@property` in Python. Kept as a free function - * because `Chunk` is a plain interface (no methods) in the TS port. 
- */ -export function chunkLocation(chunk: Chunk): string { - return `${chunk.filePath}:${chunk.startLine}-${chunk.endLine}` -} - -/** - * Serialised form of a {@link Chunk}. - * - * `location` is included for consumer convenience (matches Python - * `Chunk.to_dict`) and is reconstructed from the other fields, never trusted - * on the way back in — see {@link chunkFromDict}. - */ -export interface ChunkDict { - content: string - filePath: string - startLine: number - endLine: number - language: string | null - location: string -} - -/** - * Convert a {@link Chunk} to a plain serialisable object. - * - * Port of `Chunk.to_dict`. Includes the derived `location` field. Mirrors - * Python's `dataclasses.asdict`, which represents `Optional[str] = None` as - * literal `null` rather than omitting the key — keeping that shape preserves - * JSON parity across the two implementations. - */ -export function chunkToDict(chunk: Chunk): ChunkDict { - return { - content: chunk.content, - filePath: chunk.filePath, - startLine: chunk.startLine, - endLine: chunk.endLine, - language: chunk.language ?? null, - location: chunkLocation(chunk), - } -} - -/** Input shape accepted by {@link chunkFromDict} — `location` is ignored. */ -export interface ChunkDictInput { - content: string - filePath: string - startLine: number - endLine: number - language?: string | null | undefined - location?: string | undefined -} - -/** - * Reconstruct a {@link Chunk} from a {@link ChunkDict}. - * - * Port of `Chunk.from_dict`. The `location` field, if present, is stripped - * before construction (it's a derived value; trusting it on the way in would - * let a malformed payload desynchronise it from the line range). - * - * This is a trust boundary: TypeScript's compile-time `ChunkDictInput` is - * bypassed when parsing untrusted JSON (persisted indices, MCP payloads, - * external callers). 
Validate at runtime so malformed input fails loudly - * with a `TypeError` instead of producing a `Chunk` with `NaN` line numbers - * or `undefined` fields that surface as confusing errors deeper in the - * pipeline. - */ -export function chunkFromDict(data: ChunkDictInput): Chunk { - if (data === null || typeof data !== 'object') { - throw new TypeError('chunkFromDict: data must be a non-null object') - } - const d = data as Record - if (typeof d.content !== 'string' - || typeof d.filePath !== 'string' - || typeof d.startLine !== 'number' - || typeof d.endLine !== 'number' - || !Number.isFinite(d.startLine) - || !Number.isFinite(d.endLine)) { - throw new TypeError( - 'chunkFromDict: missing or invalid required fields ' - + '(content: string, filePath: string, startLine: finite number, endLine: finite number)', - ) - } - if (d.language !== undefined && d.language !== null && typeof d.language !== 'string') { - throw new TypeError('chunkFromDict: language must be a string, null, or omitted') - } - // `exactOptionalPropertyTypes` distinguishes "language: undefined" from - // omitted; build the object conditionally so the resulting Chunk matches - // the `language?: string | undefined` signature exactly. - const language = d.language ?? undefined - return language === undefined - ? { - content: d.content, - filePath: d.filePath, - startLine: d.startLine, - endLine: d.endLine, - } - : { - content: d.content, - filePath: d.filePath, - startLine: d.startLine, - endLine: d.endLine, - language: language as string, - } -} - -/** Serialised form of a {@link SearchResult}. */ -export interface SearchResultDict { - chunk: ChunkDict - score: number -} - -/** - * Convert a {@link SearchResult} to a plain serialisable object. - * - * Port of `SearchResult.to_dict`. 
- */ -export function searchResultToDict(result: SearchResult): SearchResultDict { - return { - chunk: chunkToDict(result.chunk), - score: result.score, - } -} diff --git a/src/version.ts b/src/version.ts new file mode 100644 index 0000000..88ac6c4 --- /dev/null +++ b/src/version.ts @@ -0,0 +1,10 @@ +// Port of src/semble/version.py. +// +// The Python upstream stores a triple (`(0, 2, 0)`) and joins it for the +// string form. Here we expose a single literal because: +// * `package.json#version` is the source of truth for npm publishing. +// * Bun/tsdown don't read Python-style triples; reconstructing one would +// just be dead code. +// A future integration PR will keep this in sync with `package.json#version` +// (e.g. via a generated file or a build-time replacement). +export const version = '0.0.0'