From 53d06c872eafc861f0f5781395b86434c76d4514 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:27:23 +0900 Subject: [PATCH 1/2] feat(indexing): port CspIndex orchestrator (fromPath/fromGit/search/findRelated/save/load) --- src/indexing/create.test.ts | 79 +++++++ src/indexing/create.ts | 91 ++++++++ src/indexing/index.test.ts | 183 +++++++++++++++ src/indexing/index.ts | 439 ++++++++++++++++++++++++++++++++++++ src/indexing/types.test.ts | 40 ++++ src/indexing/types.ts | 44 ++++ 6 files changed, 876 insertions(+) create mode 100644 src/indexing/create.test.ts create mode 100644 src/indexing/create.ts create mode 100644 src/indexing/index.test.ts create mode 100644 src/indexing/index.ts create mode 100644 src/indexing/types.test.ts create mode 100644 src/indexing/types.ts diff --git a/src/indexing/create.test.ts b/src/indexing/create.test.ts new file mode 100644 index 0000000..8967869 --- /dev/null +++ b/src/indexing/create.test.ts @@ -0,0 +1,79 @@ +// Tests for src/indexing/create.ts + +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { afterEach, beforeEach, describe, expect, it } from 'bun:test' +import { ContentType } from '../types.ts' +import { createIndexFromPath } from './create.ts' +import { makeStubModel } from './dense.ts' + +describe('createIndexFromPath', () => { + let dir: string + + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), 'csp-create-')) + }) + afterEach(() => { + rmSync(dir, { recursive: true, force: true }) + }) + + it('builds chunks/bm25/semantic indexes for a small TS file', async () => { + const src = join(dir, 'sample.ts') + writeFileSync( + src, + 'export function greet(name: string) {\n return `hi ${name}`\n}\n', + ) + const model = makeStubModel('test-model', 4) + const result = await createIndexFromPath(dir, { model, displayRoot: dir }) + expect(result.chunks.length).toBeGreaterThan(0) + // Path is stored 
relative to displayRoot. + expect(result.chunks[0]!.filePath).toBe('sample.ts') + expect(result.semanticIndex.vectors.length).toBe(result.chunks.length) + expect(result.bm25Index.documents.length).toBe(result.chunks.length) + }) + + it('throws when no supported files are found', async () => { + // Only an unsupported binary extension present. + writeFileSync(join(dir, 'data.bin'), 'binary') + const model = makeStubModel() + await expect(createIndexFromPath(dir, { model })).rejects.toThrow( + /No supported files found/, + ) + }) + + it('respects an explicit extensions override', async () => { + writeFileSync(join(dir, 'a.txt'), 'hello world') + const model = makeStubModel() + const result = await createIndexFromPath(dir, { + model, + extensions: ['.txt'], + content: ContentType.Docs, + displayRoot: dir, + }) + expect(result.chunks.length).toBe(1) + expect(result.chunks[0]!.filePath).toBe('a.txt') + }) + + it('skips files larger than MAX_FILE_BYTES', async () => { + // Write 2 MB of code-like content; should be skipped. 
+ const big = 'a'.repeat(2_000_000) + writeFileSync(join(dir, 'big.ts'), big) + writeFileSync(join(dir, 'small.ts'), 'export const x = 1\n') + const model = makeStubModel() + const result = await createIndexFromPath(dir, { model, displayRoot: dir }) + const paths = result.chunks.map(c => c.filePath) + expect(paths).toContain('small.ts') + expect(paths).not.toContain('big.ts') + }) + + it('descends into subdirectories', async () => { + const sub = join(dir, 'sub') + mkdirSync(sub) + writeFileSync(join(sub, 'nested.ts'), 'const a = 1\n') + const model = makeStubModel() + const result = await createIndexFromPath(dir, { model, displayRoot: dir }) + const paths = result.chunks.map(c => c.filePath) + expect(paths.some(p => p.endsWith('nested.ts'))).toBe(true) + }) +}) diff --git a/src/indexing/create.ts b/src/indexing/create.ts new file mode 100644 index 0000000..363def0 --- /dev/null +++ b/src/indexing/create.ts @@ -0,0 +1,91 @@ +// Port of src/semble/index/create.py + +import { readFileSync, statSync } from 'node:fs' +import { relative } from 'node:path' +import type { Chunk } from '../types.ts' +import { ContentType } from '../types.ts' +import { chunkSource } from '../chunking/chunk-source.ts' +import { tokenize } from '../tokens.ts' +import type { Model } from './dense.ts' +import { SelectableBasicBackend, embedChunks } from './dense.ts' +import { walkFiles } from './file-walker.ts' +import { detectLanguage, getExtensions } from './files.ts' +import { Bm25Index, enrichForBm25 } from './sparse.ts' + +/** 1 MB max file size to read and index. */ +export const MAX_FILE_BYTES = 1_000_000 + +export interface CreateIndexOptions { + model: Model + extensions?: readonly string[] + content?: ContentType | readonly ContentType[] + displayRoot?: string +} + +export interface CreateIndexResult { + bm25Index: Bm25Index + semanticIndex: SelectableBasicBackend + chunks: Chunk[] +} + +/** + * Create an index from a resolved directory. 
+ * + * Walks files matching `extensions`, chunks them, enriches text for BM25, + * tokenizes it, embeds chunks, and returns the populated indexes. + * + * @throws if no chunks are produced. + */ +export async function createIndexFromPath( + path: string, + options: CreateIndexOptions, +): Promise { + const { model, extensions, content, displayRoot } = options + + const normalized: readonly ContentType[] = normalizeContent(content) + const resolvedExtensions = getExtensions(normalized, extensions) + + const chunks: Chunk[] = [] + for (const filePath of walkFiles(path, resolvedExtensions)) { + const language = detectLanguage(filePath) + let size: number + try { + size = statSync(filePath).size + } + catch { + continue + } + if (size > MAX_FILE_BYTES) continue + let source: string + try { + source = readFileSync(filePath, 'utf8') + } + catch { + continue + } + const chunkPath = displayRoot ? relative(displayRoot, filePath) : filePath + chunks.push(...chunkSource(source, chunkPath, language)) + } + + if (chunks.length === 0) { + throw new Error(`No supported files found under ${path}.`) + } + + const embeddings = embedChunks(model, chunks) + const bm25Index = new Bm25Index() + bm25Index.index(chunks.map(c => tokenize(enrichForBm25(c)))) + const semanticIndex = new SelectableBasicBackend(embeddings, model.dim) + + return { bm25Index, semanticIndex, chunks } +} + +function normalizeContent( + content: ContentType | readonly ContentType[] | undefined, +): readonly ContentType[] { + if (content === undefined) { + // Default: code-only. Mirrors _DEFAULT_CONTENT in semble. 
+ return [ContentType.Code] + } + if (Array.isArray(content)) return content + return [content as ContentType] +} diff --git a/src/indexing/index.test.ts b/src/indexing/index.test.ts new file mode 100644 index 0000000..68ecd39 --- /dev/null +++ b/src/indexing/index.test.ts @@ -0,0 +1,183 @@ +// Tests for src/indexing/index.ts (CspIndex) + +import { mkdtempSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { afterEach, beforeEach, describe, expect, it } from 'bun:test' +import type { Chunk } from '../types.ts' +import { ContentType } from '../types.ts' +import { CspIndex, DEFAULT_CONTENT } from './index.ts' +import { SelectableBasicBackend, makeStubModel } from './dense.ts' +import { Bm25Index } from './sparse.ts' + +function makeChunk( + filePath: string, + startLine: number, + endLine: number, + language: string | null = 'typescript', + content?: string, +): Chunk { + return { + content: content ?? `// chunk for ${filePath}:${startLine}-${endLine}`, + filePath, + startLine, + endLine, + language, + } +} + +function buildIndex(chunks: Chunk[]): CspIndex { + const model = makeStubModel('test-model', 4) + const vectors = chunks.map((_, i) => { + const v = new Float32Array(4) + v[0] = i + 1 + return v + }) + return new CspIndex({ + model, + bm25Index: new Bm25Index(chunks.map(() => ['x'])), + semanticIndex: new SelectableBasicBackend(vectors, 4), + chunks, + modelPath: 'test-model', + root: null, + content: DEFAULT_CONTENT, + }) +} + +describe('CspIndex.stats', () => { + it('returns zeros for an empty index', () => { + const idx = buildIndex([]) + expect(idx.stats).toEqual({ + indexedFiles: 0, + totalChunks: 0, + languages: {}, + }) + }) + + it('reflects chunk count, file count, and language distribution', () => { + const chunks: Chunk[] = [ + makeChunk('a.ts', 1, 10, 'typescript'), + makeChunk('a.ts', 11, 20, 'typescript'), + makeChunk('b.py', 1, 5, 'python'), + makeChunk('c.bin', 1, 1, null), + ] 
+ const idx = buildIndex(chunks) + expect(idx.stats).toEqual({ + indexedFiles: 3, + totalChunks: 4, + languages: { typescript: 2, python: 1 }, + }) + }) +}) + +describe('CspIndex.search', () => { + it('returns [] on an empty query', () => { + const chunks = [makeChunk('a.ts', 1, 1)] + const idx = buildIndex(chunks) + expect(idx.search('')).toEqual([]) + expect(idx.search(' ')).toEqual([]) + }) + + it('returns [] when the index has no chunks', () => { + const idx = buildIndex([]) + expect(idx.search('anything')).toEqual([]) + }) +}) + +describe('CspIndex.findRelated', () => { + it('excludes the source chunk from results', () => { + const chunks: Chunk[] = [ + makeChunk('a.ts', 1, 10, 'typescript', 'seed chunk'), + makeChunk('a.ts', 11, 20, 'typescript', 'companion 1'), + makeChunk('b.ts', 1, 5, 'typescript', 'companion 2'), + ] + const idx = buildIndex(chunks) + const seed = chunks[0]! + const results = idx.findRelated(seed, { topK: 5 }) + // Source chunk must not appear in the results. + expect(results.find(r => r.chunk === seed)).toBeUndefined() + expect(results.length).toBeLessThanOrEqual(5) + }) + + it('accepts a SearchResult as the seed', () => { + const chunks: Chunk[] = [ + makeChunk('a.ts', 1, 10, 'typescript', 'seed'), + makeChunk('b.ts', 1, 5, 'typescript', 'other'), + ] + const idx = buildIndex(chunks) + const results = idx.findRelated({ chunk: chunks[0]!, score: 0.5 }) + expect(results.find(r => r.chunk === chunks[0]!)).toBeUndefined() + }) +}) + +describe('CspIndex save → loadFromDisk roundtrip', () => { + let dir: string + + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), 'csp-roundtrip-')) + }) + afterEach(() => { + rmSync(dir, { recursive: true, force: true }) + }) + + it('persists chunks, indexes, and metadata', async () => { + const chunks: Chunk[] = [ + makeChunk('a.ts', 1, 10, 'typescript', 'A'), + makeChunk('b.ts', 1, 5, 'python', 'B'), + ] + const idx = buildIndex(chunks) + await idx.save(dir) + const loaded = await 
CspIndex.loadFromDisk(dir) + expect(loaded.chunks.length).toBe(2) + expect(loaded.chunks.map(c => c.filePath)).toEqual(['a.ts', 'b.ts']) + expect(loaded.stats.totalChunks).toBe(2) + expect(loaded.stats.languages).toEqual({ typescript: 1, python: 1 }) + }) + + it('loadFromDisk throws on a missing directory', async () => { + await expect(CspIndex.loadFromDisk(join(dir, 'nope'))).rejects.toThrow( + /Index not found/, + ) + }) + + it('loadFromDisk throws when a persisted artifact is missing', async () => { + // Dir exists but is empty. + await expect(CspIndex.loadFromDisk(dir)).rejects.toThrow(/Missing:/) + }) +}) + +describe('CspIndex.fromPath', () => { + let dir: string + + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), 'csp-from-path-')) + }) + afterEach(() => { + rmSync(dir, { recursive: true, force: true }) + }) + + it('throws when the path does not exist', async () => { + await expect(CspIndex.fromPath(join(dir, 'nope'))).rejects.toThrow( + /Path does not exist/, + ) + }) + + it('throws when the path exists but is a file', async () => { + const filePath = join(dir, 'a.ts') + writeFileSync(filePath, '// hello\n') + await expect(CspIndex.fromPath(filePath)).rejects.toThrow( + /not a directory/, + ) + }) + + it('builds a CspIndex from a real directory with a small TS file', async () => { + writeFileSync( + join(dir, 'sample.ts'), + 'export function greet(name: string) {\n return `hi ${name}`\n}\n', + ) + const idx = await CspIndex.fromPath(dir, { content: ContentType.Code }) + expect(idx.stats.totalChunks).toBeGreaterThan(0) + expect(idx.stats.indexedFiles).toBe(1) + expect(idx.chunks[0]!.filePath).toBe('sample.ts') + }) +}) diff --git a/src/indexing/index.ts b/src/indexing/index.ts new file mode 100644 index 0000000..701e22c --- /dev/null +++ b/src/indexing/index.ts @@ -0,0 +1,439 @@ +// Port of src/semble/index/index.py + +import { spawn } from 'node:child_process' +import { readFileSync } from 'node:fs' +import { mkdir, mkdtemp, readFile, rm, stat, 
writeFile } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { join, resolve, sep } from 'node:path' +import { fileURLToPath } from 'node:url' +import type { Chunk, IndexStats, SearchResult } from '../types.ts' +import { CallType, ContentType, chunkFromDict, chunkToDict } from '../types.ts' +import { createIndexFromPath } from './create.ts' +import type { Model } from './dense.ts' +import { SelectableBasicBackend, loadModel } from './dense.ts' +import { Bm25Index } from './sparse.ts' +import { search, searchSemantic } from '../search.ts' +import { saveSearchStats } from '../stats.ts' +import { PersistencePath } from './types.ts' + +/** Default content set: code only. */ +export const DEFAULT_CONTENT: readonly ContentType[] = [ContentType.Code] +/** All content types — used by the `--content all` CLI flag. */ +export const ALL_CONTENT: readonly ContentType[] = [ContentType.Code, ContentType.Docs, ContentType.Config] + +/** Timeout (ms) applied to `git clone` invocations. */ +export const GIT_CLONE_TIMEOUT_MS = Number.parseInt(process.env.CSP_CLONE_TIMEOUT ?? '60', 10) * 1000 + +export interface CspIndexConstructorArgs { + model: Model + bm25Index: Bm25Index + semanticIndex: SelectableBasicBackend + chunks: Chunk[] + modelPath: string + root?: string | null + content?: ContentType | readonly ContentType[] +} + +export interface FromPathOptions { + extensions?: readonly string[] + content?: ContentType | readonly ContentType[] + modelPath?: string | null +} + +export interface FromGitOptions extends FromPathOptions { + ref?: string | null +} + +export interface SearchInvocationOptions { + topK?: number + alpha?: number | null + filterLanguages?: readonly string[] + filterPaths?: readonly string[] + rerank?: boolean | null +} + +export interface FindRelatedOptions { + topK?: number +} + +/** Fast local code index with hybrid (semantic + BM25) search. 
*/ +export class CspIndex { + readonly model: Model + readonly chunks: Chunk[] + + private readonly _bm25Index: Bm25Index + private readonly _semanticIndex: SelectableBasicBackend + private readonly _modelPath: string + private readonly _root: string | null + private readonly _content: readonly ContentType[] + private readonly _fileSizes: Record + private readonly _fileMapping: Record + private readonly _languageMapping: Record + + constructor(args: CspIndexConstructorArgs) { + this.model = args.model + this.chunks = args.chunks + this._bm25Index = args.bm25Index + this._semanticIndex = args.semanticIndex + this._modelPath = args.modelPath + this._root = args.root ?? null + this._content = normalizeContent(args.content ?? DEFAULT_CONTENT) + this._fileSizes = this._root ? this._computeFileSizes(this._root) : {} + const mappings = this._populateMapping() + this._fileMapping = mappings.file + this._languageMapping = mappings.language + } + + /** Aggregate index statistics. */ + get stats(): IndexStats { + const languageCounts: Record = {} + for (const chunk of this.chunks) { + if (chunk.language) { + languageCounts[chunk.language] = (languageCounts[chunk.language] ?? 0) + 1 + } + } + return { + indexedFiles: Object.keys(this._fileMapping).length, + totalChunks: this.chunks.length, + languages: languageCounts, + } + } + + /** Create and index a CspIndex from a local directory. */ + static async fromPath( + path: string | URL, + options: FromPathOptions = {}, + ): Promise { + const resolved = await resolveDirectory(path) + const { model, modelPath } = await loadModel(options.modelPath) + const normalized = normalizeContent(options.content ?? DEFAULT_CONTENT) + const created = await createIndexFromPath(resolved, { + model, + ...(options.extensions !== undefined ? 
{ extensions: options.extensions } : {}), + content: normalized, + displayRoot: resolved, + }) + return new CspIndex({ + model, + bm25Index: created.bm25Index, + semanticIndex: created.semanticIndex, + chunks: created.chunks, + modelPath, + root: resolved, + content: normalized, + }) + } + + /** Clone a git repository to a tmp dir, index it, then clean up the clone. */ + static async fromGit( + url: string, + options: FromGitOptions = {}, + ): Promise { + const normalized = normalizeContent(options.content ?? DEFAULT_CONTENT) + const tmpDir = await mkdtemp(join(tmpdir(), 'csp-')) + try { + await runGitClone(url, tmpDir, options.ref ?? null) + + const { model, modelPath } = await loadModel(options.modelPath) + const resolved = resolve(tmpDir) + const created = await createIndexFromPath(resolved, { + model, + ...(options.extensions !== undefined ? { extensions: options.extensions } : {}), + content: normalized, + displayRoot: resolved, + }) + return new CspIndex({ + model, + bm25Index: created.bm25Index, + semanticIndex: created.semanticIndex, + chunks: created.chunks, + modelPath, + root: resolved, + content: normalized, + }) + } + finally { + await rm(tmpDir, { recursive: true, force: true }) + } + } + + /** Load a previously-saved index from disk. */ + static async loadFromDisk(path: string): Promise { + let exists = true + try { + await stat(path) + } + catch { + exists = false + } + if (!exists) throw new Error(`Index not found at ${path}`) + + const persistencePaths = PersistencePath.fromPath(path) + const missing = persistencePaths.nonExisting() + if (missing.length > 0) { + throw new Error(`Index not found at ${path}. 
Missing: ${missing.join(', ')}`) + } + + const bm25Index = Bm25Index.load(persistencePaths.bm25Index) + const semanticIndex = SelectableBasicBackend.load(persistencePaths.semanticIndex) + const metadataRaw = await readFile(persistencePaths.metadata, 'utf8') + const metadata = JSON.parse(metadataRaw) as { + root_path?: string | null + model_path?: string | null + } + const chunkRaw = await readFile(persistencePaths.chunks, 'utf8') + const chunkData = JSON.parse(chunkRaw) as Array> + const chunks = chunkData.map(chunkFromDict) + + const { model, modelPath } = await loadModel(metadata.model_path ?? null) + return new CspIndex({ + model, + bm25Index, + semanticIndex, + chunks, + modelPath, + root: metadata.root_path ?? null, + }) + } + + /** Search the index and return the top-k most relevant chunks. */ + search(query: string, options: SearchInvocationOptions = {}): SearchResult[] { + if (this.chunks.length === 0 || query.trim().length === 0) return [] + + const topK = options.topK ?? 10 + const filterLanguages = options.filterLanguages + const filterPaths = options.filterPaths + const resolvedRerank = options.rerank ?? this._content.includes(ContentType.Code) + const selector = this._getSelectorVector(filterLanguages, filterPaths) + + const results = search( + query, + this.model, + this._semanticIndex, + this._bm25Index, + this.chunks, + topK, + { + alpha: options.alpha ?? null, + ...(selector !== null ? { selector } : {}), + rerank: resolvedRerank, + }, + ) + saveSearchStats(results, CallType.Search, this._fileSizes) + return results + } + + /** Return chunks semantically similar to the given chunk or search result. */ + findRelated( + source: Chunk | SearchResult, + options: FindRelatedOptions = {}, + ): SearchResult[] { + const topK = options.topK ?? 5 + const target = isSearchResult(source) ? source.chunk : source + const selector + = target.language + ? 
this._getSelectorVector([target.language], undefined) + : null + const results = searchSemantic( + target.content, + this.model, + this._semanticIndex, + this.chunks, + topK + 1, + selector, + ) + const filtered = results + .filter(r => !sameChunk(r.chunk, target)) + .slice(0, topK) + saveSearchStats(filtered, CallType.FindRelated, this._fileSizes) + return filtered + } + + /** Persist the index to disk under `path` (created if missing). */ + async save(path: string): Promise { + await mkdir(path, { recursive: true }) + const persistencePaths = PersistencePath.fromPath(path) + this._bm25Index.save(persistencePaths.bm25Index) + this._semanticIndex.save(persistencePaths.semanticIndex) + const chunksAsDict = this.chunks.map(chunkToDict) + await writeFile(persistencePaths.chunks, JSON.stringify(chunksAsDict)) + const metadata = { + root_path: this._root, + time: Date.now() / 1000, + model_path: this._modelPath, + } + await writeFile(persistencePaths.metadata, JSON.stringify(metadata)) + } + + private _populateMapping(): { + file: Record + language: Record + } { + const file: Record = {} + const language: Record = {} + for (let i = 0; i < this.chunks.length; i++) { + const chunk = this.chunks[i]! + if (chunk.language) { + const arr = language[chunk.language] + if (arr) arr.push(i) + else language[chunk.language] = [i] + } + const arr = file[chunk.filePath] + if (arr) arr.push(i) + else file[chunk.filePath] = [i] + } + return { file, language } + } + + private _computeFileSizes(root: string): Record { + const sizes: Record = {} + for (const chunk of this.chunks) { + if (chunk.filePath in sizes) continue + try { + // Mirror Python's `root / chunk.file_path`: absolute paths win, + // relative paths resolve against `root`. 
+ const abs = resolve(root, chunk.filePath) + const buf = readFileSyncSafe(abs) + if (buf !== null) sizes[chunk.filePath] = buf.length + } + catch { + /* swallow */ + } + } + return sizes + } + + private _getSelectorVector( + filterLanguages?: readonly string[], + filterPaths?: readonly string[], + ): number[] | null { + const out = new Set() + for (const language of filterLanguages ?? []) { + const ids = this._languageMapping[language] + if (ids) for (const i of ids) out.add(i) + } + for (const filename of filterPaths ?? []) { + const ids = this._fileMapping[filename] + if (ids) for (const i of ids) out.add(i) + } + if (out.size === 0) return null + return [...out].sort((a, b) => a - b) + } +} + +function normalizeContent( + content: ContentType | readonly ContentType[], +): readonly ContentType[] { + if (Array.isArray(content)) return content + return [content as ContentType] +} + +function isSearchResult(value: Chunk | SearchResult): value is SearchResult { + return (value as SearchResult).chunk !== undefined + && typeof (value as SearchResult).score === 'number' +} + +function sameChunk(a: Chunk, b: Chunk): boolean { + return ( + a.filePath === b.filePath + && a.startLine === b.startLine + && a.endLine === b.endLine + && a.content === b.content + ) +} + +async function resolveDirectory(path: string | URL): Promise { + const raw = path instanceof URL ? fileURLToPath(path) : path + let info + try { + info = await stat(raw) + } + catch { + throw new Error(`Path does not exist: ${raw}`) + } + if (!info.isDirectory()) { + throw new Error(`Path is not a directory: ${raw}`) + } + // Drop any trailing separator for consistency with semble's Path.resolve(). 
+ let resolved = resolve(raw) + if (resolved.length > 1 && resolved.endsWith(sep)) { + resolved = resolved.slice(0, -1) + } + return resolved +} + +function readFileSyncSafe(path: string): string | null { + try { + return readFileSync(path, { encoding: 'utf8' }) + } + catch { + return null + } +} + +/** + * Shell-out to `git clone --depth 1` into `tmpDir`. + * + * Uses `spawn` (not `execFile`) so stdin can be redirected to `/dev/null` — + * this mirrors semble's `subprocess.run(..., stdin=subprocess.DEVNULL)` and + * prevents a hung remote from blocking on a tty prompt. + */ +async function runGitClone(url: string, tmpDir: string, ref: string | null): Promise { + // `--` prevents `url` from being interpreted as a git option (e.g. `--upload-pack=...`). + const args = [ + 'clone', + '--depth', + '1', + ...(ref ? ['--branch', ref] : []), + '--', + url, + tmpDir, + ] + await new Promise((resolvePromise, rejectPromise) => { + let child + try { + child = spawn('git', args, { stdio: ['ignore', 'pipe', 'pipe'] }) + } + catch (err) { + const e = err as NodeJS.ErrnoException + if (e.code === 'ENOENT') { + rejectPromise(new Error('git is not installed or not on PATH')) + return + } + rejectPromise(err as Error) + return + } + let stderr = '' + let timedOut = false + const timer = setTimeout(() => { + timedOut = true + child.kill('SIGTERM') + }, GIT_CLONE_TIMEOUT_MS) + child.stderr?.setEncoding('utf8') + child.stderr?.on('data', (chunk: string) => { + stderr += chunk + }) + child.on('error', (err: NodeJS.ErrnoException) => { + clearTimeout(timer) + if (err.code === 'ENOENT') { + rejectPromise(new Error('git is not installed or not on PATH')) + return + } + rejectPromise(err) + }) + child.on('close', (code) => { + clearTimeout(timer) + if (timedOut) { + rejectPromise(new Error( + `git clone timed out for ${JSON.stringify(url)} (limit: ${GIT_CLONE_TIMEOUT_MS / 1000} s)`, + )) + return + } + if (code !== 0) { + rejectPromise(new Error(`git clone failed for 
${JSON.stringify(url)}:\n${stderr.trim()}`)) + return + } + resolvePromise() + }) + }) +} diff --git a/src/indexing/types.test.ts b/src/indexing/types.test.ts new file mode 100644 index 0000000..ab71014 --- /dev/null +++ b/src/indexing/types.test.ts @@ -0,0 +1,40 @@ +// Tests for src/indexing/types.ts + +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { afterEach, beforeEach, describe, expect, it } from 'bun:test' +import { PersistencePath } from './types.ts' + +describe('PersistencePath', () => { + let dir: string + + beforeEach(() => { + dir = mkdtempSync(join(tmpdir(), 'csp-pp-')) + }) + afterEach(() => { + rmSync(dir, { recursive: true, force: true }) + }) + + it('fromPath produces the expected layout', () => { + const p = PersistencePath.fromPath(dir) + expect(p.chunks).toBe(join(dir, 'chunks.json')) + expect(p.bm25Index).toBe(join(dir, 'bm25_index')) + expect(p.semanticIndex).toBe(join(dir, 'semantic_index')) + expect(p.metadata).toBe(join(dir, 'metadata.json')) + }) + + it('nonExisting returns every path when the dir is empty', () => { + const p = PersistencePath.fromPath(dir) + expect(p.nonExisting().sort()).toEqual( + [p.chunks, p.bm25Index, p.semanticIndex, p.metadata].sort(), + ) + }) + + it('nonExisting returns only the truly missing paths', () => { + const p = PersistencePath.fromPath(dir) + writeFileSync(p.chunks, '[]') + mkdirSync(p.bm25Index, { recursive: true }) + expect(p.nonExisting().sort()).toEqual([p.semanticIndex, p.metadata].sort()) + }) +}) diff --git a/src/indexing/types.ts b/src/indexing/types.ts new file mode 100644 index 0000000..78a2c0e --- /dev/null +++ b/src/indexing/types.ts @@ -0,0 +1,44 @@ +// Port of src/semble/index/types.py + +import { existsSync } from 'node:fs' +import { join } from 'node:path' + +/** + * Resolved on-disk paths used by the index save/load roundtrip. + * + * Mirrors `semble.index.types.PersistencePath`. 
+ */ +export class PersistencePath { + readonly chunks: string + readonly bm25Index: string + readonly semanticIndex: string + readonly metadata: string + + constructor(opts: { + chunks: string + bm25Index: string + semanticIndex: string + metadata: string + }) { + this.chunks = opts.chunks + this.bm25Index = opts.bm25Index + this.semanticIndex = opts.semanticIndex + this.metadata = opts.metadata + } + + /** Return absolute paths that don't currently exist on disk. */ + nonExisting(): string[] { + return [this.chunks, this.bm25Index, this.semanticIndex, this.metadata] + .filter(p => !existsSync(p)) + } + + /** Build a PersistencePath rooted at `base`. */ + static fromPath(base: string): PersistencePath { + return new PersistencePath({ + chunks: join(base, 'chunks.json'), + bm25Index: join(base, 'bm25_index'), + semanticIndex: join(base, 'semantic_index'), + metadata: join(base, 'metadata.json'), + }) + } +} From fde496f774ae9d7e696bb7b083b8b747de87f76a Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:48:01 +0900 Subject: [PATCH 2/2] review(indexing): apply gemini-code-assist feedback (cspindex) - _computeFileSizes: use statSync().size instead of readFileSync().length to avoid loading file content into memory (P1, gemini) and to report true UTF-8 byte counts instead of UTF-16 code units (P3, cubic). - _getSelectorVector: return [] (not null) when filters are set but match no chunks, so search() honors the empty filter instead of falling back to an unfiltered search (P1, gemini/cubic). - search(): early-return [] for zero-match selectors and topK<=0. - resolveDirectory: preserve filesystem-root trailing separators so paths like 'C:\' or '/' survive normalization on Windows/POSIX (P2, cubic). - runGitClone: set stdout to 'ignore' (not 'pipe') so the OS pipe buffer can't fill and deadlock git clone (P2, gemini/cubic). stdin/stderr unchanged. - fromGit: tolerate rm() failures during finally so they never mask the inner error. 
- Drop now-unused readFileSync import and readFileSyncSafe helper. Tests: - Add regression test for filters-with-zero-matches returning []. - Add regression test for topK<=0 returning []. --- src/indexing/index.test.ts | 19 ++++++++++ src/indexing/index.ts | 74 +++++++++++++++++++++++++------------- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/src/indexing/index.test.ts b/src/indexing/index.test.ts index 68ecd39..0af10ee 100644 --- a/src/indexing/index.test.ts +++ b/src/indexing/index.test.ts @@ -82,6 +82,25 @@ describe('CspIndex.search', () => { const idx = buildIndex([]) expect(idx.search('anything')).toEqual([]) }) + + it('returns [] when topK <= 0', () => { + const chunks = [makeChunk('a.ts', 1, 1)] + const idx = buildIndex(chunks) + expect(idx.search('anything', { topK: 0 })).toEqual([]) + expect(idx.search('anything', { topK: -1 })).toEqual([]) + }) + + it('returns [] when filters are set but match nothing (no fallback to unfiltered)', () => { + // Regression: previously an empty selector was treated as "no filter" + // which fell back to an unfiltered search — silently ignoring user intent. 
+ const chunks: Chunk[] = [ + makeChunk('a.ts', 1, 10, 'typescript', 'alpha'), + makeChunk('b.py', 1, 10, 'python', 'beta'), + ] + const idx = buildIndex(chunks) + expect(idx.search('anything', { filterLanguages: ['nonexistent'] })).toEqual([]) + expect(idx.search('anything', { filterPaths: ['nope.ts'] })).toEqual([]) + }) }) describe('CspIndex.findRelated', () => { diff --git a/src/indexing/index.ts b/src/indexing/index.ts index 701e22c..a38fed5 100644 --- a/src/indexing/index.ts +++ b/src/indexing/index.ts @@ -1,10 +1,10 @@ // Port of src/semble/index/index.py import { spawn } from 'node:child_process' -import { readFileSync } from 'node:fs' +import { statSync } from 'node:fs' import { mkdir, mkdtemp, readFile, rm, stat, writeFile } from 'node:fs/promises' import { tmpdir } from 'node:os' -import { join, resolve, sep } from 'node:path' +import { join, parse as parsePath, resolve, sep } from 'node:path' import { fileURLToPath } from 'node:url' import type { Chunk, IndexStats, SearchResult } from '../types.ts' import { CallType, ContentType, chunkFromDict, chunkToDict } from '../types.ts' @@ -153,7 +153,10 @@ export class CspIndex { }) } finally { - await rm(tmpDir, { recursive: true, force: true }) + // Best-effort cleanup. Swallow rm errors so they never mask the original + // exception (Node 22 `rm` can race against AV scanners on Windows). The + // tmp dir lives under the OS tmpdir which is purged by the OS anyway. + await rm(tmpDir, { recursive: true, force: true, maxRetries: 3 }).catch(() => {}) } } @@ -201,10 +204,18 @@ export class CspIndex { if (this.chunks.length === 0 || query.trim().length === 0) return [] const topK = options.topK ?? 10 + if (topK <= 0) return [] + const filterLanguages = options.filterLanguages const filterPaths = options.filterPaths const resolvedRerank = options.rerank ?? 
this._content.includes(ContentType.Code) const selector = this._getSelectorVector(filterLanguages, filterPaths) + // Honor the user's filter when it matches zero chunks — bypass the + // ranking pipeline rather than falling back to an unfiltered search. + if (selector !== null && selector.length === 0) { + saveSearchStats([], CallType.Search, this._fileSizes) + return [] + } const results = search( query, @@ -293,8 +304,11 @@ export class CspIndex { // Mirror Python's `root / chunk.file_path`: absolute paths win, // relative paths resolve against `root`. const abs = resolve(root, chunk.filePath) - const buf = readFileSyncSafe(abs) - if (buf !== null) sizes[chunk.filePath] = buf.length + // `statSync` returns the on-disk byte size — avoids reading the file + // (cheaper, especially for files up to MAX_FILE_BYTES = 1 MB) and + // matches Python's `len(read_text(...))` closely enough for the + // savings-tracking use case while reporting actual UTF-8 byte counts. + sizes[chunk.filePath] = statSync(abs).size } catch { /* swallow */ @@ -307,16 +321,29 @@ export class CspIndex { filterLanguages?: readonly string[], filterPaths?: readonly string[], ): number[] | null { + // Distinguish "no filter requested" (return null → search everything) + // from "filter requested but matched nothing" (return [] → search nothing). + // Semble's Python implementation tests `if selector`, which conflates the two + // and falls back to unfiltered search on empty results — that is a latent + // correctness bug there. We diverge intentionally to honor user intent. + const hasLanguageFilter + = filterLanguages !== undefined && filterLanguages.length > 0 + const hasPathFilter = filterPaths !== undefined && filterPaths.length > 0 + if (!hasLanguageFilter && !hasPathFilter) return null + const out = new Set() - for (const language of filterLanguages ?? 
[]) { - const ids = this._languageMapping[language] - if (ids) for (const i of ids) out.add(i) + if (filterLanguages) { + for (const language of filterLanguages) { + const ids = this._languageMapping[language] + if (ids) for (const i of ids) out.add(i) + } } - for (const filename of filterPaths ?? []) { - const ids = this._fileMapping[filename] - if (ids) for (const i of ids) out.add(i) + if (filterPaths) { + for (const filename of filterPaths) { + const ids = this._fileMapping[filename] + if (ids) for (const i of ids) out.add(i) + } } - if (out.size === 0) return null return [...out].sort((a, b) => a - b) } } @@ -354,23 +381,17 @@ async function resolveDirectory(path: string | URL): Promise { if (!info.isDirectory()) { throw new Error(`Path is not a directory: ${raw}`) } - // Drop any trailing separator for consistency with semble's Path.resolve(). + // Drop any trailing separator for consistency with semble's Path.resolve() + // — but preserve filesystem root paths (`/` on POSIX, `C:\` on Windows) + // since stripping their trailing sep would mutate the resolved location. let resolved = resolve(raw) - if (resolved.length > 1 && resolved.endsWith(sep)) { + const rootOfResolved = parsePath(resolved).root + if (resolved.length > rootOfResolved.length && resolved.endsWith(sep)) { resolved = resolved.slice(0, -1) } return resolved } -function readFileSyncSafe(path: string): string | null { - try { - return readFileSync(path, { encoding: 'utf8' }) - } - catch { - return null - } -} - /** * Shell-out to `git clone --depth 1` into `tmpDir`. * @@ -392,7 +413,12 @@ async function runGitClone(url: string, tmpDir: string, ref: string | null): Pro await new Promise((resolvePromise, rejectPromise) => { let child try { - child = spawn('git', args, { stdio: ['ignore', 'pipe', 'pipe'] }) + // stdin: 'ignore' mirrors Python's `subprocess.DEVNULL` so a stuck remote + // can't block on a tty prompt. 
+ // stdout: 'ignore' avoids the OS pipe buffer filling and deadlocking + // `git clone` when verbose hooks/configs print large amounts of output. + // stderr: 'pipe' so we surface the error message on non-zero exit. + child = spawn('git', args, { stdio: ['ignore', 'ignore', 'pipe'] }) } catch (err) { const e = err as NodeJS.ErrnoException