From f00d6d7f59a5e6a34dcee4f4e2bcc3bd61a1de95 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:20:46 +0900 Subject: [PATCH 1/2] feat(indexing): port BM25 enrichment + index from semble MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ports src/semble/index/sparse.py to src/indexing/sparse.ts: - enrichForBm25(chunk): appends 'stem stem dir1 dir2 dir3' to the chunk content. Stem is repeated to up-weight path matches and only the last 3 parent directory components are kept, matching Python's Path(...).parent.parts[-3:]. - selectorToMask(selector, size): builds a Uint8Array boolean mask the same length as size with 1s at each selector index, or null when selector is null/undefined (mirrors numpy boolean-mask semantics used by bm25s.get_scores). - Bm25Index: minimal Okapi BM25 backend with build / getScores / save / load. Documents are passed pre-tokenized (caller wraps with tokenize(enrichForBm25(chunk))) and getScores returns a Float32Array in doc order, matching bm25s.BM25.get_scores. weightMask zeros out scores for masked-out documents. BM25 backend choice — Option B (inline minimal BM25) over Option A (third-party npm such as wink-bm25-text-search): keeps the unit self-contained while the dependency tree is still settling, and the required surface (build / getScores / save / load with a weight_mask) is small enough to implement and unit-test in <150 LOC. Replacing the backend later is localized to this file. Stopgap structural Chunk type is inlined until src/types.ts lands from Unit 1, matching the pattern established by Unit 3. 
Ref: src/semble/index/sparse.py --- src/indexing/sparse.test.ts | 154 ++++++++++++++++++++++++++ src/indexing/sparse.ts | 214 ++++++++++++++++++++++++++++++++++++ 2 files changed, 368 insertions(+) create mode 100644 src/indexing/sparse.test.ts create mode 100644 src/indexing/sparse.ts diff --git a/src/indexing/sparse.test.ts b/src/indexing/sparse.test.ts new file mode 100644 index 0000000..bbccdd9 --- /dev/null +++ b/src/indexing/sparse.test.ts @@ -0,0 +1,154 @@ +import { mkdtemp, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import path from 'node:path' +import { afterEach, beforeEach, describe, expect, test } from 'bun:test' + +import { Bm25Index, type Chunk, enrichForBm25, selectorToMask } from './sparse.ts' + +function makeChunk(overrides: Partial & { filePath: string, content?: string }): Chunk { + return { + content: overrides.content ?? '', + filePath: overrides.filePath, + startLine: overrides.startLine ?? 1, + endLine: overrides.endLine ?? 1, + language: overrides.language ?? null, + } +} + +describe('enrichForBm25', () => { + test('appends repeated stem and last 3 dir parts (2-part dir)', () => { + // Mirrors upstream Python: Path('src/utils/format.ts').parent.parts == ('src', 'utils'), + // so last-3 is the full ['src', 'utils']. + const out = enrichForBm25(makeChunk({ filePath: 'src/utils/format.ts', content: 'hello world' })) + expect(out).toBe('hello world format format src utils') + }) + + test('trims to the last 3 dir parts (4-part dir)', () => { + const out = enrichForBm25(makeChunk({ filePath: 'a/b/c/d/foo.py', content: 'x' })) + expect(out).toBe('x foo foo b c d') + }) + + test('handles a top-level file with no directory components', () => { + const out = enrichForBm25(makeChunk({ filePath: 'foo.py', content: 'x' })) + expect(out).toBe('x foo foo ') + }) + + test('drops "." 
pseudo-segments from relative paths', () => { + const out = enrichForBm25(makeChunk({ filePath: './a/b/foo.ts', content: 'x' })) + expect(out).toBe('x foo foo a b') + }) +}) + +describe('selectorToMask', () => { + test('builds a 0/1 mask the same length as `size`', () => { + const mask = selectorToMask(new Uint32Array([0, 2, 5]), 6) + expect(mask).not.toBeNull() + expect(Array.from(mask!)).toEqual([1, 0, 1, 0, 0, 1]) + }) + + test('returns null for a null selector', () => { + expect(selectorToMask(null, 6)).toBeNull() + }) + + test('returns null for an undefined selector', () => { + expect(selectorToMask(undefined, 6)).toBeNull() + }) + + test('ignores indices outside the mask bounds', () => { + // Out-of-bounds indices are silently dropped rather than crashing — + // upstream relies on the selector being well-formed but we want to be + // defensive in the TS port. + const mask = selectorToMask(new Uint32Array([0, 10]), 3) + expect(Array.from(mask!)).toEqual([1, 0, 0]) + }) +}) + +describe('Bm25Index.build / getScores', () => { + test('ranks documents containing the query term higher', () => { + const index = Bm25Index.build([ + ['hello', 'world'], + ['hello'], + ['world'], + ]) + const scores = index.getScores(['hello']) + expect(scores).toHaveLength(3) + expect(scores[0]).toBeGreaterThan(0) + expect(scores[1]).toBeGreaterThan(0) + expect(scores[2]).toBe(0) + }) + + test('returns zero scores for unknown query tokens', () => { + const index = Bm25Index.build([['hello'], ['world']]) + const scores = index.getScores(['unknown']) + expect(Array.from(scores)).toEqual([0, 0]) + }) + + test('returns an empty-array-equivalent for an empty corpus', () => { + const index = Bm25Index.build([]) + const scores = index.getScores(['anything']) + expect(scores).toHaveLength(0) + }) + + test('returns zero scores when query tokens are empty', () => { + const index = Bm25Index.build([['hello'], ['world']]) + const scores = index.getScores([]) + expect(Array.from(scores)).toEqual([0, 
0]) + }) + + test('weightMask zeros out masked-out documents', () => { + const index = Bm25Index.build([ + ['hello', 'world'], + ['hello'], + ['world'], + ]) + // Mask in docs 0 and 2 only. + const mask = new Uint8Array([1, 0, 1]) + const scores = index.getScores(['hello'], mask) + expect(scores[0]).toBeGreaterThan(0) + expect(scores[1]).toBe(0) + expect(scores[2]).toBe(0) // doc 2 doesn't contain 'hello' + }) + + test('weightMask only suppresses scores; matched-in docs are unchanged', () => { + const index = Bm25Index.build([ + ['hello', 'world'], + ['hello'], + ['world'], + ]) + const baseline = index.getScores(['hello']) + const masked = index.getScores(['hello'], new Uint8Array([1, 1, 1])) + expect(Array.from(masked)).toEqual(Array.from(baseline)) + }) + + test('repeated query tokens do not compound scores', () => { + const index = Bm25Index.build([['hello']]) + const single = index.getScores(['hello']) + const repeated = index.getScores(['hello', 'hello', 'hello']) + expect(Array.from(repeated)).toEqual(Array.from(single)) + }) +}) + +describe('Bm25Index.save / load', () => { + let tmp: string + + beforeEach(async () => { + tmp = await mkdtemp(path.join(tmpdir(), 'csp-bm25-')) + }) + + afterEach(async () => { + await rm(tmp, { recursive: true, force: true }) + }) + + test('round-trips an index and preserves scores', async () => { + const index = Bm25Index.build([ + ['alpha', 'beta'], + ['alpha'], + ['beta', 'gamma'], + ]) + await index.save(tmp) + const loaded = await Bm25Index.load(tmp) + const original = index.getScores(['alpha']) + const restored = loaded.getScores(['alpha']) + expect(Array.from(restored)).toEqual(Array.from(original)) + }) +}) diff --git a/src/indexing/sparse.ts b/src/indexing/sparse.ts new file mode 100644 index 0000000..5b75fd4 --- /dev/null +++ b/src/indexing/sparse.ts @@ -0,0 +1,214 @@ +// Port of src/semble/index/sparse.py +// +// Implements the two helpers from the upstream module plus a minimal BM25 +// index (Bm25Index) that stands 
in for Python's `bm25s` library. +// +// BM25 backend choice (see PR body for full discussion): +// Option B (inline minimal Okapi BM25 with k1=1.5, b=0.75) was chosen over a +// third-party npm such as wink-bm25-text-search because: +// - The dependency tree stays self-contained while the project is still +// a scaffold (no other indexing deps are pinned yet). +// - The required surface is tiny (build / getScores / save / load) and +// getScores must respect a weight_mask that maps cleanly to BM25's +// per-document scoring loop. +// - Replacing this backend later is a localized change because all +// callers go through the Bm25Index class. + +import { mkdir, readFile, writeFile } from 'node:fs/promises' +import path from 'node:path' + +// Stopgap structural type until ./types.ts lands from Unit 1. +// Mirrors semble.types.Chunk with camelCase field names per +// @pleaseai/csp public-API conventions. +export interface Chunk { + content: string + filePath: string + startLine: number + endLine: number + language?: string | null +} + +/** + * Append file path components to BM25 content to boost path-based queries. + * + * Assumes `chunk.filePath` is already repo-relative (set during indexing) so + * machine-specific directory components are never indexed. The stem is + * repeated twice to up-weight file-path matches in BM25. + */ +export function enrichForBm25(chunk: Chunk): string { + const parsed = path.parse(chunk.filePath) + const stem = parsed.name + const dirParts = parsed.dir + .split(/[/\\]/) + .filter(part => part !== '' && part !== '.' && part !== '/') + const dirText = dirParts.slice(-3).join(' ') + return `${chunk.content} ${stem} ${stem} ${dirText}` +} + +/** + * Convert a selector array of indices into a boolean mask of length `size`. + * + * Returns `null` when `selector` is null/undefined so callers can skip mask + * application entirely (matching the upstream semantics). 
+ */ +export function selectorToMask( + selector: Uint32Array | null | undefined, + size: number, +): Uint8Array | null { + if (selector === null || selector === undefined) + return null + const mask = new Uint8Array(size) + for (const idx of selector) { + if (idx < size) + mask[idx] = 1 + } + return mask +} + +// --------------------------------------------------------------------------- +// Minimal BM25 index +// --------------------------------------------------------------------------- + +// Standard Okapi BM25 hyperparameters used by bm25s' default Lucene scorer. +const K1 = 1.5 +const B = 0.75 + +interface Bm25State { + // Number of documents indexed. + numDocs: number + // Document length (token count) per document, in doc order. + docLengths: Float32Array + // Average document length across the corpus. + avgDocLength: number + // Term -> array of [docId, termFreq] entries (postings list). + postings: Map> + // Term -> document frequency (count of docs containing the term). + docFreq: Map +} + +/** + * Minimal BM25 index supporting build / getScores / save / load. + * + * Documents are passed pre-tokenized (callers use `tokenize(enrichForBm25(...))`). + * `getScores` returns a Float32Array of per-document scores in doc order, + * matching the bm25s.BM25.get_scores contract used by upstream. + */ +export class Bm25Index { + // Exposed only for save() — kept private to consumers. + readonly #state: Bm25State + + private constructor(state: Bm25State) { + this.#state = state + } + + /** Build an index from an array of pre-tokenized documents. */ + static build(documents: string[][]): Bm25Index { + const numDocs = documents.length + const docLengths = new Float32Array(numDocs) + const postings = new Map>() + const docFreq = new Map() + + let totalLen = 0 + for (let docId = 0; docId < numDocs; docId++) { + const tokens = documents[docId] ?? [] + docLengths[docId] = tokens.length + totalLen += tokens.length + + // Term frequencies for this document. 
+ const tf = new Map() + for (const token of tokens) + tf.set(token, (tf.get(token) ?? 0) + 1) + + for (const [term, freq] of tf) { + let list = postings.get(term) + if (list === undefined) { + list = [] + postings.set(term, list) + } + list.push([docId, freq]) + docFreq.set(term, (docFreq.get(term) ?? 0) + 1) + } + } + + const avgDocLength = numDocs > 0 ? totalLen / numDocs : 0 + + return new Bm25Index({ numDocs, docLengths, avgDocLength, postings, docFreq }) + } + + /** + * Compute BM25 scores for the given query tokens. + * + * Returns a Float32Array of length numDocs, in document order. When + * `weightMask` is provided, documents with mask[i] === 0 receive a score + * of 0 (matching bm25s.BM25.get_scores(..., weight_mask=mask) semantics). + */ + getScores(queryTokens: string[], weightMask?: Uint8Array | null): Float32Array { + const { numDocs, docLengths, avgDocLength, postings, docFreq } = this.#state + const scores = new Float32Array(numDocs) + if (queryTokens.length === 0 || numDocs === 0) + return scores + + // De-duplicate query tokens — repeated terms shouldn't compound BM25 scores. + const uniqueTerms = new Set(queryTokens) + + for (const term of uniqueTerms) { + const list = postings.get(term) + if (list === undefined) + continue + const df = docFreq.get(term) ?? 0 + // Lucene/Robertson IDF: log(1 + (N - df + 0.5) / (df + 0.5)). + const idf = Math.log(1 + (numDocs - df + 0.5) / (df + 0.5)) + + for (const [docId, freq] of list) { + const dl = docLengths[docId] ?? 0 + const denom = freq + K1 * (1 - B + (B * dl) / (avgDocLength || 1)) + const contrib = (idf * (freq * (K1 + 1))) / (denom || 1) + scores[docId] = (scores[docId] ?? 0) + contrib + } + } + + if (weightMask) { + for (let i = 0; i < numDocs; i++) { + if (!(weightMask[i] ?? 0)) + scores[i] = 0 + } + } + + return scores + } + + /** Persist the index to `dir`. Creates the directory if it doesn't exist. 
*/ + async save(dir: string): Promise { + await mkdir(dir, { recursive: true }) + const { numDocs, docLengths, avgDocLength, postings, docFreq } = this.#state + const serialized = { + version: 1, + numDocs, + avgDocLength, + docLengths: Array.from(docLengths), + postings: Array.from(postings.entries()), + docFreq: Array.from(docFreq.entries()), + } + await writeFile(path.join(dir, 'bm25.json'), JSON.stringify(serialized)) + } + + /** Load an index previously persisted with `save`. */ + static async load(dir: string): Promise { + const raw = await readFile(path.join(dir, 'bm25.json'), 'utf8') + const parsed = JSON.parse(raw) as { + version: number + numDocs: number + avgDocLength: number + docLengths: number[] + postings: Array<[string, Array<[number, number]>]> + docFreq: Array<[string, number]> + } + return new Bm25Index({ + numDocs: parsed.numDocs, + docLengths: Float32Array.from(parsed.docLengths), + avgDocLength: parsed.avgDocLength, + postings: new Map(parsed.postings), + docFreq: new Map(parsed.docFreq), + }) + } +} From f47f867fd9f7c911a1c69ecf8584b9956e692b84 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:43:55 +0900 Subject: [PATCH 2/2] review(indexing): apply gemini-code-assist feedback (sparse) - enrichForBm25: normalize backslashes and use path.posix.parse so repo-relative paths produce the same enrichment on Windows and POSIX hosts. Filter '.' segments only (no longer need '/' since splitting on a single delimiter). - Bm25Index.getScores: skip masked-out documents inside the postings iteration instead of zeroing them in a separate O(N) pass. Float32Array defaults to 0 so the result is identical. - Add a backslash-normalization test to lock in cross-platform behavior. 
--- src/indexing/sparse.test.ts | 8 ++++++++ src/indexing/sparse.ts | 25 +++++++++++++++---------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/indexing/sparse.test.ts b/src/indexing/sparse.test.ts index bbccdd9..9566359 100644 --- a/src/indexing/sparse.test.ts +++ b/src/indexing/sparse.test.ts @@ -37,6 +37,14 @@ describe('enrichForBm25', () => { const out = enrichForBm25(makeChunk({ filePath: './a/b/foo.ts', content: 'x' })) expect(out).toBe('x foo foo a b') }) + + test('normalizes backslashes for cross-platform consistency', () => { + // Repo-relative paths must produce the same enrichment regardless of + // host OS — Windows hosts may surface back-slashes if a caller forgets + // to normalize before passing the chunk through. + const out = enrichForBm25(makeChunk({ filePath: 'src\\utils\\format.ts', content: 'hello world' })) + expect(out).toBe('hello world format format src utils') + }) }) describe('selectorToMask', () => { diff --git a/src/indexing/sparse.ts b/src/indexing/sparse.ts index 5b75fd4..16a235b 100644 --- a/src/indexing/sparse.ts +++ b/src/indexing/sparse.ts @@ -34,13 +34,20 @@ export interface Chunk { * Assumes `chunk.filePath` is already repo-relative (set during indexing) so * machine-specific directory components are never indexed. The stem is * repeated twice to up-weight file-path matches in BM25. + * + * Repo-relative paths are normalized to POSIX (forward slashes) before + * parsing so every host produces the same enriched text. The host-native + * `path.parse` is platform-dependent: the Windows flavor splits on both `/` + * and `\`, while the POSIX flavor treats `\` as an ordinary filename + * character, so a back-slashed path would yield different BM25 tokens. 
*/ export function enrichForBm25(chunk: Chunk): string { - const parsed = path.parse(chunk.filePath) + const normalized = chunk.filePath.replace(/\\/g, '/') + const parsed = path.posix.parse(normalized) const stem = parsed.name const dirParts = parsed.dir - .split(/[/\\]/) - .filter(part => part !== '' && part !== '.' && part !== '/') + .split('/') + .filter(part => part !== '' && part !== '.') const dirText = dirParts.slice(-3).join(' ') return `${chunk.content} ${stem} ${stem} ${dirText}` } @@ -160,6 +167,11 @@ export class Bm25Index { const idf = Math.log(1 + (numDocs - df + 0.5) / (df + 0.5)) for (const [docId, freq] of list) { + // Skip masked-out documents inside the posting-list iteration so we + // avoid the work entirely; Float32Array entries default to 0 so the + // final scores match the post-loop zeroing approach. + if (weightMask && !weightMask[docId]) + continue const dl = docLengths[docId] ?? 0 const denom = freq + K1 * (1 - B + (B * dl) / (avgDocLength || 1)) const contrib = (idf * (freq * (K1 + 1))) / (denom || 1) @@ -167,13 +179,6 @@ export class Bm25Index { } } - if (weightMask) { - for (let i = 0; i < numDocs; i++) { - if (!(weightMask[i] ?? 0)) - scores[i] = 0 - } - } - return scores }