From a851361c43002ff3aeaba2a4351afdc3289438ad Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:19:05 +0900 Subject: [PATCH 1/2] feat(indexing): port Model2Vec embedding + vector backend from semble MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port src/semble/index/dense.py to TypeScript with stub Model2Vec inference (Option C from the porting plan). Exports: - DEFAULT_MODEL_NAME = 'minishlab/potion-code-16M' - loadModel(modelPath?) — async, cached per path - embedChunks(model, chunks) — Float32Array per chunk - SelectableBasicBackend — cosine-distance backend with optional Uint32Array selector for index filtering, plus save/load roundtrip The Model2Vec model loading is a stub: deterministic, hash-seeded random vectors keep the API contract exercised by tests without requiring HuggingFace network I/O. Real model integration is flagged with a TODO and is explicitly out of scope per the coordinator's e2e recipe. --- src/indexing/dense.test.ts | 175 +++++++++++++++++++++++ src/indexing/dense.ts | 278 +++++++++++++++++++++++++++++++++++++ 2 files changed, 453 insertions(+) create mode 100644 src/indexing/dense.test.ts create mode 100644 src/indexing/dense.ts diff --git a/src/indexing/dense.test.ts b/src/indexing/dense.test.ts new file mode 100644 index 0000000..63802b2 --- /dev/null +++ b/src/indexing/dense.test.ts @@ -0,0 +1,175 @@ +// Port of src/semble/index/dense.py — unit tests + +import { mkdtemp, rm } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { afterEach, beforeEach, describe, expect, it } from 'bun:test' +import { + DEFAULT_MODEL_NAME, + embedChunks, + loadModel, + SelectableBasicBackend, + type Chunk, +} from './dense' + +function chunk(content: string): Chunk { + return { + content, + filePath: 'a.ts', + startLine: 1, + endLine: 1, + language: 'typescript', + } +} + +describe('loadModel', () => { + it('resolves with a Model exposing a 
positive dim', async () => { + const { model, modelPath } = await loadModel() + expect(modelPath).toBe(DEFAULT_MODEL_NAME) + expect(model.dim).toBeGreaterThan(0) + }) + + it('caches models by path', async () => { + const a = await loadModel('test/path-A') + const b = await loadModel('test/path-A') + expect(a.model).toBe(b.model) + }) + + it('returns distinct entries for different paths', async () => { + const a = await loadModel('test/path-X') + const b = await loadModel('test/path-Y') + expect(a.modelPath).toBe('test/path-X') + expect(b.modelPath).toBe('test/path-Y') + }) +}) + +describe('embedChunks', () => { + it('returns [] for an empty input', async () => { + const { model } = await loadModel() + expect(embedChunks(model, [])).toEqual([]) + }) + + it('returns one vector per chunk with model.dim length', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('hello'), chunk('world')]) + expect(vectors).toHaveLength(2) + for (const v of vectors) { + expect(v).toBeInstanceOf(Float32Array) + expect(v.length).toBe(model.dim) + } + }) + + it('is deterministic: same content → same vector', async () => { + const { model } = await loadModel() + const [v1] = embedChunks(model, [chunk('def search()')]) + const [v2] = embedChunks(model, [chunk('def search()')]) + expect(v1).toBeDefined() + expect(v2).toBeDefined() + expect(Array.from(v1!)).toEqual(Array.from(v2!)) + }) + + it('produces different vectors for different content', async () => { + const { model } = await loadModel() + const [v1, v2] = embedChunks(model, [chunk('foo'), chunk('bar')]) + expect(v1).toBeDefined() + expect(v2).toBeDefined() + expect(Array.from(v1!)).not.toEqual(Array.from(v2!)) + }) +}) + +describe('SelectableBasicBackend.query', () => { + it('throws when k < 1', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('a'), chunk('b')]) + const backend = new SelectableBasicBackend(vectors) + expect(() => 
backend.query([vectors[0]!], 0)).toThrow() + }) + + it('returns top-k (index, distance) pairs sorted by distance', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('a'), chunk('b'), chunk('c'), chunk('d')]) + const backend = new SelectableBasicBackend(vectors) + + const results = backend.query([vectors[0]!], 3) + expect(results).toHaveLength(1) + const hits = results[0]! + expect(hits).toHaveLength(3) + // Self should be the nearest with ~0 distance. + expect(hits[0]![0]).toBe(0) + expect(hits[0]![1]).toBeCloseTo(0, 5) + // Distances must be monotonically non-decreasing. + for (let i = 1; i < hits.length; i++) { + expect(hits[i]![1]).toBeGreaterThanOrEqual(hits[i - 1]![1]) + } + }) + + it('only returns indices from the selector pool', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('a'), chunk('b'), chunk('c'), chunk('d')]) + const backend = new SelectableBasicBackend(vectors) + + const selector = new Uint32Array([1, 2]) + const results = backend.query([vectors[0]!], 5, selector) + expect(results).toHaveLength(1) + const hits = results[0]! + // effective_k = min(5, 4, 2) = 2. 
+ expect(hits).toHaveLength(2) + const indices = hits.map(h => h[0]) + for (const i of indices) { + expect([1, 2]).toContain(i) + } + }) + + it('handles multiple query vectors', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('a'), chunk('b'), chunk('c')]) + const backend = new SelectableBasicBackend(vectors) + + const results = backend.query([vectors[0]!, vectors[1]!], 2) + expect(results).toHaveLength(2) + expect(results[0]![0]![0]).toBe(0) + expect(results[1]![0]![0]).toBe(1) + }) + + it('caps effective_k at the number of stored vectors', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('a'), chunk('b')]) + const backend = new SelectableBasicBackend(vectors) + const results = backend.query([vectors[0]!], 10) + expect(results[0]!).toHaveLength(2) + }) +}) + +describe('SelectableBasicBackend save/load', () => { + let dir: string + beforeEach(async () => { + dir = await mkdtemp(join(tmpdir(), 'csp-dense-')) + }) + afterEach(async () => { + await rm(dir, { recursive: true, force: true }) + }) + + it('roundtrip preserves vectors and query results', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('alpha'), chunk('beta'), chunk('gamma')]) + const original = new SelectableBasicBackend(vectors) + await original.save(dir) + + const loaded = await SelectableBasicBackend.load(dir) + expect(loaded.vectors).toHaveLength(original.vectors.length) + expect(loaded.dim).toBe(original.dim) + + for (let i = 0; i < original.vectors.length; i++) { + const a = original.vectors[i]! + const b = loaded.vectors[i]! 
+ expect(b.length).toBe(a.length) + for (let j = 0; j < a.length; j++) { + expect(b[j]!).toBeCloseTo(a[j]!, 6) + } + } + + const origResults = original.query([vectors[0]!], 2) + const loadedResults = loaded.query([vectors[0]!], 2) + expect(loadedResults[0]!.map(h => h[0])).toEqual(origResults[0]!.map(h => h[0])) + }) +}) diff --git a/src/indexing/dense.ts b/src/indexing/dense.ts new file mode 100644 index 0000000..6feb04e --- /dev/null +++ b/src/indexing/dense.ts @@ -0,0 +1,278 @@ +// Port of src/semble/index/dense.py +// +// Loads a Model2Vec model, embeds chunks, and provides a vector +// backend with cosine distance + optional index-selector filtering. +// +// NOTE: This unit ships a STUB Model2Vec implementation. `loadModel` and +// `embedChunks` do not download or run a real Model2Vec model. Instead +// they produce deterministic, hash-seeded float vectors so that the API +// contract is exercised by tests without requiring network I/O. +// TODO(dense): integrate real Model2Vec model loading. + +import { mkdir, readFile, writeFile } from 'node:fs/promises' +import { join } from 'node:path' + +/** + * Default Model2Vec model name (kept identical to semble for parity). + */ +export const DEFAULT_MODEL_NAME = 'minishlab/potion-code-16M' + +/** + * Default embedding dimension for the stub model. The real + * `potion-code-16M` model emits 256-dim vectors, but the stub is + * dimension-agnostic — pick something small enough for fast tests. + */ +const _DEFAULT_STUB_DIM = 256 + +/** + * Minimal chunk shape this module consumes. We only need `content`, + * so this is inlined rather than imported from a (not-yet-existing) + * top-level `types.ts`. When `src/types.ts` lands, swap this for + * `import type { Chunk } from '../types.ts'`. + */ +export interface Chunk { + content: string + // Other fields (filePath, startLine, endLine, language) are unused + // here but allowed via the index signature so callers can pass full + // Chunk objects without type narrowing. 
+ [key: string]: unknown +} + +/** + * Loaded Model2Vec model. The real model exposes `.encode(texts)`; + * the stub provides the same shape plus a `dim` accessor. + */ +export interface Model { + readonly dim: number + encode: (texts: string[]) => Float32Array[] +} + +const _MODEL_CACHE = new Map() + +/** + * Deterministic 32-bit hash (FNV-1a) for stub seeding. + */ +function fnv1a(s: string): number { + let h = 0x811C9DC5 + for (let i = 0; i < s.length; i++) { + h ^= s.charCodeAt(i) + h = Math.imul(h, 0x01000193) >>> 0 + } + return h >>> 0 +} + +/** + * Mulberry32 PRNG — fast, deterministic, good enough for stub vectors. + */ +function mulberry32(seed: number): () => number { + let a = seed >>> 0 + return () => { + a = (a + 0x6D2B79F5) >>> 0 + let t = a + t = Math.imul(t ^ (t >>> 15), t | 1) + t ^= t + Math.imul(t ^ (t >>> 7), t | 61) + return ((t ^ (t >>> 14)) >>> 0) / 4294967296 + } +} + +/** + * Build a deterministic unit-length vector from a string. Identical + * input strings always produce identical vectors, satisfying the + * "embedding is a pure function of content" contract. + */ +function stubEmbed(text: string, dim: number): Float32Array { + const rng = mulberry32(fnv1a(text)) + const v = new Float32Array(dim) + let norm = 0 + for (let i = 0; i < dim; i++) { + // Box-Muller-ish: cheap normal-ish distribution out of two uniforms. + const u1 = Math.max(rng(), 1e-12) + const u2 = rng() + const g = Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2) + v[i] = g + norm += g * g + } + norm = Math.sqrt(norm) || 1 + for (let i = 0; i < dim; i++) v[i] = v[i]! / norm + return v +} + +function makeStubModel(dim: number): Model { + return { + dim, + encode(texts: string[]): Float32Array[] { + return texts.map(t => stubEmbed(t, dim)) + }, + } +} + +/** + * Load (and cache) a Model2Vec model. Always async, mirroring the + * eventual real implementation that performs an HF download. 
+ * + * @param modelPath Optional model id; defaults to {@link DEFAULT_MODEL_NAME}. + */ +export async function loadModel( + modelPath?: string, +): Promise<{ model: Model, modelPath: string }> { + const resolved = modelPath ?? DEFAULT_MODEL_NAME + let model = _MODEL_CACHE.get(resolved) + if (!model) { + // TODO(dense): replace with real Model2Vec download + inference. + model = makeStubModel(_DEFAULT_STUB_DIM) + _MODEL_CACHE.set(resolved, model) + } + return Promise.resolve({ model, modelPath: resolved }) +} + +/** + * Embed chunks using the configured model. Returns one row per chunk; + * the empty list maps to an empty result (matching semble). + */ +export function embedChunks(model: Model, chunks: Chunk[]): Float32Array[] { + if (chunks.length === 0) return [] + return model.encode(chunks.map(c => c.content)) +} + +// --------------------------------------------------------------------------- +// SelectableBasicBackend +// --------------------------------------------------------------------------- + +export interface BasicArgs { + /** Distance metric — for parity we only support cosine. */ + metric?: 'cosine' +} + +/** + * Pre-normalise a vector in place (L2). Zero vectors stay zero. + */ +function normalizeInPlace(v: Float32Array): void { + let n = 0 + for (let i = 0; i < v.length; i++) n += v[i]! * v[i]! + n = Math.sqrt(n) + if (n === 0) return + for (let i = 0; i < v.length; i++) v[i] = v[i]! / n +} + +function dot(a: Float32Array, b: Float32Array): number { + let s = 0 + const n = a.length + for (let i = 0; i < n; i++) s += a[i]! * b[i]! + return s +} + +/** + * In-memory vector backend with cosine distance and optional + * candidate-selector filtering — TS port of + * `SelectableBasicBackend(CosineBasicBackend)` from semble. + */ +export class SelectableBasicBackend { + /** Pre-normalised row vectors. 
*/ + readonly vectors: Float32Array[] + readonly arguments: BasicArgs + readonly dim: number + + constructor(vectors: Float32Array[], options: BasicArgs = {}) { + this.arguments = { metric: 'cosine', ...options } + // Defensive copy + normalise so cosine distance reduces to (1 - dot). + this.vectors = vectors.map((v) => { + const copy = new Float32Array(v) + normalizeInPlace(copy) + return copy + }) + this.dim = this.vectors[0]?.length ?? 0 + } + + /** + * Batched k-NN query. + * + * @param queryVectors One row per query (raw — will be normalised here). + * @param k Number of neighbours per query. + * @param selector Optional pool of candidate indices; results are + * guaranteed to come from this set. + * @returns For each query, an array of `[chunkIndex, cosineDistance]` + * sorted by ascending distance. + * @throws Error if `k < 1`. + */ + query( + queryVectors: Float32Array[], + k: number, + selector?: Uint32Array, + ): Array> { + if (k < 1) throw new Error(`k should be >= 1, is now ${k}`) + + const numVectors = this.vectors.length + let effectiveK = Math.min(k, numVectors) + if (selector !== undefined) effectiveK = Math.min(effectiveK, selector.length) + + const out: Array> = [] + if (effectiveK === 0) { + for (let i = 0; i < queryVectors.length; i++) out.push([]) + return out + } + + for (const raw of queryVectors) { + const q = new Float32Array(raw) + normalizeInPlace(q) + + const candidatePool = selector ?? null + const poolSize = candidatePool ? candidatePool.length : numVectors + const distances = new Float64Array(poolSize) + for (let i = 0; i < poolSize; i++) { + const vecIdx = candidatePool ? candidatePool[i]! : i + const target = this.vectors[vecIdx]! + distances[i] = 1 - dot(q, target) + } + + // Build [poolIdx, dist] pairs and fully sort them by ascending distance; the top-k slice happens below. 
+ const pairs: Array<[number, number]> = Array.from( + { length: poolSize }, + (_, i) => [i, distances[i]!], + ) + pairs.sort((a, b) => a[1] - b[1]) + const top = pairs.slice(0, effectiveK) + + // Map pool-relative indices back to absolute chunk indices. + const mapped: Array<[number, number]> = top.map(([poolIdx, dist]) => [ + candidatePool ? candidatePool[poolIdx]! : poolIdx, + dist, + ]) + out.push(mapped) + } + + return out + } + + /** + * Persist vectors + args to `<dir>/vectors.bin` and `<dir>/args.json`. + * Format is local to csp — vicinity's own format is not preserved. + */ + async save(dir: string): Promise { + await mkdir(dir, { recursive: true }) + const rows = this.vectors.length + const dim = this.dim + const buf = new Float32Array(rows * dim) + for (let r = 0; r < rows; r++) buf.set(this.vectors[r]!, r * dim) + const meta = { rows, dim, arguments: this.arguments } + await writeFile(join(dir, 'vectors.bin'), Buffer.from(buf.buffer, buf.byteOffset, buf.byteLength)) + await writeFile(join(dir, 'args.json'), JSON.stringify(meta)) + } + + /** + * Inverse of {@link SelectableBasicBackend.save}. + */ + static async load(dir: string): Promise { + const metaRaw = await readFile(join(dir, 'args.json'), 'utf8') + const meta = JSON.parse(metaRaw) as { rows: number, dim: number, arguments: BasicArgs } + const bytes = await readFile(join(dir, 'vectors.bin')) + // Copy into a fresh ArrayBuffer so alignment is guaranteed. 
+ const ab = new ArrayBuffer(bytes.byteLength) + new Uint8Array(ab).set(bytes) + const flat = new Float32Array(ab) + const vectors: Float32Array[] = [] + for (let r = 0; r < meta.rows; r++) { + vectors.push(flat.slice(r * meta.dim, (r + 1) * meta.dim)) + } + return new SelectableBasicBackend(vectors, meta.arguments) + } +} From b817e45502f664c767f2c22e499dfd4bd9f3d32f Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:43:32 +0900 Subject: [PATCH 2/2] review(indexing): apply gemini-code-assist feedback (dense) --- src/indexing/dense.test.ts | 47 +++++++++++++++++++++++++++++++++++++- src/indexing/dense.ts | 32 ++++++++++++++++++++++++-- 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/src/indexing/dense.test.ts b/src/indexing/dense.test.ts index 63802b2..73a39b2 100644 --- a/src/indexing/dense.test.ts +++ b/src/indexing/dense.test.ts @@ -1,6 +1,6 @@ // Port of src/semble/index/dense.py — unit tests -import { mkdtemp, rm } from 'node:fs/promises' +import { mkdtemp, rm, writeFile } from 'node:fs/promises' import { tmpdir } from 'node:os' import { join } from 'node:path' import { afterEach, beforeEach, describe, expect, it } from 'bun:test' @@ -85,6 +85,33 @@ describe('SelectableBasicBackend.query', () => { expect(() => backend.query([vectors[0]!], 0)).toThrow() }) + it('throws when constructed with inconsistent vector dimensions', async () => { + const { model } = await loadModel() + const [v0] = embedChunks(model, [chunk('a')]) + const truncated = new Float32Array(v0!.length - 1) + expect(() => new SelectableBasicBackend([v0!, truncated])).toThrow( + /Inconsistent vector dimensions/, + ) + }) + + it('throws when a query vector dimension differs from the index dim', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('a'), chunk('b')]) + const backend = new SelectableBasicBackend(vectors) + const bad = new Float32Array(backend.dim - 1) + expect(() => backend.query([bad], 1)).toThrow(/Query 
vector dimension mismatch/) + }) + + it('throws when a selector index is out of bounds', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('a'), chunk('b')]) + const backend = new SelectableBasicBackend(vectors) + const selector = new Uint32Array([0, 5]) + expect(() => backend.query([vectors[0]!], 1, selector)).toThrow( + /Selector index out of bounds/, + ) + }) + it('returns top-k (index, distance) pairs sorted by distance', async () => { const { model } = await loadModel() const vectors = embedChunks(model, [chunk('a'), chunk('b'), chunk('c'), chunk('d')]) @@ -172,4 +199,22 @@ describe('SelectableBasicBackend save/load', () => { const loadedResults = loaded.query([vectors[0]!], 2) expect(loadedResults[0]!.map(h => h[0])).toEqual(origResults[0]!.map(h => h[0])) }) + + it('rejects a truncated vectors.bin during load', async () => { + const { model } = await loadModel() + const vectors = embedChunks(model, [chunk('alpha'), chunk('beta')]) + const original = new SelectableBasicBackend(vectors) + await original.save(dir) + + // Truncate vectors.bin to half its expected size. + const truncated = new Float32Array(original.dim) // one row instead of two + await writeFile( + join(dir, 'vectors.bin'), + Buffer.from(truncated.buffer, truncated.byteOffset, truncated.byteLength), + ) + + await expect(SelectableBasicBackend.load(dir)).rejects.toThrow( + /Vector file size mismatch/, + ) + }) }) diff --git a/src/indexing/dense.ts b/src/indexing/dense.ts index 6feb04e..45a9fe7 100644 --- a/src/indexing/dense.ts +++ b/src/indexing/dense.ts @@ -174,13 +174,18 @@ export class SelectableBasicBackend { constructor(vectors: Float32Array[], options: BasicArgs = {}) { this.arguments = { metric: 'cosine', ...options } + this.dim = vectors[0]?.length ?? 0 // Defensive copy + normalise so cosine distance reduces to (1 - dot). 
this.vectors = vectors.map((v) => { + if (v.length !== this.dim) { + throw new Error( + `Inconsistent vector dimensions: expected ${this.dim}, got ${v.length}`, + ) + } const copy = new Float32Array(v) normalizeInPlace(copy) return copy }) - this.dim = this.vectors[0]?.length ?? 0 } /** @@ -203,7 +208,19 @@ export class SelectableBasicBackend { const numVectors = this.vectors.length let effectiveK = Math.min(k, numVectors) - if (selector !== undefined) effectiveK = Math.min(effectiveK, selector.length) + if (selector !== undefined) { + // Bounds-check selector indices up front so we fail fast with a + // descriptive error instead of crashing during the dot-product loop. + for (let i = 0; i < selector.length; i++) { + const idx = selector[i]! + if (idx >= numVectors) { + throw new Error( + `Selector index out of bounds: ${idx} (total vectors: ${numVectors})`, + ) + } + } + effectiveK = Math.min(effectiveK, selector.length) + } const out: Array> = [] if (effectiveK === 0) { @@ -212,6 +229,11 @@ export class SelectableBasicBackend { } for (const raw of queryVectors) { + if (raw.length !== this.dim) { + throw new Error( + `Query vector dimension mismatch: expected ${this.dim}, got ${raw.length}`, + ) + } const q = new Float32Array(raw) normalizeInPlace(q) @@ -265,6 +287,12 @@ export class SelectableBasicBackend { const metaRaw = await readFile(join(dir, 'args.json'), 'utf8') const meta = JSON.parse(metaRaw) as { rows: number, dim: number, arguments: BasicArgs } const bytes = await readFile(join(dir, 'vectors.bin')) + const expectedBytes = meta.rows * meta.dim * 4 + if (bytes.byteLength !== expectedBytes) { + throw new Error( + `Vector file size mismatch: expected ${expectedBytes} bytes, got ${bytes.byteLength}`, + ) + } // Copy into a fresh ArrayBuffer so alignment is guaranteed. const ab = new ArrayBuffer(bytes.byteLength) new Uint8Array(ab).set(bytes)