Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions src/indexing/create.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
// Tests for src/indexing/create.ts

import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { afterEach, beforeEach, describe, expect, it } from 'bun:test'
import { ContentType } from '../types.ts'
import { createIndexFromPath } from './create.ts'
import { makeStubModel } from './dense.ts'

describe('createIndexFromPath', () => {
  // Scratch directory: created fresh before every test and removed after it.
  let dir: string

  // Writes `text` to a file called `name` directly inside the scratch directory.
  const writeInDir = (name: string, text: string): void => {
    writeFileSync(join(dir, name), text)
  }

  // Collects the stored file path of every chunk, in chunk order.
  const chunkPaths = (chunks: ReadonlyArray<{ filePath: string }>): string[] =>
    chunks.map(chunk => chunk.filePath)

  beforeEach(() => {
    dir = mkdtempSync(join(tmpdir(), 'csp-create-'))
  })
  afterEach(() => {
    rmSync(dir, { recursive: true, force: true })
  })

  it('builds chunks/bm25/semantic indexes for a small TS file', async () => {
    writeInDir(
      'sample.ts',
      'export function greet(name: string) {\n return `hi ${name}`\n}\n',
    )
    const built = await createIndexFromPath(dir, {
      model: makeStubModel('test-model', 4),
      displayRoot: dir,
    })
    const chunkCount = built.chunks.length
    expect(chunkCount).toBeGreaterThan(0)
    // Because displayRoot is the scratch directory, the stored path is relative to it.
    expect(built.chunks[0]!.filePath).toBe('sample.ts')
    // Both indexes must hold exactly one entry per chunk.
    expect(built.semanticIndex.vectors.length).toBe(chunkCount)
    expect(built.bm25Index.documents.length).toBe(chunkCount)
  })

  it('throws when no supported files are found', async () => {
    // The directory only holds a file with an unsupported binary extension.
    writeInDir('data.bin', 'binary')
    const attempt = createIndexFromPath(dir, { model: makeStubModel() })
    await expect(attempt).rejects.toThrow(/No supported files found/)
  })

  it('respects an explicit extensions override', async () => {
    writeInDir('a.txt', 'hello world')
    const { chunks } = await createIndexFromPath(dir, {
      model: makeStubModel(),
      extensions: ['.txt'],
      content: ContentType.Docs,
      displayRoot: dir,
    })
    expect(chunks.length).toBe(1)
    expect(chunks[0]!.filePath).toBe('a.txt')
  })

  it('skips files larger than MAX_FILE_BYTES', async () => {
    // 2 MB of filler is above the size cap, so big.ts must not be indexed.
    writeInDir('big.ts', 'a'.repeat(2_000_000))
    writeInDir('small.ts', 'export const x = 1\n')
    const { chunks } = await createIndexFromPath(dir, {
      model: makeStubModel(),
      displayRoot: dir,
    })
    const seen = chunkPaths(chunks)
    expect(seen).toContain('small.ts')
    expect(seen).not.toContain('big.ts')
  })

  it('descends into subdirectories', async () => {
    const nestedDir = join(dir, 'sub')
    mkdirSync(nestedDir)
    writeFileSync(join(nestedDir, 'nested.ts'), 'const a = 1\n')
    const { chunks } = await createIndexFromPath(dir, {
      model: makeStubModel(),
      displayRoot: dir,
    })
    // Only the file name is checked, so the assertion is separator-agnostic.
    const foundNested = chunkPaths(chunks).some(p => p.endsWith('nested.ts'))
    expect(foundNested).toBe(true)
  })
})
91 changes: 91 additions & 0 deletions src/indexing/create.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Port of src/semble/index/create.py

import { readFileSync, statSync } from 'node:fs'
import { relative } from 'node:path'
import type { Chunk } from '../types.ts'
import { ContentType } from '../types.ts'
import { chunkSource } from '../chunking/chunk-source.ts'
import { tokenize } from '../tokens.ts'
import type { Model } from './dense.ts'
import { SelectableBasicBackend, embedChunks } from './dense.ts'
import { walkFiles } from './file-walker.ts'
import { detectLanguage, getExtensions } from './files.ts'
import { Bm25Index, enrichForBm25 } from './sparse.ts'

/**
 * Maximum on-disk size, in bytes, of a file that will be read and indexed
 * (1 MB, counted as 1,000,000 bytes).
 *
 * `createIndexFromPath` skips a file only when its `statSync` size is strictly
 * greater than this value, so a file of exactly this size is still indexed.
 */
export const MAX_FILE_BYTES = 1_000_000

/** Options accepted by `createIndexFromPath`. */
export interface CreateIndexOptions {
/** Embedding model: passed to `embedChunks`, and its `dim` sizes the semantic index. */
model: Model
/**
 * Optional file-extension list forwarded to `getExtensions` together with the
 * normalized content types. Presumably overrides the per-content defaults —
 * confirm against `getExtensions`.
 */
extensions?: readonly string[]
/** One content type or a list of them; when omitted, `[ContentType.Code]` is used. */
content?: ContentType | readonly ContentType[]
/**
 * When set to a non-empty string, each chunk's path is stored relative to this
 * directory (via `path.relative`); otherwise the path yielded by `walkFiles`
 * is stored unchanged.
 */
displayRoot?: string
}

/** Everything `createIndexFromPath` produces for one directory. */
export interface CreateIndexResult {
/** Sparse index fed with the tokenized, BM25-enriched text of every chunk, in chunk order. */
bm25Index: Bm25Index
/** Dense index built from the chunk embeddings and the model's `dim`. */
semanticIndex: SelectableBasicBackend
/** All chunks collected from the walked files; both indexes are built from this list. */
chunks: Chunk[]
}

/**
 * Index every matching file beneath `path`.
 *
 * The `content` option is normalized and combined with `extensions` through
 * `getExtensions`; `walkFiles` then yields the candidate files. Every file
 * that can be stat'ed, is no larger than `MAX_FILE_BYTES` and can be read as
 * UTF-8 is split into chunks by `chunkSource`. The collected chunks are
 * embedded with `model`, and their BM25-enriched text is tokenized into the
 * sparse index.
 *
 * @param path - Resolved directory handed to `walkFiles`.
 * @param options - Model plus optional extension/content filters and display root.
 * @returns The BM25 index, the dense index and the chunks they were built from.
 * @throws Error if no chunks are produced.
 */
export async function createIndexFromPath(
  path: string,
  options: CreateIndexOptions,
): Promise<CreateIndexResult> {
  const { model, extensions, content, displayRoot } = options
  const wantedExtensions = getExtensions(normalizeContent(content), extensions)

  const chunks: Chunk[] = []
  for (const filePath of walkFiles(path, wantedExtensions)) {
    const language = detectLanguage(filePath)
    const source = readSourceWithinLimit(filePath)
    // `undefined` means "skip this file"; an empty string is still chunked.
    if (source === undefined) continue

    // Store the path relative to displayRoot when one was supplied.
    const shownPath = displayRoot ? relative(displayRoot, filePath) : filePath
    for (const chunk of chunkSource(source, shownPath, language)) {
      chunks.push(chunk)
    }
  }

  if (chunks.length === 0) {
    throw new Error(`No supported files found under ${path}.`)
  }

  const embeddings = embedChunks(model, chunks)
  const bm25Index = new Bm25Index()
  const bm25Documents = chunks.map(chunk => tokenize(enrichForBm25(chunk)))
  bm25Index.index(bm25Documents)

  return {
    bm25Index,
    semanticIndex: new SelectableBasicBackend(embeddings, model.dim),
    chunks,
  }
}

/**
 * Read `filePath` as UTF-8, or return `undefined` when the file should be
 * skipped: it cannot be stat'ed, it is larger than `MAX_FILE_BYTES`, or it
 * cannot be read. Failures are swallowed on purpose so one bad file does not
 * abort the whole walk.
 */
function readSourceWithinLimit(filePath: string): string | undefined {
  try {
    if (statSync(filePath).size > MAX_FILE_BYTES) return undefined
  }
  catch {
    return undefined
  }
  try {
    return readFileSync(filePath, 'utf8')
  }
  catch {
    return undefined
  }
}

/**
 * Coerce the `content` option into a list of content types.
 *
 * - `undefined` becomes `[ContentType.Code]`.
 * - An array is returned as-is (same reference, not copied).
 * - A single value is wrapped in a one-element array.
 */
function normalizeContent(
  content: ContentType | readonly ContentType[] | undefined,
): readonly ContentType[] {
  // Default: code-only. Mirrors _DEFAULT_CONTENT in semble.
  if (content === undefined) return [ContentType.Code]
  // Array.isArray does not narrow the readonly-array union member away in the
  // false branch, hence the assertion on the single-value path.
  return Array.isArray(content) ? content : [content as ContentType]
}
202 changes: 202 additions & 0 deletions src/indexing/index.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
// Tests for src/indexing/index.ts (CspIndex)

import { mkdtempSync, rmSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { afterEach, beforeEach, describe, expect, it } from 'bun:test'
import type { Chunk } from '../types.ts'
import { ContentType } from '../types.ts'
import { CspIndex, DEFAULT_CONTENT } from './index.ts'
import { SelectableBasicBackend, makeStubModel } from './dense.ts'
import { Bm25Index } from './sparse.ts'

/**
 * Build a `Chunk` fixture.
 *
 * @param filePath - Path recorded on the chunk.
 * @param startLine - First line covered by the chunk.
 * @param endLine - Last line covered by the chunk.
 * @param language - Language tag; defaults to `'typescript'`, `null` is kept as-is.
 * @param content - Chunk text; when `undefined`, a placeholder comment naming
 *   the path and line range is generated (an empty string is kept).
 */
function makeChunk(
  filePath: string,
  startLine: number,
  endLine: number,
  language: string | null = 'typescript',
  content?: string,
): Chunk {
  const text = content ?? `// chunk for ${filePath}:${startLine}-${endLine}`
  return { content: text, filePath, startLine, endLine, language }
}

/**
 * Assemble a `CspIndex` around `chunks` without touching the filesystem.
 *
 * Uses a 4-dimensional stub model. Chunk `i` gets a vector that is zero
 * everywhere except component 0, which is `i + 1`, and every chunk's BM25
 * document is the single token `'x'`.
 */
function buildIndex(chunks: Chunk[]): CspIndex {
  const dim = 4
  const model = makeStubModel('test-model', dim)

  const vectors: Float32Array[] = []
  for (let position = 0; position < chunks.length; position++) {
    const vector = new Float32Array(dim)
    vector[0] = position + 1
    vectors.push(vector)
  }

  const bm25Index = new Bm25Index(chunks.map(() => ['x']))
  const semanticIndex = new SelectableBasicBackend(vectors, dim)

  return new CspIndex({
    model,
    bm25Index,
    semanticIndex,
    chunks,
    modelPath: 'test-model',
    root: null,
    content: DEFAULT_CONTENT,
  })
}

describe('CspIndex.stats', () => {
  it('returns zeros for an empty index', () => {
    const emptyStats = buildIndex([]).stats
    expect(emptyStats).toEqual({ indexedFiles: 0, totalChunks: 0, languages: {} })
  })

  it('reflects chunk count, file count, and language distribution', () => {
    // Four chunks over three distinct files; the chunk with a null language
    // is expected to be absent from the language tally.
    const idx = buildIndex([
      makeChunk('a.ts', 1, 10, 'typescript'),
      makeChunk('a.ts', 11, 20, 'typescript'),
      makeChunk('b.py', 1, 5, 'python'),
      makeChunk('c.bin', 1, 1, null),
    ])
    const expected = {
      indexedFiles: 3,
      totalChunks: 4,
      languages: { typescript: 2, python: 1 },
    }
    expect(idx.stats).toEqual(expected)
  })
})

describe('CspIndex.search', () => {
  // Smallest non-empty index: a single one-line chunk.
  const singleChunkIndex = (): CspIndex => buildIndex([makeChunk('a.ts', 1, 1)])

  it('returns [] on an empty query', () => {
    const idx = singleChunkIndex()
    for (const blankQuery of ['', ' ']) {
      expect(idx.search(blankQuery)).toEqual([])
    }
  })

  it('returns [] when the index has no chunks', () => {
    expect(buildIndex([]).search('anything')).toEqual([])
  })

  it('returns [] when topK <= 0', () => {
    const idx = singleChunkIndex()
    for (const topK of [0, -1]) {
      expect(idx.search('anything', { topK })).toEqual([])
    }
  })

  it('returns [] when filters are set but match nothing (no fallback to unfiltered)', () => {
    // Regression: an empty selector used to be treated as "no filter", which
    // fell back to an unfiltered search and silently ignored user intent.
    const idx = buildIndex([
      makeChunk('a.ts', 1, 10, 'typescript', 'alpha'),
      makeChunk('b.py', 1, 10, 'python', 'beta'),
    ])
    const byLanguage = idx.search('anything', { filterLanguages: ['nonexistent'] })
    expect(byLanguage).toEqual([])
    const byPath = idx.search('anything', { filterPaths: ['nope.ts'] })
    expect(byPath).toEqual([])
  })
})

describe('CspIndex.findRelated', () => {
  it('excludes the source chunk from results', () => {
    const seed = makeChunk('a.ts', 1, 10, 'typescript', 'seed chunk')
    const idx = buildIndex([
      seed,
      makeChunk('a.ts', 11, 20, 'typescript', 'companion 1'),
      makeChunk('b.ts', 1, 5, 'typescript', 'companion 2'),
    ])
    const related = idx.findRelated(seed, { topK: 5 })
    // The seed is matched by object identity and must not be returned.
    const seedHit = related.find(hit => hit.chunk === seed)
    expect(seedHit).toBeUndefined()
    expect(related.length).toBeLessThanOrEqual(5)
  })

  it('accepts a SearchResult as the seed', () => {
    const seed = makeChunk('a.ts', 1, 10, 'typescript', 'seed')
    const idx = buildIndex([seed, makeChunk('b.ts', 1, 5, 'typescript', 'other')])
    // Here the seed is wrapped in a { chunk, score } search-result shape.
    const related = idx.findRelated({ chunk: seed, score: 0.5 })
    expect(related.find(hit => hit.chunk === seed)).toBeUndefined()
  })
})

describe('CspIndex save → loadFromDisk roundtrip', () => {
  // Scratch directory: created fresh before every test and removed after it.
  let dir: string

  beforeEach(() => {
    dir = mkdtempSync(join(tmpdir(), 'csp-roundtrip-'))
  })
  afterEach(() => {
    rmSync(dir, { recursive: true, force: true })
  })

  it('persists chunks, indexes, and metadata', async () => {
    const original = buildIndex([
      makeChunk('a.ts', 1, 10, 'typescript', 'A'),
      makeChunk('b.ts', 1, 5, 'python', 'B'),
    ])
    await original.save(dir)

    const restored = await CspIndex.loadFromDisk(dir)
    const restoredPaths = restored.chunks.map(chunk => chunk.filePath)
    expect(restored.chunks.length).toBe(2)
    expect(restoredPaths).toEqual(['a.ts', 'b.ts'])
    expect(restored.stats.totalChunks).toBe(2)
    expect(restored.stats.languages).toEqual({ typescript: 1, python: 1 })
  })

  it('loadFromDisk throws on a missing directory', async () => {
    const missingDir = join(dir, 'nope')
    await expect(CspIndex.loadFromDisk(missingDir)).rejects.toThrow(/Index not found/)
  })

  it('loadFromDisk throws when a persisted artifact is missing', async () => {
    // The directory exists but nothing has been saved into it.
    const attempt = CspIndex.loadFromDisk(dir)
    await expect(attempt).rejects.toThrow(/Missing:/)
  })
})

describe('CspIndex.fromPath', () => {
  // Scratch directory: created fresh before every test and removed after it.
  let dir: string

  beforeEach(() => {
    dir = mkdtempSync(join(tmpdir(), 'csp-from-path-'))
  })
  afterEach(() => {
    rmSync(dir, { recursive: true, force: true })
  })

  it('throws when the path does not exist', async () => {
    const missing = join(dir, 'nope')
    await expect(CspIndex.fromPath(missing)).rejects.toThrow(/Path does not exist/)
  })

  it('throws when the path exists but is a file', async () => {
    const plainFile = join(dir, 'a.ts')
    writeFileSync(plainFile, '// hello\n')
    const attempt = CspIndex.fromPath(plainFile)
    await expect(attempt).rejects.toThrow(/not a directory/)
  })

  it('builds a CspIndex from a real directory with a small TS file', async () => {
    const samplePath = join(dir, 'sample.ts')
    writeFileSync(
      samplePath,
      'export function greet(name: string) {\n return `hi ${name}`\n}\n',
    )
    const idx = await CspIndex.fromPath(dir, { content: ContentType.Code })
    expect(idx.stats.totalChunks).toBeGreaterThan(0)
    expect(idx.stats.indexedFiles).toBe(1)
    // The stored path is expected to be relative to the indexed directory.
    expect(idx.chunks[0]!.filePath).toBe('sample.ts')
  })
})
Loading