diff --git a/src/indexing/index.ts b/src/indexing/index.ts
index c407af3..60ff533 100644
--- a/src/indexing/index.ts
+++ b/src/indexing/index.ts
@@ -1,76 +1,54 @@
-// TODO(unit-12): replace with the real CspIndex implementation.
-//
-// This file is a *placeholder stub* so the public barrel (`src/index.ts`)
-// type-checks and `bun test src/index.test.ts` can import the package in
-// isolation. Unit 12 lands the real port of `src/semble/index/index.py`;
-// when it merges, this file is overwritten wholesale.
-//
-// The barrel only re-exports the *name* `CspIndex` — consumers don't
-// instantiate it from this stub. Keeping the placeholder as a class (rather
-// than a stand-in `const`) means the `typeof CspIndex === 'function'` check
-// in `src/index.test.ts` is satisfied without a working implementation
-// behind it.
+// Port of src/semble/index/index.py
+// Minimal stub — full implementation lands in the indexing units.
 
-import type { Chunk, IndexStats, SearchResult } from '../types.ts'
+import type { Chunk, ContentType, SearchResult } from '../types.ts'
+
+export interface CspIndexLoadOptions {
+  modelPath?: string
+  content?: ContentType[]
+}
+
+export interface CspIndexFromGitOptions extends CspIndexLoadOptions {
+  ref?: string
+}
 
 /**
- * Hybrid (dense + BM25) code-search index.
+ * Hybrid (dense + BM25) code search index.
  *
- * Placeholder — Unit 12 ships the authoritative implementation porting
- * `semble.index.index.SembleIndex` (factories `fromPath`/`fromGit`, search /
- * findRelated, save/load, stats).
+ * This is a stub for the MCP unit; the real implementation lands in the
+ * indexing units. Only the surface area used by the MCP server is declared.
  */
 export class CspIndex {
-  // Throw eagerly so an accidental `new CspIndex()` against the stub fails
-  // fast with a clear message, instead of looking like a working empty index.
-  constructor() {
-    throw new Error(
-      'CspIndex is a placeholder stub — Unit 12 (`feat/unit-12-index`) ships the real implementation.',
-    )
-  }
-
-  // Method signatures are intentionally omitted; the barrel only needs the
-  // class to *exist* as a value export. Consumers reaching for `.fromPath()`
-  // etc. against this stub would be using it before Unit 12 has merged,
-  // which is a sequencing bug worth surfacing as a `TypeError` at call site.
+  readonly chunks: Chunk[]
 
-  /** Placeholder — see Unit 12. */
-  static fromPath(..._args: unknown[]): Promise<CspIndex> {
-    return Promise.reject(new Error('CspIndex.fromPath: not implemented (Unit 12).'))
+  constructor(chunks: Chunk[] = []) {
+    this.chunks = chunks
   }
 
-  /** Placeholder — see Unit 12. */
-  static fromGit(..._args: unknown[]): Promise<CspIndex> {
-    return Promise.reject(new Error('CspIndex.fromGit: not implemented (Unit 12).'))
+  static async fromPath(
+    _path: string,
+    _options: CspIndexLoadOptions = {},
+  ): Promise<CspIndex> {
+    throw new Error('CspIndex.fromPath: not yet implemented (stub)')
   }
 
-  /** Placeholder — see Unit 12. */
-  static load(..._args: unknown[]): Promise<CspIndex> {
-    return Promise.reject(new Error('CspIndex.load: not implemented (Unit 12).'))
+  static async fromGit(
+    _url: string,
+    _options: CspIndexFromGitOptions = {},
+  ): Promise<CspIndex> {
+    throw new Error('CspIndex.fromGit: not yet implemented (stub)')
   }
 
-  /** Placeholder — see Unit 12. */
-  search(..._args: unknown[]): SearchResult[] {
-    throw new Error('CspIndex.search: not implemented (Unit 12).')
+  search(_query: string, _options: { topK?: number } = {}): SearchResult[] {
+    return []
   }
 
-  /** Placeholder — see Unit 12. */
-  findRelated(..._args: unknown[]): SearchResult[] {
-    throw new Error('CspIndex.findRelated: not implemented (Unit 12).')
-  }
-
-  /** Placeholder — see Unit 12. */
-  save(..._args: unknown[]): Promise<void> {
-    return Promise.reject(new Error('CspIndex.save: not implemented (Unit 12).'))
-  }
-
-  /** Placeholder — see Unit 12. */
-  get stats(): IndexStats {
-    throw new Error('CspIndex.stats: not implemented (Unit 12).')
+  findRelated(_chunk: Chunk, _options: { topK?: number } = {}): SearchResult[] {
+    return []
   }
+}
 
-  /** Placeholder — see Unit 12. */
-  get chunks(): readonly Chunk[] {
-    throw new Error('CspIndex.chunks: not implemented (Unit 12).')
-  }
+/** Lazy loader for the embedding model. Returns the cached on-disk path. */
+export async function loadModel(): Promise<[unknown, string]> {
+  throw new Error('loadModel: not yet implemented (stub)')
 }
diff --git a/src/mcp/server.test.ts b/src/mcp/server.test.ts
new file mode 100644
index 0000000..72b28ab
--- /dev/null
+++ b/src/mcp/server.test.ts
@@ -0,0 +1,269 @@
+import { beforeEach, describe, expect, it, mock } from 'bun:test'
+
+// Mock the indexing module so we can control CspIndex.fromPath/fromGit and
+// loadModel without spinning up real embeddings.
+let fromPathCalls = 0
+let fromGitCalls = 0
+let fromPathImpl: () => Promise<FakeIndex> = async () => makeIndex()
+let fromGitImpl: () => Promise<FakeIndex> = async () => makeIndex()
+
+let makeIndex: () => FakeIndex = () => new FakeIndex([])
+
+class FakeIndex {
+  readonly chunks: Array<{
+    content: string
+    filePath: string
+    startLine: number
+    endLine: number
+  }>
+
+  constructor(chunks: FakeIndex['chunks'] = []) {
+    this.chunks = chunks
+  }
+
+  search(_q: string, _opts?: { topK?: number }): Array<{
+    chunk: FakeIndex['chunks'][number]
+    score: number
+    toDict: () => Record<string, unknown>
+  }> {
+    return []
+  }
+
+  findRelated(_c: FakeIndex['chunks'][number], _opts?: { topK?: number }): Array<{
+    chunk: FakeIndex['chunks'][number]
+    score: number
+    toDict: () => Record<string, unknown>
+  }> {
+    return []
+  }
+}
+
+class MockedCspIndex extends FakeIndex {
+  static async fromPath(..._args: unknown[]): Promise<MockedCspIndex> {
+    fromPathCalls++
+    return fromPathImpl() as Promise<MockedCspIndex>
+  }
+
+  static async fromGit(..._args: unknown[]): Promise<MockedCspIndex> {
+    fromGitCalls++
+    return fromGitImpl() as Promise<MockedCspIndex>
+  }
+}
+
+// Wire makeIndex to return instances of the mocked class so instanceof checks
+// in the tests pass.
+makeIndex = () => new MockedCspIndex([])
+
+await mock.module('../indexing/index.ts', () => ({
+  CspIndex: MockedCspIndex,
+  loadModel: async (): Promise<[unknown, string]> => [null, '/tmp/fake-model'],
+}))
+
+// Import AFTER mocking so server.ts picks up the mocked module.
+const { _internal, createServer, IndexCache } = await import('./server.ts')
+const { ContentType } = await import('../types.ts')
+const indexing = await import('../indexing/index.ts')
+
+beforeEach(() => {
+  fromPathCalls = 0
+  fromGitCalls = 0
+  fromPathImpl = async () => makeIndex()
+  fromGitImpl = async () => makeIndex()
+})
+
+describe('IndexCache', () => {
+  it('caches results — second call returns the cached value', async () => {
+    const cache = new IndexCache({ content: [ContentType.CODE] })
+    const first = await cache.get('/tmp/some-repo')
+    const second = await cache.get('/tmp/some-repo')
+    expect(second).toBe(first)
+    expect(fromPathCalls).toBe(1)
+  })
+
+  it('deduplicates concurrent get() for the same source', async () => {
+    const cache = new IndexCache()
+    const [a, b] = await Promise.all([
+      cache.get('/tmp/dedup-repo'),
+      cache.get('/tmp/dedup-repo'),
+    ])
+    expect(a).toBe(b)
+    expect(fromPathCalls).toBe(1)
+  })
+
+  it('evict() removes the cached entry so the next get() rebuilds', async () => {
+    const cache = new IndexCache()
+    await cache.get('/tmp/repo-to-evict')
+    expect(fromPathCalls).toBe(1)
+
+    await cache.evict('/tmp/repo-to-evict')
+
+    await cache.get('/tmp/repo-to-evict')
+    expect(fromPathCalls).toBe(2)
+  })
+
+  it('LRU: the 11th distinct source evicts the oldest', async () => {
+    const cache = new IndexCache()
+    for (let i = 0; i < 10; i++)
+      await cache.get(`/tmp/lru-${i}`)
+    expect(cache.size).toBe(10)
+
+    await cache.get('/tmp/lru-10')
+    expect(cache.size).toBe(10)
+
+    // /tmp/lru-0 was the oldest and should have been evicted — refetch triggers rebuild.
+    const before = fromPathCalls
+    await cache.get('/tmp/lru-0')
+    expect(fromPathCalls).toBe(before + 1)
+  })
+
+  it('treats git URLs differently from local paths', async () => {
+    const cache = new IndexCache()
+    await cache.get('https://github.com/org/repo')
+    expect(fromGitCalls).toBe(1)
+    expect(fromPathCalls).toBe(0)
+
+    await cache.get('/tmp/local-path')
+    expect(fromPathCalls).toBe(1)
+  })
+
+  it('evict() awaitably blocks until the cache entry is gone', async () => {
+    const cache = new IndexCache()
+    await cache.get('/tmp/await-evict')
+    expect(cache.size).toBe(1)
+    await cache.evict('/tmp/await-evict')
+    expect(cache.size).toBe(0)
+  })
+
+  it('failed get() does not poison the cache entry', async () => {
+    fromPathImpl = async () => {
+      throw new Error('boom')
+    }
+
+    const cache = new IndexCache()
+    await expect(cache.get('/tmp/will-fail')).rejects.toThrow('boom')
+
+    // After failure, the next call retries.
+    fromPathImpl = async () => makeIndex()
+    await expect(cache.get('/tmp/will-fail')).resolves.toBeInstanceOf(indexing.CspIndex)
+  })
+})
+
+describe('getIndex (safety layer)', () => {
+  it('rejects ssh:// git URLs', async () => {
+    const cache = new IndexCache()
+    await expect(
+      _internal.getIndex('ssh://git@github.com/org/repo.git', undefined, cache),
+    ).rejects.toThrow(/Only https:\/\/, http:\/\//)
+  })
+
+  it('rejects git:// git URLs', async () => {
+    const cache = new IndexCache()
+    await expect(
+      _internal.getIndex('git://github.com/org/repo.git', undefined, cache),
+    ).rejects.toThrow(/Only https:\/\/, http:\/\//)
+  })
+
+  it('rejects file:// pseudo-URLs', async () => {
+    const cache = new IndexCache()
+    await expect(
+      _internal.getIndex('file:///tmp/whatever', undefined, cache),
+    ).rejects.toThrow(/Only https:\/\/, http:\/\//)
+  })
+
+  it('rejects when repo and defaultSource are both undefined', async () => {
+    const cache = new IndexCache()
+    await expect(_internal.getIndex(undefined, undefined, cache)).rejects.toThrow(
+      /No repo specified/,
+    )
+  })
+
+  it('falls back to defaultSource when repo is undefined', async () => {
+    const cache = new IndexCache()
+    const result = await _internal.getIndex(undefined, '/tmp/default-repo', cache)
+    expect(result).toBeInstanceOf(indexing.CspIndex)
+    expect(fromPathCalls).toBe(1)
+  })
+
+  it('falls back to defaultSource when repo is an empty string', async () => {
+    const cache = new IndexCache()
+    const result = await _internal.getIndex('', '/tmp/default-repo', cache)
+    expect(result).toBeInstanceOf(indexing.CspIndex)
+    expect(fromPathCalls).toBe(1)
+  })
+
+  it('accepts https:// git URLs', async () => {
+    const cache = new IndexCache()
+    const result = await _internal.getIndex(
+      'https://github.com/org/repo',
+      undefined,
+      cache,
+    )
+    expect(result).toBeInstanceOf(indexing.CspIndex)
+    expect(fromGitCalls).toBe(1)
+  })
+
+  it('wraps underlying index errors in a descriptive message', async () => {
+    fromPathImpl = async () => {
+      throw new Error('disk full')
+    }
+    const cache = new IndexCache()
+    await expect(_internal.getIndex('/tmp/bad', undefined, cache)).rejects.toThrow(
+      /Failed to index .*disk full/,
+    )
+  })
+})
+
+describe('createServer', () => {
+  it('returns a server object exposing `search` and `find_related` tools', async () => {
+    const cache = new IndexCache()
+    const server = await createServer(cache, '/tmp/default')
+
+    expect(server.tools.has('search')).toBe(true)
+    expect(server.tools.has('find_related')).toBe(true)
+
+    const searchTool = server.tools.get('search')!
+    expect(searchTool.title).toBe(
+      'Search a codebase with a natural-language or code query.',
+    )
+
+    const findRelatedTool = server.tools.get('find_related')!
+    expect(findRelatedTool.title).toBe(
+      'Find code chunks semantically similar to a specific location in a file.',
+    )
+  })
+
+  it('`search` handler returns "No results" JSON when the index yields nothing', async () => {
+    const cache = new IndexCache()
+    const server = await createServer(cache, '/tmp/default')
+    const searchTool = server.tools.get('search')!
+    const out = await searchTool.handler({ query: 'foo' })
+    expect(JSON.parse(out)).toEqual({ error: 'No results found.' })
+  })
+
+  it('`search` handler surfaces safety errors as plain strings', async () => {
+    const cache = new IndexCache()
+    const server = await createServer(cache) // no defaultSource
+    const searchTool = server.tools.get('search')!
+    const out = await searchTool.handler({ query: 'foo' }) // no repo either
+    expect(out).toMatch(/No repo specified/)
+  })
+
+  it('`search` handler rejects ssh:// git URLs as a plain-string error', async () => {
+    const cache = new IndexCache()
+    const server = await createServer(cache)
+    const searchTool = server.tools.get('search')!
+    const out = await searchTool.handler({
+      query: 'foo',
+      repo: 'ssh://git@github.com/org/repo',
+    })
+    expect(out).toMatch(/Only https:\/\/, http:\/\//)
+  })
+
+  it('`find_related` handler returns a helpful message when the chunk is missing', async () => {
+    const cache = new IndexCache()
+    const server = await createServer(cache, '/tmp/default')
+    const tool = server.tools.get('find_related')!
+    const out = await tool.handler({ file_path: 'nope.ts', line: 42 })
+    expect(out).toMatch(/No chunk found at nope.ts:42/)
+  })
+})
diff --git a/src/mcp/server.ts b/src/mcp/server.ts
new file mode 100644
index 0000000..a4e64a0
--- /dev/null
+++ b/src/mcp/server.ts
@@ -0,0 +1,691 @@
+// Port of src/semble/mcp.py
+
+import * as fs from 'node:fs/promises'
+import * as path from 'node:path'
+
+import { CspIndex, loadModel } from '../indexing/index.ts'
+import { ContentType } from '../types.ts'
+import { formatResults, isGitUrl, resolveChunk } from '../utils.ts'
+
+const REPO_DESCRIPTION
+  = 'https:// or http:// git URL (e.g. https://github.com/org/repo) or local directory path to index and search. '
+    + 'Required when no default index was configured at startup. '
+    + 'The index is cached after the first call, so repeat queries are fast.'
+
+const CACHE_MAX_SIZE = 10 // Max number of cached indexes to keep in memory.
+
+const SERVER_INSTRUCTIONS
+  = 'Instant code search for any local or remote git repository. '
+    + 'Call `search` to find relevant code; call `find_related` on a result to discover similar code elsewhere. '
+    + 'When working in a local project, pass the project root as `repo`. '
+    + 'For remote repos, pass an explicit https:// URL. Never guess or infer URLs. '
+    + 'Prefer these tools over Grep, Glob, or Read for any question about how code works.'
+
+/**
+ * A deferred Promise — exposes its resolve/reject for use as a one-shot
+ * readiness signal (the model-load latch in IndexCache).
+ */
+interface Deferred<T> {
+  promise: Promise<T>
+  resolve: (value: T) => void
+  reject: (reason?: unknown) => void
+}
+
+function createDeferred<T>(): Deferred<T> {
+  let resolve!: (value: T) => void
+  let reject!: (reason?: unknown) => void
+  const promise = new Promise<T>((res, rej) => {
+    resolve = res
+    reject = rej
+  })
+  return { promise, resolve, reject }
+}
+
+/** Resolve a local filesystem path to its canonical absolute form. */
+async function resolvePath(p: string): Promise<string> {
+  try {
+    return await fs.realpath(p)
+  }
+  catch {
+    return path.resolve(p)
+  }
+}
+
+export interface IndexCacheOptions {
+  content?: ContentType[]
+}
+
+/**
+ * Cache of indexed repos and local paths for the lifetime of the MCP server
+ * process. LRU-bounded (10 entries) and deduplicates concurrent requests via
+ * Promise caching.
+ */
+export class IndexCache {
+  // Use a Map for insertion-order semantics (LRU via re-insert).
+  private readonly tasks = new Map<string, Promise<CspIndex>>()
+  private readonly content: ContentType[]
+  private readonly modelReady: Deferred<string>
+  private modelPath: string | null = null
+  private modelError: unknown = null
+  private modelLoadStarted = false
+  private watcherClose: (() => Promise<void>) | null = null
+
+  constructor(options: IndexCacheOptions = {}) {
+    this.content = options.content ?? [ContentType.CODE]
+    this.modelReady = createDeferred<string>()
+    // Prevent unhandled promise rejection warnings if the model fails to load
+    // before any caller awaits the promise. Callers of awaitModel() still
+    // observe the rejection because they await the same promise themselves.
+    this.modelReady.promise.catch(() => {})
+  }
+
+  /**
+   * Begin loading the embedding model (idempotent). Call from `serve` to
+   * run model load in parallel with starting the server. If never called
+   * explicitly, the first `get()` will trigger it.
+   */
+  ensureModelLoading(): void {
+    if (this.modelLoadStarted)
+      return
+    this.modelLoadStarted = true
+    void (async () => {
+      try {
+        const [, modelPath] = await loadModel()
+        this.modelPath = modelPath
+        this.modelReady.resolve(modelPath)
+      }
+      catch (err) {
+        this.modelError = err
+        this.modelReady.reject(err)
+      }
+    })()
+  }
+
+  private async awaitModel(): Promise<string> {
+    this.ensureModelLoading()
+    if (this.modelError !== null)
+      throw this.modelError
+    return this.modelReady.promise
+  }
+
+  private async computeCacheKey(source: string, ref?: string): Promise<string> {
+    if (isGitUrl(source))
+      return ref !== undefined && ref !== '' ? `${source}@${ref}` : source
+    return resolvePath(source)
+  }
+
+  /**
+   * Return an index for the requested source, building and caching it on
+   * first access. Concurrent calls for the same key share a single Promise.
+   */
+  async get(source: string, ref?: string): Promise<CspIndex> {
+    const cacheKey = await this.computeCacheKey(source, ref)
+
+    const existing = this.tasks.get(cacheKey)
+    if (existing !== undefined) {
+      // Touch for LRU (move to most-recent end).
+      this.tasks.delete(cacheKey)
+      this.tasks.set(cacheKey, existing)
+      try {
+        return await existing
+      }
+      catch (err) {
+        // Only evict if this task hasn't already been replaced.
+        if (this.tasks.get(cacheKey) === existing)
+          this.tasks.delete(cacheKey)
+        throw err
+      }
+    }
+
+    const modelPath = await this.awaitModel()
+
+    // Re-check after the await: another caller may have populated the entry.
+    const racedExisting = this.tasks.get(cacheKey)
+    if (racedExisting !== undefined) {
+      this.tasks.delete(cacheKey)
+      this.tasks.set(cacheKey, racedExisting)
+      return racedExisting
+    }
+
+    // LRU eviction: drop oldest entry (first inserted).
+    if (this.tasks.size >= CACHE_MAX_SIZE) {
+      const oldestKey = this.tasks.keys().next().value
+      if (oldestKey !== undefined)
+        this.tasks.delete(oldestKey)
+    }
+
+    const buildPromise: Promise<CspIndex> = isGitUrl(source)
+      ? CspIndex.fromGit(source, {
+          // Only include `ref` when caller actually supplied one — avoids
+          // tripping `exactOptionalPropertyTypes` and matches semble's behavior
+          // (passing `ref=None` would be equivalent, but explicit-undefined is
+          // distinct from "not present" in TS).
+          ...(ref !== undefined ? { ref } : {}),
+          modelPath,
+          content: this.content,
+        })
+      : CspIndex.fromPath(cacheKey, {
+          modelPath,
+          content: this.content,
+        })
+
+    this.tasks.set(cacheKey, buildPromise)
+
+    try {
+      return await buildPromise
+    }
+    catch (err) {
+      // Only evict if this task hasn't already been replaced.
+      if (this.tasks.get(cacheKey) === buildPromise)
+        this.tasks.delete(cacheKey)
+      throw err
+    }
+  }
+
+  /**
+   * Remove the cached entry for `source`. Awaitable so callers (notably the
+   * file watcher) can guarantee the deletion lands before the next `get()`.
+   *
+   * Pass the same `ref` that was given to `get()` for a git source; the ref is
+   * part of the cache key, so omitting it only evicts the no-ref entry.
+   */
+  async evict(source: string, ref?: string): Promise<void> {
+    const cacheKey = await this.computeCacheKey(source, ref)
+    this.tasks.delete(cacheKey)
+  }
+
+  /** Number of currently cached entries (for tests / introspection). */
+  get size(): number {
+    return this.tasks.size
+  }
+
+  /**
+   * Start a background watcher that evicts + re-gets the index whenever
+   * files at `path` change. Uses chokidar (debounced).
+   *
+   * Calling this more than once stops the previous watcher first to avoid
+   * leaking file handles.
+   */
+  async startWatcher(watchPath: string): Promise<void> {
+    // Stop any existing watcher before installing a new one.
+    await this.stopWatcher()
+
+    interface ChokidarWatcher {
+      on: (event: string, cb: () => void) => void
+      close: () => Promise<void>
+    }
+    interface ChokidarModule {
+      watch: (
+        watchPath: string,
+        opts: { ignoreInitial: boolean, persistent: boolean },
+      ) => ChokidarWatcher
+    }
+    let chokidar: ChokidarModule
+    try {
+      // chokidar isn't a declared dep yet (Unit 0); resolve lazily so the
+      // module loads even when it's absent.
+      // @ts-expect-error chokidar may not be installed yet
+      const mod = (await import('chokidar')) as { default?: ChokidarModule } & ChokidarModule
+      chokidar = mod.default ?? mod
+    }
+    catch {
+      // chokidar not installed — silently no-op so callers that don't need
+      // watching still work.
+      return
+    }
+
+    // Match semble: watch everything. Upstream relies on the underlying
+    // walker's .gitignore handling to filter what actually ends up in the
+    // index; the watcher itself doesn't filter, so projects rooted inside a
+    // dotfile directory (e.g. ~/.config/proj) still re-index correctly.
+    const watcher = chokidar.watch(watchPath, {
+      ignoreInitial: true,
+      persistent: true,
+    })
+
+    let debounce: ReturnType<typeof setTimeout> | null = null
+    const onChange = (): void => {
+      if (debounce !== null)
+        clearTimeout(debounce)
+      debounce = setTimeout(() => {
+        debounce = null
+        // Await evict before get so the rebuild sees a fresh cache slot.
+        void (async () => {
+          try {
+            await this.evict(watchPath)
+            await this.get(watchPath)
+          }
+          catch {
+            // Swallow rebuild errors; the next explicit get() will surface them.
+          }
+        })()
+      }, 250)
+    }
+
+    watcher.on('add', onChange)
+    watcher.on('change', onChange)
+    watcher.on('unlink', onChange)
+    watcher.on('addDir', onChange)
+    watcher.on('unlinkDir', onChange)
+
+    this.watcherClose = async () => {
+      if (debounce !== null)
+        clearTimeout(debounce)
+      await watcher.close()
+    }
+  }
+
+  /** Stop the file watcher, if any. */
+  async stopWatcher(): Promise<void> {
+    if (this.watcherClose !== null) {
+      const close = this.watcherClose
+      this.watcherClose = null
+      await close()
+    }
+  }
+}
+
+/**
+ * Return a cached index for a repo, rejecting unsafe git transport schemes
+ * and missing-source cases with descriptive errors.
+ *
+ * An absent or empty `repo` selects `defaultSource`; `defaultRef` is applied
+ * only in that case, so an explicitly requested repo is never pinned to the
+ * ref that was configured for the default one.
+ */
+async function getIndex(
+  repo: string | undefined,
+  defaultSource: string | undefined,
+  cache: IndexCache,
+  defaultRef?: string,
+): Promise<CspIndex> {
+  if (
+    repo !== undefined
+    && isGitUrl(repo)
+    && !repo.startsWith('https://')
+    && !repo.startsWith('http://')
+  ) {
+    throw new Error(
+      `Only https://, http://, or local directory paths are accepted as \`repo\`. Got: ${JSON.stringify(repo)}`,
+    )
+  }
+  // Treat '' like "not supplied" so a blank tool argument cannot mask the default.
+  const useDefault = repo === undefined || repo === ''
+  const source = useDefault ? defaultSource : repo
+  if (source === undefined || source === '') {
+    throw new Error(
+      'No repo specified and no default index. '
+      + 'Pass an https:// or http:// git URL or local directory path as `repo`.',
+    )
+  }
+  try {
+    return await cache.get(source, useDefault ? defaultRef : undefined)
+  }
+  catch (exc) {
+    const msg = exc instanceof Error ? exc.message : String(exc)
+    throw new Error(`Failed to index ${JSON.stringify(source)}: ${msg}`)
+  }
+}
+
+// Exported for tests so they can exercise the safety branches without the SDK.
+export const _internal = { getIndex }
+
+/** Configured MCP server (typed loosely so we don't depend on the SDK at compile time). */
+export interface CspMcpServer {
+  /** Tool registry — exposed for test/introspection. */
+  readonly tools: ReadonlyMap<string, ToolDef>
+  /** True when the real `@modelcontextprotocol/sdk` server backs this object. */
+  readonly isPlaceholder: boolean
+  /** Connect to a transport (no-op for the placeholder). */
+  connect: (transport: unknown) => Promise<void>
+  /** Underlying SDK server, if any. */
+  readonly underlying: unknown
+}
+
+interface ToolDef {
+  title: string
+  description: string
+  handler: (args: Record<string, unknown>) => Promise<string>
+}
+
+/** Result count used when a tool call does not carry a usable `top_k`. */
+const DEFAULT_TOP_K = 5
+
+/**
+ * Read the requested result count from raw tool arguments.
+ *
+ * `top_k` (the registered schema name) wins when it is a number; otherwise
+ * `topK` is consulted. Values that are not finite numbers >= 1 fall back to
+ * `DEFAULT_TOP_K`, and fractional values are truncated, so the index is never
+ * asked for NaN, zero, negative, or fractional result counts.
+ */
+function readTopK(args: Record<string, unknown>): number {
+  const raw = typeof args.top_k === 'number' ? args.top_k : args.topK
+  if (typeof raw !== 'number' || !Number.isFinite(raw) || raw < 1)
+    return DEFAULT_TOP_K
+  return Math.trunc(raw)
+}
+
+/**
+ * Build and return a configured MCP server backed by the given cache.
+ *
+ * If `@modelcontextprotocol/sdk` is installed, this registers `search` and
+ * `find_related` tools on a real `McpServer`. If it isn't (yet), a
+ * placeholder is returned so the rest of the module remains usable and
+ * testable.
+ *
+ * `defaultRef` is the git ref `defaultSource` is indexed at; it is not applied
+ * to repos that a tool call names explicitly.
+ */
+export async function createServer(
+  cache: IndexCache,
+  defaultSource?: string,
+  defaultRef?: string,
+): Promise<CspMcpServer> {
+  const searchTool: ToolDef = {
+    title: 'Search a codebase with a natural-language or code query.',
+    description:
+      'Pass a git URL or local path as `repo` to index it on demand; indexes are cached for the session. '
+      + 'Use this to find where something is implemented, understand a library, or locate related code.',
+    handler: async (args) => {
+      try {
+        const query = String(args.query ?? '')
+        const repo = args.repo === undefined ? undefined : String(args.repo)
+        const topK = readTopK(args)
+
+        const index = await getIndex(repo, defaultSource, cache, defaultRef)
+        const results = index.search(query, { topK })
+        if (results.length === 0)
+          return JSON.stringify({ error: 'No results found.' })
+        return JSON.stringify(formatResults(query, results))
+      }
+      catch (err) {
+        return err instanceof Error ? err.message : String(err)
+      }
+    },
+  }
+
+  const findRelatedTool: ToolDef = {
+    title: 'Find code chunks semantically similar to a specific location in a file.',
+    description:
+      'Use after `search` to explore related implementations or callers. '
+      + 'Pass file_path and line from a prior search result.',
+    handler: async (args) => {
+      try {
+        const filePath = String(args.file_path ?? args.filePath ?? '')
+        const line = Number(args.line ?? 0)
+        const repo = args.repo === undefined ? undefined : String(args.repo)
+        const topK = readTopK(args)
+
+        const index = await getIndex(repo, defaultSource, cache, defaultRef)
+        const chunk = resolveChunk(index.chunks, filePath, line)
+        if (chunk === null) {
+          return (
+            `No chunk found at ${filePath}:${line}. `
+            + 'Make sure the file is indexed and the line number is within a known chunk.'
+          )
+        }
+        const results = index.findRelated(chunk, { topK })
+        if (results.length === 0) {
+          return JSON.stringify({
+            error: `No related chunks found for ${filePath}:${line}.`,
+          })
+        }
+        return JSON.stringify(
+          formatResults(`Chunks related to ${filePath}:${line}`, results),
+        )
+      }
+      catch (err) {
+        return err instanceof Error ? err.message : String(err)
+      }
+    },
+  }
+
+  const tools = new Map<string, ToolDef>([
+    ['search', searchTool],
+    ['find_related', findRelatedTool],
+  ])
+
+  // Try to wire up the real MCP SDK; fall back to a placeholder if it's not
+  // installed (per the unit spec — Unit 0 may not be merged yet).
+  type McpServerCtor = new (
+    info: { name: string, version?: string },
+    options?: { instructions?: string },
+  ) => McpServerInstance
+  interface McpServerInstance {
+    registerTool: (
+      name: string,
+      config: { title: string, description: string, inputSchema?: unknown },
+      handler: (args: Record<string, unknown>) => Promise<unknown>,
+    ) => void
+    connect: (transport: unknown) => Promise<void>
+  }
+  let McpServer: McpServerCtor | null = null
+  try {
+    // @ts-expect-error @modelcontextprotocol/sdk may not be installed yet (Unit 0)
+    const mod = (await import('@modelcontextprotocol/sdk/server/mcp.js')) as {
+      McpServer: McpServerCtor
+    }
+    McpServer = mod.McpServer
+  }
+  catch {
+    McpServer = null
+  }
+
+  if (McpServer === null) {
+    return {
+      tools,
+      isPlaceholder: true,
+      connect: async () => {
+        throw new Error(
+          '@modelcontextprotocol/sdk is not installed; createServer returned a placeholder.',
+        )
+      },
+      underlying: null,
+    }
+  }
+
+  const underlying = new McpServer(
+    { name: 'csp', version: '0.0.0' },
+    { instructions: SERVER_INSTRUCTIONS },
+  )
+
+  // The MCP SDK's `registerTool` `inputSchema` expects a Zod raw shape
+  // (`Record<string, ZodTypeAny>`), not raw JSON Schema. zod is a transitive
+  // dependency of @modelcontextprotocol/sdk, so if the SDK loaded we should
+  // be able to load zod too. If it isn't reachable for any reason, fall back
+  // to registering the tool without an input schema so it's still callable.
+  interface ZodLikeSchema {
+    optional: () => ZodLikeSchema
+    describe: (desc: string) => ZodLikeSchema
+    default: (value: unknown) => ZodLikeSchema
+  }
+  interface ZodLikeModule {
+    string: () => ZodLikeSchema
+    number: () => ZodLikeSchema
+  }
+  let z: ZodLikeModule | null = null
+  try {
+    // @ts-expect-error zod is a transitive dep of @modelcontextprotocol/sdk
+    const zmod = (await import('zod')) as { z?: ZodLikeModule } & ZodLikeModule
+    z = zmod.z ?? zmod
+  }
+  catch {
+    z = null
+  }
+
+  const searchSchema = z === null
+    ? undefined
+    : {
+        query: z.string().describe('Natural language or code query.'),
+        repo: z.string().describe(REPO_DESCRIPTION).optional(),
+        top_k: z.number().describe('Number of results to return.').default(DEFAULT_TOP_K),
+      }
+
+  const findRelatedSchema = z === null
+    ? undefined
+    : {
+        file_path: z
+          .string()
+          .describe(
+            'Path to the file as stored in the index (use file_path from a search result).',
+          ),
+        line: z.number().describe('Line number (1-indexed).'),
+        repo: z.string().describe(REPO_DESCRIPTION).optional(),
+        top_k: z.number().describe('Number of similar chunks to return.').default(DEFAULT_TOP_K),
+      }
+
+  underlying.registerTool(
+    'search',
+    {
+      title: searchTool.title,
+      description: searchTool.description,
+      ...(searchSchema !== undefined ? { inputSchema: searchSchema } : {}),
+    },
+    async args => ({
+      content: [{ type: 'text', text: await searchTool.handler(args as Record<string, unknown>) }],
+    }),
+  )
+
+  underlying.registerTool(
+    'find_related',
+    {
+      title: findRelatedTool.title,
+      description: findRelatedTool.description,
+      ...(findRelatedSchema !== undefined ? { inputSchema: findRelatedSchema } : {}),
+    },
+    async args => ({
+      content: [
+        { type: 'text', text: await findRelatedTool.handler(args as Record<string, unknown>) },
+      ],
+    }),
+  )
+
+  return {
+    tools,
+    isPlaceholder: false,
+    connect: transport => underlying.connect(transport),
+    underlying,
+  }
+}
+
+export interface ServeOptions {
+  ref?: string
+  content?: ContentType[]
+}
+
+/**
+ * Start an MCP stdio server, optionally pre-indexing a default source.
+ *
+ * Pre-warms the embedding model in parallel with starting the server and
+ * starts a file watcher for local paths.
+ */
+export async function serve(path?: string, options: ServeOptions = {}): Promise<void> {
+  const cache = new IndexCache({ content: options.content ?? [ContentType.CODE] })
+
+  // Kick off model load + optional pre-index in parallel with server startup.
+  const prewarm = (async (): Promise<void> => {
+    try {
+      cache.ensureModelLoading()
+      // Wait for the model load to settle before pre-indexing.
+      // awaitModel is private; ensure the model is ready by triggering and
+      // catching get() — which itself awaits the model.
+      if (path !== undefined && path !== '') {
+        try {
+          await cache.get(path, options.ref)
+        }
+        catch {
+          // Pre-indexing failure shouldn't crash the server.
+        }
+        if (!isGitUrl(path)) {
+          try {
+            await cache.startWatcher(path)
+          }
+          catch {
+            // Watcher failure is non-fatal.
+          }
+        }
+      }
+    }
+    catch {
+      // Nothing above should throw; this keeps prewarm reject-free so it is always safe to await.
+    }
+  })()
+
+  const server = await createServer(cache, path, options.ref)
+
+  // Attempt to attach stdio transport from the SDK; if not available, log and exit cleanly.
+  let StdioTransportCtor:
+    | (new () => { close?: () => Promise<void> | void })
+    | null = null
+  try {
+    // @ts-expect-error @modelcontextprotocol/sdk may not be installed yet (Unit 0)
+    const mod = (await import('@modelcontextprotocol/sdk/server/stdio.js')) as {
+      StdioServerTransport: new () => { close?: () => Promise<void> | void }
+    }
+    StdioTransportCtor = mod.StdioServerTransport
+  }
+  catch {
+    StdioTransportCtor = null
+  }
+
+  if (StdioTransportCtor === null || server.isPlaceholder) {
+    // No SDK — nothing to serve. Await pre-warm so callers can inspect the
+    // cache, then tear down the watcher so this path doesn't leak file
+    // handles (the prewarm above may have started one).
+    try {
+      await prewarm
+    }
+    finally {
+      await cache.stopWatcher()
+    }
+    return
+  }
+
+  // Hook into stdin EOF so we can return once the client disconnects, mirroring
+  // semble's `run_stdio_async()` blocking semantics. Both listeners share a
+  // single cleanup so whichever event fires first removes the other —
+  // otherwise repeated `serve()` calls (tests, restarts) accumulate listeners
+  // on `process.stdin` and trip MaxListenersExceededWarning.
+  // `detachStdin` is hoisted so the `finally` below can also remove them when
+  // connect() rejects before stdin ever closes.
+  let detachStdin = (): void => {}
+  const stdinClosed = new Promise<void>((resolve) => {
+    const onClosed = (): void => {
+      detachStdin()
+      resolve()
+    }
+    detachStdin = (): void => {
+      process.stdin.removeListener('end', onClosed)
+      process.stdin.removeListener('close', onClosed)
+    }
+    process.stdin.on('end', onClosed)
+    process.stdin.on('close', onClosed)
+  })
+
+  const transport = new StdioTransportCtor()
+  try {
+    // connect() must be inside the try so a failure here still runs the
+    // transport/watcher cleanup below.
+    await server.connect(transport)
+    // Block on stdin close — connect() returns immediately after handshake,
+    // and we MUST NOT close the transport until the client disconnects.
+    await stdinClosed
+  }
+  finally {
+    // Detach here too: a rejected connect() would otherwise leave both
+    // listeners on process.stdin.
+    detachStdin()
+    try {
+      if (typeof transport.close === 'function')
+        await transport.close()
+    }
+    finally {
+      // prewarm never rejects. Waiting for it here means a watcher it has not
+      // started yet cannot outlive this call, and stopWatcher() still runs
+      // when transport.close() throws.
+      await prewarm
+      await cache.stopWatcher()
+    }
+  }
+}
diff --git a/src/types.ts b/src/types.ts
index 740c4cb..9656945 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -1,48 +1,38 @@
-// TODO(unit-1): replace with the real port from `feat/unit-1-types`.
-//
-// This file is a *placeholder stub* so the public barrel (`src/index.ts`)
-// type-checks and `bun test src/index.test.ts` can import the package in
-// isolation. Unit 1 lands the real port of `src/semble/types.py`; when it
-// merges, this file is overwritten wholesale (see PR `feat/unit-1-types`).
-//
-// Keep the exported names and value/type duality of `ContentType` in lockstep
-// with Unit 1 — the barrel re-exports both forms.
+// Port of src/semble/types.py
+// Minimal stub — full implementation lands in Unit 1.
 
-/**
- * Content type for indexing and search pipeline selection.
- *
- * Placeholder mirroring Unit 1's `const`-object enum. Values are the same
- * lowercase strings as the upstream Python `str` enum so CLI flags and
- * persisted indices round-trip.
- */
-export const ContentType = {
-  Code: 'code',
-  Docs: 'docs',
-  Config: 'config',
-} as const
-export type ContentType = (typeof ContentType)[keyof typeof ContentType]
+/** Call type for token-savings tracking. */
+export enum CallType {
+  SEARCH = 'search',
+  FIND_RELATED = 'find_related',
+}
+
+/** Content type for indexing and search pipeline selection. */
+export enum ContentType {
+  CODE = 'code',
+  DOCS = 'docs',
+  CONFIG = 'config',
+}
 
-/** Placeholder shape — Unit 1 ships the authoritative definition. */
+/** A single indexable unit of code. */
 export interface Chunk {
-  readonly content: string
-  readonly filePath: string
-  readonly startLine: number
-  readonly endLine: number
-  readonly language?: string | undefined
+  content: string
+  filePath: string
+  startLine: number
+  endLine: number
+  language?: string | null
 }
 
-/** Placeholder shape — Unit 1 ships the authoritative definition. */
+/** A single search result with score and source. */
 export interface SearchResult {
-  readonly chunk: Chunk
-  readonly score: number
+  chunk: Chunk
+  score: number
+  toDict: () => Record<string, unknown>
 }
 
-/** Placeholder shape — Unit 1 ships the authoritative definition. */
+/** Statistics about the current index state. */
 export interface IndexStats {
-  readonly indexedFiles: number
-  readonly totalChunks: number
-  readonly languages: Readonly<Record<string, number>>
+  indexedFiles: number
+  totalChunks: number
+  languages: Record<string, number>
 }
-
-/** Placeholder alias — Unit 1 ships the authoritative definition. */
-export type EmbeddingMatrix = Float32Array
diff --git a/src/utils.ts b/src/utils.ts
index bdb1d77..455cf99 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -1,21 +1,7 @@
 // Port of src/semble/utils.py
+// Minimal stub — full implementation lands in Unit 3.
 
-// Stopgap structural types until ./types.ts lands.
-// Mirror semble.types.Chunk / SearchResult with camelCase field names per
-// the @pleaseai/csp public-API conventions.
-export interface Chunk {
-  content: string
-  filePath: string
-  startLine: number
-  endLine: number
-  language?: string | null
-}
-
-export interface SearchResult {
-  chunk: Chunk
-  score: number
-  toDict: () => Record<string, unknown>
-}
+import type { Chunk, SearchResult } from './types.ts'
 
 const GIT_URL_SCHEMES = [
   'https://',
@@ -59,7 +45,6 @@ export function resolveChunk(
     ) {
       if (line < chunk.endLine)
         return chunk
-      // line === endLine: boundary; keep as fallback for end-of-file chunks.
       if (fallback === null)
         fallback = chunk
     }