diff --git a/src/ranking/boosting.test.ts b/src/ranking/boosting.test.ts index 8592f21..cf01723 100644 --- a/src/ranking/boosting.test.ts +++ b/src/ranking/boosting.test.ts @@ -1,4 +1,4 @@ -import type { Chunk } from './boosting.ts' +import type { Chunk } from '../types.ts' import { describe, expect, test } from 'bun:test' import { _chunkDefinesSymbol, diff --git a/src/ranking/boosting.ts b/src/ranking/boosting.ts index fc62479..f524d25 100644 --- a/src/ranking/boosting.ts +++ b/src/ranking/boosting.ts @@ -1,34 +1,7 @@ // Port of src/semble/ranking/boosting.py -// TODO(integration): replace inline Chunk type with `import type { Chunk } from '../types.ts'` -// once Unit 1 lands in main. -export interface Chunk { - content: string - filePath: string - startLine: number - endLine: number - language?: string -} - -// TODO(integration): replace with import from '../tokens.ts' once Unit 2 lands in main. -const TOKEN_CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+/g - -function splitIdentifier(token: string): string[] { - const lower = token.toLowerCase() - let parts: string[] = [] - - if (token.includes('_')) { - parts = lower.split('_').filter(p => p.length > 0) - } - else { - parts = (token.match(TOKEN_CAMEL_RE) ?? []).map(m => m.toLowerCase()) - } - - if (parts.length >= 2) { - return [lower, ...parts] - } - return [lower] -} +import type { Chunk } from '../types.ts' +import { splitIdentifier } from '../tokens.ts' // Symbol-lookup queries: namespace-qualified, leading-underscore, or containing // uppercase/underscore. Plain lowercase words (e.g. "session") are NL, not symbols. diff --git a/src/ranking/penalties.ts b/src/ranking/penalties.ts index 10d2684..0758a27 100644 --- a/src/ranking/penalties.ts +++ b/src/ranking/penalties.ts @@ -1,12 +1,6 @@ // Port of src/semble/ranking/penalties.py -// Inlined Chunk type until src/types.ts lands (Unit 1). -interface Chunk { - content: string - filePath: string - startLine: number - endLine: number - language?: string -} + +import type { Chunk } from '../types.ts' // Patterns that identify test files across common languages. // Grouped by language for readability; combined into a single regex. diff --git a/src/search.ts b/src/search.ts index fc11a0b..253fb90 100644 --- a/src/search.ts +++ b/src/search.ts @@ -1,6 +1,9 @@ // Port of src/semble/search.py import type { Chunk, SearchResult } from './types.ts' +import { applyQueryBoost, boostMultiChunkFiles } from './ranking/boosting.ts' +import { rerankTopK } from './ranking/penalties.ts' +import { resolveAlpha } from './ranking/weighting.ts' import { tokenize } from './tokens.ts' // Re-export the shared types so downstream importers (and tests) can keep @@ -35,109 +38,6 @@ function makeResult(chunk: Chunk, score: number): SearchResult { } } -// TODO(integration): replace with import from './ranking/weighting.ts' -const _ALPHA_SYMBOL = 0.3 -const _ALPHA_NL = 0.5 -const _SYMBOL_QUERY_RE = /^(?:[A-Za-z_]\w*(?:(?:::|\\|->|\.)[A-Za-z_]\w*)+|_\w*|[A-Za-z][\da-z]*[A-Z_]\w*|[A-Z][A-Za-z0-9]*)$/ -function isSymbolQuery(query: string): boolean { - return _SYMBOL_QUERY_RE.test(query.trim()) -} -function resolveAlpha(query: string, alpha: number | undefined): number { - if (alpha !== undefined) { - return alpha - } - return isSymbolQuery(query) ? _ALPHA_SYMBOL : _ALPHA_NL -} - -// TODO(integration): replace with import from './ranking/boosting.ts' -function boostMultiChunkFiles(scores: Map): void { - if (scores.size === 0) { - return - } - let maxScore = -Infinity - for (const v of scores.values()) { - if (v > maxScore) { - maxScore = v - } - } - if (maxScore === 0) { - return - } - const fileSum = new Map() - const bestChunk = new Map() - for (const [chunk, score] of scores) { - fileSum.set(chunk.filePath, (fileSum.get(chunk.filePath) ?? 0) + score) - const existing = bestChunk.get(chunk.filePath) - if (existing === undefined || score > (scores.get(existing) ?? -Infinity)) { - bestChunk.set(chunk.filePath, chunk) - } - } - let maxFileSum = -Infinity - for (const v of fileSum.values()) { - if (v > maxFileSum) { - maxFileSum = v - } - } - const boostUnit = maxScore * 0.2 - for (const [filePath, chunk] of bestChunk) { - const sum = fileSum.get(filePath) ?? 0 - scores.set(chunk, (scores.get(chunk) ?? 0) + (boostUnit * sum) / maxFileSum) - } -} - -// TODO(integration): replace with import from './ranking/boosting.ts' -function applyQueryBoost( - combinedScores: Map, - _query: string, - _allChunks: Chunk[], -): Map { - // Minimal stub — preserves identity. Full implementation arrives with ranking/boosting.ts. - return new Map(combinedScores) -} - -// TODO(integration): replace with import from './ranking/penalties.ts' -function rerankTopK( - scores: Map, - topK: number, - options: { penalisePaths: boolean } = { penalisePaths: true }, -): Array<[Chunk, number]> { - // Minimal stub mirroring the Python file-saturation logic without path penalties. - void options - if (scores.size === 0) { - return [] - } - const ranked = [...scores.entries()].sort((a, b) => b[1] - a[1]) - const FILE_SATURATION_THRESHOLD = 1 - const FILE_SATURATION_DECAY = 0.5 - const fileSelected = new Map() - const selected: Array<[number, Chunk]> = [] - let minSelected = Number.POSITIVE_INFINITY - - for (const [chunk, penScore] of ranked) { - if (selected.length >= topK && penScore <= minSelected) { - break - } - const alreadySelected = fileSelected.get(chunk.filePath) ?? 0 - let effScore = penScore - if (alreadySelected >= FILE_SATURATION_THRESHOLD) { - const excess = alreadySelected - FILE_SATURATION_THRESHOLD + 1 - effScore *= FILE_SATURATION_DECAY ** excess - } - selected.push([effScore, chunk]) - fileSelected.set(chunk.filePath, alreadySelected + 1) - if (selected.length >= topK) { - minSelected = Number.POSITIVE_INFINITY - for (const [s] of selected) { - if (s < minSelected) { - minSelected = s - } - } - } - } - selected.sort((a, b) => b[0] - a[0]) - return selected.slice(0, topK).map(([score, chunk]) => [chunk, score]) -} - // --- Public exports --------------------------------------------------------- export const RRF_K = 60 diff --git a/src/tokens.test.ts b/src/tokens.test.ts index 4dfa7de..840db9c 100644 --- a/src/tokens.test.ts +++ b/src/tokens.test.ts @@ -60,6 +60,18 @@ describe('splitIdentifier', () => { 'def', ]) }) + + it('splits kebab-case and dotted path stems on `-`/`.` separators', () => { + // `splitIdentifier` is also called on raw file-path stems (e.g. in + // ranking/boosting.ts). The camel regex treats `-`/`.` as separators, so + // the lowercase fast-path must not short-circuit these. + expect(splitIdentifier('user-service')).toEqual([ + 'user-service', + 'user', + 'service', + ]) + expect(splitIdentifier('foo.bar')).toEqual(['foo.bar', 'foo', 'bar']) + }) }) describe('tokenize', () => { diff --git a/src/tokens.ts b/src/tokens.ts index 0abaf7d..a0f7024 100644 --- a/src/tokens.ts +++ b/src/tokens.ts @@ -19,10 +19,12 @@ const CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+/g export function splitIdentifier(token: string): string[] { const lower = token.toLowerCase() - // Fast-path: pure-lowercase tokens with no underscores/digits cannot split - // further. TOKEN_RE only matches [a-zA-Z0-9_], so the absence of `_`, - // uppercase, and digits means the token is already a single sub-token. - if (!token.includes('_') && !/[A-Z0-9]/.test(token)) { + // Fast-path: a token made up solely of lowercase ASCII letters cannot split + // further, since `CAMEL_RE` would match it as a single run. This guard is + // intentionally narrow — `splitIdentifier` is also called on raw path stems + // (e.g. "user-service", "foo.bar"), and `CAMEL_RE` treats `-`/`.` as + // separators, so those must fall through to the splitting logic below. + if (/^[a-z]+$/.test(token)) { return [lower] }