// Unit tests for the query-boosting helpers in ./boosting.ts (bun:test runner).
import { describe, expect, test } from 'bun:test'
import {
  _chunkDefinesSymbol,
  _countKeywordMatches,
  _extractSymbolName,
  _stemMatches,
  applyQueryBoost,
  boostMultiChunkFiles,
  type Chunk,
  DEFINITION_BOOST_MULTIPLIER,
  EMBEDDED_SYMBOL_BOOST_SCALE,
  FILE_COHERENCE_BOOST_FRAC,
  isSymbolQuery,
} from './boosting.ts'

/** Build a minimal Chunk fixture; line numbers default to 1..10. */
function mkChunk(content: string, filePath: string, startLine = 1, endLine = 10): Chunk {
  return { content, filePath, startLine, endLine }
}

describe('isSymbolQuery', () => {
  test('PascalCase identifiers are symbol queries', () => {
    expect(isSymbolQuery('HandlerStack')).toBe(true)
    expect(isSymbolQuery('Client')).toBe(true)
  })

  test('namespace-qualified identifiers are symbol queries', () => {
    expect(isSymbolQuery('Sinatra::Base')).toBe(true)
    expect(isSymbolQuery('Phoenix.Router')).toBe(true)
    expect(isSymbolQuery('foo->bar')).toBe(true)
    expect(isSymbolQuery('A\\B\\C')).toBe(true)
  })

  test('leading-underscore identifiers are symbol queries', () => {
    expect(isSymbolQuery('_private')).toBe(true)
    expect(isSymbolQuery('_')).toBe(true)
  })

  test('snake_case identifiers are symbol queries', () => {
    expect(isSymbolQuery('my_func')).toBe(true)
  })

  test('plain lowercase words are NL', () => {
    expect(isSymbolQuery('session')).toBe(false)
    expect(isSymbolQuery('foo')).toBe(false)
  })

  test('NL phrases are NL', () => {
    expect(isSymbolQuery('how does this work')).toBe(false)
    expect(isSymbolQuery('find the cache layer')).toBe(false)
  })

  test('trims whitespace', () => {
    expect(isSymbolQuery(' HandlerStack ')).toBe(true)
  })
})

describe('_extractSymbolName', () => {
  test('extracts trailing name after :: separator', () => {
    expect(_extractSymbolName('Sinatra::Base')).toBe('Base')
  })

  test('extracts trailing name after .', () => {
    expect(_extractSymbolName('Phoenix.Router')).toBe('Router')
  })

  test('extracts trailing name after ->', () => {
    expect(_extractSymbolName('foo->bar')).toBe('bar')
  })

  test('returns the original (trimmed) when no separator', () => {
    expect(_extractSymbolName('Client')).toBe('Client')
    expect(_extractSymbolName(' Client ')).toBe('Client')
  })
})

describe('_stemMatches', () => {
  test('exact match', () => {
    expect(_stemMatches('client', 'client')).toBe(true)
  })

  test('snake-stripped match', () => {
    expect(_stemMatches('handler_stack', 'handlerstack')).toBe(true)
  })

  test('plural-stripped match', () => {
    expect(_stemMatches('clients', 'client')).toBe(true)
    expect(_stemMatches('handler_stacks', 'handlerstack')).toBe(true)
  })

  test('no match', () => {
    expect(_stemMatches('foo', 'bar')).toBe(false)
  })
})

describe('_chunkDefinesSymbol', () => {
  test('matches class definition', () => {
    const chunk = mkChunk('class HandlerStack:\n pass\n', 'a.py')
    expect(_chunkDefinesSymbol(chunk, 'HandlerStack')).toBe(true)
  })

  test('matches def function', () => {
    const chunk = mkChunk('def my_func(x):\n return x\n', 'a.py')
    expect(_chunkDefinesSymbol(chunk, 'my_func')).toBe(true)
  })

  test('matches namespace-qualified defmodule for trailing name', () => {
    const chunk = mkChunk('defmodule Phoenix.Router do\nend\n', 'a.ex')
    expect(_chunkDefinesSymbol(chunk, 'Router')).toBe(true)
  })

  test('case-sensitive: does not match "Module" as keyword', () => {
    const chunk = mkChunk('Module Foo', 'a.txt')
    expect(_chunkDefinesSymbol(chunk, 'Foo')).toBe(false)
  })

  test('case-insensitive for SQL DDL', () => {
    const chunk = mkChunk('create table users (id int);', 'a.sql')
    expect(_chunkDefinesSymbol(chunk, 'users')).toBe(true)
    const chunk2 = mkChunk('CREATE TABLE users (id int);', 'a.sql')
    expect(_chunkDefinesSymbol(chunk2, 'users')).toBe(true)
  })

  test('does not match in the middle of a word', () => {
    const chunk = mkChunk('# subclass Foo\n', 'a.py')
    expect(_chunkDefinesSymbol(chunk, 'Foo')).toBe(false)
  })
})

describe('_countKeywordMatches', () => {
  test('all exact matches', () => {
    expect(_countKeywordMatches(new Set(['foo', 'bar']), new Set(['foo', 'bar', 'baz']))).toBe(2)
  })

  test('prefix overlap (min 3 chars)', () => {
    // "dep" matches "dependency" (keyword shorter than part)
    expect(_countKeywordMatches(new Set(['dep']), new Set(['dependency']))).toBe(1)
    // "depend" matches "dependencies" (both ≥3, longer.startsWith(shorter))
    expect(_countKeywordMatches(new Set(['depend']), new Set(['dependencies']))).toBe(1)
    // Part shorter than keyword also works (shorter is part)
    expect(_countKeywordMatches(new Set(['dependency']), new Set(['dep']))).toBe(1)
  })

  test('skips < 3 chars', () => {
    expect(_countKeywordMatches(new Set(['de']), new Set(['dependency']))).toBe(0)
  })
})

describe('boostMultiChunkFiles', () => {
  test('top chunk receives boost_unit * fileSum / maxFileSum', () => {
    const c1 = mkChunk('x', 'a.ts', 1, 10)
    const c2 = mkChunk('y', 'a.ts', 11, 20)
    const c3 = mkChunk('z', 'a.ts', 21, 30)
    const cOther = mkChunk('q', 'b.ts')

    const scores = new Map<Chunk, number>([
      [c1, 0.5],
      [c2, 0.4],
      [c3, 0.3],
      [cOther, 0.2],
    ])

    boostMultiChunkFiles(scores)

    // Top chunk in a.ts is c1 (0.5). file_sum["a.ts"] = 1.2, file_sum["b.ts"] = 0.2.
    // max_score = 0.5, boost_unit = 0.5 * 0.2 = 0.1, max_file_sum = 1.2.
    // c1 gets: 0.5 + 0.1 * 1.2 / 1.2 = 0.6
    // cOther gets: 0.2 + 0.1 * 0.2 / 1.2 ≈ 0.21666...
    expect(scores.get(c1)).toBeCloseTo(0.6, 10)
    expect(scores.get(c2)).toBe(0.4)
    expect(scores.get(c3)).toBe(0.3)
    expect(scores.get(cOther)).toBeCloseTo(0.2 + 0.1 * 0.2 / 1.2, 10)
  })

  test('no-op on empty map', () => {
    const scores = new Map<Chunk, number>()
    boostMultiChunkFiles(scores)
    expect(scores.size).toBe(0)
  })

  test('no-op when max score is zero', () => {
    const c = mkChunk('x', 'a.ts')
    const scores = new Map<Chunk, number>([[c, 0]])
    boostMultiChunkFiles(scores)
    expect(scores.get(c)).toBe(0)
  })

  test('no NaN/Infinity when fileSums cancel to zero', () => {
    // Positive and negative scores within each file sum to zero → maxFileSum == 0.
    // Without the guard, the boost formula would divide by zero and corrupt the scores map.
    const c1 = mkChunk('x', 'a.ts', 1, 10)
    const c2 = mkChunk('y', 'a.ts', 11, 20)
    const scores = new Map<Chunk, number>([
      [c1, 1.0],
      [c2, -1.0],
    ])
    boostMultiChunkFiles(scores)
    const v1 = scores.get(c1)
    const v2 = scores.get(c2)
    expect(Number.isFinite(v1 ?? Number.NaN)).toBe(true)
    expect(Number.isFinite(v2 ?? Number.NaN)).toBe(true)
    // No mutation expected when maxFileSum <= 0.
    expect(v1).toBe(1.0)
    expect(v2).toBe(-1.0)
  })

  test('uses FILE_COHERENCE_BOOST_FRAC = 0.2', () => {
    // Single chunk, single file → fileSum == maxFileSum, so boost = boost_unit.
    const c = mkChunk('x', 'a.ts')
    const scores = new Map<Chunk, number>([[c, 1.0]])
    boostMultiChunkFiles(scores)
    expect(scores.get(c)).toBeCloseTo(1.0 + 1.0 * FILE_COHERENCE_BOOST_FRAC, 10)
  })
})

describe('applyQueryBoost', () => {
  test('symbol query with definition keyword boosts chunk by DEFINITION_BOOST_MULTIPLIER * maxScore (1.0× when stem does not match)', () => {
    // File stem is "other", not "handlerstack" → 1.0× tier.
    const defChunk = mkChunk('class HandlerStack:\n pass\n', 'other.py')
    const otherChunk = mkChunk('print("hi")', 'b.py')

    const scores = new Map<Chunk, number>([
      [defChunk, 0.5],
      [otherChunk, 1.0],
    ])
    const boosted = applyQueryBoost(scores, 'HandlerStack', [defChunk, otherChunk])

    // maxScore = 1.0, boostUnit = 1.0 * 3.0 = 3.0; defChunk picks up 3.0 (1.0× tier).
    expect(boosted.get(defChunk)).toBeCloseTo(0.5 + 1.0 * DEFINITION_BOOST_MULTIPLIER, 10)
    expect(boosted.get(otherChunk)).toBe(1.0)
  })

  test('symbol query with matching file stem gets 1.5× tier boost', () => {
    // Stem "handler_stack" matches "handlerstack" after snake-stripping.
    const defChunk = mkChunk('class HandlerStack:\n pass\n', 'handler_stack.py')
    const scores = new Map<Chunk, number>([[defChunk, 0.5]])
    const boosted = applyQueryBoost(scores, 'HandlerStack', [defChunk])
    // boostUnit = 0.5 * 3.0 = 1.5; tier = 1.5 * 1.5 = 2.25; new score = 0.5 + 2.25 = 2.75.
    expect(boosted.get(defChunk)).toBeCloseTo(2.75, 10)
  })

  test('symbol query promotes non-candidate stem-matching chunks', () => {
    const candidate = mkChunk('print("hi")', 'b.py')
    const nonCandidate = mkChunk('class HandlerStack:\n pass\n', 'handler_stack.py')
    const scores = new Map<Chunk, number>([[candidate, 1.0]])
    const boosted = applyQueryBoost(scores, 'HandlerStack', [candidate, nonCandidate])
    // Non-candidate appears with score = boostUnit * 1.5 = 1.0 * 3.0 * 1.5 = 4.5.
    expect(boosted.get(nonCandidate)).toBeCloseTo(4.5, 10)
  })

  test('NL query with embedded PascalCase triggers half-strength embedded boost', () => {
    const defChunk = mkChunk('class StateManager:\n pass\n', 'state_manager.py')
    const scores = new Map<Chunk, number>([[defChunk, 1.0]])
    const boosted = applyQueryBoost(
      scores,
      'where does the StateManager initialize state',
      [defChunk],
    )
    // Stem boost: after stopword removal the keywords are {statemanager, initialize, state};
    // the path parts are {state_manager, state, manager}. "state" matches exactly and
    // "statemanager" matches by prefix ("state"), so ratio = 2/3 and boost = 1.0 * 1.0 * 2/3.
    const expectedStem = 2 / 3
    // Embedded boost: boostUnit = 1.0 * DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE = 1.5,
    // and the file stem matches the symbol, so tier = 1.5 * 1.5 = 2.25.
    const expectedEmbedded = DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE * 1.5
    // Both boosts are deterministic, so pin the exact total rather than a lower bound.
    expect(boosted.get(defChunk)).toBeCloseTo(1.0 + expectedStem + expectedEmbedded, 10)
  })

  test('returns a new map and does not mutate input', () => {
    const c = mkChunk('class Foo:\n pass\n', 'foo.py')
    const original = new Map<Chunk, number>([[c, 1.0]])
    const boosted = applyQueryBoost(original, 'Foo', [c])
    expect(original.get(c)).toBe(1.0)
    expect(boosted).not.toBe(original)
    expect(boosted.get(c)).toBeGreaterThan(1.0)
  })

  test('empty input returns a fresh map (no aliasing of caller state)', () => {
    const empty = new Map<Chunk, number>()
    const out = applyQueryBoost(empty, 'foo', [])
    expect(out.size).toBe(0)
    // Result must not alias the caller's map: mutating the result must not affect the input.
    expect(out).not.toBe(empty)
    out.set(mkChunk('x', 'a.ts'), 1)
    expect(empty.size).toBe(0)
  })

  test('NL query boosts via stem matches when file path words match', () => {
    const c = mkChunk('print("hi")', 'cache_layer.py')
    const scores = new Map<Chunk, number>([[c, 1.0]])
    const boosted = applyQueryBoost(scores, 'find the cache layer', [c])
    // Keywords: {find, the→stopword, cache, layer} → {find, cache, layer}.
    // Parts from "cache_layer" split → cache_layer, cache, layer
    // Matches: cache, layer → n=2, ratio=2/3, boost = 1.0 * 1.0 * 2/3
    expect(boosted.get(c)).toBeCloseTo(1.0 + 2 / 3, 10)
  })
})
// Port of src/semble/ranking/boosting.py

// TODO(integration): replace inline Chunk type with `import type { Chunk } from '../types.ts'`
// once Unit 1 lands in main.
/** Minimal chunk shape consumed by the boosting functions in this module. */
export interface Chunk {
  content: string
  filePath: string
  startLine: number
  endLine: number
  language?: string
}

// TODO(integration): replace with import from '../tokens.ts' once Unit 2 lands in main.
const TOKEN_CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+/g

/**
 * Split an identifier into lowercase sub-tokens.
 *
 * Tokens containing `_` are split on underscores (empty segments dropped); all other
 * tokens are split on camelCase/digit boundaries via TOKEN_CAMEL_RE. When at least two
 * parts are found the result is the whole lowercased token followed by its parts;
 * otherwise it is just the lowercased token.
 */
function splitIdentifier(token: string): string[] {
  const lower = token.toLowerCase()
  const parts = token.includes('_')
    ? lower.split('_').filter(p => p.length > 0)
    : (token.match(TOKEN_CAMEL_RE) ?? []).map(m => m.toLowerCase())

  return parts.length >= 2 ? [lower, ...parts] : [lower]
}

// Symbol-lookup queries: namespace-qualified, leading-underscore, or containing
// uppercase/underscore. Plain lowercase words (e.g. "session") are NL, not symbols.
export const SYMBOL_QUERY_RE = /^(?:[A-Z_a-z]\w*(?:(?:::|\\|->|\.)[A-Z_a-z]\w*)+|_\w*|[A-Za-z][A-Za-z0-9]*[A-Z_]\w*|[A-Z][A-Za-z0-9]*)$/

// CamelCase/camelCase identifiers embedded in a NL query; excludes plain words and pure acronyms.
export const EMBEDDED_SYMBOL_RE = /\b(?:[A-Z][a-z][a-zA-Z0-9]*[A-Z][a-zA-Z0-9]*|[a-z][a-zA-Z0-9]*[A-Z][a-zA-Z0-9]+)\b/g

// Minimum stem length for prefix-based non-candidate scan (avoids over-broad matches).
export const EMBEDDED_STEM_MIN_LEN = 4

// Half-strength: the symbol may be incidental to the NL query.
export const EMBEDDED_SYMBOL_BOOST_SCALE = 0.5

// Case-sensitive: IGNORECASE produces false positives like "Module" in Python docs
// or "Class" method calls in Ruby.
export const DEFINITION_KEYWORDS = [
  'class',
  'module',
  'defmodule', // Elixir
  'def',
  'interface',
  'struct',
  'enum',
  'trait',
  'type',
  'func',
  'function',
  'object',
  'abstract class',
  'data class',
  'fn',
  'fun', // Kotlin
  'package',
  'namespace',
  'protocol', // Swift
  'record', // C# 9+, Java 16+
  'typedef', // C/C++/Dart
] as const

// SQL DDL is conventionally all-caps or all-lowercase; match both via IGNORECASE.
export const SQL_DEFINITION_KEYWORDS = [
  'CREATE TABLE',
  'CREATE VIEW',
  'CREATE PROCEDURE',
  'CREATE FUNCTION',
] as const

// Additive boost multiplier for chunks that define a queried symbol.
export const DEFINITION_BOOST_MULTIPLIER = 3.0

// Additive boost multiplier for NL queries when file stems match query words.
export const STEM_BOOST_MULTIPLIER = 1.0

// Fraction of max_score added to each file's top chunk, scaled by its aggregate candidate score.
export const FILE_COHERENCE_BOOST_FRAC = 0.2

// Common English stopwords excluded from file-stem matching for NL queries.
export const STOPWORDS: ReadonlySet<string> = new Set(
  ('a an and are as at be by do does for from has have how if in is it not of on or the to was'
    + ' what when where which who why with').split(' '),
)

/** Escape every regex metacharacter in *s* so it can be embedded literally in a RegExp source. */
function escapeRegex(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
}

/**
 * Find the max numeric value in an iterable without spreading (avoids argument-count limits).
 *
 * Returns `Number.NEGATIVE_INFINITY` for an empty iterable.
 */
function maxValue(values: Iterable<number>): number {
  let m = Number.NEGATIVE_INFINITY
  for (const v of values) {
    if (v > m)
      m = v
  }
  return m
}

// Shared regex source fragments for the definition patterns built in `_definitionPattern`.
// The prefix anchors the keyword at a line start or directly after whitespace and opens the
// keyword alternation group, which the pattern suffix closes.
const KEYWORD_PREFIX = '(?:^|(?<=\\s))(?:'
const DEFINITION_KEYWORD_BODY = DEFINITION_KEYWORDS.map(escapeRegex).join('|')
const SQL_KEYWORD_BODY = SQL_DEFINITION_KEYWORDS.map(escapeRegex).join('|')

/** Return true if the (trimmed) query looks like a bare symbol or namespace-qualified identifier. */
export function isSymbolQuery(query: string): boolean {
  return SYMBOL_QUERY_RE.test(query.trim())
}

/**
 * Apply query-type boosts to candidate scores.
 *
 * Symbol queries get definition boosts; every other query gets file-stem boosts plus
 * half-strength boosts for embedded CamelCase symbols. The boost unit is derived from the
 * maximum score in *combinedScores*.
 *
 * @param combinedScores Candidate scores; never mutated.
 * @param query Raw user query.
 * @param allChunks Every known chunk; chunks absent from *combinedScores* may be added to the result.
 * @returns A new Map holding the boosted scores.
 */
export function applyQueryBoost(
  combinedScores: Map<Chunk, number>,
  query: string,
  allChunks: Chunk[],
): Map<Chunk, number> {
  if (combinedScores.size === 0) {
    // Always return a fresh Map to honor the non-mutating contract; do not alias caller state.
    return new Map<Chunk, number>()
  }

  // Computed before any boost is applied so every boost is relative to the original scores.
  const maxScore = maxValue(combinedScores.values())
  const boosted = new Map<Chunk, number>(combinedScores)

  if (isSymbolQuery(query)) {
    _boostSymbolDefinitions(boosted, query, maxScore, allChunks)
  }
  else {
    _boostStemMatches(boosted, query, maxScore)
    _boostEmbeddedSymbols(boosted, query, maxScore, allChunks)
  }

  return boosted
}

/**
 * Promote files with multiple high-scoring chunks by boosting their top chunk (in-place).
 *
 * Each file's best-scoring chunk receives
 * `maxScore * FILE_COHERENCE_BOOST_FRAC * fileSum / maxFileSum`, where `fileSum` is the sum of
 * that file's chunk scores. The map is left untouched when it is empty, when the maximum
 * score is exactly zero, or when no file has a positive score sum.
 */
export function boostMultiChunkFiles(scores: Map<Chunk, number>): void {
  if (scores.size === 0) {
    return
  }

  const maxScore = maxValue(scores.values())
  if (maxScore === 0.0) {
    return
  }

  const fileSum = new Map<string, number>()
  const bestChunk = new Map<string, Chunk>()
  for (const [chunk, score] of scores) {
    const filePath = chunk.filePath
    fileSum.set(filePath, (fileSum.get(filePath) ?? 0.0) + score)
    const existingBest = bestChunk.get(filePath)
    // Strict `>` keeps the first chunk seen when two chunks of a file tie.
    if (existingBest === undefined || score > (scores.get(existingBest) ?? -Infinity)) {
      bestChunk.set(filePath, chunk)
    }
  }

  const maxFileSum = maxValue(fileSum.values())
  // Guard against zero/negative maxFileSum to avoid NaN / Infinity from the division below
  // (e.g. when positive and negative chunk scores cancel out within every file).
  if (maxFileSum <= 0) {
    return
  }
  const boostUnit = maxScore * FILE_COHERENCE_BOOST_FRAC
  for (const [filePath, chunk] of bestChunk) {
    const sum = fileSum.get(filePath) ?? 0.0
    scores.set(chunk, (scores.get(chunk) ?? 0.0) + boostUnit * sum / maxFileSum)
  }
}

/**
 * Extract the final identifier from a possibly namespace-qualified query.
 *
 * The query is trimmed first, so surrounding whitespace never leaks into the returned name
 * (`isSymbolQuery` trims too, so padded queries reach this function). Separators are tried
 * in the order `::`, `\`, `->`, `.`; the first one present decides the split, taken at its
 * last occurrence.
 *
 * Examples: "Sinatra::Base" → "Base", " Sinatra::Base " → "Base", "Client" → "Client".
 */
export function _extractSymbolName(query: string): string {
  const trimmed = query.trim()
  for (const separator of ['::', '\\', '->', '.']) {
    const idx = trimmed.lastIndexOf(separator)
    if (idx !== -1) {
      return trimmed.slice(idx + separator.length)
    }
  }
  return trimmed
}

// Bounded cache of compiled definition patterns, keyed by symbol name. Eviction is plain FIFO
// (oldest insertion first); cache hits do not refresh an entry's position.
const DEFINITION_PATTERN_CACHE_MAX = 256
const _definitionPatternCache = new Map<string, [RegExp, RegExp]>()

/**
 * Build (or fetch from cache) the regex pair that detects a definition of *symbolName*.
 *
 * @returns `[general, sql]`: `general` is case-sensitive and uses DEFINITION_KEYWORDS, `sql`
 * is case-insensitive and uses SQL_DEFINITION_KEYWORDS. Both are multiline and allow an
 * optional `.`/`::`-separated namespace prefix before the symbol. Neither uses the `g` flag,
 * so `.test()` carries no state between calls.
 */
export function _definitionPattern(symbolName: string): [RegExp, RegExp] {
  const cached = _definitionPatternCache.get(symbolName)
  if (cached !== undefined) {
    return cached
  }

  const escaped = escapeRegex(symbolName)
  const nsPrefix = '(?:[A-Z_a-z]\\w*(?:\\.|::))*'
  // The leading `)` closes the keyword alternation opened by KEYWORD_PREFIX.
  const suffix = `)\\s+${nsPrefix}${escaped}(?:\\s|[<({:\\[;]|$)`
  const general = new RegExp(KEYWORD_PREFIX + DEFINITION_KEYWORD_BODY + suffix, 'm')
  const sql = new RegExp(KEYWORD_PREFIX + SQL_KEYWORD_BODY + suffix, 'im')
  const entry: [RegExp, RegExp] = [general, sql]

  if (_definitionPatternCache.size >= DEFINITION_PATTERN_CACHE_MAX) {
    // FIFO eviction: Map iterates in insertion order, so the first key is the oldest entry.
    const firstKey = _definitionPatternCache.keys().next().value
    if (firstKey !== undefined) {
      _definitionPatternCache.delete(firstKey)
    }
  }
  _definitionPatternCache.set(symbolName, entry)
  return entry
}

/**
 * Return true if the chunk contains a definition of *symbolName*.
 *
 * Case-sensitive for general keywords, case-insensitive for SQL DDL.
 * Also matches namespace-qualified forms (e.g. `defmodule Phoenix.Router` for `Router`).
 */
export function _chunkDefinesSymbol(chunk: Chunk, symbolName: string): boolean {
  const [general, sql] = _definitionPattern(symbolName)
  return general.test(chunk.content) || sql.test(chunk.content)
}

// Mirror Python's `str.rstrip("s")`: strip all trailing 's' characters.
function stripTrailingS(s: string): string {
  return s.endsWith('s') ? s.replace(/s+$/, '') : s
}

/** Return true if *stem* matches *name* (exact, snake_case-normalised, or plural). */
export function _stemMatches(stem: string, name: string): boolean {
  const stemNorm = stem.replace(/_/g, '')
  return stem === name
    || stemNorm === name
    || stripTrailingS(stem) === name
    || stripTrailingS(stemNorm) === name
}

/** Return the file name without its last extension, preserving the original casing. */
function pathStemOriginal(filePath: string): string {
  // Match Python's pathlib.Path.stem: filename without suffix; handles both / and \.
  // Path.stem leaves leading-dot files untouched (".gitignore" → ".gitignore").
  const sepIdx = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'))
  const base = sepIdx === -1 ? filePath : filePath.slice(sepIdx + 1)
  const dotIdx = base.lastIndexOf('.')
  return dotIdx <= 0 ? base : base.slice(0, dotIdx)
}

/** Lowercased variant of `pathStemOriginal`. */
function pathStemLower(filePath: string): string {
  return pathStemOriginal(filePath).toLowerCase()
}

/** Return the name of the immediate parent directory, or '' when the path has no separator. */
function pathParentName(filePath: string): string {
  // Strip trailing separators, then take the segment before the basename.
  const cleaned = filePath.replace(/[/\\]+$/, '')
  const sepIdx = Math.max(cleaned.lastIndexOf('/'), cleaned.lastIndexOf('\\'))
  if (sepIdx === -1)
    return ''
  const parent = cleaned.slice(0, sepIdx)
  const parentSepIdx = Math.max(parent.lastIndexOf('/'), parent.lastIndexOf('\\'))
  return parentSepIdx === -1 ? parent : parent.slice(parentSepIdx + 1)
}

/**
 * Return the boost amount for a chunk that defines one of *names* (0.0 if none match).
 *
 * A defining chunk earns `boostUnit * 1.5` when its lowercased file stem also matches one of
 * the names, and `boostUnit * 1.0` otherwise.
 */
export function _definitionTier(chunk: Chunk, names: Set<string>, boostUnit: number): number {
  let matches = false
  for (const name of names) {
    if (_chunkDefinesSymbol(chunk, name)) {
      matches = true
      break
    }
  }
  if (!matches)
    return 0.0
  const stem = pathStemLower(chunk.filePath)
  for (const name of names) {
    if (_stemMatches(stem, name.toLowerCase())) {
      return boostUnit * 1.5
    }
  }
  return boostUnit * 1.0
}

/**
 * Boost non-candidate chunks whose lowercased file stem satisfies stemOk (in-place).
 *
 * Chunks already present in *boosted* are skipped; a matching chunk is inserted with its
 * definition tier as its score.
 */
export function _scanNonCandidates(
  boosted: Map<Chunk, number>,
  names: Set<string>,
  boostUnit: number,
  allChunks: Chunk[],
  stemOk: (stem: string) => boolean,
): void {
  for (const chunk of allChunks) {
    if (boosted.has(chunk))
      continue
    if (!stemOk(pathStemLower(chunk.filePath)))
      continue
    const tier = _definitionTier(chunk, names, boostUnit)
    if (tier !== 0.0) {
      boosted.set(chunk, tier)
    }
  }
}

/** Boost chunks that define the queried symbol, scanning candidates and stem-matched non-candidates (in-place). */
export function _boostSymbolDefinitions(
  boosted: Map<Chunk, number>,
  query: string,
  maxScore: number,
  allChunks: Chunk[],
): void {
  const symbolName = _extractSymbolName(query)
  const names = new Set<string>([symbolName])
  // For namespace-qualified queries also try the full qualified name.
  const trimmed = query.trim()
  if (symbolName !== trimmed) {
    names.add(trimmed)
  }

  const boostUnit = maxScore * DEFINITION_BOOST_MULTIPLIER

  // Iterate keys() directly: we only update existing entries, never add/delete during iteration.
  for (const chunk of boosted.keys()) {
    const tier = _definitionTier(chunk, names, boostUnit)
    if (tier !== 0.0) {
      boosted.set(chunk, (boosted.get(chunk) ?? 0.0) + tier)
    }
  }

  const symbolLower = symbolName.toLowerCase()
  _scanNonCandidates(
    boosted,
    names,
    boostUnit,
    allChunks,
    stem => _stemMatches(stem, symbolLower),
  )
}

/**
 * Boost chunks defining CamelCase/camelCase symbols embedded in NL queries (in-place).
 *
 * Half-strength vs pure symbol queries. Non-candidate scan uses stem-prefix match
 * so e.g. `state.ts` is found for symbol `StateManager`.
 */
export function _boostEmbeddedSymbols(
  boosted: Map<Chunk, number>,
  query: string,
  maxScore: number,
  allChunks: Chunk[],
): void {
  const names = new Set<string>(query.match(EMBEDDED_SYMBOL_RE) ?? [])
  if (names.size === 0)
    return

  const boostUnit = maxScore * DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE

  // Iterate keys() directly: we only update existing entries, never add/delete during iteration.
  for (const chunk of boosted.keys()) {
    const tier = _definitionTier(chunk, names, boostUnit)
    if (tier !== 0.0) {
      boosted.set(chunk, (boosted.get(chunk) ?? 0.0) + tier)
    }
  }

  const symbolsLower: string[] = Array.from(names, s => s.toLowerCase())
  for (const chunk of allChunks) {
    if (boosted.has(chunk))
      continue
    const stem = pathStemLower(chunk.filePath)
    const stemNorm = stem.replace(/_/g, '')
    let matches = false
    for (const symbolLower of symbolsLower) {
      // Exact stem match, or the stem (at least EMBEDDED_STEM_MIN_LEN chars) is a prefix of the symbol.
      if (
        stem === symbolLower
        || stemNorm === symbolLower
        || (stem.length >= EMBEDDED_STEM_MIN_LEN && symbolLower.startsWith(stem))
        || (stemNorm.length >= EMBEDDED_STEM_MIN_LEN && symbolLower.startsWith(stemNorm))
      ) {
        matches = true
        break
      }
    }
    if (!matches)
      continue
    const tier = _definitionTier(chunk, names, boostUnit)
    if (tier !== 0.0) {
      boosted.set(chunk, tier)
    }
  }
}

/**
 * Count query keywords that match path parts, allowing prefix overlap (min 3 chars).
 *
 * A keyword counts at most once: either it is present in *parts* verbatim, or the shorter of
 * keyword/part has at least 3 characters and is a prefix of the longer one.
 */
export function _countKeywordMatches(keywords: Set<string>, parts: Set<string>): number {
  let exactCount = 0
  const exact = new Set<string>()
  for (const k of keywords) {
    if (parts.has(k)) {
      exact.add(k)
      exactCount++
    }
  }
  // Fast path: every keyword matched exactly, so no prefix scan is needed.
  if (exactCount === keywords.size) {
    return exactCount
  }
  let nMatches = exactCount
  for (const keyword of keywords) {
    if (exact.has(keyword))
      continue
    for (const part of parts) {
      // Avoid array allocation + destructuring on every iteration; pick shorter/longer directly.
      const shorter = keyword.length <= part.length ? keyword : part
      const longer = keyword.length <= part.length ? part : keyword
      if (shorter.length >= 3 && longer.startsWith(shorter)) {
        nMatches++
        break
      }
    }
  }
  return nMatches
}

const QUERY_WORD_RE = /[A-Z_a-z]\w*/g

/**
 * Boost chunks whose file paths match NL query keywords (in-place).
 *
 * Uses prefix matching for morphological variants (e.g. "dependency" matches
 * "dependencies"). Matches file stems and the immediate parent directory name.
 * Keywords are the lowercased query words longer than two characters that are not stopwords;
 * the boost added is `maxScore * STEM_BOOST_MULTIPLIER * (matched keywords / total keywords)`.
 */
export function _boostStemMatches(
  boosted: Map<Chunk, number>,
  query: string,
  maxScore: number,
): void {
  const keywords = new Set<string>()
  for (const word of query.match(QUERY_WORD_RE) ?? []) {
    if (word.length > 2) {
      const lower = word.toLowerCase()
      if (!STOPWORDS.has(lower)) {
        keywords.add(lower)
      }
    }
  }
  if (keywords.size === 0)
    return

  const boost = maxScore * STEM_BOOST_MULTIPLIER
  // Path parts are computed once per file path, since many chunks share a file.
  const pathCache = new Map<string, Set<string>>()
  // Iterate keys() directly: we only update existing entries, never add/delete during iteration.
  for (const chunk of boosted.keys()) {
    let parts = pathCache.get(chunk.filePath)
    if (parts === undefined) {
      // Use original-case stem so splitIdentifier sees camelCase boundaries.
      parts = new Set<string>(splitIdentifier(pathStemOriginal(chunk.filePath)))
      const parentName = pathParentName(chunk.filePath)
      if (parentName !== '' && parentName !== '.' && parentName !== '/' && parentName !== '..') {
        for (const p of splitIdentifier(parentName)) {
          parts.add(p)
        }
      }
      pathCache.set(chunk.filePath, parts)
    }
    const nMatches = _countKeywordMatches(keywords, parts)
    if (nMatches > 0) {
      const matchRatio = nMatches / keywords.size
      if (matchRatio >= 0.10) {
        boosted.set(chunk, (boosted.get(chunk) ?? 0.0) + boost * matchRatio)
      }
    }
  }
}
/**
 * Resolve the blending weight applied to semantic scores.
 *
 * An explicitly supplied *alpha* (including 0) is returned as-is. When *alpha* is `null` or
 * `undefined`, the weight is chosen from the query type: ALPHA_SYMBOL for symbol queries,
 * ALPHA_NL for everything else.
 */
export function resolveAlpha(query: string, alpha: number | null | undefined): number {
  return alpha ?? (isSymbolQuery(query) ? ALPHA_SYMBOL : ALPHA_NL)
}