Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
296 changes: 296 additions & 0 deletions src/ranking/boosting.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
import { describe, expect, test } from 'bun:test'
import {
_chunkDefinesSymbol,
_countKeywordMatches,
_extractSymbolName,
_stemMatches,
applyQueryBoost,
boostMultiChunkFiles,
type Chunk,
DEFINITION_BOOST_MULTIPLIER,
EMBEDDED_SYMBOL_BOOST_SCALE,
FILE_COHERENCE_BOOST_FRAC,
isSymbolQuery,
} from './boosting.ts'

/**
 * Test helper that assembles a `Chunk` from its content, file path and line range.
 *
 * @param content - Text of the chunk.
 * @param filePath - Path of the file the chunk is attributed to.
 * @param startLine - First line of the chunk; defaults to 1.
 * @param endLine - Last line of the chunk; defaults to 10.
 * @returns An object holding exactly the four values above.
 */
function mkChunk(content: string, filePath: string, startLine = 1, endLine = 10): Chunk {
  return {
    content: content,
    filePath: filePath,
    startLine: startLine,
    endLine: endLine,
  }
}

/**
 * isSymbolQuery: identifier-shaped queries (PascalCase, namespace-qualified,
 * leading underscore, snake_case) are expected to classify as symbol queries,
 * while plain lowercase words and multi-word phrases are expected to classify
 * as natural language.
 */
describe('isSymbolQuery', () => {
  // Checks every query in `queries` against the same expected classification, in order.
  const expectClassification = (queries: readonly string[], expected: boolean): void => {
    for (const query of queries) {
      expect(isSymbolQuery(query)).toBe(expected)
    }
  }

  test('PascalCase identifiers are symbol queries', () => {
    expectClassification(['HandlerStack', 'Client'], true)
  })

  test('namespace-qualified identifiers are symbol queries', () => {
    // One sample per separator: `::`, `.`, `->` and backslash.
    expectClassification(['Sinatra::Base', 'Phoenix.Router', 'foo->bar', 'A\\B\\C'], true)
  })

  test('leading-underscore identifiers are symbol queries', () => {
    // A lone underscore counts as well.
    expectClassification(['_private', '_'], true)
  })

  test('snake_case identifiers are symbol queries', () => {
    expectClassification(['my_func'], true)
  })

  test('plain lowercase words are NL', () => {
    expectClassification(['session', 'foo'], false)
  })

  test('NL phrases are NL', () => {
    expectClassification(['how does this work', 'find the cache layer'], false)
  })

  test('trims whitespace', () => {
    // Surrounding spaces must not change the classification.
    expectClassification([' HandlerStack '], true)
  })
})

/**
 * _extractSymbolName: the trailing segment of a qualified name is returned for
 * the `::`, `.` and `->` separators; an unqualified name comes back trimmed.
 */
describe('_extractSymbolName', () => {
  // Shorthand: start an expectation on the name extracted from `query`.
  const extracted = (query: string) => expect(_extractSymbolName(query))

  test('extracts trailing name after :: separator', () => {
    extracted('Sinatra::Base').toBe('Base')
  })

  test('extracts trailing name after .', () => {
    extracted('Phoenix.Router').toBe('Router')
  })

  test('extracts trailing name after ->', () => {
    extracted('foo->bar').toBe('bar')
  })

  test('returns the original (trimmed) when no separator', () => {
    // Same expected result with and without surrounding whitespace.
    for (const query of ['Client', ' Client ']) {
      extracted(query).toBe('Client')
    }
  })
})

/**
 * _stemMatches: a stem is expected to match a target name exactly, after
 * underscores are removed, or after a trailing plural "s" is removed.
 */
describe('_stemMatches', () => {
  type StemCase = readonly [stem: string, target: string]

  // Asserts the same outcome for every (stem, target) pair, in order.
  const expectEach = (cases: readonly StemCase[], expected: boolean): void => {
    for (const [stem, target] of cases) {
      expect(_stemMatches(stem, target)).toBe(expected)
    }
  }

  test('exact match', () => {
    expectEach([['client', 'client']], true)
  })

  test('snake-stripped match', () => {
    expectEach([['handler_stack', 'handlerstack']], true)
  })

  test('plural-stripped match', () => {
    // The second pair combines underscore and plural stripping.
    expectEach(
      [
        ['clients', 'client'],
        ['handler_stacks', 'handlerstack'],
      ],
      true,
    )
  })

  test('no match', () => {
    expectEach([['foo', 'bar']], false)
  })
})

/**
 * _chunkDefinesSymbol: a chunk counts as defining a symbol when a definition
 * keyword (class, def, defmodule, SQL `create table`, ...) introduces that name.
 */
describe('_chunkDefinesSymbol', () => {
  // Wraps `content` in a chunk attributed to `filePath` and asks whether it defines `symbol`.
  const defines = (content: string, filePath: string, symbol: string): boolean =>
    _chunkDefinesSymbol(mkChunk(content, filePath), symbol)

  test('matches class definition', () => {
    expect(defines('class HandlerStack:\n pass\n', 'a.py', 'HandlerStack')).toBe(true)
  })

  test('matches def function', () => {
    expect(defines('def my_func(x):\n return x\n', 'a.py', 'my_func')).toBe(true)
  })

  test('matches namespace-qualified defmodule for trailing name', () => {
    // Only the last segment ("Router") of the qualified module name is queried.
    expect(defines('defmodule Phoenix.Router do\nend\n', 'a.ex', 'Router')).toBe(true)
  })

  test('case-sensitive: does not match "Module" as keyword', () => {
    // A capitalised "Module" must not be accepted as a definition keyword.
    expect(defines('Module Foo', 'a.txt', 'Foo')).toBe(false)
  })

  test('case-insensitive for SQL DDL', () => {
    // Lower-case first, then upper-case DDL; both must be recognised.
    const lowerCaseDdl = 'create table users (id int);'
    const upperCaseDdl = 'CREATE TABLE users (id int);'
    expect(defines(lowerCaseDdl, 'a.sql', 'users')).toBe(true)
    expect(defines(upperCaseDdl, 'a.sql', 'users')).toBe(true)
  })

  test('does not match in the middle of a word', () => {
    // "subclass" ends with "class" but is not the keyword itself.
    expect(defines('# subclass Foo\n', 'a.py', 'Foo')).toBe(false)
  })
})

/**
 * _countKeywordMatches: counts keywords that match a part either exactly or by
 * prefix overlap, where the shorter side of a prefix match needs at least 3 characters.
 */
describe('_countKeywordMatches', () => {
  // Builds the two sets from plain arrays and returns the reported match count.
  const countMatches = (keywords: readonly string[], parts: readonly string[]): number =>
    _countKeywordMatches(new Set(keywords), new Set(parts))

  test('all exact matches', () => {
    expect(countMatches(['foo', 'bar'], ['foo', 'bar', 'baz'])).toBe(2)
  })

  test('prefix overlap (min 3 chars)', () => {
    // Keyword "dep" is a prefix of the longer part "dependency".
    expect(countMatches(['dep'], ['dependency'])).toBe(1)
    // Both sides are at least 3 chars and the longer one starts with the shorter one.
    expect(countMatches(['depend'], ['dependencies'])).toBe(1)
    // Roles reversed: here the part is the shorter side.
    expect(countMatches(['dependency'], ['dep'])).toBe(1)
  })

  test('skips < 3 chars', () => {
    // "de" is too short to count as a prefix match.
    expect(countMatches(['de'], ['dependency'])).toBe(0)
  })
})

/**
 * boostMultiChunkFiles: mutates the score map in place, adding a file-coherence
 * boost to the top-scoring chunk of each file, and leaves the map untouched in
 * the degenerate cases covered below.
 */
describe('boostMultiChunkFiles', () => {
  test('top chunk receives boost_unit * fileSum / maxFileSum', () => {
    const top = mkChunk('x', 'a.ts', 1, 10)
    const middle = mkChunk('y', 'a.ts', 11, 20)
    const bottom = mkChunk('z', 'a.ts', 21, 30)
    const loner = mkChunk('q', 'b.ts')

    const scores = new Map<Chunk, number>()
    scores.set(top, 0.5)
    scores.set(middle, 0.4)
    scores.set(bottom, 0.3)
    scores.set(loner, 0.2)

    boostMultiChunkFiles(scores)

    // File sums: a.ts = 0.5 + 0.4 + 0.3 = 1.2 (the maximum), b.ts = 0.2.
    // Max score is 0.5, so the boost unit is 0.5 * 0.2 = 0.1.
    //   top   -> 0.5 + 0.1 * 1.2 / 1.2 = 0.6
    //   loner -> 0.2 + 0.1 * 0.2 / 1.2 ≈ 0.21666...
    // Chunks that are not the top of their file keep their score.
    const boostUnit = 0.1
    const maxFileSum = 1.2
    expect(scores.get(top)).toBeCloseTo(0.6, 10)
    expect(scores.get(middle)).toBe(0.4)
    expect(scores.get(bottom)).toBe(0.3)
    expect(scores.get(loner)).toBeCloseTo(0.2 + boostUnit * 0.2 / maxFileSum, 10)
  })

  test('no-op on empty map', () => {
    const noScores = new Map<Chunk, number>()
    boostMultiChunkFiles(noScores)
    expect(noScores.size).toBe(0)
  })

  test('no-op when max score is zero', () => {
    const only = mkChunk('x', 'a.ts')
    const scores = new Map<Chunk, number>()
    scores.set(only, 0)
    boostMultiChunkFiles(scores)
    expect(scores.get(only)).toBe(0)
  })

  test('no NaN/Infinity when fileSums cancel to zero', () => {
    // The two scores in a.ts cancel out, so the largest file sum is 0. Without a
    // guard the boost formula would divide by zero and corrupt the score map.
    const positive = mkChunk('x', 'a.ts', 1, 10)
    const negative = mkChunk('y', 'a.ts', 11, 20)
    const scores = new Map<Chunk, number>()
    scores.set(positive, 1.0)
    scores.set(negative, -1.0)

    boostMultiChunkFiles(scores)

    const after = [scores.get(positive), scores.get(negative)] as const
    for (const value of after) {
      expect(Number.isFinite(value ?? Number.NaN)).toBe(true)
    }
    // With maxFileSum <= 0 the scores are expected to be left exactly as they were.
    expect(after[0]).toBe(1.0)
    expect(after[1]).toBe(-1.0)
  })

  test('uses FILE_COHERENCE_BOOST_FRAC = 0.2', () => {
    // One chunk in one file: fileSum equals maxFileSum, so the boost is the full boost unit.
    const only = mkChunk('x', 'a.ts')
    const scores = new Map<Chunk, number>()
    scores.set(only, 1.0)
    boostMultiChunkFiles(scores)
    expect(scores.get(only)).toBeCloseTo(1.0 + 1.0 * FILE_COHERENCE_BOOST_FRAC, 10)
  })
})

/**
 * applyQueryBoost: returns a new score map in which chunks relevant to the
 * query are boosted. Covers full symbol queries, symbols embedded in natural
 * language queries, file-path keyword matches, and non-aliasing of the input map.
 */
describe('applyQueryBoost', () => {
  test('symbol query with definition keyword boosts chunk by DEFINITION_BOOST_MULTIPLIER * maxScore (1.0× when stem does not match)', () => {
    // The file stem "other" does not match "handlerstack", so the 1.0× tier applies.
    const definition = mkChunk('class HandlerStack:\n pass\n', 'other.py')
    const unrelated = mkChunk('print("hi")', 'b.py')

    const scores = new Map<Chunk, number>()
    scores.set(definition, 0.5)
    scores.set(unrelated, 1.0)

    const boosted = applyQueryBoost(scores, 'HandlerStack', [definition, unrelated])

    // maxScore is 1.0, so the boost unit is 1.0 * DEFINITION_BOOST_MULTIPLIER and the
    // defining chunk gains exactly one unit; the unrelated chunk is unchanged.
    expect(boosted.get(definition)).toBeCloseTo(0.5 + 1.0 * DEFINITION_BOOST_MULTIPLIER, 10)
    expect(boosted.get(unrelated)).toBe(1.0)
  })

  test('symbol query with matching file stem gets 1.5× tier boost', () => {
    // "handler_stack" matches "handlerstack" once underscores are stripped.
    const definition = mkChunk('class HandlerStack:\n pass\n', 'handler_stack.py')
    const scores = new Map<Chunk, number>()
    scores.set(definition, 0.5)

    const boosted = applyQueryBoost(scores, 'HandlerStack', [definition])

    // Boost unit 0.5 * 3.0 = 1.5, tier boost 1.5 * 1.5 = 2.25, final score 0.5 + 2.25.
    const expectedScore = 2.75
    expect(boosted.get(definition)).toBeCloseTo(expectedScore, 10)
  })

  test('symbol query promotes non-candidate stem-matching chunks', () => {
    const scored = mkChunk('print("hi")', 'b.py')
    const unscored = mkChunk('class HandlerStack:\n pass\n', 'handler_stack.py')
    const scores = new Map<Chunk, number>()
    scores.set(scored, 1.0)

    const boosted = applyQueryBoost(scores, 'HandlerStack', [scored, unscored])

    // The chunk absent from the input scores shows up with boostUnit * 1.5 = 1.0 * 3.0 * 1.5.
    const expectedScore = 4.5
    expect(boosted.get(unscored)).toBeCloseTo(expectedScore, 10)
  })

  test('NL query with embedded PascalCase triggers half-strength embedded boost', () => {
    const definition = mkChunk('class StateManager:\n pass\n', 'state_manager.py')
    const scores = new Map<Chunk, number>()
    scores.set(definition, 1.0)
    const query = 'where does the StateManager initialize state'

    const boosted = applyQueryBoost(scores, query, [definition])

    // Embedded boost unit = 1.0 * DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE;
    // the stem-matching tier multiplies it by 1.5. A further stem-match boost from
    // `_boostStemMatches` may be added on top, so only a lower bound is asserted.
    const embeddedBoost = DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE * 1.5
    const actual = boosted.get(definition) ?? 0
    expect(actual).toBeGreaterThanOrEqual(1.0 + embeddedBoost - 1e-9)
  })

  test('returns a new map and does not mutate input', () => {
    const chunk = mkChunk('class Foo:\n pass\n', 'foo.py')
    const input = new Map<Chunk, number>()
    input.set(chunk, 1.0)

    const output = applyQueryBoost(input, 'Foo', [chunk])

    // The input keeps its score, the output is a different object and carries the boost.
    expect(input.get(chunk)).toBe(1.0)
    expect(output).not.toBe(input)
    expect(output.get(chunk)).toBeGreaterThan(1.0)
  })

  test('empty input returns a fresh map (no aliasing of caller state)', () => {
    const input = new Map<Chunk, number>()
    const output = applyQueryBoost(input, 'foo', [])
    expect(output.size).toBe(0)
    // Writing to the result must leave the caller's map empty.
    expect(output).not.toBe(input)
    output.set(mkChunk('x', 'a.ts'), 1)
    expect(input.size).toBe(0)
  })

  test('NL query boosts via stem matches when file path words match', () => {
    const chunk = mkChunk('print("hi")', 'cache_layer.py')
    const scores = new Map<Chunk, number>()
    scores.set(chunk, 1.0)

    const boosted = applyQueryBoost(scores, 'find the cache layer', [chunk])

    // Keywords after dropping the stopword "the": find, cache, layer.
    // Path parts of "cache_layer": cache_layer, cache, layer.
    // Two of three keywords match, so the boost is 1.0 * 1.0 * 2/3.
    const matchRatio = 2 / 3
    expect(boosted.get(chunk)).toBeCloseTo(1.0 + matchRatio, 10)
  })
})
Loading