From e0b5dfbf9b90e76d3187deee4816ad2fa771f8b7 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:15:25 +0900 Subject: [PATCH 1/2] feat(tokens): port identifier-aware tokenizer from semble Ports src/semble/tokens.py to src/tokens.ts: - splitIdentifier(token): lowered original + camelCase/snake_case sub-tokens. Returns only [lower] when fewer than two sub-tokens exist, matching the Python behaviour. - tokenize(text): walks identifier matches with TOKEN_RE and expands each via splitIdentifier for BM25 indexing. Regexes are ported verbatim (TOKEN_RE, CAMEL_RE) with the /g flag so matchAll works. The snake_case branch filters empty parts to match Python's '[p for p in lower.split("_") if p]'. Tests cover the canonical semble examples plus edge cases verified against the Python implementation (consecutive underscores, leading underscore, digit runs). --- src/tokens.test.ts | 95 ++++++++++++++++++++++++++++++++++++++++++++++ src/tokens.ts | 51 +++++++++++++++++++++++++ 2 files changed, 146 insertions(+) create mode 100644 src/tokens.test.ts create mode 100644 src/tokens.ts diff --git a/src/tokens.test.ts b/src/tokens.test.ts new file mode 100644 index 0000000..4dfa7de --- /dev/null +++ b/src/tokens.test.ts @@ -0,0 +1,95 @@ +// Port of src/semble/tokens.py tests + +import { describe, expect, it } from 'bun:test' +import { splitIdentifier, tokenize } from './tokens.ts' + +describe('splitIdentifier', () => { + it('splits PascalCase identifiers', () => { + expect(splitIdentifier('HandlerStack')).toEqual([ + 'handlerstack', + 'handler', + 'stack', + ]) + }) + + it('preserves runs of capitals as a single sub-token', () => { + expect(splitIdentifier('getHTTPResponse')).toEqual([ + 'gethttpresponse', + 'get', + 'http', + 'response', + ]) + }) + + it('handles leading run of capitals', () => { + expect(splitIdentifier('XMLParser')).toEqual([ + 'xmlparser', + 'xml', + 'parser', + ]) + }) + + it('splits snake_case identifiers', () => { + expect(splitIdentifier('my_func')).toEqual(['my_func', 'my', 'func']) + }) + + it('returns only the lowered token when there is no boundary', () => { + expect(splitIdentifier('simple')).toEqual(['simple']) + }) + + it('lowercases an already lower-case token', () => { + expect(splitIdentifier('Already')).toEqual(['already']) + }) + + it('keeps consecutive underscores from collapsing into duplicate parts', () => { + // Python `split('_')` produces empty strings between consecutive + // underscores; the upstream filter drops them. + expect(splitIdentifier('foo__bar')).toEqual(['foo__bar', 'foo', 'bar']) + }) + + it('treats a leading underscore as snake_case with one effective part', () => { + // `_foo`.split('_') === ['', 'foo'] -> filtered to ['foo'] -> len < 2 + expect(splitIdentifier('_foo')).toEqual(['_foo']) + }) + + it('splits digit runs as their own camel sub-token', () => { + expect(splitIdentifier('abc123Def')).toEqual([ + 'abc123def', + 'abc', + '123', + 'def', + ]) + }) +}) + +describe('tokenize', () => { + it('splits plain space-separated words', () => { + expect(tokenize('foo bar baz')).toEqual(['foo', 'bar', 'baz']) + }) + + it('expands compound identifiers and drops non-identifier digits', () => { + // Numbers that do not start an identifier (e.g. "123") are not matched by + // TOKEN_RE, which mirrors the upstream Python behaviour. + expect(tokenize('camelCase_snake_case 123')).toEqual([ + 'camelcase_snake_case', + 'camelcase', + 'snake', + 'case', + ]) + }) + + it('returns an empty array for input with no identifiers', () => { + expect(tokenize(' !!! 123 ???')).toEqual([]) + }) + + it('preserves multiple identifiers and expands each', () => { + expect(tokenize('HandlerStack my_func')).toEqual([ + 'handlerstack', + 'handler', + 'stack', + 'my_func', + 'my', + 'func', + ]) + }) +}) diff --git a/src/tokens.ts b/src/tokens.ts new file mode 100644 index 0000000..a5b9e90 --- /dev/null +++ b/src/tokens.ts @@ -0,0 +1,51 @@ +// Port of src/semble/tokens.py + +const TOKEN_RE = /[a-zA-Z_][a-zA-Z0-9_]*/g + +// Split on camelCase/PascalCase boundaries: +// "HandlerStack" -> ["Handler", "Stack"] +// "getHTTPResponse" -> ["get", "HTTP", "Response"] +// "XMLParser" -> ["XML", "Parser"] +const CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+/g + +/** + * Split a single identifier into sub-tokens via camelCase/snake_case. + * + * Returns the original token (lowered) plus any sub-tokens. + * E.g. "HandlerStack" -> ["handlerstack", "handler", "stack"] + * "my_func" -> ["my_func", "my", "func"] + * "simple" -> ["simple"] + */ +export function splitIdentifier(token: string): string[] { + const lower = token.toLowerCase() + let parts: string[] + + if (token.includes('_')) { + // snake_case splitting + parts = lower.split('_').filter(p => p.length > 0) + } + else { + // camelCase / PascalCase splitting + parts = Array.from(token.matchAll(CAMEL_RE), ([m]) => m.toLowerCase()) + } + + if (parts.length >= 2) { + return [lower, ...parts] + } + return [lower] +} + +/** + * Split text into lowercase identifier-like tokens for BM25 indexing. + * + * Compound identifiers (camelCase, PascalCase, snake_case) are expanded + * into sub-tokens so that partial matches work. The original compound + * token is preserved for exact-match boosting. + */ +export function tokenize(text: string): string[] { + const result: string[] = [] + for (const [match] of text.matchAll(TOKEN_RE)) { + result.push(...splitIdentifier(match)) + } + return result +} From 67e3607e882fab6178c64e947fd39ef0d6770f9c Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:42:16 +0900 Subject: [PATCH 2/2] review(tokens): apply gemini-code-assist feedback Add fast-path in splitIdentifier for pure-lowercase tokens to skip unnecessary regex execution and array allocation. Behavior is identical to semble (TOKEN_RE only matches [a-zA-Z0-9_], so absence of _, A-Z, 0-9 means the token cannot split further). --- src/tokens.ts | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/tokens.ts b/src/tokens.ts index a5b9e90..598b60f 100644 --- a/src/tokens.ts +++ b/src/tokens.ts @@ -18,6 +18,14 @@ const CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+/g */ export function splitIdentifier(token: string): string[] { const lower = token.toLowerCase() + + // Fast-path: pure-lowercase tokens with no underscores/digits cannot split + // further. TOKEN_RE only matches [a-zA-Z0-9_], so the absence of `_`, + // uppercase, and digits means the token is already a single sub-token. + if (!token.includes('_') && !/[A-Z0-9]/.test(token)) { + return [lower] + } + let parts: string[] if (token.includes('_')) {