Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions src/tokens.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
// Port of src/semble/tokens.py tests

import { describe, expect, it } from 'bun:test'
import { splitIdentifier, tokenize } from './tokens.ts'

// Unit tests for splitIdentifier. Every expectation lists the lowercased
// original token first, followed by its sub-tokens (only when the identifier
// splits into at least two parts).
describe('splitIdentifier', () => {
  it('splits PascalCase identifiers', () => {
    expect(splitIdentifier('HandlerStack')).toEqual([
      'handlerstack',
      'handler',
      'stack',
    ])
  })

  it('preserves runs of capitals as a single sub-token', () => {
    expect(splitIdentifier('getHTTPResponse')).toEqual([
      'gethttpresponse',
      'get',
      'http',
      'response',
    ])
  })

  it('handles leading run of capitals', () => {
    expect(splitIdentifier('XMLParser')).toEqual([
      'xmlparser',
      'xml',
      'parser',
    ])
  })

  it('splits snake_case identifiers', () => {
    expect(splitIdentifier('my_func')).toEqual(['my_func', 'my', 'func'])
  })

  it('lowercases the parts of a mixed-case snake_case identifier', () => {
    expect(splitIdentifier('My_Func')).toEqual(['my_func', 'my', 'func'])
  })

  it('returns only the lowered token when there is no boundary', () => {
    expect(splitIdentifier('simple')).toEqual(['simple'])
  })

  it('lowercases a capitalized single-word token', () => {
    // 'Already' contains an uppercase letter but only one camel part, so no
    // sub-tokens are emitted.
    expect(splitIdentifier('Already')).toEqual(['already'])
  })

  it('drops empty parts produced by consecutive underscores', () => {
    // Python `split('_')` produces empty strings between consecutive
    // underscores; the upstream filter drops them.
    expect(splitIdentifier('foo__bar')).toEqual(['foo__bar', 'foo', 'bar'])
  })

  it('treats a leading underscore as snake_case with one effective part', () => {
    // `_foo`.split('_') === ['', 'foo'] -> filtered to ['foo'] -> len < 2
    expect(splitIdentifier('_foo')).toEqual(['_foo'])
  })

  it('treats a trailing underscore as snake_case with one effective part', () => {
    // `foo_`.split('_') === ['foo', ''] -> filtered to ['foo'] -> len < 2
    expect(splitIdentifier('foo_')).toEqual(['foo_'])
  })

  it('returns only the token itself when it consists solely of underscores', () => {
    // Every part is empty, so nothing survives the filter.
    expect(splitIdentifier('__')).toEqual(['__'])
  })

  it('splits digit runs as their own camel sub-token', () => {
    expect(splitIdentifier('abc123Def')).toEqual([
      'abc123def',
      'abc',
      '123',
      'def',
    ])
  })

  it('splits a trailing digit run from an all-lowercase token', () => {
    // Digits bypass the lowercase fast path and form their own camel part.
    expect(splitIdentifier('abc123')).toEqual(['abc123', 'abc', '123'])
  })
})

// Unit tests for tokenize: identifier extraction from free text followed by
// per-identifier expansion through splitIdentifier.
describe('tokenize', () => {
  it('splits plain space-separated words', () => {
    const tokens = tokenize('foo bar baz')
    expect(tokens).toEqual(['foo', 'bar', 'baz'])
  })

  it('expands compound identifiers and drops non-identifier digits', () => {
    // A bare number such as "123" never begins with a letter or underscore,
    // so TOKEN_RE skips it; this mirrors the upstream Python behaviour.
    const tokens = tokenize('camelCase_snake_case 123')
    const expected = ['camelcase_snake_case', 'camelcase', 'snake', 'case']
    expect(tokens).toEqual(expected)
  })

  it('returns an empty array for input with no identifiers', () => {
    const tokens = tokenize(' !!! 123 ???')
    expect(tokens).toEqual([])
  })

  it('preserves multiple identifiers and expands each', () => {
    const fromPascal = ['handlerstack', 'handler', 'stack']
    const fromSnake = ['my_func', 'my', 'func']
    const tokens = tokenize('HandlerStack my_func')
    expect(tokens).toEqual([...fromPascal, ...fromSnake])
  })
})
59 changes: 59 additions & 0 deletions src/tokens.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Port of src/semble/tokens.py

// Identifier-like run: one ASCII letter or underscore followed by any number
// of ASCII letters, digits or underscores. A run that starts with a digit is
// not matched at that position. The `g` flag is required because the pattern
// is consumed through `String.prototype.matchAll`.
const TOKEN_RE = /[a-zA-Z_][a-zA-Z0-9_]*/g

// Split on camelCase/PascalCase boundaries:
// "HandlerStack" -> ["Handler", "Stack"]
// "getHTTPResponse" -> ["get", "HTTP", "Response"]
// "XMLParser" -> ["XML", "Parser"]
// "abc123Def" -> ["abc", "123", "Def"]
//
// Alternatives, tried in order at each position:
//   1. [A-Z]+(?=[A-Z][a-z])  a run of capitals that stops just before a
//                            capital followed by a lowercase letter
//   2. [A-Z]?[a-z]+          an optional capital followed by lowercase letters
//   3. [A-Z]+                any remaining run of capitals
//   4. [0-9]+                a run of digits
// Underscores are not matched by any alternative.
const CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+/g

/**
 * Expand one identifier into its lowercased form plus its component words.
 *
 * The first element of the result is always the whole token, lowercased.
 * Component words are appended only when the identifier breaks into at least
 * two of them:
 *   "HandlerStack" -> ["handlerstack", "handler", "stack"]
 *   "my_func"      -> ["my_func", "my", "func"]
 *   "simple"       -> ["simple"]
 *
 * A token containing an underscore is split on underscores only (empty
 * pieces are discarded); otherwise it is split with CAMEL_RE.
 *
 * @param token - The identifier to expand.
 * @returns The lowercased token, optionally followed by its lowercased parts.
 */
export function splitIdentifier(token: string): string[] {
  const lowered = token.toLowerCase()
  const hasUnderscore = token.includes('_')

  // Shortcut: with no underscore, uppercase letter or digit present, neither
  // splitting strategy is attempted and the lowercased token is returned
  // without running the camel-case regex.
  if (!hasUnderscore && !/[A-Z0-9]/.test(token)) {
    return [lowered]
  }

  const pieces = hasUnderscore
    // snake_case: split on underscores and discard the empty pieces
    ? lowered.split('_').filter(piece => piece.length > 0)
    // camelCase / PascalCase: collect every CAMEL_RE match, lowercased
    : Array.from(token.matchAll(CAMEL_RE)).map(([word]) => word.toLowerCase())

  // A single piece (or none) adds nothing beyond the lowercased token itself.
  return pieces.length >= 2 ? [lowered, ...pieces] : [lowered]
}
Comment on lines +19 to +44

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For simple lowercase tokens (which make up the vast majority of words in typical text), we can bypass the expensive matchAll regex execution and array allocation entirely. Since TOKEN_RE only matches [a-zA-Z0-9_], any token that does not contain underscores, uppercase letters, or digits consists solely of lowercase letters and cannot be split further. Adding a fast-path check at the beginning of splitIdentifier significantly improves tokenization performance.

/**
 * Reviewer-suggested form of splitIdentifier: returns the lowercased token,
 * followed by its lowercased sub-tokens when it splits into two or more.
 */
export function splitIdentifier(token: string): string[] {
  const lower = token.toLowerCase()
  // Fast path: a token with no underscore, uppercase letter or digit is
  // returned as-is (lowercased) without running matchAll.
  if (!token.includes('_') && !/[A-Z0-9]/.test(token)) {
    return [lower]
  }
  let parts: string[]

  if (token.includes('_')) {
    // snake_case splitting
    // Empty pieces (from leading, trailing or doubled underscores) are dropped.
    parts = lower.split('_').filter(p => p.length > 0)
  }
  else {
    // camelCase / PascalCase splitting
    // Each match of CAMEL_RE (declared outside this snippet) becomes one part.
    parts = Array.from(token.matchAll(CAMEL_RE), ([m]) => m.toLowerCase())
  }

  // Sub-tokens are only reported when there are at least two of them.
  if (parts.length >= 2) {
    return [lower, ...parts]
  }
  return [lower]
}


/**
 * Turn free text into lowercase identifier-like tokens for BM25 indexing.
 *
 * Every TOKEN_RE match in `text` is passed through splitIdentifier, and the
 * per-identifier results are concatenated in order of appearance. Compound
 * identifiers (camelCase, PascalCase, snake_case) therefore contribute both
 * the original compound token, kept for exact-match boosting, and their
 * sub-tokens, so that partial matches work.
 *
 * @param text - Arbitrary input text.
 * @returns The flattened token list; empty when `text` has no identifiers.
 */
export function tokenize(text: string): string[] {
  const identifiers = Array.from(text.matchAll(TOKEN_RE))
  return identifiers.flatMap(([identifier]) => splitIdentifier(identifier))
}
Comment thread
amondnet marked this conversation as resolved.