pleaseai · amondnet · May 28, 2026 · May 28, 2026 · May 28, 2026 · gemini-code-assist
diff --git a/src/tokens.test.ts b/src/tokens.test.ts
@@ -0,0 +1,95 @@
+// Port of src/semble/tokens.py tests
+
+import { describe, expect, it } from 'bun:test'
+import { splitIdentifier, tokenize } from './tokens.ts'
+
+describe('splitIdentifier', () => {
+  it('splits PascalCase identifiers', () => {
+    expect(splitIdentifier('HandlerStack')).toEqual([
+      'handlerstack',
+      'handler',
+      'stack',
+    ])
+  })
+
+  it('preserves runs of capitals as a single sub-token', () => {
+    expect(splitIdentifier('getHTTPResponse')).toEqual([
+      'gethttpresponse',
+      'get',
+      'http',
+      'response',
+    ])
+  })
+
+  it('handles leading run of capitals', () => {
+    expect(splitIdentifier('XMLParser')).toEqual([
+      'xmlparser',
+      'xml',
+      'parser',
+    ])
+  })
+
+  it('splits snake_case identifiers', () => {
+    expect(splitIdentifier('my_func')).toEqual(['my_func', 'my', 'func'])
+  })
+
+  it('returns only the lowered token when there is no boundary', () => {
+    expect(splitIdentifier('simple')).toEqual(['simple'])
+  })
+
+  it('lowercases a capitalized token that has no internal boundary', () => {
+    expect(splitIdentifier('Already')).toEqual(['already'])
+  })
+
+  it('drops the empty parts produced by consecutive underscores', () => {
+    // Splitting 'foo__bar' on '_' yields ['foo', '', 'bar']; the empty string
+    // is filtered out, so only 'foo' and 'bar' follow the full token.
+    expect(splitIdentifier('foo__bar')).toEqual(['foo__bar', 'foo', 'bar'])
+  })
+
+  it('treats a leading underscore as snake_case with one effective part', () => {
+    // '_foo'.split('_') is ['', 'foo'] -> filtered to ['foo'] -> fewer than 2 parts
+    expect(splitIdentifier('_foo')).toEqual(['_foo'])
+  })
+
+  it('splits digit runs as their own camel sub-token', () => {
+    expect(splitIdentifier('abc123Def')).toEqual([
+      'abc123def',
+      'abc',
+      '123',
+      'def',
+    ])
+  })
+})
+
+describe('tokenize', () => {
+  it('splits plain space-separated words', () => {
+    expect(tokenize('foo bar baz')).toEqual(['foo', 'bar', 'baz'])
+  })
+
+  it('expands compound identifiers and drops non-identifier digits', () => {
+    // `_` wins over camelCase: the token is split on underscores only. A bare
+    // "123" is not matched by TOKEN_RE (mirrors the upstream Python behaviour).
+    expect(tokenize('camelCase_snake_case 123')).toEqual([
+      'camelcase_snake_case',
+      'camelcase',
+      'snake',
+      'case',
+    ])
+  })
+
+  it('returns an empty array for input with no identifiers', () => {
+    expect(tokenize('   !!! 123 ???')).toEqual([])
+  })
+
+  it('preserves multiple identifiers and expands each', () => {
+    expect(tokenize('HandlerStack my_func')).toEqual([
+      'handlerstack',
+      'handler',
+      'stack',
+      'my_func',
+      'my',
+      'func',
+    ])
+  })
+})
diff --git a/src/tokens.ts b/src/tokens.ts
@@ -0,0 +1,59 @@
+// Port of src/semble/tokens.py
+
+const TOKEN_RE = /[a-zA-Z_][a-zA-Z0-9_]*/g
+
+// Split on camelCase/PascalCase boundaries; a digit run is its own part:
+//   "HandlerStack" -> ["Handler", "Stack"]
+//   "getHTTPResponse" -> ["get", "HTTP", "Response"]
+//   "XMLParser" -> ["XML", "Parser"], "abc123Def" -> ["abc", "123", "Def"]
+const CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+/g
+
+/**
+ * Expand one identifier into its lowercased form plus camelCase/snake_case parts.
+ *
+ * A token containing `_` is split on underscores only; otherwise CAMEL_RE is
+ * applied. Sub-tokens are appended only when at least two parts result.
+ * E.g. "HandlerStack" -> ["handlerstack", "handler", "stack"]
+ *      "my_func" -> ["my_func", "my", "func"], "simple" -> ["simple"]
+ */
+export function splitIdentifier(token: string): string[] {
+  const lower = token.toLowerCase()
+
+  // Fast-path: with no `_`, uppercase letter, or digit there is nothing to
+  // split on. Assuming the token came from TOKEN_RE ([a-zA-Z0-9_] only), it
+  // is then all-lowercase letters, i.e. already a single sub-token.
+  if (!token.includes('_') && !/[A-Z0-9]/.test(token)) {
+    return [lower]
+  }
+
+  let parts: string[]
+
+  if (token.includes('_')) {
+    // snake_case splitting; empty pieces (leading/trailing/doubled `_`) are dropped
+    parts = lower.split('_').filter(p => p.length > 0)
+  }
+  else {
+    // camelCase / PascalCase / digit-run splitting on the original-case token
+    parts = Array.from(token.matchAll(CAMEL_RE), ([m]) => m.toLowerCase())
+  }
+
+  if (parts.length >= 2) {
+    return [lower, ...parts]
+  }
+  return [lower]
+}
+
+/**
+ * Split text into lowercase identifier-like tokens for BM25 indexing.
+ *
+ * Each TOKEN_RE match is expanded via splitIdentifier, in input order, so
+ * partial matches work; the original compound token is kept (lowercased) for
+ * exact-match boosting. Text not matched by TOKEN_RE yields no tokens.
+ */
+export function tokenize(text: string): string[] {
+  const result: string[] = []
+  for (const [match] of text.matchAll(TOKEN_RE)) {
+    result.push(...splitIdentifier(match))
+  }
+  return result
+}