From e0b5dfbf9b90e76d3187deee4816ad2fa771f8b7 Mon Sep 17 00:00:00 2001
From: Minsu Lee <amond@amond.net>
Date: Fri, 29 May 2026 00:15:25 +0900
Subject: [PATCH 1/2] feat(tokens): port identifier-aware tokenizer from semble

Ports src/semble/tokens.py to src/tokens.ts:

- splitIdentifier(token): lowered original + camelCase/snake_case
  sub-tokens. Returns only [lower] when fewer than two sub-tokens
  exist, matching the Python behaviour.
- tokenize(text): walks identifier matches with TOKEN_RE and
  expands each via splitIdentifier for BM25 indexing.

Regexes are ported verbatim (TOKEN_RE, CAMEL_RE) with the /g flag
so matchAll works. The snake_case branch filters empty parts to
match Python's '[p for p in lower.split("_") if p]'.

Tests cover the canonical semble examples plus edge cases verified
against the Python implementation (consecutive underscores, leading
underscore, digit runs).
---
 src/tokens.test.ts | 95 ++++++++++++++++++++++++++++++++++++++++++++++
 src/tokens.ts      | 51 +++++++++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100644 src/tokens.test.ts
 create mode 100644 src/tokens.ts

diff --git a/src/tokens.test.ts b/src/tokens.test.ts
new file mode 100644
index 0000000..4dfa7de
--- /dev/null
+++ b/src/tokens.test.ts
@@ -0,0 +1,95 @@
+// Port of src/semble/tokens.py tests
+
+import { describe, expect, it } from 'bun:test'
+import { splitIdentifier, tokenize } from './tokens.ts'
+
+describe('splitIdentifier', () => {
+  it('splits PascalCase identifiers', () => {
+    expect(splitIdentifier('HandlerStack')).toEqual([
+      'handlerstack',
+      'handler',
+      'stack',
+    ])
+  })
+
+  it('preserves runs of capitals as a single sub-token', () => {
+    expect(splitIdentifier('getHTTPResponse')).toEqual([
+      'gethttpresponse',
+      'get',
+      'http',
+      'response',
+    ])
+  })
+
+  it('handles leading run of capitals', () => {
+    expect(splitIdentifier('XMLParser')).toEqual([
+      'xmlparser',
+      'xml',
+      'parser',
+    ])
+  })
+
+  it('splits snake_case identifiers', () => {
+    expect(splitIdentifier('my_func')).toEqual(['my_func', 'my', 'func'])
+  })
+
+  it('returns only the lowered token when there is no boundary', () => {
+    expect(splitIdentifier('simple')).toEqual(['simple'])
+  })
+
+  it('lowercases an already lower-case token', () => {
+    expect(splitIdentifier('Already')).toEqual(['already'])
+  })
+
+  it('keeps consecutive underscores from collapsing into duplicate parts', () => {
+    // Python `split('_')` produces empty strings between consecutive
+    // underscores; the upstream filter drops them.
+    expect(splitIdentifier('foo__bar')).toEqual(['foo__bar', 'foo', 'bar'])
+  })
+
+  it('treats a leading underscore as snake_case with one effective part', () => {
+    // `_foo`.split('_') === ['', 'foo'] -> filtered to ['foo'] -> len < 2
+    expect(splitIdentifier('_foo')).toEqual(['_foo'])
+  })
+
+  it('splits digit runs as their own camel sub-token', () => {
+    expect(splitIdentifier('abc123Def')).toEqual([
+      'abc123def',
+      'abc',
+      '123',
+      'def',
+    ])
+  })
+})
+
+describe('tokenize', () => {
+  it('splits plain space-separated words', () => {
+    expect(tokenize('foo bar baz')).toEqual(['foo', 'bar', 'baz'])
+  })
+
+  it('expands compound identifiers and drops non-identifier digits', () => {
+    // Standalone digit runs (e.g. "123") are dropped: TOKEN_RE matches must
+    // start with a letter or `_`, which mirrors the upstream Python behaviour.
+    expect(tokenize('camelCase_snake_case 123')).toEqual([
+      'camelcase_snake_case',
+      'camelcase',
+      'snake',
+      'case',
+    ])
+  })
+
+  it('returns an empty array for input with no identifiers', () => {
+    expect(tokenize('   !!! 123 ???')).toEqual([])
+  })
+
+  it('preserves multiple identifiers and expands each', () => {
+    expect(tokenize('HandlerStack my_func')).toEqual([
+      'handlerstack',
+      'handler',
+      'stack',
+      'my_func',
+      'my',
+      'func',
+    ])
+  })
+})
diff --git a/src/tokens.ts b/src/tokens.ts
new file mode 100644
index 0000000..a5b9e90
--- /dev/null
+++ b/src/tokens.ts
@@ -0,0 +1,51 @@
+// Port of src/semble/tokens.py
+
+const TOKEN_RE = /[a-zA-Z_][a-zA-Z0-9_]*/g
+
+// Split on camelCase/PascalCase boundaries:
+//   "HandlerStack" -> ["Handler", "Stack"]
+//   "getHTTPResponse" -> ["get", "HTTP", "Response"]
+//   "XMLParser" -> ["XML", "Parser"]
+const CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+/g
+
+/**
+ * Split a single identifier into sub-tokens via camelCase/snake_case.
+ *
+ * Returns the original token (lowered) plus any sub-tokens.
+ * E.g. "HandlerStack" -> ["handlerstack", "handler", "stack"]
+ *      "my_func" -> ["my_func", "my", "func"]
+ *      "simple" -> ["simple"]
+ */
+export function splitIdentifier(token: string): string[] {
+  const lower = token.toLowerCase()
+  let parts: string[]
+
+  if (token.includes('_')) {
+    // snake_case splitting
+    parts = lower.split('_').filter(p => p.length > 0)
+  }
+  else {
+    // camelCase / PascalCase splitting
+    parts = Array.from(token.matchAll(CAMEL_RE), ([m]) => m.toLowerCase())
+  }
+
+  if (parts.length >= 2) {
+    return [lower, ...parts]
+  }
+  return [lower]
+}
+
+/**
+ * Split text into lowercase identifier-like tokens for BM25 indexing.
+ *
+ * Compound identifiers (camelCase, PascalCase, snake_case) are expanded
+ * into sub-tokens so that partial matches work. The original compound
+ * token is preserved for exact-match boosting.
+ */
+export function tokenize(text: string): string[] {
+  const result: string[] = []
+  for (const [match] of text.matchAll(TOKEN_RE)) {
+    result.push(...splitIdentifier(match))
+  }
+  return result
+}

From 67e3607e882fab6178c64e947fd39ef0d6770f9c Mon Sep 17 00:00:00 2001
From: Minsu Lee <minsu.lee@dietfriends.kr>
Date: Fri, 29 May 2026 00:42:16 +0900
Subject: [PATCH 2/2] review(tokens): apply gemini-code-assist feedback

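Add fast-path in splitIdentifier for pure-lowercase tokens to skip
unnecessary regex execution and array allocation. For tokens produced by
tokenize, behavior is identical to semble: TOKEN_RE only matches
[a-zA-Z0-9_], so the absence of _, A-Z, and 0-9 means the token cannot
split further. Direct callers that pass non-identifier characters (e.g.
'foo-bar') now get only the lowered token back instead of CAMEL_RE
sub-tokens.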
---
 src/tokens.ts | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/tokens.ts b/src/tokens.ts
index a5b9e90..598b60f 100644
--- a/src/tokens.ts
+++ b/src/tokens.ts
@@ -18,6 +18,14 @@ const CAMEL_RE = /[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|[0-9]+/g
  */
 export function splitIdentifier(token: string): string[] {
   const lower = token.toLowerCase()
+
+  // Fast-path: with no `_`, uppercase letter, or digit, a TOKEN_RE match
+  // ([a-zA-Z0-9_] only) is a single lowercase word and cannot split further.
+  // Non-identifier input (e.g. 'foo-bar') also exits here and is not split.
+  if (!token.includes('_') && !/[A-Z0-9]/.test(token)) {
+    return [lower]
+  }
+
   let parts: string[]
 
   if (token.includes('_')) {