From dc5fefdb0f893bfda56ef929aa681b4b829dc088 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:16:46 +0900 Subject: [PATCH 1/2] =?UTF-8?q?feat(indexing):=20port=20extension=E2=86=92?= =?UTF-8?q?language=20detection=20from=20semble?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port src/semble/index/files.py to TypeScript at src/indexing/files.ts. Exports: - EXTENSION_TO_LANGUAGE: full 347-entry record (no abbreviation) - DOC_LANGUAGES, CONFIG_LANGUAGES, DATA_LANGUAGES, ALL_LANGUAGES (ReadonlySet) - detectLanguage(fileName): mirrors Python's Path(name).suffix.lower() lookup (case-insensitive, dotfile-aware — '.gitignore' → undefined, because Python's Path('.gitignore').suffix is '' and the lookup misses) - getExtensions(types, extensions): unions content-type extensions with the user-provided list and returns a sorted, deduplicated array Includes 22 bun:test cases covering common languages, case-insensitivity, dotfile semantics, multi-dot filenames, content-type unions, and set non-emptiness invariants. 
--- src/indexing/files.test.ts | 123 +++++++++ src/indexing/files.ts | 508 +++++++++++++++++++++++++++++++++++++ 2 files changed, 631 insertions(+) create mode 100644 src/indexing/files.test.ts create mode 100644 src/indexing/files.ts diff --git a/src/indexing/files.test.ts b/src/indexing/files.test.ts new file mode 100644 index 0000000..0e45037 --- /dev/null +++ b/src/indexing/files.test.ts @@ -0,0 +1,123 @@ +import { describe, expect, it } from 'bun:test' +import { + ALL_LANGUAGES, + CONFIG_LANGUAGES, + DATA_LANGUAGES, + detectLanguage, + DOC_LANGUAGES, + EXTENSION_TO_LANGUAGE, + getExtensions, +} from './files.ts' + +describe('detectLanguage', () => { + it('detects typescript from .ts', () => { + expect(detectLanguage('foo.ts')).toBe('typescript') + }) + + it('detects tsx from .tsx', () => { + expect(detectLanguage('foo.tsx')).toBe('tsx') + }) + + it('detects python from .py', () => { + expect(detectLanguage('foo.py')).toBe('python') + }) + + it('detects markdown from .md', () => { + expect(detectLanguage('foo.md')).toBe('markdown') + }) + + it('returns undefined for unknown extensions', () => { + expect(detectLanguage('foo.unknown')).toBeUndefined() + }) + + it('is case-insensitive on the suffix', () => { + expect(detectLanguage('Foo.TS')).toBe('typescript') + }) + + it('returns undefined for files without an extension', () => { + expect(detectLanguage('Makefile')).toBeUndefined() + }) + + it('returns undefined for dotfiles like .gitignore', () => { + // Mirrors Python's Path('.gitignore').suffix === '' + expect(detectLanguage('.gitignore')).toBeUndefined() + }) + + it('matches the final suffix for files with multiple dots', () => { + expect(detectLanguage('foo.bar.ts')).toBe('typescript') + }) + + it('handles paths with directory separators', () => { + expect(detectLanguage('src/indexing/files.ts')).toBe('typescript') + }) +}) + +describe('getExtensions', () => { + it('includes common code extensions when content type is code', () => { + const exts = 
getExtensions(['code'], undefined) + expect(exts).toContain('.ts') + expect(exts).toContain('.py') + expect(exts).toContain('.go') + }) + + it('includes doc extensions but not code extensions when content type is docs', () => { + const exts = getExtensions(['docs'], undefined) + expect(exts).toContain('.md') + expect(exts).toContain('.rst') + expect(exts).not.toContain('.ts') + }) + + it('includes config extensions when content type is config', () => { + const exts = getExtensions(['config'], undefined) + expect(exts).toContain('.toml') + expect(exts).toContain('.yaml') + }) + + it('appends user-provided extensions', () => { + const exts = getExtensions(['code'], ['.foo']) + expect(exts).toContain('.foo') + }) + + it('returns a sorted list with no duplicates', () => { + const exts = getExtensions(['code', 'docs'], ['.ts', '.foo']) + const sorted = [...exts].sort() + expect(exts).toEqual(sorted) + expect(new Set(exts).size).toBe(exts.length) + }) + + it('unions multiple content types', () => { + const code = new Set(getExtensions(['code'], undefined)) + const docs = new Set(getExtensions(['docs'], undefined)) + const both = new Set(getExtensions(['code', 'docs'], undefined)) + for (const ext of code) expect(both.has(ext)).toBe(true) + for (const ext of docs) expect(both.has(ext)).toBe(true) + }) +}) + +describe('language sets', () => { + it('EXTENSION_TO_LANGUAGE is non-empty', () => { + expect(Object.keys(EXTENSION_TO_LANGUAGE).length).toBeGreaterThan(0) + }) + + it('ALL_LANGUAGES is non-empty', () => { + expect(ALL_LANGUAGES.size).toBeGreaterThan(0) + }) + + it('DOC_LANGUAGES is non-empty', () => { + expect(DOC_LANGUAGES.size).toBeGreaterThan(0) + }) + + it('CONFIG_LANGUAGES is non-empty', () => { + expect(CONFIG_LANGUAGES.size).toBeGreaterThan(0) + }) + + it('DATA_LANGUAGES is non-empty', () => { + expect(DATA_LANGUAGES.size).toBeGreaterThan(0) + }) + + it('ALL_LANGUAGES contains every value in EXTENSION_TO_LANGUAGE', () => { + for (const lang of 
Object.values(EXTENSION_TO_LANGUAGE)) { + expect(ALL_LANGUAGES.has(lang)).toBe(true) + } + }) +}) diff --git a/src/indexing/files.ts b/src/indexing/files.ts new file mode 100644 index 0000000..4ef8cb1 --- /dev/null +++ b/src/indexing/files.ts @@ -0,0 +1,508 @@ +// Port of src/semble/index/files.py + +export type ContentType = 'code' | 'docs' | 'config' + +export const EXTENSION_TO_LANGUAGE: Record = { + '.4th': 'forth', + '.ada': 'ada', + '.adb': 'ada', + '.adoc': 'asciidoc', + '.ads': 'ada', + '.agda': 'agda', + '.al': 'al', + '.as': 'actionscript', + '.asciidoc': 'asciidoc', + '.asm': 'asm', + '.astro': 'astro', + '.awk': 'awk', + '.axi': 'netlinx', + '.axs': 'netlinx', + '.bash': 'bash', + '.bat': 'batch', + '.bb': 'bitbake', + '.bbappend': 'bitbake', + '.bbclass': 'bitbake', + '.beancount': 'beancount', + '.bib': 'bibtex', + '.bicep': 'bicep', + '.blade': 'blade', + '.bq': 'sql_bigquery', + '.brs': 'brightscript', + '.bsl': 'bsl', + '.bzl': 'starlark', + '.c': 'c', + '.c3': 'c3', + '.c3i': 'c3', + '.c3t': 'c3', + '.caddyfile': 'caddy', + '.cairo': 'cairo', + '.capnp': 'capnp', + '.cbl': 'cobol', + '.cc': 'cpp', + '.cedar': 'cedar', + '.cedarschema': 'cedarschema', + '.cel': 'cel', + '.cfc': 'cfml', + '.cfg': 'ini', + '.chatito': 'chatito', + '.circom': 'circom', + '.cjs': 'javascript', + '.ck': 'chuck', + '.cl': 'commonlisp', + '.clar': 'clarity', + '.clj': 'clojure', + '.cljc': 'clojure', + '.cljs': 'clojure', + '.cls': 'abl', + '.cmake': 'cmake', + '.cmd': 'batch', + '.cob': 'cobol', + '.cobol': 'cobol', + '.conf': 'nginx', + '.cook': 'cooklang', + '.corn': 'corn', + '.cpon': 'cpon', + '.cpp': 'cpp', + '.cr': 'crystal', + '.cs': 'csharp', + '.cshtml': 'razor', + '.css': 'css', + '.cst': 'cst', + '.csv': 'csv', + '.cts': 'typescript', + '.cu': 'cuda', + '.cuda': 'cuda', + '.cue': 'cue', + '.cxx': 'cpp', + '.cylc': 'cylc', + '.d': 'd', + '.dart': 'dart', + '.desktop': 'desktop', + '.dhall': 'dhall', + '.diff': 'diff', + '.dj': 'djot', + '.dl': 'souffle', + 
'.dockerfile': 'dockerfile', + '.dot': 'dot', + '.dsp': 'faust', + '.dtd': 'dtd', + '.dts': 'devicetree', + '.dtsi': 'devicetree', + '.ebnf': 'ebnf', + '.eds': 'eds', + '.eex': 'eex', + '.el': 'elisp', + '.elm': 'elm', + '.elv': 'elvish', + '.enforce': 'enforce', + '.eps': 'postscript', + '.erb': 'embeddedtemplate', + '.erl': 'erlang', + '.ex': 'elixir', + '.exs': 'elixir', + '.f': 'fortran', + '.f03': 'fortran', + '.f08': 'fortran', + '.f90': 'fortran', + '.f95': 'fortran', + '.fc': 'func', + '.fidl': 'fidl', + '.filter': 'poe_filter', + '.fir': 'firrtl', + '.fish': 'fish', + '.fnl': 'fennel', + '.fs': 'fsharp', + '.fsd': 'facility', + '.fsi': 'fsharp_signature', + '.fsx': 'fsharp', + '.fth': 'forth', + '.fun': 'sml', + '.g': 'gap', + '.gd': 'gdscript', + '.gdshader': 'gdshader', + '.gi': 'gap', + '.gitattributes': 'gitattributes', + '.gitignore': 'gitignore', + '.gleam': 'gleam', + '.glsl': 'glsl', + '.gn': 'gn', + '.gni': 'gn', + '.gnuplot': 'gnuplot', + '.go': 'go', + '.gotmpl': 'gotmpl', + '.gp': 'gnuplot', + '.gql': 'graphql', + '.gradle': 'groovy', + '.graphql': 'graphql', + '.gren': 'gren', + '.groovy': 'groovy', + '.gv': 'dot', + '.h': 'c', + '.hack': 'hack', + '.hare': 'hare', + '.hbs': 'glimmer', + '.hcl': 'hcl', + '.heex': 'heex', + '.hjson': 'hjson', + '.hlsl': 'hlsl', + '.hocon': 'hocon', + '.hoon': 'hoon', + '.hpp': 'cpp', + '.hrl': 'erlang', + '.hs': 'haskell', + '.htm': 'html', + '.html': 'html', + '.http': 'http', + '.hurl': 'hurl', + '.hx': 'haxe', + '.hxx': 'cpp', + '.idr': 'idris', + '.inc': 'sourcepawn', + '.ini': 'ini', + '.ino': 'arduino', + '.ispc': 'ispc', + '.j2': 'jinja2', + '.jai': 'jai', + '.janet': 'janet', + '.java': 'java', + '.jinja2': 'jinja2', + '.jl': 'julia', + '.journal': 'ledger', + '.jq': 'jq', + '.js': 'javascript', + '.json': 'json', + '.json5': 'json5', + '.jsonnet': 'jsonnet', + '.jsx': 'javascript', + '.just': 'just', + '.k': 'kcl', + '.kdl': 'kdl', + '.kt': 'kotlin', + '.kts': 'kotlin', + '.lc': 'elsa', + '.ldg': 
'ledger', + '.lds': 'linkerscript', + '.lean': 'lean', + '.ledger': 'ledger', + '.leex': 'eex', + '.less': 'less', + '.libsonnet': 'jsonnet', + '.liquid': 'liquid', + '.lisp': 'commonlisp', + '.ll': 'llvm', + '.lua': 'lua', + '.luau': 'luau', + '.m': 'objc', + '.magik': 'magik', + '.makefile': 'make', + '.markdown': 'markdown', + '.matlab': 'matlab', + '.md': 'markdown', + '.mermaid': 'mermaid', + '.meson': 'meson', + '.mjs': 'javascript', + '.mk': 'make', + '.ml': 'ocaml', + '.mli': 'ocaml_interface', + '.mlir': 'mlir', + '.mll': 'ocamllex', + '.mmd': 'mermaid', + '.mod': 'gomod', + '.mojo': 'mojo', + '.move': 'move', + '.mts': 'typescript', + '.nasm': 'nasm', + '.ncl': 'nickel', + '.nginx': 'nginx', + '.nim': 'nim', + '.nims': 'nim', + '.ninja': 'ninja', + '.nix': 'nix', + '.norg': 'norg', + '.nqc': 'nqc', + '.nu': 'nushell', + '.nut': 'squirrel', + '.odin': 'odin', + '.org': 'org', + '.p': 'abl', + '.pas': 'pascal', + '.patch': 'diff', + '.pbtxt': 'textproto', + '.pem': 'pem', + '.pgn': 'pgn', + '.php': 'php', + '.pkl': 'pkl', + '.pl': 'perl', + '.plt': 'gnuplot', + '.pm': 'perl', + '.po': 'po', + '.pony': 'pony', + '.pot': 'po', + '.pp': 'puppet', + '.prisma': 'prisma', + '.pro': 'prolog', + '.promql': 'promql', + '.properties': 'properties', + '.proto': 'proto', + '.prql': 'prql', + '.ps': 'postscript', + '.ps1': 'powershell', + '.psd1': 'powershell', + '.psm1': 'powershell', + '.psv': 'psv', + '.pug': 'pug', + '.purs': 'purescript', + '.py': 'python', + '.pyi': 'python', + '.pyw': 'python', + '.ql': 'ql', + '.qml': 'qmljs', + '.r': 'r', + '.rasi': 'rasi', + '.razor': 'razor', + '.rb': 'ruby', + '.rbs': 'rbs', + '.re': 're2c', + '.rego': 'rego', + '.res': 'rescript', + '.resi': 'rescript', + '.rkt': 'racket', + '.robot': 'robot', + '.roc': 'roc', + '.ron': 'ron', + '.rs': 'rust', + '.rst': 'rst', + '.rtf': 'rtf', + '.s': 'asm', + '.scad': 'openscad', + '.scala': 'scala', + '.scm': 'scheme', + '.scss': 'scss', + '.sh': 'bash', + '.shtml': 'superhtml', + '.sig': 
'sml', + '.slang': 'slang', + '.smali': 'smali', + '.smithy': 'smithy', + '.smk': 'snakemake', + '.sml': 'sml', + '.sol': 'solidity', + '.sp': 'sourcepawn', + '.sparql': 'sparql', + '.sql': 'sql', + '.squirrel': 'squirrel', + '.st': 'smalltalk', + '.stan': 'stan', + '.star': 'starlark', + '.sv': 'systemverilog', + '.svelte': 'svelte', + '.svh': 'systemverilog', + '.sw': 'sway', + '.swift': 'swift', + '.tact': 'tact', + '.tal': 'uxntal', + '.tape': 'vhs', + '.tcl': 'tcl', + '.td': 'tablegen', + '.templ': 'templ', + '.tera': 'tera', + '.tex': 'latex', + '.textproto': 'textproto', + '.tf': 'terraform', + '.tfvars': 'terraform', + '.thrift': 'thrift', + '.tl': 'teal', + '.tla': 'tlaplus', + '.todotxt': 'todotxt', + '.toml': 'toml', + '.tres': 'godot_resource', + '.trigger': 'apex', + '.ts': 'typescript', + '.tscn': 'godot_resource', + '.tsconfig': 'typoscript', + '.tsp': 'typespec', + '.tsv': 'tsv', + '.tsx': 'tsx', + '.ttl': 'turtle', + '.twig': 'twig', + // Overly broad + // '.txt': 'vimdoc', + '.typoscript': 'typoscript', + '.typst': 'typst', + '.v': 'v', + '.vb': 'vb', + '.verilog': 'verilog', + '.vhd': 'vhdl', + '.vhdl': 'vhdl', + '.vim': 'vim', + '.vrl': 'vrl', + '.vue': 'vue', + '.w': 'abl', + '.wast': 'wast', + '.wat': 'wat', + '.wgsl': 'wgsl', + '.wit': 'wit', + '.wl': 'wolfram', + '.xml': 'xml', + '.xsl': 'xml', + '.xslt': 'xml', + '.yaml': 'yaml', + '.yml': 'yaml', + '.yuck': 'yuck', + '.zig': 'zig', + '.ziggy': 'ziggy', + '.zsh': 'zsh', +} + +export const DOC_LANGUAGES: ReadonlySet = new Set([ + 'asciidoc', + 'bibtex', + 'djot', + 'doxygen', + 'html', + 'javadoc', + 'jsdoc', + 'latex', + 'luadoc', + 'markdown', + 'markdown_inline', + 'mermaid', + 'norg', + 'norg_meta', + 'org', + 'phpdoc', + 'po', + 'rst', + 'rtf', + 'vimdoc', +]) + +export const CONFIG_LANGUAGES: ReadonlySet = new Set([ + 'beancount', + 'capnp', + 'cedarschema', + 'comment', + 'cooklang', + 'cpon', + 'desktop', + 'devicetree', + 'diff', + 'dtd', + 'editorconfig', + 'ebnf', + 'git_config', 
+ 'gitattributes', + 'gitcommit', + 'gitignore', + 'godot_resource', + 'gomod', + 'gosum', + 'gowork', + 'gpg', + 'hjson', + 'hocon', + 'ini', + 'kdl', + 'ledger', + 'pem', + 'pgn', + 'properties', + 'proto', + 'requirements', + 'ron', + 'smithy', + 'ssh_config', + 'textproto', + 'thrift', + 'todotxt', + 'toml', + 'turtle', + 'typespec', + 'wit', + 'xcompose', + 'xml', + 'yaml', + 'ziggy_schema', +]) + +export const DATA_LANGUAGES: ReadonlySet = new Set([ + 'csv', + 'json', + 'json5', + 'psv', + 'tsv', +]) + +export const ALL_LANGUAGES: ReadonlySet = new Set( + Object.values(EXTENSION_TO_LANGUAGE), +) + +// Code languages = ALL - DOC - CONFIG - DATA +const CODE_LANGUAGES: ReadonlySet = (() => { + const set = new Set(ALL_LANGUAGES) + for (const l of DOC_LANGUAGES) set.delete(l) + for (const l of CONFIG_LANGUAGES) set.delete(l) + for (const l of DATA_LANGUAGES) set.delete(l) + return set +})() + +// Invert EXTENSION_TO_LANGUAGE, collecting duplicates per language. +const LANGUAGE_TO_EXTENSIONS: ReadonlyMap = (() => { + const inv = new Map() + for (const [ext, lang] of Object.entries(EXTENSION_TO_LANGUAGE)) { + const list = inv.get(lang) + if (list === undefined) inv.set(lang, [ext]) + else list.push(ext) + } + return inv +})() + +const CONTENT_TYPE_LANGUAGES: Record> = { + code: CODE_LANGUAGES, + docs: DOC_LANGUAGES, + config: CONFIG_LANGUAGES, +} + +/** + * Detect the language of a file by its extension. + * + * Matching is case-insensitive on the final `.suffix` (mirroring Python's + * `Path(...).suffix.lower()` lookup). + */ +export function detectLanguage(fileName: string): string | undefined { + // Mirror Python's Path(fileName).suffix.lower(): take the substring after + // the last '/', then return the part from the final '.' onward — but only + // if that '.' is not at the very start of the basename (so '.gitignore' + // resolves to '' just like Python). 
+ const base = fileName.slice(fileName.lastIndexOf('/') + 1) + const dot = base.lastIndexOf('.') + if (dot <= 0) return undefined + return EXTENSION_TO_LANGUAGE[base.slice(dot).toLowerCase()] +} + +/** + * Resolve a set of content types to the union of file extensions associated + * with their languages. Optional `extensions` are added verbatim. The result + * is sorted lexicographically (mirroring Python's `sorted(set)`). + */ +export function getExtensions( + types: readonly ContentType[], + extensions: readonly string[] | undefined, +): string[] { + const languages = new Set() + for (const type of types) { + for (const lang of CONTENT_TYPE_LANGUAGES[type]) { + languages.add(lang) + } + } + const out = new Set() + for (const lang of languages) { + const exts = LANGUAGE_TO_EXTENSIONS.get(lang) + if (exts === undefined) continue + for (const ext of exts) out.add(ext) + } + if (extensions !== undefined) { + for (const ext of extensions) out.add(ext) + } + return [...out].sort() +} From 7d4a229f12d7d64bf266ed92fc0c066e96b52738 Mon Sep 17 00:00:00 2001 From: Minsu Lee Date: Fri, 29 May 2026 00:42:49 +0900 Subject: [PATCH 2/2] review(indexing): apply gemini-code-assist feedback --- src/indexing/files.test.ts | 8 ++++++++ src/indexing/files.ts | 10 ++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/indexing/files.test.ts b/src/indexing/files.test.ts index 0e45037..3d3713b 100644 --- a/src/indexing/files.test.ts +++ b/src/indexing/files.test.ts @@ -41,6 +41,8 @@ describe('detectLanguage', () => { it('returns undefined for dotfiles like .gitignore', () => { // Mirrors Python's Path('.gitignore').suffix === '' expect(detectLanguage('.gitignore')).toBeUndefined() + expect(detectLanguage('dir/.gitignore')).toBeUndefined() + expect(detectLanguage('dir\\.gitignore')).toBeUndefined() }) it('matches the final suffix for files with multiple dots', () => { @@ -50,6 +52,12 @@ describe('detectLanguage', () => { it('handles paths with directory separators', 
() => { expect(detectLanguage('src/indexing/files.ts')).toBe('typescript') }) + + it('handles Windows-style path separators', () => { + // Mirrors pathlib.Path on Windows where '\\' is also a separator. + expect(detectLanguage('src\\indexing\\files.ts')).toBe('typescript') + expect(detectLanguage('C:\\Users\\me\\foo.py')).toBe('python') + }) }) describe('getExtensions', () => { diff --git a/src/indexing/files.ts b/src/indexing/files.ts index 4ef8cb1..7f6db55 100644 --- a/src/indexing/files.ts +++ b/src/indexing/files.ts @@ -471,10 +471,12 @@ const CONTENT_TYPE_LANGUAGES: Record> = { */ export function detectLanguage(fileName: string): string | undefined { // Mirror Python's Path(fileName).suffix.lower(): take the substring after - // the last '/', then return the part from the final '.' onward — but only - // if that '.' is not at the very start of the basename (so '.gitignore' - // resolves to '' just like Python). - const base = fileName.slice(fileName.lastIndexOf('/') + 1) + // the last path separator, then return the part from the final '.' onward — + // but only if that '.' is not at the very start of the basename (so + // '.gitignore' resolves to '' just like Python). Both POSIX ('/') and + // Windows ('\\') separators are handled, matching pathlib.Path on Windows. + const lastSep = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\')) + const base = fileName.slice(lastSep + 1) const dot = base.lastIndexOf('.') if (dot <= 0) return undefined return EXTENSION_TO_LANGUAGE[base.slice(dot).toLowerCase()]