From 43879e7969b6216bd48fcc4925b372f185147f32 Mon Sep 17 00:00:00 2001
From: Minsu Lee <minsu.lee@dietfriends.kr>
Date: Fri, 19 Jun 2026 20:13:57 +0900
Subject: [PATCH 1/3] feat(search): wire real ranking pipeline + reconcile
 chunk size to 750

Two upstream parity reconciliations against MinishLab/semble (source of truth):

1. Wire real ranking pipeline into search (crates/csp/src/search.rs)
   - Replace inline identity/saturation-only stubs
     (apply_query_boost_identity, rerank_top_k_saturation) and duplicate
     FILE_SATURATION_* constants with the real implementations:
     ranking::boosting::apply_query_boost followed by
     ranking::penalties::rerank_top_k(.., penalise_paths = alpha_weight < 1.0)
   - Matches upstream search.search order: query-type boosts + path penalties
     + file-saturation decay now all run in the search path
   - Resolves TD-002

2. Reconcile chunk length to 750 + cache invalidation
   (crates/csp/src/chunking/source.rs, crates/csp/src/indexing/index.rs)
   - DESIRED_CHUNK_LENGTH_CHARS: 1500 -> 750 (matches upstream
     _DESIRED_CHUNK_LENGTH_CHARS)
   - Add chunk_size to index manifest and a try_reuse check that rebuilds
     any cache whose chunk_size differs or predates the field, mirroring
     upstream _metadata_matches
   - New test: try_reuse_rejects_stale_chunk_size

3. Update .please/docs/references/semble.md to mark TD-002 resolved and
   chunk-length 750 reconciled.

Quality gate passed: cargo fmt --all --check, cargo clippy --all-targets
--all-features -- -D warnings, cargo test --workspace (259 lib + 8 CLI, 0
failures).
---
 .please/docs/references/semble.md | 73 ++++++++++++++--------------
 crates/csp/src/chunking/source.rs |  4 +-
 crates/csp/src/indexing/index.rs  | 66 +++++++++++++++++++++++---
 crates/csp/src/search.rs          | 79 ++++---------------------------
 4 files changed, 109 insertions(+), 113 deletions(-)
diff --git a/.please/docs/references/semble.md b/.please/docs/references/semble.md
index b0aa53c..c7250e5 100644
--- a/.please/docs/references/semble.md
+++ b/.please/docs/references/semble.md
@@ -51,14 +51,15 @@ They are fused with **Reciprocal Rank Fusion** and then reranked with code-speci
    (dense matrix)              ("{content} {stem}           ▼
        │                        {stem} {dir[-3:]}")  rerank (if CODE):
        ▼                        → tokenize → BM25       boost_multi_chunk_files   (wired)
-  SelectableBasicBackend        index                  apply_query_boost          (⚠ identity stub)
-  (cosine)                                             rerank_top_k               (⚠ saturation-only stub)
+  SelectableBasicBackend        index                  apply_query_boost          (wired)
+  (cosine)                                             rerank_top_k               (wired, path penalties + saturation)
                                                               ▼
                                                        top_k SearchResult → ~/.csp/savings.jsonl
 ```
 
-⚠ = TD-002: the full ranking lives in `ranking::{boosting,penalties}` but is **not yet wired**
-into `search.rs`, mirroring the TS source's current state (see §4.10/§6).
+The full ranking in `ranking::{boosting,penalties}` is now **wired** into `search.rs`
+(query-type boosts + path penalties + file saturation), matching the upstream
+`search.search` pipeline order (see §4.10).
 
 ---
 
@@ -69,7 +70,7 @@ into `search.rs`, mirroring the TS source's current state (see §4.10/§6).
 | `types.py` | `csp/src/types.rs` | ported | `Chunk`, `ContentType`, `CallType` enums; `ChunkDict`/`SearchResultDict` serde |
 | `tokens.py` | `csp/src/tokens.rs` | ported | identifier-aware tokenizer (BM25 input) |
 | `chunking/core.py` | `csp/src/chunking/core.rs` | ported (real tree-sitter) | node-merge + line-fallback boundary algorithm; `TsNode` bridge |
-| `chunking/chunking.py` | `csp/src/chunking/source.rs` | ported (⚠ param drift) | `chunk_source` → `Vec<Chunk>`; char↔line conversion |
+| `chunking/chunking.py` | `csp/src/chunking/source.rs` | ported | `chunk_source` → `Vec<Chunk>`; char↔line conversion (chunk length 750) |
 | `index/file_walker.py` | `csp/src/indexing/file_walker.rs` | ported (`.cspignore`) | gitignore-aware recursive walk (`ignore` crate idioms) |
 | `index/files.py` | `csp/src/indexing/files.rs` | ported | ext→language map, content-type sets, file status checks |
 | `index/dense.py` | `csp/src/indexing/dense.rs` | ported (real + stub) | `Model` enum, `embed_chunks`, `SelectableBasicBackend` cosine |
@@ -77,10 +78,10 @@ into `search.rs`, mirroring the TS source's current state (see §4.10/§6).
 | `index/create.py` | `csp/src/indexing/create.rs` | ported | build BM25 + dense + chunks from a path |
 | `index/index.py` | `csp/src/indexing/index.rs` | ported | `CspIndex` orchestrator (from_path/from_git/search/find_related/save/load) + `load_or_build_index` |
 | `cache.py` | `csp/src/indexing/cache.rs` | adapted | content-hash cache at `~/.csp/index/` (ADR-0002), 0700 perms |
-| `search.py` | `csp/src/search.rs` | ported (⚠ ranking stub) | hybrid RRF + alpha blend; trait seams |
+| `search.py` | `csp/src/search.rs` | ported (ranking wired) | hybrid RRF + alpha blend; trait seams |
 | `ranking/weighting.py` | `csp/src/ranking/weighting.rs` | ported | adaptive alpha |
-| `ranking/boosting.py` | `csp/src/ranking/boosting.rs` | ported (boost_multi wired; others unwired) | query-type detection + definition/stem/embedded boosts |
-| `ranking/penalties.py` | `csp/src/ranking/penalties.rs` | ported (unwired) | path penalties + file-saturation rerank |
+| `ranking/boosting.py` | `csp/src/ranking/boosting.rs` | ported (wired) | query-type detection + definition/stem/embedded boosts |
+| `ranking/penalties.py` | `csp/src/ranking/penalties.rs` | ported (wired) | path penalties + file-saturation rerank |
 | `stats.py` | `csp/src/stats.rs` | adapted | `~/.csp/savings.jsonl` read/write + report formatting |
 | `mcp.py` | `csp/src/mcp.rs` (core) + `csp-cli/src/mcp_server.rs` (rmcp transport) | ported | MCP `search` / `find_related` tools |
 | `cli.py` | `csp-cli/src/main.rs` | adapted (clap) | subcommands: search / find-related / index / savings / clear / init / mcp |
@@ -131,7 +132,9 @@ Same contract as semble `tokens.py`:
   shape as semble. Byte offsets are converted to char offsets for multibyte safety.
 
 **`source.rs`** (`chunk_source`):
-- `DESIRED_CHUNK_LENGTH_CHARS = 1500` (⚠ upstream is now **750** — see §6).
+- `DESIRED_CHUNK_LENGTH_CHARS = 750` (matches upstream `_DESIRED_CHUNK_LENGTH_CHARS`).
+  The value is also recorded in the index manifest (`chunk_size`) so a cache built
+  with a different target length is auto-invalidated (see §4.9/§4.14).
 - AST chunking when `language_for(lang).is_some()`, else line fallback. Char offsets → 1-indexed
   line numbers; clamps end to avoid the zero-length off-by-one.
 
@@ -193,7 +196,7 @@ The public façade (parallels `SembleIndex`):
   `~/.csp/index/<hash>` on a validated hit, else build and persist.
 - Builds file→indices and language→indices maps for selectors and stats.
 
-### 4.10 `search.rs` — hybrid retrieval & fusion (with TD-002 stub)
+### 4.10 `search.rs` — hybrid retrieval & fusion
 
 The heart of ranking. `search(query, model, semantic_index, bm25_index, chunks, top_k, options)`:
 1. `resolve_alpha(query, options.alpha)`; `rerank = options.rerank.unwrap_or(true)`.
@@ -204,9 +207,10 @@ The heart of ranking. `search(query, model, semantic_index, bm25_index, chunks,
    `1/(RRF_K + rank)`, `RRF_K = 60`, rank from 1.
 6. Union of indices, **sorted by `start_line`** to neutralize hash-iteration nondeterminism;
    `combined = α·rrf_semantic + (1-α)·rrf_bm25`.
-7. If `rerank`: `boost_multi_chunk_files` (**wired**, shared impl) → `apply_query_boost_identity`
-   (⚠ stub) → `rerank_top_k_saturation` (⚠ stub: file-saturation decay only, path penalties
-   **not applied**, `penalise_paths` ignored). Else plain sort + truncate.
+7. If `rerank`: `boost_multi_chunk_files` → `ranking::boosting::apply_query_boost` →
+   `ranking::penalties::rerank_top_k(.., penalise_paths = alpha_weight < 1.0)` — the real
+   ranking functions, matching the upstream `search.search` order (path penalties apply only
+   when BM25 contributes). Else plain sort + truncate.
 
 **Rust idioms / structure**:
 - **Trait seams** for testability: `EmbeddingModel`, `VectorBackend`, `SparseBackend`,
@@ -220,23 +224,23 @@ The heart of ranking. `search(query, model, semantic_index, bm25_index, chunks,
   than panicking — matters for the long-running MCP server.
 - `SearchOptions` struct (`alpha`, `selector`, `rerank`) instead of Python kwargs.
 
-> **TD-002**: `ranking::boosting::apply_query_boost` and `ranking::penalties::rerank_top_k` are
-> fully ported (with tests) but **not wired** into `search.rs` — exactly as in the TS source,
-> which still uses the inline stubs (`TODO(integration)`). So search-ranking parity is
-> fixture-level. `FILE_SATURATION_THRESHOLD`/`DECAY` are therefore defined **twice** (the inline
-> stub in `search.rs` and the real one in `ranking/penalties.rs`).
+> **TD-002 (resolved)**: `ranking::boosting::apply_query_boost` and
+> `ranking::penalties::rerank_top_k` are now wired into `search.rs`, so the full ranking
+> (query-type boosts + path penalties + file-saturation decay) runs in the search path.
+> The duplicate inline `FILE_SATURATION_THRESHOLD`/`DECAY` stub constants in `search.rs` were
+> removed; the canonical definitions live only in `ranking/penalties.rs`.
 
 ### 4.11 `ranking/weighting.rs` — adaptive alpha
 
 `resolve_alpha(query, alpha)`: explicit wins; else `ALPHA_SYMBOL = 0.3` (BM25-leaning) for symbol
 queries vs `ALPHA_NL = 0.5` for NL, decided by `is_symbol_query`.
 
-### 4.12 `ranking/boosting.rs` — query-type detection & boosts (mostly unwired)
+### 4.12 `ranking/boosting.rs` — query-type detection & boosts (wired)
 
 Ported faithfully (`LazyLock<Regex>` for the static patterns, `RefCell<HashMap>` LRU for
 `definition_pattern` cache):
 - `SYMBOL_QUERY_RE` / `EMBEDDED_SYMBOL_RE` — symbol vs NL classification.
-- `apply_query_boost` (unwired): symbol → `_boost_symbol_definitions` (definition regex per
+- `apply_query_boost` (wired into `search.rs`): symbol → `_boost_symbol_definitions` (definition regex per
   keyword set: `class def fn func struct enum trait type …` case-sensitive + SQL DDL
   case-insensitive; `DEFINITION_BOOST_MULTIPLIER = 3.0`, ×1.5 on stem match); NL →
   `_boost_stem_matches` (`STEM_BOOST_MULTIPLIER = 1.0`, ≥0.10 ratio, prefix-match morphology) +
@@ -244,9 +248,9 @@ Ported faithfully (`LazyLock<Regex>` for the static patterns, `RefCell<HashMap>`
 - `boost_multi_chunk_files` (**wired** into search): top chunk per file boosted by
   `max_score * FILE_COHERENCE_BOOST_FRAC` (=0.2) × (file score sum / max file sum).
 
-### 4.13 `ranking/penalties.rs` — path penalties & saturation rerank (unwired)
+### 4.13 `ranking/penalties.rs` — path penalties & saturation rerank (wired)
 
-`rerank_top_k(scores, chunks, top_k, penalise_paths)` ported but unwired:
+`rerank_top_k(scores, chunks, top_k, penalise_paths)` ported and wired into `search.rs`:
 - Path penalties (multiplicative): test files/dirs `STRONG_PENALTY = 0.3`; compat/legacy +
   examples/docs `0.3`; re-export barrels (`__init__.py`, `package-info.java`)
   `MODERATE_PENALTY = 0.5`; `.d.ts` `MILD_PENALTY = 0.7`.
@@ -260,6 +264,11 @@ Ported faithfully (`LazyLock<Regex>` for the static patterns, `RefCell<HashMap>`
   (NFR-003), tightening pre-existing dirs on Unix.
 - `clear_index_cache` removes only the index dir — never the `~/.csp` home (which also holds
   `savings.jsonl`).
+- **Cache validity** (`try_reuse`): a cached index is reused only when the manifest's
+  `chunk_size` equals the current `DESIRED_CHUNK_LENGTH_CHARS` (a manifest predating the field
+  → `None` → rebuild) **and**, for local sources, the live source-file content hash matches.
+  This mirrors upstream `_metadata_matches`, which gained a `chunk_size` check so the 1500→750
+  change auto-invalidates stale caches.
 - **Divergence from upstream**: semble uses the OS cache dir (`~/Library/Caches/semble`, XDG,
   `%LOCALAPPDATA%`) + `SEMBLE_CACHE_LOCATION`; csp fixes a global `~/.csp/index/` per ADR-0002.
 
@@ -309,7 +318,7 @@ Clean two-layer split:
 | RRF k | `60` | `60` | `search.rs RRF_K` |
 | α symbol / NL | `0.3` / `0.5` | `0.3` / `0.5` | `ranking/weighting.rs` |
 | candidate over-fetch | `top_k * 5` | `top_k * 5` | `search.rs` |
-| desired chunk length | **`750`** | **`1500`** ⚠ | `chunking/source.rs` |
+| desired chunk length | `750` | `750` | `chunking/source.rs` |
 | min chunk size | `50` | `50` | `chunking/core.rs` |
 | recursion depth | `500` | `500` | `chunking/core.rs` |
 | definition boost × | `3.0` | `3.0` | `ranking/boosting.rs` |
@@ -318,7 +327,7 @@ Clean two-layer split:
 | stem boost × | `1.0` | `1.0` | `ranking/boosting.rs` |
 | file-coherence frac | `0.2` | `0.2` | `ranking/boosting.rs` |
 | strong / moderate / mild penalty | `0.3` / `0.5` / `0.7` | same | `ranking/penalties.rs` |
-| file saturation threshold / decay | `1` / `0.5` | `1` / `0.5` (defined **twice**, see §4.10) | `search.rs` + `ranking/penalties.rs` |
+| file saturation threshold / decay | `1` / `0.5` | `1` / `0.5` | `ranking/penalties.rs` |
 | max file bytes | `1_000_000` | `1_000_000` | `index/files.py` / `indexing/create.rs` |
 | default model | `minishlab/potion-code-16M` | same (real + stub) | `utils.py` / `indexing/dense.rs` |
 | MCP in-mem LRU | `10` | `10` | `mcp.py` / `csp::mcp` |
@@ -344,10 +353,6 @@ Clean two-layer split:
 
 ### 6.2 Open stubs & gaps (verify before claiming runtime parity)
 
-- **TD-002 — ranking not wired**: `search.rs` uses `apply_query_boost_identity` +
-  `rerank_top_k_saturation`; the real `ranking::{boosting::apply_query_boost,
-  penalties::rerank_top_k}` are ported but unwired (matches the TS source). Search-ranking parity
-  is fixture-level only. Saturation constants are duplicated as a result.
 - **Curated tree-sitter set** — only ~14 grammars are statically linked (`language_for`); upstream
   uses `tree_sitter_language_pack` (≈all languages). Languages outside the curated set are
   recognized by the extension map but **line-chunked**, not AST-chunked. This is a real behavioral
@@ -355,13 +360,11 @@ Clean two-layer split:
 
 ### 6.3 Upstream drift since the review baseline (`eacbe43` → `136b6f7`)
 
-> ⚠ **Action item**: reconcile before claiming parity.
-
-- **Chunk length changed to 750** upstream (`chunking/chunking.py`), while the Rust port (and the
-  TS source it mirrors) still use **1500**. Upstream also added a `chunk_size` field to index
-  metadata + cache validation so the change auto-invalidates stale caches. Decide whether csp
-  follows 750 (and adds the metadata field to both TS and Rust) or documents 1500 as a deliberate
-  divergence.
+- **Chunk length 750 (reconciled)** — the Rust port now uses `DESIRED_CHUNK_LENGTH_CHARS = 750`
+  (was 1500), matching upstream `chunking/chunking.py`. The value is recorded in the index
+  manifest as `chunk_size` and validated in `try_reuse`, so the change auto-invalidates stale
+  caches (mirrors upstream's added metadata field + cache check). The TS source still uses 1500,
+  but per the current direction Python upstream — not TS — is the source of truth.
 
 ---
 
diff --git a/crates/csp/src/chunking/source.rs b/crates/csp/src/chunking/source.rs
index 541386a..2e04fd7 100644
--- a/crates/csp/src/chunking/source.rs
+++ b/crates/csp/src/chunking/source.rs
@@ -9,8 +9,8 @@
 use super::core::{chunk as chunk_ast, chunk_lines, is_supported_language, ChunkBoundary};
 use crate::types::Chunk;
 
-/// The desired length of chunks in characters.
-pub const DESIRED_CHUNK_LENGTH_CHARS: usize = 1500;
+/// The desired length of chunks in characters (semble `_DESIRED_CHUNK_LENGTH_CHARS`).
+pub const DESIRED_CHUNK_LENGTH_CHARS: usize = 750;
 
 /// Chunk pre-read source text.
 pub fn chunk_source(source: &str, file_path: &str, language: Option<&str>) -> Vec<Chunk> {
diff --git a/crates/csp/src/indexing/index.rs b/crates/csp/src/indexing/index.rs
index 8cfeed2..c3ea8ec 100644
--- a/crates/csp/src/indexing/index.rs
+++ b/crates/csp/src/indexing/index.rs
@@ -10,6 +10,7 @@ use std::process::Command;
 use serde::{Deserialize, Serialize};
 use sha2::{Digest, Sha256};
 
+use crate::chunking::source::DESIRED_CHUNK_LENGTH_CHARS;
 use crate::indexing::cache::{
     compute_content_hash, ensure_cache_dir, resolve_cache_dir, CacheFile, CacheLocation,
 };
@@ -40,6 +41,11 @@ pub struct IndexManifest {
     pub source_id: Option<String>,
     pub content: Vec<ContentType>,
     pub model_id: String,
+    /// Target chunk length the index was built with. Changing it alters every
+    /// chunk boundary, so a cache built with a different value must be rebuilt
+    /// (mirrors semble `_metadata_matches`). `None` = built before this field
+    /// existed → treated as a mismatch.
+    pub chunk_size: Option<u32>,
 }
 
 /// Query options for [`CspIndex::search`] / [`CspIndex::find_related`].
@@ -289,6 +295,7 @@ impl CspIndex {
             source_id: self.root.clone(),
             content: self.content.clone(),
             model_id: self.model_path.clone(),
+            chunk_size: Some(DESIRED_CHUNK_LENGTH_CHARS as u32),
         };
         let manifest_json = serde_json::to_string(&manifest).map_err(|e| e.to_string())?;
         std::fs::write(dir.join("manifest.json"), manifest_json).map_err(|e| e.to_string())
@@ -424,6 +431,16 @@ pub fn parse_manifest(raw: &serde_json::Value) -> Result<IndexManifest, String>
         .and_then(serde_json::Value::as_str)
         .ok_or("Invalid manifest: modelId must be a string")?
         .to_string();
+    // Absent/null = built before the field existed → None (treated as a cache
+    // mismatch by `try_reuse`). A present-but-non-numeric value is malformed.
+    let chunk_size = match obj.get("chunkSize") {
+        None | Some(serde_json::Value::Null) => None,
+        Some(v) => Some(
+            v.as_u64()
+                .and_then(|n| u32::try_from(n).ok())
+                .ok_or("Invalid manifest: chunkSize must be a u32")?,
+        ),
+    };
     let content_arr = obj
         .get("content")
         .and_then(serde_json::Value::as_array)
@@ -442,6 +459,7 @@ pub fn parse_manifest(raw: &serde_json::Value) -> Result<IndexManifest, String>
         source_id,
         content,
         model_id,
+        chunk_size,
     })
 }
 
@@ -530,13 +548,18 @@ fn try_reuse(cache_dir: &Path, is_git: bool, source_hash: Option<&str>) -> Optio
     if !manifest_path.exists() {
         return None;
     }
-    if !is_git {
-        let raw = std::fs::read_to_string(&manifest_path).ok()?;
-        let value: serde_json::Value = serde_json::from_str(&raw).ok()?;
-        let manifest = parse_manifest(&value).ok()?;
-        if Some(manifest.content_hash.as_str()) != source_hash {
-            return None;
-        }
+    let raw = std::fs::read_to_string(&manifest_path).ok()?;
+    let value: serde_json::Value = serde_json::from_str(&raw).ok()?;
+    let manifest = parse_manifest(&value).ok()?;
+    // A chunk_size change re-chunks every file, so a cache built with a different
+    // target length is stale even if the source files are byte-identical.
+    if manifest.chunk_size != Some(DESIRED_CHUNK_LENGTH_CHARS as u32) {
+        return None;
+    }
+    // Local sources additionally validate the live source-file hash; git sources
+    // are URL+ref keyed (no cheap live hash).
+    if !is_git && Some(manifest.content_hash.as_str()) != source_hash {
+        return None;
     }
     CspIndex::load_from_disk(cache_dir).ok()
 }
@@ -737,6 +760,35 @@ mod tests {
         assert_eq!(value["modelId"], "test-model");
         assert_eq!(value["content"], serde_json::json!(["code"]));
         assert!(value["contentHash"].as_str().unwrap().len() == 64);
+        assert_eq!(
+            value["chunkSize"].as_u64(),
+            Some(u64::from(DESIRED_CHUNK_LENGTH_CHARS as u32))
+        );
+    }
+
+    #[test]
+    fn try_reuse_rejects_stale_chunk_size() {
+        let chunks = vec![make_chunk("a.ts", 1, 10, Some("typescript"), "A")];
+        let idx = build_index(chunks);
+        let dir = tempdir().unwrap();
+        idx.save(dir.path(), Some("deadbeef")).unwrap();
+
+        // Fresh cache (matching hash + current chunk_size) is reused.
+        assert!(try_reuse(dir.path(), false, Some("deadbeef")).is_some());
+
+        // Rewrite the manifest with a different chunk_size → stale → rebuild.
+        let manifest_path = dir.path().join("manifest.json");
+        let raw = std::fs::read_to_string(&manifest_path).unwrap();
+        let mut value: serde_json::Value = serde_json::from_str(&raw).unwrap();
+        value["chunkSize"] = serde_json::json!(9999);
+        std::fs::write(&manifest_path, value.to_string()).unwrap();
+        assert!(try_reuse(dir.path(), false, Some("deadbeef")).is_none());
+
+        // A manifest predating the field (absent chunkSize) is also stale.
+        let mut value: serde_json::Value = serde_json::from_str(&raw).unwrap();
+        value.as_object_mut().unwrap().remove("chunkSize");
+        std::fs::write(&manifest_path, value.to_string()).unwrap();
+        assert!(try_reuse(dir.path(), false, Some("deadbeef")).is_none());
     }
 
     #[test]
diff --git a/crates/csp/src/search.rs b/crates/csp/src/search.rs
index ddc7ded..70a824b 100644
--- a/crates/csp/src/search.rs
+++ b/crates/csp/src/search.rs
@@ -1,22 +1,16 @@
-//! Hybrid search pipeline. Port of `src/search.ts` (← semble `search.py`).
+//! Hybrid search pipeline. Port of semble `search.py`.
 //!
 //! semantic + BM25 → per-list RRF (`k=60`) → alpha-weighted combine → optional
-//! rerank (multi-chunk file boost → query boost → top-k with file saturation).
-//!
-//! Parity note: like `search.ts`, this reproduces the module's *current* inline
-//! ranking — `apply_query_boost` is an identity pass and `rerank_top_k` applies
-//! only file-saturation decay (no path penalties). The fuller
-//! `ranking::{boosting::apply_query_boost, penalties::rerank_top_k}` are ported
-//! (T006/T007) but, exactly as in the TS source, are not yet wired into the
-//! search pipeline (`TODO(integration)`). `boost_multi_chunk_files` *is* the
-//! shared ranking implementation (identical to the TS inline version).
+//! rerank. The rerank stage mirrors the upstream `search.search` order:
+//! multi-chunk file boost (`boost_multi_chunk_files`), then query-type boost
+//! (`apply_query_boost`), then top-k with path penalties + file saturation
+//! (`rerank_top_k`, with `penalise_paths = alpha_weight < 1.0`).
 
 use std::collections::HashSet;
 
-use indexmap::IndexMap;
-
 use crate::indexing::sparse::selector_to_mask;
-use crate::ranking::boosting::boost_multi_chunk_files;
+use crate::ranking::boosting::{apply_query_boost, boost_multi_chunk_files};
+use crate::ranking::penalties::rerank_top_k;
 use crate::ranking::weighting::resolve_alpha;
 use crate::ranking::Scores;
 use crate::tokens::tokenize;
@@ -25,9 +19,6 @@ use crate::types::Chunk;
 /// Reciprocal Rank Fusion constant.
 pub const RRF_K: usize = 60;
 
-const FILE_SATURATION_THRESHOLD: usize = 1;
-const FILE_SATURATION_DECAY: f64 = 0.5;
-
 /// A scored search hit.
 #[derive(Debug, Clone, PartialEq)]
 pub struct SearchResult {
@@ -168,57 +159,6 @@ pub struct SearchOptions {
     pub rerank: Option<bool>,
 }
 
-/// Identity query boost — mirrors the current `search.ts` inline stub. (The full
-/// `ranking::boosting::apply_query_boost` is ported but not yet wired here.)
-fn apply_query_boost_identity(scores: &Scores) -> Scores {
-    scores.clone()
-}
-
-/// Top-k rerank with file-saturation decay only — mirrors the current `search.ts`
-/// inline stub (path penalties not applied; the `penalise_paths` flag is ignored,
-/// matching the TS `void options`).
-fn rerank_top_k_saturation(scores: &Scores, chunks: &[Chunk], top_k: usize) -> Vec<(usize, f64)> {
-    if scores.is_empty() {
-        return Vec::new();
-    }
-    let mut ranked: Vec<(usize, f64)> = scores.iter().map(|(&i, &s)| (i, s)).collect();
-    ranked.sort_by(|a, b| b.1.total_cmp(&a.1));
-
-    let mut file_selected: IndexMap<String, usize> = IndexMap::new();
-    let mut selected: Vec<(f64, usize)> = Vec::new();
-    let mut min_selected = f64::INFINITY;
-
-    for (idx, pen_score) in ranked {
-        if selected.len() >= top_k && pen_score <= min_selected {
-            break;
-        }
-        let already = file_selected
-            .get(&chunks[idx].file_path)
-            .copied()
-            .unwrap_or(0);
-        let mut eff_score = pen_score;
-        if already >= FILE_SATURATION_THRESHOLD {
-            let excess = already - FILE_SATURATION_THRESHOLD + 1;
-            eff_score *= FILE_SATURATION_DECAY.powi(excess as i32);
-        }
-        selected.push((eff_score, idx));
-        file_selected.insert(chunks[idx].file_path.clone(), already + 1);
-        if selected.len() >= top_k {
-            min_selected = selected
-                .iter()
-                .map(|&(s, _)| s)
-                .fold(f64::INFINITY, f64::min);
-        }
-    }
-
-    selected.sort_by(|a, b| b.0.total_cmp(&a.0));
-    selected.truncate(top_k);
-    selected
-        .into_iter()
-        .map(|(score, idx)| (idx, score))
-        .collect()
-}
-
 /// Hybrid search: alpha-weighted combination of RRF-normalised semantic and BM25
 /// scores, with optional code-tuned reranking.
 pub fn search(
@@ -278,8 +218,9 @@ pub fn search(
 
     let ranked: Vec<(usize, f64)> = if rerank {
         boost_multi_chunk_files(&mut combined, chunks);
-        let boosted = apply_query_boost_identity(&combined);
-        rerank_top_k_saturation(&boosted, chunks, top_k)
+        let boosted = apply_query_boost(&combined, query, chunks);
+        // Path penalties apply only when BM25 contributes (alpha_weight < 1.0).
+        rerank_top_k(&boosted, chunks, top_k, alpha_weight < 1.0)
     } else {
         let mut entries: Vec<(usize, f64)> = combined.iter().map(|(&i, &s)| (i, s)).collect();
         entries.sort_by(|a, b| b.1.total_cmp(&a.1));

From ad8b932114f335d7b989ebc6f6749bde0ba295c3 Mon Sep 17 00:00:00 2001
From: Minsu Lee <minsu.lee@dietfriends.kr>
Date: Fri, 19 Jun 2026 20:18:18 +0900
Subject: [PATCH 2/3] docs: make Python upstream the source of truth; deprecate
 TS src
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Rust port now matches the Python upstream (MinishLab/semble) rather than
the TS stubs — dense embeddings are real, ranking is wired, chunk length is 750.
Reframe the governance docs accordingly:

- CLAUDE.md: Python upstream is the source of truth / parity oracle; TS `src/`
  is deprecated (historical/reference only, slated for deletion). Add a curl
  fallback for fetching upstream when `ask` is unavailable.
- ADR-0003: dated clarification that the equivalence oracle is the Python
  upstream and the TS source is no longer authoritative (decision preserved).
- semble.md header: same reframing; note PR #37 advanced the baseline.
---
 .please/docs/decisions/0003-rewrite-in-rust.md |  2 +-
 .please/docs/references/semble.md              | 13 ++++++++-----
 CLAUDE.md                                      | 12 +++++++-----
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/.please/docs/decisions/0003-rewrite-in-rust.md b/.please/docs/decisions/0003-rewrite-in-rust.md
index b301583..c3a02af 100644
--- a/.please/docs/decisions/0003-rewrite-in-rust.md
+++ b/.please/docs/decisions/0003-rewrite-in-rust.md
@@ -72,7 +72,7 @@ Adding napi-rs *now* would directly conflict with motivation #1 (single binary),
 
 - [ADR 0001](0001-native-tree-sitter.md)'s native-vs-WASM tension dissolves — tree-sitter is a native Rust crate. ADR 0001 stays accepted for the TS lineage but no longer constrains the Rust line.
 - [ADR 0002](0002-index-storage-cache-model.md)'s global `~/.csp/index/` cache model is language-agnostic and carries over unchanged.
-- The existing TS test suite becomes **golden fixtures** for verifying the Rust rewrite's behavioral equivalence, then is retired with the TS code.
+- The **Python upstream** (`MinishLab/semble`) is the equivalence oracle for the Rust rewrite; the existing TS test suite is reused as convenient language-neutral **golden fixtures** for already-ported modules, then retired with the TS code. *(Clarified 2026-06-19: the TS `src/` is the source of truth no longer — it is deprecated and slated for deletion. Where the Rust port has moved past the old TS stubs — real dense embeddings, wired ranking, chunk length 750 — the upstream Python is authoritative, not the TS source.)*
 
 ## Alternatives considered
 
diff --git a/.please/docs/references/semble.md b/.please/docs/references/semble.md
index c7250e5..6cee444 100644
--- a/.please/docs/references/semble.md
+++ b/.please/docs/references/semble.md
@@ -6,11 +6,14 @@
 > captures the load-bearing algorithm + its constants, the Rust-specific structure/idioms, and
 > where the port diverges.
 >
-> **Analyzed at**: upstream semble `136b6f7` (2026-06-18); Rust port at repo `2f2baa2`
-> (PR #34 "Rust rewrite foundation"). **Parity oracle**: the TS `src/` test suite reused as
-> golden fixtures — Rust reproduces the TS module behavior bit-for-bit, so "parity" is
-> *fixture-level*, not full-runtime. The TS `src/` stays the source of truth until Rust reaches
-> parity (per ADR-0003).
+> **Analyzed at**: upstream semble `136b6f7` (2026-06-18); Rust port baseline `2f2baa2`
+> (PR #34 "Rust rewrite foundation"), since advanced by PR #37 (ranking wired + chunk 750).
+> **Source of truth**: the **Python upstream** (`MinishLab/semble`) — the Rust port targets
+> behavioral equivalence with it. The TS `src/` is **deprecated** (slated for deletion, kept
+> only as a historical/reference implementation) and is no longer the parity oracle; its test
+> suite remains usable as language-neutral golden fixtures for already-ported modules, but where
+> the Rust port has moved past the old TS stubs (real dense embeddings, wired ranking, chunk
+> length 750) the upstream Python is authoritative.
 > **Upstream layout**: Python `src/semble/`. **Port layout**: `crates/csp/src/` (lib) +
 > `crates/csp-cli/src/` (bin).
 
diff --git a/CLAUDE.md b/CLAUDE.md
index 295df0e..6480fee 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -8,19 +8,21 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 
 ### Rust rewrite (ADR-0003)
 
-A Rust port lives in `crates/csp` (library) + `crates/csp-cli` (`csp` binary); the TS `src/` stays the source of truth until Rust reaches parity.
+A Rust port lives in `crates/csp` (library) + `crates/csp-cli` (`csp` binary). **The Python upstream ([MinishLab/semble](https://github.com/MinishLab/semble)) is the source of truth** — the Rust port targets behavioral equivalence with the upstream Python. The TS `src/` is **deprecated**: slated for deletion and retained only as a historical/reference implementation; it is **no longer** the source of truth or the parity oracle.
 - Quality gate before every Rust commit: `cargo fmt --all && cargo clippy --all-targets --all-features -- -D warnings && cargo test --workspace`.
-- Parity oracle = the TS **test suite** reused as golden fixtures. TS `dense.ts` (Model2Vec) and `search.ts` ranking are deterministic **stubs** (`TODO(integration)`); Rust reproduces them bit-for-bit, so "parity" is fixture-level, not full runtime.
+- Parity oracle = the **Python upstream** behavior (read the source directly — see the fetch note below). The TS test suite stays usable as language-neutral golden fixtures for already-ported modules, but is not authoritative where it disagrees with upstream. The Rust port has intentionally moved **past the old TS stubs** to match upstream: dense embeddings are real (`model2vec-rs`, not the deterministic stub), the ranking pipeline is wired (query boosts + path penalties + file saturation), and the chunk length is `750`. The TS `src/` still carries the older stubs/values until it is removed.
 - CLI/MCP output is a **snake_case** wire dict (`csp::utils::format_results`, mirroring TS `SearchResult.toDict`), distinct from the camelCase `ChunkDict` used for on-disk persistence.
 - rmcp 1.7: the default `#[tool_handler]` rebuilds the router via `Self::tool_router()` and leaves a stored `tool_router` field unread (clippy `dead_code`) — use `#[tool_handler(router = self.tool_router)]`.
 
-When porting modules from semble, fetch the upstream source via `ask`:
+When porting modules from semble, fetch the upstream source and read the Python directly:
 
 ```bash
-ask src github:MinishLab/semble@main    # absolute path to the cached checkout
+ask src github:MinishLab/semble@main    # absolute path to the cached checkout (if `ask` is installed)
+# Fallback when `ask` is unavailable — fetch a raw file straight from GitHub:
+curl -fsSL https://raw.githubusercontent.com/MinishLab/semble/main/src/semble/search.py
 ```
 
-Read the Python source there directly — do not infer behavior from the README. Key upstream modules and their target TS counterparts live under `src/semble/` (Python): `types.py`, `tokens.py`, `chunking/`, `index/` (files, file_walker, dense, sparse, create, index), `ranking/` (boosting, penalties, weighting), `search.py`, `mcp.py`, `cli.py`, `cache.py`, `stats.py`, `utils.py`.
+Read the Python source directly — do not infer behavior from the README. Key upstream modules and their target TS counterparts live under `src/semble/` (Python): `types.py`, `tokens.py`, `chunking/`, `index/` (files, file_walker, dense, sparse, create, index), `ranking/` (boosting, penalties, weighting), `search.py`, `mcp.py`, `cli.py`, `cache.py`, `stats.py`, `utils.py`.
 
 ## Stack
 

From f79e8f4a6e639472c095e87ab551516621a14ef9 Mon Sep 17 00:00:00 2001
From: Minsu Lee <minsu.lee@dietfriends.kr>
Date: Sat, 20 Jun 2026 11:32:44 +0900
Subject: [PATCH 3/3] chore: apply AI code review suggestions

- index.rs: parse manifest chunkSize via Option::filter/map/transpose (gemini)
- CLAUDE.md: reconcile Conventions chunk length 1500 -> 750 to match Rust port (cubic)
---
 CLAUDE.md                        |  2 +-
 crates/csp/src/indexing/index.rs | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index 6480fee..a308c30 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -64,7 +64,7 @@ These names are **load-bearing** — they appear in the README's MCP configs, CL
 - **Adaptive alpha**: `is_symbol_query()` chooses `_ALPHA_SYMBOL=0.3` (BM25-leaning) vs `_ALPHA_NL=0.5` for NL queries.
 - **BM25 enrichment**: chunk text is augmented with `{stem} {stem} {last 3 dir parts}` before tokenization (`enrich_for_bm25`). Stem is repeated twice to up-weight path matches.
 - **Tokenization**: identifier-aware — `camelCase`/`PascalCase`/`snake_case` are split into sub-tokens *plus* the original lowercased compound. See `src/semble/tokens.py`.
-- **Chunking**: tree-sitter AST-based with line-fallback when language is unsupported. Target chunk length is 1500 chars; `_MIN_CHUNK_SIZE=50` prevents recursion into tiny nodes; `_RECURSION_DEPTH=500` guards pathological ASTs.
+- **Chunking**: tree-sitter AST-based with line-fallback when language is unsupported. Target chunk length is 750 chars (`DESIRED_CHUNK_LENGTH_CHARS`, matching upstream `chunking/chunking.py`; the deprecated TS `src/` still carries the older 1500); `_MIN_CHUNK_SIZE=50` prevents recursion into tiny nodes; `_RECURSION_DEPTH=500` guards pathological ASTs.
 - **Ranking pipeline order** (in `search.search`): semantic + BM25 → RRF → multi-chunk file boost → query-type boost (definition / stem / embedded-symbol) → top-k rerank with path penalties + file-saturation decay (`_FILE_SATURATION_DECAY=0.5` per extra chunk beyond 1 per file).
 - **Path penalties**: test files (`_STRONG_PENALTY=0.3`), `__init__.py`/barrels (`_MODERATE_PENALTY=0.5`), `.d.ts` (`_MILD_PENALTY=0.7`), compat/examples dirs (`_STRONG_PENALTY`). Apply only when `alpha_weight < 1.0` (i.e., BM25 contributing).
 - **File walking**: respect `.gitignore` *and* `.sembleignore` (port as `.cspignore`). Default-ignored dirs include `.git`, `node_modules`, `dist`, `build`, `.next`, plus add `.csp/` (replacement for `.semble/`). Note: the canonical **index cache** is no longer repo-local — per [ADR 0002](.please/docs/decisions/0002-index-storage-cache-model.md) it moved to the global `~/.csp/index/`. The repo-local `.csp/` entry stays in the default-ignore list for any local artifacts, but the index cache itself is global.
diff --git a/crates/csp/src/indexing/index.rs b/crates/csp/src/indexing/index.rs
index c3ea8ec..17f1185 100644
--- a/crates/csp/src/indexing/index.rs
+++ b/crates/csp/src/indexing/index.rs
@@ -433,14 +433,15 @@ pub fn parse_manifest(raw: &serde_json::Value) -> Result<IndexManifest, String>
         .to_string();
     // Absent/null = built before the field existed → None (treated as a cache
     // mismatch by `try_reuse`). A present-but-non-numeric value is malformed.
-    let chunk_size = match obj.get("chunkSize") {
-        None | Some(serde_json::Value::Null) => None,
-        Some(v) => Some(
+    let chunk_size = obj
+        .get("chunkSize")
+        .filter(|v| !v.is_null())
+        .map(|v| {
             v.as_u64()
                 .and_then(|n| u32::try_from(n).ok())
-                .ok_or("Invalid manifest: chunkSize must be a u32")?,
-        ),
-    };
+                .ok_or("Invalid manifest: chunkSize must be a u32")
+        })
+        .transpose()?;
     let content_arr = obj
         .get("content")
         .and_then(serde_json::Value::as_array)