diff --git a/.gitignore b/.gitignore index 404c87b..a36d8f7 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,6 @@ keys/ # Logs *.log .openclaude-profile.json + +# Local planning / scratch docs (never commit) +docs/superpowers/ diff --git a/crates/gitlawb-node/src/api/repos.rs b/crates/gitlawb-node/src/api/repos.rs index 0993d4b..884d7ca 100644 --- a/crates/gitlawb-node/src/api/repos.rs +++ b/crates/gitlawb-node/src/api/repos.rs @@ -12,7 +12,7 @@ use uuid::Uuid; use crate::cert; use crate::error::{AppError, Result}; -use crate::git::{smart_http, store}; +use crate::git::{smart_http, store, visibility_pack}; use crate::state::AppState; use crate::visibility::{visibility_check, Decision}; use crate::webhooks; @@ -330,6 +330,8 @@ pub async fn git_info_refs( if service == "git-upload-pack" { let rules = state.db.list_visibility_rules(&record.id).await?; let caller = auth.as_ref().map(|e| e.0 .0.as_str()); + // Subtree (mode B) rules do not gate the advertisement: refs expose commit + // tips only, and blob withholding happens in the upload-pack pack build. if visibility_check(&rules, record.is_public, &record.owner_did, caller, "/") == Decision::Deny { @@ -392,18 +394,45 @@ pub async fn git_upload_pack( .await .map_err(|e| AppError::Git(e.to_string()))?; let body_len = body.len(); - let resp = smart_http::upload_pack(&disk_path, body) + + // withheld_blob_oids walks every ref with blocking `git ls-tree`; keep that + // off the async worker thread. + let withheld = { + let path = disk_path.clone(); + let rules = rules.clone(); + let owner_did = record.owner_did.clone(); + let caller_owned = caller.map(str::to_string); + let is_public = record.is_public; + tokio::task::spawn_blocking(move || { + visibility_pack::withheld_blob_oids( + &path, + &rules, + is_public, + &owner_did, + caller_owned.as_deref(), + ) + }) .await - .map_err(|e| { - let msg = e.to_string(); - if msg.contains("bad line length") || msg.contains("protocol error") { - tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); - AppError::BadRequest(msg) - } else { - tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); - AppError::Git(msg) - } - })?; + .map_err(|e| AppError::Git(e.to_string()))? + .map_err(|e| AppError::Git(e.to_string()))? + }; + + let resp = if withheld.is_empty() { + smart_http::upload_pack(&disk_path, body).await + } else { + tracing::info!(repo = %name, caller = ?caller, withheld = withheld.len(), "serving filtered pack"); + smart_http::upload_pack_excluding(&disk_path, body, &withheld).await + } + .map_err(|e| { + let msg = e.to_string(); + if msg.contains("bad line length") || msg.contains("protocol error") { + tracing::warn!(repo = %name, err = %msg, "git-upload-pack: bad client request"); + AppError::BadRequest(msg) + } else { + tracing::error!(repo = %name, err = %msg, "git-upload-pack failed"); + AppError::Git(msg) + } + })?; crate::metrics::record_fetch(&format!("{owner}/{name}")); crate::metrics::observe_pack_size(body_len as f64); Ok(resp) @@ -579,14 +608,68 @@ pub async fn git_receive_pack( } } - // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty) - { + // Replication enforcement (Phase 2): decide once per push whether the public + // may read this repo at all and, if so, which blob OIDs must not leave the + // node. `withheld == None` means replicate nothing (private / mode A / + // undetermined): skip every pin so even commit and tree objects (which + // withheld_blob_oids never lists) stay local. `announce` gates the + // network-facing announcements. Fail closed: a private or undetermined repo + // never leaks. + let rules_opt = state.db.list_visibility_rules(&record.id).await.ok(); + let announce = match &rules_opt { + Some(rules) => { + visibility_check(rules, record.is_public, &record.owner_did, None, "/") + == Decision::Allow + } + None => false, + }; + let withheld: Option> = if !announce { + None + } else { + match &rules_opt { + Some(rules) if rules.is_empty() => Some(std::collections::HashSet::new()), + // withheld_blob_oids walks every ref with blocking `git ls-tree`; + // keep that off the async worker thread. + Some(rules) => { + let path = disk_path.clone(); + let rules = rules.clone(); + let owner_did = record.owner_did.clone(); + let is_public = record.is_public; + tokio::task::spawn_blocking(move || { + crate::git::visibility_pack::withheld_blob_oids( + &path, &rules, is_public, &owner_did, None, + ) + }) + .await + .map_err(|e| { + tracing::warn!(err = %e, "withheld_blob_oids task panicked; skipping replication for this push") + }) + .ok() + .and_then(|r| { + r.map_err(|e| { + tracing::warn!(err = %e, "withheld_blob_oids failed; skipping replication for this push") + }) + .ok() + }) + } + None => None, + } + }; + + // Pin new git objects to the local IPFS node (no-op if ipfs_api is empty). + // Skipped entirely when the public cannot read the repo (withheld == None). + if let Some(withheld_ipfs) = withheld.clone() { let ipfs_api = state.config.ipfs_api.clone(); let repo_path_clone = disk_path.clone(); let db_clone = state.db.clone(); tokio::spawn(async move { - let pinned = - crate::ipfs_pin::pin_new_objects(&ipfs_api, &repo_path_clone, &db_clone).await; + let pinned = crate::ipfs_pin::pin_new_objects( + &ipfs_api, + &repo_path_clone, + &db_clone, + &withheld_ipfs, + ) + .await; if !pinned.is_empty() { tracing::info!(count = pinned.len(), "pinned git objects to IPFS"); for (sha, cid) in &pinned { @@ -625,15 +708,22 @@ pub async fn git_receive_pack( let owner_did_for_arweave = record.owner_did.clone(); let self_public_url = state.config.public_url.clone(); let node_keypair = Arc::clone(&state.node_keypair); + let withheld_pinata = withheld; tokio::spawn(async move { - let pinned = crate::pinata::pin_new_objects( - &http_client, - &pinata_upload_url, - &pinata_jwt, - &repo_path_clone, - &db_clone, - ) - .await; + let pinned = match &withheld_pinata { + Some(withheld) => { + crate::pinata::pin_new_objects( + &http_client, + &pinata_upload_url, + &pinata_jwt, + &repo_path_clone, + &db_clone, + withheld, + ) + .await + } + None => Vec::new(), + }; if !pinned.is_empty() { tracing::info!(count = pinned.len(), "pinned git objects to Pinata"); @@ -652,77 +742,82 @@ pub async fn git_receive_pack( .await; } - if let Some(p2p) = &p2p_handle { - p2p.publish_ref_update(crate::p2p::RefUpdateEvent { - node_did: node_did_str.clone(), - pusher_did: pusher_did_clone.clone(), - repo: repo_slug.clone(), - ref_name: ref_name.clone(), - old_sha: "".to_string(), - new_sha: new_sha.clone(), - timestamp: chrono::Utc::now().to_rfc3339(), - cert_id: None, - cid: cid.map(|s| s.to_string()), - }) - .await; + if announce { + if let Some(p2p) = &p2p_handle { + p2p.publish_ref_update(crate::p2p::RefUpdateEvent { + node_did: node_did_str.clone(), + pusher_did: pusher_did_clone.clone(), + repo: repo_slug.clone(), + ref_name: ref_name.clone(), + old_sha: "".to_string(), + new_sha: new_sha.clone(), + timestamp: chrono::Utc::now().to_rfc3339(), + cert_id: None, + cid: cid.map(|s| s.to_string()), + }) + .await; + } } } // HTTP peer notification — notify all known peers to pull from us. // This is the reliable fallback when Gossipsub p2p is not yet connected. - if let Ok(peers) = db_for_peers.list_peers().await { - for peer in peers { - if peer.http_url.is_empty() { - continue; - } - let peer_url = peer.http_url.trim_end_matches('/'); - if let Some(self_url) = self_public_url.as_deref() { - if peer_url == self_url.trim_end_matches('/') { + // Suppressed for repos the public cannot read. + if announce { + if let Ok(peers) = db_for_peers.list_peers().await { + for peer in peers { + if peer.http_url.is_empty() { continue; } - } - let path = "/api/v1/sync/notify"; - let notify_url = format!("{peer_url}{path}"); - let body = serde_json::json!({ - "repo": repo_slug.clone(), - "ref_name": ref_updates_clone.first().map(|(r, _)| r).unwrap_or(&String::new()), - "new_sha": ref_updates_clone.first().map(|(_, s)| s).unwrap_or(&String::new()), - "node_did": node_did_str.clone(), - "pusher_did": pusher_did_clone.clone(), - "old_sha": "0000000000000000000000000000000000000000", - "timestamp": chrono::Utc::now().to_rfc3339(), - }); - let body_bytes = match serde_json::to_vec(&body) { - Ok(bytes) => bytes, - Err(e) => { - tracing::warn!(peer = %peer.did, err = %e, "failed to serialize peer sync notify"); - continue; - } - }; - let signed = gitlawb_core::http_sig::sign_request( - node_keypair.as_ref(), - "POST", - path, - &body_bytes, - ); - match http_client - .post(¬ify_url) - .header("Content-Type", "application/json") - .header("Content-Digest", signed.content_digest) - .header("Signature-Input", signed.signature_input) - .header("Signature", signed.signature) - .body(body_bytes) - .send() - .await - { - Ok(r) if r.status().is_success() => { - tracing::info!(peer = %peer.did, repo = %repo_slug, "notified peer to sync") - } - Ok(r) => { - tracing::warn!(peer = %peer.did, status = %r.status(), "peer sync notify returned error") + let peer_url = peer.http_url.trim_end_matches('/'); + if let Some(self_url) = self_public_url.as_deref() { + if peer_url == self_url.trim_end_matches('/') { + continue; + } } - Err(e) => { - tracing::warn!(peer = %peer.did, err = %e, "failed to notify peer") + let path = "/api/v1/sync/notify"; + let notify_url = format!("{peer_url}{path}"); + let body = serde_json::json!({ + "repo": repo_slug.clone(), + "ref_name": ref_updates_clone.first().map(|(r, _)| r).unwrap_or(&String::new()), + "new_sha": ref_updates_clone.first().map(|(_, s)| s).unwrap_or(&String::new()), + "node_did": node_did_str.clone(), + "pusher_did": pusher_did_clone.clone(), + "old_sha": "0000000000000000000000000000000000000000", + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + let body_bytes = match serde_json::to_vec(&body) { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(peer = %peer.did, err = %e, "failed to serialize peer sync notify"); + continue; + } + }; + let signed = gitlawb_core::http_sig::sign_request( + node_keypair.as_ref(), + "POST", + path, + &body_bytes, + ); + match http_client + .post(¬ify_url) + .header("Content-Type", "application/json") + .header("Content-Digest", signed.content_digest) + .header("Signature-Input", signed.signature_input) + .header("Signature", signed.signature) + .body(body_bytes) + .send() + .await + { + Ok(r) if r.status().is_success() => { + tracing::info!(peer = %peer.did, repo = %repo_slug, "notified peer to sync") + } + Ok(r) => { + tracing::warn!(peer = %peer.did, status = %r.status(), "peer sync notify returned error") + } + Err(e) => { + tracing::warn!(peer = %peer.did, err = %e, "failed to notify peer") + } } } } @@ -746,8 +841,9 @@ pub async fn git_receive_pack( timestamp: now_ts.clone(), }); - // Arweave permanent anchoring — fire for each ref update - if !irys_url.is_empty() { + // Arweave permanent anchoring — fire for each ref update. + // Suppressed for repos the public cannot read (public permanent ledger). + if announce && !irys_url.is_empty() { for (ref_name, new_sha) in &ref_updates_clone { let cid = cid_map.get(new_sha).cloned(); let anchor = crate::arweave::RefAnchor { diff --git a/crates/gitlawb-node/src/git/mod.rs b/crates/gitlawb-node/src/git/mod.rs index 4dcd233..49259d5 100644 --- a/crates/gitlawb-node/src/git/mod.rs +++ b/crates/gitlawb-node/src/git/mod.rs @@ -3,3 +3,4 @@ pub mod repo_store; pub mod smart_http; pub mod store; pub mod tigris; +pub mod visibility_pack; diff --git a/crates/gitlawb-node/src/git/smart_http.rs b/crates/gitlawb-node/src/git/smart_http.rs index 6a00107..80374fb 100644 --- a/crates/gitlawb-node/src/git/smart_http.rs +++ b/crates/gitlawb-node/src/git/smart_http.rs @@ -1,8 +1,9 @@ -use anyhow::{bail, Result}; +use anyhow::{bail, Context, Result}; use axum::body::Body; use axum::http::StatusCode; use axum::response::Response; use bytes::Bytes; +use std::collections::HashSet; use std::path::Path; use std::process::Stdio; use tokio::io::AsyncWriteExt; @@ -120,3 +121,571 @@ fn pkt_line(data: &str) -> Vec { let len = data.len() + 4; format!("{len:04x}{data}").into_bytes() } + +/// Build a packfile containing every object reachable from all refs EXCEPT the +/// given blob OIDs. Commits and trees are always included, so SHAs stay intact; +/// only the named blobs are dropped. +pub fn build_filtered_pack(repo_path: &Path, withheld: &HashSet) -> Result> { + // All reachable objects as "oid [path]" lines. + let rev = std::process::Command::new("git") + .args(["rev-list", "--objects", "--all"]) + .current_dir(repo_path) + .output()?; + if !rev.status.success() { + bail!( + "git rev-list failed: {}", + String::from_utf8_lossy(&rev.stderr) + ); + } + let mut keep = Vec::new(); + for line in String::from_utf8_lossy(&rev.stdout).lines() { + let oid = line.split_whitespace().next().unwrap_or(""); + if oid.is_empty() || withheld.contains(oid) { + continue; + } + keep.push(oid.to_string()); + } + let mut child = std::process::Command::new("git") + .args(["pack-objects", "--stdout"]) + .current_dir(repo_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn()?; + { + use std::io::Write as _; + let mut stdin = child.stdin.take().expect("stdin"); + stdin.write_all(keep.join("\n").as_bytes())?; + stdin.write_all(b"\n")?; + } + let out = child.wait_with_output()?; + if !out.status.success() { + bail!( + "git pack-objects failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + } + Ok(out.stdout) +} + +/// Serve a clone/fetch with the withheld blobs removed from the response pack. +/// +/// The framing is git protocol v0 (`NAK` then the pack), matching the v0 ref +/// advertisement that `info_refs` emits (it runs `git upload-pack +/// --advertise-refs` without `GIT_PROTOCOL=version=2`, so clients negotiate v0). +/// If `info_refs` ever advertises v2, this serve path must learn v2 framing too. +/// +/// Because the pack deliberately omits blobs that the sent trees still +/// reference, the pack is not closed under reachability. A stock full clone +/// rejects it at fetch time ("remote did not send all necessary objects"); only +/// a partial clone (the client passes `--filter`, marking a promisor remote) +/// accepts the pack with the private blobs absent. Tree and commit SHAs stay +/// intact either way. The clean partial-clone client UX is a separate follow-up +/// (git-remote-gitlawb); the security guarantee (private bytes never leave the +/// node) holds regardless of client. +/// +/// Negotiation is intentionally ignored: rather than honoring the client's +/// `want`/`have` lines, this always sends a self-contained pack of every object +/// across all refs minus the withheld blobs, and replies `NAK`. A fresh clone +/// and an incremental fetch are both correct (the client de-duplicates objects +/// it already has); the cost is that a fetch re-sends the full object set +/// instead of a thin delta. Honoring negotiation for smaller fetch packs is an +/// optimization follow-up, not a correctness requirement. +pub async fn upload_pack_excluding( + repo_path: &Path, + request_body: Bytes, + withheld: &HashSet, +) -> Result { + // build_filtered_pack shells out to git (rev-list, pack-objects) with + // blocking std::process I/O; run it off the async worker so a large repo's + // pack build does not stall the tokio runtime. + let pack = { + let repo_path = repo_path.to_path_buf(); + let withheld = withheld.clone(); + tokio::task::spawn_blocking(move || build_filtered_pack(&repo_path, &withheld)) + .await + .context("filtered-pack build task panicked")?? + }; + + // The client lists its capabilities on the first `want` line. Honor + // side-band-64k when offered (every modern smart-HTTP client offers it); + // otherwise stream the raw pack after NAK. + let sideband = memmem(&request_body, b"side-band-64k"); + + let mut body = Vec::new(); + body.extend_from_slice(&pkt_line("NAK\n")); + if sideband { + // Band 1 carries pack data, chunked under the pkt-line size limit. + for chunk in pack.chunks(65515) { + let mut framed = Vec::with_capacity(chunk.len() + 1); + framed.push(0x01); + framed.extend_from_slice(chunk); + let len = framed.len() + 4; + body.extend_from_slice(format!("{len:04x}").as_bytes()); + body.extend_from_slice(&framed); + } + body.extend_from_slice(b"0000"); + } else { + body.extend_from_slice(&pack); + } + + Ok(Response::builder() + .status(StatusCode::OK) + .header("Content-Type", "application/x-git-upload-pack-result") + .header("Cache-Control", "no-cache") + .body(Body::from(body))?) +} + +/// True if `needle` occurs anywhere in `haystack`. Small substring scan used to +/// detect a client capability token in the upload-pack request body. +fn memmem(haystack: &[u8], needle: &[u8]) -> bool { + if needle.is_empty() || haystack.len() < needle.len() { + return needle.is_empty(); + } + haystack + .windows(needle.len()) + .any(|window| window == needle) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::process::Command; + use tempfile::TempDir; + + /// List OIDs in a pack by writing it to a temp dir and running verify-pack. + pub(super) fn pack_object_ids(pack: &[u8]) -> std::collections::HashSet { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("test.pack"); + std::fs::write(&path, pack).unwrap(); + // index-pack creates the matching .idx next to the pack. + let ok = Command::new("git") + .args(["index-pack", path.to_str().unwrap()]) + .status() + .unwrap() + .success(); + assert!(ok, "index-pack failed"); + let out = Command::new("git") + .args(["verify-pack", "-v", path.to_str().unwrap()]) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout) + .lines() + .filter_map(|l| l.split_whitespace().next()) + .filter(|t| t.len() == 40 && t.chars().all(|c| c.is_ascii_hexdigit())) + .map(|s| s.to_string()) + .collect() + } + + #[tokio::test] + async fn filtered_serve_excludes_withheld_blob() { + // Build a bare repo, capture the secret + public blob OIDs. + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret = oid("secret/b.txt"); + let public = oid("public/a.txt"); + g( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret.clone()); + + let pack = build_filtered_pack(&bare, &withheld).unwrap(); + let ids = pack_object_ids(&pack); + assert!(ids.contains(&public), "public blob must be in the pack"); + assert!( + !ids.contains(&secret), + "secret blob must NOT be in the pack" + ); + } + + #[tokio::test] + async fn client_clone_lacks_withheld_blob_bytes() { + use axum::body::to_bytes; + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let g = |args: &[&str], dir: &std::path::Path| { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + }; + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + g(&["init", "-q"], &work); + g(&["config", "user.email", "t@t"], &work); + g(&["config", "user.name", "t"], &work); + g(&["add", "."], &work); + g(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret_oid = oid("secret/b.txt"); + let public_oid = oid("public/a.txt"); + g( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + + let mut withheld = std::collections::HashSet::new(); + withheld.insert(secret_oid.clone()); + + // A realistic v0 request advertises side-band-64k, so the serve frames + // the pack in band 1 (the path real clients exercise). + let req = Bytes::from_static( + b"0098want 0000000000000000000000000000000000000000 \ + side-band-64k ofs-delta agent=git/2\n00000009done\n", + ); + let resp = upload_pack_excluding(&bare, req, &withheld).await.unwrap(); + let body = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let ids = pack_object_ids(&extract_pack(&body)); + assert!( + ids.contains(&public_oid), + "public blob must be present in served pack" + ); + assert!( + !ids.contains(&secret_oid), + "withheld blob must be absent from served pack" + ); + } + + /// Strip the v0 upload-pack framing (NAK line + sideband-64k bands), + /// returning the raw pack. Mirrors how a client de-frames the band-1 stream. + fn extract_pack(body: &[u8]) -> Vec { + let mut out = Vec::new(); + let mut i = 0; + while i + 4 <= body.len() { + let len = + usize::from_str_radix(std::str::from_utf8(&body[i..i + 4]).unwrap_or("0000"), 16) + .unwrap_or(0); + if len == 0 { + i += 4; + continue; + } + let chunk = &body[i + 4..i + len]; + // band 1 = pack data; skip the NAK line and any other bands. + if chunk.first() == Some(&0x01) { + out.extend_from_slice(&chunk[1..]); + } + i += len; + } + out + } + + // Shared harness for the real-git server tests: a minimal smart-HTTP server + // backed by the real info_refs + upload_pack_excluding. + + #[derive(Clone)] + struct FilterState { + repo: std::path::PathBuf, + withheld: HashSet, + } + + async fn refs_handler( + axum::extract::State(st): axum::extract::State>, + axum::extract::Query(q): axum::extract::Query>, + ) -> Response { + let service = q.get("service").cloned().unwrap_or_default(); + info_refs(&st.repo, &service).await.unwrap() + } + + async fn pack_handler( + axum::extract::State(st): axum::extract::State>, + body: Bytes, + ) -> Response { + upload_pack_excluding(&st.repo, body, &st.withheld) + .await + .unwrap() + } + + /// Spawn the server for `bare`, withholding `withheld`. Returns the clone URL + /// and the server task (abort it when done). + async fn spawn_filter_server( + bare: std::path::PathBuf, + withheld: HashSet, + ) -> (String, tokio::task::JoinHandle<()>) { + use axum::routing::{get, post}; + let state = std::sync::Arc::new(FilterState { + repo: bare, + withheld, + }); + let app = axum::Router::new() + .route("/repo.git/info/refs", get(refs_handler)) + .route("/repo.git/git-upload-pack", post(pack_handler)) + .with_state(state); + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + let handle = tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + (format!("http://127.0.0.1:{port}/repo.git"), handle) + } + + fn run_git(args: &[&str], dir: &std::path::Path) { + assert!(Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success()); + } + + /// Build a work repo (public/a.txt, secret/b.txt) and a bare clone of it. + /// Returns (work, bare, secret_blob_oid, public_blob_oid). + fn fixture_with_secret( + td: &TempDir, + ) -> (std::path::PathBuf, std::path::PathBuf, String, String) { + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"pub\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"SECRET\n").unwrap(); + run_git(&["init", "-q"], &work); + run_git(&["config", "user.email", "t@t"], &work); + run_git(&["config", "user.name", "t"], &work); + run_git(&["add", "."], &work); + run_git(&["commit", "-qm", "init"], &work); + let oid = |p: &str| { + let o = Command::new("git") + .args(["rev-parse", &format!("HEAD:{p}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + let secret_oid = oid("secret/b.txt"); + let public_oid = oid("public/a.txt"); + run_git( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + (work, bare, secret_oid, public_oid) + } + + /// Enumerate exactly the objects a repo physically has (no promisor lazy + /// fetch), so tests assert on what bytes actually crossed the wire. + fn local_object_ids(repo: &std::path::Path) -> String { + let out = Command::new("git") + .args(["cat-file", "--batch-all-objects", "--batch-check"]) + .current_dir(repo) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).into_owned() + } + + /// End-to-end: a real `git` client clones through `info_refs` + + /// `upload_pack_excluding` and ends up without the withheld blob's bytes + /// while still seeing its tree entry (SHA). Uses a partial clone + /// (`--filter`) because a pack that omits a referenced blob is only + /// accepted by a promisor-aware client; a stock full clone is refused at + /// fetch time by the connectivity check. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn real_git_partial_clone_omits_withheld_blob() { + let td = TempDir::new().unwrap(); + let (_work, bare, secret_oid, public_oid) = fixture_with_secret(&td); + + let (url, server) = spawn_filter_server(bare, HashSet::from([secret_oid.clone()])).await; + + let dest = td.path().join("clone"); + let dest_s = dest.to_str().unwrap().to_string(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args([ + "-c", + "protocol.version=2", + "clone", + "--filter=blob:none", + "--no-checkout", + "-q", + &url, + &dest_s, + ]) + .output() + .unwrap() + }) + .await + .unwrap(); + + assert!( + out.status.success(), + "clone failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // The public blob is present in the clone, the withheld blob is not. + let local = local_object_ids(&dest); + assert!( + local.contains(&public_oid), + "public blob should be present in the clone" + ); + assert!( + !local.contains(&secret_oid), + "withheld blob bytes must be absent from the clone" + ); + + // The tree entry (and SHA) for the private file is still visible. + let tree = Command::new("git") + .args(["ls-tree", "-r", "HEAD"]) + .current_dir(&dest) + .output() + .unwrap(); + let tree = String::from_utf8_lossy(&tree.stdout); + assert!( + tree.contains(&secret_oid) && tree.contains("secret/b.txt"), + "the private path and its blob SHA must remain visible: {tree}" + ); + + server.abort(); + } + + /// End-to-end: an incremental `git fetch` after a partial clone still works + /// and still withholds the private blob. The serve path ignores the client's + /// have/want negotiation and always sends a self-contained pack of all refs + /// minus the withheld blobs (it replies NAK, so the client treats it as "no + /// common commits" and accepts the full set). This is correct, just not + /// bandwidth-optimal; thin-pack/negotiation is an optimization follow-up. + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn real_git_fetch_after_partial_clone_still_withholds() { + let td = TempDir::new().unwrap(); + let (work, bare, secret_oid, _public_oid) = fixture_with_secret(&td); + let branch = { + let o = Command::new("git") + .args(["symbolic-ref", "--short", "HEAD"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + + let (url, server) = + spawn_filter_server(bare.clone(), HashSet::from([secret_oid.clone()])).await; + + // Partial-clone the initial state. + let dest = td.path().join("clone"); + let dest_s = dest.to_str().unwrap().to_string(); + let url_c = url.clone(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args([ + "-c", + "protocol.version=2", + "clone", + "--filter=blob:none", + "--no-checkout", + "-q", + &url_c, + &dest_s, + ]) + .output() + .unwrap() + }) + .await + .unwrap(); + assert!( + out.status.success(), + "clone failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // Add a new public commit on the server side. + std::fs::write(work.join("public/c.txt"), b"v2\n").unwrap(); + run_git(&["add", "."], &work); + run_git(&["commit", "-qm", "c2"], &work); + let new_oid = { + let o = Command::new("git") + .args(["rev-parse", "HEAD:public/c.txt"]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&o.stdout).trim().to_string() + }; + run_git(&["push", "-q", bare.to_str().unwrap(), &branch], &work); + + // Incremental fetch: the client has c1 and asks for the update. + let dest_f = dest.clone(); + let out = tokio::task::spawn_blocking(move || { + Command::new("git") + .args(["-c", "protocol.version=2", "fetch", "-q", "origin"]) + .current_dir(&dest_f) + .output() + .unwrap() + }) + .await + .unwrap(); + assert!( + out.status.success(), + "fetch failed: {}", + String::from_utf8_lossy(&out.stderr) + ); + + // The new commit's blob arrived; the withheld blob is still absent. + let local = local_object_ids(&dest); + assert!( + local.contains(&new_oid), + "the new commit's blob must be fetched" + ); + assert!( + !local.contains(&secret_oid), + "withheld blob must remain absent after fetch" + ); + + server.abort(); + } +} diff --git a/crates/gitlawb-node/src/git/visibility_pack.rs b/crates/gitlawb-node/src/git/visibility_pack.rs new file mode 100644 index 0000000..c9c6d6b --- /dev/null +++ b/crates/gitlawb-node/src/git/visibility_pack.rs @@ -0,0 +1,233 @@ +//! Resolve which blob OIDs must be withheld from a caller because every path +//! at which the blob appears is denied by the repo's visibility rules. Trees +//! and commits are never withheld (mode B keeps SHAs intact); only blob +//! content is held back. + +use crate::db::VisibilityRule; +use crate::git::store; +use crate::visibility::{visibility_check, Decision}; +use anyhow::{Context, Result}; +use std::collections::HashSet; +use std::path::Path; + +/// List every (blob_oid, "/repo/relative/path") pair reachable from any branch +/// ref in `repo_path`. Uses `git ls-tree -r` per ref so each path a blob lives +/// at is represented (the same blob content can appear at several paths). Paths +/// are returned with a leading "/" to match the glob form used by visibility +/// rules ("/secret/**"). +fn blob_paths(repo_path: &Path) -> Result> { + let refs = store::list_refs(repo_path).context("list_refs failed")?; + let mut out = Vec::new(); + for (refname, _oid) in refs { + if !refname.starts_with("refs/heads/") && !refname.starts_with("refs/tags/") { + continue; + } + let listing = std::process::Command::new("git") + .args(["ls-tree", "-r", &refname]) + .current_dir(repo_path) + .output() + .context("git ls-tree -r failed")?; + if !listing.status.success() { + continue; + } + for line in String::from_utf8_lossy(&listing.stdout).lines() { + // " blob \t" + let Some((meta, path)) = line.split_once('\t') else { + continue; + }; + let mut parts = meta.split_whitespace(); + let _mode = parts.next(); + let kind = parts.next(); + let oid = parts.next(); + if kind == Some("blob") { + if let Some(oid) = oid { + out.push((oid.to_string(), format!("/{path}"))); + } + } + } + } + Ok(out) +} + +/// Blob OIDs the caller may not read. A blob is withheld only if visibility +/// denies the caller at *every* path the blob appears at; a blob that is also +/// reachable through an allowed path is sent (its content is public elsewhere). +/// +/// The whole-repo "/" gate is handled by the caller before this function runs: +/// if "/" denies, the caller gets a 404 and never reaches the filtered serve. +pub fn withheld_blob_oids( + repo_path: &Path, + rules: &[VisibilityRule], + is_public: bool, + owner_did: &str, + caller: Option<&str>, +) -> Result> { + let mut denied: HashSet = HashSet::new(); + let mut allowed: HashSet = HashSet::new(); + for (oid, path) in blob_paths(repo_path)? { + match visibility_check(rules, is_public, owner_did, caller, &path) { + Decision::Deny => { + denied.insert(oid); + } + Decision::Allow => { + allowed.insert(oid); + } + } + } + Ok(denied.difference(&allowed).cloned().collect()) +} + +/// Objects that may replicate to the public: everything not in `withheld`. +/// Order-preserving. The single seam every replication site (IPFS, Pinata) +/// passes its object list through; option B would later reroute the withheld +/// ones through encrypt-then-pin instead of dropping them. +pub fn replicable_objects(all: Vec, withheld: &HashSet) -> Vec { + all.into_iter() + .filter(|oid| !withheld.contains(oid)) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::VisibilityMode; + use chrono::Utc; + use std::process::Command; + use tempfile::TempDir; + + fn rule(path_glob: &str, readers: &[&str]) -> VisibilityRule { + VisibilityRule { + id: "x".into(), + repo_id: "r1".into(), + path_glob: path_glob.into(), + mode: VisibilityMode::B, + reader_dids: readers.iter().map(|s| s.to_string()).collect(), + created_by: "did:key:zOwner".into(), + created_at: Utc::now(), + } + } + + const OWNER: &str = "did:key:zOwner"; + + /// Build a bare repo with public/a.txt and secret/b.txt at one commit. + /// Returns (tempdir, bare_path, secret_blob_oid, public_blob_oid). + fn fixture() -> (TempDir, std::path::PathBuf, String, String) { + let td = TempDir::new().unwrap(); + let work = td.path().join("work"); + let bare = td.path().join("bare.git"); + let run = |args: &[&str], dir: &Path| { + let ok = Command::new("git") + .args(args) + .current_dir(dir) + .status() + .unwrap() + .success(); + assert!(ok, "git {args:?} failed"); + }; + std::fs::create_dir_all(work.join("public")).unwrap(); + std::fs::create_dir_all(work.join("secret")).unwrap(); + std::fs::write(work.join("public/a.txt"), b"public bytes\n").unwrap(); + std::fs::write(work.join("secret/b.txt"), b"TOP SECRET\n").unwrap(); + run(&["init", "-q"], &work); + run(&["config", "user.email", "t@t"], &work); + run(&["config", "user.name", "t"], &work); + run(&["add", "."], &work); + run(&["commit", "-qm", "init"], &work); + let oid = |path: &str| { + let out = Command::new("git") + .args(["rev-parse", &format!("HEAD:{path}")]) + .current_dir(&work) + .output() + .unwrap(); + String::from_utf8_lossy(&out.stdout).trim().to_string() + }; + let secret = oid("secret/b.txt"); + let public = oid("public/a.txt"); + run( + &[ + "clone", + "-q", + "--bare", + work.to_str().unwrap(), + bare.to_str().unwrap(), + ], + td.path(), + ); + (td, bare, secret, public) + } + + #[test] + fn anonymous_caller_withholds_only_private_blob() { + let (_td, bare, secret_oid, public_oid) = fixture(); + let rules = [rule("/secret/**", &[])]; + // caller = None models the public / any peer: what must not replicate. + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, None).unwrap(); + assert!( + withheld.contains(&secret_oid), + "secret blob must be withheld" + ); + assert!( + !withheld.contains(&public_oid), + "public blob must replicate" + ); + // Trees and commits are never withheld; the set holds only the secret blob. + assert_eq!(withheld.len(), 1, "only the secret blob OID is withheld"); + } + + #[test] + fn non_reader_withholds_only_the_private_blob() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zStranger")).unwrap(); + assert!(withheld.contains(&secret), "secret blob must be withheld"); + assert!( + !withheld.contains(&public), + "public blob must NOT be withheld" + ); + } + + #[test] + fn owner_withholds_nothing() { + let (_td, bare, secret, public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = withheld_blob_oids(&bare, &rules, true, OWNER, Some(OWNER)).unwrap(); + assert!(withheld.is_empty(), "owner sees everything"); + let _ = (secret, public); + } + + #[test] + fn listed_reader_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let rules = [rule("/secret/**", &["did:key:zFriend"])]; + let withheld = + withheld_blob_oids(&bare, &rules, true, OWNER, Some("did:key:zFriend")).unwrap(); + assert!(withheld.is_empty(), "listed reader sees the subtree"); + } + + #[test] + fn no_subtree_rules_withholds_nothing() { + let (_td, bare, _secret, _public) = fixture(); + let withheld = withheld_blob_oids(&bare, &[], true, OWNER, None).unwrap(); + assert!( + withheld.is_empty(), + "public repo, no rules, nothing withheld" + ); + } + + #[test] + fn replicable_objects_drops_withheld_keeps_rest() { + let all = vec!["aaa".to_string(), "bbb".to_string(), "ccc".to_string()]; + let withheld: HashSet = ["bbb".to_string()].into_iter().collect(); + let got = replicable_objects(all, &withheld); + assert_eq!(got, vec!["aaa".to_string(), "ccc".to_string()]); + } + + #[test] + fn replicable_objects_empty_withheld_keeps_all() { + let all = vec!["aaa".to_string(), "bbb".to_string()]; + let withheld: HashSet = HashSet::new(); + let got = replicable_objects(all.clone(), &withheld); + assert_eq!(got, all); + } +} diff --git a/crates/gitlawb-node/src/ipfs_pin.rs b/crates/gitlawb-node/src/ipfs_pin.rs index 831f1ad..96d6abd 100644 --- a/crates/gitlawb-node/src/ipfs_pin.rs +++ b/crates/gitlawb-node/src/ipfs_pin.rs @@ -7,6 +7,8 @@ //! If `ipfs_api` is empty the functions are no-ops, so the node works fine //! without a local IPFS daemon. +use std::collections::HashSet; + use anyhow::Result; use gitlawb_core::cid::Cid; @@ -78,6 +80,7 @@ pub async fn pin_new_objects( ipfs_api: &str, repo_path: &std::path::Path, db: &crate::db::Db, + withheld: &HashSet, ) -> Vec<(String, String)> { if ipfs_api.is_empty() { return vec![]; @@ -92,6 +95,8 @@ pub async fn pin_new_objects( } }; + let object_list = crate::git::visibility_pack::replicable_objects(object_list, withheld); + let mut pinned = Vec::new(); for sha in object_list { diff --git a/crates/gitlawb-node/src/pinata.rs b/crates/gitlawb-node/src/pinata.rs index ee9d416..90bddad 100644 --- a/crates/gitlawb-node/src/pinata.rs +++ b/crates/gitlawb-node/src/pinata.rs @@ -7,6 +7,7 @@ //! no-op, so nodes without Pinata backing work fine. use anyhow::Result; +use std::collections::HashSet; /// Pin a single git object's raw bytes on Pinata (v3 API). /// @@ -76,6 +77,7 @@ pub async fn pin_new_objects( jwt: &str, repo_path: &std::path::Path, db: &crate::db::Db, + withheld: &HashSet, ) -> Vec<(String, String)> { if jwt.is_empty() { return vec![]; @@ -92,6 +94,7 @@ pub async fn pin_new_objects( return vec![]; } }; + let object_list = crate::git::visibility_pack::replicable_objects(object_list, withheld); let mut pinned = Vec::new(); diff --git a/crates/gitlawb-node/src/visibility.rs b/crates/gitlawb-node/src/visibility.rs index b246dbf..1107de7 100644 --- a/crates/gitlawb-node/src/visibility.rs +++ b/crates/gitlawb-node/src/visibility.rs @@ -242,4 +242,24 @@ mod tests { Decision::Allow ); } + + // Mirrors the gossip-announce gate in git_receive_pack: announce iff an + // anonymous caller can read "/". + #[test] + fn announce_gate_matches_public_readability() { + let announce = |rules: &[VisibilityRule], is_public: bool| { + visibility_check(rules, is_public, OWNER, None, "/") == Decision::Allow + }; + // Public repo, no rules → announce. + assert!(announce(&[], true)); + // Legacy private repo (is_public false, no rules) → silent. + assert!(!announce(&[], false)); + // Mode A whole-repo rule with no public readers → silent. + assert!(!announce(&[rule("/", VisibilityMode::A, &[])], true)); + // Mode B public repo with a private subtree → still announce. + assert!(announce( + &[rule("/secret/**", VisibilityMode::B, &[])], + true + )); + } }