From 4913668eea1141d26431aa7db99aef2ca65e03fe Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Mon, 18 May 2026 15:07:03 -0400 Subject: [PATCH 1/8] fix git import speeds --- atomic-cli/src/commands/git/parallel.rs | 391 ++++++++++++------ atomic-core/src/apply/graph_batch.rs | 28 ++ atomic-repository/src/repository/insert.rs | 454 +++++++++++++++++++++ atomic-repository/src/repository/mod.rs | 1 + tests/harness/10_git_import.sh | 15 +- 5 files changed, 766 insertions(+), 123 deletions(-) diff --git a/atomic-cli/src/commands/git/parallel.rs b/atomic-cli/src/commands/git/parallel.rs index 55039c3..1223de3 100644 --- a/atomic-cli/src/commands/git/parallel.rs +++ b/atomic-cli/src/commands/git/parallel.rs @@ -65,7 +65,7 @@ use git2::{ }; use rayon::prelude::*; -use atomic_core::change::{Author, Change, ChangeHeader}; +use atomic_core::change::{Author, ChangeHeader}; use atomic_core::change::{Encoding, Local}; use atomic_core::record::workflow::graph_op::BuiltHunk; use atomic_core::record::workflow::GitDiffLine; @@ -224,6 +224,38 @@ fn count_line_units(content: &[u8]) -> usize { } } +fn trace_git_import_enabled() -> bool { + std::env::var_os("ATOMIC_TRACE_GIT_IMPORT").is_some() +} + +fn trace_git_import(message: impl AsRef) { + if trace_git_import_enabled() { + eprintln!("[git-import] {}", message.as_ref()); + } +} + +fn should_detect_renames(diff: &Diff) -> bool { + let mut adds = 0usize; + let mut deletes = 0usize; + + for delta in diff.deltas() { + match delta.status() { + Delta::Added => adds += 1, + Delta::Deleted => deletes += 1, + _ => {} + } + } + + if adds == 0 || deletes == 0 { + return false; + } + + // libgit2 rename detection is similarity matching over candidate + // add/delete pairs. On root imports or vendored-tree rewrites this can + // dominate the whole import before Atomic sees a single ParsedCommit. + adds.saturating_mul(deletes) <= 250_000 +} + fn record_generated_full_replace( path: &str, new_content: &[u8], @@ -265,6 +297,101 @@ fn record_generated_full_replace( recorded } +fn record_git_diff_add_fast( + path: &str, + new_content: &[u8], + diff_lines: &[GitDiffLine], + kind: atomic_core::record::workflow::DetectionKind, +) -> Option { + let encoding = Encoding::detect(new_content); + if encoding == Encoding::Binary { + return None; + } + + let mut recorded = RecordedFile::new(path); + recorded.set_kind(kind); + recorded.set_encoding(encoding); + recorded.add_hunk(BuiltHunk::new_edit( + Local::new(path, 1), + Some(encoding), + 0, + new_content.len() as u64, + )); + recorded.set_content(new_content.to_vec()); + + let (git_file_ops, git_stats) = + atomic_core::record::workflow::build_crdt_ops_from_git_diff(path, diff_lines); + recorded.set_crdt_ops(git_file_ops); + recorded.set_crdt_stats(git_stats); + Some(recorded) +} + +fn build_linewise_crdt_ops_for_added_file( + path: &str, + content: &[u8], + encoding: Encoding, +) -> ( + atomic_core::change::FileOps, + atomic_core::record::workflow::CrdtBuildStats, +) { + use atomic_core::change::LineOps; + use atomic_core::crdt::{BranchId, BranchOp, TrunkId}; + use atomic_core::types::NodeId; + + let placeholder_change_id = NodeId::new(0); + let trunk_id = TrunkId::new(placeholder_change_id, 0); + let enc = if encoding == Encoding::Binary { + None + } else { + Some(encoding) + }; + let mut file_ops = atomic_core::change::FileOps::create(trunk_id, path.to_string(), enc); + let mut stats = atomic_core::record::workflow::CrdtBuildStats::new(); + stats.files_added = 1; + + let mut prev_branch: Option = None; + for (line_idx, _line) in content.split_inclusive(|&b| b == b'\n').enumerate() { + let branch_id = BranchId::new(placeholder_change_id, line_idx as u32); + let line_ops = LineOps::new_with_line_nums( + branch_id, + BranchOp::Insert { + after: prev_branch, + content: Vec::new(), + }, + None, + Some(line_idx + 1), + ); + file_ops.add_line_op(line_ops); + stats.lines_added += 1; + prev_branch = Some(branch_id); + } + + (file_ops, stats) +} + +fn record_git_import_add_linewise(path: &str, new_content: &[u8]) -> Option { + let encoding = Encoding::detect(new_content); + if encoding == Encoding::Binary { + return None; + } + + let mut recorded = RecordedFile::new(path); + recorded.set_kind(atomic_core::record::workflow::DetectionKind::Added); + recorded.set_encoding(encoding); + recorded.add_hunk(BuiltHunk::new_edit( + Local::new(path, 1), + Some(encoding), + 0, + new_content.len() as u64, + )); + recorded.set_content(new_content.to_vec()); + + let (file_ops, stats) = build_linewise_crdt_ops_for_added_file(path, new_content, encoding); + recorded.set_crdt_ops(file_ops); + recorded.set_crdt_stats(stats); + Some(recorded) +} + impl ParallelImporter { /// Create a new parallel importer. pub fn new(git_repo: &GitRepository, options: ParallelImportOptions) -> Self { @@ -778,6 +905,21 @@ impl ParallelImporter { Some(c) => c.as_slice(), None => continue, }; + if let Some(ref diff_lines) = file.diff_lines { + if let Some(rec) = record_git_diff_add_fast( + &file.path, + content, + diff_lines, + atomic_core::record::workflow::DetectionKind::Added, + ) { + recorded_files.push(rec); + continue; + } + } + if let Some(rec) = record_git_import_add_linewise(&file.path, content) { + recorded_files.push(rec); + continue; + } memory_wc.add_file(&file.path, content); let detected = DetectedFile::added(&file.path); match record_added_file(&memory_wc, &detected, &core_options) { @@ -1085,77 +1227,18 @@ impl ParallelImporter { ); } - // Assemble the change from recorded files - if recorded_files.is_empty() { - let mut change = Change::empty(header); - change.unhashed = Some(self.build_git_metadata(parsed, false, true)); - let hash = change.hash().map_err(|e| CliError::Internal(e.into()))?; - repo.save_change(&change) - .map_err(|e| CliError::Internal(e.into()))?; - repo.insert_change(&hash, Default::default()) - .map_err(|e| CliError::Internal(e.into()))?; - return Ok(true); - } - - let step_start = Instant::now(); - let (mut change, hash) = match repo.assemble_and_hash(header.clone(), &recorded_files) { - Ok(result) => result, - Err(e) => { - // Globalization may strip all hunks (e.g., pure deletion commits - // where find_content_vertices returns empty for already-deleted - // files). Fall back to an empty change — the explicit - // repo.remove() cleanup below still handles the TREE entries. - let err_msg = e.to_string(); - if err_msg.contains("empty") || err_msg.contains("AllEmpty") { - let mut empty = Change::empty(header); - empty.unhashed = Some(self.build_git_metadata(parsed, false, true)); - let h = empty.hash().map_err(|e| CliError::Internal(e.into()))?; - repo.save_change(&empty) - .map_err(|e| CliError::Internal(e.into()))?; - repo.insert_change(&h, Default::default()) - .map_err(|e| CliError::Internal(e.into()))?; - - // Still clean up deleted files from TREE and FILE_INDEX - if !deleted_paths.is_empty() { - let del_refs: Vec<&str> = - deleted_paths.iter().map(|s| s.as_str()).collect(); - let _ = repo.remove_batch(&del_refs); - let _ = repo.del_file_index_batch(&del_refs); - } - - return Ok(true); - } - return Err(CliError::Internal(e.into())); - } - }; - let assemble_ms = step_start.elapsed().as_millis(); - - change.unhashed = Some(self.build_git_metadata(parsed, false, false)); - - // Save and insert - let step_start = Instant::now(); - repo.save_change(&change) - .map_err(|e| CliError::Internal(e.into()))?; - let save_ms = step_start.elapsed().as_millis(); - - let step_start = Instant::now(); - repo.insert_change(&hash, Default::default()) + let metadata = self.build_git_metadata(parsed, false, recorded_files.is_empty()); + let write_start = Instant::now(); + let write_outcome = repo + .write_import_recorded( + header, + &recorded_files, + metadata, + &deleted_paths, + Default::default(), + ) .map_err(|e| CliError::Internal(e.into()))?; - let insert_ms = step_start.elapsed().as_millis(); - - // Log slow commits (>50ms total) so we can identify the bottleneck - let total_ms = assemble_ms + save_ms + insert_ms; - if total_ms > 50 { - log::info!( - " SLOW commit {} ({} files): assemble={}ms save={}ms insert={}ms total={}ms", - parsed.short_sha, - parsed.files.len(), - assemble_ms, - save_ms, - insert_ms, - total_ms, - ); - } + let write_ms = write_start.elapsed().as_millis(); // Files deleted via record_modified_file (the "show diff lines" path) // produce GraphOp::Replacement, not GraphOp::FileDel, so insert_change @@ -1164,9 +1247,44 @@ impl ParallelImporter { // Also remove from FILE_INDEX so status doesn't show them as deleted. // Batch-remove deleted files from TREE and FILE_INDEX in single write txns. if !deleted_paths.is_empty() { + let cleanup_start = Instant::now(); let del_refs: Vec<&str> = deleted_paths.iter().map(|s| s.as_str()).collect(); - let _ = repo.remove_batch(&del_refs); let _ = repo.del_file_index_batch(&del_refs); + let cleanup_ms = cleanup_start.elapsed().as_millis(); + trace_git_import(format!( + "write {} files={} recorded={} add_batch={}ms record={}ms assemble={}ms save={}ms apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms cleanup={}ms writer_total={}ms total={}ms", + parsed.short_sha, + parsed.files.len(), + recorded_files.len(), + add_batch_ms, + record_ms, + write_outcome.timings.assemble_ms, + write_outcome.timings.save_ms, + write_outcome.timings.apply_ms, + write_outcome.timings.direct_graph_ms, + write_outcome.timings.direct_crdt_ms, + write_outcome.timings.commit_ms, + cleanup_ms, + write_ms, + commit_start.elapsed().as_millis() + )); + } else { + trace_git_import(format!( + "write {} files={} recorded={} add_batch={}ms record={}ms assemble={}ms save={}ms apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms cleanup=0ms writer_total={}ms total={}ms", + parsed.short_sha, + parsed.files.len(), + recorded_files.len(), + add_batch_ms, + record_ms, + write_outcome.timings.assemble_ms, + write_outcome.timings.save_ms, + write_outcome.timings.apply_ms, + write_outcome.timings.direct_graph_ms, + write_outcome.timings.direct_crdt_ms, + write_outcome.timings.commit_ms, + write_ms, + commit_start.elapsed().as_millis() + )); } Ok(true) @@ -1179,16 +1297,23 @@ impl ParallelImporter { parsed: &ParsedCommit, header: ChangeHeader, ) -> CliResult { - let mut change = Change::empty(header); - change.unhashed = Some(self.build_git_metadata(parsed, true, false)); - - let hash = change.hash().map_err(|e| CliError::Internal(e.into()))?; - - repo.save_change(&change) + let commit_start = Instant::now(); + let metadata = self.build_git_metadata(parsed, true, false); + let write_outcome = repo + .write_import_recorded(header, &[], metadata, &[], Default::default()) .map_err(|e| CliError::Internal(e.into()))?; - repo.insert_change(&hash, Default::default()) - .map_err(|e| CliError::Internal(e.into()))?; + trace_git_import(format!( + "write {} files=0 recorded=0 add_batch=0ms record=0ms assemble={}ms save={}ms apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms cleanup=0ms total={}ms empty_commit=true", + parsed.short_sha, + write_outcome.timings.assemble_ms, + write_outcome.timings.save_ms, + write_outcome.timings.apply_ms, + write_outcome.timings.direct_graph_ms, + write_outcome.timings.direct_crdt_ms, + write_outcome.timings.commit_ms, + commit_start.elapsed().as_millis() + )); Ok(true) } @@ -1281,6 +1406,7 @@ fn parse_commit( _index: usize, oid_to_index: &std::collections::HashMap, ) -> CliResult { + let parse_start = Instant::now(); let commit = git_repo.find_commit(oid).map_err(|e| CliError::GitError { message: format!("Failed to find commit {}: {}", oid, e), })?; @@ -1330,11 +1456,13 @@ fn parse_commit( let mut diff_opts = DiffOptions::new(); diff_opts.include_untracked(false); + let diff_start = Instant::now(); let mut diff = git_repo .diff_tree_to_tree(parent_tree.as_ref(), Some(&tree), Some(&mut diff_opts)) .map_err(|e| CliError::GitError { message: format!("Failed to compute diff: {}", e), })?; + let diff_ms = diff_start.elapsed().as_millis(); // Apply rename detection — mirrors what git CLI does after computing // the initial diff. This correctly classifies renamed files as R deltas @@ -1350,21 +1478,38 @@ fn parse_commit( // src/export/markdown.rs while adding src/export/tests.rs (52% // similar) would be misclassified as a rename of markdown→tests, // orphaning markdown.rs from the TREE. - let mut find_opts = DiffFindOptions::new(); - find_opts.renames(true); - let _ = diff.find_similar(Some(&mut find_opts)); - - let stats = diff.stats().map_err(|e| CliError::GitError { - message: format!("Failed to get diff stats: {}", e), - })?; + let rename_start = Instant::now(); + let detected_renames = should_detect_renames(&diff); + if detected_renames { + let mut find_opts = DiffFindOptions::new(); + find_opts.renames(true); + let _ = diff.find_similar(Some(&mut find_opts)); + } else { + log::debug!( + "parse_commit {}: skipping rename detection for large/add-only diff", + short_sha + ); + } + let rename_ms = rename_start.elapsed().as_millis(); // Parse files. Pure rename commits can show up as zero-stat directory // modifications in libgit2; when that happens, fall back to recursive // `git diff-tree -r -M` name-status output for per-file entries. - let mut files = parse_diff_files(git_repo, &diff, &tree, parent_tree.as_ref())?; - if stats.files_changed() == 0 { + let capture_diff_lines = parent_tree.is_some(); + let files_start = Instant::now(); + let mut files = parse_diff_files( + git_repo, + &diff, + &tree, + parent_tree.as_ref(), + capture_diff_lines, + )?; + let mut parse_files_ms = files_start.elapsed().as_millis(); + if files.is_empty() { if let Some(ref pt) = parent_tree { + let fallback_start = Instant::now(); let fallback = parse_diff_files_via_git_cli(git_repo, oid, pt.id(), &tree, pt)?; + parse_files_ms += fallback_start.elapsed().as_millis(); if !fallback.is_empty() { files = fallback; } @@ -1372,6 +1517,19 @@ fn parse_commit( } let is_empty = files.is_empty(); + trace_git_import(format!( + "parse {} files={} merge={} empty={} diff={}ms rename={}ms(rename_detect={}) files={}ms total={}ms", + short_sha, + files.len(), + is_merge, + is_empty, + diff_ms, + rename_ms, + detected_renames, + parse_files_ms, + parse_start.elapsed().as_millis() + )); + Ok(ParsedCommit { git_sha: sha, short_sha, @@ -1420,6 +1578,7 @@ fn parse_diff_files( diff: &Diff, tree: &Tree, parent_tree: Option<&Tree>, + capture_diff_lines: bool, ) -> CliResult> { use std::collections::HashMap; @@ -1432,32 +1591,34 @@ fn parse_diff_files( // Map from file path → accumulated diff lines for that file. let mut lines_by_path: HashMap> = HashMap::new(); - let _ = diff.foreach( - &mut |_delta, _progress| true, // file_cb (no-op) - None, // binary_cb - None, // hunk_cb - Some(&mut |delta, _hunk, line| { - let origin = line.origin(); - // We only keep `+`, `-`, and context (` `) lines. - if origin != '+' && origin != '-' && origin != ' ' { - return true; - } - let path = delta - .new_file() - .path() - .or_else(|| delta.old_file().path()) - .map(|p| p.to_string_lossy().to_string()) - .unwrap_or_default(); - - lines_by_path.entry(path).or_default().push(GitDiffLine { - origin, - content: line.content().to_vec(), - old_lineno: line.old_lineno(), - new_lineno: line.new_lineno(), - }); - true - }), - ); + if capture_diff_lines { + let _ = diff.foreach( + &mut |_delta, _progress| true, // file_cb (no-op) + None, // binary_cb + None, // hunk_cb + Some(&mut |delta, _hunk, line| { + let origin = line.origin(); + // We only keep `+`, `-`, and context (` `) lines. + if origin != '+' && origin != '-' && origin != ' ' { + return true; + } + let path = delta + .new_file() + .path() + .or_else(|| delta.old_file().path()) + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(); + + lines_by_path.entry(path).or_default().push(GitDiffLine { + origin, + content: line.content().to_vec(), + old_lineno: line.old_lineno(), + new_lineno: line.new_lineno(), + }); + true + }), + ); + } // ── Step 2: build ParsedFile entries from the delta list ───────────── diff --git a/atomic-core/src/apply/graph_batch.rs b/atomic-core/src/apply/graph_batch.rs index e70b9e3..d59942a 100644 --- a/atomic-core/src/apply/graph_batch.rs +++ b/atomic-core/src/apply/graph_batch.rs @@ -72,6 +72,34 @@ impl<'txn> GraphWriteBatch<'txn> { Ok(()) } + + /// Add a canonical bidirectional GRAPH edge pair, but only the forward + /// adjacency row to INODE_GRAPH. + /// + /// This is useful for bulk import paths that can add terminal inode rows + /// after building a linear chain. The global GRAPH remains fully + /// bidirectional; only the file-local secondary index is compacted. + pub fn add_edge_with_reverse_inode_forward_only( + &mut self, + inode: Option, + flag: EdgeFlags, + source: GraphNode, + dest: GraphNode, + introduced_by: NodeId, + ) -> PristineResult<()> { + let forward_edge = SerializedGraphEdge::new(flag, dest.start_pos(), introduced_by); + let reverse_flag = flag | EdgeFlags::PARENT; + let reverse_edge = SerializedGraphEdge::new(reverse_flag, source.end_pos(), introduced_by); + + self.put_graph(source, forward_edge)?; + self.put_graph(dest, reverse_edge)?; + + if let Some(inode_val) = inode { + self.put_inode_graph(inode_val, source, forward_edge)?; + } + + Ok(()) + } } #[inline] diff --git a/atomic-repository/src/repository/insert.rs b/atomic-repository/src/repository/insert.rs index 0755cdd..103320a 100644 --- a/atomic-repository/src/repository/insert.rs +++ b/atomic-repository/src/repository/insert.rs @@ -5,6 +5,9 @@ use crate::apply::{ write_change_to_graph, CrossViewInsertOptions, CrossViewInsertOutcome, InsertOptions, InsertOutcome, InsertStats, }; +use atomic_core::change::Insertion; +use atomic_core::types::{ChangePosition, EdgeFlags, GraphNode, SerializedGraphEdge}; +use std::collections::{HashMap, HashSet}; /// Check whether a file's creating change exists ONLY on the given view /// (and no other view). Returns `true` when it is safe to remove the @@ -63,9 +66,460 @@ fn is_file_only_on_view( true } +/// Timing details for the git-import fresh-write path. +#[derive(Debug, Clone, Copy, Default)] +pub struct ImportWriteTimings { + pub assemble_ms: u128, + pub save_ms: u128, + pub apply_ms: u128, + pub commit_ms: u128, + pub direct_graph_ms: u128, + pub direct_crdt_ms: u128, +} + +/// Outcome from writing an already-recorded git-import commit. +#[derive(Debug, Clone)] +pub struct ImportWriteOutcome { + pub hash: Hash, + pub timings: ImportWriteTimings, + pub insert: InsertOutcome, +} + +fn import_direct_source( + pos: &Position>, + by_end: &HashMap>, +) -> Option> { + match pos.change { + Some(hash) if hash == Hash::NONE => Some(GraphNode::root()), + None => by_end.get(&pos.pos).copied(), + _ => None, + } +} + +fn import_direct_inode( + pos: &Position>, + inode_by_pos: &HashMap, +) -> Option { + match pos.change { + None => inode_by_pos.get(&pos.pos).copied(), + _ => None, + } +} + +fn import_direct_can_apply(change: &Change) -> bool { + if change.hunks().is_empty() { + return true; + } + + change.hunks().iter().all(|op| match op { + GraphOp::FileAdd { + add_name, + add_inode, + contents, + .. + } => { + add_name.successors.is_empty() + && add_inode.successors.is_empty() + && contents + .as_ref() + .map(|c| c.successors.is_empty()) + .unwrap_or(true) + } + GraphOp::Edit { + change: atomic_core::change::Atom::Insertion(insertion), + .. + } => { + insertion.successors.is_empty() + && insertion.predecessors.len() == 1 + && insertion.predecessors[0].change.is_none() + && insertion.inode.change.is_none() + } + _ => false, + }) +} + +fn import_direct_write_insertion( + batch: &mut atomic_core::apply::GraphWriteBatch<'_>, + change_id: NodeId, + insertion: &Insertion>, + by_end: &mut HashMap>, + inode_by_pos: &HashMap, + inode_sources: &mut HashSet<(u64, GraphNode)>, + inode_terminal_candidates: &mut Vec<(Inode, GraphNode, SerializedGraphEdge)>, +) -> Result<(), RepositoryError> { + if !insertion.successors.is_empty() || insertion.predecessors.len() != 1 { + return Err(RepositoryError::Apply( + "direct import insertion requires one predecessor and no successors".to_string(), + )); + } + + let source = import_direct_source(&insertion.predecessors[0], by_end).ok_or_else(|| { + RepositoryError::Apply(format!( + "direct import missing predecessor at {:?}", + insertion.predecessors[0] + )) + })?; + let dest = GraphNode { + change: change_id, + start: insertion.start, + end: insertion.end, + }; + let inode = import_direct_inode(&insertion.inode, inode_by_pos); + let flag = insertion.flag | EdgeFlags::BLOCK; + + batch + .add_edge_with_reverse_inode_forward_only(inode, flag, source, dest, change_id) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + if let Some(inode_val) = inode { + inode_sources.insert((inode_val.get(), source)); + let reverse_edge = + SerializedGraphEdge::new(flag | EdgeFlags::PARENT, source.end_pos(), change_id); + inode_terminal_candidates.push((inode_val, dest, reverse_edge)); + } + by_end.insert(dest.end, dest); + Ok(()) +} + impl Repository { // Change Insertion Methods + /// Assemble, save, and apply a freshly imported Git commit without going + /// through the normal `insert_change()` load/check path. + /// + /// This preserves the normal graph writer and CRDT table application, but + /// avoids reloading the just-saved change and avoids the `has_change_in_graph` + /// probe. The write transaction is opened before assembly, so globalization + /// and application share one consistent transaction view. + pub fn write_import_recorded( + &self, + header: ChangeHeader, + recorded_files: &[atomic_core::record::workflow::RecordedFile], + unhashed: serde_json::Value, + deleted_paths: &[String], + options: InsertOptions, + ) -> Result { + use atomic_core::record::workflow::assemble_change; + use atomic_core::record::workflow::assembly::AssemblyOptions; + + let mut timings = ImportWriteTimings::default(); + let view_name = options.view.as_deref().unwrap_or(&self.current_view); + + let mut txn = self + .pristine + .write_txn() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let assemble_start = std::time::Instant::now(); + let mut change = if recorded_files.is_empty() { + Change::empty(header) + } else { + match assemble_change(&txn, recorded_files, header.clone(), &AssemblyOptions::default()) + { + Ok(result) => result.into_change(), + Err(e) => { + let err_msg = e.to_string(); + if err_msg.contains("empty") || err_msg.contains("AllEmpty") { + Change::empty(header) + } else { + return Err(RepositoryError::Apply(e.to_string())); + } + } + } + }; + timings.assemble_ms = assemble_start.elapsed().as_millis(); + + change.unhashed = Some(unhashed); + + let mut v3_bytes = Vec::new(); + let hash = change + .serialize(&mut v3_bytes) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + let (final_change, verified_hash) = Change::deserialize(&mut v3_bytes.as_slice()) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + debug_assert_eq!(hash, verified_hash); + + let save_start = std::time::Instant::now(); + self.save_change_bytes(&hash, &v3_bytes, &final_change)?; + timings.save_ms = save_start.elapsed().as_millis(); + + let change_id = txn + .register_change(&hash) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_change_deps(change_id, final_change.dependencies()) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + for graph_op in final_change.hunks() { + match graph_op { + GraphOp::FileAdd { + add_inode, path, .. + } => { + let new_inode = txn + .alloc_inode() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + let inode_position = Position::new(change_id, add_inode.start); + txn.put_tree(path, new_inode) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_inode(new_inode, inode_position) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + } + GraphOp::DirAdd { + add_inode, path, .. + } => { + use atomic_core::pristine::directory_flags; + + let new_inode = txn + .alloc_inode() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + let inode_position = Position::new(change_id, add_inode.start); + txn.put_tree(path, new_inode) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_inode(new_inode, inode_position) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_directory(new_inode, directory_flags::explicit_empty()) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + } + GraphOp::FileDel { path, .. } => { + if let Ok(Some(inode)) = txn.get_inode(path) { + let dominated = is_file_only_on_view(&txn, inode, view_name); + if dominated { + let _ = txn.del_tree(path); + let _ = txn.del_inode(inode); + } + } + } + GraphOp::DirDel { path, .. } => { + if let Ok(Some(inode)) = txn.get_inode(path) { + let dominated = is_file_only_on_view(&txn, inode, view_name); + if dominated { + let _ = txn.del_tree(path); + let _ = txn.del_inode(inode); + let _ = txn.del_directory(inode); + } + } + } + GraphOp::FileMove { add, path, .. } => { + let inode_change_id = match &add.inode.change { + None => change_id, + Some(h) if *h == Hash::NONE => NodeId::ROOT, + Some(h) => txn.get_internal(h).unwrap_or(None).unwrap_or(NodeId::ROOT), + }; + let inode_pos = Position::new(inode_change_id, add.inode.pos); + + if let Ok(Some(inode)) = txn.position_inode(inode_pos) { + if let Ok(Some(old_path)) = txn.get_path(inode) { + if old_path != *path { + let _ = txn.del_tree(&old_path); + } + } + let _ = txn.put_tree(path, inode); + } + } + _ => {} + } + } + + for deleted_path in deleted_paths { + if let Ok(Some(inode)) = txn.get_inode(deleted_path) { + let dominated = is_file_only_on_view(&txn, inode, view_name); + if dominated { + let _ = txn.del_tree(deleted_path); + let _ = txn.del_inode(inode); + } + } + } + + let apply_start = std::time::Instant::now(); + let (insert, direct_graph_ms, direct_crdt_ms) = if import_direct_can_apply(&final_change) { + let (insert, graph_ms, crdt_ms) = self.write_import_direct_add_chain( + &mut txn, + view_name, + change_id, + &hash, + &final_change, + &options, + )?; + (insert, graph_ms, crdt_ms) + } else { + let insert = write_change_to_graph( + &mut txn, + view_name, + change_id, + &hash, + &final_change, + &options, + false, + ) + .map_err(|e| RepositoryError::Apply(e.to_string()))?; + (insert, 0, 0) + }; + timings.apply_ms = apply_start.elapsed().as_millis(); + timings.direct_graph_ms = direct_graph_ms; + timings.direct_crdt_ms = direct_crdt_ms; + + let commit_start = std::time::Instant::now(); + txn.commit() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + timings.commit_ms = commit_start.elapsed().as_millis(); + + Ok(ImportWriteOutcome { + hash, + timings, + insert, + }) + } + + fn write_import_direct_add_chain( + &self, + txn: &mut atomic_core::pristine::WriteTxn<'_>, + view_name: &str, + change_id: NodeId, + hash: &Hash, + change: &Change, + _options: &InsertOptions, + ) -> Result<(InsertOutcome, u128, u128), RepositoryError> { + use atomic_core::apply::{apply_file_ops_batched, compute_new_state}; + + let mut by_end: HashMap> = HashMap::new(); + let mut inode_by_pos: HashMap = HashMap::new(); + + for graph_op in change.hunks() { + if let GraphOp::FileAdd { + add_inode, path, .. + } = graph_op + { + let inode_position = Position::new(change_id, add_inode.start); + let inode = match txn + .position_inode(inode_position) + .map_err(|e| RepositoryError::Database(e.to_string()))? + { + Some(existing) => existing, + None => { + let inode = txn + .alloc_inode() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_tree(path, inode) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_inode(inode, inode_position) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + inode + } + }; + inode_by_pos.insert(add_inode.start, inode); + } + } + + let graph_start = std::time::Instant::now(); + { + let mut batch = atomic_core::apply::GraphWriteBatch::new(&*txn) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + let mut inode_sources: HashSet<(u64, GraphNode)> = HashSet::new(); + let mut inode_terminal_candidates: Vec<( + Inode, + GraphNode, + SerializedGraphEdge, + )> = Vec::new(); + + for graph_op in change.hunks() { + match graph_op { + GraphOp::FileAdd { + add_name, + add_inode, + contents, + .. + } => { + import_direct_write_insertion( + &mut batch, + change_id, + add_name, + &mut by_end, + &inode_by_pos, + &mut inode_sources, + &mut inode_terminal_candidates, + )?; + import_direct_write_insertion( + &mut batch, + change_id, + add_inode, + &mut by_end, + &inode_by_pos, + &mut inode_sources, + &mut inode_terminal_candidates, + )?; + if let Some(contents) = contents { + import_direct_write_insertion( + &mut batch, + change_id, + contents, + &mut by_end, + &inode_by_pos, + &mut inode_sources, + &mut inode_terminal_candidates, + )?; + } + } + GraphOp::Edit { + change: atomic_core::change::Atom::Insertion(insertion), + .. + } => { + import_direct_write_insertion( + &mut batch, + change_id, + insertion, + &mut by_end, + &inode_by_pos, + &mut inode_sources, + &mut inode_terminal_candidates, + )?; + } + _ => { + return Err(RepositoryError::Apply( + "direct import received unsupported graph op".to_string(), + )); + } + } + } + + for (inode, node, edge) in inode_terminal_candidates { + if !inode_sources.contains(&(inode.get(), node)) { + batch + .put_inode_graph(inode, node, edge) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + } + } + } + let graph_ms = graph_start.elapsed().as_millis(); + + let crdt_start = std::time::Instant::now(); + if change.has_file_ops() { + apply_file_ops_batched(txn, change_id, change.file_ops()) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + } + let crdt_ms = crdt_start.elapsed().as_millis(); + + let mut view = txn + .open_or_create_view(view_name) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + let new_state = compute_new_state(&view.state, hash); + let sequence = view.change_count + 1; + txn.put_change(&mut view, change_id, hash) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + view.state = new_state; + view.change_count = sequence; + txn.update_view(&view) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let mut stats = InsertStats::new(); + stats.changes_applied = 1; + stats.applied_hashes.push(*hash); + stats.atoms_processed = change.hunks().len(); + + Ok(( + InsertOutcome::new(new_state, sequence, false, stats), + graph_ms, + crdt_ms, + )) + } + /// Insert a change into the current view. /// /// This is the high-level method for inserting a single change into the diff --git a/atomic-repository/src/repository/mod.rs b/atomic-repository/src/repository/mod.rs index cd514b9..f45fe09 100644 --- a/atomic-repository/src/repository/mod.rs +++ b/atomic-repository/src/repository/mod.rs @@ -114,6 +114,7 @@ mod vault_kg_enrich; mod vault_names; mod vault_triples; pub use vault_embeddings::{hash_embed, EmbedConfig, TextChunk}; +pub use insert::{ImportWriteOutcome, ImportWriteTimings}; pub use vault_goal::{ GoalInfo, GoalStartOptions, GoalStartResult, GoalStopOptions, GoalStopResult, }; diff --git a/tests/harness/10_git_import.sh b/tests/harness/10_git_import.sh index 8f2bb8d..f843a0c 100755 --- a/tests/harness/10_git_import.sh +++ b/tests/harness/10_git_import.sh @@ -267,15 +267,14 @@ else _fail "import completed in reasonable time" "took ${duration}s" fi -# Verify counts match (with some tolerance for merge handling) -actual="$(atomic log 2>/dev/null | grep -cE '^\s*#[0-9]+|^[0-9a-f]{8,}' 2>/dev/null || true)" -actual="${actual:-0}" -actual="$(echo "$actual" | tr -d '[:space:]')" -[[ -z "$actual" ]] && actual=0 -if [[ $actual -ge $((expected_commits - 50)) ]] && [[ $actual -le $((expected_commits + 50)) ]]; then - _pass "change count roughly matches ($actual vs $expected_commits)" +# Verify imported Git commit count exactly. Do not use raw `atomic log` +# length here: git import records follow-up Atomic-only changes such as +# repository/vault initialization, and those are not Git commits. +actual="$(count_imported_git_changes)" +if [[ "$actual" -eq "$expected_commits" ]]; then + _pass "imported git change count matches ($actual vs $expected_commits)" else - _fail "change count matches" "expected ~$expected_commits, got $actual" + _fail "imported git change count matches" "expected $expected_commits imported git changes, got $actual" fi # ════════════════════════════════════════════════════════════════════════════ From 87cc8a74bbdca81b5e2780f0b55f33ab9fafe28a Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Mon, 18 May 2026 15:46:22 -0400 Subject: [PATCH 2/8] support logging tx --- atomic-cli/src/commands/git/parallel.rs | 162 +++++++++++++++++++++++- 1 file changed, 159 insertions(+), 3 deletions(-) diff --git a/atomic-cli/src/commands/git/parallel.rs b/atomic-cli/src/commands/git/parallel.rs index 1223de3..08ea711 100644 --- a/atomic-cli/src/commands/git/parallel.rs +++ b/atomic-cli/src/commands/git/parallel.rs @@ -55,9 +55,10 @@ use std::collections::HashSet; use std::path::{Path, PathBuf}; use std::process::Command; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; -use std::time::Instant; +use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::{mpsc, Arc}; +use std::thread; +use std::time::{Duration, Instant}; use chrono::{DateTime, TimeZone, Utc}; use git2::{ @@ -234,6 +235,88 @@ fn trace_git_import(message: impl AsRef) { } } +struct SlowImportProgress { + done: mpsc::Sender<()>, + reported: Arc, + handle: Option>, +} + +impl SlowImportProgress { + fn start(commit: String, summary: String) -> Self { + let (done, rx) = mpsc::channel(); + let reported = Arc::new(AtomicBool::new(false)); + let reported_for_thread = Arc::clone(&reported); + let handle = thread::spawn(move || { + let started = Instant::now(); + if rx.recv_timeout(Duration::from_secs(5)).is_ok() { + return; + } + + reported_for_thread.store(true, Ordering::Relaxed); + print_info(&format!( + "Still importing {} after {}s; please be patient. {}", + commit, + started.elapsed().as_secs(), + summary + )); + + loop { + if rx.recv_timeout(Duration::from_secs(15)).is_ok() { + break; + } + print_info(&format!( + "Still importing {} after {}s; graph/CRDT writes are still running.", + commit, + started.elapsed().as_secs() + )); + } + }); + + Self { + done, + reported, + handle: Some(handle), + } + } + + fn finish(mut self) -> bool { + let _ = self.done.send(()); + if let Some(handle) = self.handle.take() { + let _ = handle.join(); + } + self.reported.load(Ordering::Relaxed) + } +} + +fn truncate_for_progress(input: &str, max_chars: usize) -> String { + let mut out = String::new(); + for (idx, ch) in input.chars().enumerate() { + if idx >= max_chars { + out.push_str("..."); + return out; + } + out.push(ch); + } + out +} + +fn format_byte_count(bytes: usize) -> String { + const KIB: f64 = 1024.0; + const MIB: f64 = KIB * 1024.0; + const GIB: f64 = MIB * 1024.0; + + let bytes_f = bytes as f64; + if bytes_f >= GIB { + format!("{:.1}GiB", bytes_f / GIB) + } else if bytes_f >= MIB { + format!("{:.1}MiB", bytes_f / MIB) + } else if bytes_f >= KIB { + format!("{:.1}KiB", bytes_f / KIB) + } else { + format!("{}B", bytes) + } +} + fn should_detect_renames(diff: &Diff) -> bool { let mut adds = 0usize; let mut deletes = 0usize; @@ -392,6 +475,62 @@ fn record_git_import_add_linewise(path: &str, new_content: &[u8]) -> Option String { + let message = truncate_for_progress(&parsed.metadata.message.replace('\n', " "), 72); + format!("{} \"{}\"", parsed.short_sha, message) +} + +fn slow_import_record_summary(parsed: &ParsedCommit, recorded_files: &[RecordedFile]) -> String { + let mut added = 0usize; + let mut modified = 0usize; + let mut deleted = 0usize; + let mut renamed = 0usize; + let mut copied = 0usize; + let mut bytes = 0usize; + + for file in &parsed.files { + match file.operation { + FileOperation::Added => added += 1, + FileOperation::Modified => modified += 1, + FileOperation::Deleted => deleted += 1, + FileOperation::Renamed => renamed += 1, + FileOperation::Copied => copied += 1, + } + bytes += file.new_content.as_ref().map(|c| c.len()).unwrap_or(0); + bytes += file.old_content.as_ref().map(|c| c.len()).unwrap_or(0); + } + + let mut largest: Vec<(&str, usize)> = recorded_files + .iter() + .map(|rec| (rec.path(), rec.content().len())) + .collect(); + largest.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0))); + let top_paths = largest + .into_iter() + .take(3) + .map(|(path, size)| format!("{} ({})", path, format_byte_count(size))) + .collect::>(); + + let top = if top_paths.is_empty() { + "top records: none".to_string() + } else { + format!("top records: {}", top_paths.join(", ")) + }; + + format!( + "records={}, files={}, bytes={}, ops=+{}/~{}/-{} renames={} copies={}; {}", + recorded_files.len(), + parsed.files.len(), + format_byte_count(bytes), + added, + modified, + deleted, + renamed, + copied, + top + ) +} + impl ParallelImporter { /// Create a new parallel importer. pub fn new(git_repo: &GitRepository, options: ParallelImportOptions) -> Self { @@ -1228,6 +1367,10 @@ impl ParallelImporter { } let metadata = self.build_git_metadata(parsed, false, recorded_files.is_empty()); + let progress = SlowImportProgress::start( + slow_import_commit_label(parsed), + slow_import_record_summary(parsed, &recorded_files), + ); let write_start = Instant::now(); let write_outcome = repo .write_import_recorded( @@ -1239,6 +1382,19 @@ impl ParallelImporter { ) .map_err(|e| CliError::Internal(e.into()))?; let write_ms = write_start.elapsed().as_millis(); + let progress_reported = progress.finish(); + if progress_reported || write_ms >= 5_000 { + print_info(&format!( + "Imported {} in {:.1}s (assemble={}ms apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms)", + slow_import_commit_label(parsed), + write_ms as f64 / 1000.0, + write_outcome.timings.assemble_ms, + write_outcome.timings.apply_ms, + write_outcome.timings.direct_graph_ms, + write_outcome.timings.direct_crdt_ms, + write_outcome.timings.commit_ms + )); + } // Files deleted via record_modified_file (the "show diff lines" path) // produce GraphOp::Replacement, not GraphOp::FileDel, so insert_change From f938d8640b12a52279c86a1cbb593ce0cbb4a1af Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Tue, 19 May 2026 15:11:39 -0400 Subject: [PATCH 3/8] fix performance --- .../src/hooks/claude_code/settings.rs | 90 +- atomic-agent/src/hooks/codex.rs | 1177 +++++++++------- atomic-cli/src/commands/agent/enable.rs | 37 +- atomic-cli/src/commands/change/command.rs | 95 +- atomic-cli/src/commands/change/tests.rs | 38 +- atomic-cli/src/commands/diff/helpers.rs | 48 +- atomic-cli/src/commands/diff/tests.rs | 35 + atomic-cli/src/commands/doctor.rs | 84 ++ atomic-cli/src/commands/git/import.rs | 148 +- atomic-cli/src/commands/git/parallel.rs | 1233 ++++++++++++++++- atomic-core/src/apply/file_ops.rs | 193 ++- atomic-core/src/apply/mod.rs | 4 +- atomic-repository/src/changestore/mod.rs | 84 ++ atomic-repository/src/changestore/tests.rs | 33 + .../src/changestore/trait_impl.rs | 54 +- atomic-repository/src/repository/insert.rs | 674 ++++++++- atomic-repository/src/repository/mod.rs | 4 +- .../src/repository/semantic_materialize.rs | 619 +++++++++ 18 files changed, 3876 insertions(+), 774 deletions(-) create mode 100644 atomic-repository/src/repository/semantic_materialize.rs diff --git a/atomic-agent/src/hooks/claude_code/settings.rs b/atomic-agent/src/hooks/claude_code/settings.rs index 7e6c541..f7b9f7c 100644 --- a/atomic-agent/src/hooks/claude_code/settings.rs +++ b/atomic-agent/src/hooks/claude_code/settings.rs @@ -9,11 +9,6 @@ use crate::error::{AgentError, AgentResult}; pub(super) const ATOMIC_HOOK_PREFIX: &str = "atomic agent hooks claude-code"; pub(super) const METADATA_DENY_RULE: &str = "Read(./.atomic/metadata/**)"; -// ============================================================================ -// SETTINGS FILE TYPES -// ============================================================================ - -/// A single hook entry within a matcher group. #[derive(Debug, Clone, Serialize, Deserialize)] pub(crate) struct ClaudeHookEntry { #[serde(rename = "type")] @@ -21,11 +16,6 @@ pub(crate) struct ClaudeHookEntry { pub command: String, } -/// A matcher group containing one or more hook entries. -/// -/// For simple hooks (stop, session-start, etc.) the `matcher` is empty. -/// For tool-specific hooks (PreToolUse, PostToolUse) the `matcher` is the -/// tool name (e.g., "Task", "TodoWrite"). #[derive(Debug, Clone, Serialize, Deserialize)] pub(crate) struct ClaudeHookMatcher { #[serde(default)] @@ -33,7 +23,6 @@ pub(crate) struct ClaudeHookMatcher { pub hooks: Vec, } -/// The hooks section of `.claude/settings.json`. #[derive(Debug, Default, Clone, Serialize, Deserialize)] #[serde(rename_all = "PascalCase")] pub(crate) struct ClaudeHooks { @@ -60,14 +49,6 @@ pub(crate) struct ClaudeHooks { pub post_tool_use: Vec, } -// ============================================================================ -// READ / WRITE SETTINGS -// ============================================================================ - -/// Read and parse the existing `.claude/settings.json`, if it exists. -/// -/// Returns `(raw_settings, hooks)` where `raw_settings` preserves unknown -/// fields and `hooks` is the parsed hooks section. pub(crate) fn read_settings( settings_path: &Path, ) -> AgentResult<(serde_json::Map, ClaudeHooks)> { @@ -105,7 +86,6 @@ pub(crate) fn read_settings( Ok((raw, hooks)) } -/// Write settings back to `.claude/settings.json`, preserving formatting. pub(crate) fn write_settings( settings_path: &Path, raw: &serde_json::Map, @@ -123,8 +103,6 @@ pub(crate) fn write_settings( path: settings_path.to_path_buf(), reason: e.to_string(), })?; - - // Ensure trailing newline for POSIX compatibility if !output.ends_with('\n') { output.push('\n'); } @@ -133,33 +111,19 @@ pub(crate) fn write_settings( operation: "write".to_string(), path: settings_path.to_path_buf(), reason: e.to_string(), - })?; - - Ok(()) + }) } -/// Check if a specific command exists in a matcher list. pub(crate) fn hook_command_exists( matchers: &[ClaudeHookMatcher], matcher_name: &str, command: &str, ) -> bool { - for matcher in matchers { - if matcher.matcher == matcher_name { - for hook in &matcher.hooks { - if hook.command == command { - return true; - } - } - } - } - false + matchers.iter().any(|matcher| { + matcher.matcher == matcher_name && matcher.hooks.iter().any(|h| h.command == command) + }) } -/// Add a hook command to the appropriate matcher in the list. -/// -/// If a matcher with the given name already exists, the hook is appended to it. -/// Otherwise, a new matcher group is created. pub(crate) fn add_hook_to_matcher( matchers: &mut Vec, matcher_name: &str, @@ -170,73 +134,51 @@ pub(crate) fn add_hook_to_matcher( command: command.to_string(), }; - // Find existing matcher with the same name - for matcher in matchers.iter_mut() { - if matcher.matcher == matcher_name { - matcher.hooks.push(entry); - return; - } + if let Some(matcher) = matchers.iter_mut().find(|m| m.matcher == matcher_name) { + matcher.hooks.push(entry); + return; } - // No existing matcher — create a new one matchers.push(ClaudeHookMatcher { matcher: matcher_name.to_string(), hooks: vec![entry], }); } -/// Check if any matcher in a hook list contains an Atomic hook. pub(crate) fn has_any_atomic_hook(matchers: &[ClaudeHookMatcher]) -> bool { - matchers.iter().any(|m| { - m.hooks - .iter() - .any(|h| h.command.contains(ATOMIC_HOOK_PREFIX)) - }) + matchers + .iter() + .any(|m| m.hooks.iter().any(|h| is_atomic_hook(&h.command))) } -/// Returns `true` if a hook command string is an Atomic hook. -/// -/// Uses `contains` rather than `starts_with` so that guarded commands -/// like `test -d .atomic && atomic agent hooks claude-code … || true` -/// are still recognized. pub(crate) fn is_atomic_hook(command: &str) -> bool { command.contains(ATOMIC_HOOK_PREFIX) } -/// Remove all Atomic hooks from a matcher list. -/// -/// Preserves non-Atomic hooks. Removes empty matchers after filtering. pub(crate) fn remove_atomic_hooks(matchers: &mut Vec) { for matcher in matchers.iter_mut() { matcher.hooks.retain(|h| !is_atomic_hook(&h.command)); } - // Remove empty matchers matchers.retain(|m| !m.hooks.is_empty()); } -/// Ensure the metadata deny rule exists in permissions.deny. -/// -/// Returns `true` if the rule was added (i.e., it wasn't already present). pub(crate) fn ensure_deny_rule(raw: &mut serde_json::Map) -> bool { let permissions = raw .entry("permissions".to_string()) .or_insert_with(|| serde_json::json!({})); - let permissions_obj = match permissions.as_object_mut() { - Some(obj) => obj, - None => return false, + let Some(permissions_obj) = permissions.as_object_mut() else { + return false; }; let deny = permissions_obj .entry("deny".to_string()) .or_insert_with(|| serde_json::json!([])); - let deny_arr = match deny.as_array_mut() { - Some(arr) => arr, - None => return false, + let Some(deny_arr) = deny.as_array_mut() else { + return false; }; - // Check if already present let rule_value = serde_json::Value::String(METADATA_DENY_RULE.to_string()); if deny_arr.contains(&rule_value) { return false; @@ -246,7 +188,6 @@ pub(crate) fn ensure_deny_rule(raw: &mut serde_json::Map) { let Some(permissions) = raw.get_mut("permissions") else { return; @@ -264,12 +205,9 @@ pub(crate) fn remove_deny_rule(raw: &mut serde_json::Map` with JSON on stdin. //! -//! # Codex Hooks Architecture +//! Supported verbs: //! -//! ```text -//! ┌─────────────────────────────────────────────────────────────────────────┐ -//! │ Codex Hooks (.codex/hooks.json) │ -//! │ │ -//! │ SessionStart ──▶ atomic agent hooks codex session-start │ -//! │ UserPromptSubmit ──▶ atomic agent hooks codex user-prompt-submit │ -//! │ Stop ──▶ atomic agent hooks codex stop │ -//! │ PostToolUse ──▶ atomic agent hooks codex post-tool │ -//! │ PreToolUse ──▶ atomic agent hooks codex pre-tool │ -//! └─────────────────────────────────────────────────────────────────────────┘ -//! ``` -//! -//! # Config Format -//! -//! Codex uses `.codex/hooks.json` (project-level) or `~/.codex/hooks.json` -//! (user-level). The format uses PascalCase event names: -//! -//! ```json -//! { -//! "hooks": { -//! "SessionStart": [{ "hooks": [{ "type": "command", "command": "..." }] }], -//! "PostToolUse": [{ "hooks": [{ "type": "command", "command": "..." }] }] -//! } -//! } -//! ``` -//! -//! # Exit Code Behavior -//! -//! Same as Claude Code: exit 0 = success, exit 2 = block. -//! -//! # Limitations -//! -//! - No `SessionEnd` event (Codex does not fire one) -//! - `PostToolUse` currently only fires for Bash tool (WIP) -//! - Feature flag `[features] codex_hooks = true` required in config.toml +//! | Codex hook | CLI verb | HookType | +//! |------------------|-----------------------|----------------| +//! | `SessionStart` | `session-start` | SessionStart | +//! | `UserPromptSubmit` | `user-prompt-submit` | TurnStart | +//! | `Stop` | `stop` | TurnEnd | +//! | `PreToolUse` | `pre-tool` | PreToolUse | +//! | `PostToolUse` | `post-tool` | PostToolUse | -use std::path::Path; +use std::path::{Path, PathBuf}; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value}; use crate::error::{AgentError, AgentResult}; use crate::event::{HookType, TurnEvent}; use super::AgentHook; -// ============================================================================= -// Constants -// ============================================================================= - -/// The directory where Codex stores per-project configuration. const CODEX_DIR: &str = ".codex"; +const HOOKS_FILE: &str = "hooks.json"; +const ATOMIC_HOOK_PREFIX: &str = "atomic agent hooks codex"; -// ============================================================================= -// Codex JSON Input Types -// ============================================================================= - -/// JSON input for the `SessionStart` hook. -/// -/// Fires when Codex begins a new session. #[derive(Debug, Deserialize)] #[allow(dead_code)] struct SessionStartInput { #[serde(default)] session_id: Option, #[serde(default)] - transcript_path: Option, + thread_id: Option, #[serde(default)] - cwd: Option, + transcript_path: Option, #[serde(default)] model: Option, #[serde(default)] source: Option, #[serde(default)] - hook_event_name: Option, + cwd: Option, } -/// JSON input for the `UserPromptSubmit` hook (maps to TurnStart). -/// -/// Fires after the user submits a prompt, before the model begins responding. #[derive(Debug, Deserialize)] #[allow(dead_code)] struct UserPromptSubmitInput { #[serde(default)] session_id: Option, #[serde(default)] + thread_id: Option, + #[serde(default)] transcript_path: Option, #[serde(default)] - cwd: Option, + prompt: Option, #[serde(default)] model: Option, #[serde(default)] - prompt: Option, - #[serde(default)] - turn_id: Option, + cwd: Option, } -/// JSON input for the `Stop` hook (maps to TurnEnd). -/// -/// Fires when the model finishes responding for the turn. #[derive(Debug, Deserialize)] #[allow(dead_code)] struct StopInput { #[serde(default)] session_id: Option, #[serde(default)] - transcript_path: Option, + thread_id: Option, #[serde(default)] - cwd: Option, + transcript_path: Option, #[serde(default)] model: Option, #[serde(default)] - turn_id: Option, + last_assistant_message: Option, #[serde(default)] stop_hook_active: Option, #[serde(default)] - last_assistant_message: Option, + cwd: Option, } -/// JSON input for the `PostToolUse` hook. -/// -/// Currently only fires for the Bash tool (WIP for other tools). #[derive(Debug, Deserialize)] #[allow(dead_code)] -struct PostToolUseInput { +struct ToolUseInput { #[serde(default)] session_id: Option, #[serde(default)] - transcript_path: Option, - #[serde(default)] - cwd: Option, - #[serde(default)] - model: Option, + thread_id: Option, #[serde(default)] - turn_id: Option, + transcript_path: Option, #[serde(default)] tool_name: Option, #[serde(default)] tool_use_id: Option, #[serde(default)] - tool_input: Option, - #[serde(default)] - tool_response: Option, -} - -/// JSON input for the `PreToolUse` hook. -/// -/// Fires before a tool is invoked. -#[derive(Debug, Deserialize)] -#[allow(dead_code)] -struct PreToolUseInput { - #[serde(default)] - session_id: Option, - #[serde(default)] - transcript_path: Option, - #[serde(default)] - cwd: Option, - #[serde(default)] - model: Option, - #[serde(default)] - turn_id: Option, + tool_call_id: Option, #[serde(default)] - tool_name: Option, + call_id: Option, #[serde(default)] - tool_use_id: Option, + tool_input: Option, #[serde(default)] - tool_input: Option, + tool_response: Option, } -// ============================================================================= -// ============================================================================= -// CodexHook -// ============================================================================= +#[derive(Debug, Default, Serialize, Deserialize)] +struct CodexHooksFile { + #[serde(default)] + hooks: Map, + #[serde(flatten)] + extra: Map, +} -/// Codex agent hook adapter. -/// -/// Handles hook JSON parsing, installation into `.codex/hooks.json`, -/// and presence detection via the `.codex/` directory. -/// -/// Codex is OpenAI's coding agent. It uses a hooks system with PascalCase -/// event names and a simpler config format than Claude Code. #[derive(Debug)] pub struct CodexHook { - _private: (), // prevent construction outside of new() + _private: (), } impl CodexHook { - /// Create a new Codex hook adapter. pub fn new() -> Self { Self { _private: () } } - /// Extract a session ID from an optional field, generating a fallback if missing. - /// - /// Codex provides stable session IDs, so we use them directly. If missing - /// (shouldn't happen in practice), fall back to a generated ID. - fn extract_session_id(session_id: Option) -> String { + pub fn global_hooks_path() -> Option { + dirs::home_dir().map(|home| home.join(CODEX_DIR).join(HOOKS_FILE)) + } + + pub fn install_global(&self, force: bool) -> AgentResult { + let path = Self::global_hooks_path().ok_or_else(|| AgentError::ConfigError { + operation: "resolve".to_string(), + path: PathBuf::from("~/.codex/hooks.json"), + reason: "Could not determine home directory for Codex hooks".to_string(), + })?; + install_hooks_at(&path, force) + } + + pub fn uninstall_global(&self) -> AgentResult<()> { + if let Some(path) = Self::global_hooks_path() { + uninstall_hooks_at(&path)?; + } + Ok(()) + } + + pub fn is_installed_global(&self) -> bool { + Self::global_hooks_path().is_some_and(|path| hooks_file_has_atomic_hooks(&path)) + } + + fn local_hooks_path(repo_root: &Path) -> PathBuf { + repo_root.join(CODEX_DIR).join(HOOKS_FILE) + } + + fn extract_session_id( + session_id: Option, + thread_id: Option, + raw: &Value, + ) -> String { session_id + .or(thread_id) + .or_else(|| value_string(raw, "conversation_id")) + .or_else(|| value_string(raw, "thread_id")) .filter(|s| !s.is_empty()) .unwrap_or_else(|| format!("codex-{}", uuid_short())) } + + fn parse_json(&self, hook_type: HookType, input: &[u8]) -> AgentResult { + if input.is_empty() { + return Err(AgentError::HookInputEmpty { + agent: self.name().to_string(), + hook_type: hook_type.as_str().to_string(), + }); + } + + serde_json::from_slice(input).map_err(|e| AgentError::HookParseFailed { + agent: self.name().to_string(), + hook_type: hook_type.as_str().to_string(), + reason: e.to_string(), + }) + } + + fn parse_value Deserialize<'de>>( + &self, + hook_type: HookType, + raw_json: Value, + ) -> AgentResult { + serde_json::from_value(raw_json).map_err(|e| AgentError::HookParseFailed { + agent: self.name().to_string(), + hook_type: hook_type.as_str().to_string(), + reason: e.to_string(), + }) + } } impl Default for CodexHook { @@ -213,10 +196,6 @@ impl Default for CodexHook { } } -// ============================================================================= -// AgentHook Trait Implementation -// ============================================================================= - impl AgentHook for CodexHook { fn name(&self) -> &str { "codex" @@ -227,154 +206,97 @@ impl AgentHook for CodexHook { } fn parse_event(&self, hook_type: HookType, input: &[u8]) -> AgentResult { - if input.is_empty() { - return Err(AgentError::HookInputEmpty { - agent: self.name().to_string(), - hook_type: hook_type.as_str().to_string(), - }); - } - - let raw_json: serde_json::Value = - serde_json::from_slice(input).map_err(|e| AgentError::HookParseFailed { - agent: self.name().to_string(), - hook_type: hook_type.as_str().to_string(), - reason: e.to_string(), - })?; + let raw_json = self.parse_json(hook_type, input)?; match hook_type { HookType::SessionStart => { - let parsed: SessionStartInput = - serde_json::from_value(raw_json.clone()).map_err(|e| { - AgentError::HookParseFailed { - agent: self.name().to_string(), - hook_type: hook_type.as_str().to_string(), - reason: e.to_string(), - } - })?; - - let session_id = Self::extract_session_id(parsed.session_id); - let mut event = TurnEvent::new(session_id, hook_type).with_raw_json(raw_json); - + let parsed: SessionStartInput = self.parse_value(hook_type, raw_json.clone())?; + let mut event = TurnEvent::new( + Self::extract_session_id(parsed.session_id, parsed.thread_id, &raw_json), + hook_type, + ) + .with_raw_json(with_openai_provider(raw_json)); if let Some(path) = parsed.transcript_path { event = event.with_transcript_path(path); } - Ok(event) } - HookType::TurnStart => { - // Codex: UserPromptSubmit → TurnStart - let parsed: UserPromptSubmitInput = serde_json::from_value(raw_json.clone()) - .map_err(|e| AgentError::HookParseFailed { - agent: self.name().to_string(), - hook_type: hook_type.as_str().to_string(), - reason: e.to_string(), - })?; - - let session_id = Self::extract_session_id(parsed.session_id); - let mut event = TurnEvent::new(session_id, hook_type).with_raw_json(raw_json); - + let parsed: UserPromptSubmitInput = + self.parse_value(hook_type, raw_json.clone())?; + let mut event = TurnEvent::new( + Self::extract_session_id(parsed.session_id, parsed.thread_id, &raw_json), + hook_type, + ) + .with_raw_json(with_openai_provider(raw_json)); if let Some(path) = parsed.transcript_path { event = event.with_transcript_path(path); } - if let Some(prompt) = parsed.prompt { + if let Some(prompt) = parsed.prompt.filter(|p| !p.is_empty()) { event = event.with_prompt(prompt); } - Ok(event) } - HookType::TurnEnd => { - // Codex: Stop → TurnEnd - let parsed: StopInput = serde_json::from_value(raw_json.clone()).map_err(|e| { - AgentError::HookParseFailed { - agent: self.name().to_string(), - hook_type: hook_type.as_str().to_string(), - reason: e.to_string(), - } - })?; - - let session_id = Self::extract_session_id(parsed.session_id); - let mut event = TurnEvent::new(session_id, hook_type).with_raw_json(raw_json); - - if let Some(path) = parsed.transcript_path { - event = event.with_transcript_path(path); - } - - Ok(event) - } - - HookType::PreToolUse => { - let parsed: PreToolUseInput = - serde_json::from_value(raw_json.clone()).map_err(|e| { - AgentError::HookParseFailed { - agent: self.name().to_string(), - hook_type: hook_type.as_str().to_string(), - reason: e.to_string(), - } - })?; - - let session_id = Self::extract_session_id(parsed.session_id); - let mut event = TurnEvent::new(session_id, hook_type).with_raw_json(raw_json); - + let parsed: StopInput = self.parse_value(hook_type, raw_json.clone())?; + let raw_json = normalize_stop_raw(raw_json); + let mut event = TurnEvent::new( + Self::extract_session_id(parsed.session_id, parsed.thread_id, &raw_json), + hook_type, + ) + .with_raw_json(with_openai_provider(raw_json)); if let Some(path) = parsed.transcript_path { event = event.with_transcript_path(path); } - if let Some(name) = parsed.tool_name { - event = event.with_tool_name(name); - } - if let Some(id) = parsed.tool_use_id { - event = event.with_tool_use_id(id); - } - Ok(event) } - - HookType::PostToolUse => { - let parsed: PostToolUseInput = - serde_json::from_value(raw_json.clone()).map_err(|e| { - AgentError::HookParseFailed { - agent: self.name().to_string(), - hook_type: hook_type.as_str().to_string(), - reason: e.to_string(), - } - })?; - - let session_id = Self::extract_session_id(parsed.session_id); - let mut event = TurnEvent::new(session_id, hook_type).with_raw_json(raw_json); - + HookType::PreToolUse | HookType::PostToolUse => { + let parsed: ToolUseInput = self.parse_value(hook_type, raw_json.clone())?; + let raw_json = if hook_type == HookType::PostToolUse { + normalize_tool_raw(raw_json) + } else { + raw_json + }; + let mut event = TurnEvent::new( + Self::extract_session_id(parsed.session_id, parsed.thread_id, &raw_json), + hook_type, + ) + .with_raw_json(with_openai_provider(raw_json)); if let Some(path) = parsed.transcript_path { event = event.with_transcript_path(path); } - if let Some(name) = parsed.tool_name { + if let Some(name) = parsed.tool_name.filter(|n| !n.is_empty()) { event = event.with_tool_name(name); } - if let Some(id) = parsed.tool_use_id { + let tool_use_id = parsed + .tool_use_id + .or(parsed.tool_call_id) + .or(parsed.call_id) + .filter(|id| !id.is_empty()); + if let Some(id) = tool_use_id { event = event.with_tool_use_id(id); } - Ok(event) } - - // Codex does NOT have a SessionEnd event HookType::SessionEnd => Err(AgentError::HookParseFailed { agent: self.name().to_string(), hook_type: hook_type.as_str().to_string(), - reason: "Codex does not support SessionEnd hooks".to_string(), + reason: "Codex does not currently emit SessionEnd hooks".to_string(), }), } } - fn install(&self, _repo_root: &Path) -> AgentResult { - Ok(0) // Installation handled by atomic-codex package + fn install(&self, repo_root: &Path) -> AgentResult { + install_hooks_at(&Self::local_hooks_path(repo_root), false) } - fn uninstall(&self, _repo_root: &Path) -> AgentResult<()> { - Ok(()) // Uninstallation handled by atomic-codex package + fn uninstall(&self, repo_root: &Path) -> AgentResult<()> { + uninstall_hooks_at(&Self::local_hooks_path(repo_root)) } - fn is_installed(&self, _repo_root: &Path) -> bool { - false // Managed by atomic-codex package + fn is_installed(&self, repo_root: &Path) -> bool { + hooks_file_has_atomic_hooks(&Self::local_hooks_path(repo_root)) + || self.is_installed_global() } fn supported_hooks(&self) -> Vec { @@ -389,6 +311,7 @@ impl AgentHook for CodexHook { fn detect_presence(&self, repo_root: &Path) -> bool { repo_root.join(CODEX_DIR).is_dir() + || Self::global_hooks_path().is_some_and(|path| path.exists()) } fn hook_verbs(&self) -> Vec<&str> { @@ -396,38 +319,286 @@ impl AgentHook for CodexHook { "session-start", "user-prompt-submit", "stop", - "post-tool", "pre-tool", + "post-tool", ] } } -// ============================================================================= -// Verb Mapping -// ============================================================================= - -/// Map Codex hook verbs to Atomic HookTypes. -/// -/// These are registered in addition to the standard verbs in -/// [`HookType::from_verb`]. The CLI dispatch layer checks both. pub fn verb_to_hook_type(verb: &str) -> Option { match verb { "session-start" => Some(HookType::SessionStart), "user-prompt-submit" => Some(HookType::TurnStart), "stop" => Some(HookType::TurnEnd), - "post-tool" => Some(HookType::PostToolUse), "pre-tool" => Some(HookType::PreToolUse), + "post-tool" => Some(HookType::PostToolUse), _ => None, } } -// ============================================================================= -// Helpers -// ============================================================================= +fn install_hooks_at(path: &Path, force: bool) -> AgentResult { + let mut config = read_hooks_file(path)?; + if force { + remove_atomic_hooks(&mut config.hooks); + } + + let mut installed = 0; + for spec in CODEX_HOOK_DEFS { + if add_hook( + &mut config.hooks, + spec.event, + spec.command, + spec.status_message, + ) { + installed += 1; + } + } + + write_hooks_file(path, &config)?; + Ok(installed) +} + +fn uninstall_hooks_at(path: &Path) -> AgentResult<()> { + if !path.exists() { + return Ok(()); + } + let mut config = read_hooks_file(path)?; + remove_atomic_hooks(&mut config.hooks); + write_hooks_file(path, &config) +} + +fn read_hooks_file(path: &Path) -> AgentResult { + if !path.exists() { + return Ok(CodexHooksFile::default()); + } + let content = std::fs::read_to_string(path).map_err(|e| AgentError::ConfigError { + operation: "read".to_string(), + path: path.to_path_buf(), + reason: e.to_string(), + })?; + serde_json::from_str(&content).map_err(|e| AgentError::ConfigError { + operation: "parse".to_string(), + path: path.to_path_buf(), + reason: e.to_string(), + }) +} + +fn write_hooks_file(path: &Path, config: &CodexHooksFile) -> AgentResult<()> { + if let Some(parent) = path.parent() { + std::fs::create_dir_all(parent).map_err(|e| AgentError::ConfigError { + operation: "create directory".to_string(), + path: parent.to_path_buf(), + reason: e.to_string(), + })?; + } + let content = serde_json::to_string_pretty(config).map_err(|e| AgentError::ConfigError { + operation: "serialize".to_string(), + path: path.to_path_buf(), + reason: e.to_string(), + })?; + std::fs::write(path, content).map_err(|e| AgentError::ConfigError { + operation: "write".to_string(), + path: path.to_path_buf(), + reason: e.to_string(), + }) +} + +fn hooks_file_has_atomic_hooks(path: &Path) -> bool { + std::fs::read_to_string(path) + .map(|content| content.contains(ATOMIC_HOOK_PREFIX)) + .unwrap_or(false) +} + +fn add_hook( + hooks: &mut Map, + event: &str, + command: &str, + status_message: Option<&str>, +) -> bool { + let groups = hooks + .entry(event.to_string()) + .or_insert_with(|| Value::Array(Vec::new())); + let Some(groups) = groups.as_array_mut() else { + *groups = Value::Array(Vec::new()); + let Some(groups) = hooks.get_mut(event).and_then(Value::as_array_mut) else { + return false; + }; + return add_hook_to_groups(groups, command, status_message); + }; + add_hook_to_groups(groups, command, status_message) +} + +fn add_hook_to_groups( + groups: &mut Vec, + command: &str, + status_message: Option<&str>, +) -> bool { + if groups.iter().any(|group| group_has_command(group, command)) { + return false; + } + + let mut entry = Map::new(); + entry.insert("type".to_string(), Value::String("command".to_string())); + entry.insert("command".to_string(), Value::String(command.to_string())); + if let Some(message) = status_message { + entry.insert( + "statusMessage".to_string(), + Value::String(message.to_string()), + ); + } + + let mut group = Map::new(); + group.insert( + "hooks".to_string(), + Value::Array(vec![Value::Object(entry)]), + ); + groups.push(Value::Object(group)); + true +} + +fn group_has_command(group: &Value, command: &str) -> bool { + group + .get("hooks") + .and_then(Value::as_array) + .is_some_and(|hooks| { + hooks.iter().any(|hook| { + hook.get("command") + .and_then(Value::as_str) + .is_some_and(|cmd| cmd == command) + }) + }) +} + +fn remove_atomic_hooks(hooks: &mut Map) { + for value in hooks.values_mut() { + let Some(groups) = value.as_array_mut() else { + continue; + }; + groups.retain_mut(|group| { + let Some(group_obj) = group.as_object_mut() else { + return true; + }; + let Some(group_hooks) = group_obj.get_mut("hooks").and_then(Value::as_array_mut) else { + return true; + }; + group_hooks.retain(|hook| { + !hook + .get("command") + .and_then(Value::as_str) + .is_some_and(is_atomic_hook) + }); + !group_hooks.is_empty() + }); + } +} + +fn is_atomic_hook(command: &str) -> bool { + command.contains(ATOMIC_HOOK_PREFIX) +} + +fn with_openai_provider(mut raw: Value) -> Value { + if let Some(obj) = raw.as_object_mut() { + obj.entry("provider".to_string()) + .or_insert_with(|| Value::String("openai".to_string())); + } + raw +} + +fn normalize_stop_raw(mut raw: Value) -> Value { + let stop_hook_active = raw.get("stop_hook_active").and_then(Value::as_bool); + if let Some(active) = stop_hook_active { + if let Some(obj) = raw.as_object_mut() { + obj.entry("finish_reason".to_string()).or_insert_with(|| { + Value::String(if active { "tool-calls" } else { "stop" }.to_string()) + }); + } + } + raw +} + +fn normalize_tool_raw(mut raw: Value) -> Value { + let Some(response) = raw.get("tool_response").cloned() else { + return raw; + }; + + if let Some(output) = extract_tool_output(&response) { + insert_if_missing(&mut raw, "tool_output", Value::String(output)); + } + if let Some(status) = extract_tool_status(&response) { + insert_if_missing(&mut raw, "status", Value::String(status)); + } + if let Some(duration) = extract_duration_ms(&response) { + insert_if_missing(&mut raw, "duration", Value::Number(duration.into())); + } + if let Some(file_path) = extract_file_path(raw.get("tool_input"), &response) { + insert_if_missing(&mut raw, "file_path", Value::String(file_path)); + } + + raw +} + +fn extract_tool_output(response: &Value) -> Option { + if let Some(text) = response.as_str() { + return Some(text.to_string()); + } + for key in ["output", "content", "result", "stdout", "message"] { + if let Some(text) = response.get(key).and_then(Value::as_str) { + return Some(text.to_string()); + } + } + None +} + +fn extract_tool_status(response: &Value) -> Option { + if response + .get("success") + .and_then(Value::as_bool) + .is_some_and(|success| success) + { + return Some("completed".to_string()); + } + if response + .get("success") + .and_then(Value::as_bool) + .is_some_and(|success| !success) + || response.get("error").is_some() + { + return Some("error".to_string()); + } + None +} + +fn extract_duration_ms(response: &Value) -> Option { + response + .get("duration_ms") + .or_else(|| response.get("duration")) + .and_then(Value::as_u64) +} + +fn extract_file_path(tool_input: Option<&Value>, response: &Value) -> Option { + for value in [tool_input, Some(response)].into_iter().flatten() { + for key in ["file_path", "filePath", "path"] { + if let Some(path) = value.get(key).and_then(Value::as_str) { + return Some(path.to_string()); + } + } + } + None +} + +fn insert_if_missing(raw: &mut Value, key: &str, value: Value) { + if let Some(obj) = raw.as_object_mut() { + obj.entry(key.to_string()).or_insert(value); + } +} + +fn value_string(raw: &Value, key: &str) -> Option { + raw.get(key) + .and_then(Value::as_str) + .map(str::to_string) + .filter(|s| !s.is_empty()) +} -/// Generate a short hex ID from the current timestamp. -/// -/// Used as a fallback session ID when Codex doesn't provide one. fn uuid_short() -> String { use std::time::{SystemTime, UNIX_EPOCH}; @@ -435,80 +606,96 @@ fn uuid_short() -> String { .duration_since(UNIX_EPOCH) .unwrap_or_default() .as_millis(); - - // Use lower 32 bits of timestamp for a short hex ID format!("{:08x}", (now & 0xFFFF_FFFF) as u32) } -// ============================================================================= -// Tests -// ============================================================================= +struct HookDef { + event: &'static str, + command: &'static str, + status_message: Option<&'static str>, +} + +const CODEX_HOOK_DEFS: &[HookDef] = &[ + HookDef { + event: "SessionStart", + command: "test -d .atomic && atomic agent hooks codex session-start || true", + status_message: Some("Atomic: tracking session"), + }, + HookDef { + event: "UserPromptSubmit", + command: "test -d .atomic && atomic agent hooks codex user-prompt-submit || true", + status_message: None, + }, + HookDef { + event: "Stop", + command: "test -d .atomic && atomic agent hooks codex stop || true", + status_message: None, + }, + HookDef { + event: "PreToolUse", + command: "test -d .atomic && atomic agent hooks codex pre-tool || true", + status_message: None, + }, + HookDef { + event: "PostToolUse", + command: "test -d .atomic && atomic agent hooks codex post-tool || true", + status_message: None, + }, +]; #[cfg(test)] mod tests { use super::*; + use crate::event::HookType; + use tempfile::TempDir; fn make_hook() -> CodexHook { CodexHook::new() } - // ========================================================================= - // Identity tests - // ========================================================================= - #[test] fn test_name() { - let hook = make_hook(); - assert_eq!(hook.name(), "codex"); + assert_eq!(make_hook().name(), "codex"); } #[test] fn test_display_name() { - let hook = make_hook(); - assert_eq!(hook.display_name(), "Codex"); + assert_eq!(make_hook().display_name(), "Codex"); } #[test] fn test_default() { - let hook = CodexHook::default(); - assert_eq!(hook.name(), "codex"); + assert_eq!(CodexHook::default().name(), "codex"); } #[test] fn test_supported_hooks() { - let hook = make_hook(); - let supported = hook.supported_hooks(); - assert!(supported.contains(&HookType::SessionStart)); - assert!(supported.contains(&HookType::TurnStart)); - assert!(supported.contains(&HookType::TurnEnd)); - assert!(supported.contains(&HookType::PreToolUse)); - assert!(supported.contains(&HookType::PostToolUse)); - // Codex does NOT support SessionEnd - assert!(!supported.contains(&HookType::SessionEnd)); - } - - #[test] - fn test_supported_hooks_count() { - let hook = make_hook(); - assert_eq!(hook.supported_hooks().len(), 5); + let hooks = make_hook().supported_hooks(); + assert_eq!(hooks.len(), 5); + assert!(hooks.contains(&HookType::SessionStart)); + assert!(hooks.contains(&HookType::TurnStart)); + assert!(hooks.contains(&HookType::TurnEnd)); + assert!(hooks.contains(&HookType::PreToolUse)); + assert!(hooks.contains(&HookType::PostToolUse)); + assert!(!hooks.contains(&HookType::SessionEnd)); } #[test] fn test_hook_verbs() { let hook = make_hook(); let verbs = hook.hook_verbs(); - assert_eq!(verbs.len(), 5); - assert!(verbs.contains(&"session-start")); - assert!(verbs.contains(&"user-prompt-submit")); - assert!(verbs.contains(&"stop")); - assert!(verbs.contains(&"post-tool")); - assert!(verbs.contains(&"pre-tool")); + assert_eq!( + verbs, + vec![ + "session-start", + "user-prompt-submit", + "stop", + "pre-tool", + "post-tool" + ] + ); } - // ========================================================================= - // Verb mapping tests - // ========================================================================= - #[test] fn test_verb_to_hook_type() { assert_eq!( @@ -520,319 +707,253 @@ mod tests { Some(HookType::TurnStart) ); assert_eq!(verb_to_hook_type("stop"), Some(HookType::TurnEnd)); - assert_eq!(verb_to_hook_type("post-tool"), Some(HookType::PostToolUse)); assert_eq!(verb_to_hook_type("pre-tool"), Some(HookType::PreToolUse)); + assert_eq!(verb_to_hook_type("post-tool"), Some(HookType::PostToolUse)); + assert_eq!(verb_to_hook_type("session-end"), None); assert_eq!(verb_to_hook_type("unknown"), None); - assert_eq!(verb_to_hook_type(""), None); } - // ========================================================================= - // Parse event tests - // ========================================================================= - #[test] fn test_parse_session_start() { - let hook = make_hook(); let input = br#"{ - "session_id": "sess-abc-123", - "transcript_path": "/tmp/codex-transcript.json", - "cwd": "/home/user/project", - "model": "o3-mini", + "session_id": "sess-123", + "transcript_path": "/tmp/codex.jsonl", + "model": "gpt-5.5", "source": "startup", - "hook_event_name": "SessionStart" + "cwd": "/repo" }"#; - let event = hook.parse_event(HookType::SessionStart, input).unwrap(); - assert_eq!(event.session_id, "sess-abc-123"); + let event = make_hook() + .parse_event(HookType::SessionStart, input) + .unwrap(); + assert_eq!(event.session_id, "sess-123"); assert_eq!(event.event_type, HookType::SessionStart); - assert!(event.transcript_path.is_some()); - assert!(event.raw_json.is_some()); + assert_eq!( + event.transcript_path.as_deref(), + Some(Path::new("/tmp/codex.jsonl")) + ); + let raw = event.raw_json.unwrap(); + assert_eq!(raw["model"], "gpt-5.5"); + assert_eq!(raw["provider"], "openai"); } #[test] - fn test_parse_session_start_extracts_model_in_raw_json() { - let hook = make_hook(); - let input = br#"{"session_id": "s1", "model": "o3-mini"}"#; - let event = hook.parse_event(HookType::SessionStart, input).unwrap(); - let raw = event.raw_json.unwrap(); - assert_eq!(raw.get("model").and_then(|v| v.as_str()), Some("o3-mini")); + fn test_parse_session_start_uses_thread_id_fallback() { + let input = br#"{"thread_id": "thread-123"}"#; + let event = make_hook() + .parse_event(HookType::SessionStart, input) + .unwrap(); + assert_eq!(event.session_id, "thread-123"); } #[test] - fn test_parse_user_prompt_submit() { - let hook = make_hook(); - let input = br#"{ - "session_id": "sess-abc-123", - "transcript_path": "/tmp/t.json", - "cwd": "/home/user/project", - "model": "o3-mini", - "prompt": "Fix the login bug in auth.rs", - "turn_id": "turn-001" - }"#; - let event = hook.parse_event(HookType::TurnStart, input).unwrap(); - assert_eq!(event.session_id, "sess-abc-123"); - assert_eq!(event.event_type, HookType::TurnStart); - assert_eq!( - event.prompt.as_deref(), - Some("Fix the login bug in auth.rs") - ); + fn test_parse_session_start_generates_fallback_session_id() { + let input = br#"{"cwd": "/repo"}"#; + let event = make_hook() + .parse_event(HookType::SessionStart, input) + .unwrap(); + assert!(event.session_id.starts_with("codex-")); } #[test] - fn test_parse_stop() { - let hook = make_hook(); + fn test_parse_user_prompt_submit() { let input = br#"{ - "session_id": "sess-abc-123", - "transcript_path": "/tmp/t.json", - "cwd": "/home/user/project", - "model": "o3-mini", - "turn_id": "turn-001", - "stop_hook_active": true, - "last_assistant_message": "Done! I fixed the bug." + "session_id": "sess-123", + "prompt": "fix the hook", + "model": "gpt-5.5" }"#; - let event = hook.parse_event(HookType::TurnEnd, input).unwrap(); - assert_eq!(event.session_id, "sess-abc-123"); - assert_eq!(event.event_type, HookType::TurnEnd); + let event = make_hook().parse_event(HookType::TurnStart, input).unwrap(); + assert_eq!(event.session_id, "sess-123"); + assert_eq!(event.event_type, HookType::TurnStart); + assert_eq!(event.prompt.as_deref(), Some("fix the hook")); + assert_eq!(event.raw_json.unwrap()["provider"], "openai"); } #[test] - fn test_parse_pre_tool_use() { - let hook = make_hook(); - let input = br#"{ - "session_id": "sess-abc-123", - "tool_name": "Bash", - "tool_use_id": "tool-42", - "tool_input": {"command": "ls -la"} - }"#; - let event = hook.parse_event(HookType::PreToolUse, input).unwrap(); - assert_eq!(event.session_id, "sess-abc-123"); - assert_eq!(event.event_type, HookType::PreToolUse); - assert_eq!(event.tool_name.as_deref(), Some("Bash")); - assert_eq!(event.tool_use_id.as_deref(), Some("tool-42")); + fn test_parse_user_prompt_submit_empty_prompt_is_none() { + let input = br#"{"session_id": "sess-123", "prompt": ""}"#; + let event = make_hook().parse_event(HookType::TurnStart, input).unwrap(); + assert!(event.prompt.is_none()); } #[test] - fn test_parse_post_tool_use() { - let hook = make_hook(); + fn test_parse_stop_preserves_assistant_summary_and_finish_reason() { let input = br#"{ - "session_id": "sess-abc-123", - "tool_name": "Bash", - "tool_use_id": "tool-42", - "tool_input": {"command": "ls -la"}, - "tool_response": {"output": "total 0\ndrwxr-xr-x 2 user user 64 Jan 1 00:00 ."} + "session_id": "sess-123", + "last_assistant_message": "Updated the parser and tests.", + "stop_hook_active": false, + "model": "gpt-5.5" }"#; - let event = hook.parse_event(HookType::PostToolUse, input).unwrap(); - assert_eq!(event.session_id, "sess-abc-123"); - assert_eq!(event.event_type, HookType::PostToolUse); - assert_eq!(event.tool_name.as_deref(), Some("Bash")); - assert_eq!(event.tool_use_id.as_deref(), Some("tool-42")); + let event = make_hook().parse_event(HookType::TurnEnd, input).unwrap(); + assert_eq!(event.session_id, "sess-123"); + assert_eq!(event.event_type, HookType::TurnEnd); + let raw = event.raw_json.unwrap(); + assert_eq!( + raw["last_assistant_message"], + "Updated the parser and tests." + ); + assert_eq!(raw["finish_reason"], "stop"); } #[test] - fn test_parse_session_end_unsupported() { - let hook = make_hook(); - let input = br#"{"session_id": "s1"}"#; - let result = hook.parse_event(HookType::SessionEnd, input); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(err.to_string().contains("SessionEnd")); + fn test_parse_stop_active_infers_tool_calls_finish_reason() { + let input = br#"{"session_id": "sess-123", "stop_hook_active": true}"#; + let event = make_hook().parse_event(HookType::TurnEnd, input).unwrap(); + assert_eq!(event.raw_json.unwrap()["finish_reason"], "tool-calls"); } #[test] - fn test_parse_empty_input() { - let hook = make_hook(); - let result = hook.parse_event(HookType::SessionStart, b""); - assert!(result.is_err()); + fn test_parse_pre_tool() { + let input = br#"{ + "session_id": "sess-123", + "tool_name": "exec_command", + "tool_use_id": "call-1", + "tool_input": {"cmd": "cargo test"} + }"#; + let event = make_hook() + .parse_event(HookType::PreToolUse, input) + .unwrap(); + assert_eq!(event.session_id, "sess-123"); + assert_eq!(event.event_type, HookType::PreToolUse); + assert_eq!(event.tool_name.as_deref(), Some("exec_command")); + assert_eq!(event.tool_use_id.as_deref(), Some("call-1")); + assert_eq!(event.raw_json.unwrap()["tool_input"]["cmd"], "cargo test"); } #[test] - fn test_parse_invalid_json() { - let hook = make_hook(); - let result = hook.parse_event(HookType::SessionStart, b"not json at all"); - assert!(result.is_err()); + fn test_parse_pre_tool_accepts_tool_call_id_alias() { + let input = br#"{ + "session_id": "sess-123", + "tool_name": "exec_command", + "tool_call_id": "call-2" + }"#; + let event = make_hook() + .parse_event(HookType::PreToolUse, input) + .unwrap(); + assert_eq!(event.tool_use_id.as_deref(), Some("call-2")); } #[test] - fn test_parse_missing_session_id_generates_fallback() { - let hook = make_hook(); - let input = br#"{"transcript_path": "/tmp/t.json"}"#; - let event = hook.parse_event(HookType::SessionStart, input).unwrap(); - assert!(event.session_id.starts_with("codex-")); + fn test_parse_post_tool_normalizes_tool_response_for_provenance() { + let input = br#"{ + "session_id": "sess-123", + "tool_name": "exec_command", + "tool_use_id": "call-3", + "tool_input": {"cmd": "cargo test", "file_path": "src/lib.rs"}, + "tool_response": {"output": "ok", "success": true, "duration_ms": 42} + }"#; + let event = make_hook() + .parse_event(HookType::PostToolUse, input) + .unwrap(); + assert_eq!(event.session_id, "sess-123"); + assert_eq!(event.event_type, HookType::PostToolUse); + assert_eq!(event.tool_name.as_deref(), Some("exec_command")); + assert_eq!(event.tool_use_id.as_deref(), Some("call-3")); + let raw = event.raw_json.unwrap(); + assert_eq!(raw["tool_output"], "ok"); + assert_eq!(raw["status"], "completed"); + assert_eq!(raw["duration"], 42); + assert_eq!(raw["file_path"], "src/lib.rs"); } #[test] - fn test_parse_empty_session_id_generates_fallback() { - let hook = make_hook(); - let input = br#"{"session_id": ""}"#; - let event = hook.parse_event(HookType::SessionStart, input).unwrap(); - assert!(event.session_id.starts_with("codex-")); + fn test_parse_post_tool_normalizes_error_response() { + let input = br#"{ + "session_id": "sess-123", + "tool_name": "exec_command", + "tool_response": {"error": "failed"} + }"#; + let event = make_hook() + .parse_event(HookType::PostToolUse, input) + .unwrap(); + assert_eq!(event.raw_json.unwrap()["status"], "error"); } #[test] - fn test_parse_minimal_input() { - let hook = make_hook(); - // Codex sends at least {} — all fields are optional with serde(default) - let input = br#"{}"#; - let event = hook.parse_event(HookType::SessionStart, input).unwrap(); - assert!(event.session_id.starts_with("codex-")); - assert_eq!(event.event_type, HookType::SessionStart); + fn test_parse_session_end_is_unsupported() { + let result = make_hook().parse_event(HookType::SessionEnd, br#"{"session_id":"s"}"#); + assert!(matches!(result, Err(AgentError::HookParseFailed { .. }))); } #[test] - fn test_parse_turn_start_no_prompt() { - let hook = make_hook(); - let input = br#"{"session_id": "s1"}"#; - let event = hook.parse_event(HookType::TurnStart, input).unwrap(); - assert_eq!(event.session_id, "s1"); - assert!(event.prompt.is_none()); + fn test_parse_event_empty_input() { + let result = make_hook().parse_event(HookType::TurnEnd, b""); + assert!(matches!(result, Err(AgentError::HookInputEmpty { .. }))); } #[test] - fn test_parse_post_tool_no_tool_name() { - let hook = make_hook(); - let input = br#"{"session_id": "s1"}"#; - let event = hook.parse_event(HookType::PostToolUse, input).unwrap(); - assert_eq!(event.session_id, "s1"); - assert!(event.tool_name.is_none()); + fn test_parse_event_invalid_json() { + let result = make_hook().parse_event(HookType::TurnEnd, b"not-json"); + assert!(matches!(result, Err(AgentError::HookParseFailed { .. }))); } #[test] - fn test_parse_pre_tool_no_tool_use_id() { - let hook = make_hook(); - let input = br#"{"session_id": "s1", "tool_name": "Bash"}"#; - let event = hook.parse_event(HookType::PreToolUse, input).unwrap(); - assert_eq!(event.session_id, "s1"); - assert_eq!(event.tool_name.as_deref(), Some("Bash")); - assert!(event.tool_use_id.is_none()); + fn test_detect_presence_with_local_codex_dir() { + let dir = TempDir::new().unwrap(); + std::fs::create_dir(dir.path().join(CODEX_DIR)).unwrap(); + assert!(make_hook().detect_presence(dir.path())); } - // ========================================================================= - // Detection tests - // ========================================================================= - #[test] - fn test_detect_presence_with_codex_dir() { - let dir = tempfile::tempdir().unwrap(); - std::fs::create_dir_all(dir.path().join(".codex")).unwrap(); + fn test_local_install_is_idempotent_and_preserves_other_hooks() { + let dir = TempDir::new().unwrap(); + let hooks_path = dir.path().join(CODEX_DIR).join(HOOKS_FILE); + std::fs::create_dir_all(hooks_path.parent().unwrap()).unwrap(); + std::fs::write( + &hooks_path, + r#"{"hooks":{"Stop":[{"hooks":[{"type":"command","command":"custom stop"}]}]},"other":true}"#, + ) + .unwrap(); let hook = make_hook(); - assert!(hook.detect_presence(dir.path())); - } + assert_eq!(hook.install(dir.path()).unwrap(), 5); + assert!(hook.is_installed(dir.path())); + assert_eq!(hook.install(dir.path()).unwrap(), 0); - #[test] - fn test_detect_presence_without_codex_dir() { - let dir = tempfile::tempdir().unwrap(); - let hook = make_hook(); - assert!(!hook.detect_presence(dir.path())); - } - - // ========================================================================= - // Install / uninstall are no-ops (managed by atomic-codex package) - // ========================================================================= - - #[test] - fn test_install_is_noop() { - let dir = tempfile::tempdir().unwrap(); - let hook = make_hook(); - let count = hook.install(dir.path()).unwrap(); - assert_eq!(count, 0); - } - - #[test] - fn test_uninstall_is_noop() { - let dir = tempfile::tempdir().unwrap(); - let hook = make_hook(); - assert!(hook.uninstall(dir.path()).is_ok()); + let content = std::fs::read_to_string(&hooks_path).unwrap(); + assert!(content.contains("custom stop")); + assert!(content.contains("atomic agent hooks codex session-start")); + assert!(content.contains("atomic agent hooks codex user-prompt-submit")); + assert!(content.contains("atomic agent hooks codex stop")); + assert!(content.contains("atomic agent hooks codex pre-tool")); + assert!(content.contains("atomic agent hooks codex post-tool")); + assert!(content.contains("\"other\": true")); } #[test] - fn test_is_installed_always_false() { - let dir = tempfile::tempdir().unwrap(); + fn test_uninstall_removes_only_atomic_hooks() { + let dir = TempDir::new().unwrap(); let hook = make_hook(); - assert!(!hook.is_installed(dir.path())); - } - - // ========================================================================= - // extract_session_id tests - // ========================================================================= - - #[test] - fn test_extract_session_id_with_value() { - let id = CodexHook::extract_session_id(Some("sess-123".to_string())); - assert_eq!(id, "sess-123"); - } - - #[test] - fn test_extract_session_id_empty_generates_fallback() { - let id = CodexHook::extract_session_id(Some("".to_string())); - assert!(id.starts_with("codex-")); - } - - #[test] - fn test_extract_session_id_none_generates_fallback() { - let id = CodexHook::extract_session_id(None); - assert!(id.starts_with("codex-")); - } - - // ========================================================================= - // hook_command_exists tests - // ========================================================================= - - // ========================================================================= - // uuid_short helper tests - // ========================================================================= + hook.install(dir.path()).unwrap(); + let hooks_path = dir.path().join(CODEX_DIR).join(HOOKS_FILE); + let mut config = read_hooks_file(&hooks_path).unwrap(); + add_hook_to_groups( + config + .hooks + .get_mut("Stop") + .and_then(Value::as_array_mut) + .unwrap(), + "custom stop", + None, + ); + write_hooks_file(&hooks_path, &config).unwrap(); - #[test] - fn test_uuid_short_format() { - let id = uuid_short(); - assert_eq!(id.len(), 8); - assert!(id.chars().all(|c| c.is_ascii_hexdigit())); + hook.uninstall(dir.path()).unwrap(); + let content = std::fs::read_to_string(&hooks_path).unwrap(); + assert!(!content.contains("atomic agent hooks codex")); + assert!(content.contains("custom stop")); } #[test] - fn test_uuid_short_not_all_zeros() { - let id = uuid_short(); - assert_ne!(id, "00000000"); + fn test_uninstall_missing_file_is_ok() { + let dir = TempDir::new().unwrap(); + make_hook().uninstall(dir.path()).unwrap(); } - // ========================================================================= - // Roundtrip / integration tests - // ========================================================================= - #[test] - fn test_parse_all_events_roundtrip() { - let hook = make_hook(); - - // SessionStart - let input = br#"{"session_id": "s1", "model": "o3-mini"}"#; - let event = hook.parse_event(HookType::SessionStart, input).unwrap(); - assert_eq!(event.session_id, "s1"); - - // TurnStart (UserPromptSubmit) - let input = br#"{"session_id": "s1", "prompt": "do something"}"#; - let event = hook.parse_event(HookType::TurnStart, input).unwrap(); - assert_eq!(event.prompt.as_deref(), Some("do something")); - - // TurnEnd (Stop) - let input = br#"{"session_id": "s1", "stop_hook_active": true}"#; - let event = hook.parse_event(HookType::TurnEnd, input).unwrap(); - assert_eq!(event.session_id, "s1"); - - // PreToolUse - let input = br#"{"session_id": "s1", "tool_name": "Bash"}"#; - let event = hook.parse_event(HookType::PreToolUse, input).unwrap(); - assert_eq!(event.tool_name.as_deref(), Some("Bash")); - - // PostToolUse - let input = - br#"{"session_id": "s1", "tool_name": "Bash", "tool_response": {"output": "ok"}}"#; - let event = hook.parse_event(HookType::PostToolUse, input).unwrap(); - assert_eq!(event.tool_name.as_deref(), Some("Bash")); - } - - #[test] - fn test_debug_impl() { - let hook = make_hook(); - let debug = format!("{:?}", hook); - assert!(debug.contains("CodexHook")); + fn test_force_install_rewrites_atomic_hooks() { + let dir = TempDir::new().unwrap(); + let hooks_path = dir.path().join(CODEX_DIR).join(HOOKS_FILE); + install_hooks_at(&hooks_path, false).unwrap(); + assert_eq!(install_hooks_at(&hooks_path, true).unwrap(), 5); } } diff --git a/atomic-cli/src/commands/agent/enable.rs b/atomic-cli/src/commands/agent/enable.rs index d672f65..3278837 100644 --- a/atomic-cli/src/commands/agent/enable.rs +++ b/atomic-cli/src/commands/agent/enable.rs @@ -246,8 +246,10 @@ impl Enable { /// Supports: /// - `claude-code` → `~/.claude/settings.json` /// - `gemini-cli` → `~/.gemini/settings.json` + /// - `codex` → `~/.codex/hooks.json` fn run_global(&self) -> CliResult<()> { use atomic_agent::hooks::claude_code::ClaudeCodeHook; + use atomic_agent::hooks::codex::CodexHook; use atomic_agent::hooks::gemini_cli::GeminiCliHook; let agent_name = self.agent.as_deref().unwrap_or("claude-code"); @@ -319,9 +321,42 @@ impl Enable { } } + "codex" => { + let hook = CodexHook::new(); + + if !self.force && hook.is_installed_global() { + print_success("Global hooks already installed in ~/.codex/hooks.json."); + println!(" Use --force to reinstall."); + return Ok(()); + } + + match hook.install_global(self.force) { + Ok(count) if count > 0 => { + print_success(&format!( + "Installed {} global hook{} for Codex", + count, + if count == 1 { "" } else { "s" }, + )); + println!(); + println!("Hooks written to: ~/.codex/hooks.json"); + println!(); + println!("Every Codex session in a project with .atomic/ will now:"); + println!(" • Record each turn as an Atomic change with full provenance"); + println!(" • Track session metadata (turn number, timing, files)"); + println!(" • Capture tool calls through pre/post tool hooks"); + } + Ok(_) => { + print_success("Global hooks already up to date."); + } + Err(e) => { + print_error(&format!("Failed to install global hooks: {}", e)); + } + } + } + other => { print_warning(&format!( - "Global install is not supported for '{}'. Supported agents: claude-code, gemini-cli", + "Global install is not supported for '{}'. Supported agents: claude-code, gemini-cli, codex", other )); } diff --git a/atomic-cli/src/commands/change/command.rs b/atomic-cli/src/commands/change/command.rs index cf00aff..04acc6f 100644 --- a/atomic-cli/src/commands/change/command.rs +++ b/atomic-cli/src/commands/change/command.rs @@ -447,15 +447,15 @@ impl ChangeCmd { count_unique_paths(&change.hashed.hunks) )); - // Always show hunks with atom details - for graph_op in &change.hashed.hunks { - let (symbol, path) = hunk_symbol_and_path(graph_op); - let atom_info = hunk_atom_info(graph_op); + // Show one summary row per path. Git imports and large records can + // legitimately contain thousands of graph ops for a small number + // of files; printing each op makes the change view unusable. + for summary in hunk_display_summaries(&change.hashed.hunks) { output.push_str(&format!( " {} {} {}\n", - symbol, - style_path(&path), - hint(&atom_info) + summary.symbol, + style_path(&summary.path), + hint(&summary.info) )); } } @@ -832,3 +832,84 @@ fn hunk_symbol_and_path(graph_op: &GraphOp) -> (&'static str, String) { GraphOp::DelRoot { .. } => ("⊘", "(root)".to_string()), } } + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct HunkDisplaySummary { + pub symbol: &'static str, + pub path: String, + pub info: String, +} + +#[derive(Debug, Clone)] +struct HunkDisplayAggregate { + symbol: &'static str, + total: usize, + infos: std::collections::BTreeMap, +} + +pub(crate) fn hunk_display_summaries(hunks: &[GraphOp]) -> Vec { + let mut by_path: std::collections::BTreeMap = + std::collections::BTreeMap::new(); + + for graph_op in hunks { + let (symbol, path) = hunk_symbol_and_path(graph_op); + let info = hunk_atom_info(graph_op); + let aggregate = by_path.entry(path).or_insert_with(|| HunkDisplayAggregate { + symbol, + total: 0, + infos: std::collections::BTreeMap::new(), + }); + aggregate.symbol = merge_hunk_symbols(aggregate.symbol, symbol); + aggregate.total += 1; + *aggregate.infos.entry(info).or_insert(0) += 1; + } + + by_path + .into_iter() + .map(|(path, aggregate)| HunkDisplaySummary { + symbol: aggregate.symbol, + path, + info: format_hunk_aggregate_info(aggregate.total, &aggregate.infos), + }) + .collect() +} + +fn merge_hunk_symbols(current: &'static str, next: &'static str) -> &'static str { + if current == next { + return current; + } + if current == "±" || next == "±" { + return "±"; + } + match (current, next) { + ("+", "~") | ("~", "+") | ("+", "-") | ("-", "+") | ("~", "-") | ("-", "~") => "±", + ("📁+", "📁-") | ("📁-", "📁+") => "±", + _ => current, + } +} + +fn format_hunk_aggregate_info( + total: usize, + infos: &std::collections::BTreeMap, +) -> String { + if total == 1 { + return infos + .keys() + .next() + .cloned() + .unwrap_or_else(|| "(1 hunk)".to_string()); + } + + let details = infos + .iter() + .map(|(info, count)| format!("{}x {}", count, trim_hunk_info(info))) + .collect::>() + .join("; "); + format!("({} hunks: {})", total, details) +} + +fn trim_hunk_info(info: &str) -> &str { + info.strip_prefix('(') + .and_then(|s| s.strip_suffix(')')) + .unwrap_or(info) +} diff --git a/atomic-cli/src/commands/change/tests.rs b/atomic-cli/src/commands/change/tests.rs index 6355eda..a9448b8 100644 --- a/atomic-cli/src/commands/change/tests.rs +++ b/atomic-cli/src/commands/change/tests.rs @@ -3,7 +3,9 @@ use super::*; mod tests { use super::*; use atomic_core::change::ChangeHeader; - use atomic_core::types::Merkle; + use atomic_core::change::{Atom, Encoding, Insertion, Local}; + use atomic_core::types::{ChangePosition, Merkle, Position}; + use atomic_core::EdgeFlags; // ChangeFormat Tests @@ -359,6 +361,23 @@ mod tests { ) } + fn test_edit_hunk(path: &str, start: u64, end: u64) -> GraphOp { + let change = Hash::of(format!("{}:{}:{}", path, start, end).as_bytes()); + let inode = Position::new(change, ChangePosition::new(0)); + GraphOp::Edit { + change: Atom::Insertion(Insertion { + predecessors: Vec::new(), + successors: Vec::new(), + flag: EdgeFlags::BLOCK, + start: ChangePosition::new(start), + end: ChangePosition::new(end), + inode, + }), + local: Local::new(path, 1), + encoding: Some(Encoding::Utf8), + } + } + #[test] fn test_json_change_from_change() { let change = create_test_change(); @@ -476,6 +495,23 @@ mod tests { assert!(parsed.get("sequence").is_none() || parsed["sequence"].is_null()); } + #[test] + fn test_hunk_display_summaries_coalesce_same_path() { + let hunks = vec![ + test_edit_hunk("src/main.rs", 0, 5), + test_edit_hunk("src/main.rs", 5, 10), + test_edit_hunk("src/lib.rs", 10, 15), + ]; + + let summaries = hunk_display_summaries(&hunks); + + assert_eq!(summaries.len(), 2); + assert_eq!(summaries[0].path, "src/lib.rs"); + assert_eq!(summaries[0].info, "(+1 span: new content)"); + assert_eq!(summaries[1].path, "src/main.rs"); + assert_eq!(summaries[1].info, "(2 hunks: 2x +1 span: new content)"); + } + // Integration Tests use serial_test::serial; diff --git a/atomic-cli/src/commands/diff/helpers.rs b/atomic-cli/src/commands/diff/helpers.rs index 05b23ab..1abb4bf 100644 --- a/atomic-cli/src/commands/diff/helpers.rs +++ b/atomic-cli/src/commands/diff/helpers.rs @@ -47,6 +47,14 @@ impl Diff { hash: change_ref.to_string(), })?; + // Git-imported changes carry Git's captured +/- lines in unhashed + // metadata. Use that directly for review output before considering + // FileOps or the expensive graph reconstruction fallback. Graph-first + // imported changes may intentionally have no FileOps yet. + if let Some((file_diffs, stats)) = Self::build_git_import_file_diffs(&change) { + return self.print_change_file_diffs(&change, &hash, config, file_diffs, stats); + } + // Check if change has semantic layer (file_ops) if change.has_file_ops() { // Use the semantic layer for human-readable diff @@ -67,24 +75,6 @@ impl Diff { change_hash: &Hash, config: &DiffOutputConfig, ) -> CliResult<()> { - if let Some((file_diffs, stats)) = Self::build_git_import_file_diffs(change) { - if file_diffs.is_empty() { - self.print_no_changes(); - return Ok(()); - } - - if config.format == DiffFormat::Unified { - self.print_change_header(change, change_hash, config); - } - - return match config.format { - DiffFormat::Unified => self.print_unified(&file_diffs, config), - DiffFormat::Stat => self.print_stat(&stats, config), - DiffFormat::NameOnly => self.print_name_only(&file_diffs), - DiffFormat::NameStatus => self.print_name_status(&file_diffs, config), - }; - } - let file_ops = change.file_ops(); if file_ops.is_empty() { @@ -246,12 +236,26 @@ impl Diff { return Ok(()); } - // Print change header information + self.print_change_file_diffs(change, change_hash, config, file_diffs, stats) + } + + fn print_change_file_diffs( + &self, + change: &Change, + change_hash: &Hash, + config: &DiffOutputConfig, + file_diffs: Vec, + stats: DiffStats, + ) -> CliResult<()> { + if file_diffs.is_empty() { + self.print_no_changes(); + return Ok(()); + } + if config.format == DiffFormat::Unified { self.print_change_header(change, change_hash, config); } - // Print in the appropriate format match config.format { DiffFormat::Unified => self.print_unified(&file_diffs, config), DiffFormat::Stat => self.print_stat(&stats, config), @@ -291,7 +295,9 @@ impl Diff { .is_some() } - fn build_git_import_file_diffs(change: &Change) -> Option<(Vec, DiffStats)> { + pub(super) fn build_git_import_file_diffs( + change: &Change, + ) -> Option<(Vec, DiffStats)> { let diff_files_value = change .unhashed .as_ref()? diff --git a/atomic-cli/src/commands/diff/tests.rs b/atomic-cli/src/commands/diff/tests.rs index b321d0e..c58a954 100644 --- a/atomic-cli/src/commands/diff/tests.rs +++ b/atomic-cli/src/commands/diff/tests.rs @@ -624,6 +624,41 @@ mod tests { assert_eq!(diff.change, Some("abc123".to_string())); } + #[test] + fn test_git_import_diff_metadata_without_file_ops_builds_file_diff() { + use atomic_core::change::ChangeHeader; + use atomic_core::record::workflow::GitDiffLine; + + let mut change = Change::empty(ChangeHeader::new("Imported graph-first change")); + change.unhashed = Some(serde_json::json!({ + "git": { + "sha": "1234567890abcdef", + "diff_lines": [{ + "path": "src/lib.rs", + "lines": [ + GitDiffLine { + origin: '+', + content: b"fn imported() {}\n".to_vec(), + old_lineno: None, + new_lineno: Some(1), + } + ] + }] + } + })); + + assert!(!change.has_file_ops()); + + let (file_diffs, stats) = Diff::build_git_import_file_diffs(&change).unwrap(); + + assert_eq!(file_diffs.len(), 1); + assert_eq!(file_diffs[0].new_path, "src/lib.rs"); + assert_eq!(file_diffs[0].hunks.len(), 1); + assert_eq!(file_diffs[0].hunks[0].lines.len(), 1); + assert_eq!(file_diffs[0].stats.insertions, 1); + assert_eq!(stats.total_insertions(), 1); + } + #[test] fn test_diff_with_algorithm() { let diff = Diff::new().with_algorithm("patience"); diff --git a/atomic-cli/src/commands/doctor.rs b/atomic-cli/src/commands/doctor.rs index 2f9a59c..17914c1 100644 --- a/atomic-cli/src/commands/doctor.rs +++ b/atomic-cli/src/commands/doctor.rs @@ -1,5 +1,6 @@ //! Repository diagnostic and repair commands. +use atomic_repository::CrdtMaterializeOptions; use clap::{Args, Subcommand}; use crate::commands::{require_repository, Command}; @@ -23,6 +24,14 @@ pub enum DoctorCommands { /// without repeatedly loading change files. #[command(name = "repair-dependency-index")] RepairDependencyIndex(RepairDependencyIndex), + + /// Materialize stored FileOps into CRDT semantic tables. + /// + /// This is the second phase of graph-first Git import: the graph is already + /// written, and this command builds CRDT tables from graph-linked FileOps + /// stored in the imported changes. + #[command(name = "materialize-crdt")] + MaterializeCrdt(MaterializeCrdt), } /// Rebuild the normal change dependency index from stored changes. @@ -33,10 +42,23 @@ pub struct RepairDependencyIndex { pub force: bool, } +/// Build CRDT tables from stored change FileOps. +#[derive(Debug, Args, Default)] +pub struct MaterializeCrdt { + /// View to materialize. Defaults to the current view. + #[arg(long)] + pub view: Option, + + /// Re-apply even when a trunk row already exists. + #[arg(long)] + pub force: bool, +} + impl Command for Doctor { fn run(&self) -> CliResult<()> { match &self.command { DoctorCommands::RepairDependencyIndex(cmd) => cmd.run(), + DoctorCommands::MaterializeCrdt(cmd) => cmd.run(), } } } @@ -69,6 +91,61 @@ impl Command for RepairDependencyIndex { } } +impl Command for MaterializeCrdt { + fn run(&self) -> CliResult<()> { + let repo = require_repository(None)?; + let view = self + .view + .clone() + .unwrap_or_else(|| repo.current_view().to_string()); + + print_info(&format!("Materializing CRDT tables for view '{}'...", view)); + if self.force { + print_warning("--force enabled: existing CRDT trunk rows may be overwritten"); + } + + let outcome = repo.materialize_crdt_from_changes(CrdtMaterializeOptions { + view: Some(view), + force: self.force, + })?; + + print_success(&format!( + "CRDT materialization complete in {:.1}s: {} changes scanned, {} changes applied, {} FileOps applied, {} already materialized, {} skipped", + outcome.elapsed_ms as f64 / 1000.0, + outcome.changes_scanned, + outcome.changes_applied, + outcome.file_ops_applied, + outcome.file_ops_already_materialized, + outcome.file_ops_skipped + )); + print_hint(&format!( + "CRDT rows: trunks +{}, branches +{}, leaves +{}", + outcome.stats.trunks_created, + outcome.stats.branches_created, + outcome.stats.leaves_created + )); + if outcome.file_ops_skipped > 0 { + print_hint(&format!( + "Skipped FileOps: non_create={}, unresolved_path={}, unresolved_line={}, missing_range={}, non_insert_branch={}, non_insert_leaf={}", + outcome.skip_stats.non_create_trunk, + outcome.skip_stats.unresolved_path, + outcome.skip_stats.unresolved_line, + outcome.skip_stats.missing_content_range, + outcome.skip_stats.non_insert_branch, + outcome.skip_stats.non_insert_leaf + )); + if !outcome.skip_samples.is_empty() { + print_hint(&format!( + "Skip samples: {}", + outcome.skip_samples.join(", ") + )); + } + } + + Ok(()) + } +} + #[cfg(test)] mod tests { use super::*; @@ -78,4 +155,11 @@ mod tests { let cmd = RepairDependencyIndex::default(); assert!(!cmd.force); } + + #[test] + fn materialize_crdt_defaults_to_current_view_non_force() { + let cmd = MaterializeCrdt::default(); + assert!(cmd.view.is_none()); + assert!(!cmd.force); + } } diff --git a/atomic-cli/src/commands/git/import.rs b/atomic-cli/src/commands/git/import.rs index 864d389..d895f32 100644 --- a/atomic-cli/src/commands/git/import.rs +++ b/atomic-cli/src/commands/git/import.rs @@ -24,6 +24,7 @@ //! - Merge commits are linearized (first parent only) use std::collections::HashSet; +use std::path::Path; use clap::Parser; use git2::{Repository as GitRepository, Sort}; @@ -84,6 +85,38 @@ pub struct Import { pub no_vault: bool, } +fn import_ignore_patterns(workdir: &Path, kind: Option<&str>) -> Vec { + let template = if let Some(kind) = kind { + super::super::init::get_ignore_template(kind) + } else if workdir.join("Cargo.toml").exists() { + super::super::init::get_ignore_template("rust") + } else if workdir.join("package.json").exists() { + super::super::init::get_ignore_template("node") + } else if workdir.join("go.mod").exists() { + super::super::init::get_ignore_template("go") + } else if workdir.join("setup.py").exists() || workdir.join("pyproject.toml").exists() { + super::super::init::get_ignore_template("python") + } else { + None + }; + + template + .unwrap_or(".atomic\n.git\n") + .lines() + .map(str::trim) + .filter(|line| !line.is_empty() && !line.starts_with('#')) + .map(ToOwned::to_owned) + .collect() +} + +fn current_git_branch(git_repo: &GitRepository) -> Option { + let head = git_repo.head().ok()?; + if !head.is_branch() { + return None; + } + head.shorthand().map(ToOwned::to_owned) +} + impl Import { /// Import a single branch into an Atomic view using parallel processing. fn import_branch( @@ -101,6 +134,10 @@ impl Import { incremental: self.incremental, imported_shas: imported_shas.clone(), repo_name, + ignored_path_patterns: import_ignore_patterns( + git_repo.workdir().unwrap_or_else(|| repo.root()), + self.kind.as_deref(), + ), }; let importer = ParallelImporter::new(git_repo, options); @@ -187,12 +224,8 @@ impl Import { /// Get the default branch name. fn get_default_branch(&self, git_repo: &GitRepository) -> CliResult { // Try HEAD first (current branch) - if let Ok(head) = git_repo.head() { - if head.is_branch() { - if let Some(name) = head.shorthand() { - return Ok(name.to_string()); - } - } + if let Some(name) = current_git_branch(git_repo) { + return Ok(name); } // Fall back to common default names @@ -334,13 +367,7 @@ impl Command for Import { Ok(result) => print_info(&format!("Materialized {} files", result.files_written)), Err(e) => print_warning(&format!("Working copy materialization failed: {}", e)), } - - // Restore files from git to fix import fidelity issues. - // The graph reconstruction may produce slightly wrong content - // for some files (hunk misalignment across thousands of commits). - // Git has the authoritative content — restore from it and update - // the FILE_INDEX so atomic status sees them as clean. - restore_from_git_and_reindex(&repo, &git_repo); + reindex_working_copy(&repo); // Initialize .atomicignore + vault AFTER import + materialize. // Must be before KG enrichment so has_vault() returns true. @@ -394,16 +421,21 @@ impl Command for Import { // Import let count = self.import_branch(&git_repo, &branch_name, &mut repo, &imported_shas)?; - // Materialize the working copy from the graph - print_info("Materializing working copy..."); - match repo.materialize() { - Ok(result) => print_info(&format!("Materialized {} files", result.files_written)), - Err(e) => print_warning(&format!("Working copy materialization failed: {}", e)), + if current_git_branch(&git_repo).as_deref() == Some(branch_name.as_str()) { + print_info("Using Git working copy as imported materialization."); + reindex_working_copy(&repo); + } else { + // Importing a non-checked-out branch must update disk from Atomic. + print_info("Materializing working copy..."); + match repo.materialize() { + Ok(result) => { + print_info(&format!("Materialized {} files", result.files_written)) + } + Err(e) => print_warning(&format!("Working copy materialization failed: {}", e)), + } + reindex_working_copy(&repo); } - // Restore files from git to fix import fidelity issues. - restore_from_git_and_reindex(&repo, &git_repo); - // Initialize .atomicignore + vault AFTER import + materialize if !repo_exists { init_atomicignore_and_vault( @@ -441,59 +473,41 @@ impl Command for Import { } } -/// Restore working copy files from git and rebuild FILE_INDEX. +/// Rebuild FILE_INDEX from the current working copy. /// -/// After materialize, some files may have slightly wrong content due to -/// graph reconstruction fidelity issues. Git is the source of truth for -/// file content — restore from it, then update the FILE_INDEX so that -/// `atomic status` reports the working copy as clean. -fn restore_from_git_and_reindex(repo: &Repository, git_repo: &GitRepository) { +/// During normal single-branch Git import the files on disk are already the +/// authoritative Git checkout for the imported branch, so there is no reason +/// to materialize the same content back out of Atomic. Indexing the tracked +/// files makes the post-import `atomic status` baseline clean. +fn reindex_working_copy(repo: &Repository) { use atomic_core::types::Hash; use std::time::SystemTime; let repo_root = repo.root().to_path_buf(); - - // `git checkout -- .` restores all tracked files to HEAD state - let result = std::process::Command::new("git") - .args(["checkout", "--", "."]) - .current_dir(&repo_root) - .output(); - - match result { - Ok(output) if output.status.success() => { - // Rebuild FILE_INDEX for all tracked files so status is clean. - // Only index files that exist on disk and have graph content. - // Files without graph content (tracked but not recorded) are - // left alone — status will correctly show them as Added. - let tracked = repo.list_tracked_files().unwrap_or_default(); - let mut entries: Vec<(String, i64, u32, u64, Hash)> = Vec::new(); - - for file in &tracked { - let abs = repo_root.join(&file.path); - if let Ok(metadata) = std::fs::metadata(&abs) { - let mtime = metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH); - let duration = mtime - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default(); - if let Ok(bytes) = std::fs::read(&abs) { - entries.push(( - file.path.to_string_lossy().replace('\\', "/"), - duration.as_secs() as i64, - duration.subsec_nanos(), - metadata.len(), - Hash::of(&bytes), - )); - } - } - } - - if !entries.is_empty() { - let _ = repo.update_file_index(&entries); + let tracked = repo.list_tracked_files().unwrap_or_default(); + let mut entries: Vec<(String, i64, u32, u64, Hash)> = Vec::new(); + + for file in &tracked { + let abs = repo_root.join(&file.path); + if let Ok(metadata) = std::fs::metadata(&abs) { + let mtime = metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH); + let duration = mtime + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default(); + if let Ok(bytes) = std::fs::read(&abs) { + entries.push(( + file.path.to_string_lossy().replace('\\', "/"), + duration.as_secs() as i64, + duration.subsec_nanos(), + metadata.len(), + Hash::of(&bytes), + )); } } - _ => { - log::warn!("git checkout failed — some files may show as modified in atomic status"); - } + } + + if !entries.is_empty() { + let _ = repo.update_file_index(&entries); } } diff --git a/atomic-cli/src/commands/git/parallel.rs b/atomic-cli/src/commands/git/parallel.rs index 08ea711..9cbdae5 100644 --- a/atomic-cli/src/commands/git/parallel.rs +++ b/atomic-cli/src/commands/git/parallel.rs @@ -52,7 +52,7 @@ //! - Phase 2 (sequential write): ~5s //! - Total: ~35s vs ~5min with serial approach -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::path::{Path, PathBuf}; use std::process::Command; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -66,12 +66,15 @@ use git2::{ }; use rayon::prelude::*; -use atomic_core::change::{Author, ChangeHeader}; +use atomic_core::change::{ + Atom, Author, Change, ChangeHeader, EdgeUpdate, GraphOp, Insertion, NewEdge, +}; use atomic_core::change::{Encoding, Local}; +use atomic_core::record::workflow::extract_filename; use atomic_core::record::workflow::graph_op::BuiltHunk; use atomic_core::record::workflow::GitDiffLine; use atomic_core::record::workflow::RecordedFile; -use atomic_core::types::Hash as ContentHash; +use atomic_core::types::{ChangePosition, EdgeFlags, GraphNode, Hash as ContentHash, Position}; use atomic_repository::Repository; use crate::error::{CliError, CliResult}; @@ -181,6 +184,10 @@ pub struct ParallelImportOptions { pub imported_shas: HashSet, /// Repository name (from remote URL or directory). pub repo_name: String, + /// Import-time ignore patterns, usually from the detected `.atomicignore` + /// template. These are applied before graph construction so generated + /// build outputs never enter the imported history. + pub ignored_path_patterns: Vec, } impl Default for ParallelImportOptions { @@ -189,10 +196,58 @@ impl Default for ParallelImportOptions { incremental: false, imported_shas: HashSet::new(), repo_name: "unknown".to_string(), + ignored_path_patterns: Vec::new(), } } } +#[derive(Debug, Clone)] +struct ImportIgnoreMatcher { + patterns: Vec, +} + +impl ImportIgnoreMatcher { + fn new(patterns: Vec) -> Self { + let patterns = patterns + .into_iter() + .map(|pattern| pattern.trim().replace('\\', "/")) + .filter(|pattern| !pattern.is_empty() && !pattern.starts_with('#')) + .collect(); + Self { patterns } + } + + fn is_empty(&self) -> bool { + self.patterns.is_empty() + } + + fn matches(&self, path: &str) -> bool { + let normalized = path.trim_start_matches('/').replace('\\', "/"); + let basename = Path::new(&normalized) + .file_name() + .and_then(|name| name.to_str()) + .unwrap_or(&normalized); + + self.patterns.iter().any(|pattern| { + let pattern = pattern.trim_start_matches('/'); + if let Some(dir) = pattern.strip_suffix('/') { + return normalized == dir + || normalized.starts_with(&format!("{dir}/")) + || normalized.contains(&format!("/{dir}/")); + } + + if let Some(suffix) = pattern.strip_prefix("**/*") { + return normalized.ends_with(suffix); + } + + if let Some(suffix) = pattern.strip_prefix('*') { + return basename.ends_with(suffix); + } + + normalized == pattern || basename == pattern + }) + } +} + // ═══════════════════════════════════════════════════════════════════════════ // ParallelImporter // ═══════════════════════════════════════════════════════════════════════════ @@ -204,6 +259,7 @@ impl Default for ParallelImportOptions { pub struct ParallelImporter { git_repo_path: PathBuf, options: ParallelImportOptions, + ignore_matcher: ImportIgnoreMatcher, } fn is_generated_diff_skip_path(path: &str) -> bool { @@ -225,6 +281,154 @@ fn count_line_units(content: &[u8]) -> usize { } } +#[derive(Clone, Debug)] +struct ImportLine { + change: ContentHash, + start: ChangePosition, + end: ChangePosition, + incoming_by: ContentHash, + content: Vec, +} + +impl ImportLine { + fn node(&self) -> GraphNode> { + GraphNode { + change: Some(self.change), + start: self.start, + end: self.end, + } + } + + fn start_pos(&self) -> Position> { + Position { + change: Some(self.change), + pos: self.start, + } + } + + fn end_pos(&self) -> Position> { + Position { + change: Some(self.change), + pos: self.end, + } + } +} + +#[derive(Clone, Debug)] +struct ImportIndexedFile { + inode_pos: Position>, + lines: Vec, +} + +#[derive(Default)] +struct ImportLineIndex { + files: HashMap, +} + +impl ImportLineIndex { + fn update_from_added_change(&mut self, change_hash: ContentHash, change: &Change) { + for graph_op in change.hunks() { + match graph_op { + GraphOp::FileAdd { + add_inode, + contents, + path, + .. + } => { + let inode_pos = Position { + change: Some(change_hash), + pos: add_inode.start, + }; + let mut lines = Vec::new(); + if let Some(contents) = contents { + lines.push(ImportLine { + change: change_hash, + start: contents.start, + end: contents.end, + incoming_by: change_hash, + content: change.contents + [contents.start.as_usize()..contents.end.as_usize()] + .to_vec(), + }); + } + self.files + .insert(path.clone(), ImportIndexedFile { inode_pos, lines }); + } + GraphOp::Edit { + change: Atom::Insertion(insertion), + local, + .. + } => { + if let Some(indexed) = self.files.get_mut(&local.path) { + indexed.lines.push(ImportLine { + change: change_hash, + start: insertion.start, + end: insertion.end, + incoming_by: change_hash, + content: change.contents + [insertion.start.as_usize()..insertion.end.as_usize()] + .to_vec(), + }); + } + } + _ => {} + } + } + } +} + +#[derive(Debug)] +enum PendingLineIndexUpdate { + Add { + path: String, + inode_pos: Position>, + new_ranges: Vec<(ChangePosition, ChangePosition)>, + new_lines: Vec>, + }, + Modify { + path: String, + replacements: Vec, + }, + Rename { + old_path: String, + new_path: String, + }, + Delete { + path: String, + }, +} + +#[derive(Debug)] +struct PendingLineReplacement { + start_idx: usize, + old_len: usize, + new_ranges: Vec<(ChangePosition, ChangePosition)>, + new_lines: Vec>, + successor_incoming_by_current: bool, +} + +#[derive(Debug)] +struct GitReplacementBlock { + old_start: usize, + old_len: usize, + new_start: usize, + new_lines: Vec>, +} + +fn position_hashes(pos: &Position>) -> impl Iterator + '_ { + pos.change + .into_iter() + .filter(|hash| *hash != ContentHash::NONE) +} + +fn split_graph_first_lines(content: &[u8]) -> Vec<&[u8]> { + if content.is_empty() { + Vec::new() + } else { + content.split_inclusive(|&b| b == b'\n').collect() + } +} + fn trace_git_import_enabled() -> bool { std::env::var_os("ATOMIC_TRACE_GIT_IMPORT").is_some() } @@ -452,6 +656,62 @@ fn build_linewise_crdt_ops_for_added_file( (file_ops, stats) } +fn build_graph_first_file_ops_for_added_file( + path: &str, + content_lines: &[Vec], + ranges: &[(ChangePosition, ChangePosition)], + encoding: Encoding, + file_idx: u32, + next_branch_idx: &mut u32, +) -> atomic_core::change::FileOps { + use atomic_core::change::LineOps; + use atomic_core::crdt::{BranchId, BranchOp, LeafId, LeafOp, TrunkId}; + use atomic_core::types::NodeId; + + let placeholder_change_id = NodeId::ROOT; + let trunk_id = TrunkId::new(placeholder_change_id, file_idx); + let enc = if encoding == Encoding::Binary { + None + } else { + Some(encoding) + }; + let mut file_ops = atomic_core::change::FileOps::create(trunk_id, path.to_string(), enc); + + let mut prev_branch: Option = None; + for (line_idx, line) in content_lines.iter().enumerate() { + let branch_id = BranchId::new(placeholder_change_id, *next_branch_idx); + *next_branch_idx += 1; + let leaf_id = LeafId::new(placeholder_change_id, line_idx as u32); + let trimmed = line.strip_suffix(b"\n").unwrap_or(line); + let leaf_ops = if trimmed.is_empty() { + Vec::new() + } else { + vec![LeafOp::Insert { + after: None, + kind: atomic_core::diff::TokenKind::Word, + content: trimmed.to_vec(), + }] + }; + let _ = leaf_id; + let mut line_ops = LineOps::new_with_line_nums( + branch_id, + BranchOp::Insert { + after: prev_branch, + content: leaf_ops, + }, + None, + Some(line_idx + 1), + ); + if let Some((start, end)) = ranges.get(line_idx) { + line_ops.set_content_range(*start, *end); + } + file_ops.add_line_op(line_ops); + prev_branch = Some(branch_id); + } + + file_ops +} + fn record_git_import_add_linewise(path: &str, new_content: &[u8]) -> Option { let encoding = Encoding::detect(new_content); if encoding == Encoding::Binary { @@ -475,6 +735,806 @@ fn record_git_import_add_linewise(path: &str, new_content: &[u8]) -> Option Option<(Change, Vec, Vec)> { + if parsed.files.is_empty() { + return None; + } + + let mut contents = Vec::new(); + let mut hunks = Vec::new(); + let mut file_ops = Vec::new(); + let mut next_file_idx = 0u32; + let mut next_branch_idx = 0u32; + let mut dependencies = HashSet::new(); + let mut pending = Vec::new(); + let mut deleted_paths = Vec::new(); + + for file in &parsed.files { + match file.operation { + FileOperation::Added | FileOperation::Copied => { + let new_content = file.new_content.as_deref().unwrap_or(&[]); + let encoding = Encoding::detect(new_content); + if encoding == Encoding::Binary { + return None; + } + + let filename = extract_filename(&file.path); + let name_start = ChangePosition::new(contents.len() as u64); + contents.extend_from_slice(filename.as_bytes()); + let name_end = ChangePosition::new(contents.len() as u64); + let inode_pos = Position { + change: None, + pos: name_end, + }; + let name_pos = Position { + change: None, + pos: name_end, + }; + let parent_pos = Position { + change: Some(ContentHash::NONE), + pos: ChangePosition::ROOT, + }; + + let new_line_contents: Vec> = if is_generated_diff_skip_path(&file.path) { + if new_content.is_empty() { + Vec::new() + } else { + vec![new_content.to_vec()] + } + } else { + split_graph_first_lines(new_content) + .into_iter() + .map(|line| line.to_vec()) + .collect() + }; + let mut new_ranges = Vec::new(); + for line in &new_line_contents { + let start = ChangePosition::new(contents.len() as u64); + contents.extend_from_slice(line); + let end = ChangePosition::new(contents.len() as u64); + new_ranges.push((start, end)); + } + + let first_content = new_ranges.first().map(|&(start, end)| Insertion { + predecessors: vec![inode_pos], + successors: vec![], + flag: EdgeFlags::BLOCK, + start, + end, + inode: inode_pos, + }); + + hunks.push(GraphOp::FileAdd { + add_name: Insertion { + predecessors: vec![parent_pos], + successors: vec![], + flag: EdgeFlags::FOLDER | EdgeFlags::BLOCK, + start: name_start, + end: name_end, + inode: parent_pos, + }, + add_inode: Insertion { + predecessors: vec![name_pos], + successors: vec![], + flag: EdgeFlags::FOLDER | EdgeFlags::BLOCK, + start: name_end, + end: name_end, + inode: inode_pos, + }, + contents: first_content, + path: file.path.clone(), + encoding: Some(encoding), + }); + + for (idx, &(start, end)) in new_ranges.iter().enumerate().skip(1) { + hunks.push(GraphOp::Edit { + change: Atom::Insertion(Insertion { + predecessors: vec![Position { + change: None, + pos: new_ranges[idx - 1].1, + }], + successors: vec![], + flag: EdgeFlags::BLOCK, + start, + end, + inode: inode_pos, + }), + local: Local::new(&file.path, (idx + 1) as u64), + encoding: Some(encoding), + }); + } + + file_ops.push(build_graph_first_file_ops_for_added_file( + &file.path, + &new_line_contents, + &new_ranges, + encoding, + next_file_idx, + &mut next_branch_idx, + )); + next_file_idx += 1; + pending.push(PendingLineIndexUpdate::Add { + path: file.path.clone(), + inode_pos, + new_ranges, + new_lines: new_line_contents, + }); + continue; + } + FileOperation::Renamed => { + let old_path = file.old_path.as_deref()?; + let indexed = line_index.files.get(old_path)?; + let new_content = file.new_content.as_deref().unwrap_or(&[]); + let encoding = Encoding::detect(new_content); + if encoding == Encoding::Binary { + return None; + } + + let new_filename = extract_filename(&file.path); + let name_start = ChangePosition::new(contents.len() as u64); + contents.extend_from_slice(new_filename.as_bytes()); + let name_end = ChangePosition::new(contents.len() as u64); + + let old_filename = extract_filename(old_path); + let old_name_end = indexed.inode_pos.pos; + let old_name_start = ChangePosition::new( + old_name_end.get().saturating_sub(old_filename.len() as u64), + ); + let parent_pos = Position { + change: Some(ContentHash::NONE), + pos: ChangePosition::ROOT, + }; + + dependencies.extend(position_hashes(&indexed.inode_pos)); + + let del = EdgeUpdate { + edges: vec![NewEdge { + previous: EdgeFlags::FOLDER | EdgeFlags::BLOCK, + flag: EdgeFlags::FOLDER | EdgeFlags::BLOCK | EdgeFlags::DELETED, + from: parent_pos, + to: GraphNode { + change: indexed.inode_pos.change, + start: old_name_start, + end: old_name_end, + }, + introduced_by: indexed.inode_pos.change, + }], + inode: indexed.inode_pos, + }; + + hunks.push(GraphOp::FileMove { + del, + add: Insertion { + predecessors: vec![parent_pos], + successors: vec![indexed.inode_pos], + flag: EdgeFlags::FOLDER | EdgeFlags::BLOCK, + start: name_start, + end: name_end, + inode: indexed.inode_pos, + }, + path: file.path.clone(), + }); + pending.push(PendingLineIndexUpdate::Rename { + old_path: old_path.to_string(), + new_path: file.path.clone(), + }); + + if let Some(diff_lines) = file.diff_lines.as_ref() { + let (ops, _) = atomic_core::record::workflow::build_crdt_ops_from_git_diff( + &file.path, diff_lines, + ); + file_ops.push(ops); + } + + let replacements = if is_generated_diff_skip_path(&file.path) { + vec![GitReplacementBlock { + old_start: if indexed.lines.is_empty() { 0 } else { 1 }, + old_len: indexed.lines.len(), + new_start: 1, + new_lines: if new_content.is_empty() { + Vec::new() + } else { + vec![new_content.to_vec()] + }, + }] + } else { + current_state_replacements(indexed, new_content) + }; + if !replacements.is_empty() { + let mut pending_replacements = Vec::new(); + for replacement in replacements { + let start_idx = if replacement.old_len == 0 { + replacement.old_start + } else { + replacement.old_start.checked_sub(1)? + }; + let end_idx = start_idx.checked_add(replacement.old_len)?; + if end_idx > indexed.lines.len() { + return None; + } + + let predecessor = if start_idx == 0 { + indexed.inode_pos + } else { + indexed.lines[start_idx - 1].end_pos() + }; + let successor = indexed.lines.get(end_idx).map(ImportLine::start_pos); + + dependencies.extend(position_hashes(&predecessor)); + if let Some(successor) = successor { + dependencies.extend(position_hashes(&successor)); + } + + let mut edge_update = EdgeUpdate { + edges: Vec::with_capacity(replacement.old_len), + inode: indexed.inode_pos, + }; + for line_idx in start_idx..end_idx { + let from = if line_idx == 0 { + indexed.inode_pos + } else { + indexed.lines[line_idx - 1].end_pos() + }; + let old_line = &indexed.lines[line_idx]; + dependencies.insert(old_line.change); + dependencies.insert(old_line.incoming_by); + edge_update.edges.push(NewEdge { + previous: EdgeFlags::BLOCK, + flag: EdgeFlags::BLOCK | EdgeFlags::DELETED, + from, + to: old_line.node(), + introduced_by: Some(old_line.incoming_by), + }); + } + + let mut new_ranges = Vec::with_capacity(replacement.new_lines.len()); + for new_line in &replacement.new_lines { + let start = ChangePosition::new(contents.len() as u64); + contents.extend_from_slice(new_line); + let end = ChangePosition::new(contents.len() as u64); + new_ranges.push((start, end)); + } + + if new_ranges.is_empty() { + hunks.push(GraphOp::Edit { + change: Atom::EdgeUpdate(edge_update), + local: Local::new(&file.path, replacement.new_start as u64), + encoding: Some(encoding), + }); + } else if replacement.old_len == 0 { + let first = new_ranges[0]; + let first_successors = if new_ranges.len() == 1 { + successor.into_iter().collect() + } else { + Vec::new() + }; + hunks.push(GraphOp::Edit { + change: Atom::Insertion(Insertion { + predecessors: vec![predecessor], + successors: first_successors, + flag: EdgeFlags::BLOCK, + start: first.0, + end: first.1, + inode: indexed.inode_pos, + }), + local: Local::new(&file.path, replacement.new_start as u64), + encoding: Some(encoding), + }); + } else { + let first = new_ranges[0]; + let first_successors = if new_ranges.len() == 1 { + successor.into_iter().collect() + } else { + Vec::new() + }; + hunks.push(GraphOp::Replacement { + change: edge_update, + replacement: Insertion { + predecessors: vec![predecessor], + successors: first_successors, + flag: EdgeFlags::BLOCK, + start: first.0, + end: first.1, + inode: indexed.inode_pos, + }, + local: Local::new(&file.path, replacement.new_start as u64), + encoding: Some(encoding), + }); + } + + for (new_idx, &(start, end)) in new_ranges.iter().enumerate().skip(1) { + let predecessor = Position { + change: None, + pos: new_ranges[new_idx - 1].1, + }; + let successors = if new_idx + 1 == new_ranges.len() { + successor.into_iter().collect() + } else { + Vec::new() + }; + hunks.push(GraphOp::Edit { + change: Atom::Insertion(Insertion { + predecessors: vec![predecessor], + successors, + flag: EdgeFlags::BLOCK, + start, + end, + inode: indexed.inode_pos, + }), + local: Local::new( + &file.path, + (replacement.new_start + new_idx) as u64, + ), + encoding: Some(encoding), + }); + } + + pending_replacements.push(PendingLineReplacement { + start_idx, + old_len: replacement.old_len, + new_ranges, + new_lines: replacement.new_lines, + successor_incoming_by_current: successor.is_some(), + }); + } + + pending.push(PendingLineIndexUpdate::Modify { + path: file.path.clone(), + replacements: pending_replacements, + }); + } + continue; + } + FileOperation::Deleted => { + let indexed = line_index.files.get(&file.path)?; + let mut edge_update = EdgeUpdate { + edges: Vec::with_capacity(indexed.lines.len()), + inode: indexed.inode_pos, + }; + for line_idx in 0..indexed.lines.len() { + let from = if line_idx == 0 { + indexed.inode_pos + } else { + indexed.lines[line_idx - 1].end_pos() + }; + let old_line = &indexed.lines[line_idx]; + dependencies.insert(old_line.change); + dependencies.insert(old_line.incoming_by); + edge_update.edges.push(NewEdge { + previous: EdgeFlags::BLOCK, + flag: EdgeFlags::BLOCK | EdgeFlags::DELETED, + from, + to: old_line.node(), + introduced_by: Some(old_line.incoming_by), + }); + } + hunks.push(GraphOp::Edit { + change: Atom::EdgeUpdate(edge_update), + local: Local::new(&file.path, 1), + encoding: file + .old_content + .as_deref() + .map(Encoding::detect) + .filter(|enc| *enc != Encoding::Binary) + .or(Some(Encoding::Utf8)), + }); + pending.push(PendingLineIndexUpdate::Delete { + path: file.path.clone(), + }); + deleted_paths.push(file.path.clone()); + if let Some(diff_lines) = file.diff_lines.as_ref() { + let (ops, _) = atomic_core::record::workflow::build_crdt_ops_from_git_diff( + &file.path, diff_lines, + ); + file_ops.push(ops); + } + continue; + } + FileOperation::Modified => {} + } + + let indexed = line_index.files.get(&file.path)?; + let replacements = if is_generated_diff_skip_path(&file.path) { + let new_content = file.new_content.as_deref()?; + vec![GitReplacementBlock { + old_start: if indexed.lines.is_empty() { 0 } else { 1 }, + old_len: indexed.lines.len(), + new_start: 1, + new_lines: if new_content.is_empty() { + Vec::new() + } else { + vec![new_content.to_vec()] + }, + }] + } else if parsed.is_merge { + let new_content = file.new_content.as_deref()?; + current_state_replacements(indexed, new_content) + } else { + let diff_lines = file.diff_lines.as_ref()?; + let (ops, _) = + atomic_core::record::workflow::build_crdt_ops_from_git_diff(&file.path, diff_lines); + file_ops.push(ops); + parse_git_diff_replacements(diff_lines)? + }; + if replacements.is_empty() { + continue; + } + + let encoding = file + .new_content + .as_deref() + .map(Encoding::detect) + .filter(|enc| *enc != Encoding::Binary) + .or(Some(Encoding::Utf8)); + + let mut pending_replacements = Vec::new(); + + for replacement in replacements { + let start_idx = if replacement.old_len == 0 { + replacement.old_start + } else { + replacement.old_start.checked_sub(1)? + }; + let end_idx = start_idx.checked_add(replacement.old_len)?; + if end_idx > indexed.lines.len() { + return None; + } + + let predecessor = if start_idx == 0 { + indexed.inode_pos + } else { + indexed.lines[start_idx - 1].end_pos() + }; + let successor = indexed.lines.get(end_idx).map(ImportLine::start_pos); + + dependencies.extend(position_hashes(&predecessor)); + if let Some(successor) = successor { + dependencies.extend(position_hashes(&successor)); + } + + let mut edge_update = EdgeUpdate { + edges: Vec::with_capacity(replacement.old_len), + inode: indexed.inode_pos, + }; + + for line_idx in start_idx..end_idx { + let from = if line_idx == 0 { + indexed.inode_pos + } else { + indexed.lines[line_idx - 1].end_pos() + }; + let old_line = &indexed.lines[line_idx]; + dependencies.insert(old_line.change); + dependencies.insert(old_line.incoming_by); + edge_update.edges.push(NewEdge { + previous: EdgeFlags::BLOCK, + flag: EdgeFlags::BLOCK | EdgeFlags::DELETED, + from, + to: old_line.node(), + introduced_by: Some(old_line.incoming_by), + }); + } + + let mut new_ranges = Vec::with_capacity(replacement.new_lines.len()); + for new_line in &replacement.new_lines { + let start = ChangePosition::new(contents.len() as u64); + contents.extend_from_slice(new_line); + let end = ChangePosition::new(contents.len() as u64); + new_ranges.push((start, end)); + } + + if new_ranges.is_empty() { + hunks.push(GraphOp::Edit { + change: Atom::EdgeUpdate(edge_update), + local: Local::new(&file.path, replacement.new_start as u64), + encoding, + }); + } else if replacement.old_len == 0 { + let first = new_ranges[0]; + let first_successors = if new_ranges.len() == 1 { + successor.into_iter().collect() + } else { + Vec::new() + }; + hunks.push(GraphOp::Edit { + change: Atom::Insertion(Insertion { + predecessors: vec![predecessor], + successors: first_successors, + flag: EdgeFlags::BLOCK, + start: first.0, + end: first.1, + inode: indexed.inode_pos, + }), + local: Local::new(&file.path, replacement.new_start as u64), + encoding, + }); + } else { + let first = new_ranges[0]; + let first_successors = if new_ranges.len() == 1 { + successor.into_iter().collect() + } else { + Vec::new() + }; + hunks.push(GraphOp::Replacement { + change: edge_update, + replacement: Insertion { + predecessors: vec![predecessor], + successors: first_successors, + flag: EdgeFlags::BLOCK, + start: first.0, + end: first.1, + inode: indexed.inode_pos, + }, + local: Local::new(&file.path, replacement.new_start as u64), + encoding, + }); + } + + for (new_idx, &(start, end)) in new_ranges.iter().enumerate().skip(1) { + let predecessor = Position { + change: None, + pos: new_ranges[new_idx - 1].1, + }; + let successors = if new_idx + 1 == new_ranges.len() { + successor.into_iter().collect() + } else { + Vec::new() + }; + hunks.push(GraphOp::Edit { + change: Atom::Insertion(Insertion { + predecessors: vec![predecessor], + successors, + flag: EdgeFlags::BLOCK, + start, + end, + inode: indexed.inode_pos, + }), + local: Local::new(&file.path, (replacement.new_start + new_idx) as u64), + encoding, + }); + } + + pending_replacements.push(PendingLineReplacement { + start_idx, + old_len: replacement.old_len, + new_ranges, + new_lines: replacement.new_lines, + successor_incoming_by_current: successor.is_some(), + }); + } + + pending.push(PendingLineIndexUpdate::Modify { + path: file.path.clone(), + replacements: pending_replacements, + }); + } + + if hunks.is_empty() { + return None; + } + + let mut dependencies: Vec = dependencies.into_iter().collect(); + dependencies.sort(); + dependencies.dedup(); + Some(( + Change::with_file_ops(header, hunks, file_ops, contents, dependencies), + pending, + deleted_paths, + )) +} + +fn current_state_replacements( + indexed: &ImportIndexedFile, + new_content: &[u8], +) -> Vec { + let old_lines = &indexed.lines; + let new_lines: Vec> = split_graph_first_lines(new_content) + .into_iter() + .map(|line| line.to_vec()) + .collect(); + + let mut prefix = 0usize; + while prefix < old_lines.len() + && prefix < new_lines.len() + && old_lines[prefix].content == new_lines[prefix] + { + prefix += 1; + } + + let mut suffix = 0usize; + while suffix < old_lines.len().saturating_sub(prefix) + && suffix < new_lines.len().saturating_sub(prefix) + && old_lines[old_lines.len() - 1 - suffix].content + == new_lines[new_lines.len() - 1 - suffix] + { + suffix += 1; + } + + let old_mid_len = old_lines.len().saturating_sub(prefix + suffix); + let new_mid_end = new_lines.len().saturating_sub(suffix); + if old_mid_len == 0 && prefix == new_mid_end { + return Vec::new(); + } + + vec![GitReplacementBlock { + old_start: if old_mid_len == 0 { prefix } else { prefix + 1 }, + old_len: old_mid_len, + new_start: prefix + 1, + new_lines: new_lines[prefix..new_mid_end].to_vec(), + }] +} + +fn parse_git_diff_replacements(lines: &[GitDiffLine]) -> Option> { + let mut blocks = Vec::new(); + let mut old_start: Option = None; + let mut new_start: Option = None; + let mut old_len = 0usize; + let mut new_lines = Vec::new(); + let mut old_cursor = 1usize; + let mut new_cursor = 1usize; + + let flush = |blocks: &mut Vec, + old_start: &mut Option, + new_start: &mut Option, + old_len: &mut usize, + new_lines: &mut Vec>| + -> Option<()> { + if *old_len > 0 || !new_lines.is_empty() { + let old = old_start.take()?; + let new = new_start.take().unwrap_or(old); + blocks.push(GitReplacementBlock { + old_start: old, + old_len: *old_len, + new_start: new, + new_lines: std::mem::take(new_lines), + }); + *old_len = 0; + } + Some(()) + }; + + for line in lines { + match line.origin { + ' ' => { + flush( + &mut blocks, + &mut old_start, + &mut new_start, + &mut old_len, + &mut new_lines, + )?; + old_cursor = line + .old_lineno + .map(|n| n as usize + 1) + .unwrap_or(old_cursor + 1); + new_cursor = line + .new_lineno + .map(|n| n as usize + 1) + .unwrap_or(new_cursor + 1); + } + '-' => { + if old_start.is_none() { + old_start = Some(line.old_lineno.map(|n| n as usize).unwrap_or(old_cursor)); + } + old_cursor = line + .old_lineno + .map(|n| n as usize + 1) + .unwrap_or(old_cursor + 1); + old_len += 1; + } + '+' => { + if new_start.is_none() { + new_start = Some(line.new_lineno.map(|n| n as usize).unwrap_or(new_cursor)); + } + if old_start.is_none() { + old_start = Some(old_cursor.saturating_sub(1)); + } + new_cursor = line + .new_lineno + .map(|n| n as usize + 1) + .unwrap_or(new_cursor + 1); + new_lines.push(line.content.clone()); + } + _ => {} + } + } + + flush( + &mut blocks, + &mut old_start, + &mut new_start, + &mut old_len, + &mut new_lines, + )?; + + Some(blocks) +} + +fn apply_line_index_updates( + line_index: &mut ImportLineIndex, + change_hash: ContentHash, + pending: Vec, +) { + for update in pending { + match update { + PendingLineIndexUpdate::Add { + path, + inode_pos, + new_ranges, + new_lines, + } => { + let inode_pos = Position { + change: Some(change_hash), + pos: inode_pos.pos, + }; + let lines = new_ranges + .iter() + .zip(new_lines) + .map(|(&(start, end), content)| ImportLine { + change: change_hash, + start, + end, + incoming_by: change_hash, + content, + }) + .collect(); + line_index + .files + .insert(path, ImportIndexedFile { inode_pos, lines }); + } + PendingLineIndexUpdate::Modify { path, replacements } => { + let Some(indexed) = line_index.files.get_mut(&path) else { + continue; + }; + + let mut offset: isize = 0; + for replacement in replacements { + let adjusted_start = (replacement.start_idx as isize + offset).max(0) as usize; + let adjusted_end = adjusted_start + .saturating_add(replacement.old_len) + .min(indexed.lines.len()); + let new_lines: Vec = replacement + .new_ranges + .iter() + .zip(replacement.new_lines) + .map(|(&(start, end), content)| ImportLine { + change: change_hash, + start, + end, + incoming_by: change_hash, + content, + }) + .collect(); + indexed + .lines + .splice(adjusted_start..adjusted_end, new_lines); + + if replacement.successor_incoming_by_current { + let successor_idx = adjusted_start + replacement.new_ranges.len(); + if let Some(successor) = indexed.lines.get_mut(successor_idx) { + successor.incoming_by = change_hash; + } + } + + offset += replacement.new_ranges.len() as isize - replacement.old_len as isize; + } + } + PendingLineIndexUpdate::Rename { old_path, new_path } => { + if let Some(indexed) = line_index.files.remove(&old_path) { + line_index.files.insert(new_path, indexed); + } + } + PendingLineIndexUpdate::Delete { path } => { + line_index.files.remove(&path); + } + } + } +} + fn slow_import_commit_label(parsed: &ParsedCommit) -> String { let message = truncate_for_progress(&parsed.metadata.message.replace('\n', " "), 72); format!("{} \"{}\"", parsed.short_sha, message) @@ -539,10 +1599,12 @@ impl ParallelImporter { .parent() .map(|p| p.to_path_buf()) .unwrap_or_else(|| git_repo.path().to_path_buf()); + let ignore_matcher = ImportIgnoreMatcher::new(options.ignored_path_patterns.clone()); Self { git_repo_path, options, + ignore_matcher, } } @@ -553,6 +1615,36 @@ impl ParallelImporter { }) } + fn path_ignored_for_import(&self, path: &str) -> bool { + !self.ignore_matcher.is_empty() && self.ignore_matcher.matches(path) + } + + fn file_ignored_for_import(&self, file: &ParsedFile) -> bool { + match file.operation { + FileOperation::Renamed => { + let old_ignored = file + .old_path + .as_deref() + .is_some_and(|old_path| self.path_ignored_for_import(old_path)); + old_ignored && self.path_ignored_for_import(&file.path) + } + _ => self.path_ignored_for_import(&file.path), + } + } + + fn apply_import_ignores(&self, commit: &mut ParsedCommit) { + if self.ignore_matcher.is_empty() || commit.files.is_empty() { + return; + } + + commit + .files + .retain(|file| !self.file_ignored_for_import(file)); + if commit.files.is_empty() { + commit.is_empty = true; + } + } + /// Import commits from a branch into an Atomic repository. /// /// Commits are processed in **batches** to keep memory bounded and show @@ -594,6 +1686,7 @@ impl ParallelImporter { let import_start = Instant::now(); let mut commits_written = 0usize; + let mut line_index = ImportLineIndex::default(); for (batch_idx, chunk) in commit_oids.chunks(batch_size).enumerate() { let batch_start = batch_idx * batch_size; @@ -621,7 +1714,7 @@ impl ParallelImporter { // Phase 2: Sequential write for this batch let write_start = Instant::now(); - let write_stats = self.phase2_write(repo, &parsed_commits)?; + let write_stats = self.phase2_write(repo, &parsed_commits, &mut line_index)?; let write_elapsed = write_start.elapsed(); stats.phase2_duration += write_elapsed; @@ -710,6 +1803,9 @@ impl ParallelImporter { if let Ok(status) = repo.status(atomic_repository::StatusOptions::default()) { for entry in status.untracked() { let path_str = entry.path().to_string_lossy().replace('\\', "/"); + if self.path_ignored_for_import(&path_str) { + continue; + } // Add to tracking let _ = repo.add(&path_str, atomic_repository::TrackingOptions::default()); @@ -848,7 +1944,9 @@ impl ParallelImporter { message: format!("Failed to open git repository: {}", e), })?; - parse_commit(&git_repo, *oid, idx, &oid_to_index) + let mut commit = parse_commit(&git_repo, *oid, idx, &oid_to_index)?; + self.apply_import_ignores(&mut commit); + Ok(commit) }) .collect(); @@ -884,6 +1982,7 @@ impl ParallelImporter { &self, repo: &mut Repository, commits: &[ParsedCommit], + line_index: &mut ImportLineIndex, ) -> CliResult { let mut stats = WriteStats::default(); let total = commits.len(); @@ -911,7 +2010,7 @@ impl ParallelImporter { } // Write the change - match self.write_commit(repo, parsed) { + match self.write_commit(repo, parsed, line_index) { Ok(written) => { if written { stats.changes_written += 1; @@ -941,6 +2040,9 @@ impl ParallelImporter { if file.operation == FileOperation::Deleted { continue; } + if self.path_ignored_for_import(&file.path) { + continue; + } let abs_path = repo_root.join(&file.path); if let Ok(metadata) = std::fs::metadata(&abs_path) { use std::time::SystemTime; @@ -969,7 +2071,12 @@ impl ParallelImporter { } /// Write a single commit to the repository. - fn write_commit(&self, repo: &mut Repository, parsed: &ParsedCommit) -> CliResult { + fn write_commit( + &self, + repo: &mut Repository, + parsed: &ParsedCommit, + line_index: &mut ImportLineIndex, + ) -> CliResult { use atomic_core::output::memory::Memory; use atomic_core::record::workflow::{ record_added_file, record_deleted_file, record_modified_file, DetectedFile, @@ -998,6 +2105,54 @@ impl ParallelImporter { return self.write_empty_commit(repo, parsed, header); } + if let Some((mut graph_change, pending_updates, graph_deleted_paths)) = + build_graph_first_change(header.clone(), parsed, line_index) + { + graph_change.unhashed = Some(self.build_git_metadata(parsed, false, false)); + let progress = SlowImportProgress::start( + slow_import_commit_label(parsed), + format!( + "graph-first files={}, ops={}; CRDT metadata deferred", + parsed.files.len(), + graph_change.hunks().len() + ), + ); + let write_start = Instant::now(); + let write_result = repo.write_import_graph_change( + graph_change, + &graph_deleted_paths, + Default::default(), + ); + let write_ms = write_start.elapsed().as_millis(); + let progress_reported = progress.finish(); + let write_outcome = write_result.map_err(|e| CliError::Internal(e.into()))?; + apply_line_index_updates(line_index, write_outcome.hash, pending_updates); + if progress_reported || write_ms >= 5_000 { + print_info(&format!( + "Imported {} in {:.1}s (graph-first assemble={}ms apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms)", + slow_import_commit_label(parsed), + write_ms as f64 / 1000.0, + write_outcome.timings.assemble_ms, + write_outcome.timings.apply_ms, + write_outcome.timings.direct_graph_ms, + write_outcome.timings.direct_crdt_ms, + write_outcome.timings.commit_ms + )); + } + trace_git_import(format!( + "write {} files={} graph_first=1 ops={} apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms total={}ms", + parsed.short_sha, + parsed.files.len(), + write_outcome.insert.stats.atoms_processed, + write_outcome.timings.apply_ms, + write_outcome.timings.direct_graph_ms, + write_outcome.timings.direct_crdt_ms, + write_outcome.timings.commit_ms, + commit_start.elapsed().as_millis() + )); + return Ok(true); + } + // Track new files so the pristine knows about them before we record. // Also collect deleted paths so we can remove them from TREE after insert. // Use batch operations to avoid a separate write txn + fsync per file. @@ -1383,6 +2538,22 @@ impl ParallelImporter { .map_err(|e| CliError::Internal(e.into()))?; let write_ms = write_start.elapsed().as_millis(); let progress_reported = progress.finish(); + if !recorded_files.is_empty() + && recorded_files.iter().all(|recorded| { + matches!( + recorded.kind(), + Some(atomic_core::record::workflow::DetectionKind::Added) + ) + }) + { + match repo.load_change(&write_outcome.hash) { + Ok(change) => line_index.update_from_added_change(write_outcome.hash, &change), + Err(err) => trace_git_import(format!( + "graph-first {}: could not seed line index from {}: {}", + parsed.short_sha, write_outcome.hash, err + )), + } + } if progress_reported || write_ms >= 5_000 { print_info(&format!( "Imported {} in {:.1}s (assemble={}ms apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms)", @@ -1494,6 +2665,14 @@ impl ParallelImporter { file.diff_lines.as_ref().map(|lines| { serde_json::json!({ "path": file.path, + "old_path": file.old_path, + "operation": match file.operation { + FileOperation::Added => "added", + FileOperation::Modified => "modified", + FileOperation::Deleted => "deleted", + FileOperation::Renamed => "renamed", + FileOperation::Copied => "copied", + }, "lines": lines, }) }) @@ -1804,7 +2983,7 @@ fn parse_diff_files( .map(|p| p.to_string_lossy().to_string()) .unwrap_or_default(); - let old_path = if operation == FileOperation::Renamed { + let old_path = if matches!(operation, FileOperation::Renamed | FileOperation::Copied) { old_file.path().map(|p| p.to_string_lossy().to_string()) } else { None @@ -1944,7 +3123,7 @@ fn parse_diff_files_via_git_cli( }); } 'C' => { - let _old_path = parts.next(); + let old_path = parts.next(); let Some(path) = parts.next() else { continue }; files.push(ParsedFile { path: path.to_string(), @@ -1952,7 +3131,7 @@ fn parse_diff_files_via_git_cli( new_content: get_file_content(git_repo, tree, path).ok(), old_content: None, diff_lines: None, - old_path: None, + old_path: old_path.map(|path| path.to_string()), }); } _ => {} @@ -2052,4 +3231,38 @@ mod tests { assert_eq!(stats.commits_found, 0); assert_eq!(stats.changes_written, 0); } + + #[test] + fn test_graph_first_added_file_ops_use_unique_branch_ids_and_ranges() { + let mut next_branch_idx = 0; + let first = build_graph_first_file_ops_for_added_file( + "a.txt", + &[b"one\n".to_vec(), b"two\n".to_vec()], + &[ + (ChangePosition::new(0), ChangePosition::new(4)), + (ChangePosition::new(4), ChangePosition::new(8)), + ], + Encoding::Utf8, + 0, + &mut next_branch_idx, + ); + let second = build_graph_first_file_ops_for_added_file( + "b.txt", + &[b"three\n".to_vec()], + &[(ChangePosition::new(8), ChangePosition::new(14))], + Encoding::Utf8, + 1, + &mut next_branch_idx, + ); + + assert_eq!(first.trunk_id().file_idx(), 0); + assert_eq!(second.trunk_id().file_idx(), 1); + assert_eq!(first.line_ops()[0].branch_id().branch_idx(), 0); + assert_eq!(first.line_ops()[1].branch_id().branch_idx(), 1); + assert_eq!(second.line_ops()[0].branch_id().branch_idx(), 2); + assert_eq!( + first.line_ops()[1].content_range(), + Some((ChangePosition::new(4), ChangePosition::new(8))) + ); + } } diff --git a/atomic-core/src/apply/file_ops.rs b/atomic-core/src/apply/file_ops.rs index 5ff82cd..1c1d004 100644 --- a/atomic-core/src/apply/file_ops.rs +++ b/atomic-core/src/apply/file_ops.rs @@ -124,6 +124,21 @@ impl ApplyFileOpsStats { pub fn has_operations(&self) -> bool { self.total_trunk_ops() > 0 || self.total_branch_ops() > 0 || self.total_leaf_ops() > 0 } + + /// Add another stats value into this one. + pub fn merge(&mut self, other: ApplyFileOpsStats) { + self.trunks_created += other.trunks_created; + self.trunks_deleted += other.trunks_deleted; + self.trunks_moved += other.trunks_moved; + self.branches_created += other.branches_created; + self.branches_deleted += other.branches_deleted; + self.branches_restored += other.branches_restored; + self.branches_reparented += other.branches_reparented; + self.leaves_created += other.leaves_created; + self.leaves_deleted += other.leaves_deleted; + self.leaves_replaced += other.leaves_replaced; + self.leaves_restored += other.leaves_restored; + } } // Apply Functions @@ -154,8 +169,9 @@ pub fn apply_file_ops( file_ops: &[FileOps], ) -> PristineResult { let mut stats = ApplyFileOpsStats::new(); + let mut next_leaf_idx = 0u32; for ops in file_ops { - apply_single_file_ops(txn, change_id, ops, &mut stats)?; + apply_single_file_ops(txn, change_id, ops, &mut stats, &mut next_leaf_idx)?; } Ok(stats) } @@ -207,6 +223,7 @@ pub fn apply_file_ops_batched( let mut branch_vertex_table = txn.txn.open_table(BRANCH_VERTEX)?; let mut vertex_branch_table = txn.txn.open_table(VERTEX_BRANCH)?; + let mut next_leaf_idx = 0u32; for (ops, trunk_create) in file_ops.iter().zip(trunk_creates.iter()) { // Resolve TrunkId placeholder — same logic as apply_single_file_ops. let raw_trunk_id = ops.trunk_id(); @@ -267,12 +284,13 @@ pub fn apply_file_ops_batched( vertex_branch_table.insert(&vertex_bytes, &branch_key)?; } - for (leaf_idx, leaf_op) in content.iter().enumerate() { + for leaf_op in content { let LeafOp::Insert { kind, content, .. } = leaf_op else { unreachable!("batched apply only runs for insert-only leaf ops"); }; - let leaf_id = LeafId::new(branch_id.change_id(), leaf_idx as u32); + let leaf_id = LeafId::new(branch_id.change_id(), next_leaf_idx); + next_leaf_idx += 1; let leaf_key = encode_leaf_id(&leaf_id); let leaf_value = encode_leaf_value(&SerializedLeaf { branch_id, @@ -291,6 +309,161 @@ pub fn apply_file_ops_batched( Ok(stats) } +/// Apply insert-only FileOps for multiple changes with CRDT tables opened once. +/// +/// Graph-first Git import stores semantic FileOps in every change during phase +/// 1, then materializes the CRDT tables in phase 2. Applying each change +/// independently opens the same redb tables repeatedly. This grouped variant +/// keeps those table handles open across the whole fanout while still resolving +/// `ROOT` IDs against each change's real `NodeId`. +pub fn apply_file_ops_batched_groups( + txn: &mut WriteTxn<'_>, + groups: &[(NodeId, Vec)], +) -> PristineResult { + if groups + .iter() + .any(|(_, file_ops)| !can_batch_apply_file_ops(file_ops)) + { + let mut stats = ApplyFileOpsStats::new(); + for (change_id, file_ops) in groups { + stats.merge(apply_file_ops_batched(txn, *change_id, file_ops)?); + } + return Ok(stats); + } + + let mut stats = ApplyFileOpsStats::new(); + let mut trunk_creates = Vec::new(); + + for (change_id, file_ops) in groups { + for ops in file_ops { + let trunk_create = match ops.trunk_op() { + Some(TrunkOp::Create { encoding, .. }) => { + let inode = match txn.get_inode(ops.path())? { + Some(i) => i, + None => txn.alloc_inode()?, + }; + let raw_trunk_id = ops.trunk_id(); + let trunk_id = if raw_trunk_id.change_id().is_root() { + TrunkId::new(*change_id, raw_trunk_id.file_idx()) + } else { + raw_trunk_id + }; + Some(( + trunk_id, + SerializedTrunk { + inode, + state: TrunkState::Alive, + encoding: encoding_to_u8(encoding.as_ref()), + path: ops.path().to_string(), + }, + )) + } + _ => None, + }; + trunk_creates.push(trunk_create); + } + } + + let mut trunks_table = txn.txn.open_table(TRUNKS)?; + let mut inode_trunk_table = txn.txn.open_table(INODE_TRUNK)?; + let mut path_trunk_table = txn.txn.open_table(PATH_TRUNK)?; + let mut branches_table = txn.txn.open_table(BRANCHES)?; + let mut trunk_branches_table = txn.txn.open_multimap_table(TRUNK_BRANCHES)?; + let mut branch_after_table = txn.txn.open_table(BRANCH_AFTER)?; + let mut leaves_table = txn.txn.open_table(LEAVES)?; + let mut branch_leaves_table = txn.txn.open_multimap_table(BRANCH_LEAVES)?; + let mut branch_vertex_table = txn.txn.open_table(BRANCH_VERTEX)?; + let mut vertex_branch_table = txn.txn.open_table(VERTEX_BRANCH)?; + + let mut trunk_create_idx = 0usize; + for (change_id, file_ops) in groups { + let mut next_leaf_idx = 0u32; + for ops in file_ops { + let raw_trunk_id = ops.trunk_id(); + let trunk_id = if raw_trunk_id.change_id().is_root() { + TrunkId::new(*change_id, raw_trunk_id.file_idx()) + } else { + raw_trunk_id + }; + let trunk_key = encode_trunk_id(&trunk_id); + + if let Some((_, serialized)) = &trunk_creates[trunk_create_idx] { + let trunk_value = encode_trunk_value(serialized); + trunks_table.insert(&trunk_key, trunk_value.as_slice())?; + inode_trunk_table.insert(serialized.inode.get(), &trunk_key)?; + path_trunk_table.insert(ops.path(), &trunk_key)?; + stats.trunks_created += 1; + } + trunk_create_idx += 1; + + for line_ops in ops.line_ops() { + let raw_branch_id = line_ops.branch_id(); + let branch_id = if raw_branch_id.change_id().is_root() { + BranchId::new(*change_id, raw_branch_id.branch_idx()) + } else { + raw_branch_id + }; + + let BranchOp::Insert { after, content } = line_ops.operation() else { + unreachable!("batched apply only runs for insert-only file ops"); + }; + + let branch_key = encode_branch_id(&branch_id); + let branch_value = encode_branch_value(&SerializedBranch { + trunk_id, + state: BranchState::Alive, + line_hash: 0, + }); + branches_table.insert(&branch_key, &branch_value)?; + trunk_branches_table.insert(&trunk_key, &branch_key)?; + stats.branches_created += 1; + + let after_key = match after { + Some(a) if a.change_id().is_root() => { + encode_branch_id(&BranchId::new(*change_id, a.branch_idx())) + } + Some(a) => encode_branch_id(a), + None => [0u8; 12], + }; + branch_after_table.insert(&branch_key, &after_key)?; + + if let Some((start, end)) = line_ops.content_range() { + let graph_node = GraphNode { + change: *change_id, + start, + end, + }; + let vertex_bytes = encode_vertex_position(&graph_node); + branch_vertex_table.insert(&branch_key, &vertex_bytes)?; + vertex_branch_table.insert(&vertex_bytes, &branch_key)?; + } + + for leaf_op in content { + let LeafOp::Insert { kind, content, .. } = leaf_op else { + unreachable!("batched apply only runs for insert-only leaf ops"); + }; + + let leaf_id = LeafId::new(branch_id.change_id(), next_leaf_idx); + next_leaf_idx += 1; + let leaf_key = encode_leaf_id(&leaf_id); + let leaf_value = encode_leaf_value(&SerializedLeaf { + branch_id, + kind: *kind, + state: LeafState::Alive, + content_start: 0, + content_end: content.len() as u32, + }); + leaves_table.insert(&leaf_key, &leaf_value)?; + branch_leaves_table.insert(&branch_key, &leaf_key)?; + stats.leaves_created += 1; + } + } + } + } + + Ok(stats) +} + fn can_batch_apply_file_ops(file_ops: &[FileOps]) -> bool { file_ops.iter().all(|ops| { matches!(ops.trunk_op(), None | Some(TrunkOp::Create { .. })) @@ -312,6 +485,7 @@ fn apply_single_file_ops( change_id: NodeId, ops: &FileOps, stats: &mut ApplyFileOpsStats, + next_leaf_idx: &mut u32, ) -> PristineResult<()> { // Resolve the TrunkId placeholder. The recorder stores TrunkIds // with `change_id = NodeId::ROOT` as a placeholder meaning "this @@ -336,7 +510,7 @@ fn apply_single_file_ops( // Apply line operations for line_ops in ops.line_ops() { - apply_line_ops_with_position(txn, change_id, trunk_id, line_ops, stats)?; + apply_line_ops_with_position(txn, change_id, trunk_id, line_ops, stats, next_leaf_idx)?; } Ok(()) @@ -417,6 +591,7 @@ fn apply_line_ops_with_position( trunk_id: TrunkId, line_ops: &LineOps, stats: &mut ApplyFileOpsStats, + next_leaf_idx: &mut u32, ) -> PristineResult<()> { // Resolve the BranchId. The change file stores BranchIds with // `change_id = NodeId::ROOT` as a placeholder meaning "this change @@ -493,8 +668,9 @@ fn apply_line_ops_with_position( } // Apply leaf operations for this line's tokens - for (leaf_idx, leaf_op) in content.iter().enumerate() { - let leaf_id = LeafId::new(branch_id.change_id(), leaf_idx as u32); + for leaf_op in content { + let leaf_id = LeafId::new(branch_id.change_id(), *next_leaf_idx); + *next_leaf_idx += 1; apply_leaf_op(txn, branch_id, leaf_id, leaf_op, stats)?; } } @@ -571,8 +747,9 @@ fn apply_line_ops_with_position( } // Apply leaf operations for the new content - for (leaf_idx, leaf_op) in new_content.iter().enumerate() { - let leaf_id = LeafId::new(branch_id.change_id(), leaf_idx as u32); + for leaf_op in new_content { + let leaf_id = LeafId::new(branch_id.change_id(), *next_leaf_idx); + *next_leaf_idx += 1; apply_leaf_op(txn, branch_id, leaf_id, leaf_op, stats)?; } } diff --git a/atomic-core/src/apply/mod.rs b/atomic-core/src/apply/mod.rs index b7cc307..940532f 100644 --- a/atomic-core/src/apply/mod.rs +++ b/atomic-core/src/apply/mod.rs @@ -257,5 +257,7 @@ pub use conflict::{ }; // Re-export FileOps application -pub use file_ops::{apply_file_ops, apply_file_ops_batched, ApplyFileOpsStats}; +pub use file_ops::{ + apply_file_ops, apply_file_ops_batched, apply_file_ops_batched_groups, ApplyFileOpsStats, +}; pub use graph_batch::GraphWriteBatch; diff --git a/atomic-repository/src/changestore/mod.rs b/atomic-repository/src/changestore/mod.rs index b6eff8d..b018cca 100644 --- a/atomic-repository/src/changestore/mod.rs +++ b/atomic-repository/src/changestore/mod.rs @@ -130,6 +130,36 @@ impl std::fmt::Debug for ChangeStore { } } +fn copy_content_from_change( + hash: &Hash, + change: &Change, + start: usize, + end: usize, + buf: &mut [u8], +) -> ChangeStoreResult { + if end > change.contents.len() { + return Err(ChangeStoreError::ContentOutOfBounds { + hash: hash.to_base32(), + requested_start: start, + requested_end: end, + content_len: change.contents.len(), + }); + } + + let len = end - start; + if buf.len() < len { + return Err(ChangeStoreError::ContentOutOfBounds { + hash: hash.to_base32(), + requested_start: start, + requested_end: end, + content_len: buf.len(), + }); + } + + buf[..len].copy_from_slice(&change.contents[start..end]); + Ok(len) +} + impl ChangeStore { /// Create a new change store with the given directory and cache capacity. /// @@ -370,6 +400,60 @@ impl ChangeStore { Ok(change) } + /// Copy a content span from a change without cloning the full `Change`. + /// + /// Graph output calls this for every vertex it materializes. Using + /// `load_change()` here is expensive for imported changes because cache + /// hits clone the entire change, including potentially large unhashed Git + /// metadata. This path copies only the requested bytes. + pub(crate) fn copy_content_span( + &self, + hash: &Hash, + start: usize, + end: usize, + buf: &mut [u8], + ) -> ChangeStoreResult { + { + if let Ok(mut cache) = self.cache.write() { + if let Some(change) = cache.get(hash) { + return copy_content_from_change(hash, change, start, end, buf); + } + } + } + + let path = self.change_path(hash); + log::debug!( + "Loading change content {} from {}", + hash.to_base32(), + path.display() + ); + + if !path.exists() { + return Err(ChangeStoreError::NotFound { + hash: hash.to_base32(), + }); + } + + let file = File::open(&path)?; + let mut reader = BufReader::new(file); + let (change, computed_hash) = Change::deserialize(&mut reader)?; + + if computed_hash != *hash { + return Err(ChangeStoreError::HashMismatch { + expected: hash.to_base32(), + computed: computed_hash.to_base32(), + }); + } + + let copied = copy_content_from_change(hash, &change, start, end, buf)?; + + if let Ok(mut cache) = self.cache.write() { + cache.insert(*hash, change); + } + + Ok(copied) + } + /// Delete a change from disk and the cache. /// /// # Arguments diff --git a/atomic-repository/src/changestore/tests.rs b/atomic-repository/src/changestore/tests.rs index 1527080..c8222eb 100644 --- a/atomic-repository/src/changestore/tests.rs +++ b/atomic-repository/src/changestore/tests.rs @@ -220,6 +220,39 @@ fn test_save_load_roundtrip() { ); } +#[test] +fn test_copy_content_span_avoids_full_change_load_on_cache_hit() { + let (store, _temp) = create_test_store(); + + let mut original = create_test_change_with_content("Test content span", b"0123456789abcdef"); + original.unhashed = Some(serde_json::json!({ + "git": { + "diff_lines": [ + { + "path": "large.rs", + "lines": (0..1000).map(|idx| serde_json::json!({ + "origin": "+", + "content": format!("line {idx}\n"), + "old_lineno": null, + "new_lineno": idx + 1, + })).collect::>() + } + ] + } + })); + + let hash = store.save_change(&original).expect("Failed to save change"); + + let mut buf = [0u8; 4]; + let copied = store + .copy_content_span(&hash, 4, 8, &mut buf) + .expect("Failed to copy content span"); + + assert_eq!(copied, 4); + assert_eq!(&buf, b"4567"); + assert_eq!(store.cache_size(), 1); +} + #[test] fn test_load_nonexistent_change() { let (store, _temp) = create_test_store(); diff --git a/atomic-repository/src/changestore/trait_impl.rs b/atomic-repository/src/changestore/trait_impl.rs index 9de3f7e..9d2a477 100644 --- a/atomic-repository/src/changestore/trait_impl.rs +++ b/atomic-repository/src/changestore/trait_impl.rs @@ -5,7 +5,7 @@ use atomic_core::change::ChangeHeader; use atomic_core::change::ChangeStore as ChangeStoreTrait; -use atomic_core::types::{Base32, GraphNode, Hash, NodeId}; +use atomic_core::types::{GraphNode, Hash, NodeId}; use super::{ChangeStore, ChangeStoreError}; @@ -44,34 +44,10 @@ impl ChangeStoreTrait for ChangeStore { } }; - // Load the change - let change = self.load_change(&hash)?; - - // Extract content bytes let start = span.start.get() as usize; let end = span.end.get() as usize; - if end > change.contents.len() { - return Err(ChangeStoreError::ContentOutOfBounds { - hash: hash.to_base32(), - requested_start: start, - requested_end: end, - content_len: change.contents.len(), - }); - } - - let len = end - start; - if buf.len() < len { - return Err(ChangeStoreError::ContentOutOfBounds { - hash: hash.to_base32(), - requested_start: start, - requested_end: end, - content_len: buf.len(), - }); - } - - buf[..len].copy_from_slice(&change.contents[start..end]); - Ok(len) + self.copy_content_span(&hash, start, end, buf) } fn get_contents_ext( @@ -85,34 +61,10 @@ impl ChangeStoreTrait for ChangeStore { None => return Ok(0), }; - // Load the change - let change = self.load_change(&hash)?; - - // Extract content bytes let start = span.start.get() as usize; let end = span.end.get() as usize; - if end > change.contents.len() { - return Err(ChangeStoreError::ContentOutOfBounds { - hash: hash.to_base32(), - requested_start: start, - requested_end: end, - content_len: change.contents.len(), - }); - } - - let len = end - start; - if buf.len() < len { - return Err(ChangeStoreError::ContentOutOfBounds { - hash: hash.to_base32(), - requested_start: start, - requested_end: end, - content_len: buf.len(), - }); - } - - buf[..len].copy_from_slice(&change.contents[start..end]); - Ok(len) + self.copy_content_span(&hash, start, end, buf) } fn get_header(&self, hash: &Hash) -> Result { diff --git a/atomic-repository/src/repository/insert.rs b/atomic-repository/src/repository/insert.rs index 103320a..2e69e21 100644 --- a/atomic-repository/src/repository/insert.rs +++ b/atomic-repository/src/repository/insert.rs @@ -85,6 +85,56 @@ pub struct ImportWriteOutcome { pub insert: InsertOutcome, } +#[derive(Default)] +struct ImportGraphFirstVertexCache { + by_inode: HashMap, +} + +#[derive(Default)] +struct ImportGraphFirstInodeCache { + by_end: HashMap, GraphNode>, + by_start: HashMap, GraphNode>, +} + +impl ImportGraphFirstVertexCache { + fn load( + &mut self, + txn: &T, + inode: Inode, + ) -> Result<&ImportGraphFirstInodeCache, RepositoryError> + where + T: TreeTxnT, + { + if !self.by_inode.contains_key(&inode) { + let mut cache = ImportGraphFirstInodeCache::default(); + let vertices = txn + .iter_inode_vertices(inode) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + for result in vertices { + let (node, _edge) = result.map_err(|e| RepositoryError::Database(e.to_string()))?; + cache.by_end.entry(node.end_pos()).or_insert(node); + cache + .by_start + .entry(node.start_pos()) + .and_modify(|existing| { + if existing.start == existing.end && node.start != node.end { + *existing = node; + } + }) + .or_insert(node); + } + self.by_inode.insert(inode, cache); + } + + self.by_inode.get(&inode).ok_or_else(|| { + RepositoryError::Apply(format!( + "missing import vertex cache for inode {}", + inode.get() + )) + }) + } +} + fn import_direct_source( pos: &Position>, by_end: &HashMap>, @@ -138,6 +188,144 @@ fn import_direct_can_apply(change: &Change) -> bool { }) } +fn import_graph_first_can_apply(change: &Change) -> bool { + !change.hunks().is_empty() + && change.hunks().iter().all(|op| match op { + GraphOp::FileAdd { .. } => true, + GraphOp::FileMove { add, .. } => { + !add.predecessors.is_empty() && add.predecessors.len() == 1 + } + GraphOp::Replacement { replacement, .. } => { + !replacement.predecessors.is_empty() && replacement.predecessors.len() == 1 + } + GraphOp::Edit { + change: atomic_core::change::Atom::Insertion(insertion), + .. + } => !insertion.predecessors.is_empty() && insertion.predecessors.len() == 1, + GraphOp::Edit { + change: atomic_core::change::Atom::EdgeUpdate(_), + .. + } => true, + _ => false, + }) +} + +fn import_graph_first_position( + txn: &T, + pos: &Position>, + change_id: NodeId, +) -> Result, RepositoryError> { + let resolved_change = match pos.change { + Some(hash) if hash == Hash::NONE => NodeId::ROOT, + Some(hash) => txn + .get_internal(&hash) + .map_err(|e| RepositoryError::Database(e.to_string()))? + .ok_or_else(|| RepositoryError::Apply(format!("missing dependency {}", hash)))?, + None => change_id, + }; + + Ok(Position::new(resolved_change, pos.pos)) +} + +fn import_graph_first_node( + txn: &T, + node: GraphNode>, + change_id: NodeId, +) -> Result, RepositoryError> { + let pos = Position::new(node.change, node.start); + let resolved = import_graph_first_position(txn, &pos, change_id)?; + Ok(GraphNode { + change: resolved.change, + start: node.start, + end: node.end, + }) +} + +fn import_graph_first_resolved_inode( + txn: &T, + inode_pos: &Position>, + change_id: NodeId, +) -> Result, RepositoryError> { + let resolved = import_graph_first_position(txn, inode_pos, change_id)?; + if resolved.change.is_root() { + return Ok(None); + } + txn.position_inode(resolved) + .map_err(|e| RepositoryError::Database(e.to_string())) +} + +fn import_graph_first_source( + txn: &T, + pos: &Position>, + inode_pos: &Position>, + resolved_inode: Option, + old_by_end: &HashMap, GraphNode>, + current_by_end: &HashMap, GraphNode>, + vertex_cache: &mut ImportGraphFirstVertexCache, + change_id: NodeId, +) -> Result, RepositoryError> +where + T: GraphTxnT + TreeTxnT, +{ + let resolved = import_graph_first_position(txn, pos, change_id)?; + + if resolved.change == change_id { + if let Some(node) = current_by_end.get(&resolved) { + return Ok(*node); + } + } + + if let Some(node) = old_by_end.get(&resolved) { + return Ok(*node); + } + + let resolved_inode_pos = import_graph_first_position(txn, inode_pos, change_id)?; + if resolved == resolved_inode_pos { + return Ok(GraphNode { + change: resolved.change, + start: resolved.pos, + end: resolved.pos, + }); + } + + if let Some(inode) = resolved_inode { + if let Some(node) = vertex_cache.load(txn, inode)?.by_end.get(&resolved) { + return Ok(*node); + } + } + + txn.find_block_end(resolved) + .map_err(|e| RepositoryError::Apply(e.to_string())) +} + +fn import_graph_first_successor( + txn: &T, + pos: &Position>, + resolved_inode: Option, + current_by_start: &HashMap, GraphNode>, + vertex_cache: &mut ImportGraphFirstVertexCache, + change_id: NodeId, +) -> Result, RepositoryError> +where + T: GraphTxnT + TreeTxnT, +{ + let resolved = import_graph_first_position(txn, pos, change_id)?; + if resolved.change == change_id { + if let Some(node) = current_by_start.get(&resolved) { + return Ok(*node); + } + } + + if let Some(inode) = resolved_inode { + if let Some(node) = vertex_cache.load(txn, inode)?.by_start.get(&resolved) { + return Ok(*node); + } + } + + txn.find_block(resolved) + .map_err(|e| RepositoryError::Apply(e.to_string())) +} + fn import_direct_write_insertion( batch: &mut atomic_core::apply::GraphWriteBatch<'_>, change_id: NodeId, @@ -213,8 +401,12 @@ impl Repository { let mut change = if recorded_files.is_empty() { Change::empty(header) } else { - match assemble_change(&txn, recorded_files, header.clone(), &AssemblyOptions::default()) - { + match assemble_change( + &txn, + recorded_files, + header.clone(), + &AssemblyOptions::default(), + ) { Ok(result) => result.into_change(), Err(e) => { let err_msg = e.to_string(); @@ -368,6 +560,484 @@ impl Repository { }) } + /// Save and apply an already-built git-import graph change. + /// + /// This bypasses record/globalize and eager CRDT table writes. The caller + /// has already compiled Git's snapshot delta into line-level graph ops. + pub fn write_import_graph_change( + &self, + change: Change, + deleted_paths: &[String], + options: InsertOptions, + ) -> Result { + let mut timings = ImportWriteTimings::default(); + let view_name = options.view.as_deref().unwrap_or(&self.current_view); + + let mut txn = self + .pristine + .write_txn() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let assemble_start = std::time::Instant::now(); + let mut v3_bytes = Vec::new(); + let hash = change + .serialize(&mut v3_bytes) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + let (final_change, verified_hash) = Change::deserialize(&mut v3_bytes.as_slice()) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + debug_assert_eq!(hash, verified_hash); + timings.assemble_ms = assemble_start.elapsed().as_millis(); + + let save_start = std::time::Instant::now(); + self.save_change_bytes(&hash, &v3_bytes, &final_change)?; + timings.save_ms = save_start.elapsed().as_millis(); + + let change_id = txn + .register_change(&hash) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_change_deps(change_id, final_change.dependencies()) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let apply_start = std::time::Instant::now(); + let insert = if import_graph_first_can_apply(&final_change) { + let (insert, graph_ms, crdt_ms) = self.write_import_graph_first_direct( + &mut txn, + view_name, + change_id, + &hash, + &final_change, + &options, + )?; + timings.direct_graph_ms = graph_ms; + timings.direct_crdt_ms = crdt_ms; + insert + } else { + write_change_to_graph( + &mut txn, + view_name, + change_id, + &hash, + &final_change, + &options, + false, + ) + .map_err(|e| RepositoryError::Apply(e.to_string()))? + }; + timings.apply_ms = apply_start.elapsed().as_millis(); + + for deleted_path in deleted_paths { + if let Ok(Some(inode)) = txn.get_inode(deleted_path) { + let _ = txn.del_tree(deleted_path); + let _ = txn.del_inode(inode); + } + } + + let commit_start = std::time::Instant::now(); + txn.commit() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + timings.commit_ms = commit_start.elapsed().as_millis(); + + Ok(ImportWriteOutcome { + hash, + timings, + insert, + }) + } + + fn write_import_graph_first_direct( + &self, + txn: &mut atomic_core::pristine::WriteTxn<'_>, + view_name: &str, + change_id: NodeId, + hash: &Hash, + change: &Change, + _options: &InsertOptions, + ) -> Result<(InsertOutcome, u128, u128), RepositoryError> { + use atomic_core::apply::compute_new_state; + + let graph_start = std::time::Instant::now(); + let mut pending_edges: Vec<( + Option, + EdgeFlags, + GraphNode, + GraphNode, + )> = Vec::new(); + + { + let mut old_by_end: HashMap, GraphNode> = HashMap::new(); + let mut current_by_end: HashMap, GraphNode> = HashMap::new(); + let mut current_by_start: HashMap, GraphNode> = HashMap::new(); + let mut vertex_cache = ImportGraphFirstVertexCache::default(); + + for graph_op in change.hunks() { + match graph_op { + GraphOp::FileAdd { + add_name, + add_inode, + contents, + path, + .. + } => { + let inode_position = Position::new(change_id, add_inode.start); + let inode = txn + .alloc_inode() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_tree(path, inode) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + txn.put_inode(inode, inode_position) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let name_node = GraphNode { + change: change_id, + start: add_name.start, + end: add_name.end, + }; + let name_inode = + import_graph_first_resolved_inode(&*txn, &add_name.inode, change_id)?; + let name_source = import_graph_first_source( + &*txn, + &add_name.predecessors[0], + &add_name.inode, + name_inode, + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + name_inode, + add_name.flag | EdgeFlags::BLOCK, + name_source, + name_node, + )); + current_by_end.insert(name_node.end_pos(), name_node); + current_by_start.insert(name_node.start_pos(), name_node); + + let inode_node = GraphNode { + change: change_id, + start: add_inode.start, + end: add_inode.end, + }; + let inode_source = import_graph_first_source( + &*txn, + &add_inode.predecessors[0], + &add_inode.inode, + Some(inode), + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + Some(inode), + add_inode.flag | EdgeFlags::BLOCK, + inode_source, + inode_node, + )); + current_by_end.insert(inode_node.end_pos(), inode_node); + current_by_start.insert(inode_node.start_pos(), inode_node); + + if let Some(contents) = contents { + let content_node = GraphNode { + change: change_id, + start: contents.start, + end: contents.end, + }; + let content_source = import_graph_first_source( + &*txn, + &contents.predecessors[0], + &contents.inode, + Some(inode), + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + Some(inode), + contents.flag | EdgeFlags::BLOCK, + content_source, + content_node, + )); + current_by_end.insert(content_node.end_pos(), content_node); + current_by_start.insert(content_node.start_pos(), content_node); + } + } + GraphOp::Replacement { + change: edge_update, + replacement, + .. + } => { + let resolved_inode = import_graph_first_resolved_inode( + &*txn, + &edge_update.inode, + change_id, + )?; + + for edge in &edge_update.edges { + let target = import_graph_first_node(&*txn, edge.to, change_id)?; + let source = import_graph_first_source( + &*txn, + &edge.from, + &edge_update.inode, + resolved_inode, + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push((resolved_inode, edge.flag, source, target)); + old_by_end.insert(target.end_pos(), target); + } + + let node = GraphNode { + change: change_id, + start: replacement.start, + end: replacement.end, + }; + let source = import_graph_first_source( + &*txn, + &replacement.predecessors[0], + &replacement.inode, + resolved_inode, + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + resolved_inode, + replacement.flag | EdgeFlags::BLOCK, + source, + node, + )); + + for successor in &replacement.successors { + let target = import_graph_first_successor( + &*txn, + successor, + resolved_inode, + ¤t_by_start, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + resolved_inode, + replacement.flag | EdgeFlags::BLOCK, + node, + target, + )); + } + + current_by_end.insert(node.end_pos(), node); + current_by_start.insert(node.start_pos(), node); + } + GraphOp::FileMove { del, add, path } => { + let resolved_inode = + import_graph_first_resolved_inode(&*txn, &add.inode, change_id)?; + + for edge in &del.edges { + let target = import_graph_first_node(&*txn, edge.to, change_id)?; + let source = import_graph_first_source( + &*txn, + &edge.from, + &del.inode, + resolved_inode, + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push((resolved_inode, edge.flag, source, target)); + old_by_end.insert(target.end_pos(), target); + } + + let node = GraphNode { + change: change_id, + start: add.start, + end: add.end, + }; + let source = import_graph_first_source( + &*txn, + &add.predecessors[0], + &add.inode, + resolved_inode, + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + resolved_inode, + add.flag | EdgeFlags::BLOCK, + source, + node, + )); + + let inode_pos = import_graph_first_position(&*txn, &add.inode, change_id)?; + for successor in &add.successors { + let resolved_successor = + import_graph_first_position(&*txn, successor, change_id)?; + let target = if resolved_successor == inode_pos { + GraphNode { + change: inode_pos.change, + start: inode_pos.pos, + end: inode_pos.pos, + } + } else { + import_graph_first_successor( + &*txn, + successor, + resolved_inode, + ¤t_by_start, + &mut vertex_cache, + change_id, + )? + }; + pending_edges.push(( + resolved_inode, + add.flag | EdgeFlags::BLOCK, + node, + target, + )); + } + + if let Some(inode) = resolved_inode { + if let Ok(Some(old_path)) = txn.get_path(inode) { + if old_path != *path { + let _ = txn.del_tree(&old_path); + } + } + txn.put_tree(path, inode) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + } + + current_by_end.insert(node.end_pos(), node); + current_by_start.insert(node.start_pos(), node); + } + GraphOp::Edit { + change: atomic_core::change::Atom::Insertion(insertion), + .. + } => { + let resolved_inode = + import_graph_first_resolved_inode(&*txn, &insertion.inode, change_id)?; + let node = GraphNode { + change: change_id, + start: insertion.start, + end: insertion.end, + }; + let source = import_graph_first_source( + &*txn, + &insertion.predecessors[0], + &insertion.inode, + resolved_inode, + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + resolved_inode, + insertion.flag | EdgeFlags::BLOCK, + source, + node, + )); + + for successor in &insertion.successors { + let target = import_graph_first_successor( + &*txn, + successor, + resolved_inode, + ¤t_by_start, + &mut vertex_cache, + change_id, + )?; + pending_edges.push(( + resolved_inode, + insertion.flag | EdgeFlags::BLOCK, + node, + target, + )); + } + + current_by_end.insert(node.end_pos(), node); + current_by_start.insert(node.start_pos(), node); + } + GraphOp::Edit { + change: atomic_core::change::Atom::EdgeUpdate(edge_update), + .. + } => { + let resolved_inode = import_graph_first_resolved_inode( + &*txn, + &edge_update.inode, + change_id, + )?; + + for edge in &edge_update.edges { + let target = import_graph_first_node(&*txn, edge.to, change_id)?; + let source = import_graph_first_source( + &*txn, + &edge.from, + &edge_update.inode, + resolved_inode, + &old_by_end, + ¤t_by_end, + &mut vertex_cache, + change_id, + )?; + pending_edges.push((resolved_inode, edge.flag, source, target)); + old_by_end.insert(target.end_pos(), target); + } + } + _ => { + return Err(RepositoryError::Apply( + "graph-first direct import received unsupported graph op".to_string(), + )); + } + } + } + } + + { + let mut graph_batch = atomic_core::apply::GraphWriteBatch::new(&*txn) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + for (inode, flag, source, target) in pending_edges { + graph_batch + .add_edge_with_reverse(inode, flag, source, target, change_id) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + } + } + let graph_ms = graph_start.elapsed().as_millis(); + + // Phase 1 of git import writes the graph truth and stores semantic + // FileOps in the change file, but intentionally does not fan those + // FileOps out into CRDT tables. That table materialization is phase 2. + let crdt_ms = 0; + + let mut view = txn + .open_or_create_view(view_name) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + let new_state = compute_new_state(&view.state, hash); + let sequence = view.change_count + 1; + txn.put_change(&mut view, change_id, hash) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + view.state = new_state; + view.change_count = sequence; + txn.update_view(&view) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let mut stats = InsertStats::new(); + stats.changes_applied = 1; + stats.applied_hashes.push(*hash); + stats.atoms_processed = change.hunks().len(); + + Ok(( + InsertOutcome::new(new_state, sequence, false, stats), + graph_ms, + crdt_ms, + )) + } + fn write_import_direct_add_chain( &self, txn: &mut atomic_core::pristine::WriteTxn<'_>, diff --git a/atomic-repository/src/repository/mod.rs b/atomic-repository/src/repository/mod.rs index f45fe09..dd3d5cf 100644 --- a/atomic-repository/src/repository/mod.rs +++ b/atomic-repository/src/repository/mod.rs @@ -77,6 +77,7 @@ use crate::RepositoryError; mod filter; mod materialize; +mod semantic_materialize; mod switch; mod views; @@ -113,8 +114,9 @@ mod vault_intent; mod vault_kg_enrich; mod vault_names; mod vault_triples; -pub use vault_embeddings::{hash_embed, EmbedConfig, TextChunk}; pub use insert::{ImportWriteOutcome, ImportWriteTimings}; +pub use semantic_materialize::{CrdtMaterializeOptions, CrdtMaterializeOutcome}; +pub use vault_embeddings::{hash_embed, EmbedConfig, TextChunk}; pub use vault_goal::{ GoalInfo, GoalStartOptions, GoalStartResult, GoalStopOptions, GoalStopResult, }; diff --git a/atomic-repository/src/repository/semantic_materialize.rs b/atomic-repository/src/repository/semantic_materialize.rs new file mode 100644 index 0000000..89bb162 --- /dev/null +++ b/atomic-repository/src/repository/semantic_materialize.rs @@ -0,0 +1,619 @@ +use std::collections::HashMap; +use std::time::Instant; + +use atomic_core::apply::{apply_file_ops_batched_groups, ApplyFileOpsStats}; +use atomic_core::change::{Atom, Change, FileOps, GraphOp, LineOps}; +use atomic_core::crdt::tables::{encode_branch_id, encode_trunk_id, encode_vertex_position}; +use atomic_core::crdt::{BranchId, BranchOp, LeafOp, TrunkId, TrunkOp}; +use atomic_core::pristine::{CrdtTxnT, GraphTxnT, MutTxnT, ViewTxnT}; +use atomic_core::types::{ChangePosition, GraphNode, NodeId}; + +use crate::apply::get_view_changes; +use crate::{Repository, RepositoryError}; + +/// Options for materializing stored FileOps into CRDT tables. +#[derive(Debug, Clone)] +pub struct CrdtMaterializeOptions { + /// View whose ordered changes should be replayed. + pub view: Option, + /// Re-apply rows that appear to already have CRDT trunks. + pub force: bool, +} + +impl Default for CrdtMaterializeOptions { + fn default() -> Self { + Self { + view: None, + force: false, + } + } +} + +/// Summary from a CRDT materialization pass. +#[derive(Debug, Clone, Default)] +pub struct CrdtMaterializeOutcome { + pub view: String, + pub changes_scanned: usize, + pub changes_applied: usize, + pub file_ops_applied: usize, + pub file_ops_already_materialized: usize, + pub file_ops_skipped: usize, + pub skip_stats: CrdtMaterializeSkipStats, + pub skip_samples: Vec, + pub elapsed_ms: u128, + pub stats: ApplyFileOpsStats, +} + +/// Why FileOps were not materialized in the current phase-2 pass. +#[derive(Debug, Clone, Default)] +pub struct CrdtMaterializeSkipStats { + pub non_create_trunk: usize, + pub unresolved_path: usize, + pub unresolved_line: usize, + pub missing_content_range: usize, + pub non_insert_branch: usize, + pub non_insert_leaf: usize, +} + +impl CrdtMaterializeSkipStats { + pub fn total(&self) -> usize { + self.non_create_trunk + + self.unresolved_path + + self.unresolved_line + + self.missing_content_range + + self.non_insert_branch + + self.non_insert_leaf + } +} + +impl Repository { + /// Populate CRDT tables from FileOps already stored in changes. + /// + /// This is phase 2 for graph-first Git import: phase 1 writes every graph + /// vertex/edge and stores semantic operations in the change file; this pass + /// fans out the subset of those operations that are already graph-linked and + /// safe to index. Diff-only placeholder FileOps remain stored for review + /// metadata but are skipped until they can be resolved against existing + /// branch IDs. + pub fn materialize_crdt_from_changes( + &self, + options: CrdtMaterializeOptions, + ) -> Result { + let view_name = options + .view + .clone() + .unwrap_or_else(|| self.current_view.clone()); + let start = Instant::now(); + let mut txn = self + .pristine + .write_txn() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let view = txn + .get_view(&view_name) + .map_err(|e| RepositoryError::Database(e.to_string()))? + .ok_or_else(|| RepositoryError::ViewNotFound { + name: view_name.clone(), + })?; + + let changes = + get_view_changes(&txn, &view).map_err(|e| RepositoryError::Apply(e.to_string()))?; + + let mut outcome = CrdtMaterializeOutcome { + view: view_name, + changes_scanned: changes.len(), + ..CrdtMaterializeOutcome::default() + }; + + let mut groups = Vec::new(); + let mut live_files: HashMap = HashMap::new(); + + for (_seq, hash) in changes { + let change_id = txn + .get_internal(&hash) + .map_err(|e| RepositoryError::Database(e.to_string()))? + .ok_or_else(|| RepositoryError::ChangeNotFound { + hash: hash.to_string(), + })?; + let change = self.load_change(&hash)?; + if !change.has_file_ops() { + continue; + } + if !is_git_import_change(&change) { + outcome.file_ops_already_materialized += change.file_ops().len(); + continue; + } + + apply_git_path_metadata(&change, &mut live_files); + let insertion_ranges = collect_insertion_ranges_by_path(&change); + let mut next_branch_idx = next_change_branch_idx(change.file_ops()); + let mut safe_ops = Vec::new(); + for ops in change.file_ops() { + if phase2_can_materialize_create_file_ops(ops) { + if !options.force && crdt_trunk_exists(&txn, change_id, ops)? { + seed_materialized_file(change_id, ops, &mut live_files); + outcome.file_ops_already_materialized += 1; + continue; + } + seed_materialized_file(change_id, ops, &mut live_files); + safe_ops.push(ops.clone()); + } else if ops.trunk_op().is_none() { + match resolve_existing_file_ops( + change_id, + ops, + insertion_ranges.get(ops.path()), + &mut live_files, + &mut next_branch_idx, + ) { + Ok(Some(resolved)) => { + if !options.force + && file_ops_already_materialized(&txn, change_id, &resolved)? + { + outcome.file_ops_already_materialized += 1; + } else { + safe_ops.push(resolved); + } + } + Ok(None) => { + record_skip( + &mut outcome, + ops.path(), + CrdtMaterializeSkipReason::NonCreateTrunk, + ); + } + Err(reason) => { + record_skip(&mut outcome, ops.path(), reason); + } + } + } else { + record_skip( + &mut outcome, + ops.path(), + CrdtMaterializeSkipReason::NonCreateTrunk, + ); + } + } + + if safe_ops.is_empty() { + continue; + } + outcome.file_ops_applied += safe_ops.len(); + outcome.changes_applied += 1; + groups.push((change_id, safe_ops)); + } + + if !groups.is_empty() { + outcome.stats = apply_file_ops_batched_groups(&mut txn, &groups) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + } + + txn.commit() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + outcome.elapsed_ms = start.elapsed().as_millis(); + Ok(outcome) + } +} + +fn crdt_trunk_exists( + txn: &T, + change_id: NodeId, + ops: &FileOps, +) -> Result { + let raw = ops.trunk_id(); + let trunk_id = if raw.change_id().is_root() { + TrunkId::new(change_id, raw.file_idx()) + } else { + raw + }; + let key = encode_trunk_id(&trunk_id); + txn.get_crdt_trunk(&key) + .map(|v| v.is_some()) + .map_err(|e| RepositoryError::Database(e.to_string())) +} + +fn file_ops_already_materialized( + txn: &T, + change_id: NodeId, + ops: &FileOps, +) -> Result { + if ops.line_ops().is_empty() { + return Ok(false); + } + + for line in ops.line_ops() { + let branch_id = line.branch_id(); + let branch_key = encode_branch_id(&branch_id); + match line.operation() { + BranchOp::Insert { .. } | BranchOp::Modify { .. } => { + let Some((start, end)) = line.content_range() else { + return Ok(false); + }; + let vertex_key = encode_vertex_position(&GraphNode { + change: change_id, + start, + end, + }); + let mapped = txn + .get_crdt_vertex_branch(&vertex_key) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + if mapped != Some(branch_id) { + return Ok(false); + } + } + BranchOp::Delete { .. } => { + let branch = txn + .get_crdt_branch(&branch_key) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + if !branch.is_some_and(|branch| branch.state.is_deleted()) { + return Ok(false); + } + } + BranchOp::Restore { .. } => { + let branch = txn + .get_crdt_branch(&branch_key) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + if !branch.is_some_and(|branch| branch.state.is_alive()) { + return Ok(false); + } + } + BranchOp::Reparent { .. } => return Ok(false), + } + } + + Ok(true) +} + +fn record_skip( + outcome: &mut CrdtMaterializeOutcome, + path: &str, + reason: CrdtMaterializeSkipReason, +) { + outcome.skip_stats.record(reason); + outcome.file_ops_skipped += 1; + if outcome.skip_samples.len() < 16 { + outcome.skip_samples.push(format!("{}:{:?}", path, reason)); + } +} + +fn is_git_import_change(change: &Change) -> bool { + change + .unhashed + .as_ref() + .and_then(|value| value.get("git")) + .is_some() +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum CrdtMaterializeSkipReason { + NonCreateTrunk, + UnresolvedPath, + UnresolvedLine, + MissingContentRange, + NonInsertBranch, + NonInsertLeaf, +} + +impl CrdtMaterializeSkipStats { + fn record(&mut self, reason: CrdtMaterializeSkipReason) { + match reason { + CrdtMaterializeSkipReason::NonCreateTrunk => self.non_create_trunk += 1, + CrdtMaterializeSkipReason::UnresolvedPath => self.unresolved_path += 1, + CrdtMaterializeSkipReason::UnresolvedLine => self.unresolved_line += 1, + CrdtMaterializeSkipReason::MissingContentRange => self.missing_content_range += 1, + CrdtMaterializeSkipReason::NonInsertBranch => self.non_insert_branch += 1, + CrdtMaterializeSkipReason::NonInsertLeaf => self.non_insert_leaf += 1, + } + } +} + +#[derive(Debug, Clone)] +struct MaterializedFileState { + trunk_id: TrunkId, + branches: Vec, +} + +type InsertionRangeMap = HashMap>>; + +fn phase2_can_materialize_create_file_ops(ops: &FileOps) -> bool { + phase2_create_skip_reason(ops).is_none() +} + +fn phase2_create_skip_reason(ops: &FileOps) -> Option { + if !matches!(ops.trunk_op(), Some(TrunkOp::Create { .. })) { + return Some(CrdtMaterializeSkipReason::NonCreateTrunk); + } + + for line in ops.line_ops() { + if line.content_range().is_none() { + return Some(CrdtMaterializeSkipReason::MissingContentRange); + } + let BranchOp::Insert { content, .. } = line.operation() else { + return Some(CrdtMaterializeSkipReason::NonInsertBranch); + }; + if !content + .iter() + .all(|leaf| matches!(leaf, LeafOp::Insert { .. })) + { + return Some(CrdtMaterializeSkipReason::NonInsertLeaf); + } + } + + None +} + +fn next_change_branch_idx(file_ops: &[FileOps]) -> u32 { + file_ops + .iter() + .flat_map(FileOps::line_ops) + .map(|line| line.branch_id().branch_idx()) + .max() + .map_or(0, |idx| idx.saturating_add(1)) +} + +fn resolve_trunk_id(change_id: NodeId, trunk_id: TrunkId) -> TrunkId { + if trunk_id.change_id().is_root() { + TrunkId::new(change_id, trunk_id.file_idx()) + } else { + trunk_id + } +} + +fn seed_materialized_file( + change_id: NodeId, + ops: &FileOps, + live_files: &mut HashMap, +) { + let trunk_id = resolve_trunk_id(change_id, ops.trunk_id()); + let branches = ops + .line_ops() + .iter() + .map(|line| { + let branch = line.branch_id(); + if branch.change_id().is_root() { + BranchId::new(change_id, branch.branch_idx()) + } else { + branch + } + }) + .collect(); + live_files.insert( + ops.path().to_string(), + MaterializedFileState { trunk_id, branches }, + ); +} + +fn collect_insertion_ranges_by_path(change: &Change) -> InsertionRangeMap { + let mut ranges: InsertionRangeMap = HashMap::new(); + for hunk in change.hunks() { + match hunk { + GraphOp::Edit { + change: Atom::Insertion(insertion), + local, + .. + } => { + ranges + .entry(local.path.clone()) + .or_default() + .entry(local.line as usize) + .or_default() + .push((insertion.start, insertion.end)); + } + GraphOp::Replacement { + replacement, local, .. + } => { + ranges + .entry(local.path.clone()) + .or_default() + .entry(local.line as usize) + .or_default() + .push((replacement.start, replacement.end)); + } + _ => {} + } + } + ranges +} + +fn apply_git_path_metadata( + change: &Change, + live_files: &mut HashMap, +) { + let Some(diff_files) = change + .unhashed + .as_ref() + .and_then(|value| value.get("git")) + .and_then(|git| git.get("diff_lines")) + .and_then(|diff_lines| diff_lines.as_array()) + else { + return; + }; + + for file in diff_files { + let operation = file.get("operation").and_then(|v| v.as_str()); + let path = file.get("path").and_then(|v| v.as_str()); + let old_path = file.get("old_path").and_then(|v| v.as_str()); + match (operation, old_path, path) { + (Some("renamed"), Some(old_path), Some(path)) => { + if let Some(state) = live_files.remove(old_path) { + live_files.insert(path.to_string(), state); + } + } + (Some("copied"), Some(old_path), Some(path)) => { + if let Some(state) = live_files.get(old_path).cloned() { + live_files.insert(path.to_string(), state); + } + } + _ => {} + } + } +} + +fn take_insertion_range( + ranges: &mut Option>>, + line: usize, +) -> Option<(ChangePosition, ChangePosition)> { + let line_ranges = ranges.as_mut()?.get_mut(&line)?; + if line_ranges.is_empty() { + None + } else { + Some(line_ranges.remove(0)) + } +} + +fn resolve_existing_file_ops( + change_id: NodeId, + ops: &FileOps, + insertion_ranges: Option<&HashMap>>, + live_files: &mut HashMap, + next_branch_idx: &mut u32, +) -> Result, CrdtMaterializeSkipReason> { + let Some(current) = live_files.get(ops.path()).cloned() else { + return Err(CrdtMaterializeSkipReason::UnresolvedPath); + }; + + let mut branches = current.branches; + let mut resolved = FileOps::edit(current.trunk_id, ops.path().to_string()); + let mut ranges = insertion_ranges.cloned(); + let mut line_offset: isize = 0; + + for line in ops.line_ops() { + match line.operation() { + BranchOp::Delete { content, .. } => { + let old_line = line + .old_line_num() + .ok_or(CrdtMaterializeSkipReason::UnresolvedLine)?; + let idx = adjusted_line_index(old_line, line_offset, branches.len())?; + let branch_id = branches.remove(idx); + resolved.add_line_op(LineOps::delete(branch_id, content.clone())); + line_offset -= 1; + } + BranchOp::Insert { content, .. } => { + let new_line = line + .new_line_num() + .ok_or(CrdtMaterializeSkipReason::UnresolvedLine)?; + let (start, end) = take_insertion_range(&mut ranges, new_line) + .ok_or(CrdtMaterializeSkipReason::MissingContentRange)?; + let idx = new_line.saturating_sub(1).min(branches.len()); + let after = if idx == 0 { + None + } else { + Some(branches[idx - 1]) + }; + let branch_id = BranchId::new(change_id, *next_branch_idx); + *next_branch_idx = next_branch_idx.saturating_add(1); + let op = LineOps::insert(branch_id, after, content.clone()) + .with_new_line_num(new_line) + .with_content_range(start, end); + resolved.add_line_op(op); + branches.insert(idx, branch_id); + line_offset += 1; + } + BranchOp::Modify { + old_content, + new_content, + .. + } => { + let old_line = line + .old_line_num() + .ok_or(CrdtMaterializeSkipReason::UnresolvedLine)?; + let new_line = line + .new_line_num() + .ok_or(CrdtMaterializeSkipReason::UnresolvedLine)?; + let idx = adjusted_line_index(old_line, line_offset, branches.len())?; + let branch_id = branches[idx]; + let (start, end) = take_insertion_range(&mut ranges, new_line) + .ok_or(CrdtMaterializeSkipReason::MissingContentRange)?; + let op = LineOps::modify(branch_id, old_content.clone(), new_content.clone()) + .with_old_line_num(old_line) + .with_new_line_num(new_line) + .with_content_range(start, end); + resolved.add_line_op(op); + } + BranchOp::Restore { .. } | BranchOp::Reparent { .. } => { + return Err(CrdtMaterializeSkipReason::NonInsertBranch); + } + } + } + + if resolved.line_ops().is_empty() { + return Ok(None); + } + + live_files.insert( + ops.path().to_string(), + MaterializedFileState { + trunk_id: current.trunk_id, + branches, + }, + ); + Ok(Some(resolved)) +} + +fn adjusted_line_index( + one_based_line: usize, + line_offset: isize, + len: usize, +) -> Result { + let base = one_based_line + .checked_sub(1) + .ok_or(CrdtMaterializeSkipReason::UnresolvedLine)?; + let adjusted = base as isize + line_offset; + if adjusted < 0 { + return Err(CrdtMaterializeSkipReason::UnresolvedLine); + } + let idx = adjusted as usize; + if idx >= len { + return Err(CrdtMaterializeSkipReason::UnresolvedLine); + } + Ok(idx) +} + +#[cfg(test)] +mod tests { + use super::*; + use atomic_core::change::{Encoding, LineOps}; + use atomic_core::crdt::{BranchId, LeafId}; + use atomic_core::diff::TokenKind; + + #[test] + fn materialize_filter_accepts_graph_linked_create_inserts() { + let mut ops = FileOps::create( + TrunkId::new(NodeId::ROOT, 0), + "src/lib.rs".to_string(), + Some(Encoding::Utf8), + ); + ops.add_line_op( + LineOps::insert( + BranchId::new(NodeId::ROOT, 0), + None, + vec![LeafOp::Insert { + after: Some(LeafId::new(NodeId::ROOT, 0)), + kind: TokenKind::Word, + content: b"fn".to_vec(), + }], + ) + .with_content_range(0usize.into(), 3usize.into()), + ); + + assert_eq!(phase2_create_skip_reason(&ops), None); + } + + #[test] + fn materialize_filter_rejects_diff_only_edit_ops() { + let mut ops = FileOps::edit(TrunkId::new(NodeId::ROOT, 0), "src/lib.rs".to_string()); + ops.add_line_op(LineOps::insert( + BranchId::new(NodeId::ROOT, 0), + None, + vec![LeafOp::Insert { + after: None, + kind: TokenKind::Word, + content: b"fn".to_vec(), + }], + )); + + assert_eq!( + phase2_create_skip_reason(&ops), + Some(CrdtMaterializeSkipReason::NonCreateTrunk) + ); + } +} From e0461a0cfd6b3b3c8b6cbe69d085e33c474242c8 Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Tue, 19 May 2026 16:28:17 -0400 Subject: [PATCH 4/8] fix provenance tracing --- .../src/turn/orchestrator/provenance.rs | 22 +++++- atomic-agent/src/turn/orchestrator/turn.rs | 76 +++++++++++++++++++ 2 files changed, 94 insertions(+), 4 deletions(-) diff --git a/atomic-agent/src/turn/orchestrator/provenance.rs b/atomic-agent/src/turn/orchestrator/provenance.rs index c81b355..a34fb54 100644 --- a/atomic-agent/src/turn/orchestrator/provenance.rs +++ b/atomic-agent/src/turn/orchestrator/provenance.rs @@ -75,8 +75,15 @@ impl TurnOrchestrator { } }; - if let Err(e) = lock_file.lock_exclusive() { - log::warn!("Failed to acquire lock for session {}: {}", session_id, e,); + if let Err(e) = lock_file.try_lock_exclusive() { + if e.kind() == std::io::ErrorKind::WouldBlock { + log::warn!( + "Provenance accumulator for session {} is already locked; skipping best-effort provenance update", + session_id, + ); + } else { + log::warn!("Failed to acquire lock for session {}: {}", session_id, e,); + } return None; } @@ -146,8 +153,15 @@ impl TurnOrchestrator { } }; - if let Err(e) = lock_file.lock_exclusive() { - log::warn!("Failed to acquire lock for session {}: {}", session_id, e); + if let Err(e) = lock_file.try_lock_exclusive() { + if e.kind() == std::io::ErrorKind::WouldBlock { + log::warn!( + "Provenance accumulator for session {} is already locked; skipping save", + session_id, + ); + } else { + log::warn!("Failed to acquire lock for session {}: {}", session_id, e); + } return; } diff --git a/atomic-agent/src/turn/orchestrator/turn.rs b/atomic-agent/src/turn/orchestrator/turn.rs index 24b270d..07e3619 100644 --- a/atomic-agent/src/turn/orchestrator/turn.rs +++ b/atomic-agent/src/turn/orchestrator/turn.rs @@ -2,6 +2,7 @@ //! //! Contains handlers for TurnStart, TurnEnd, and ToolUse events. +use std::fs::File; use std::path::Path; use crate::error::{AgentError, AgentResult}; @@ -11,6 +12,24 @@ use crate::turn::phase::{self, Action, Event, TransitionContext}; use super::{DispatchResult, TurnOrchestrator}; +const TURN_END_LOCK_FILENAME: &str = "turn-end.lock"; + +struct TurnEndLockGuard { + file: File, +} + +impl Drop for TurnEndLockGuard { + fn drop(&mut self) { + let _ = fs2::FileExt::unlock(&self.file); + } +} + +enum TurnEndLock { + Acquired(TurnEndLockGuard), + Busy, + Unavailable, +} + impl TurnOrchestrator { /// Handle a TurnStart event (UserPromptSubmit). /// @@ -117,6 +136,18 @@ impl TurnOrchestrator { event: TurnEvent, ) -> AgentResult { let session_id = &event.session_id; + let _turn_end_lock = match self.try_turn_end_lock(session_id) { + TurnEndLock::Acquired(guard) => Some(guard), + TurnEndLock::Busy => { + log::warn!( + "Turn end for session {} is already being recorded; skipping duplicate Stop hook", + session_id + ); + return Ok(DispatchResult::new(session_id, phase::Phase::Idle) + .with_warning("duplicate Stop hook skipped: turn already recording")); + } + TurnEndLock::Unavailable => None, + }; // Fast gate: check if anything changed since the last record. // This bypasses the entire status machinery (TREE scan, filesystem @@ -385,4 +416,49 @@ impl TurnOrchestrator { Ok(DispatchResult::new(session_id, session.phase)) } + + fn try_turn_end_lock(&self, session_id: &str) -> TurnEndLock { + use fs2::FileExt; + + let dir = self.session_graph_dir(session_id); + if let Err(e) = std::fs::create_dir_all(&dir) { + log::warn!( + "Failed to create turn-end lock dir for session {}: {}", + session_id, + e + ); + return TurnEndLock::Unavailable; + } + + let lock_path = dir.join(TURN_END_LOCK_FILENAME); + let file = match std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(false) + .open(&lock_path) + { + Ok(file) => file, + Err(e) => { + log::warn!( + "Failed to open turn-end lock for session {}: {}", + session_id, + e + ); + return TurnEndLock::Unavailable; + } + }; + + match file.try_lock_exclusive() { + Ok(()) => TurnEndLock::Acquired(TurnEndLockGuard { file }), + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => TurnEndLock::Busy, + Err(e) => { + log::warn!( + "Failed to acquire turn-end lock for session {}: {}", + session_id, + e + ); + TurnEndLock::Unavailable + } + } + } } From 48ab43fb64c1ddb37c348895a4738946859d67c9 Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Tue, 19 May 2026 22:10:17 -0400 Subject: [PATCH 5/8] working speed bump --- atomic-agent/src/record/mod.rs | 2 + atomic-cli/src/commands/agent/hooks.rs | 89 ++++++++- atomic-cli/src/commands/git/import.rs | 85 +++++++-- atomic-cli/src/commands/git/mod.rs | 6 +- atomic-cli/src/commands/git/parallel.rs | 176 +++++++++++++----- atomic-cli/src/main.rs | 4 +- atomic-cli/tests/push_integration_test.rs | 4 +- atomic-repository/src/record/options.rs | 61 ++++++ atomic-repository/src/repository/insert.rs | 25 +-- atomic-repository/src/repository/record.rs | 65 ++++--- .../src/repository/semantic_materialize.rs | 11 +- tests/harness/10_git_import.sh | 26 +-- 12 files changed, 426 insertions(+), 128 deletions(-) diff --git a/atomic-agent/src/record/mod.rs b/atomic-agent/src/record/mod.rs index 2f799fa..bc47ecf 100644 --- a/atomic-agent/src/record/mod.rs +++ b/atomic-agent/src/record/mod.rs @@ -258,6 +258,8 @@ pub fn record_turn( .view(options.session.view_name.clone()) .apply_after_record(true) .save_to_store(true) + .sync_vault(false) + .enrich_kg(false) .provenance(vec![provenance_entry]) .metadata_bytes(envelope_bytes); diff --git a/atomic-cli/src/commands/agent/hooks.rs b/atomic-cli/src/commands/agent/hooks.rs index 21b969c..c73d8f5 100644 --- a/atomic-cli/src/commands/agent/hooks.rs +++ b/atomic-cli/src/commands/agent/hooks.rs @@ -49,7 +49,8 @@ //! {"systemMessage": "Atomic is tracking this session..."} //! ``` -use std::io::Read; +use std::io::{Read, Write}; +use std::process::{Command as ProcessCommand, Stdio}; use anyhow::anyhow; use clap::Args; @@ -82,6 +83,10 @@ pub struct Hooks { /// The hook verb (e.g., "stop", "user-prompt-submit", "session-start"). verb: String, + + /// Run the hook body in this process. + #[arg(long, hide = true)] + foreground: bool, } impl Command for Hooks { @@ -95,6 +100,10 @@ impl Command for Hooks { )) })?; + if self.should_handoff_codex_stop() { + return self.handoff_codex_stop(&input); + } + // Look up the agent adapter let registry = AgentRegistry::with_defaults(); let agent = registry @@ -200,6 +209,49 @@ impl Command for Hooks { } } +impl Hooks { + fn should_handoff_codex_stop(&self) -> bool { + !self.foreground && self.agent_name == "codex" && self.verb == "stop" + } + + fn handoff_codex_stop(&self, input: &[u8]) -> CliResult<()> { + let exe = std::env::current_exe().map_err(|e| { + CliError::Internal(anyhow!("Failed to resolve atomic executable: {}", e)) + })?; + + let mut child = ProcessCommand::new(exe) + .arg("agent") + .arg("hooks") + .arg(&self.agent_name) + .arg(&self.verb) + .arg("--foreground") + .stdin(Stdio::piped()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .map_err(|e| { + CliError::Internal(anyhow!( + "Failed to start background Codex Stop recorder: {}", + e + )) + })?; + + if let Some(mut stdin) = child.stdin.take() { + stdin.write_all(input).map_err(|e| { + CliError::Io(std::io::Error::new( + e.kind(), + format!( + "Failed to pass Codex Stop input to background recorder: {}", + e + ), + )) + })?; + } + + Ok(()) + } +} + // Tests #[cfg(test)] @@ -211,6 +263,7 @@ mod tests { let hooks = Hooks { agent_name: "claude-code".to_string(), verb: "stop".to_string(), + foreground: false, }; assert_eq!(hooks.agent_name, "claude-code"); assert_eq!(hooks.verb, "stop"); @@ -318,9 +371,43 @@ mod tests { let hooks = Hooks { agent_name: "claude-code".to_string(), verb: "session-start".to_string(), + foreground: false, }; let debug = format!("{:?}", hooks); assert!(debug.contains("claude-code")); assert!(debug.contains("session-start")); } + + #[test] + fn test_codex_stop_handoffs_by_default() { + let hooks = Hooks { + agent_name: "codex".to_string(), + verb: "stop".to_string(), + foreground: false, + }; + + assert!(hooks.should_handoff_codex_stop()); + } + + #[test] + fn test_codex_stop_foreground_disables_handoff() { + let hooks = Hooks { + agent_name: "codex".to_string(), + verb: "stop".to_string(), + foreground: true, + }; + + assert!(!hooks.should_handoff_codex_stop()); + } + + #[test] + fn test_non_codex_stop_does_not_handoff() { + let hooks = Hooks { + agent_name: "claude-code".to_string(), + verb: "stop".to_string(), + foreground: false, + }; + + assert!(!hooks.should_handoff_codex_stop()); + } } diff --git a/atomic-cli/src/commands/git/import.rs b/atomic-cli/src/commands/git/import.rs index d895f32..4d49c03 100644 --- a/atomic-cli/src/commands/git/import.rs +++ b/atomic-cli/src/commands/git/import.rs @@ -8,7 +8,7 @@ //! The import process: //! 1. Opens the Git repository in the current directory //! 2. Resolves the target branch (default or specified) -//! 3. Walks commit history in topological order (oldest first) +//! 3. Walks first-parent commit history in topological order (oldest first) //! 4. For each commit, creates an Atomic change with: //! - Author from Git commit //! - Message from commit subject/body @@ -21,7 +21,8 @@ //! //! - Submodules are skipped with a warning //! - Binary files are imported as-is -//! - Merge commits are linearized (first parent only) +//! - Default imports are mainline-only (first parent) +//! - Use `--all` to import all local branches as views use std::collections::HashSet; use std::path::Path; @@ -56,10 +57,13 @@ pub struct Import { #[arg(long, short = 'b', value_name = "BRANCH")] pub branch: Option, - /// Import all branches as separate views. + /// Import all local branches as separate views. /// - /// Creates one Atomic view for each Git branch found in the repository. - #[arg(long)] + /// Creates one Atomic view for each Git branch found in the repository and + /// imports the full reachable history for those branches. By default, + /// `atomic git import` imports only the selected branch's mainline + /// first-parent history. + #[arg(long = "all", visible_alias = "all-branches")] pub all_branches: bool, /// Only import commits not already in Atomic. @@ -86,6 +90,13 @@ pub struct Import { } fn import_ignore_patterns(workdir: &Path, kind: Option<&str>) -> Vec { + const COMMON_IMPORT_IGNORES: &[&str] = &[ + "node_modules/", + "bower_components/", + ".yarn/cache/", + ".pnpm-store/", + ]; + let template = if let Some(kind) = kind { super::super::init::get_ignore_template(kind) } else if workdir.join("Cargo.toml").exists() { @@ -100,13 +111,22 @@ fn import_ignore_patterns(workdir: &Path, kind: Option<&str>) -> Vec { None }; - template - .unwrap_or(".atomic\n.git\n") - .lines() - .map(str::trim) - .filter(|line| !line.is_empty() && !line.starts_with('#')) - .map(ToOwned::to_owned) - .collect() + let mut patterns: Vec = COMMON_IMPORT_IGNORES + .iter() + .map(|pattern| (*pattern).to_string()) + .collect(); + + patterns.extend( + template + .unwrap_or(".atomic\n.git\n") + .lines() + .map(str::trim) + .filter(|line| !line.is_empty() && !line.starts_with('#')) + .map(ToOwned::to_owned), + ); + patterns.sort(); + patterns.dedup(); + patterns } fn current_git_branch(git_repo: &GitRepository) -> Option { @@ -125,6 +145,7 @@ impl Import { branch_name: &str, repo: &mut Repository, imported_shas: &HashSet, + mainline_only: bool, ) -> CliResult { // Get repository name from remote URL or working directory let repo_name = self.get_repo_name(git_repo); @@ -138,6 +159,7 @@ impl Import { git_repo.workdir().unwrap_or_else(|| repo.root()), self.kind.as_deref(), ), + mainline_only, }; let importer = ParallelImporter::new(git_repo, options); @@ -246,6 +268,7 @@ impl Import { git_repo: &GitRepository, head_oid: git2::Oid, imported_shas: &HashSet, + mainline_only: bool, ) -> CliResult { let mut revwalk = git_repo.revwalk().map_err(|e| CliError::GitError { message: format!("Failed to create revwalk: {}", e), @@ -255,6 +278,14 @@ impl Import { message: format!("Failed to push HEAD to revwalk: {}", e), })?; + if mainline_only { + revwalk + .simplify_first_parent() + .map_err(|e| CliError::GitError { + message: format!("Failed to simplify revwalk to first-parent history: {}", e), + })?; + } + revwalk .set_sorting(Sort::TOPOLOGICAL | Sort::REVERSE) .map_err(|e| CliError::GitError { @@ -303,7 +334,12 @@ impl Command for Import { for branch_name in &branches { if let Ok(reference) = git_repo.find_branch(branch_name, git2::BranchType::Local) { if let Some(target) = reference.get().target() { - let count = self.count_commits(&git_repo, target, &HashSet::new())?; + let count = self.count_commits( + &git_repo, + target, + &HashSet::new(), + !self.all_branches, + )?; print_info(&format!( "Would import {} commits from branch '{}'", count, branch_name @@ -357,7 +393,7 @@ impl Command for Import { // Import the branch let count = - self.import_branch(&git_repo, &branch_name, &mut repo, &imported_shas)?; + self.import_branch(&git_repo, &branch_name, &mut repo, &imported_shas, false)?; total_imported += count; } @@ -419,7 +455,8 @@ impl Command for Import { .map_err(|e| CliError::Internal(e.into()))?; // Import - let count = self.import_branch(&git_repo, &branch_name, &mut repo, &imported_shas)?; + let count = + self.import_branch(&git_repo, &branch_name, &mut repo, &imported_shas, true)?; if current_git_branch(&git_repo).as_deref() == Some(branch_name.as_str()) { print_info("Using Git working copy as imported materialization."); @@ -613,4 +650,22 @@ mod tests { assert!(!import.incremental); assert!(import.branch.is_none()); } + + #[test] + fn test_all_flag_and_legacy_alias() { + let import = Import::try_parse_from(["import", "--all"]).unwrap(); + assert!(import.all_branches); + + let import = Import::try_parse_from(["import", "--all-branches"]).unwrap(); + assert!(import.all_branches); + } + + #[test] + fn test_import_ignore_patterns_always_exclude_dependency_dirs() { + let patterns = import_ignore_patterns(Path::new("."), Some("go")); + + assert!(patterns.iter().any(|p| p == "node_modules/")); + assert!(patterns.iter().any(|p| p == ".yarn/cache/")); + assert!(patterns.iter().any(|p| p == "vendor/")); + } } diff --git a/atomic-cli/src/commands/git/mod.rs b/atomic-cli/src/commands/git/mod.rs index 51cdf64..0cdf551 100644 --- a/atomic-cli/src/commands/git/mod.rs +++ b/atomic-cli/src/commands/git/mod.rs @@ -40,7 +40,7 @@ //! ## Import All Branches //! //! ```text -//! $ atomic git import --all-branches +//! $ atomic git import --all //! ``` //! //! ## Incremental Import @@ -81,8 +81,8 @@ pub enum GitCommands { /// # Import specific branch /// atomic git import --branch main /// - /// # Import all branches as stacks - /// atomic git import --all-branches + /// # Import all local branches as views + /// atomic git import --all /// /// # Preview without creating repository /// atomic git import --dry-run diff --git a/atomic-cli/src/commands/git/parallel.rs b/atomic-cli/src/commands/git/parallel.rs index 9cbdae5..17d23c8 100644 --- a/atomic-cli/src/commands/git/parallel.rs +++ b/atomic-cli/src/commands/git/parallel.rs @@ -188,6 +188,13 @@ pub struct ParallelImportOptions { /// template. These are applied before graph construction so generated /// build outputs never enter the imported history. pub ignored_path_patterns: Vec, + /// Import only the selected branch's first-parent history. + /// + /// This is the default single-branch Git import mode. It treats merge + /// commits on the trunk branch as the landing event and avoids importing + /// long-running branch internals or repeated upstream merges into feature + /// branches. + pub mainline_only: bool, } impl Default for ParallelImportOptions { @@ -197,6 +204,7 @@ impl Default for ParallelImportOptions { imported_shas: HashSet::new(), repo_name: "unknown".to_string(), ignored_path_patterns: Vec::new(), + mainline_only: true, } } } @@ -263,14 +271,49 @@ pub struct ParallelImporter { } fn is_generated_diff_skip_path(path: &str) -> bool { + let normalized = path.replace('\\', "/"); let name = Path::new(path) .file_name() .and_then(|s| s.to_str()) .unwrap_or(path); - - name.ends_with(".lock") - || name.ends_with(".sum") - || matches!(name, "package-lock.json" | "yarn.lock" | "pnpm-lock.yaml") + let lower_name = name.to_ascii_lowercase(); + let lower_path = normalized.to_ascii_lowercase(); + + lower_name.ends_with(".lock") + || lower_name.ends_with(".sum") + || lower_name.ends_with(".min.css") + || lower_name.ends_with(".min.js") + || lower_name.ends_with(".map") + || matches!( + lower_name.as_str(), + "package-lock.json" | "yarn.lock" | "pnpm-lock.yaml" | "npm-shrinkwrap.json" + ) + || matches!( + Path::new(&lower_name) + .extension() + .and_then(|ext| ext.to_str()), + Some( + "png" + | "jpg" + | "jpeg" + | "gif" + | "webp" + | "ico" + | "bmp" + | "tiff" + | "woff" + | "woff2" + | "ttf" + | "eot" + | "otf" + | "pdf" + | "zip" + | "gz" + | "tgz" + ) + ) + || lower_path.ends_with("/website/source/stylesheets/main.css") + || lower_path == "website/source/stylesheets/main.css" } fn count_line_units(content: &[u8]) -> usize { @@ -676,6 +719,9 @@ fn build_graph_first_file_ops_for_added_file( Some(encoding) }; let mut file_ops = atomic_core::change::FileOps::create(trunk_id, path.to_string(), enc); + if encoding == Encoding::Binary { + return file_ops; + } let mut prev_branch: Option = None; for (line_idx, line) in content_lines.iter().enumerate() { @@ -758,9 +804,6 @@ fn build_graph_first_change( FileOperation::Added | FileOperation::Copied => { let new_content = file.new_content.as_deref().unwrap_or(&[]); let encoding = Encoding::detect(new_content); - if encoding == Encoding::Binary { - return None; - } let filename = extract_filename(&file.path); let name_start = ChangePosition::new(contents.len() as u64); @@ -779,18 +822,19 @@ fn build_graph_first_change( pos: ChangePosition::ROOT, }; - let new_line_contents: Vec> = if is_generated_diff_skip_path(&file.path) { - if new_content.is_empty() { - Vec::new() + let new_line_contents: Vec> = + if encoding == Encoding::Binary || is_generated_diff_skip_path(&file.path) { + if new_content.is_empty() { + Vec::new() + } else { + vec![new_content.to_vec()] + } } else { - vec![new_content.to_vec()] - } - } else { - split_graph_first_lines(new_content) - .into_iter() - .map(|line| line.to_vec()) - .collect() - }; + split_graph_first_lines(new_content) + .into_iter() + .map(|line| line.to_vec()) + .collect() + }; let mut new_ranges = Vec::new(); for line in &new_line_contents { let start = ChangePosition::new(contents.len() as u64); @@ -870,9 +914,6 @@ fn build_graph_first_change( let indexed = line_index.files.get(old_path)?; let new_content = file.new_content.as_deref().unwrap_or(&[]); let encoding = Encoding::detect(new_content); - if encoding == Encoding::Binary { - return None; - } let new_filename = extract_filename(&file.path); let name_start = ChangePosition::new(contents.len() as u64); @@ -923,27 +964,30 @@ fn build_graph_first_change( new_path: file.path.clone(), }); - if let Some(diff_lines) = file.diff_lines.as_ref() { - let (ops, _) = atomic_core::record::workflow::build_crdt_ops_from_git_diff( - &file.path, diff_lines, - ); - file_ops.push(ops); + if encoding != Encoding::Binary && !is_generated_diff_skip_path(&file.path) { + if let Some(diff_lines) = file.diff_lines.as_ref() { + let (ops, _) = atomic_core::record::workflow::build_crdt_ops_from_git_diff( + &file.path, diff_lines, + ); + file_ops.push(ops); + } } - let replacements = if is_generated_diff_skip_path(&file.path) { - vec![GitReplacementBlock { - old_start: if indexed.lines.is_empty() { 0 } else { 1 }, - old_len: indexed.lines.len(), - new_start: 1, - new_lines: if new_content.is_empty() { - Vec::new() - } else { - vec![new_content.to_vec()] - }, - }] - } else { - current_state_replacements(indexed, new_content) - }; + let replacements = + if encoding == Encoding::Binary || is_generated_diff_skip_path(&file.path) { + vec![GitReplacementBlock { + old_start: if indexed.lines.is_empty() { 0 } else { 1 }, + old_len: indexed.lines.len(), + new_start: 1, + new_lines: if new_content.is_empty() { + Vec::new() + } else { + vec![new_content.to_vec()] + }, + }] + } else { + current_state_replacements(indexed, new_content) + }; if !replacements.is_empty() { let mut pending_replacements = Vec::new(); for replacement in replacements { @@ -1138,8 +1182,11 @@ fn build_graph_first_change( } let indexed = line_index.files.get(&file.path)?; - let replacements = if is_generated_diff_skip_path(&file.path) { - let new_content = file.new_content.as_deref()?; + let new_content = file.new_content.as_deref()?; + let encoding = Encoding::detect(new_content); + let replacements = if encoding == Encoding::Binary + || is_generated_diff_skip_path(&file.path) + { vec![GitReplacementBlock { old_start: if indexed.lines.is_empty() { 0 } else { 1 }, old_len: indexed.lines.len(), @@ -1884,6 +1931,14 @@ impl ParallelImporter { message: format!("Failed to push target to revwalk: {}", e), })?; + if self.options.mainline_only { + revwalk + .simplify_first_parent() + .map_err(|e| CliError::GitError { + message: format!("Failed to simplify revwalk to first-parent history: {}", e), + })?; + } + // Topological order, oldest first revwalk .set_sorting(git2::Sort::TOPOLOGICAL | git2::Sort::REVERSE) @@ -3232,6 +3287,26 @@ mod tests { assert_eq!(stats.changes_written, 0); } + #[test] + fn test_generated_diff_skip_paths_include_terraform_website_assets() { + assert!(is_generated_diff_skip_path( + "website/source/stylesheets/main.css" + )); + assert!(is_generated_diff_skip_path( + "website/source/images/logo-static.png" + )); + assert!(is_generated_diff_skip_path("package-lock.json")); + assert!(is_generated_diff_skip_path("dist/app.min.js")); + + assert!(!is_generated_diff_skip_path( + "website/source/stylesheets/_footer.less" + )); + assert!(!is_generated_diff_skip_path( + "website/source/layouts/docs.erb" + )); + assert!(!is_generated_diff_skip_path("internal/style.css")); + } + #[test] fn test_graph_first_added_file_ops_use_unique_branch_ids_and_ranges() { let mut next_branch_idx = 0; @@ -3265,4 +3340,21 @@ mod tests { Some((ChangePosition::new(4), ChangePosition::new(8))) ); } + + #[test] + fn test_graph_first_binary_file_ops_create_trunk_only() { + let mut next_branch_idx = 0; + let ops = build_graph_first_file_ops_for_added_file( + "website/source/images/logo-static.png", + &[b"\x89PNG\r\n\x1a\n".to_vec()], + &[(ChangePosition::new(0), ChangePosition::new(8))], + Encoding::Binary, + 0, + &mut next_branch_idx, + ); + + assert_eq!(ops.trunk_id().file_idx(), 0); + assert!(ops.line_ops().is_empty()); + assert_eq!(next_branch_idx, 0); + } } diff --git a/atomic-cli/src/main.rs b/atomic-cli/src/main.rs index 7b1160f..f602d63 100644 --- a/atomic-cli/src/main.rs +++ b/atomic-cli/src/main.rs @@ -379,8 +379,8 @@ enum Commands { /// # Import specific branch /// atomic git import --branch main /// - /// # Import all branches as views - /// atomic git import --all-branches + /// # Import all local branches as views + /// atomic git import --all /// /// # Preview without creating repository /// atomic git import --dry-run diff --git a/atomic-cli/tests/push_integration_test.rs b/atomic-cli/tests/push_integration_test.rs index bfb09b0..3f2e2a5 100644 --- a/atomic-cli/tests/push_integration_test.rs +++ b/atomic-cli/tests/push_integration_test.rs @@ -422,8 +422,8 @@ fn test_remote_error_is_not_found() { let repo_err = RemoteError::repo_not_found("url"); assert!(repo_err.is_not_found()); - let stack_err = RemoteError::stack_not_found("main"); - assert!(stack_err.is_not_found()); + let view_err = RemoteError::view_not_found("main"); + assert!(view_err.is_not_found()); let change_err = RemoteError::change_not_found("ABC123"); assert!(change_err.is_not_found()); diff --git a/atomic-repository/src/record/options.rs b/atomic-repository/src/record/options.rs index 027752a..7684acf 100644 --- a/atomic-repository/src/record/options.rs +++ b/atomic-repository/src/record/options.rs @@ -65,6 +65,15 @@ pub struct RecordOptions { /// Whether to save the change to the store. save_to_store: bool, + /// Whether to refresh the file index after applying the recorded change. + update_file_index: bool, + + /// Whether to deflate vault working-copy files after recording. + sync_vault: bool, + + /// Whether to enrich the knowledge graph for the newly recorded change. + enrich_kg: bool, + /// AI provenance information for this change. /// /// When recording AI-assisted changes, this captures metadata about @@ -191,6 +200,37 @@ impl RecordOptions { self } + /// Set whether to refresh the file index after applying the recorded change. + /// + /// Defaults to true for normal user records. Agent hooks may disable this + /// to keep turn-end latency low. + #[must_use] + pub fn update_file_index(mut self, update: bool) -> Self { + self.update_file_index = update; + self + } + + /// Set whether to deflate vault working-copy files after recording. + /// + /// Defaults to true for normal user records. Agent hooks may disable this + /// because vault sync is best-effort maintenance, not required for the + /// recorded change itself. + #[must_use] + pub fn sync_vault(mut self, sync: bool) -> Self { + self.sync_vault = sync; + self + } + + /// Set whether to enrich the knowledge graph after recording. + /// + /// Defaults to true for normal user records. Agent hooks may disable this + /// so enrichment can happen explicitly instead of blocking Stop hooks. + #[must_use] + pub fn enrich_kg(mut self, enrich: bool) -> Self { + self.enrich_kg = enrich; + self + } + /// Set opaque metadata bytes for `HashedChange.metadata`. /// /// These bytes are included in the change's cryptographic hash, @@ -300,6 +340,24 @@ impl RecordOptions { self.save_to_store } + /// Get whether to refresh the file index after applying. + #[must_use] + pub fn get_update_file_index(&self) -> bool { + self.update_file_index + } + + /// Get whether to deflate vault working-copy files after recording. + #[must_use] + pub fn get_sync_vault(&self) -> bool { + self.sync_vault + } + + /// Get whether to enrich the knowledge graph after recording. + #[must_use] + pub fn get_enrich_kg(&self) -> bool { + self.enrich_kg + } + /// Get the AI provenance information. #[must_use] pub fn get_provenance(&self) -> &[Provenance] { @@ -366,6 +424,9 @@ impl Default for RecordOptions { message: None, apply_after_record: true, save_to_store: true, + update_file_index: true, + sync_vault: true, + enrich_kg: true, provenance: Vec::new(), } } diff --git a/atomic-repository/src/repository/insert.rs b/atomic-repository/src/repository/insert.rs index 2e69e21..f0677b6 100644 --- a/atomic-repository/src/repository/insert.rs +++ b/atomic-repository/src/repository/insert.rs @@ -105,15 +105,15 @@ impl ImportGraphFirstVertexCache { where T: TreeTxnT, { - if !self.by_inode.contains_key(&inode) { - let mut cache = ImportGraphFirstInodeCache::default(); + if let std::collections::hash_map::Entry::Vacant(entry) = self.by_inode.entry(inode) { + let mut inode_cache = ImportGraphFirstInodeCache::default(); let vertices = txn .iter_inode_vertices(inode) .map_err(|e| RepositoryError::Database(e.to_string()))?; for result in vertices { let (node, _edge) = result.map_err(|e| RepositoryError::Database(e.to_string()))?; - cache.by_end.entry(node.end_pos()).or_insert(node); - cache + inode_cache.by_end.entry(node.end_pos()).or_insert(node); + inode_cache .by_start .entry(node.start_pos()) .and_modify(|existing| { @@ -123,7 +123,7 @@ impl ImportGraphFirstVertexCache { }) .or_insert(node); } - self.by_inode.insert(inode, cache); + entry.insert(inode_cache); } self.by_inode.get(&inode).ok_or_else(|| { @@ -135,6 +135,13 @@ impl ImportGraphFirstVertexCache { } } +type PendingImportEdge = ( + Option, + EdgeFlags, + GraphNode, + GraphNode, +); + fn import_direct_source( pos: &Position>, by_end: &HashMap>, @@ -254,6 +261,7 @@ fn import_graph_first_resolved_inode( .map_err(|e| RepositoryError::Database(e.to_string())) } +#[allow(clippy::too_many_arguments)] fn import_graph_first_source( txn: &T, pos: &Position>, @@ -656,12 +664,7 @@ impl Repository { use atomic_core::apply::compute_new_state; let graph_start = std::time::Instant::now(); - let mut pending_edges: Vec<( - Option, - EdgeFlags, - GraphNode, - GraphNode, - )> = Vec::new(); + let mut pending_edges: Vec = Vec::new(); { let mut old_by_end: HashMap, GraphNode> = HashMap::new(); diff --git a/atomic-repository/src/repository/record.rs b/atomic-repository/src/repository/record.rs index e9fb106..490aa1d 100644 --- a/atomic-repository/src/repository/record.rs +++ b/atomic-repository/src/repository/record.rs @@ -761,38 +761,41 @@ impl Repository { // the record, so subsequent status() calls can skip unchanged // files (mtime+size match) or avoid graph reconstruction // (compare stored content hash instead). - if let Ok(mut idx_txn) = self.pristine.write_txn() { - let file_index_start = std::time::Instant::now(); - for path_str in outcome.recorded_files() { - // Strip directory markers like "dir/ (directory)" - let clean_path = - path_str.strip_suffix("/ (directory)").unwrap_or(path_str); - let abs_path = self.root.join(clean_path); - if let Ok(metadata) = std::fs::metadata(&abs_path) { - use std::time::SystemTime; - let mtime = metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH); - let duration = mtime - .duration_since(SystemTime::UNIX_EPOCH) - .unwrap_or_default(); - let content_hash = std::fs::read(&abs_path) - .map(|bytes| Hash::of(&bytes)) - .unwrap_or(Hash::ZERO); - let _ = idx_txn.put_file_index( - clean_path, - duration.as_secs() as i64, - duration.subsec_nanos(), - metadata.len(), - &content_hash, + if options.get_update_file_index() { + if let Ok(mut idx_txn) = self.pristine.write_txn() { + let file_index_start = std::time::Instant::now(); + for path_str in outcome.recorded_files() { + // Strip directory markers like "dir/ (directory)" + let clean_path = + path_str.strip_suffix("/ (directory)").unwrap_or(path_str); + let abs_path = self.root.join(clean_path); + if let Ok(metadata) = std::fs::metadata(&abs_path) { + use std::time::SystemTime; + let mtime = + metadata.modified().unwrap_or(SystemTime::UNIX_EPOCH); + let duration = mtime + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default(); + let content_hash = std::fs::read(&abs_path) + .map(|bytes| Hash::of(&bytes)) + .unwrap_or(Hash::ZERO); + let _ = idx_txn.put_file_index( + clean_path, + duration.as_secs() as i64, + duration.subsec_nanos(), + metadata.len(), + &content_hash, + ); + } + } + let _ = idx_txn.commit(); + if trace_record { + eprintln!( + "[record] file_index_update complete elapsed={:?}", + file_index_start.elapsed() ); } } - let _ = idx_txn.commit(); - if trace_record { - eprintln!( - "[record] file_index_update complete elapsed={:?}", - file_index_start.elapsed() - ); - } } } Err(e) => { @@ -802,7 +805,7 @@ impl Repository { } // Deflate vault working copy changes (if vault is initialized) - if self.has_vault().unwrap_or(false) { + if options.get_sync_vault() && self.has_vault().unwrap_or(false) { match self.vault_record_working_copy() { Ok(vault_paths) if !vault_paths.is_empty() => { outcome.set_vault_paths(vault_paths); @@ -817,7 +820,7 @@ impl Repository { } // Auto-enrich KG with the new change (best-effort) - if outcome.was_saved() { + if options.get_enrich_kg() && outcome.was_saved() { let hash = *outcome.hash(); if let Err(e) = self.kg_enrich_change(&hash) { log::debug!("KG enrich for change: {}", e); diff --git a/atomic-repository/src/repository/semantic_materialize.rs b/atomic-repository/src/repository/semantic_materialize.rs index 89bb162..430433c 100644 --- a/atomic-repository/src/repository/semantic_materialize.rs +++ b/atomic-repository/src/repository/semantic_materialize.rs @@ -12,7 +12,7 @@ use crate::apply::get_view_changes; use crate::{Repository, RepositoryError}; /// Options for materializing stored FileOps into CRDT tables. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct CrdtMaterializeOptions { /// View whose ordered changes should be replayed. pub view: Option, @@ -20,15 +20,6 @@ pub struct CrdtMaterializeOptions { pub force: bool, } -impl Default for CrdtMaterializeOptions { - fn default() -> Self { - Self { - view: None, - force: false, - } - } -} - /// Summary from a CRDT materialization pass. #[derive(Debug, Clone, Default)] pub struct CrdtMaterializeOutcome { diff --git a/tests/harness/10_git_import.sh b/tests/harness/10_git_import.sh index f843a0c..1940752 100755 --- a/tests/harness/10_git_import.sh +++ b/tests/harness/10_git_import.sh @@ -24,6 +24,10 @@ count_imported_git_changes() { echo "$count" } +git_first_parent_commit_count() { + git rev-list --first-parent --count HEAD 2>/dev/null || echo "0" +} + assert_imported_git_change_count() { local desc="$1" local expected="$2" @@ -78,9 +82,9 @@ echo " Cloning hashicorp/go-uuid..." clone_git_repo "https://github.com/hashicorp/go-uuid.git" cd "$GIT_REPO_DIR" -expected_commits="$(git_commit_count)" +expected_commits="$(git_first_parent_commit_count)" default_branch="$(git_default_branch)" -echo " Found $expected_commits commits on branch '$default_branch'" +echo " Found $expected_commits first-parent commits on branch '$default_branch'" # Initialize atomic and import assert_success "atomic git import succeeds" atomic git import @@ -94,10 +98,10 @@ assert_view_exists "view '$default_branch' created" "$default_branch" actual_imported_commits="$(count_imported_git_changes)" if [[ "$actual_imported_commits" -eq "$expected_commits" ]] || \ [[ "$actual_imported_commits" -eq $((expected_commits - 1)) ]]; then - _pass "change count matches git commits ($expected_commits)" + _pass "change count matches git first-parent commits ($expected_commits)" else - _fail "change count matches git commits ($expected_commits)" \ - "expected $expected_commits imported git changes (or $((expected_commits - 1)) with one skipped empty/merge commit), got $actual_imported_commits" + _fail "change count matches git first-parent commits ($expected_commits)" \ + "expected $expected_commits imported git mainline changes (or $((expected_commits - 1)) with one skipped empty/merge commit), got $actual_imported_commits" fi # ════════════════════════════════════════════════════════════════════════════ @@ -172,8 +176,8 @@ echo " Cloning holman/spark..." clone_git_repo "https://github.com/holman/spark.git" cd "$GIT_REPO_DIR" -expected_commits="$(git_commit_count)" -echo " Found $expected_commits commits" +expected_commits="$(git_first_parent_commit_count)" +echo " Found $expected_commits first-parent commits" assert_success "atomic git import succeeds" atomic git import @@ -249,8 +253,8 @@ echo " Cloning sharkdp/hyperfine (this may take a minute)..." clone_git_repo "https://github.com/sharkdp/hyperfine.git" cd "$GIT_REPO_DIR" -expected_commits="$(git_commit_count)" -echo " Found $expected_commits commits" +expected_commits="$(git_first_parent_commit_count)" +echo " Found $expected_commits first-parent commits" start_time=$(date +%s) atomic git import >/dev/null 2>&1 @@ -267,7 +271,7 @@ else _fail "import completed in reasonable time" "took ${duration}s" fi -# Verify imported Git commit count exactly. Do not use raw `atomic log` +# Verify imported Git mainline commit count exactly. Do not use raw `atomic log` # length here: git import records follow-up Atomic-only changes such as # repository/vault initialization, and those are not Git commits. actual="$(count_imported_git_changes)" @@ -367,7 +371,7 @@ git_commit "Feature B" "b.txt" "b" git checkout "$initial_branch" --quiet atomic init >/dev/null 2>&1 -atomic git import --all-branches >/dev/null 2>&1 +atomic git import --all >/dev/null 2>&1 # Verify all branches became views assert_view_exists "${initial_branch} view exists" "$initial_branch" From f754768244a86858c5bc0cb1f32b68035a4bc26a Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Wed, 20 May 2026 09:08:28 -0400 Subject: [PATCH 6/8] support for hot path imports --- atomic-cli/src/commands/git/parallel.rs | 608 ++++++++++++++++-- atomic-core/src/pristine/inode_graph/impls.rs | 102 +++ atomic-core/src/pristine/inode_graph/types.rs | 24 + atomic-core/src/pristine/view_graph.rs | 12 + .../src/record/workflow/globalize/hunk.rs | 224 +++++++ atomic-repository/src/repository/insert.rs | 337 +++++++++- atomic-repository/src/repository/mod.rs | 4 +- .../src/repository/tests/integration_tests.rs | 42 ++ tests/harness/18_git_import_hot_file.sh | 198 ++++++ 9 files changed, 1481 insertions(+), 70 deletions(-) create mode 100644 tests/harness/18_git_import_hot_file.sh diff --git a/atomic-cli/src/commands/git/parallel.rs b/atomic-cli/src/commands/git/parallel.rs index 17d23c8..77ab0a6 100644 --- a/atomic-cli/src/commands/git/parallel.rs +++ b/atomic-cli/src/commands/git/parallel.rs @@ -361,6 +361,7 @@ impl ImportLine { struct ImportIndexedFile { inode_pos: Position>, lines: Vec, + imported_commits: usize, } #[derive(Default)] @@ -394,8 +395,14 @@ impl ImportLineIndex { .to_vec(), }); } - self.files - .insert(path.clone(), ImportIndexedFile { inode_pos, lines }); + self.files.insert( + path.clone(), + ImportIndexedFile { + inode_pos, + lines, + imported_commits: 1, + }, + ); } GraphOp::Edit { change: Atom::Insertion(insertion), @@ -418,6 +425,191 @@ impl ImportLineIndex { } } } + + fn seed_missing_modified_files(&mut self, repo: &Repository, parsed: &ParsedCommit) { + for file in &parsed.files { + if file.operation != FileOperation::Modified || self.files.contains_key(&file.path) { + continue; + } + let Some(old_content) = file.old_content.as_deref() else { + continue; + }; + + let old_lines: Vec> = if Encoding::detect(old_content) == Encoding::Binary + || is_generated_diff_skip_path(&file.path) + { + if old_content.is_empty() { + Vec::new() + } else { + vec![old_content.to_vec()] + } + } else { + split_graph_first_lines(old_content) + .into_iter() + .map(|line| line.to_vec()) + .collect() + }; + + let seed = match repo.import_line_index_seed(&file.path) { + Ok(Some(seed)) => seed, + Ok(None) => continue, + Err(err) => { + trace_git_import(format!( + "{}: could not seed line index for {}: {}", + parsed.short_sha, file.path, err + )); + continue; + } + }; + + if seed.lines.len() != old_lines.len() { + trace_git_import(format!( + "{}: not seeding line index for {}: graph lines={} git old lines={}", + parsed.short_sha, + file.path, + seed.lines.len(), + old_lines.len() + )); + continue; + } + + let mut imported_changes = HashSet::new(); + let lines = seed + .lines + .iter() + .zip(old_lines) + .map(|(line, content)| { + imported_changes.insert(line.incoming_by); + ImportLine { + change: line.change, + start: line.start, + end: line.end, + incoming_by: line.incoming_by, + content, + } + }) + .collect(); + + self.files.insert( + file.path.clone(), + ImportIndexedFile { + inode_pos: Position { + change: Some(seed.inode_pos.change), + pos: seed.inode_pos.pos, + }, + lines, + imported_commits: imported_changes.len().max(1), + }, + ); + } + } + + fn seed_file_from_graph_content( + &mut self, + repo: &Repository, + path: &str, + content: &[u8], + imported_commits_hint: usize, + ) -> bool { + let line_contents: Vec> = + if Encoding::detect(content) == Encoding::Binary || is_generated_diff_skip_path(path) { + if content.is_empty() { + Vec::new() + } else { + vec![content.to_vec()] + } + } else { + split_graph_first_lines(content) + .into_iter() + .map(|line| line.to_vec()) + .collect() + }; + + let seed = match repo.import_line_index_seed(path) { + Ok(Some(seed)) => seed, + Ok(None) => return false, + Err(err) => { + trace_git_import(format!( + "could not reseed line index for {} after fallback: {}", + path, err + )); + return false; + } + }; + + if seed.lines.len() != line_contents.len() { + trace_git_import(format!( + "not reseeding line index for {} after fallback: graph lines={} content lines={}", + path, + seed.lines.len(), + line_contents.len() + )); + return false; + } + + let mut imported_changes = HashSet::new(); + let lines = seed + .lines + .iter() + .zip(line_contents) + .map(|(line, content)| { + imported_changes.insert(line.incoming_by); + ImportLine { + change: line.change, + start: line.start, + end: line.end, + incoming_by: line.incoming_by, + content, + } + }) + .collect(); + + self.files.insert( + path.to_string(), + ImportIndexedFile { + inode_pos: Position { + change: Some(seed.inode_pos.change), + pos: seed.inode_pos.pos, + }, + lines, + imported_commits: imported_changes.len().max(imported_commits_hint).max(1), + }, + ); + true + } + + fn reseed_from_fallback_write(&mut self, repo: &Repository, parsed: &ParsedCommit) { + for file in &parsed.files { + match file.operation { + FileOperation::Deleted => { + self.files.remove(&file.path); + } + FileOperation::Renamed => { + if let Some(old_path) = file.old_path.as_deref() { + self.files.remove(old_path); + } + if let Some(new_content) = file.new_content.as_deref() { + let hint = self + .files + .get(&file.path) + .map(|indexed| indexed.imported_commits.saturating_add(1)) + .unwrap_or(1); + self.seed_file_from_graph_content(repo, &file.path, new_content, hint); + } + } + FileOperation::Added | FileOperation::Copied | FileOperation::Modified => { + if let Some(new_content) = file.new_content.as_deref() { + let hint = self + .files + .get(&file.path) + .map(|indexed| indexed.imported_commits.saturating_add(1)) + .unwrap_or(1); + self.seed_file_from_graph_content(repo, &file.path, new_content, hint); + } + } + } + } + } } #[derive(Debug)] @@ -458,6 +650,23 @@ struct GitReplacementBlock { new_lines: Vec>, } +#[derive(Debug, Clone)] +struct GraphFirstSkip { + path: String, + operation: FileOperation, + reason: &'static str, +} + +impl GraphFirstSkip { + fn new(file: &ParsedFile, reason: &'static str) -> Self { + Self { + path: file.path.clone(), + operation: file.operation, + reason, + } + } +} + fn position_hashes(pos: &Position>) -> impl Iterator + '_ { pos.change .into_iter() @@ -472,6 +681,92 @@ fn split_graph_first_lines(content: &[u8]) -> Vec<&[u8]> { } } +fn import_shape_for_file(file: &ParsedFile, line_index: &ImportLineIndex) -> (usize, usize, usize) { + let indexed = line_index.files.get(&file.path).or_else(|| { + file.old_path + .as_deref() + .and_then(|old| line_index.files.get(old)) + }); + let current_lines = file + .new_content + .as_deref() + .or(file.old_content.as_deref()) + .map(count_line_units) + .or_else(|| indexed.map(|idx| idx.lines.len())) + .unwrap_or(0); + let indexed_lines = indexed.map(|idx| idx.lines.len()).unwrap_or(0); + let imported_commits = indexed.map(|idx| idx.imported_commits).unwrap_or(0); + (current_lines, indexed_lines, imported_commits) +} + +fn import_shape_summary(parsed: &ParsedCommit, line_index: &ImportLineIndex) -> String { + let mut entries: Vec<(usize, String)> = parsed + .files + .iter() + .map(|file| { + let (current_lines, indexed_lines, imported_commits) = + import_shape_for_file(file, line_index); + let bytes = file + .new_content + .as_ref() + .or(file.old_content.as_ref()) + .map(|content| content.len()) + .unwrap_or(0); + let weight = current_lines + .saturating_mul(imported_commits.max(1)) + .saturating_add(indexed_lines); + ( + weight, + format!( + "{} op={:?} lines={} indexed_lines={} file_commits={} bytes={}", + file.path, + file.operation, + current_lines, + indexed_lines, + imported_commits, + bytes + ), + ) + }) + .collect(); + entries.sort_by(|a, b| b.0.cmp(&a.0)); + entries + .into_iter() + .take(3) + .map(|(_, entry)| entry) + .collect::>() + .join("; ") +} + +fn graph_first_skip_summary(skips: &[GraphFirstSkip], parsed: &ParsedCommit) -> String { + if skips.is_empty() { + return String::new(); + } + + skips + .iter() + .take(5) + .map(|skip| { + let lines = parsed + .files + .iter() + .find(|file| file.path == skip.path) + .and_then(|file| { + file.new_content + .as_deref() + .or(file.old_content.as_deref()) + .map(count_line_units) + }) + .unwrap_or(0); + format!( + "{} op={:?} reason={} lines={}", + skip.path, skip.operation, skip.reason, lines + ) + }) + .collect::>() + .join("; ") +} + fn trace_git_import_enabled() -> bool { std::env::var_os("ATOMIC_TRACE_GIT_IMPORT").is_some() } @@ -785,9 +1080,13 @@ fn build_graph_first_change( header: ChangeHeader, parsed: &ParsedCommit, line_index: &ImportLineIndex, -) -> Option<(Change, Vec, Vec)> { +) -> Result<(Change, Vec, Vec), Vec> { if parsed.files.is_empty() { - return None; + return Err(vec![GraphFirstSkip { + path: String::new(), + operation: FileOperation::Modified, + reason: "empty_commit", + }]); } let mut contents = Vec::new(); @@ -798,6 +1097,7 @@ fn build_graph_first_change( let mut dependencies = HashSet::new(); let mut pending = Vec::new(); let mut deleted_paths = Vec::new(); + let mut skips = Vec::new(); for file in &parsed.files { match file.operation { @@ -910,8 +1210,14 @@ fn build_graph_first_change( continue; } FileOperation::Renamed => { - let old_path = file.old_path.as_deref()?; - let indexed = line_index.files.get(old_path)?; + let Some(old_path) = file.old_path.as_deref() else { + skips.push(GraphFirstSkip::new(file, "rename_missing_old_path")); + continue; + }; + let Some(indexed) = line_index.files.get(old_path) else { + skips.push(GraphFirstSkip::new(file, "rename_missing_line_index")); + continue; + }; let new_content = file.new_content.as_deref().unwrap_or(&[]); let encoding = Encoding::detect(new_content); @@ -994,11 +1300,27 @@ fn build_graph_first_change( let start_idx = if replacement.old_len == 0 { replacement.old_start } else { - replacement.old_start.checked_sub(1)? + match replacement.old_start.checked_sub(1) { + Some(idx) => idx, + None => { + skips.push(GraphFirstSkip::new( + file, + "rename_replacement_underflow", + )); + continue; + } + } + }; + let Some(end_idx) = start_idx.checked_add(replacement.old_len) else { + skips.push(GraphFirstSkip::new(file, "rename_replacement_overflow")); + continue; }; - let end_idx = start_idx.checked_add(replacement.old_len)?; if end_idx > indexed.lines.len() { - return None; + skips.push(GraphFirstSkip::new( + file, + "rename_replacement_out_of_bounds", + )); + continue; } let predecessor = if start_idx == 0 { @@ -1134,7 +1456,17 @@ fn build_graph_first_change( continue; } FileOperation::Deleted => { - let indexed = line_index.files.get(&file.path)?; + let Some(indexed) = line_index.files.get(&file.path) else { + skips.push(GraphFirstSkip::new( + file, + "delete_missing_line_index_cleanup_only", + )); + deleted_paths.push(file.path.clone()); + pending.push(PendingLineIndexUpdate::Delete { + path: file.path.clone(), + }); + continue; + }; let mut edge_update = EdgeUpdate { edges: Vec::with_capacity(indexed.lines.len()), inode: indexed.inode_pos, @@ -1181,8 +1513,14 @@ fn build_graph_first_change( FileOperation::Modified => {} } - let indexed = line_index.files.get(&file.path)?; - let new_content = file.new_content.as_deref()?; + let Some(indexed) = line_index.files.get(&file.path) else { + skips.push(GraphFirstSkip::new(file, "modified_missing_line_index")); + continue; + }; + let Some(new_content) = file.new_content.as_deref() else { + skips.push(GraphFirstSkip::new(file, "modified_missing_new_content")); + continue; + }; let encoding = Encoding::detect(new_content); let replacements = if encoding == Encoding::Binary || is_generated_diff_skip_path(&file.path) @@ -1198,14 +1536,20 @@ fn build_graph_first_change( }, }] } else if parsed.is_merge { - let new_content = file.new_content.as_deref()?; current_state_replacements(indexed, new_content) } else { - let diff_lines = file.diff_lines.as_ref()?; + let Some(diff_lines) = file.diff_lines.as_ref() else { + skips.push(GraphFirstSkip::new(file, "modified_missing_diff_lines")); + continue; + }; let (ops, _) = atomic_core::record::workflow::build_crdt_ops_from_git_diff(&file.path, diff_lines); file_ops.push(ops); - parse_git_diff_replacements(diff_lines)? + let Some(replacements) = parse_git_diff_replacements(diff_lines) else { + skips.push(GraphFirstSkip::new(file, "modified_unparseable_diff_lines")); + continue; + }; + replacements }; if replacements.is_empty() { continue; @@ -1224,11 +1568,22 @@ fn build_graph_first_change( let start_idx = if replacement.old_len == 0 { replacement.old_start } else { - replacement.old_start.checked_sub(1)? + let Some(idx) = replacement.old_start.checked_sub(1) else { + skips.push(GraphFirstSkip::new(file, "modified_replacement_underflow")); + continue; + }; + idx + }; + let Some(end_idx) = start_idx.checked_add(replacement.old_len) else { + skips.push(GraphFirstSkip::new(file, "modified_replacement_overflow")); + continue; }; - let end_idx = start_idx.checked_add(replacement.old_len)?; if end_idx > indexed.lines.len() { - return None; + skips.push(GraphFirstSkip::new( + file, + "modified_replacement_out_of_bounds", + )); + continue; } let predecessor = if start_idx == 0 { @@ -1361,19 +1716,84 @@ fn build_graph_first_change( } if hunks.is_empty() { - return None; + if skips.is_empty() { + skips.push(GraphFirstSkip { + path: String::new(), + operation: FileOperation::Modified, + reason: "no_graph_hunks", + }); + } + return Err(skips); + } + + let fatal_skips: Vec = skips + .iter() + .filter(|skip| skip.reason != "delete_missing_line_index_cleanup_only") + .cloned() + .collect(); + if !fatal_skips.is_empty() { + return Err(skips); } let mut dependencies: Vec = dependencies.into_iter().collect(); dependencies.sort(); dependencies.dedup(); - Some(( + Ok(( Change::with_file_ops(header, hunks, file_ops, contents, dependencies), pending, deleted_paths, )) } +fn build_graph_first_skip_reasons( + parsed: &ParsedCommit, + line_index: &ImportLineIndex, +) -> Vec { + let mut skips = Vec::new(); + for file in &parsed.files { + match file.operation { + FileOperation::Modified => { + if !line_index.files.contains_key(&file.path) { + skips.push(GraphFirstSkip::new(file, "modified_missing_line_index")); + } else if file.new_content.is_none() { + skips.push(GraphFirstSkip::new(file, "modified_missing_new_content")); + } else if !parsed.is_merge + && !is_generated_diff_skip_path(&file.path) + && file + .new_content + .as_deref() + .map(Encoding::detect) + .is_some_and(|encoding| encoding != Encoding::Binary) + && file.diff_lines.is_none() + { + skips.push(GraphFirstSkip::new(file, "modified_missing_diff_lines")); + } + } + FileOperation::Deleted => { + if !line_index.files.contains_key(&file.path) { + skips.push(GraphFirstSkip::new( + file, + "delete_missing_line_index_cleanup_only", + )); + } + } + FileOperation::Renamed => { + if file.old_path.is_none() { + skips.push(GraphFirstSkip::new(file, "rename_missing_old_path")); + } else if file + .old_path + .as_deref() + .is_some_and(|old| !line_index.files.contains_key(old)) + { + skips.push(GraphFirstSkip::new(file, "rename_missing_line_index")); + } + } + FileOperation::Added | FileOperation::Copied => {} + } + } + skips +} + fn current_state_replacements( indexed: &ImportIndexedFile, new_content: &[u8], @@ -1529,14 +1949,22 @@ fn apply_line_index_updates( content, }) .collect(); - line_index - .files - .insert(path, ImportIndexedFile { inode_pos, lines }); + line_index.files.insert( + path, + ImportIndexedFile { + inode_pos, + lines, + imported_commits: 1, + }, + ); } PendingLineIndexUpdate::Modify { path, replacements } => { let Some(indexed) = line_index.files.get_mut(&path) else { continue; }; + if !replacements.is_empty() { + indexed.imported_commits += 1; + } let mut offset: isize = 0; for replacement in replacements { @@ -1697,14 +2125,8 @@ impl ParallelImporter { /// Commits are processed in **batches** to keep memory bounded and show /// progress sooner. Each batch: parse in parallel → write sequentially. /// - /// Batch sizes are tiered by total commit count: - /// - /// | Total commits | Batch size | - /// |---------------|------------| - /// | < 5,000 | 250 | - /// | 5,000–9,999 | 500 | - /// | 10,000–19,999 | 1,000 | - /// | ≥ 20,000 | 2,500 | + /// Imports use a fixed 1,000-commit batch size. This keeps progress and + /// memory behavior predictable across small and large repositories. pub fn import_branch( &self, branch_name: &str, @@ -1897,14 +2319,9 @@ impl ParallelImporter { Ok(stats) } - /// Determine batch size based on total commit count. - fn batch_size_for(total: usize) -> usize { - match total { - 0..5_000 => 250, - 5_000..10_000 => 500, - 10_000..20_000 => 1_000, - _ => 2_500, - } + /// Determine the default import batch size. + fn batch_size_for(_total: usize) -> usize { + 1_000 } /// Collect commit OIDs in topological order (oldest first). @@ -2160,17 +2577,40 @@ impl ParallelImporter { return self.write_empty_commit(repo, parsed, header); } - if let Some((mut graph_change, pending_updates, graph_deleted_paths)) = - build_graph_first_change(header.clone(), parsed, line_index) - { + line_index.seed_missing_modified_files(repo, parsed); + let graph_first_skips = build_graph_first_skip_reasons(parsed, line_index); + let graph_first_result = build_graph_first_change(header.clone(), parsed, line_index); + + if let Ok((mut graph_change, pending_updates, graph_deleted_paths)) = graph_first_result { graph_change.unhashed = Some(self.build_git_metadata(parsed, false, false)); + let pre_write_shape = import_shape_summary(parsed, line_index); + let pre_write_skip_summary = graph_first_skip_summary(&graph_first_skips, parsed); let progress = SlowImportProgress::start( slow_import_commit_label(parsed), - format!( - "graph-first files={}, ops={}; CRDT metadata deferred", - parsed.files.len(), - graph_change.hunks().len() - ), + if pre_write_shape.is_empty() && pre_write_skip_summary.is_empty() { + format!( + "graph-first files={}, ops={}; CRDT metadata deferred", + parsed.files.len(), + graph_change.hunks().len() + ) + } else { + format!( + "graph-first files={}, ops={}; {}{}{}CRDT metadata deferred", + parsed.files.len(), + graph_change.hunks().len(), + pre_write_shape, + if pre_write_shape.is_empty() || pre_write_skip_summary.is_empty() { + "" + } else { + "; " + }, + if pre_write_skip_summary.is_empty() { + String::new() + } else { + format!("cleanup-only skips: {}; ", pre_write_skip_summary) + } + ) + }, ); let write_start = Instant::now(); let write_result = repo.write_import_graph_change( @@ -2181,6 +2621,16 @@ impl ParallelImporter { let write_ms = write_start.elapsed().as_millis(); let progress_reported = progress.finish(); let write_outcome = write_result.map_err(|e| CliError::Internal(e.into()))?; + let shape_summary = if progress_reported + || write_ms >= 5_000 + || write_outcome.timings.assemble_ms >= 5_000 + || write_outcome.timings.apply_ms >= 5_000 + || write_outcome.timings.direct_graph_ms >= 5_000 + { + Some(pre_write_shape.clone()) + } else { + None + }; apply_line_index_updates(line_index, write_outcome.hash, pending_updates); if progress_reported || write_ms >= 5_000 { print_info(&format!( @@ -2193,6 +2643,15 @@ impl ParallelImporter { write_outcome.timings.direct_crdt_ms, write_outcome.timings.commit_ms )); + if let Some(shape) = shape_summary.as_ref().filter(|shape| !shape.is_empty()) { + print_info(&format!(" Slow import shape: {}", shape)); + } + if !pre_write_skip_summary.is_empty() { + print_info(&format!( + " Graph-first cleanup-only skips: {}", + pre_write_skip_summary + )); + } } trace_git_import(format!( "write {} files={} graph_first=1 ops={} apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms total={}ms", @@ -2205,6 +2664,10 @@ impl ParallelImporter { write_outcome.timings.commit_ms, commit_start.elapsed().as_millis() )); + if !graph_deleted_paths.is_empty() { + let del_refs: Vec<&str> = graph_deleted_paths.iter().map(|s| s.as_str()).collect(); + let _ = repo.del_file_index_batch(&del_refs); + } return Ok(true); } @@ -2577,10 +3040,19 @@ impl ParallelImporter { } let metadata = self.build_git_metadata(parsed, false, recorded_files.is_empty()); - let progress = SlowImportProgress::start( - slow_import_commit_label(parsed), - slow_import_record_summary(parsed, &recorded_files), - ); + let pre_write_shape = import_shape_summary(parsed, line_index); + let graph_first_skip_summary = graph_first_skip_summary(&graph_first_skips, parsed); + let progress_summary = if pre_write_shape.is_empty() { + slow_import_record_summary(parsed, &recorded_files) + } else { + format!( + "{}; {}", + slow_import_record_summary(parsed, &recorded_files), + pre_write_shape + ) + }; + let progress = + SlowImportProgress::start(slow_import_commit_label(parsed), progress_summary); let write_start = Instant::now(); let write_outcome = repo .write_import_recorded( @@ -2593,22 +3065,7 @@ impl ParallelImporter { .map_err(|e| CliError::Internal(e.into()))?; let write_ms = write_start.elapsed().as_millis(); let progress_reported = progress.finish(); - if !recorded_files.is_empty() - && recorded_files.iter().all(|recorded| { - matches!( - recorded.kind(), - Some(atomic_core::record::workflow::DetectionKind::Added) - ) - }) - { - match repo.load_change(&write_outcome.hash) { - Ok(change) => line_index.update_from_added_change(write_outcome.hash, &change), - Err(err) => trace_git_import(format!( - "graph-first {}: could not seed line index from {}: {}", - parsed.short_sha, write_outcome.hash, err - )), - } - } + line_index.reseed_from_fallback_write(repo, parsed); if progress_reported || write_ms >= 5_000 { print_info(&format!( "Imported {} in {:.1}s (assemble={}ms apply={}ms direct_graph={}ms direct_crdt={}ms commit={}ms)", @@ -2620,6 +3077,15 @@ impl ParallelImporter { write_outcome.timings.direct_crdt_ms, write_outcome.timings.commit_ms )); + if !pre_write_shape.is_empty() { + print_info(&format!(" Slow import shape: {}", pre_write_shape)); + } + if !graph_first_skip_summary.is_empty() { + print_info(&format!( + " Graph-first skipped: {}", + graph_first_skip_summary + )); + } } // Files deleted via record_modified_file (the "show diff lines" path) @@ -3287,6 +3753,14 @@ mod tests { assert_eq!(stats.changes_written, 0); } + #[test] + fn test_import_batch_size_is_fixed_at_1000() { + assert_eq!(ParallelImporter::batch_size_for(0), 1_000); + assert_eq!(ParallelImporter::batch_size_for(82), 1_000); + assert_eq!(ParallelImporter::batch_size_for(18_000), 1_000); + assert_eq!(ParallelImporter::batch_size_for(100_000), 1_000); + } + #[test] fn test_generated_diff_skip_paths_include_terraform_website_assets() { assert!(is_generated_diff_skip_path( diff --git a/atomic-core/src/pristine/inode_graph/impls.rs b/atomic-core/src/pristine/inode_graph/impls.rs index af23501..d4d49fe 100644 --- a/atomic-core/src/pristine/inode_graph/impls.rs +++ b/atomic-core/src/pristine/inode_graph/impls.rs @@ -176,6 +176,57 @@ impl InodeGraphOps for ReadTxn { Ok(None) } + fn find_block_end_in_inode( + &self, + inode: Inode, + pos: Position, + ) -> Result>, Self::InodeError> { + let table = self.txn.open_multimap_table(INODE_GRAPH)?; + + let inode_id = inode.get(); + let change_id = pos.change.get(); + let target_pos = pos.pos.get(); + + let empty_key = encode_inode_vertex(inode_id, change_id, target_pos, target_pos); + if table.get(&empty_key)?.next().is_some() { + return Ok(Some(GraphNode { + change: NodeId::new(change_id), + start: ChangePosition::new(target_pos), + end: ChangePosition::new(target_pos), + })); + } + + let start_key = encode_inode_vertex(inode_id, change_id, 0, 0); + let end_key = encode_inode_vertex(inode_id, change_id, target_pos, u64::MAX); + + for result in table.range::<&[u8; 32]>(&start_key..=&end_key)? { + let (key, _values) = result?; + let (_, v_change, v_start, v_end) = decode_inode_vertex(key.value()); + + if v_change != change_id { + continue; + } + + if v_end == target_pos && v_start < v_end { + return Ok(Some(GraphNode { + change: NodeId::new(v_change), + start: ChangePosition::new(v_start), + end: ChangePosition::new(v_end), + })); + } + + if v_start <= target_pos && target_pos < v_end { + return Ok(Some(GraphNode { + change: NodeId::new(v_change), + start: ChangePosition::new(v_start), + end: ChangePosition::new(v_end), + })); + } + } + + Ok(None) + } + fn count_inode_vertices(&self, inode: Inode) -> Result { let table = self.txn.open_multimap_table(INODE_GRAPH)?; @@ -346,6 +397,57 @@ impl<'a> InodeGraphOps for WriteTxn<'a> { Ok(None) } + fn find_block_end_in_inode( + &self, + inode: Inode, + pos: Position, + ) -> Result>, Self::InodeError> { + let table = self.txn.open_multimap_table(INODE_GRAPH)?; + + let inode_id = inode.get(); + let change_id = pos.change.get(); + let target_pos = pos.pos.get(); + + let empty_key = encode_inode_vertex(inode_id, change_id, target_pos, target_pos); + if table.get(&empty_key)?.next().is_some() { + return Ok(Some(GraphNode { + change: NodeId::new(change_id), + start: ChangePosition::new(target_pos), + end: ChangePosition::new(target_pos), + })); + } + + let start_key = encode_inode_vertex(inode_id, change_id, 0, 0); + let end_key = encode_inode_vertex(inode_id, change_id, target_pos, u64::MAX); + + for result in table.range::<&[u8; 32]>(&start_key..=&end_key)? { + let (key, _values) = result?; + let (_, v_change, v_start, v_end) = decode_inode_vertex(key.value()); + + if v_change != change_id { + continue; + } + + if v_end == target_pos && v_start < v_end { + return Ok(Some(GraphNode { + change: NodeId::new(v_change), + start: ChangePosition::new(v_start), + end: ChangePosition::new(v_end), + })); + } + + if v_start <= target_pos && target_pos < v_end { + return Ok(Some(GraphNode { + change: NodeId::new(v_change), + start: ChangePosition::new(v_start), + end: ChangePosition::new(v_end), + })); + } + } + + Ok(None) + } + fn count_inode_vertices(&self, inode: Inode) -> Result { let table = self.txn.open_multimap_table(INODE_GRAPH)?; diff --git a/atomic-core/src/pristine/inode_graph/types.rs b/atomic-core/src/pristine/inode_graph/types.rs index fdb968d..c3b9e59 100644 --- a/atomic-core/src/pristine/inode_graph/types.rs +++ b/atomic-core/src/pristine/inode_graph/types.rs @@ -358,6 +358,20 @@ pub trait InodeGraphOps { pos: Position, ) -> Result>, Self::InodeError>; + /// Find a block ending at the given position within an inode scope. + /// + /// This is the inode-scoped equivalent of `GraphTxnT::find_block_end`. + /// Implementations that cannot support it efficiently may return `Ok(None)` + /// and callers can fall back to the global graph lookup. + fn find_block_end_in_inode( + &self, + inode: Inode, + pos: Position, + ) -> Result>, Self::InodeError> { + let _ = (inode, pos); + Ok(None) + } + /// Count vertices in an inode scope. /// /// More efficient than iterating and counting when only the count is needed. @@ -385,6 +399,16 @@ pub trait InodeGraphOps { Ok(self.count_inode_vertices(inode)? > 0) } + /// Whether inode-scoped edge iteration needs an external view filter. + /// + /// Plain repository transactions return `false`: `INODE_GRAPH` can be used + /// as file-local truth. View wrappers that delegate to an unfiltered + /// `INODE_GRAPH` return `true` so callers do not accidentally see edges + /// from changes outside the current view. + fn inode_graph_needs_view_filter(&self) -> bool { + false + } + /// Iterate all edges for vertices within an inode scope. /// /// This provides a convenient way to iterate all edges for a file diff --git a/atomic-core/src/pristine/view_graph.rs b/atomic-core/src/pristine/view_graph.rs index 234af08..7950e51 100644 --- a/atomic-core/src/pristine/view_graph.rs +++ b/atomic-core/src/pristine/view_graph.rs @@ -106,6 +106,14 @@ impl<'a, T: InodeGraphOps> InodeGraphOps for ViewGraph<'a, T> { self.inner.find_block_in_inode(inode, pos) } + fn find_block_end_in_inode( + &self, + inode: Inode, + pos: Position, + ) -> Result>, Self::InodeError> { + self.inner.find_block_end_in_inode(inode, pos) + } + fn count_inode_vertices(&self, inode: Inode) -> Result { self.inner.count_inode_vertices(inode) } @@ -113,6 +121,10 @@ impl<'a, T: InodeGraphOps> InodeGraphOps for ViewGraph<'a, T> { fn inode_graph_is_populated(&self, inode: Inode) -> Result { self.inner.inode_graph_is_populated(inode) } + + fn inode_graph_needs_view_filter(&self) -> bool { + true + } } /// Filtered adjacency iterator that only yields edges from visible changes. diff --git a/atomic-core/src/record/workflow/globalize/hunk.rs b/atomic-core/src/record/workflow/globalize/hunk.rs index acca1a7..409202c 100644 --- a/atomic-core/src/record/workflow/globalize/hunk.rs +++ b/atomic-core/src/record/workflow/globalize/hunk.rs @@ -725,6 +725,24 @@ where use crate::types::EdgeFlags; use std::collections::HashSet; + if !txn.inode_graph_needs_view_filter() && txn.inode_graph_is_populated(inode).unwrap_or(false) + { + let step = std::time::Instant::now(); + let ordered = collect_sorted_content_vertices_inode_ordered(txn, inode, inode_pos); + if !ordered.is_empty() { + let elapsed_ms = step.elapsed().as_millis(); + if elapsed_ms > 50 { + log::warn!( + "collect_sorted_content_vertices: inode fast path took {}ms (inode={:?}, vertices={})", + elapsed_ms, + inode, + ordered.len(), + ); + } + return Ok(ordered); + } + } + let mut ordered: Vec> = Vec::new(); let mut visited: HashSet> = HashSet::new(); let mut current = inode_pos.inode_node(); @@ -918,6 +936,212 @@ where Ok(ordered) } +fn collect_sorted_content_vertices_inode_ordered( + txn: &T, + inode: Inode, + inode_pos: Position, +) -> Vec> +where + T: GraphTxnT + InodeGraphOps, +{ + use crate::types::EdgeFlags; + use std::collections::HashSet; + + let mut ordered: Vec> = Vec::new(); + let mut visited: HashSet> = HashSet::new(); + let mut current = inode_pos.inode_node(); + + let is_dead_in_inode = |node: GraphNode| -> bool { + let mut parents = match txn.init_inode_adj( + node_inode_or(inode), + node, + EdgeFlags::PARENT, + EdgeFlags::all(), + ) { + Ok(adj) => adj, + Err(_) => return false, + }; + while let Some(edge) = txn.next_inode_adj(&mut parents) { + let Ok(edge) = edge else { + continue; + }; + let flags = edge.flag(); + if flags.contains(EdgeFlags::PARENT) && flags.contains(EdgeFlags::DELETED) { + return true; + } + } + false + }; + + fn alive_reaches_inode( + txn: &T, + inode: Inode, + start: GraphNode, + target: GraphNode, + is_dead: &dyn Fn(GraphNode) -> bool, + ) -> bool { + if start == target { + return true; + } + + let mut stack = vec![start]; + let mut seen = std::collections::HashSet::new(); + + while let Some(current) = stack.pop() { + if !seen.insert(current) { + continue; + } + + let mut adj = match txn.init_inode_adj( + current_inode_or(inode), + current, + EdgeFlags::BLOCK, + EdgeFlags::all(), + ) { + Ok(adj) => adj, + Err(_) => continue, + }; + + while let Some(edge) = txn.next_inode_adj(&mut adj) { + let Ok(edge) = edge else { + continue; + }; + let flags = edge.flag(); + if flags.contains(EdgeFlags::PARENT) + || flags.contains(EdgeFlags::DELETED) + || flags.contains(EdgeFlags::PSEUDO) + { + continue; + } + + let dest = txn + .find_block_in_inode(inode, edge.dest()) + .ok() + .flatten() + .or_else(|| txn.find_block(edge.dest()).ok()); + let Some(dest) = dest else { + continue; + }; + if is_dead(dest) { + continue; + } + if dest == target { + return true; + } + stack.push(dest); + } + } + + false + } + + loop { + if !visited.insert(current) { + break; + } + + let mut adj = match txn.init_inode_adj( + inode, + current, + EdgeFlags::BLOCK, + EdgeFlags::BLOCK | EdgeFlags::FOLDER, + ) { + Ok(adj) => adj, + Err(_) => break, + }; + + let mut alive_candidates: Vec> = Vec::new(); + let mut next_dead: Option> = None; + + while let Some(edge) = txn.next_inode_adj(&mut adj) { + let Ok(edge) = edge else { + continue; + }; + let flags = edge.flag(); + if flags.contains(EdgeFlags::PARENT) + || flags.contains(EdgeFlags::DELETED) + || flags.contains(EdgeFlags::PSEUDO) + { + continue; + } + + let dest = txn + .find_block_in_inode(inode, edge.dest()) + .ok() + .flatten() + .or_else(|| txn.find_block(edge.dest()).ok()); + let Some(dest) = dest else { + continue; + }; + if visited.contains(&dest) { + continue; + } + if !is_dead_in_inode(dest) { + alive_candidates.push(dest); + } else if next_dead.is_none() { + next_dead = Some(dest); + } + } + + let next_alive = if alive_candidates.len() <= 1 { + alive_candidates.into_iter().next() + } else { + alive_candidates + .iter() + .copied() + .find(|candidate| { + let reaches_other = alive_candidates.iter().copied().any(|other| { + other != *candidate + && alive_reaches_inode(txn, inode, *candidate, other, &is_dead_in_inode) + }); + let reached_by_other = alive_candidates.iter().copied().any(|other| { + other != *candidate + && alive_reaches_inode(txn, inode, other, *candidate, &is_dead_in_inode) + }); + reaches_other && !reached_by_other + }) + .or_else(|| { + alive_candidates.iter().copied().find(|candidate| { + alive_candidates.iter().copied().any(|other| { + other != *candidate + && alive_reaches_inode( + txn, + inode, + *candidate, + other, + &is_dead_in_inode, + ) + }) + }) + }) + .or_else(|| alive_candidates.into_iter().next()) + }; + + let dest = match next_alive.or(next_dead) { + Some(dest) => dest, + None => break, + }; + + let is_inode_marker = dest.start == dest.end && dest.start == inode_pos.pos; + let is_alive = !is_dead_in_inode(dest); + if is_alive && !is_inode_marker && !dest.change.is_root() && dest.start != dest.end { + ordered.push(dest); + } + + current = dest; + } + + ordered +} + +fn node_inode_or(inode: Inode) -> Inode { + inode +} + +fn current_inode_or(inode: Inode) -> Inode { + inode +} + /// Retrieve every alive content vertex for a file. /// /// This always uses `retrieve_graph` which traverses via the `GraphTxnT` diff --git a/atomic-repository/src/repository/insert.rs b/atomic-repository/src/repository/insert.rs index f0677b6..d8f8512 100644 --- a/atomic-repository/src/repository/insert.rs +++ b/atomic-repository/src/repository/insert.rs @@ -6,6 +6,7 @@ use crate::apply::{ InsertOutcome, InsertStats, }; use atomic_core::change::Insertion; +use atomic_core::pristine::InodeGraphOps; use atomic_core::types::{ChangePosition, EdgeFlags, GraphNode, SerializedGraphEdge}; use std::collections::{HashMap, HashSet}; @@ -85,9 +86,27 @@ pub struct ImportWriteOutcome { pub insert: InsertOutcome, } +/// Existing graph line metadata used by git import to lazily rebuild its +/// in-memory line index after a prior fallback or interrupted fast path. +#[derive(Debug, Clone)] +pub struct ImportLineIndexSeed { + pub inode_pos: Position, + pub lines: Vec, +} + +#[derive(Debug, Clone)] +pub struct ImportLineIndexSeedLine { + pub change: Hash, + pub start: ChangePosition, + pub end: ChangePosition, + pub incoming_by: Hash, +} + #[derive(Default)] struct ImportGraphFirstVertexCache { by_inode: HashMap, + by_end_pos: HashMap<(Inode, Position), GraphNode>, + by_start_pos: HashMap<(Inode, Position), GraphNode>, } #[derive(Default)] @@ -97,6 +116,50 @@ struct ImportGraphFirstInodeCache { } impl ImportGraphFirstVertexCache { + fn find_end( + &mut self, + txn: &T, + inode: Inode, + pos: Position, + ) -> Result>, RepositoryError> + where + T: GraphTxnT + InodeGraphOps, + { + if let Some(node) = self.by_end_pos.get(&(inode, pos)) { + return Ok(Some(*node)); + } + let Some(node) = txn + .find_block_end_in_inode(inode, pos) + .map_err(|e| RepositoryError::Database(e.to_string()))? + else { + return Ok(None); + }; + self.by_end_pos.insert((inode, pos), node); + Ok(Some(node)) + } + + fn find_start( + &mut self, + txn: &T, + inode: Inode, + pos: Position, + ) -> Result>, RepositoryError> + where + T: GraphTxnT + InodeGraphOps, + { + if let Some(node) = self.by_start_pos.get(&(inode, pos)) { + return Ok(Some(*node)); + } + let Some(node) = txn + .find_block_in_inode(inode, pos) + .map_err(|e| RepositoryError::Database(e.to_string()))? + else { + return Ok(None); + }; + self.by_start_pos.insert((inode, pos), node); + Ok(Some(node)) + } + fn load( &mut self, txn: &T, @@ -195,6 +258,106 @@ fn import_direct_can_apply(change: &Change) -> bool { }) } +fn import_seed_edge_visible(edge: &SerializedGraphEdge, visible: &HashSet) -> bool { + let change = edge.introduced_by(); + change.is_root() || visible.contains(&change) +} + +fn import_seed_node_visible(node: GraphNode, visible: &HashSet) -> bool { + node.change.is_root() || visible.contains(&node.change) +} + +fn import_seed_is_dead( + txn: &T, + inode: Inode, + node: GraphNode, + visible: &HashSet, +) -> bool +where + T: GraphTxnT + InodeGraphOps, +{ + let mut parents = match txn.init_inode_adj(inode, node, EdgeFlags::PARENT, EdgeFlags::all()) { + Ok(adj) => adj, + Err(_) => return false, + }; + while let Some(edge) = txn.next_inode_adj(&mut parents) { + let Ok(edge) = edge else { + continue; + }; + let flags = edge.flag(); + if flags.contains(EdgeFlags::PARENT) + && flags.contains(EdgeFlags::DELETED) + && import_seed_edge_visible(&edge, visible) + { + return true; + } + } + false +} + +fn import_seed_alive_reaches( + txn: &T, + inode: Inode, + start: GraphNode, + target: GraphNode, + visible: &HashSet, +) -> bool +where + T: GraphTxnT + InodeGraphOps, +{ + if start == target { + return true; + } + + let mut stack = vec![start]; + let mut seen = HashSet::new(); + + while let Some(current) = stack.pop() { + if !seen.insert(current) { + continue; + } + + let mut adj = match txn.init_inode_adj(inode, current, EdgeFlags::BLOCK, EdgeFlags::all()) { + Ok(adj) => adj, + Err(_) => continue, + }; + + while let Some(edge) = txn.next_inode_adj(&mut adj) { + let Ok(edge) = edge else { + continue; + }; + let flags = edge.flag(); + if flags.contains(EdgeFlags::PARENT) + || flags.contains(EdgeFlags::DELETED) + || flags.contains(EdgeFlags::PSEUDO) + || !import_seed_edge_visible(&edge, visible) + { + continue; + } + + let Some(dest) = txn + .find_block_in_inode(inode, edge.dest()) + .ok() + .flatten() + .or_else(|| txn.find_block(edge.dest()).ok()) + else { + continue; + }; + if !import_seed_node_visible(dest, visible) + || import_seed_is_dead(txn, inode, dest, visible) + { + continue; + } + if dest == target { + return true; + } + stack.push(dest); + } + } + + false +} + fn import_graph_first_can_apply(change: &Change) -> bool { !change.hunks().is_empty() && change.hunks().iter().all(|op| match op { @@ -273,7 +436,7 @@ fn import_graph_first_source( change_id: NodeId, ) -> Result, RepositoryError> where - T: GraphTxnT + TreeTxnT, + T: GraphTxnT + TreeTxnT + InodeGraphOps, { let resolved = import_graph_first_position(txn, pos, change_id)?; @@ -297,6 +460,9 @@ where } if let Some(inode) = resolved_inode { + if let Some(node) = vertex_cache.find_end(txn, inode, resolved)? { + return Ok(node); + } if let Some(node) = vertex_cache.load(txn, inode)?.by_end.get(&resolved) { return Ok(*node); } @@ -315,7 +481,7 @@ fn import_graph_first_successor( change_id: NodeId, ) -> Result, RepositoryError> where - T: GraphTxnT + TreeTxnT, + T: GraphTxnT + TreeTxnT + InodeGraphOps, { let resolved = import_graph_first_position(txn, pos, change_id)?; if resolved.change == change_id { @@ -325,6 +491,9 @@ where } if let Some(inode) = resolved_inode { + if let Some(node) = vertex_cache.find_start(txn, inode, resolved)? { + return Ok(node); + } if let Some(node) = vertex_cache.load(txn, inode)?.by_start.get(&resolved) { return Ok(*node); } @@ -379,6 +548,170 @@ fn import_direct_write_insertion( impl Repository { // Change Insertion Methods + /// Rebuild ordered line vertex metadata for a tracked file from the + /// current view's graph. Git import uses this as a conservative repair + /// path when its in-memory line index is missing for a modified file. + pub fn import_line_index_seed( + &self, + path: &str, + ) -> Result, RepositoryError> { + let normalized = path.replace('\\', "/"); + let txn = self + .pristine + .read_txn() + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let view = txn + .get_view(&self.current_view) + .map_err(|e| RepositoryError::Database(e.to_string()))? + .ok_or_else(|| RepositoryError::ViewNotFound { + name: self.current_view.clone(), + })?; + let visible = collect_visible_change_ids_with_deps(&txn, &view)?; + + let Some(inode) = txn + .get_inode(&normalized) + .map_err(|e| RepositoryError::Database(e.to_string()))? + else { + return Ok(None); + }; + let Some(position) = txn + .inode_position(inode) + .map_err(|e| RepositoryError::Database(e.to_string()))? + else { + return Ok(None); + }; + if !import_seed_node_visible(position.inode_node(), &visible) { + return Ok(None); + } + let Some(inode_change) = txn + .get_external(position.change) + .map_err(|e| RepositoryError::Database(e.to_string()))? + else { + return Ok(None); + }; + + let mut current = position.inode_node(); + let mut visited = HashSet::new(); + let mut lines = Vec::new(); + + loop { + if !visited.insert(current) { + return Ok(None); + } + + let mut adj = txn + .init_inode_adj( + inode, + current, + EdgeFlags::BLOCK, + EdgeFlags::BLOCK | EdgeFlags::FOLDER, + ) + .map_err(|e| RepositoryError::Database(e.to_string()))?; + + let mut alive_candidates: Vec<(GraphNode, NodeId)> = Vec::new(); + let mut next_dead: Option<(GraphNode, NodeId)> = None; + + while let Some(edge) = txn.next_inode_adj(&mut adj) { + let edge = edge.map_err(|e| RepositoryError::Database(e.to_string()))?; + let flags = edge.flag(); + if flags.contains(EdgeFlags::PARENT) + || flags.contains(EdgeFlags::DELETED) + || flags.contains(EdgeFlags::PSEUDO) + || !import_seed_edge_visible(&edge, &visible) + { + continue; + } + + let Some(dest) = txn + .find_block_in_inode(inode, edge.dest()) + .map_err(|e| RepositoryError::Database(e.to_string()))? + .or_else(|| txn.find_block(edge.dest()).ok()) + else { + continue; + }; + if visited.contains(&dest) || !import_seed_node_visible(dest, &visible) { + continue; + } + let introduced_by = edge.introduced_by(); + if !import_seed_is_dead(&txn, inode, dest, &visible) { + alive_candidates.push((dest, introduced_by)); + } else if next_dead.is_none() { + next_dead = Some((dest, introduced_by)); + } + } + + let next_alive = if alive_candidates.len() <= 1 { + alive_candidates.into_iter().next() + } else { + alive_candidates + .iter() + .copied() + .find(|(candidate, _)| { + let reaches_other = alive_candidates.iter().copied().any(|(other, _)| { + other != *candidate + && import_seed_alive_reaches( + &txn, inode, *candidate, other, &visible, + ) + }); + let reached_by_other = + alive_candidates.iter().copied().any(|(other, _)| { + other != *candidate + && import_seed_alive_reaches( + &txn, inode, other, *candidate, &visible, + ) + }); + reaches_other && !reached_by_other + }) + .or_else(|| { + alive_candidates.iter().copied().find(|(candidate, _)| { + alive_candidates.iter().copied().any(|(other, _)| { + other != *candidate + && import_seed_alive_reaches( + &txn, inode, *candidate, other, &visible, + ) + }) + }) + }) + .or_else(|| alive_candidates.into_iter().next()) + }; + + let Some((dest, introduced_by)) = next_alive.or(next_dead) else { + break; + }; + + let is_inode_marker = dest.start == dest.end && dest.start == position.pos; + let is_alive = !import_seed_is_dead(&txn, inode, dest, &visible); + if is_alive && !is_inode_marker && !dest.change.is_root() && dest.start != dest.end { + let Some(change) = txn + .get_external(dest.change) + .map_err(|e| RepositoryError::Database(e.to_string()))? + else { + return Ok(None); + }; + let Some(incoming_by) = txn + .get_external(introduced_by) + .map_err(|e| RepositoryError::Database(e.to_string()))? + else { + return Ok(None); + }; + lines.push(ImportLineIndexSeedLine { + change, + start: dest.start, + end: dest.end, + incoming_by, + }); + } + + current = dest; + } + + Ok(Some(ImportLineIndexSeed { + inode_pos: Position::new(inode_change, position.pos), + lines, + })) + } + /// Assemble, save, and apply a freshly imported Git commit without going /// through the normal `insert_change()` load/check path. /// diff --git a/atomic-repository/src/repository/mod.rs b/atomic-repository/src/repository/mod.rs index dd3d5cf..a1af029 100644 --- a/atomic-repository/src/repository/mod.rs +++ b/atomic-repository/src/repository/mod.rs @@ -114,7 +114,9 @@ mod vault_intent; mod vault_kg_enrich; mod vault_names; mod vault_triples; -pub use insert::{ImportWriteOutcome, ImportWriteTimings}; +pub use insert::{ + ImportLineIndexSeed, ImportLineIndexSeedLine, ImportWriteOutcome, ImportWriteTimings, +}; pub use semantic_materialize::{CrdtMaterializeOptions, CrdtMaterializeOutcome}; pub use vault_embeddings::{hash_embed, EmbedConfig, TextChunk}; pub use vault_goal::{ diff --git a/atomic-repository/src/repository/tests/integration_tests.rs b/atomic-repository/src/repository/tests/integration_tests.rs index 6c0ff40..f35aebd 100644 --- a/atomic-repository/src/repository/tests/integration_tests.rs +++ b/atomic-repository/src/repository/tests/integration_tests.rs @@ -246,6 +246,48 @@ fn test_status_clean_after_modify_and_record() { ); } +#[test] +fn test_import_line_index_seed_reads_current_graph_lines() { + let (temp_dir, repo) = create_temp_repo(); + + let file_path = temp_dir.path().join("seed_test.txt"); + std::fs::write(&file_path, b"one\ntwo\nthree\n").unwrap(); + repo.add("seed_test.txt", TrackingOptions::default()) + .unwrap(); + + repo.record( + ChangeHeader::new("seed base"), + RecordOptions::new() + .with_all(true) + .save_to_store(true) + .apply_after_record(true), + ) + .unwrap(); + + let seed = repo + .import_line_index_seed("seed_test.txt") + .unwrap() + .expect("tracked file should seed from graph"); + assert_eq!(seed.lines.len(), 3); + assert!(seed.lines.iter().all(|line| line.start < line.end)); + + std::fs::write(&file_path, b"one\nTWO\nthree\nfour\n").unwrap(); + repo.record( + ChangeHeader::new("seed edit"), + RecordOptions::new() + .with_all(true) + .save_to_store(true) + .apply_after_record(true), + ) + .unwrap(); + + let seed = repo + .import_line_index_seed("seed_test.txt") + .unwrap() + .expect("edited file should seed from graph"); + assert_eq!(seed.lines.len(), 4); +} + /// Test that switching views correctly outputs file content. /// /// This test verifies that when switching between views that share diff --git a/tests/harness/18_git_import_hot_file.sh b/tests/harness/18_git_import_hot_file.sh new file mode 100644 index 0000000..12cc350 --- /dev/null +++ b/tests/harness/18_git_import_hot_file.sh @@ -0,0 +1,198 @@ +#!/usr/bin/env bash +# 18_git_import_hot_file.sh — Synthetic hot-file git import regression. +# +# This harness creates a local Git repository with one long-lived file that is +# edited repeatedly across many commits, then times `atomic git import`. +# It is meant to reproduce the Terraform-style "ordinary late edit to a hot +# file" shape without cloning a huge upstream repository. + +HARNESS_DIR="$(cd "$(dirname "$0")" && pwd)" +source "$HARNESS_DIR/helpers.sh" + +HOT_FILE_COMMITS="${HOT_FILE_COMMITS:-60}" +HOT_FILE_LINES="${HOT_FILE_LINES:-300}" +HOT_FILE_IMPORT_MAX_SECONDS="${HOT_FILE_IMPORT_MAX_SECONDS:-30}" +HOT_FILE_MODE="${HOT_FILE_MODE:-linear}" +HOT_FILE_BRANCHES="${HOT_FILE_BRANCHES:-8}" +HOT_FILE_ALLOWED_SKIPS="${HOT_FILE_ALLOWED_SKIPS:-3}" + +HOT_FILE_PATH="terraform/context_apply_test.go" + +count_imported_git_changes() { + local count=0 + local hash + while IFS= read -r hash; do + [[ -z "$hash" ]] && continue + if atomic change "$hash" 2>/dev/null | grep -q 'Commit:'; then + count=$((count + 1)) + fi + done < <(atomic log --format short --no-color --full-hash 2>/dev/null | awk '/^[A-Z2-7]{20,}/ { print $1 }') + echo "$count" +} + +git_first_parent_commit_count() { + git rev-list --first-parent --count HEAD 2>/dev/null || echo "0" +} + +write_hot_file() { + local commit_idx="$1" + local line_idx + + mkdir -p "$(dirname "$HOT_FILE_PATH")" + : > "$HOT_FILE_PATH" + for ((line_idx = 1; line_idx <= HOT_FILE_LINES; line_idx++)); do + if (( line_idx % 37 == commit_idx % 37 )); then + printf 'func TestHotPath_%04d_%04d(t *testing.T) { t.Log("hot-%04d") }\n' \ + "$line_idx" "$commit_idx" "$commit_idx" >> "$HOT_FILE_PATH" + else + printf 'func TestHotPath_%04d(t *testing.T) { t.Log("stable-%04d") }\n' \ + "$line_idx" "$line_idx" >> "$HOT_FILE_PATH" + fi + done +} + +edit_hot_line_in_place() { + local line_no="$1" + local marker="$2" + + awk -v line_no="$line_no" -v marker="$marker" ' + NR == line_no { + printf "func TestHotPath_%04d_%s(t *testing.T) { t.Log(\"%s\") }\n", line_no, marker, marker + next + } + { print } + ' "$HOT_FILE_PATH" > "$HOT_FILE_PATH.tmp" + mv "$HOT_FILE_PATH.tmp" "$HOT_FILE_PATH" +} + +create_linear_history() { + local i + for ((i = 1; i <= HOT_FILE_COMMITS; i++)); do + write_hot_file "$i" + git add "$HOT_FILE_PATH" + git commit --quiet -m "Hot file edit $i" + done +} + +create_branchy_history() { + local main_branch branch line_no b round marker + + main_branch="$(git symbolic-ref --short HEAD 2>/dev/null || git rev-parse --abbrev-ref HEAD)" + + # Create independent branch edits from the same base line. Importing with + # --all replays all branch changes into the Atomic graph, producing a + # branchy file graph before the final mainline edit. + for ((b = 1; b <= HOT_FILE_BRANCHES; b++)); do + git checkout --quiet -b "hot-branch-$b" "$main_branch" + line_no=$(( (b % HOT_FILE_LINES) + 1 )) + for ((round = 1; round <= HOT_FILE_COMMITS; round++)); do + marker="branch_${b}_${round}" + edit_hot_line_in_place "$line_no" "$marker" + git add "$HOT_FILE_PATH" + git commit --quiet -m "Branch $b hot edit $round" + done + done + + git checkout --quiet "$main_branch" + line_no=$(( (HOT_FILE_BRANCHES % HOT_FILE_LINES) + 2 )) + edit_hot_line_in_place "$line_no" "final_main_${HOT_FILE_COMMITS}" + git add "$HOT_FILE_PATH" + git commit --quiet -m "Final main hot edit" +} + +echo "" +echo "${BOLD}══════════════════════════════════════════════════════════════${RESET}" +echo "${BOLD} Suite: 18_git_import_hot_file${RESET}" +echo "${BOLD}══════════════════════════════════════════════════════════════${RESET}" + +begin_section "Git Import: Synthetic Hot File" + +make_temp_repo "git-import-hot-file" +init_git_repo + +write_hot_file 0 +git add "$HOT_FILE_PATH" +git commit --quiet -m "Initial hot file" + +case "$HOT_FILE_MODE" in + linear) + create_linear_history + ;; + branchy) + create_branchy_history + ;; + *) + _fail "valid HOT_FILE_MODE" "expected 'linear' or 'branchy', got '$HOT_FILE_MODE'" + print_summary + exit 1 + ;; +esac + +expected_commits="$(git_first_parent_commit_count)" +if [[ "$HOT_FILE_MODE" == "branchy" ]]; then + expected_commits="$(git rev-list --all --count 2>/dev/null || echo "$expected_commits")" +fi +echo " Synthetic history: mode=${HOT_FILE_MODE}, commits=${expected_commits}, lines=${HOT_FILE_LINES}, path=${HOT_FILE_PATH}" + +atomic init >/dev/null 2>&1 + +start_time=$(date +%s) +import_args=(git import) +if [[ "$HOT_FILE_MODE" == "branchy" ]]; then + import_args+=(--all) +fi + +import_out="$(atomic "${import_args[@]}" 2>&1)" || { + _fail "hot-file import succeeds" "$import_out" + print_summary + exit 1 +} +end_time=$(date +%s) +duration=$((end_time - start_time)) + +_pass "hot-file import succeeds" +echo " Import took ${duration}s" + +actual="$(count_imported_git_changes)" +if [[ "$HOT_FILE_MODE" == "branchy" ]]; then + if [[ "$actual" -ge 2 ]]; then + _pass "branchy import produced git changes ($actual imported)" + else + _fail "branchy import produced git changes" "expected at least 2 imported git changes, got $actual" + fi +elif [[ "$actual" -eq "$expected_commits" ]]; then + _pass "imported git change count matches ($actual vs $expected_commits)" +elif [[ "$actual" -ge $((expected_commits - HOT_FILE_ALLOWED_SKIPS)) ]]; then + _pass "imported git change count within skip tolerance ($actual vs $expected_commits)" +else + _fail "imported git change count matches" \ + "expected $expected_commits imported git changes, got $actual (allowed skips: $HOT_FILE_ALLOWED_SKIPS)" +fi + +if [[ "$duration" -le "$HOT_FILE_IMPORT_MAX_SECONDS" ]]; then + _pass "hot-file import within ${HOT_FILE_IMPORT_MAX_SECONDS}s budget" +else + _fail "hot-file import within ${HOT_FILE_IMPORT_MAX_SECONDS}s budget" \ + "took ${duration}s; likely hot-file assembly traversal regression" +fi + +status_out="$(atomic status --short 2>/dev/null || true)" +hot_file_status="$(echo "$status_out" | grep -F "$HOT_FILE_PATH" || true)" +if [[ -z "$hot_file_status" ]]; then + _pass "hot file clean after import" +else + _fail "hot file clean after import" "$hot_file_status" +fi + +expected_marker="hot-$(printf '%04d' "$HOT_FILE_COMMITS")" +if [[ "$HOT_FILE_MODE" == "branchy" ]]; then + expected_marker="final_main_${HOT_FILE_COMMITS}" +fi + +if grep -q "$expected_marker" "$HOT_FILE_PATH"; then + _pass "final hot-file content materialized" +else + _fail "final hot-file content materialized" "missing final edit marker '$expected_marker'" +fi + +print_summary From 0e4ce07d45fb34a7dd07b62bcd64b98ba9e2f8e4 Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Wed, 20 May 2026 09:22:32 -0400 Subject: [PATCH 7/8] cleanup build and bump version --- Cargo.lock | 18 +++++++++--------- Cargo.toml | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7c2bf05..15f60f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -117,7 +117,7 @@ dependencies = [ [[package]] name = "atomic-agent" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "atomic-core", @@ -143,7 +143,7 @@ dependencies = [ [[package]] name = "atomic-cli" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "atomic-agent", @@ -180,7 +180,7 @@ dependencies = [ [[package]] name = "atomic-config" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "dirs", @@ -194,7 +194,7 @@ dependencies = [ [[package]] name = "atomic-core" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "bitflags", @@ -220,7 +220,7 @@ dependencies = [ [[package]] name = "atomic-identity" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "atomic-config", @@ -250,7 +250,7 @@ dependencies = [ [[package]] name = "atomic-remote" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "bytes", @@ -270,7 +270,7 @@ dependencies = [ [[package]] name = "atomic-repository" -version = "0.6.0" +version = "0.6.1" dependencies = [ "anyhow", "atomic-config", @@ -299,7 +299,7 @@ dependencies = [ [[package]] name = "atomic-semantic" -version = "0.6.0" +version = "0.6.1" dependencies = [ "serde", "serde_json", @@ -315,7 +315,7 @@ dependencies = [ [[package]] name = "atomic-teams" -version = "0.6.0" +version = "0.6.1" dependencies = [ "atomic-remote", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 733e324..6b8a27f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,7 +13,7 @@ members = [ ] [workspace.package] -version = "0.6.0" +version = "0.6.1" edition = "2021" authors = ["Atomic Contributors"] license = "Apache-2.0" From 848ec62c5fbcdde91b7a02b2ddb61a6161812224 Mon Sep 17 00:00:00 2001 From: Lee Faus Date: Wed, 20 May 2026 09:51:47 -0400 Subject: [PATCH 8/8] fix clippy error --- atomic-cli/src/commands/git/parallel.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atomic-cli/src/commands/git/parallel.rs b/atomic-cli/src/commands/git/parallel.rs index 77ab0a6..ed411c9 100644 --- a/atomic-cli/src/commands/git/parallel.rs +++ b/atomic-cli/src/commands/git/parallel.rs @@ -729,7 +729,7 @@ fn import_shape_summary(parsed: &ParsedCommit, line_index: &ImportLineIndex) -> ) }) .collect(); - entries.sort_by(|a, b| b.0.cmp(&a.0)); + entries.sort_by_key(|entry| std::cmp::Reverse(entry.0)); entries .into_iter() .take(3)