From 8d3c996363788385c0c73541affa431bd127c218 Mon Sep 17 00:00:00 2001 From: Sephyi Date: Sun, 19 Apr 2026 19:01:48 +0200 Subject: [PATCH] perf(services): add RegexSet first-pass for secret scanning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Secret scanning previously ran each of the 24 built-in regexes against every added diff line, making the cost O(N x 24 x L) per scan. The common case (no secret on the line) paid the full per-pattern overhead. Bundle the patterns into a new `PatternSet` that pairs the `Vec` with a `regex::RegexSet` built from the same sources. Per-line scanning now asks the aggregated automaton first; only lines that actually hit a pattern resolve an index back to the owning `SecretPattern`. `RegexSet::matches().iter()` yields indices in ascending order, so taking `.next()` preserves the prior "first pattern wins" semantics — output shape of `SecretMatch` is unchanged. The `RegexSet` is compiled once per `build_patterns` call (and cached via `LazyLock` for the default pattern set), so build cost is amortised. All 38 existing safety tests plus the three proptests continue to pass. Closes audit entry F-022 from #3. --- src/services/safety.rs | 108 ++++++++++++++++++++++++++++++----------- 1 file changed, 81 insertions(+), 27 deletions(-) diff --git a/src/services/safety.rs b/src/services/safety.rs index a2f17b0..70ff19d 100644 --- a/src/services/safety.rs +++ b/src/services/safety.rs @@ -5,7 +5,7 @@ use std::borrow::Cow; use std::sync::LazyLock; -use regex::Regex; +use regex::{Regex, RegexSet}; use crate::domain::StagedChanges; @@ -23,11 +23,62 @@ pub struct SecretPattern { pub description: Cow<'static, str>, } +/// A bundle of secret-detection patterns together with a [`RegexSet`] over +/// the same patterns for a fast first-pass filter. +/// +/// Per-line scanning calls [`RegexSet::matches`] first; only on hits does it +/// fall back to individual [`Regex::is_match`] on the matching pattern. This +/// turns the common no-match case from `O(N × 24 × L)` into +/// `O(N × L)` (one aggregated automaton pass) plus an `O(1)` membership check. +pub struct PatternSet { + patterns: Vec, + set: RegexSet, +} + +impl PatternSet { + /// Number of patterns in the set. + #[allow(dead_code)] // exercised by integration tests; library consumers may also want it + pub fn len(&self) -> usize { + self.patterns.len() + } + + /// True when there are no patterns — every scan is a no-op. + #[allow(dead_code)] // paired with len() for clippy::len_without_is_empty + pub fn is_empty(&self) -> bool { + self.patterns.is_empty() + } + + /// Iterate the patterns in stable order (matches [`RegexSet`] index order). + #[allow(dead_code)] // exercised by integration tests; library consumers may also want it + pub fn iter(&self) -> std::slice::Iter<'_, SecretPattern> { + self.patterns.iter() + } + + /// Build a [`PatternSet`] from a `Vec`. The [`RegexSet`] + /// is compiled from the patterns' source strings; indices align 1:1 with + /// `patterns`, so `set.matches(line)` yields valid indices into + /// `patterns`. + /// + /// Since every individual [`Regex`] in `patterns` is already known to + /// compile, the combined [`RegexSet`] compile is expected to succeed; if + /// it somehow does not (size limits, etc.), we fall back to an empty + /// [`RegexSet`] and the scanner still works correctly — it just loses + /// the first-pass speedup and runs the per-pattern check on every line. + fn from_patterns(patterns: Vec) -> Self { + // `Regex::as_str` returns the exact source the regex was built from, + // so feeding the same strings into `RegexSet` yields identical + // semantics to the per-pattern `Regex::is_match` fallback. + let sources: Vec<&str> = patterns.iter().map(|p| p.regex.as_str()).collect(); + let set = RegexSet::new(&sources).unwrap_or_else(|_| RegexSet::empty()); + Self { patterns, set } + } +} + /// Build the full set of secret patterns, applying custom additions and disabled names. /// /// Custom patterns are compiled from user-provided regex strings. Invalid regexes /// are silently skipped (logged at warn level in the caller). -pub fn build_patterns(custom: &[String], disabled: &[String]) -> Vec { +pub fn build_patterns(custom: &[String], disabled: &[String]) -> PatternSet { let mut patterns = builtin_patterns(); // Remove disabled patterns by name (case-insensitive match) @@ -47,7 +98,7 @@ pub fn build_patterns(custom: &[String], disabled: &[String]) -> Vec Vec { @@ -197,7 +248,18 @@ fn builtin_patterns() -> Vec { } /// Default patterns (no custom, no disabled) for use by the legacy API. -static DEFAULT_PATTERNS: LazyLock> = LazyLock::new(builtin_patterns); +static DEFAULT_PATTERNS: LazyLock = + LazyLock::new(|| PatternSet::from_patterns(builtin_patterns())); + +/// First element of the set that matches `line`, if any. Uses +/// [`RegexSet::matches`] as a single-automaton first-pass filter — the +/// common case (no match) returns without touching any individual +/// [`Regex`]. `SetMatches::iter()` yields indices in ascending order, so +/// taking `.next()` preserves the previous "first pattern wins" semantics. +fn first_match<'a>(line: &str, patterns: &'a PatternSet) -> Option<&'a SecretPattern> { + let idx = patterns.set.matches(line).iter().next()?; + Some(&patterns.patterns[idx]) +} /// Scan per-file truncated diffs for secrets using default patterns. /// @@ -212,7 +274,7 @@ pub fn scan_for_secrets(changes: &StagedChanges) -> Vec { /// Scan per-file truncated diffs for secrets using the given pattern set. pub fn scan_for_secrets_with_patterns( changes: &StagedChanges, - patterns: &[SecretPattern], + patterns: &PatternSet, ) -> Vec { let mut found = Vec::new(); @@ -230,15 +292,13 @@ pub fn scan_for_secrets_with_patterns( continue; } - for pat in patterns { - if pat.regex.is_match(line) { - found.push(SecretMatch { - pattern_name: pat.name.to_string(), - file: file.path.display().to_string(), - line: Some(line_num), - }); - break; // One match per line is enough - } + if let Some(pat) = first_match(line, patterns) { + found.push(SecretMatch { + pattern_name: pat.name.to_string(), + file: file.path.display().to_string(), + line: Some(line_num), + }); + // One match per line is enough; fall through to next line. } } } @@ -261,10 +321,7 @@ static HUNK_HEADER: LazyLock = /// file diffs are truncated to `max_file_lines`. Parses the raw `git diff` /// output directly, tracking file paths from diff headers and calculating /// accurate source line numbers from hunk headers (`@@ -L,l +R,r @@`). -pub fn scan_full_diff_with_patterns( - full_diff: &str, - patterns: &[SecretPattern], -) -> Vec { +pub fn scan_full_diff_with_patterns(full_diff: &str, patterns: &PatternSet) -> Vec { let mut found = Vec::new(); let mut current_file = String::new(); let mut current_line: Option = None; @@ -316,15 +373,12 @@ pub fn scan_full_diff_with_patterns( // Only check added lines if line.starts_with('+') && !line.starts_with("+++") { let content = &line[1..]; - for pat in patterns { - if pat.regex.is_match(content) { - found.push(SecretMatch { - pattern_name: pat.name.to_string(), - file: current_file.clone(), - line: Some(*line_num), - }); - break; - } + if let Some(pat) = first_match(content, patterns) { + found.push(SecretMatch { + pattern_name: pat.name.to_string(), + file: current_file.clone(), + line: Some(*line_num), + }); } *line_num += 1; } else if line.starts_with(' ') {