diff --git a/CLAUDE.md b/CLAUDE.md index e5c0c176..559150ba 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -356,6 +356,15 @@ expr.Run(program, envMap) // Returns bool Methods are bound at runtime to actual node fields, enabling type-safe queries without reflection. +### Diff-Aware Scanning (`pathfinder ci --base`) +CI mode filters findings to files touched by the PR. The wire is: + +1. `diff.GetChangedFiles()` runs `git diff --name-only --diff-filter=ACMRD ..HEAD`. +2. The flag set is **ACMRD** (Added, Copied, Modified, Renamed, **Deleted**). Deletions matter because the downstream filter at `cmd/ci.go` uses the list as a sentinel — an empty list means "no source in the diff," which a deletion-only PR would otherwise look identical to. +3. `cmd/ci.go` applies `output.NewDiffFilter(changedFiles)` whenever `diffEnabled`, **without** a `len(changedFiles) > 0` guard. An empty diff intersection returns zero findings; it does NOT fall back to a full-repo scan. Falling back was the May 2026 regression that surfaced as 207 findings on a PR that only deleted a YAML workflow file. + +If you ever need to bring back the full-scan fallback (e.g., for `--no-diff`), do it by turning off `diffEnabled` rather than by smuggling logic into the filter guard. The two states must stay separable: "diff-aware on, nothing matched" vs "diff-aware off, scan everything." + ### SARIF Report Generation CI mode generates SARIF reports for GitHub Advanced Security: ```go diff --git a/sast-engine/cmd/ci.go b/sast-engine/cmd/ci.go index 98c62f4b..772449e2 100644 --- a/sast-engine/cmd/ci.go +++ b/sast-engine/cmd/ci.go @@ -366,30 +366,29 @@ Examples: // Merge container detections with code analysis detections. allEnriched = append(allEnriched, containerDetections...) - // Apply diff filter when diff-aware mode is active. - if diffEnabled && len(changedFiles) > 0 { - totalBefore := len(allEnriched) - diffFilter := output.NewDiffFilter(changedFiles) - allEnriched = diffFilter.Filter(allEnriched) + // Apply diff filter when diff-aware mode is active. See + // applyDiffFilter for the explicit "do not fall back to full scan + // on empty diff" contract this commit enforces. + totalBefore := len(allEnriched) + var filterApplied bool + allEnriched, filterApplied = applyDiffFilter(allEnriched, changedFiles, diffEnabled) + if filterApplied { logger.Progress("Diff filter: %d/%d findings in changed files", len(allEnriched), totalBefore) } // Total rules = code analysis rules loaded + container rules loaded. totalRules := len(rules) + containerRulesCount - // Count unique source files. When diff-aware, only count changed files. - var filesScanned int - if diffEnabled && len(changedFiles) > 0 { - filesScanned = len(changedFiles) - } else { - uniqueFiles := make(map[string]bool) - for _, node := range codeGraph.Nodes { - if node.File != "" { - uniqueFiles[node.File] = true - } + // Count unique source files for the report. countScannedFiles picks + // len(changedFiles) when diff-aware (including 0, by design), else + // the unique-file count derived from the code graph. + uniqueFiles := make(map[string]bool) + for _, node := range codeGraph.Nodes { + if node.File != "" { + uniqueFiles[node.File] = true } - filesScanned = len(uniqueFiles) } + filesScanned := countScannedFiles(diffEnabled, len(changedFiles), len(uniqueFiles)) logger.Statistic("Scan complete. Found %d vulnerabilities", len(allEnriched)) logger.Progress("Generating %s output...", outputFormat) diff --git a/sast-engine/cmd/diff_filter.go b/sast-engine/cmd/diff_filter.go new file mode 100644 index 00000000..eb81dfc0 --- /dev/null +++ b/sast-engine/cmd/diff_filter.go @@ -0,0 +1,41 @@ +package cmd + +import ( + "github.com/shivasurya/code-pathfinder/sast-engine/dsl" + "github.com/shivasurya/code-pathfinder/sast-engine/output" +) + +// applyDiffFilter intersects detections with the set of files changed in +// the diff when diff-aware mode is enabled. When diffEnabled is false it +// returns the input unchanged so callers can branch on the bool to decide +// whether to log "Diff filter: X/Y..." +// +// Critically, applyDiffFilter does NOT short-circuit when changedFiles is +// empty: an empty list means the diff genuinely covers no source files +// (delete-only PR, docs-only PR, empty PR), and the right answer is zero +// detections, not "fall back to a full scan." Falling back was the May +// 2026 regression that surfaced as 207 findings on a PR that only +// deleted a YAML workflow file (the public post-mortem in CLAUDE.md). +func applyDiffFilter( + detections []*dsl.EnrichedDetection, + changedFiles []string, + diffEnabled bool, +) (filtered []*dsl.EnrichedDetection, applied bool) { + if !diffEnabled { + return detections, false + } + return output.NewDiffFilter(changedFiles).Filter(detections), true +} + +// countScannedFiles picks the file-count number to report in scan output +// based on whether diff-aware mode was active. When diff-aware, the +// reported count is the number of files in the diff (which may be 0 by +// design; see applyDiffFilter for why we no longer fall back). When not +// diff-aware, the count is the number of unique files the file walk +// observed and the scanner actually parsed. +func countScannedFiles(diffEnabled bool, changedFilesCount, fallbackUniqueFilesCount int) int { + if diffEnabled { + return changedFilesCount + } + return fallbackUniqueFilesCount +} diff --git a/sast-engine/cmd/diff_filter_test.go b/sast-engine/cmd/diff_filter_test.go new file mode 100644 index 00000000..5d5235ce --- /dev/null +++ b/sast-engine/cmd/diff_filter_test.go @@ -0,0 +1,93 @@ +package cmd + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/shivasurya/code-pathfinder/sast-engine/dsl" +) + +// detection builds a minimal EnrichedDetection whose RelPath matches the +// given path. Sufficient for DiffFilter, which keys off RelPath / FilePath. +func detection(rel string) *dsl.EnrichedDetection { + return &dsl.EnrichedDetection{ + Location: dsl.LocationInfo{RelPath: rel, FilePath: rel}, + } +} + +// --- applyDiffFilter --- + +func TestApplyDiffFilter_DisabledReturnsInputUnchanged(t *testing.T) { + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, nil, false) + assert.False(t, applied, "applied flag must be false when diff is disabled") + assert.Equal(t, in, out, "input must be returned unchanged when diff is disabled") +} + +func TestApplyDiffFilter_DisabledIgnoresChangedFiles(t *testing.T) { + // changedFiles is non-nil but diffEnabled is false: the filter still must + // not run. Guards against accidental "always apply if list provided" + // regressions. + in := []*dsl.EnrichedDetection{detection("a.py")} + out, applied := applyDiffFilter(in, []string{"unrelated.py"}, false) + assert.False(t, applied) + assert.Equal(t, in, out) +} + +func TestApplyDiffFilter_EnabledWithMatchingFile(t *testing.T) { + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, []string{"a.py"}, true) + assert.True(t, applied, "applied flag must be true when diff is enabled") + assert.Len(t, out, 1, "only a.py finding should survive") + assert.Equal(t, "a.py", out[0].Location.RelPath) +} + +func TestApplyDiffFilter_EnabledWithNoMatches(t *testing.T) { + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, []string{"unrelated.py"}, true) + assert.True(t, applied) + assert.Empty(t, out) +} + +func TestApplyDiffFilter_EnabledWithEmptyChangedFiles(t *testing.T) { + // The regression case. Empty changedFiles + diffEnabled=true must produce + // zero detections, NOT a full pass-through. Falling back to a full scan + // here was the May 2026 207-findings bug. + in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")} + out, applied := applyDiffFilter(in, []string{}, true) + assert.True(t, applied, "applied flag must remain true even with empty changedFiles") + assert.Empty(t, out, "empty diff must filter ALL findings to zero, not pass them through") +} + +func TestApplyDiffFilter_EnabledWithNilChangedFiles(t *testing.T) { + // nil slice is treated the same as an empty slice: nothing to match + // against, so everything is filtered out. Same regression guard. + in := []*dsl.EnrichedDetection{detection("a.py")} + out, applied := applyDiffFilter(in, nil, true) + assert.True(t, applied) + assert.Empty(t, out) +} + +// --- countScannedFiles --- + +func TestCountScannedFiles_DiffEnabledReturnsChangedCount(t *testing.T) { + assert.Equal(t, 5, countScannedFiles(true, 5, 999), + "diff-aware count must be the changed-files count, regardless of fallback") +} + +func TestCountScannedFiles_DiffEnabledWithZeroChangedReturnsZero(t *testing.T) { + // Regression guard mirroring applyDiffFilter's empty-diff behaviour: an + // empty diff must report 0 files scanned, NOT silently use the fallback + // count (which would mask the empty-diff state in the JSON output). + assert.Equal(t, 0, countScannedFiles(true, 0, 250)) +} + +func TestCountScannedFiles_DiffDisabledReturnsFallback(t *testing.T) { + assert.Equal(t, 42, countScannedFiles(false, 7, 42), + "non-diff scans must report the unique-file count from the graph") +} + +func TestCountScannedFiles_DiffDisabledWithZeroFallback(t *testing.T) { + assert.Equal(t, 0, countScannedFiles(false, 7, 0)) +} diff --git a/sast-engine/cmd/scan.go b/sast-engine/cmd/scan.go index 7c4899ba..03cb84bf 100644 --- a/sast-engine/cmd/scan.go +++ b/sast-engine/cmd/scan.go @@ -356,11 +356,14 @@ Examples: // Merge container detections with code analysis detections allEnriched = append(allEnriched, containerDetections...) - // Apply diff filter when diff-aware mode is active. - if diffAware && len(changedFiles) > 0 { - totalBefore := len(allEnriched) - diffFilter := output.NewDiffFilter(changedFiles) - allEnriched = diffFilter.Filter(allEnriched) + // Apply diff filter when diff-aware mode is active. Shared with + // cmd/ci.go via applyDiffFilter to keep the empty-diff contract + // (return zero findings, do NOT fall back to a full scan) honoured + // in both commands. See cmd/diff_filter.go for the rationale. + totalBefore := len(allEnriched) + var filterApplied bool + allEnriched, filterApplied = applyDiffFilter(allEnriched, changedFiles, diffAware) + if filterApplied { logger.Progress("Diff filter: %d/%d findings in changed files", len(allEnriched), totalBefore) } diff --git a/sast-engine/diff/git_provider.go b/sast-engine/diff/git_provider.go index 1356d937..3d695075 100644 --- a/sast-engine/diff/git_provider.go +++ b/sast-engine/diff/git_provider.go @@ -59,13 +59,17 @@ func (p *GitDiffProvider) findMergeBase() (string, error) { } // diffFiles runs git diff --name-only to list changed files from merge-base to head. -// Uses --diff-filter=ACMR to include Added, Copied, Modified, and Renamed files only. +// Uses --diff-filter=ACMRD to include Added, Copied, Modified, Renamed, AND Deleted +// files. Deletions matter because callers use this list as a sentinel: an empty list +// signals "no files in the diff," and dropping deletions would make a delete-only PR +// look identical to an empty PR, which historically caused the diff filter at +// cmd/ci.go to fall back to a full-repo scan. func (p *GitDiffProvider) diffFiles(mergeBase string) ([]string, error) { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() diffRange := mergeBase + ".." + p.HeadRef - cmd := exec.CommandContext(ctx, "git", "diff", "--name-only", "--diff-filter=ACMR", diffRange) + cmd := exec.CommandContext(ctx, "git", "diff", "--name-only", "--diff-filter=ACMRD", diffRange) cmd.Dir = p.ProjectRoot output, err := cmd.Output() diff --git a/sast-engine/diff/git_provider_test.go b/sast-engine/diff/git_provider_test.go index 08bd767f..43ba5b77 100644 --- a/sast-engine/diff/git_provider_test.go +++ b/sast-engine/diff/git_provider_test.go @@ -109,18 +109,20 @@ func TestGitDiffProvider_BranchWithMergeBase(t *testing.T) { assert.Equal(t, []string{"feature.py"}, files) } -func TestGitDiffProvider_DeletedFileExcluded(t *testing.T) { - // Tests that deleted files are excluded (--diff-filter=ACMR does not include D). +func TestGitDiffProvider_DeletedFileIncluded(t *testing.T) { + // Deleted files MUST appear in GetChangedFiles. They drive a separate + // downstream signal: cmd/ci.go treats an empty list as "no source in the + // diff" and refuses to fall back to a full scan, so dropping deletions + // would make a delete-only PR look identical to an empty PR. See the + // comment on diffFiles for the full rationale (--diff-filter=ACMRD). dir := setupTestRepo(t) - // Add a file on main. writeFile(t, filepath.Join(dir, "to_delete.py"), "# will be deleted") runGit(t, dir, "add", ".") runGit(t, dir, "commit", "-m", "add file to delete") runGit(t, dir, "checkout", "-b", "feature") - // Delete the file and add a new one. require.NoError(t, os.Remove(filepath.Join(dir, "to_delete.py"))) writeFile(t, filepath.Join(dir, "new_file.py"), "# new") runGit(t, dir, "add", ".") @@ -134,8 +136,36 @@ func TestGitDiffProvider_DeletedFileExcluded(t *testing.T) { files, err := provider.GetChangedFiles() require.NoError(t, err) - // to_delete.py should NOT appear (deleted). Only new_file.py. - assert.Equal(t, []string{"new_file.py"}, files) + assert.ElementsMatch(t, []string{"to_delete.py", "new_file.py"}, files) +} + +func TestGitDiffProvider_DeletionOnlyPR(t *testing.T) { + // Regression for the May 2026 207-findings incident: a PR that only + // deletes a file (no additions or modifications) must surface that + // deletion in GetChangedFiles. Returning an empty list here looks + // identical to an empty PR and historically caused cmd/ci.go to skip + // the diff filter entirely, dumping every full-scan finding. + dir := setupTestRepo(t) + + writeFile(t, filepath.Join(dir, ".github", "workflows", "doomed.yml"), "name: doomed\n") + runGit(t, dir, "add", ".") + runGit(t, dir, "commit", "-m", "add workflow we'll later delete") + + runGit(t, dir, "checkout", "-b", "feature") + require.NoError(t, os.Remove(filepath.Join(dir, ".github", "workflows", "doomed.yml"))) + runGit(t, dir, "add", ".") + runGit(t, dir, "commit", "-m", "delete the workflow") + + provider := &GitDiffProvider{ + ProjectRoot: dir, + BaseRef: "main", + HeadRef: "HEAD", + } + + files, err := provider.GetChangedFiles() + require.NoError(t, err) + assert.Equal(t, []string{".github/workflows/doomed.yml"}, files, + "deletion-only diff must surface the deleted path so callers don't mistake it for an empty PR") } func TestGitDiffProvider_EmptyDiff(t *testing.T) { diff --git a/sast-engine/output/filter.go b/sast-engine/output/filter.go index a6dd2790..342ad149 100644 --- a/sast-engine/output/filter.go +++ b/sast-engine/output/filter.go @@ -20,11 +20,13 @@ func NewDiffFilter(changedFiles []string) *DiffFilter { } // Filter returns only detections whose RelPath is in the changed files set. -// If no changed files were provided (empty set), all detections are returned. +// An empty set filters everything out: a DiffFilter constructed with no +// changed files represents "nothing in the diff to match," and the right +// answer is zero detections, not pass-through. Callers that want "no +// filtering" must not call Filter at all (see cmd.applyDiffFilter which +// gates on the diffEnabled flag for exactly this reason). Falling back to +// pass-through was the May 2026 207-findings regression. func (f *DiffFilter) Filter(detections []*dsl.EnrichedDetection) []*dsl.EnrichedDetection { - if len(f.changedFiles) == 0 { - return detections - } filtered := make([]*dsl.EnrichedDetection, 0, len(detections)) for _, det := range detections { if f.changedFiles[det.Location.RelPath] { @@ -35,10 +37,9 @@ func (f *DiffFilter) Filter(detections []*dsl.EnrichedDetection) []*dsl.Enriched } // FilteredCount returns the number of detections that would be removed. +// With an empty changed-files set, every detection would be removed, so +// the count equals len(detections). See Filter for the rationale. func (f *DiffFilter) FilteredCount(detections []*dsl.EnrichedDetection) int { - if len(f.changedFiles) == 0 { - return 0 - } count := 0 for _, det := range detections { if !f.changedFiles[det.Location.RelPath] { diff --git a/sast-engine/output/filter_test.go b/sast-engine/output/filter_test.go index f00fdbc2..286fd17b 100644 --- a/sast-engine/output/filter_test.go +++ b/sast-engine/output/filter_test.go @@ -74,14 +74,17 @@ func TestDiffFilter_Filter(t *testing.T) { wantRelPaths: []string{"app/views.py", "app/auth.py"}, }, { - name: "empty changed files returns all detections", + // An empty changed-files set means "nothing in the diff to match"; + // the right answer is zero detections, not pass-through. Callers + // that want "no filtering" must skip Filter entirely. + name: "empty changed files filters everything out", changedFiles: []string{}, detections: []*dsl.EnrichedDetection{ makeDetection("app/views.py", "critical"), makeDetection("app/models.py", "high"), }, - wantCount: 2, - wantRelPaths: []string{"app/views.py", "app/models.py"}, + wantCount: 0, + wantRelPaths: nil, }, { name: "nil detections", @@ -257,12 +260,15 @@ func TestDiffFilter_FilteredCount(t *testing.T) { wantFiltered: 2, }, { - name: "empty changed files means no filtering", + // With an empty changed-files set, every detection would be + // removed by Filter (see Filter's contract): FilteredCount must + // report that as well, not 0. + name: "empty changed files reports all as removed", changedFiles: []string{}, detections: []*dsl.EnrichedDetection{ makeDetection("app/views.py", "critical"), }, - wantFiltered: 0, + wantFiltered: 1, }, { name: "nil detections",