Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,15 @@ expr.Run(program, envMap) // Returns bool

Methods are bound at runtime to actual node fields, enabling type-safe queries without reflection.

### Diff-Aware Scanning (`pathfinder ci --base`)
CI mode filters findings to files touched by the PR. The wire is:

1. `diff.GetChangedFiles()` runs `git diff --name-only --diff-filter=ACMRD <merge-base>..HEAD`.
2. The flag set is **ACMRD** (Added, Copied, Modified, Renamed, **Deleted**). Deletions matter because the downstream filter at `cmd/ci.go` uses the list as a sentinel — an empty list means "no source in the diff," which a deletion-only PR would otherwise look identical to.
3. `cmd/ci.go` applies `output.NewDiffFilter(changedFiles)` whenever `diffEnabled`, **without** a `len(changedFiles) > 0` guard. An empty diff intersection returns zero findings; it does NOT fall back to a full-repo scan. Falling back was the May 2026 regression that surfaced as 207 findings on a PR that only deleted a YAML workflow file.

If you ever need to bring back the full-scan fallback (e.g., for `--no-diff`), do it by turning off `diffEnabled` rather than by smuggling logic into the filter guard. The two states must stay separable: "diff-aware on, nothing matched" vs "diff-aware off, scan everything."

### SARIF Report Generation
CI mode generates SARIF reports for GitHub Advanced Security:
```go
Expand Down
31 changes: 15 additions & 16 deletions sast-engine/cmd/ci.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,30 +366,29 @@ Examples:
// Merge container detections with code analysis detections.
allEnriched = append(allEnriched, containerDetections...)

// Apply diff filter when diff-aware mode is active.
if diffEnabled && len(changedFiles) > 0 {
totalBefore := len(allEnriched)
diffFilter := output.NewDiffFilter(changedFiles)
allEnriched = diffFilter.Filter(allEnriched)
// Apply diff filter when diff-aware mode is active. See
// applyDiffFilter for the explicit "do not fall back to full scan
// on empty diff" contract this commit enforces.
totalBefore := len(allEnriched)
var filterApplied bool
allEnriched, filterApplied = applyDiffFilter(allEnriched, changedFiles, diffEnabled)
if filterApplied {
logger.Progress("Diff filter: %d/%d findings in changed files", len(allEnriched), totalBefore)
}

// Total rules = code analysis rules loaded + container rules loaded.
totalRules := len(rules) + containerRulesCount

// Count unique source files. When diff-aware, only count changed files.
var filesScanned int
if diffEnabled && len(changedFiles) > 0 {
filesScanned = len(changedFiles)
} else {
uniqueFiles := make(map[string]bool)
for _, node := range codeGraph.Nodes {
if node.File != "" {
uniqueFiles[node.File] = true
}
// Count unique source files for the report. countScannedFiles picks
// len(changedFiles) when diff-aware (including 0, by design), else
// the unique-file count derived from the code graph.
uniqueFiles := make(map[string]bool)
for _, node := range codeGraph.Nodes {
if node.File != "" {
uniqueFiles[node.File] = true
}
filesScanned = len(uniqueFiles)
}
filesScanned := countScannedFiles(diffEnabled, len(changedFiles), len(uniqueFiles))

logger.Statistic("Scan complete. Found %d vulnerabilities", len(allEnriched))
logger.Progress("Generating %s output...", outputFormat)
Expand Down
41 changes: 41 additions & 0 deletions sast-engine/cmd/diff_filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package cmd

import (
"github.com/shivasurya/code-pathfinder/sast-engine/dsl"
"github.com/shivasurya/code-pathfinder/sast-engine/output"
)

// applyDiffFilter intersects detections with the set of files changed in
// the diff when diff-aware mode is enabled. When diffEnabled is false it
// returns the input unchanged so callers can branch on the bool to decide
// whether to log "Diff filter: X/Y..."
//
// Critically, applyDiffFilter does NOT short-circuit when changedFiles is
// empty: an empty list means the diff genuinely covers no source files
// (delete-only PR, docs-only PR, empty PR), and the right answer is zero
// detections, not "fall back to a full scan." Falling back was the May
// 2026 regression that surfaced as 207 findings on a PR that only
// deleted a YAML workflow file (the public post-mortem in CLAUDE.md).
func applyDiffFilter(
detections []*dsl.EnrichedDetection,
changedFiles []string,
diffEnabled bool,
) (filtered []*dsl.EnrichedDetection, applied bool) {
if !diffEnabled {
return detections, false
}
return output.NewDiffFilter(changedFiles).Filter(detections), true
}

// countScannedFiles picks the file-count number to report in scan output
// based on whether diff-aware mode was active. When diff-aware, the
// reported count is the number of files in the diff (which may be 0 by
// design; see applyDiffFilter for why we no longer fall back). When not
// diff-aware, the count is the number of unique files the file walk
// observed and the scanner actually parsed.
func countScannedFiles(diffEnabled bool, changedFilesCount, fallbackUniqueFilesCount int) int {
if diffEnabled {
return changedFilesCount
}
return fallbackUniqueFilesCount
}
93 changes: 93 additions & 0 deletions sast-engine/cmd/diff_filter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package cmd

import (
"testing"

"github.com/stretchr/testify/assert"

"github.com/shivasurya/code-pathfinder/sast-engine/dsl"
)

// detection builds a minimal EnrichedDetection whose RelPath matches the
// given path. Sufficient for DiffFilter, which keys off RelPath / FilePath.
func detection(rel string) *dsl.EnrichedDetection {
return &dsl.EnrichedDetection{
Location: dsl.LocationInfo{RelPath: rel, FilePath: rel},
}
}

// --- applyDiffFilter ---

func TestApplyDiffFilter_DisabledReturnsInputUnchanged(t *testing.T) {
in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")}
out, applied := applyDiffFilter(in, nil, false)
assert.False(t, applied, "applied flag must be false when diff is disabled")
assert.Equal(t, in, out, "input must be returned unchanged when diff is disabled")
}

func TestApplyDiffFilter_DisabledIgnoresChangedFiles(t *testing.T) {
// changedFiles is non-nil but diffEnabled is false: the filter still must
// not run. Guards against accidental "always apply if list provided"
// regressions.
in := []*dsl.EnrichedDetection{detection("a.py")}
out, applied := applyDiffFilter(in, []string{"unrelated.py"}, false)
assert.False(t, applied)
assert.Equal(t, in, out)
}

func TestApplyDiffFilter_EnabledWithMatchingFile(t *testing.T) {
in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")}
out, applied := applyDiffFilter(in, []string{"a.py"}, true)
assert.True(t, applied, "applied flag must be true when diff is enabled")
assert.Len(t, out, 1, "only a.py finding should survive")
assert.Equal(t, "a.py", out[0].Location.RelPath)
}

func TestApplyDiffFilter_EnabledWithNoMatches(t *testing.T) {
in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")}
out, applied := applyDiffFilter(in, []string{"unrelated.py"}, true)
assert.True(t, applied)
assert.Empty(t, out)
}

func TestApplyDiffFilter_EnabledWithEmptyChangedFiles(t *testing.T) {
// The regression case. Empty changedFiles + diffEnabled=true must produce
// zero detections, NOT a full pass-through. Falling back to a full scan
// here was the May 2026 207-findings bug.
in := []*dsl.EnrichedDetection{detection("a.py"), detection("b.py")}
out, applied := applyDiffFilter(in, []string{}, true)
assert.True(t, applied, "applied flag must remain true even with empty changedFiles")
assert.Empty(t, out, "empty diff must filter ALL findings to zero, not pass them through")
}

func TestApplyDiffFilter_EnabledWithNilChangedFiles(t *testing.T) {
// nil slice is treated the same as an empty slice: nothing to match
// against, so everything is filtered out. Same regression guard.
in := []*dsl.EnrichedDetection{detection("a.py")}
out, applied := applyDiffFilter(in, nil, true)
assert.True(t, applied)
assert.Empty(t, out)
}

// --- countScannedFiles ---

func TestCountScannedFiles_DiffEnabledReturnsChangedCount(t *testing.T) {
assert.Equal(t, 5, countScannedFiles(true, 5, 999),
"diff-aware count must be the changed-files count, regardless of fallback")
}

func TestCountScannedFiles_DiffEnabledWithZeroChangedReturnsZero(t *testing.T) {
// Regression guard mirroring applyDiffFilter's empty-diff behaviour: an
// empty diff must report 0 files scanned, NOT silently use the fallback
// count (which would mask the empty-diff state in the JSON output).
assert.Equal(t, 0, countScannedFiles(true, 0, 250))
}

func TestCountScannedFiles_DiffDisabledReturnsFallback(t *testing.T) {
assert.Equal(t, 42, countScannedFiles(false, 7, 42),
"non-diff scans must report the unique-file count from the graph")
}

func TestCountScannedFiles_DiffDisabledWithZeroFallback(t *testing.T) {
assert.Equal(t, 0, countScannedFiles(false, 7, 0))
}
13 changes: 8 additions & 5 deletions sast-engine/cmd/scan.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,11 +356,14 @@ Examples:
// Merge container detections with code analysis detections
allEnriched = append(allEnriched, containerDetections...)

// Apply diff filter when diff-aware mode is active.
if diffAware && len(changedFiles) > 0 {
totalBefore := len(allEnriched)
diffFilter := output.NewDiffFilter(changedFiles)
allEnriched = diffFilter.Filter(allEnriched)
// Apply diff filter when diff-aware mode is active. Shared with
// cmd/ci.go via applyDiffFilter to keep the empty-diff contract
// (return zero findings, do NOT fall back to a full scan) honoured
// in both commands. See cmd/diff_filter.go for the rationale.
totalBefore := len(allEnriched)
var filterApplied bool
allEnriched, filterApplied = applyDiffFilter(allEnriched, changedFiles, diffAware)
if filterApplied {
logger.Progress("Diff filter: %d/%d findings in changed files", len(allEnriched), totalBefore)
}

Expand Down
8 changes: 6 additions & 2 deletions sast-engine/diff/git_provider.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,17 @@ func (p *GitDiffProvider) findMergeBase() (string, error) {
}

// diffFiles runs git diff --name-only to list changed files from merge-base to head.
// Uses --diff-filter=ACMR to include Added, Copied, Modified, and Renamed files only.
// Uses --diff-filter=ACMRD to include Added, Copied, Modified, Renamed, AND Deleted
// files. Deletions matter because callers use this list as a sentinel: an empty list
// signals "no files in the diff," and dropping deletions would make a delete-only PR
// look identical to an empty PR, which historically caused the diff filter at
// cmd/ci.go to fall back to a full-repo scan.
func (p *GitDiffProvider) diffFiles(mergeBase string) ([]string, error) {
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()

diffRange := mergeBase + ".." + p.HeadRef
cmd := exec.CommandContext(ctx, "git", "diff", "--name-only", "--diff-filter=ACMR", diffRange)
cmd := exec.CommandContext(ctx, "git", "diff", "--name-only", "--diff-filter=ACMRD", diffRange)
cmd.Dir = p.ProjectRoot

output, err := cmd.Output()
Expand Down
42 changes: 36 additions & 6 deletions sast-engine/diff/git_provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,18 +109,20 @@ func TestGitDiffProvider_BranchWithMergeBase(t *testing.T) {
assert.Equal(t, []string{"feature.py"}, files)
}

func TestGitDiffProvider_DeletedFileExcluded(t *testing.T) {
// Tests that deleted files are excluded (--diff-filter=ACMR does not include D).
func TestGitDiffProvider_DeletedFileIncluded(t *testing.T) {
// Deleted files MUST appear in GetChangedFiles. They drive a separate
// downstream signal: cmd/ci.go treats an empty list as "no source in the
// diff" and refuses to fall back to a full scan, so dropping deletions
// would make a delete-only PR look identical to an empty PR. See the
// comment on diffFiles for the full rationale (--diff-filter=ACMRD).
dir := setupTestRepo(t)

// Add a file on main.
writeFile(t, filepath.Join(dir, "to_delete.py"), "# will be deleted")
runGit(t, dir, "add", ".")
runGit(t, dir, "commit", "-m", "add file to delete")

runGit(t, dir, "checkout", "-b", "feature")

// Delete the file and add a new one.
require.NoError(t, os.Remove(filepath.Join(dir, "to_delete.py")))
writeFile(t, filepath.Join(dir, "new_file.py"), "# new")
runGit(t, dir, "add", ".")
Expand All @@ -134,8 +136,36 @@ func TestGitDiffProvider_DeletedFileExcluded(t *testing.T) {

files, err := provider.GetChangedFiles()
require.NoError(t, err)
// to_delete.py should NOT appear (deleted). Only new_file.py.
assert.Equal(t, []string{"new_file.py"}, files)
assert.ElementsMatch(t, []string{"to_delete.py", "new_file.py"}, files)
}

func TestGitDiffProvider_DeletionOnlyPR(t *testing.T) {
// Regression for the May 2026 207-findings incident: a PR that only
// deletes a file (no additions or modifications) must surface that
// deletion in GetChangedFiles. Returning an empty list here looks
// identical to an empty PR and historically caused cmd/ci.go to skip
// the diff filter entirely, dumping every full-scan finding.
dir := setupTestRepo(t)

writeFile(t, filepath.Join(dir, ".github", "workflows", "doomed.yml"), "name: doomed\n")
runGit(t, dir, "add", ".")
runGit(t, dir, "commit", "-m", "add workflow we'll later delete")

runGit(t, dir, "checkout", "-b", "feature")
require.NoError(t, os.Remove(filepath.Join(dir, ".github", "workflows", "doomed.yml")))
runGit(t, dir, "add", ".")
runGit(t, dir, "commit", "-m", "delete the workflow")

provider := &GitDiffProvider{
ProjectRoot: dir,
BaseRef: "main",
HeadRef: "HEAD",
}

files, err := provider.GetChangedFiles()
require.NoError(t, err)
assert.Equal(t, []string{".github/workflows/doomed.yml"}, files,
"deletion-only diff must surface the deleted path so callers don't mistake it for an empty PR")
}

func TestGitDiffProvider_EmptyDiff(t *testing.T) {
Expand Down
15 changes: 8 additions & 7 deletions sast-engine/output/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@ func NewDiffFilter(changedFiles []string) *DiffFilter {
}

// Filter returns only detections whose RelPath is in the changed files set.
// If no changed files were provided (empty set), all detections are returned.
// An empty set filters everything out: a DiffFilter constructed with no
// changed files represents "nothing in the diff to match," and the right
// answer is zero detections, not pass-through. Callers that want "no
// filtering" must not call Filter at all (see cmd.applyDiffFilter which
// gates on the diffEnabled flag for exactly this reason). Falling back to
// pass-through was the May 2026 207-findings regression.
func (f *DiffFilter) Filter(detections []*dsl.EnrichedDetection) []*dsl.EnrichedDetection {
if len(f.changedFiles) == 0 {
return detections
}
filtered := make([]*dsl.EnrichedDetection, 0, len(detections))
for _, det := range detections {
if f.changedFiles[det.Location.RelPath] {
Expand All @@ -35,10 +37,9 @@ func (f *DiffFilter) Filter(detections []*dsl.EnrichedDetection) []*dsl.Enriched
}

// FilteredCount returns the number of detections that would be removed.
// With an empty changed-files set, every detection would be removed, so
// the count equals len(detections). See Filter for the rationale.
func (f *DiffFilter) FilteredCount(detections []*dsl.EnrichedDetection) int {
if len(f.changedFiles) == 0 {
return 0
}
count := 0
for _, det := range detections {
if !f.changedFiles[det.Location.RelPath] {
Expand Down
16 changes: 11 additions & 5 deletions sast-engine/output/filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,17 @@ func TestDiffFilter_Filter(t *testing.T) {
wantRelPaths: []string{"app/views.py", "app/auth.py"},
},
{
name: "empty changed files returns all detections",
// An empty changed-files set means "nothing in the diff to match";
// the right answer is zero detections, not pass-through. Callers
// that want "no filtering" must skip Filter entirely.
name: "empty changed files filters everything out",
changedFiles: []string{},
detections: []*dsl.EnrichedDetection{
makeDetection("app/views.py", "critical"),
makeDetection("app/models.py", "high"),
},
wantCount: 2,
wantRelPaths: []string{"app/views.py", "app/models.py"},
wantCount: 0,
wantRelPaths: nil,
},
{
name: "nil detections",
Expand Down Expand Up @@ -257,12 +260,15 @@ func TestDiffFilter_FilteredCount(t *testing.T) {
wantFiltered: 2,
},
{
name: "empty changed files means no filtering",
// With an empty changed-files set, every detection would be
// removed by Filter (see Filter's contract): FilteredCount must
// report that as well, not 0.
name: "empty changed files reports all as removed",
changedFiles: []string{},
detections: []*dsl.EnrichedDetection{
makeDetection("app/views.py", "critical"),
},
wantFiltered: 0,
wantFiltered: 1,
},
{
name: "nil detections",
Expand Down
Loading