Skip to content

Commit 32f8180

Browse files
aksOps and claude committed
feat(analyzer): confidence-aware node + canonical edge dedup with surfaced counts
Plan Phase 1.1, 1.2, 1.5 — make the graph deterministic and canonical.

Before: GraphBuilder used first-write-wins on node ID. A class touched by both ClassHierarchyDetector and SpringRestDetector would keep whichever landed first (often the lower-confidence LEXICAL detector) and silently drop the higher-confidence framework annotations.

After:
- mergeNode picks the higher-Confidence emission as the survivor.
- Survivor gap-fills missing FQN / Module / FilePath / LineStart / LineEnd / Layer / Source from the donor.
- Properties union with non-clobber semantics: donor only fills keys the survivor doesn't already have (preserves the high-confidence framework/auth_type/etc).
- Annotations unioned and sorted for determinism.

Edges now dedupe by canonical (sourceID, targetID, kind) tuple instead of detector-assigned edge ID strings — two detectors emitting "a calls b" with different edge ID conventions now collapse to one edge, with the higher-confidence one winning.

Snapshot surfaces DedupedNodes / DedupedEdges / DroppedEdges counts. codeiq index prints "Deduped: N nodes, M edges Dropped: K phantom edges" when any of those are non-zero, so operators can see graph health.

Tests (TDD per CLAUDE.md):
- TestGraphBuilderDedup_HigherConfidenceWins
- TestGraphBuilderDedup_AnnotationsUnioned
- TestGraphBuilderDedup_PropertiesMergeNonClobber
- TestGraphBuilderEdgeDedup_ByKey
- TestGraphBuilderEdgeDedup_DifferentKindKept
- TestGraphBuilderEdgeDedup_PropertiesUnioned
- TestGraphBuilderStats_DedupAndDropCounts

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 1dc466c commit 32f8180

5 files changed

Lines changed: 417 additions & 20 deletions

File tree

go/internal/analyzer/analyzer.go

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,17 @@ func NewAnalyzer(opts Options) *Analyzer {
4343
}
4444

4545
// Stats reports per-run counts.
46+
//
47+
// Plan §1.5 — DedupedNodes/DedupedEdges/DroppedEdges expose dedup activity
48+
// so operators can see "graph collapsed 312 duplicate nodes, dropped 14
49+
// phantom edges" — the visibility is what makes "meaningful" diagnosable.
4650
type Stats struct {
47-
Files int
48-
Nodes int
49-
Edges int
51+
Files int
52+
Nodes int
53+
Edges int
54+
DedupedNodes int
55+
DedupedEdges int
56+
DroppedEdges int
5057
}
5158

5259
// Run executes FileDiscovery → parse → detectors → GraphBuilder → cache writes
@@ -86,9 +93,12 @@ func (a *Analyzer) Run(root string) (Stats, error) {
8693

8794
snap := gb.Snapshot()
8895
return Stats{
89-
Files: len(files),
90-
Nodes: len(snap.Nodes),
91-
Edges: len(snap.Edges),
96+
Files: len(files),
97+
Nodes: len(snap.Nodes),
98+
Edges: len(snap.Edges),
99+
DedupedNodes: snap.DedupedNodes,
100+
DedupedEdges: snap.DedupedEdges,
101+
DroppedEdges: snap.DroppedEdges,
92102
}, nil
93103
}
94104

go/internal/analyzer/graph_builder.go

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -9,50 +9,82 @@ import (
99
)
1010

1111
// GraphBuilder buffers detector results across batches. Concurrent-safe.
12-
// Snapshot() produces a deterministic sorted view with dangling edges
13-
// dropped — the same determinism contract as the Java GraphBuilder.
12+
//
13+
// Phase 1 (plan §1.1, §1.2):
14+
// - Nodes are deduped by ID via mergeNode (confidence-aware).
15+
// - Edges are deduped by canonical (source, target, kind) key via mergeEdge.
16+
//
17+
// Snapshot() produces a deterministic sorted view with phantom edges (those
18+
// whose endpoint is still missing) dropped, and exposes the dedup/drop
19+
// counts so the CLI can surface "deduped N, dropped K" diagnostics.
1420
type GraphBuilder struct {
1521
mu sync.Mutex
1622
nodes map[string]*model.CodeNode
17-
edges map[string]*model.CodeEdge
23+
edges map[edgeKey]*model.CodeEdge
24+
25+
// Counters incremented as Add() observes duplicates and used by
26+
// Snapshot() to populate the surfaced stats.
27+
dedupedNodes int
28+
dedupedEdges int
1829
}
1930

2031
// NewGraphBuilder returns an empty builder.
2132
func NewGraphBuilder() *GraphBuilder {
2233
return &GraphBuilder{
2334
nodes: make(map[string]*model.CodeNode),
24-
edges: make(map[string]*model.CodeEdge),
35+
edges: make(map[edgeKey]*model.CodeEdge),
2536
}
2637
}
2738

28-
// Add merges a detector result. Duplicate node IDs are dropped (first write
29-
// wins — matches Java behaviour). Duplicate edge IDs likewise.
39+
// Add merges a detector result. Duplicate node IDs and duplicate edge
40+
// (source, target, kind) tuples collapse with confidence-aware merging.
3041
func (b *GraphBuilder) Add(r *detector.Result) {
3142
if r == nil {
3243
return
3344
}
3445
b.mu.Lock()
3546
defer b.mu.Unlock()
3647
for _, n := range r.Nodes {
37-
if _, exists := b.nodes[n.ID]; !exists {
38-
b.nodes[n.ID] = n
48+
if existing, ok := b.nodes[n.ID]; ok {
49+
b.nodes[n.ID] = mergeNode(existing, n)
50+
b.dedupedNodes++
51+
continue
3952
}
53+
b.nodes[n.ID] = n
4054
}
4155
for _, e := range r.Edges {
42-
if _, exists := b.edges[e.ID]; !exists {
43-
b.edges[e.ID] = e
56+
k := makeEdgeKey(e)
57+
if existing, ok := b.edges[k]; ok {
58+
b.edges[k] = mergeEdge(existing, e)
59+
b.dedupedEdges++
60+
continue
4461
}
62+
b.edges[k] = e
4563
}
4664
}
4765

48-
// Snapshot is the deterministic, sorted view of buffered state with dangling
49-
// edges (source or target node missing) dropped.
66+
// Snapshot is the deterministic, sorted view of buffered state with
67+
// phantom edges (source or target node missing) dropped. It also exposes
68+
// the count of duplicate emissions collapsed during Add() and the count
69+
// of dangling edges dropped during this Snapshot call.
5070
type Snapshot struct {
5171
Nodes []*model.CodeNode
5272
Edges []*model.CodeEdge
73+
74+
// DedupedNodes is the count of node emissions that collided with an
75+
// existing node ID and were merged in. Zero on a graph where no
76+
// detector double-emitted.
77+
DedupedNodes int
78+
// DedupedEdges is the same for edges by (source, target, kind).
79+
DedupedEdges int
80+
// DroppedEdges is the count of edges that had no matching source or
81+
// target node in the final node set — phantom references usually
82+
// caused by a linker pointing at a node that no detector emitted.
83+
DroppedEdges int
5384
}
5485

55-
// Snapshot returns the current state as a sorted, dangling-edge-free Snapshot.
86+
// Snapshot returns the current state as a sorted, dangling-edge-free
87+
// Snapshot with surfaced dedup/drop counts.
5688
func (b *GraphBuilder) Snapshot() Snapshot {
5789
b.mu.Lock()
5890
defer b.mu.Unlock()
@@ -63,16 +95,25 @@ func (b *GraphBuilder) Snapshot() Snapshot {
6395
sort.Slice(nodes, func(i, j int) bool { return nodes[i].ID < nodes[j].ID })
6496

6597
edges := make([]*model.CodeEdge, 0, len(b.edges))
98+
dropped := 0
6699
for _, e := range b.edges {
67100
if _, src := b.nodes[e.SourceID]; !src {
101+
dropped++
68102
continue
69103
}
70104
if _, tgt := b.nodes[e.TargetID]; !tgt {
105+
dropped++
71106
continue
72107
}
73108
edges = append(edges, e)
74109
}
75110
sort.Slice(edges, func(i, j int) bool { return edges[i].ID < edges[j].ID })
76111

77-
return Snapshot{Nodes: nodes, Edges: edges}
112+
return Snapshot{
113+
Nodes: nodes,
114+
Edges: edges,
115+
DedupedNodes: b.dedupedNodes,
116+
DedupedEdges: b.dedupedEdges,
117+
DroppedEdges: dropped,
118+
}
78119
}

go/internal/analyzer/merger.go

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
package analyzer
2+
3+
import (
4+
"sort"
5+
6+
"github.com/randomcodespace/codeiq/go/internal/model"
7+
)
8+
9+
// mergeNode merges incoming into existing, picking the higher-confidence
10+
// node as the survivor, then filling gaps and unioning properties /
11+
// annotations. Returns the survivor (which is mutated in place).
12+
//
13+
// Plan §1.1 — semantics:
14+
// - Higher Confidence wins; ties keep existing.
15+
// - Non-empty FQN / Module / FilePath / LineStart / LineEnd / Layer
16+
// fill in from whichever side has them.
17+
// - Properties: incoming wins per-key only when existing's value is nil
18+
// or missing (do not clobber framework/auth_type already stamped by a
19+
// higher-confidence detector).
20+
// - Annotations are unioned and sorted for determinism.
21+
func mergeNode(existing, incoming *model.CodeNode) *model.CodeNode {
22+
if existing == nil {
23+
return incoming
24+
}
25+
if incoming == nil {
26+
return existing
27+
}
28+
29+
survivor := existing
30+
donor := incoming
31+
if incoming.Confidence > existing.Confidence {
32+
survivor = incoming
33+
donor = existing
34+
}
35+
36+
// Gap-fill scalar fields from the donor when the survivor has none.
37+
if survivor.FQN == "" && donor.FQN != "" {
38+
survivor.FQN = donor.FQN
39+
}
40+
if survivor.Module == "" && donor.Module != "" {
41+
survivor.Module = donor.Module
42+
}
43+
if survivor.FilePath == "" && donor.FilePath != "" {
44+
survivor.FilePath = donor.FilePath
45+
}
46+
if survivor.LineStart == 0 && donor.LineStart != 0 {
47+
survivor.LineStart = donor.LineStart
48+
}
49+
if survivor.LineEnd == 0 && donor.LineEnd != 0 {
50+
survivor.LineEnd = donor.LineEnd
51+
}
52+
if survivor.Layer == model.LayerUnknown && donor.Layer != model.LayerUnknown {
53+
survivor.Layer = donor.Layer
54+
}
55+
if survivor.Source == "" && donor.Source != "" {
56+
survivor.Source = donor.Source
57+
}
58+
59+
// Property union: donor fills missing keys; never clobbers existing.
60+
if survivor.Properties == nil {
61+
survivor.Properties = map[string]any{}
62+
}
63+
for k, v := range donor.Properties {
64+
if _, exists := survivor.Properties[k]; exists {
65+
continue
66+
}
67+
survivor.Properties[k] = v
68+
}
69+
70+
// Annotation union — dedup + sort for determinism.
71+
survivor.Annotations = unionSorted(survivor.Annotations, donor.Annotations)
72+
73+
return survivor
74+
}
75+
76+
// mergeEdge merges two edges with the same EdgeKey (src, tgt, kind).
77+
// Higher-confidence wins; ties keep existing. Properties unioned with
78+
// non-clobber semantics.
79+
func mergeEdge(existing, incoming *model.CodeEdge) *model.CodeEdge {
80+
if existing == nil {
81+
return incoming
82+
}
83+
if incoming == nil {
84+
return existing
85+
}
86+
87+
survivor := existing
88+
donor := incoming
89+
if incoming.Confidence > existing.Confidence {
90+
survivor = incoming
91+
donor = existing
92+
}
93+
if survivor.Source == "" && donor.Source != "" {
94+
survivor.Source = donor.Source
95+
}
96+
if survivor.Properties == nil {
97+
survivor.Properties = map[string]any{}
98+
}
99+
for k, v := range donor.Properties {
100+
if _, exists := survivor.Properties[k]; exists {
101+
continue
102+
}
103+
survivor.Properties[k] = v
104+
}
105+
return survivor
106+
}
107+
108+
// unionSorted returns the distinct strings found in a and b in ascending
// order. The result is always a freshly allocated, non-nil slice (empty
// when both inputs are empty); neither input is modified.
func unionSorted(a, b []string) []string {
	distinct := make(map[string]struct{}, len(a)+len(b))
	for _, list := range [2][]string{a, b} {
		for _, item := range list {
			distinct[item] = struct{}{}
		}
	}

	result := make([]string, 0, len(distinct))
	for item := range distinct {
		result = append(result, item)
	}
	// Map iteration order is random; sorting makes the output stable.
	sort.Strings(result)
	return result
}
123+
124+
// edgeKey is the canonical key used to dedupe edges. Two edges with the
125+
// same (source, target, kind) are considered the same edge regardless of
126+
// detector-assigned ID strings.
127+
type edgeKey struct {
128+
source string
129+
target string
130+
kind model.EdgeKind
131+
}
132+
133+
func makeEdgeKey(e *model.CodeEdge) edgeKey {
134+
return edgeKey{source: e.SourceID, target: e.TargetID, kind: e.Kind}
135+
}

0 commit comments

Comments
 (0)