Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions go/internal/graph/bulk.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,12 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error {
// Cleanup runs whether COPY succeeds or fails.
defer os.Remove(tmp.Name())

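// Use pipe '|' as the field delimiter so that JSON property values
// containing commas (e.g. {"language":"python","module":"glob"}) are not
// mis-parsed by Kuzu's CSV reader. Note that json.Marshal does not escape
// '|': a property string that itself contains '|' is emitted verbatim, so
// such a value still depends on csv.Writer's field quoting being understood
// by Kuzu's reader.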
w := csv.NewWriter(tmp)
w.Comma = '|'
for _, n := range batch {
row, err := encodeNodeRow(n)
if err != nil {
Expand All @@ -96,8 +101,9 @@ func (s *Store) copyNodeBatch(batch []*model.CodeNode) error {

// Kuzu COPY FROM with explicit column list. ToSlash for Windows path
// portability — Kuzu's parser accepts forward slashes on all platforms.
// DELIM='|' matches the pipe-separated staging file written above.
q := fmt.Sprintf(
"COPY CodeNode(%s) FROM '%s' (header=false)",
"COPY CodeNode(%s) FROM '%s' (header=false, DELIM='|')",
strings.Join(nodeColumns, ", "),
filepath.ToSlash(tmp.Name()),
)
Expand Down Expand Up @@ -226,7 +232,9 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro
}
defer os.Remove(tmp.Name())

// Use pipe '|' as the field delimiter — see copyNodeBatch for the rationale.
w := csv.NewWriter(tmp)
w.Comma = '|'
for _, e := range batch {
props, err := json.Marshal(e.Properties)
if err != nil {
Expand Down Expand Up @@ -255,8 +263,9 @@ func (s *Store) copyEdgeBatch(kind model.EdgeKind, batch []*model.CodeEdge) erro
return fmt.Errorf("graph: csv close: %w", err)
}

// DELIM='|' matches the pipe-separated staging file written above.
q := fmt.Sprintf(
"COPY %s FROM '%s' (header=false)",
"COPY %s FROM '%s' (header=false, DELIM='|')",
relTableName(kind),
filepath.ToSlash(tmp.Name()),
)
Expand Down
78 changes: 78 additions & 0 deletions go/internal/graph/bulk_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,84 @@ func TestBulkLoadEdgesGroupedByKind(t *testing.T) {
}
}

// TestBulkLoadEdgesCommaInProperties guards against a regression in edge bulk
// loading: a Properties map that marshals to JSON with commas in it
// (e.g. {"language":"python","module":"glob"}) used to make Kuzu's CSV parser
// count more fields than expected and abort with
// "Copy exception: expected 6 values per row, but got more". The staging file
// is now pipe-separated (DELIM='|'), so commas inside the JSON are no longer
// treated as field boundaries.
//
// NOTE(review): the property values used here contain no '|', so this test
// does not exercise a delimiter character appearing inside the JSON itself.
func TestBulkLoadEdgesCommaInProperties(t *testing.T) {
	dbPath := filepath.Join(t.TempDir(), "g.kuzu")
	store, err := graph.Open(dbPath)
	if err != nil {
		t.Fatal(err)
	}
	defer store.Close()
	if schemaErr := store.ApplySchema(); schemaErr != nil {
		t.Fatal(schemaErr)
	}

	// Load the two endpoint nodes before the edge that connects them.
	importer := &model.CodeNode{
		ID:    "py:file:check_structure.py",
		Kind:  model.NodeModule,
		Label: "check_structure.py",
	}
	imported := &model.CodeNode{
		ID:    "py:external:glob",
		Kind:  model.NodeExternal,
		Label: "glob",
	}
	if loadErr := store.BulkLoadNodes([]*model.CodeNode{importer, imported}); loadErr != nil {
		t.Fatal(loadErr)
	}

	// Two properties are enough to put a comma into the marshalled JSON.
	importEdge := &model.CodeEdge{
		ID:         "py:file:check_structure.py->py:external:glob:imports",
		Kind:       model.EdgeImports,
		SourceID:   "py:file:check_structure.py",
		TargetID:   "py:external:glob",
		Confidence: model.ConfidenceLexical,
		Source:     "GenericImportsDetector",
		Properties: map[string]any{
			"language": "python",
			"module":   "glob",
		},
	}
	if loadErr := store.BulkLoadEdges([]*model.CodeEdge{importEdge}); loadErr != nil {
		t.Fatalf("BulkLoadEdges with comma-bearing Properties: %v", loadErr)
	}

	// Exactly one IMPORTS relationship must have been stored.
	rows, err := store.Cypher("MATCH ()-[r:IMPORTS]->() RETURN r.id AS id")
	if err != nil {
		t.Fatal(err)
	}
	if got := len(rows); got != 1 {
		t.Fatalf("want 1 IMPORTS row, got %d: %v", got, rows)
	}
}

// TestBulkLoadNodesCommaInProperties is the node-side counterpart of the edge
// regression test: it bulk-loads a single node whose Properties marshal to
// JSON containing commas, both between the keys and inside one of the string
// values, and then checks that the node can be found by its id.
func TestBulkLoadNodesCommaInProperties(t *testing.T) {
	dbPath := filepath.Join(t.TempDir(), "g.kuzu")
	store, err := graph.Open(dbPath)
	if err != nil {
		t.Fatal(err)
	}
	defer store.Close()
	if schemaErr := store.ApplySchema(); schemaErr != nil {
		t.Fatal(schemaErr)
	}

	appNode := &model.CodeNode{
		ID:    "py:file:app.py",
		Kind:  model.NodeModule,
		Label: "app.py",
		Properties: map[string]any{
			"language": "python",
			// The value itself carries commas, not just the JSON structure.
			"module": "flask,requests,os",
		},
	}
	if loadErr := store.BulkLoadNodes([]*model.CodeNode{appNode}); loadErr != nil {
		t.Fatalf("BulkLoadNodes with comma-bearing Properties: %v", loadErr)
	}

	// The node must be retrievable by id after the load.
	rows, err := store.Cypher("MATCH (n:CodeNode {id: 'py:file:app.py'}) RETURN n.id AS id")
	if err != nil {
		t.Fatal(err)
	}
	if got := len(rows); got != 1 {
		t.Fatalf("want 1 node, got %d: %v", got, rows)
	}
}

// TestBulkLoadEdgesEmpty — zero edges is a no-op like the node path.
func TestBulkLoadEdgesEmpty(t *testing.T) {
s, err := graph.Open(filepath.Join(t.TempDir(), "g.kuzu"))
Expand Down
Loading