Skip to content

Commit d8fd9f9

Browse files
aksOps and claude committed
fix(parser): unquote TOML keys and section headers
Apache Airflow's `.cherry_picker.toml` uses TOML's quoted-key form, e.g. `"check_sha" = "..."`. `parseTOML` was reading the left-hand side as raw text, including the literal quotes. The TomlStructureDetector then emitted node IDs like `toml:.cherry_picker.toml:"check_sha"`, while the CONTAINS edges (and any downstream lookup) referenced the same key in a differently quoted form, so the IDs no longer matched. Kuzu's BulkLoad aborted with: `Copy exception: Unable to find primary key value "toml:.cherry_picker.toml:""check_sha"""`. The bug was symmetric for `["quoted-section"]` headers. Fix both: call the existing `unquote` helper on the key/section before storing. Regression tests added in structured_test.go (new file). End-to-end: `codeiq enrich ~/projects/polyglot-bench/airflow` now exits 0 (was exit 2): 95k nodes, 246k edges, 165 services loaded. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
1 parent 4a6d82a commit d8fd9f9

2 files changed

Lines changed: 62 additions & 2 deletions

File tree

go/internal/parser/structured.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ func parseTOML(source []byte) ParsedEnvelope {
162162
continue
163163
}
164164
if strings.HasPrefix(raw, "[") && strings.HasSuffix(raw, "]") {
165-
section := strings.TrimSpace(raw[1 : len(raw)-1])
165+
section := unquote(strings.TrimSpace(raw[1 : len(raw)-1]))
166166
currentSection = section
167167
// Walk into a nested map; only create the top-level section in
168168
// data — nested namespacing is preserved by the dotted key.
@@ -176,7 +176,7 @@ func parseTOML(source []byte) ParsedEnvelope {
176176
if eq <= 0 {
177177
continue
178178
}
179-
key := strings.TrimSpace(raw[:eq])
179+
key := unquote(strings.TrimSpace(raw[:eq]))
180180
val := strings.TrimSpace(raw[eq+1:])
181181
val = unquote(val)
182182
if currentSection == "" {
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package parser
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestParseTOMLUnquotesKeys(t *testing.T) {
8+
// `.cherry_picker.toml` in apache/airflow has `"check_sha" = "..."` —
9+
// a quoted top-level key. Pre-fix the key string included the literal
10+
// quotes which propagated into node IDs like
11+
// `toml:.cherry_picker.toml:"check_sha"`, and CONTAINS edges then
12+
// referenced PKs that the bulk-load couldn't resolve.
13+
src := []byte(`team = "apache"
14+
repo = "airflow"
15+
"check_sha" = "abc123"
16+
'literal_key' = "single-quoted"
17+
`)
18+
env := parseTOML(src)
19+
data, ok := env["data"].(map[string]any)
20+
if !ok {
21+
t.Fatalf("envelope missing data map: %#v", env)
22+
}
23+
for k, want := range map[string]string{
24+
"team": "apache",
25+
"repo": "airflow",
26+
"check_sha": "abc123",
27+
"literal_key": "single-quoted",
28+
} {
29+
got, ok := data[k].(string)
30+
if !ok {
31+
t.Errorf("key %q missing or non-string: %#v", k, data[k])
32+
continue
33+
}
34+
if got != want {
35+
t.Errorf("data[%q] = %q, want %q", k, got, want)
36+
}
37+
}
38+
// Negative: a quoted form must NOT appear as its own key.
39+
for _, badKey := range []string{`"check_sha"`, `'literal_key'`} {
40+
if _, exists := data[badKey]; exists {
41+
t.Errorf("data still has quote-bearing key %q — unquote not applied", badKey)
42+
}
43+
}
44+
}
45+
46+
func TestParseTOMLUnquotesSectionHeaders(t *testing.T) {
47+
// Less common in practice, but TOML spec allows `["foo.bar"]` quoted
48+
// section headers. Same fix applies — unquote before using as map key.
49+
src := []byte(`["quoted-section"]
50+
inner = "v"
51+
`)
52+
env := parseTOML(src)
53+
data := env["data"].(map[string]any)
54+
if _, ok := data["quoted-section"]; !ok {
55+
t.Errorf("missing top-level section 'quoted-section': %#v", data)
56+
}
57+
if _, ok := data[`"quoted-section"`]; ok {
58+
t.Errorf("section header retained literal quotes — unquote not applied")
59+
}
60+
}

0 commit comments

Comments
 (0)