-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparser.go
More file actions
255 lines (243 loc) · 6.07 KB
/
parser.go
File metadata and controls
255 lines (243 loc) · 6.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
package parser
import (
"context"
"fmt"
"strings"
sitter "github.com/smacker/go-tree-sitter"
)
// Language identifies a source language recognized by this package. The zero
// value is LanguageUnknown. Java, Python, TypeScript and Go are the languages
// with a tree-sitter grammar wired in (see tsLanguage); the remaining values
// are handled by the structured / regex paths (see isStructuredOrTextual).
type Language int
// Language values. They are assigned sequentially via iota, so the order of
// this block determines each constant's numeric value; append new languages
// rather than inserting them if the numbers are persisted anywhere
// (NOTE(review): confirm whether any caller stores the raw integers).
const (
// LanguageUnknown is the zero value and marks an unrecognized language.
// Parse returns an error for it.
LanguageUnknown Language = iota
// Languages with a tree-sitter grammar (resolved by tsLanguage).
LanguageJava
LanguagePython
LanguageTypeScript
LanguageGo
// Structured / textual languages added in phase 4 (batch 1 / 2). No
// tree-sitter grammar — the analyzer parses these via the structured
// parser in internal/parser/structured.go.
LanguageYaml
LanguageJSON
LanguageTOML
LanguageINI
LanguageProperties
LanguageSQL
LanguageBatch
LanguageVue
LanguageSvelte
// Additional languages discovered through file extension but parsed via
// regex/structured paths (no tree-sitter grammar wired in).
LanguageCSharp
LanguageKotlin
LanguageScala
LanguageCpp
LanguageRust
LanguageTerraform
LanguageBicep
LanguageProto
LanguageDockerfile
LanguageXML
LanguageMarkdown
LanguagePowerShell
LanguageBash
LanguageRuby
LanguageGroovy
)
// String returns the lowercase name of the language, e.g. "java" or "yaml".
// LanguageUnknown and any value without a registered name yield "unknown".
func (l Language) String() string {
	// Table indexed by Language value. LanguageUnknown is deliberately left
	// out so its slot stays empty and falls through to "unknown" below.
	names := [...]string{
		LanguageJava:       "java",
		LanguagePython:     "python",
		LanguageTypeScript: "typescript",
		LanguageGo:         "go",
		LanguageYaml:       "yaml",
		LanguageJSON:       "json",
		LanguageTOML:       "toml",
		LanguageINI:        "ini",
		LanguageProperties: "properties",
		LanguageSQL:        "sql",
		LanguageBatch:      "batch",
		LanguageVue:        "vue",
		LanguageSvelte:     "svelte",
		LanguageCSharp:     "csharp",
		LanguageKotlin:     "kotlin",
		LanguageScala:      "scala",
		LanguageCpp:        "cpp",
		LanguageRust:       "rust",
		LanguageTerraform:  "terraform",
		LanguageBicep:      "bicep",
		LanguageProto:      "proto",
		LanguageDockerfile: "dockerfile",
		LanguageXML:        "xml",
		LanguageMarkdown:   "markdown",
		LanguagePowerShell: "powershell",
		LanguageBash:       "bash",
		LanguageRuby:       "ruby",
		LanguageGroovy:     "groovy",
	}
	// Out-of-range values and empty slots have no name.
	if l < 0 || int(l) >= len(names) || names[l] == "" {
		return "unknown"
	}
	return names[l]
}
// LanguageFromExtension maps a file extension (including the leading dot,
// e.g. ".java") to a Language. Matching is case-insensitive. It returns
// LanguageUnknown for anything unsupported.
//
// Only the extension is considered, so an extensionless file such as a plain
// "Dockerfile" cannot be recognized here; callers presumably detect those by
// file name (NOTE(review): confirm against the file walker).
func LanguageFromExtension(ext string) Language {
	normalized := strings.ToLower(ext)
	switch normalized {
	case ".java":
		return LanguageJava
	case ".py", ".pyw":
		return LanguagePython
	case ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs":
		// JavaScript flavours share the TypeScript language value.
		return LanguageTypeScript
	case ".go":
		return LanguageGo
	case ".yaml", ".yml":
		return LanguageYaml
	case ".json":
		return LanguageJSON
	case ".toml":
		return LanguageTOML
	case ".ini", ".cfg":
		return LanguageINI
	case ".properties":
		return LanguageProperties
	case ".sql":
		return LanguageSQL
	case ".bat", ".cmd":
		return LanguageBatch
	case ".vue":
		return LanguageVue
	case ".svelte":
		return LanguageSvelte
	case ".cs":
		return LanguageCSharp
	case ".kt", ".kts":
		return LanguageKotlin
	case ".scala", ".sc":
		return LanguageScala
	case ".cpp", ".cc", ".cxx", ".hpp", ".hh", ".hxx", ".h":
		return LanguageCpp
	case ".rs":
		return LanguageRust
	case ".tf", ".tfvars":
		return LanguageTerraform
	case ".bicep":
		return LanguageBicep
	case ".proto":
		return LanguageProto
	case ".dockerfile":
		// Without this case LanguageDockerfile was the only declared
		// language that no extension could ever produce.
		return LanguageDockerfile
	case ".xml":
		return LanguageXML
	case ".md", ".markdown":
		return LanguageMarkdown
	case ".ps1", ".psm1", ".psd1":
		return LanguagePowerShell
	case ".sh", ".bash", ".zsh":
		return LanguageBash
	case ".rb":
		return LanguageRuby
	case ".groovy", ".gradle":
		return LanguageGroovy
	default:
		return LanguageUnknown
	}
}
// Tree wraps a parsed *sitter.Tree along with the language it was parsed as
// and the source bytes it was parsed from, so detectors can pull node text
// via tree-sitter's byte-range API.
type Tree struct {
// Lang is the language that was passed to Parse.
Lang Language
// Source is the byte slice handed to the parser; Parse stores it as-is
// without copying.
Source []byte
// Root is the tree-sitter parse tree; it is released by Close.
Root *sitter.Tree
}
// Close releases the tree-sitter parse tree held in Root.
//
// It is a no-op on a nil *Tree or on a Tree whose Root is nil. Parse returns
// a nil *Tree (with a nil error) for languages that have no tree-sitter
// grammar, so this lets callers defer Close on the result of Parse without
// first checking it for nil.
func (t *Tree) Close() {
	if t == nil || t.Root == nil {
		return
	}
	t.Root.Close()
}
// Parse runs tree-sitter over source using the grammar for lang and returns
// the result wrapped in a Tree, which the caller must Close.
//
// Outcomes by language:
//   - LanguageUnknown: an error (truly unsupported).
//   - Any language for which isStructuredOrTextual reports true: (nil, nil).
//     Those have no tree-sitter grammar and are handled by the structured /
//     regex paths, so this is a soft miss rather than an error.
//   - Otherwise: the grammar from tsLanguage is used; a failure to resolve
//     the grammar or to parse is returned as an error.
func Parse(lang Language, source []byte) (*Tree, error) {
	switch {
	case lang == LanguageUnknown:
		return nil, fmt.Errorf("unsupported language: %v", lang)
	case isStructuredOrTextual(lang):
		// Structured / textual languages are a soft miss, not an error.
		return nil, nil
	}

	grammar, err := tsLanguage(lang)
	if err != nil {
		return nil, err
	}

	parser := sitter.NewParser()
	parser.SetLanguage(grammar)

	parsed, parseErr := parser.ParseCtx(context.Background(), nil, source)
	if parseErr != nil {
		return nil, fmt.Errorf("tree-sitter parse: %w", parseErr)
	}

	return &Tree{Lang: lang, Source: source, Root: parsed}, nil
}
// isStructuredOrTextual reports whether l is one of the languages that has
// no tree-sitter grammar wired in and is therefore handled by the structured
// or regex parser paths instead.
func isStructuredOrTextual(l Language) bool {
	switch l {
	// Structured / textual formats.
	case LanguageYaml,
		LanguageJSON,
		LanguageTOML,
		LanguageINI,
		LanguageProperties,
		LanguageSQL,
		LanguageBatch,
		LanguageVue,
		LanguageSvelte:
		return true

	// Regex-handled languages: no tree-sitter grammar wired; detectors
	// consume the raw content directly.
	case LanguageCSharp,
		LanguageKotlin,
		LanguageScala,
		LanguageCpp,
		LanguageRust,
		LanguageTerraform,
		LanguageBicep,
		LanguageProto,
		LanguageDockerfile,
		LanguageXML,
		LanguageMarkdown,
		LanguagePowerShell,
		LanguageBash,
		LanguageRuby,
		LanguageGroovy:
		return true

	default:
		return false
	}
}
// NodeText returns the text of the tree-sitter node n, as reported by
// n.Content for the given source bytes.
//
// A nil node yields the empty string, so callers can pass the result of a
// node lookup straight through without a separate nil check.
func NodeText(n *sitter.Node, source []byte) string {
	if n == nil {
		return ""
	}
	return n.Content(source)
}
// tsLanguage resolves the tree-sitter grammar for l. Only Java, Python,
// TypeScript and Go have a grammar; every other value produces an
// "unsupported language" error.
func tsLanguage(l Language) (*sitter.Language, error) {
	var grammar *sitter.Language

	switch l {
	case LanguageJava:
		grammar = javaLanguage()
	case LanguagePython:
		grammar = pythonLanguage()
	case LanguageTypeScript:
		grammar = typescriptLanguage()
	case LanguageGo:
		grammar = goLanguage()
	default:
		return nil, fmt.Errorf("unsupported language: %v", l)
	}

	return grammar, nil
}