Skip to content

Commit 1dc466c

Browse files
aksOpsclaude
andcommitted
perf(detector/auth): tight pre-screen on CertificateAuthDetector
CPU profile of indexing PSScriptAnalyzer (593 files, mostly C#) showed CertificateAuthDetector consuming 99% of CPU (137 of 138 sample-seconds in regexp.match). Root cause: the detector's file-level pre-screen included .pem/.crt/.cert path-extension keywords that match almost every .NET file via `using System.Security.Cryptography.X509Certificates;` and similar, defeating the gate. Fix: split out a STRICT keyword list (certStrictKeywords) that drops the path-extension keywords and keeps only high-signal markers (SSLContext, X509AuthenticationFilter, AzureAd, etc). Used as both file-level and per-line gate before running the 20 per-pattern regexes. Bench (rm -rf .codeiq && time codeiq index PSScriptAnalyzer): - before: 42.9s wall, 2m20s CPU - after: 18.4s wall, 32.5s CPU Node counts unchanged (1674 nodes / 872 edges). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 336309c commit 1dc466c

1 file changed

Lines changed: 38 additions & 9 deletions

File tree

go/internal/detector/auth/certificate.go

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,26 @@ var (
6262
}
6363
certCertPathRE = regexp.MustCompile(`['"]([^'"]*\.(?:pem|crt|key|cert|pfx|p12))['"]`)
6464
certTenantIDRE = regexp.MustCompile(`AZURE_TENANT_ID\s*[=:]\s*['"]?([a-f0-9-]+)['"]?`)
65-
certPreScreen = regexp.MustCompile(
66-
`ssl_verify_client|requestCert|clientAuth|X509|` +
67-
`AddCertificateForwarding|CertificateAuthenticationDefaults|` +
68-
`\.x509\(|javax\.net\.ssl|SSLContext|tls\.createServer|` +
69-
`trustStore|AzureAd|AZURE_TENANT_ID|AZURE_CLIENT_ID|` +
70-
`ClientCertificateCredential|AddMicrosoftIdentityWebApi|` +
71-
`msal|MSAL|@azure/msal|\.pem|\.crt|\.cert`,
72-
)
65+
// certStrictKeywords gate detector entry. STRICT subset: file must
66+
// contain at least one of these high-signal markers before we even
67+
// consider running the 20 per-pattern regexes. Loose keywords like
68+
// ".pem"/".crt"/".cert" are NOT in this set because they show up as
69+
// path/extension references in millions of unrelated lines (e.g. C#
70+
// `using System.Security.Cryptography.X509Certificates`) and would
71+
// turn the per-line gate into a no-op.
72+
//
73+
// Profiling on PSScriptAnalyzer (593 files, 203 C#) showed
74+
// CertificateAuthDetector consuming 99% of indexing CPU before this
75+
// pre-screen. Tighter gate keeps the detector fast on cert-free repos.
76+
certStrictKeywords = []string{
77+
"ssl_verify_client", "requestCert", "clientAuth=",
78+
"AddCertificateForwarding", "CertificateAuthenticationDefaults",
79+
".x509(", "X509AuthenticationFilter",
80+
"javax.net.ssl", "SSLContext", "tls.createServer",
81+
"trustStore", "AzureAd", "AZURE_TENANT_ID", "AZURE_CLIENT_ID",
82+
"ClientCertificateCredential", "AddMicrosoftIdentityWebApi",
83+
"@azure/msal",
84+
}
7385
)
7486

7587
var certAllPatterns []certPatternDef
@@ -81,12 +93,24 @@ func init() {
8193
certAllPatterns = append(certAllPatterns, certAzureAdPatterns...)
8294
}
8395

96+
// certLineQuickScan returns true if s contains any of the auth-cert
97+
// keywords. Cheap O(n*k) byte scan beats running 20 regex alternation
98+
// engines per line. Used both as a file-level and a per-line gate.
99+
func certLineQuickScan(s string) bool {
100+
for _, kw := range certStrictKeywords {
101+
if strings.Contains(s, kw) {
102+
return true
103+
}
104+
}
105+
return false
106+
}
107+
84108
func (d CertificateAuthDetector) Detect(ctx *detector.Context) *detector.Result {
85109
text := ctx.Content
86110
if text == "" {
87111
return detector.EmptyResult()
88112
}
89-
if !certPreScreen.MatchString(text) {
113+
if !certLineQuickScan(text) {
90114
return detector.EmptyResult()
91115
}
92116

@@ -95,6 +119,11 @@ func (d CertificateAuthDetector) Detect(ctx *detector.Context) *detector.Result
95119
seenLines := map[int]bool{}
96120

97121
for lineIdx, line := range lines {
122+
// Per-line pre-screen: skip the 20 regex passes on lines without
123+
// any cert-auth keyword. ~99% reduction on real codebases.
124+
if !certLineQuickScan(line) {
125+
continue
126+
}
98127
for _, pdef := range certAllPatterns {
99128
if seenLines[lineIdx] {
100129
break

0 commit comments

Comments
 (0)