Skip to content

Commit 9629447

Browse files
committed
test(05-02): harden fallback paths with file ceilings and integration tests
- Add file ceiling pre-check (500KB/10K lines) in GenericAnalyzer before AST chunking
- Wrap createASTAlignedChunks in try/catch with debug-guarded error logging
- Create 6 integration tests: normal AST, oversized fallback, parse error, unsupported lang, scope prefix, coverage
1 parent 3dbd43e commit 9629447

File tree

2 files changed

+309
-13
lines changed

2 files changed

+309
-13
lines changed

src/analyzers/generic/index.ts

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,11 @@ import {
1717
Dependency
1818
} from '../../types/index.js';
1919
import { createChunksFromCode } from '../../utils/chunking.js';
20-
import { createASTAlignedChunks } from '../../utils/ast-chunker.js';
20+
import {
21+
createASTAlignedChunks,
22+
MAX_AST_CHUNK_FILE_SIZE,
23+
MAX_AST_CHUNK_FILE_LINES
24+
} from '../../utils/ast-chunker.js';
2125
import { detectLanguage } from '../../utils/language-detection.js';
2226
import { extractTreeSitterSymbols, type TreeSitterSymbol } from '../../utils/tree-sitter.js';
2327
import {
@@ -150,19 +154,46 @@ export class GenericAnalyzer implements FrameworkAnalyzer {
150154
}
151155

152156
// Create chunks — use AST-aligned chunker when Tree-sitter symbols are available
157+
// File ceiling pre-check: skip AST chunking for very large files
158+
const lineCount = content.split('\n').length;
159+
const byteSize = Buffer.byteLength(content, 'utf8');
160+
const useASTChunking =
161+
usesTreeSitterSymbols &&
162+
treeSitterSymbols.length > 0 &&
163+
byteSize <= MAX_AST_CHUNK_FILE_SIZE &&
164+
lineCount <= MAX_AST_CHUNK_FILE_LINES;
165+
153166
let chunks: CodeChunk[];
154-
if (usesTreeSitterSymbols && treeSitterSymbols.length > 0) {
155-
chunks = createASTAlignedChunks(content, treeSitterSymbols, {
156-
minChunkLines: 10,
157-
maxChunkLines: 150,
158-
filePath,
159-
language,
160-
framework: 'generic',
161-
componentType: 'module'
162-
});
163-
// Enrich AST chunks with the correct relativePath
164-
for (const chunk of chunks) {
165-
chunk.relativePath = relativePath;
167+
if (useASTChunking) {
168+
try {
169+
chunks = createASTAlignedChunks(content, treeSitterSymbols, {
170+
minChunkLines: 10,
171+
maxChunkLines: 150,
172+
filePath,
173+
language,
174+
framework: 'generic',
175+
componentType: 'module'
176+
});
177+
// Enrich AST chunks with the correct relativePath
178+
for (const chunk of chunks) {
179+
chunk.relativePath = relativePath;
180+
}
181+
} catch (err) {
182+
if (process.env.CODEBASE_CONTEXT_DEBUG) {
183+
console.error(
184+
`[ast-chunker] AST chunking failed for ${filePath}, falling back to line chunks:`,
185+
err
186+
);
187+
}
188+
// AST chunking threw: fall back to line-based chunking so the file is still indexed
189+
chunks = await createChunksFromCode(
190+
content,
191+
filePath,
192+
relativePath,
193+
language,
194+
components,
195+
metadata
196+
);
166197
}
167198
} else {
168199
chunks = await createChunksFromCode(
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
import { describe, expect, it } from 'vitest';
2+
import { GenericAnalyzer } from '../src/analyzers/generic/index';
3+
import { MAX_AST_CHUNK_FILE_LINES } from '../src/utils/ast-chunker';
4+
5+
// ---------------------------------------------------------------------------
6+
// Fixtures
7+
// ---------------------------------------------------------------------------
8+
9+
const TYPESCRIPT_FIXTURE = `
10+
import { EventEmitter } from 'events';
11+
12+
const MAX_RETRIES = 3;
13+
14+
export class UserService extends EventEmitter {
15+
private users: Map<string, User> = new Map();
16+
17+
constructor(private readonly db: Database) {
18+
super();
19+
this.init();
20+
}
21+
22+
async getById(id: string): Promise<User | null> {
23+
if (!id) {
24+
throw new Error('ID required');
25+
}
26+
const cached = this.users.get(id);
27+
if (cached) return cached;
28+
const user = await this.db.findUser(id);
29+
if (user) {
30+
this.users.set(id, user);
31+
}
32+
return user;
33+
}
34+
35+
async updateUser(id: string, data: Partial<User>): Promise<User> {
36+
const user = await this.getById(id);
37+
if (!user) {
38+
throw new Error(\`User \${id} not found\`);
39+
}
40+
const updated = { ...user, ...data };
41+
this.users.set(id, updated);
42+
this.emit('user:updated', updated);
43+
return updated;
44+
}
45+
46+
private init(): void {
47+
console.log('UserService initialized');
48+
}
49+
}
50+
51+
interface User {
52+
id: string;
53+
name: string;
54+
email: string;
55+
}
56+
57+
interface Database {
58+
findUser(id: string): Promise<User | null>;
59+
}
60+
61+
export function createUserService(db: Database): UserService {
62+
return new UserService(db);
63+
}
64+
`.trim();
65+
66+
// ---------------------------------------------------------------------------
67+
// Tests
68+
// ---------------------------------------------------------------------------
69+
70+
const analyzer = new GenericAnalyzer();
71+
72+
describe('AST Chunker Integration', () => {
73+
// Test 1: Supported language, normal file — AST chunks with scope prefixes
74+
it('produces AST-aligned chunks with scope prefixes for a normal TypeScript file', async () => {
75+
const result = await analyzer.analyze('/virtual/user-service.ts', TYPESCRIPT_FIXTURE);
76+
77+
expect(result.metadata.chunkStrategy).toBe('ast-aligned');
78+
expect(result.metadata.symbolAware).toBe(true);
79+
80+
// Should have symbol-aware chunks
81+
const symbolChunks = result.chunks.filter((c) => c.metadata?.symbolAware === true);
82+
expect(symbolChunks.length).toBeGreaterThan(0);
83+
84+
// Check key symbols exist
85+
const names = symbolChunks.map((c) => c.metadata.symbolName);
86+
expect(names.some((n) => n?.includes('getById'))).toBe(true);
87+
expect(names.some((n) => n?.includes('updateUser'))).toBe(true);
88+
expect(names.some((n) => n?.includes('createUserService'))).toBe(true);
89+
90+
// Every symbol chunk should have a scope prefix (starts with //)
91+
for (const chunk of symbolChunks) {
92+
expect(chunk.content.startsWith('//')).toBe(true);
93+
}
94+
});
95+
96+
// Test 2: Oversized file — falls back to line chunks
97+
it('falls back to line-based chunking for oversized files (>10K lines)', async () => {
98+
// Generate a large file exceeding MAX_AST_CHUNK_FILE_LINES
99+
const bigLines: string[] = [];
100+
bigLines.push('// Large generated file');
101+
for (let i = 1; i <= MAX_AST_CHUNK_FILE_LINES + 100; i++) {
102+
bigLines.push(`export const var_${i} = ${i};`);
103+
}
104+
const bigContent = bigLines.join('\n');
105+
106+
const result = await analyzer.analyze('/virtual/huge-file.ts', bigContent);
107+
108+
// Chunks should still be produced (via line/component fallback)
109+
expect(result.chunks.length).toBeGreaterThan(0);
110+
111+
// No chunk should be ast-aligned, because the file exceeds the line ceiling
112+
const hasAstAligned = result.chunks.some((c) => c.metadata?.chunkStrategy === 'ast-aligned');
113+
expect(hasAstAligned).toBe(false);
114+
});
115+
116+
// Test 3: Parse error simulation — fallback, no crash
117+
it('falls back gracefully on files with syntax errors', async () => {
118+
// Content with syntax errors that cause Tree-sitter hasError
119+
const badContent = [
120+
'export class Broken {',
121+
' method() {',
122+
' const x = {{{{{;', // severe syntax error
123+
' return \\\\\\\\;',
124+
' }',
125+
' another() {',
126+
' return 42;',
127+
' }',
128+
'}'
129+
].join('\n');
130+
131+
// Should not throw
132+
const result = await analyzer.analyze('/virtual/broken.ts', badContent);
133+
134+
// Chunks should still be produced (via fallback)
135+
expect(result.chunks.length).toBeGreaterThan(0);
136+
});
137+
138+
// Test 4: Unsupported language — regex/line fallback
139+
it('produces chunks via fallback for unsupported languages (.rb)', async () => {
140+
const rubyContent = [
141+
'class Calculator',
142+
' def add(a, b)',
143+
' a + b',
144+
' end',
145+
'',
146+
' def subtract(a, b)',
147+
' a - b',
148+
' end',
149+
'end',
150+
'',
151+
'def standalone_function(x)',
152+
' x * 2',
153+
'end'
154+
].join('\n');
155+
156+
const result = await analyzer.analyze('/virtual/calculator.rb', rubyContent);
157+
158+
// Chunks produced
159+
expect(result.chunks.length).toBeGreaterThan(0);
160+
161+
// Should NOT be ast-aligned (Ruby has no grammar)
162+
expect(result.metadata.chunkStrategy).toBe('line-or-component');
163+
expect(result.metadata.symbolAware).toBeUndefined();
164+
165+
// No chunk should be marked symbol-aware
166+
for (const chunk of result.chunks) {
167+
expect(chunk.metadata?.symbolAware).not.toBe(true);
168+
}
169+
});
170+
171+
// Test 5: Scope prefix correctness — nested class > method format
172+
it('generates correct scope prefix format for nested symbols', async () => {
173+
const result = await analyzer.analyze('/virtual/user-service.ts', TYPESCRIPT_FIXTURE);
174+
175+
const symbolChunks = result.chunks.filter((c) => c.metadata?.symbolAware === true);
176+
177+
// Find a method chunk inside UserService
178+
const getByIdChunk = symbolChunks.find((c) => c.metadata.symbolName === 'getById');
179+
if (getByIdChunk) {
180+
// Should have prefix format: // UserService > getById :: (...)
181+
const firstLine = getByIdChunk.content.split('\n')[0];
182+
expect(firstLine).toMatch(/\/\/\s*UserService\s*>\s*getById\s*::/);
183+
}
184+
185+
// Find standalone function chunk
186+
const createChunk = symbolChunks.find((c) =>
187+
c.metadata.symbolName?.includes('createUserService')
188+
);
189+
if (createChunk) {
190+
// Should have prefix format: // createUserService :: (...)
191+
const firstLine = createChunk.content.split('\n')[0];
192+
expect(firstLine).toMatch(/\/\/\s*createUserService\s*::/);
193+
// Should NOT have parent path separator
194+
expect(firstLine).not.toMatch(/>/);
195+
}
196+
});
197+
198+
// Test 6: Full coverage verification — chunks cover the file with small
199+
// structural gaps only where container headers/footers are below the
200+
// 2-non-blank-line threshold.
201+
it('AST chunks cover the file with at most small structural gaps', async () => {
202+
const result = await analyzer.analyze('/virtual/user-service.ts', TYPESCRIPT_FIXTURE);
203+
204+
// Precondition: the fixture must take the AST-aligned path, otherwise the coverage checks below are meaningless
205+
expect(result.metadata.chunkStrategy).toBe('ast-aligned');
206+
207+
const sorted = [...result.chunks].sort((a, b) => a.startLine - b.startLine);
208+
const totalLines = TYPESCRIPT_FIXTURE.split('\n').length;
209+
210+
// Collect all line numbers covered by chunks
211+
const coveredLines = new Set<number>();
212+
for (const chunk of sorted) {
213+
for (let line = chunk.startLine; line <= chunk.endLine; line++) {
214+
coveredLines.add(line);
215+
}
216+
}
217+
218+
// Count uncovered lines — should be minimal (small headers/footers below threshold)
219+
const allLines = TYPESCRIPT_FIXTURE.split('\n');
220+
const uncoveredLines: number[] = [];
221+
for (let i = 1; i <= totalLines; i++) {
222+
if (!coveredLines.has(i)) {
223+
uncoveredLines.push(i);
224+
}
225+
}
226+
227+
// Uncovered lines should be small structural fragments (class opening/closing braces, etc.)
228+
// Allow up to 15% uncovered for container header/footer gaps
229+
const uncoveredPct = (uncoveredLines.length / totalLines) * 100;
230+
expect(uncoveredPct).toBeLessThan(15);
231+
232+
// Every uncovered line should be structurally trivial (blank, brace, or short header)
233+
for (const lineNum of uncoveredLines) {
234+
const line = allLines[lineNum - 1].trim();
235+
const isTrivial = line === '' || line === '}' || line === '};' || line.length < 60;
236+
expect(isTrivial).toBe(true);
237+
}
238+
239+
// Verify no overlapping line ranges
240+
for (let i = 1; i < sorted.length; i++) {
241+
expect(sorted[i].startLine).toBeGreaterThan(sorted[i - 1].endLine);
242+
}
243+
244+
// Content from chunks (minus scope prefixes) should contain all significant source lines
245+
const chunkContent: string[] = [];
246+
for (const chunk of sorted) {
247+
const lines = chunk.content.split('\n');
248+
for (const line of lines) {
249+
// Skip scope prefix lines
250+
if (line.match(/^\/\/\s*.+\s*::\s*.+/) && !TYPESCRIPT_FIXTURE.includes(line)) {
251+
continue;
252+
}
253+
chunkContent.push(line);
254+
}
255+
}
256+
const joined = chunkContent.join('\n');
257+
258+
// All important function/class names must be present in reconstructed content
259+
// Note: 'class UserService' may be in a dropped header (<= 2 non-blank lines)
260+
// but the methods and standalone functions must be present
261+
expect(joined).toContain('async getById');
262+
expect(joined).toContain('async updateUser');
263+
expect(joined).toContain('function createUserService');
264+
});
265+
});

0 commit comments

Comments
 (0)