@@ -181,21 +181,29 @@ export function extractDocumentMetadata(doc: DocumentNode): DocumentMetadata {
181181 }
182182 }
183183
184+ // Target maximum characters per block. Groups exceeding this are split at
185+ // node boundaries (oversized nodes at sentence/line boundaries) so that no
186+ // chunk dominates BM25 search through sheer term frequency.
187+ const MAX_BLOCK_CHARS = 1500 ;
188+
184189 for ( const group of groups ) {
185- const blockId = `block-${ blockCounter ++ } ` ;
190+ // Register heading in headings list
191+ if ( group . heading ) {
192+ const text = extractInlineText (
193+ ( group . heading as { children : readonly InlineNode [ ] } ) . children ,
194+ ) ;
195+ const id = ( group . heading as { id ?: string } ) . id ?? slugify ( text ) ;
196+ headings . push ( { depth : ( group . heading as { depth : number } ) . depth , text, id } ) ;
197+ }
186198
187- // Collect refs from all nodes in the group
199+ // Collect refs from all nodes
200+ const blockId = `block-${ blockCounter ++ } ` ;
188201 if ( group . heading ) {
189202 collectInlineRefs (
190203 ( group . heading as { children : readonly InlineNode [ ] } ) . children ,
191204 doc . filePath ,
192205 blockId ,
193206 ) ;
194- const text = extractInlineText (
195- ( group . heading as { children : readonly InlineNode [ ] } ) . children ,
196- ) ;
197- const id = ( group . heading as { id ?: string } ) . id ?? slugify ( text ) ;
198- headings . push ( { depth : ( group . heading as { depth : number } ) . depth , text, id } ) ;
199207 }
200208 for ( const bodyNode of group . body ) {
201209 if ( bodyNode . type === 'paragraph' && 'children' in bodyNode ) {
@@ -207,29 +215,91 @@ export function extractDocumentMetadata(doc: DocumentNode): DocumentMetadata {
207215 }
208216 }
209217
210- // Build textContent from heading + all body nodes
211- const allNodes = group . heading ? [ group . heading , ...group . body ] : group . body ;
212- const textContent = allNodes . map ( extractBlockText ) . join ( '\n' ) . trim ( ) ;
213- if ( ! textContent ) continue ;
214-
215218 const headingNode = group . heading as { children : readonly InlineNode [ ] ; depth : number } | undefined ;
219+ const headingText = headingNode ? extractInlineText ( headingNode . children ) : undefined ;
220+ const headingDepth = headingNode ?. depth ;
216221 const annotations = group . heading && 'annotations' in group . heading
217222 ? ( group . heading as { annotations : Record < string , unknown > } ) . annotations
218223 : undefined ;
219224
220- blocks . push ( {
221- id : ( annotations as Record < string , string > | undefined ) ?. id ?? blockId ,
222- type : ( annotations as Record < string , string > | undefined ) ?. type ?? 'section' ,
223- headingText : headingNode ? extractInlineText ( headingNode . children ) : undefined ,
224- headingDepth : headingNode ?. depth ,
225- textContent,
226- annotations : ( annotations as Record < string , unknown > ) ?? { } ,
227- codeBlocks : allNodes . flatMap ( collectCodeBlocks ) ,
228- sourcePath : doc . filePath ,
229- sourceLocation : group . heading ?. location
230- ? { startLine : group . heading . location . start . line , endLine : group . body . at ( - 1 ) ?. location ?. end . line ?? group . heading . location . end . line }
231- : undefined ,
232- } ) ;
225+ // Split body into size-limited text segments.
226+ // When a single node exceeds MAX_BLOCK_CHARS (e.g., a massive list),
227+ // split its extracted text after each '.' or newline. This keeps any one
228+ // chunk from dominating BM25 search through sheer term frequency.
229+ const segments : string [ ] = [ ] ;
230+ let currentSegment = '' ;
231+
232+ for ( const bodyNode of group . body ) {
233+ const nodeText = extractBlockText ( bodyNode ) ;
234+ if ( ! nodeText ) continue ;
235+
236+ if ( nodeText . length > MAX_BLOCK_CHARS ) {
237+ // Flush current segment before splitting the large node
238+ if ( currentSegment ) {
239+ segments . push ( currentSegment . trim ( ) ) ;
240+ currentSegment = '' ;
241+ }
242+ // Split large node text at line/sentence boundaries
243+ const lines = nodeText . split ( / (?< = \. | \n ) / ) ;
244+ let lineBuf = '' ;
245+ for ( const line of lines ) {
246+ if ( lineBuf . length + line . length > MAX_BLOCK_CHARS && lineBuf ) {
247+ segments . push ( lineBuf . trim ( ) ) ;
248+ lineBuf = '' ;
249+ }
250+ lineBuf += line ;
251+ }
252+ if ( lineBuf . trim ( ) ) segments . push ( lineBuf . trim ( ) ) ;
253+ } else if ( currentSegment . length + nodeText . length > MAX_BLOCK_CHARS ) {
254+ segments . push ( currentSegment . trim ( ) ) ;
255+ currentSegment = nodeText ;
256+ } else {
257+ currentSegment += ( currentSegment ? '\n' : '' ) + nodeText ;
258+ }
259+ }
260+ if ( currentSegment . trim ( ) ) segments . push ( currentSegment . trim ( ) ) ;
261+
262+ const nonEmptySegments = segments . filter ( ( s ) => s . length > 0 ) ;
263+
264+ if ( nonEmptySegments . length === 0 && group . heading ) {
265+ const textContent = extractBlockText ( group . heading ) ;
266+ if ( textContent ) {
267+ blocks . push ( {
268+ id : ( annotations as Record < string , string > | undefined ) ?. id ?? blockId ,
269+ type : ( annotations as Record < string , string > | undefined ) ?. type ?? 'section' ,
270+ headingText,
271+ headingDepth,
272+ textContent,
273+ annotations : ( annotations as Record < string , unknown > ) ?? { } ,
274+ codeBlocks : group . body . flatMap ( collectCodeBlocks ) ,
275+ sourcePath : doc . filePath ,
276+ sourceLocation : group . heading . location
277+ ? { startLine : group . heading . location . start . line , endLine : group . heading . location . end . line }
278+ : undefined ,
279+ } ) ;
280+ }
281+ } else {
282+ for ( let i = 0 ; i < nonEmptySegments . length ; i ++ ) {
283+ const segmentText = i === 0 && group . heading
284+ ? extractBlockText ( group . heading ) + '\n' + nonEmptySegments [ i ] !
285+ : nonEmptySegments [ i ] ! ;
286+
287+ const subId = nonEmptySegments . length > 1
288+ ? `${ blockId } -part${ i } `
289+ : blockId ;
290+
291+ blocks . push ( {
292+ id : ( annotations as Record < string , string > | undefined ) ?. id ?? subId ,
293+ type : ( annotations as Record < string , string > | undefined ) ?. type ?? 'section' ,
294+ headingText : i === 0 ? headingText : headingText ? `${ headingText } (continued)` : undefined ,
295+ headingDepth,
296+ textContent : segmentText . trim ( ) ,
297+ annotations : ( annotations as Record < string , unknown > ) ?? { } ,
298+ codeBlocks : i === 0 ? group . body . flatMap ( collectCodeBlocks ) : [ ] ,
299+ sourcePath : doc . filePath ,
300+ } ) ;
301+ }
302+ }
233303 }
234304 }
235305
0 commit comments