11import fs from 'fs/promises'
22import path from 'path'
33import { generateEmbeddings } from '@/lib/embeddings/utils'
4- import { isDev } from '@/lib/environment'
5- import { TextChunker } from '@/lib/knowledge/documents/chunker'
6- import type { DocChunk , DocsChunkerOptions , HeaderInfo } from '@/lib/knowledge/documents/types'
74import { createLogger } from '@/lib/logs/console/logger'
5+ import { TextChunker } from './text-chunker'
6+ import type { DocChunk , DocsChunkerOptions } from './types'
7+
/**
 * A markdown header encountered while scanning a document.
 * Used to attribute a chunk to the section it came from.
 */
interface HeaderInfo {
  /** Header depth: 1 for `#`, 2 for `##`, 3 for `###`, etc. */
  level: number
  /** The header's text content (without the leading `#` markers). */
  text: string
  /** URL-safe slug for the header — presumably derived from `text`; TODO confirm generator. */
  slug?: string
  /** Anchor fragment for deep-linking to this header, when known. */
  anchor?: string
  /** Character offset of the header within the document, when known. */
  position?: number
}
815
916interface Frontmatter {
1017 title ?: string
@@ -29,7 +36,7 @@ export class DocsChunker {
2936 overlap : options . overlap ?? 50 ,
3037 } )
3138 // Use localhost docs in development, production docs otherwise
32- this . baseUrl = options . baseUrl ?? ( isDev ? 'http://localhost:3001' : ' https://docs.sim.ai')
39+ this . baseUrl = options . baseUrl ?? ' https://docs.sim.ai'
3340 }
3441
3542 /**
@@ -108,9 +115,7 @@ export class DocsChunker {
108115 metadata : {
109116 startIndex : chunkStart ,
110117 endIndex : chunkEnd ,
111- hasFrontmatter : i === 0 && content . startsWith ( '---' ) ,
112- documentTitle : frontmatter . title ,
113- documentDescription : frontmatter . description ,
118+ title : frontmatter . title ,
114119 } ,
115120 }
116121
@@ -200,7 +205,7 @@ export class DocsChunker {
200205 let relevantHeader : HeaderInfo | null = null
201206
202207 for ( const header of headers ) {
203- if ( header . position <= position ) {
208+ if ( header . position !== undefined && header . position <= position ) {
204209 relevantHeader = header
205210 } else {
206211 break
@@ -285,53 +290,6 @@ export class DocsChunker {
285290 return { data, content : markdownContent }
286291 }
287292
288- /**
289- * Split content by headers to respect document structure
290- */
291- private splitByHeaders (
292- content : string
293- ) : Array < { header : string | null ; content : string ; level : number } > {
294- const lines = content . split ( '\n' )
295- const sections : Array < { header : string | null ; content : string ; level : number } > = [ ]
296-
297- let currentHeader : string | null = null
298- let currentLevel = 0
299- let currentContent : string [ ] = [ ]
300-
301- for ( const line of lines ) {
302- const headerMatch = line . match ( / ^ ( # { 1 , 3 } ) \s + ( .+ ) $ / ) // Only split on H1-H3, not H4-H6
303-
304- if ( headerMatch ) {
305- // Save previous section
306- if ( currentContent . length > 0 ) {
307- sections . push ( {
308- header : currentHeader ,
309- content : currentContent . join ( '\n' ) . trim ( ) ,
310- level : currentLevel ,
311- } )
312- }
313-
314- // Start new section
315- currentHeader = line
316- currentLevel = headerMatch [ 1 ] . length
317- currentContent = [ ]
318- } else {
319- currentContent . push ( line )
320- }
321- }
322-
323- // Add final section
324- if ( currentContent . length > 0 ) {
325- sections . push ( {
326- header : currentHeader ,
327- content : currentContent . join ( '\n' ) . trim ( ) ,
328- level : currentLevel ,
329- } )
330- }
331-
332- return sections . filter ( ( section ) => section . content . trim ( ) . length > 0 )
333- }
334-
335293 /**
336294 * Estimate token count (rough approximation)
337295 */
@@ -340,175 +298,6 @@ export class DocsChunker {
340298 return Math . ceil ( text . length / 4 )
341299 }
342300
343- /**
344- * Merge small adjacent chunks to reach target size
345- */
346- private mergeSmallChunks ( chunks : string [ ] ) : string [ ] {
347- const merged : string [ ] = [ ]
348- let currentChunk = ''
349-
350- for ( const chunk of chunks ) {
351- const currentTokens = this . estimateTokens ( currentChunk )
352- const chunkTokens = this . estimateTokens ( chunk )
353-
354- // If adding this chunk would exceed target size, save current and start new
355- if ( currentTokens > 0 && currentTokens + chunkTokens > 500 ) {
356- if ( currentChunk . trim ( ) ) {
357- merged . push ( currentChunk . trim ( ) )
358- }
359- currentChunk = chunk
360- } else {
361- // Merge with current chunk
362- currentChunk = currentChunk ? `${ currentChunk } \n\n${ chunk } ` : chunk
363- }
364- }
365-
366- // Add final chunk
367- if ( currentChunk . trim ( ) ) {
368- merged . push ( currentChunk . trim ( ) )
369- }
370-
371- return merged
372- }
373-
374- /**
375- * Chunk a section while preserving tables and structure
376- */
377- private async chunkSection ( section : {
378- header : string | null
379- content : string
380- level : number
381- } ) : Promise < string [ ] > {
382- const content = section . content
383- const header = section . header
384-
385- // Check if content contains tables
386- const hasTable = this . containsTable ( content )
387-
388- if ( hasTable ) {
389- // Split by tables and handle each part
390- return this . splitContentWithTables ( content , header )
391- }
392- // Regular chunking for text-only content
393- const chunks = await this . textChunker . chunk ( content )
394- return chunks . map ( ( chunk , index ) => {
395- // Add header to first chunk only
396- if ( index === 0 && header ) {
397- return `${ header } \n\n${ chunk . text } ` . trim ( )
398- }
399- return chunk . text
400- } )
401- }
402-
403- /**
404- * Check if content contains markdown tables
405- */
406- private containsTable ( content : string ) : boolean {
407- const lines = content . split ( '\n' )
408- return lines . some ( ( line , index ) => {
409- if ( line . includes ( '|' ) && line . split ( '|' ) . length >= 3 ) {
410- const nextLine = lines [ index + 1 ]
411- return nextLine ?. includes ( '|' ) && nextLine . includes ( '-' )
412- }
413- return false
414- } )
415- }
416-
  /**
   * Split content that contains tables, keeping tables intact.
   *
   * Scans line by line with a small state machine: outside a table,
   * lines accumulate into `currentChunk` (flushed when it grows past
   * ~250 estimated tokens); when a pipe row followed by a dash separator
   * is seen, the table lines are collected and emitted as one chunk.
   * The section `header` is prepended to the first emitted chunk only.
   * Chunks of 50 characters or fewer (after trimming) are discarded.
   */
  private splitContentWithTables(content: string, header: string | null): string[] {
    const lines = content.split('\n')
    const chunks: string[] = []
    let currentChunk: string[] = []
    let inTable = false
    let tableLines: string[] = []

    for (let i = 0; i < lines.length; i++) {
      const line = lines[i]

      // Detect table start: a pipe row whose next line is the |---| separator
      if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
        const nextLine = lines[i + 1]
        if (nextLine?.includes('|') && nextLine.includes('-')) {
          inTable = true

          // Save current chunk if it has content (>50 chars, to skip fragments)
          if (currentChunk.length > 0 && currentChunk.join('\n').trim().length > 50) {
            const chunkText = currentChunk.join('\n').trim()
            const withHeader =
              chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
            chunks.push(withHeader)
            currentChunk = []
          }

          tableLines = [line]
          continue
        }
      }

      if (inTable) {
        tableLines.push(line)

        // Detect table end: a line with no pipe, or a blank line
        if (!line.includes('|') || line.trim() === '') {
          inTable = false

          // Save table as its own chunk (blank lines inside are dropped)
          const tableText = tableLines
            .filter((l) => l.trim())
            .join('\n')
            .trim()
          if (tableText.length > 0) {
            const withHeader =
              chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
            chunks.push(withHeader)
          }

          tableLines = []

          // Start new chunk if current line has content
          // (the terminating line itself belongs to the following text chunk)
          if (line.trim() !== '') {
            currentChunk = [line]
          }
        }
      } else {
        currentChunk.push(line)

        // If chunk is getting large (~250 estimated tokens), save it
        if (this.estimateTokens(currentChunk.join('\n')) > 250) {
          const chunkText = currentChunk.join('\n').trim()
          if (chunkText.length > 50) {
            const withHeader =
              chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
            chunks.push(withHeader)
          }
          currentChunk = []
        }
      }
    }

    // Handle remaining content: either an unterminated table or trailing text
    if (inTable && tableLines.length > 0) {
      const tableText = tableLines
        .filter((l) => l.trim())
        .join('\n')
        .trim()
      if (tableText.length > 0) {
        const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
        chunks.push(withHeader)
      }
    } else if (currentChunk.length > 0) {
      const chunkText = currentChunk.join('\n').trim()
      if (chunkText.length > 50) {
        const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
        chunks.push(withHeader)
      }
    }

    // Final pass drops any chunk that trimmed down to <=50 characters
    return chunks.filter((chunk) => chunk.trim().length > 50)
  }
511-
512301 /**
513302 * Detect table boundaries in markdown content to avoid splitting them
514303 */
0 commit comments