Skip to content

Commit d57a8c2

Browse files
author
waleed
committed
feat(kb): added json/yaml parser+chunker, added dedicated csv chunker
1 parent 10692b5 commit d57a8c2

File tree

24 files changed

+1450
-706
lines changed

24 files changed

+1450
-706
lines changed

.github/workflows/docs-embeddings.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@ jobs:
3232
env:
3333
DATABASE_URL: ${{ github.ref == 'refs/heads/main' && secrets.DATABASE_URL || secrets.STAGING_DATABASE_URL }}
3434
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
35-
run: bun run scripts/process-docs-embeddings.ts --clear
35+
run: bun run scripts/process-docs.ts --clear

apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/upload-modal/upload-modal.tsx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ export function UploadModal({
6464
return `File "${file.name}" is too large. Maximum size is 100MB.`
6565
}
6666
if (!ACCEPTED_FILE_TYPES.includes(file.type)) {
67-
return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, or HTML files.`
67+
return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML, or YML files.`
6868
}
6969
return null
7070
}
@@ -193,8 +193,8 @@ export function UploadModal({
193193
{isDragging ? 'Drop files here!' : 'Drop files here or click to browse'}
194194
</p>
195195
<p className='text-muted-foreground text-xs'>
196-
Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB
197-
each)
196+
Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML,
197+
YML (max 100MB each)
198198
</p>
199199
</div>
200200
</div>

apps/sim/app/workspace/[workspaceId]/knowledge/components/create-modal/create-modal.tsx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ export function CreateModal({ open, onOpenChange, onKnowledgeBaseCreated }: Crea
158158
// Check file type
159159
if (!ACCEPTED_FILE_TYPES.includes(file.type)) {
160160
setFileError(
161-
`File ${file.name} has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, or HTML.`
161+
`File ${file.name} has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML, or YML.`
162162
)
163163
hasError = true
164164
continue
@@ -501,8 +501,8 @@ export function CreateModal({ open, onOpenChange, onKnowledgeBaseCreated }: Crea
501501
: 'Drop files here or click to browse'}
502502
</p>
503503
<p className='text-muted-foreground text-xs'>
504-
Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max
505-
100MB each)
504+
Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML,
505+
JSON, YAML, YML (max 100MB each)
506506
</p>
507507
</div>
508508
</div>

apps/sim/lib/knowledge/documents/docs-chunker.ts renamed to apps/sim/lib/chunkers/docs-chunker.ts

Lines changed: 13 additions & 224 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
11
import fs from 'fs/promises'
22
import path from 'path'
33
import { generateEmbeddings } from '@/lib/embeddings/utils'
4-
import { isDev } from '@/lib/environment'
5-
import { TextChunker } from '@/lib/knowledge/documents/chunker'
6-
import type { DocChunk, DocsChunkerOptions, HeaderInfo } from '@/lib/knowledge/documents/types'
74
import { createLogger } from '@/lib/logs/console/logger'
5+
import { TextChunker } from './text-chunker'
6+
import type { DocChunk, DocsChunkerOptions } from './types'
7+
8+
interface HeaderInfo {
9+
level: number
10+
text: string
11+
slug?: string
12+
anchor?: string
13+
position?: number
14+
}
815

916
interface Frontmatter {
1017
title?: string
@@ -29,7 +36,7 @@ export class DocsChunker {
2936
overlap: options.overlap ?? 50,
3037
})
3138
// Use localhost docs in development, production docs otherwise
32-
this.baseUrl = options.baseUrl ?? (isDev ? 'http://localhost:3001' : 'https://docs.sim.ai')
39+
this.baseUrl = options.baseUrl ?? 'https://docs.sim.ai'
3340
}
3441

3542
/**
@@ -108,9 +115,7 @@ export class DocsChunker {
108115
metadata: {
109116
startIndex: chunkStart,
110117
endIndex: chunkEnd,
111-
hasFrontmatter: i === 0 && content.startsWith('---'),
112-
documentTitle: frontmatter.title,
113-
documentDescription: frontmatter.description,
118+
title: frontmatter.title,
114119
},
115120
}
116121

@@ -200,7 +205,7 @@ export class DocsChunker {
200205
let relevantHeader: HeaderInfo | null = null
201206

202207
for (const header of headers) {
203-
if (header.position <= position) {
208+
if (header.position !== undefined && header.position <= position) {
204209
relevantHeader = header
205210
} else {
206211
break
@@ -285,53 +290,6 @@ export class DocsChunker {
285290
return { data, content: markdownContent }
286291
}
287292

288-
/**
289-
* Split content by headers to respect document structure
290-
*/
291-
private splitByHeaders(
292-
content: string
293-
): Array<{ header: string | null; content: string; level: number }> {
294-
const lines = content.split('\n')
295-
const sections: Array<{ header: string | null; content: string; level: number }> = []
296-
297-
let currentHeader: string | null = null
298-
let currentLevel = 0
299-
let currentContent: string[] = []
300-
301-
for (const line of lines) {
302-
const headerMatch = line.match(/^(#{1,3})\s+(.+)$/) // Only split on H1-H3, not H4-H6
303-
304-
if (headerMatch) {
305-
// Save previous section
306-
if (currentContent.length > 0) {
307-
sections.push({
308-
header: currentHeader,
309-
content: currentContent.join('\n').trim(),
310-
level: currentLevel,
311-
})
312-
}
313-
314-
// Start new section
315-
currentHeader = line
316-
currentLevel = headerMatch[1].length
317-
currentContent = []
318-
} else {
319-
currentContent.push(line)
320-
}
321-
}
322-
323-
// Add final section
324-
if (currentContent.length > 0) {
325-
sections.push({
326-
header: currentHeader,
327-
content: currentContent.join('\n').trim(),
328-
level: currentLevel,
329-
})
330-
}
331-
332-
return sections.filter((section) => section.content.trim().length > 0)
333-
}
334-
335293
/**
336294
* Estimate token count (rough approximation)
337295
*/
@@ -340,175 +298,6 @@ export class DocsChunker {
340298
return Math.ceil(text.length / 4)
341299
}
342300

343-
/**
344-
* Merge small adjacent chunks to reach target size
345-
*/
346-
private mergeSmallChunks(chunks: string[]): string[] {
347-
const merged: string[] = []
348-
let currentChunk = ''
349-
350-
for (const chunk of chunks) {
351-
const currentTokens = this.estimateTokens(currentChunk)
352-
const chunkTokens = this.estimateTokens(chunk)
353-
354-
// If adding this chunk would exceed target size, save current and start new
355-
if (currentTokens > 0 && currentTokens + chunkTokens > 500) {
356-
if (currentChunk.trim()) {
357-
merged.push(currentChunk.trim())
358-
}
359-
currentChunk = chunk
360-
} else {
361-
// Merge with current chunk
362-
currentChunk = currentChunk ? `${currentChunk}\n\n${chunk}` : chunk
363-
}
364-
}
365-
366-
// Add final chunk
367-
if (currentChunk.trim()) {
368-
merged.push(currentChunk.trim())
369-
}
370-
371-
return merged
372-
}
373-
374-
/**
375-
* Chunk a section while preserving tables and structure
376-
*/
377-
private async chunkSection(section: {
378-
header: string | null
379-
content: string
380-
level: number
381-
}): Promise<string[]> {
382-
const content = section.content
383-
const header = section.header
384-
385-
// Check if content contains tables
386-
const hasTable = this.containsTable(content)
387-
388-
if (hasTable) {
389-
// Split by tables and handle each part
390-
return this.splitContentWithTables(content, header)
391-
}
392-
// Regular chunking for text-only content
393-
const chunks = await this.textChunker.chunk(content)
394-
return chunks.map((chunk, index) => {
395-
// Add header to first chunk only
396-
if (index === 0 && header) {
397-
return `${header}\n\n${chunk.text}`.trim()
398-
}
399-
return chunk.text
400-
})
401-
}
402-
403-
/**
404-
* Check if content contains markdown tables
405-
*/
406-
private containsTable(content: string): boolean {
407-
const lines = content.split('\n')
408-
return lines.some((line, index) => {
409-
if (line.includes('|') && line.split('|').length >= 3) {
410-
const nextLine = lines[index + 1]
411-
return nextLine?.includes('|') && nextLine.includes('-')
412-
}
413-
return false
414-
})
415-
}
416-
417-
/**
418-
* Split content that contains tables, keeping tables intact
419-
*/
420-
private splitContentWithTables(content: string, header: string | null): string[] {
421-
const lines = content.split('\n')
422-
const chunks: string[] = []
423-
let currentChunk: string[] = []
424-
let inTable = false
425-
let tableLines: string[] = []
426-
427-
for (let i = 0; i < lines.length; i++) {
428-
const line = lines[i]
429-
430-
// Detect table start
431-
if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
432-
const nextLine = lines[i + 1]
433-
if (nextLine?.includes('|') && nextLine.includes('-')) {
434-
inTable = true
435-
436-
// Save current chunk if it has content
437-
if (currentChunk.length > 0 && currentChunk.join('\n').trim().length > 50) {
438-
const chunkText = currentChunk.join('\n').trim()
439-
const withHeader =
440-
chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
441-
chunks.push(withHeader)
442-
currentChunk = []
443-
}
444-
445-
tableLines = [line]
446-
continue
447-
}
448-
}
449-
450-
if (inTable) {
451-
tableLines.push(line)
452-
453-
// Detect table end
454-
if (!line.includes('|') || line.trim() === '') {
455-
inTable = false
456-
457-
// Save table as its own chunk
458-
const tableText = tableLines
459-
.filter((l) => l.trim())
460-
.join('\n')
461-
.trim()
462-
if (tableText.length > 0) {
463-
const withHeader =
464-
chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
465-
chunks.push(withHeader)
466-
}
467-
468-
tableLines = []
469-
470-
// Start new chunk if current line has content
471-
if (line.trim() !== '') {
472-
currentChunk = [line]
473-
}
474-
}
475-
} else {
476-
currentChunk.push(line)
477-
478-
// If chunk is getting large, save it
479-
if (this.estimateTokens(currentChunk.join('\n')) > 250) {
480-
const chunkText = currentChunk.join('\n').trim()
481-
if (chunkText.length > 50) {
482-
const withHeader =
483-
chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
484-
chunks.push(withHeader)
485-
}
486-
currentChunk = []
487-
}
488-
}
489-
}
490-
491-
// Handle remaining content
492-
if (inTable && tableLines.length > 0) {
493-
const tableText = tableLines
494-
.filter((l) => l.trim())
495-
.join('\n')
496-
.trim()
497-
if (tableText.length > 0) {
498-
const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
499-
chunks.push(withHeader)
500-
}
501-
} else if (currentChunk.length > 0) {
502-
const chunkText = currentChunk.join('\n').trim()
503-
if (chunkText.length > 50) {
504-
const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
505-
chunks.push(withHeader)
506-
}
507-
}
508-
509-
return chunks.filter((chunk) => chunk.trim().length > 50)
510-
}
511-
512301
/**
513302
* Detect table boundaries in markdown content to avoid splitting them
514303
*/

apps/sim/lib/chunkers/index.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
// Re-export all chunkers and types for easy importing
2+
3+
export { DocsChunker } from './docs-chunker'
4+
export { JsonYamlChunker } from './json-yaml-chunker'
5+
export { StructuredDataChunker } from './structured-data-chunker'
6+
export { TextChunker } from './text-chunker'
7+
export * from './types'

0 commit comments

Comments
 (0)