simstudioai
diff --git a/.github/workflows/docs-embeddings.yml b/.github/workflows/docs-embeddings.yml
Lines changed: 1 addition & 1 deletion
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/upload-modal/upload-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/[id]/components/upload-modal/upload-modal.tsx
Lines changed: 3 additions & 3 deletions
diff --git a/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-modal/create-modal.tsx b/apps/sim/app/workspace/[workspaceId]/knowledge/components/create-modal/create-modal.tsx
Lines changed: 3 additions & 3 deletions
diff --git a/apps/sim/lib/knowledge/documents/docs-chunker.ts b/apps/sim/lib/chunkers/docs-chunker.ts
rename from apps/sim/lib/knowledge/documents/docs-chunker.ts
rename to apps/sim/lib/chunkers/docs-chunker.ts
Lines changed: 13 additions & 224 deletions
diff --git a/apps/sim/lib/chunkers/index.ts b/apps/sim/lib/chunkers/index.ts
Lines changed: 7 additions & 0 deletions
@@ -32,4 +32,4 @@ jobs:
         env:
           DATABASE_URL: ${{ github.ref == 'refs/heads/main' && secrets.DATABASE_URL || secrets.STAGING_DATABASE_URL }}
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-        run: bun run scripts/process-docs-embeddings.ts --clear 
+        run: bun run scripts/process-docs.ts --clear 
@@ -64,7 +64,7 @@ export function UploadModal({
       return `File "${file.name}" is too large. Maximum size is 100MB.`
     }
     if (!ACCEPTED_FILE_TYPES.includes(file.type)) {
-      return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, or HTML files.`
+      return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML, or YML files.`
     }
     return null
   }
@@ -193,8 +193,8 @@ export function UploadModal({
                     {isDragging ? 'Drop files here!' : 'Drop files here or click to browse'}
                   </p>
                   <p className='text-muted-foreground text-xs'>
-                    Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB
-                    each)
+                    Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML,
+                    YML (max 100MB each)
                   </p>
                 </div>
               </div>
 
@@ -158,7 +158,7 @@ export function CreateModal({ open, onOpenChange, onKnowledgeBaseCreated }: Crea
         // Check file type
         if (!ACCEPTED_FILE_TYPES.includes(file.type)) {
           setFileError(
-            `File ${file.name} has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, or HTML.`
+            `File ${file.name} has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML, or YML.`
           )
           hasError = true
           continue
@@ -501,8 +501,8 @@ export function CreateModal({ open, onOpenChange, onKnowledgeBaseCreated }: Crea
                                 : 'Drop files here or click to browse'}
                             </p>
                             <p className='text-muted-foreground text-xs'>
-                              Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max
-                              100MB each)
+                              Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML,
+                              JSON, YAML, YML (max 100MB each)
                             </p>
                           </div>
                         </div>
 
@@ -1,10 +1,17 @@
 import fs from 'fs/promises'
 import path from 'path'
 import { generateEmbeddings } from '@/lib/embeddings/utils'
-import { isDev } from '@/lib/environment'
-import { TextChunker } from '@/lib/knowledge/documents/chunker'
-import type { DocChunk, DocsChunkerOptions, HeaderInfo } from '@/lib/knowledge/documents/types'
 import { createLogger } from '@/lib/logs/console/logger'
+import { TextChunker } from './text-chunker'
+import type { DocChunk, DocsChunkerOptions } from './types'
+
+interface HeaderInfo {
+  level: number
+  text: string
+  slug?: string
+  anchor?: string
+  position?: number
+}
 
 interface Frontmatter {
   title?: string
@@ -29,7 +36,7 @@ export class DocsChunker {
       overlap: options.overlap ?? 50,
     })
     // Use localhost docs in development, production docs otherwise
-    this.baseUrl = options.baseUrl ?? (isDev ? 'http://localhost:3001' : 'https://docs.sim.ai')
+    this.baseUrl = options.baseUrl ?? 'https://docs.sim.ai'
   }
 
   /**
@@ -108,9 +115,7 @@ export class DocsChunker {
         metadata: {
           startIndex: chunkStart,
           endIndex: chunkEnd,
-          hasFrontmatter: i === 0 && content.startsWith('---'),
-          documentTitle: frontmatter.title,
-          documentDescription: frontmatter.description,
+          title: frontmatter.title,
         },
       }
 
@@ -200,7 +205,7 @@ export class DocsChunker {
     let relevantHeader: HeaderInfo | null = null
 
     for (const header of headers) {
-      if (header.position <= position) {
+      if (header.position !== undefined && header.position <= position) {
         relevantHeader = header
       } else {
         break
@@ -285,53 +290,6 @@ export class DocsChunker {
     return { data, content: markdownContent }
   }
 
-  /**
-   * Split content by headers to respect document structure
-   */
-  private splitByHeaders(
-    content: string
-  ): Array<{ header: string | null; content: string; level: number }> {
-    const lines = content.split('\n')
-    const sections: Array<{ header: string | null; content: string; level: number }> = []
-
-    let currentHeader: string | null = null
-    let currentLevel = 0
-    let currentContent: string[] = []
-
-    for (const line of lines) {
-      const headerMatch = line.match(/^(#{1,3})\s+(.+)$/) // Only split on H1-H3, not H4-H6
-
-      if (headerMatch) {
-        // Save previous section
-        if (currentContent.length > 0) {
-          sections.push({
-            header: currentHeader,
-            content: currentContent.join('\n').trim(),
-            level: currentLevel,
-          })
-        }
-
-        // Start new section
-        currentHeader = line
-        currentLevel = headerMatch[1].length
-        currentContent = []
-      } else {
-        currentContent.push(line)
-      }
-    }
-
-    // Add final section
-    if (currentContent.length > 0) {
-      sections.push({
-        header: currentHeader,
-        content: currentContent.join('\n').trim(),
-        level: currentLevel,
-      })
-    }
-
-    return sections.filter((section) => section.content.trim().length > 0)
-  }
-
   /**
    * Estimate token count (rough approximation)
    */
@@ -340,175 +298,6 @@ export class DocsChunker {
     return Math.ceil(text.length / 4)
   }
 
-  /**
-   * Merge small adjacent chunks to reach target size
-   */
-  private mergeSmallChunks(chunks: string[]): string[] {
-    const merged: string[] = []
-    let currentChunk = ''
-
-    for (const chunk of chunks) {
-      const currentTokens = this.estimateTokens(currentChunk)
-      const chunkTokens = this.estimateTokens(chunk)
-
-      // If adding this chunk would exceed target size, save current and start new
-      if (currentTokens > 0 && currentTokens + chunkTokens > 500) {
-        if (currentChunk.trim()) {
-          merged.push(currentChunk.trim())
-        }
-        currentChunk = chunk
-      } else {
-        // Merge with current chunk
-        currentChunk = currentChunk ? `${currentChunk}\n\n${chunk}` : chunk
-      }
-    }
-
-    // Add final chunk
-    if (currentChunk.trim()) {
-      merged.push(currentChunk.trim())
-    }
-
-    return merged
-  }
-
-  /**
-   * Chunk a section while preserving tables and structure
-   */
-  private async chunkSection(section: {
-    header: string | null
-    content: string
-    level: number
-  }): Promise<string[]> {
-    const content = section.content
-    const header = section.header
-
-    // Check if content contains tables
-    const hasTable = this.containsTable(content)
-
-    if (hasTable) {
-      // Split by tables and handle each part
-      return this.splitContentWithTables(content, header)
-    }
-    // Regular chunking for text-only content
-    const chunks = await this.textChunker.chunk(content)
-    return chunks.map((chunk, index) => {
-      // Add header to first chunk only
-      if (index === 0 && header) {
-        return `${header}\n\n${chunk.text}`.trim()
-      }
-      return chunk.text
-    })
-  }
-
-  /**
-   * Check if content contains markdown tables
-   */
-  private containsTable(content: string): boolean {
-    const lines = content.split('\n')
-    return lines.some((line, index) => {
-      if (line.includes('|') && line.split('|').length >= 3) {
-        const nextLine = lines[index + 1]
-        return nextLine?.includes('|') && nextLine.includes('-')
-      }
-      return false
-    })
-  }
-
-  /**
-   * Split content that contains tables, keeping tables intact
-   */
-  private splitContentWithTables(content: string, header: string | null): string[] {
-    const lines = content.split('\n')
-    const chunks: string[] = []
-    let currentChunk: string[] = []
-    let inTable = false
-    let tableLines: string[] = []
-
-    for (let i = 0; i < lines.length; i++) {
-      const line = lines[i]
-
-      // Detect table start
-      if (line.includes('|') && line.split('|').length >= 3 && !inTable) {
-        const nextLine = lines[i + 1]
-        if (nextLine?.includes('|') && nextLine.includes('-')) {
-          inTable = true
-
-          // Save current chunk if it has content
-          if (currentChunk.length > 0 && currentChunk.join('\n').trim().length > 50) {
-            const chunkText = currentChunk.join('\n').trim()
-            const withHeader =
-              chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
-            chunks.push(withHeader)
-            currentChunk = []
-          }
-
-          tableLines = [line]
-          continue
-        }
-      }
-
-      if (inTable) {
-        tableLines.push(line)
-
-        // Detect table end
-        if (!line.includes('|') || line.trim() === '') {
-          inTable = false
-
-          // Save table as its own chunk
-          const tableText = tableLines
-            .filter((l) => l.trim())
-            .join('\n')
-            .trim()
-          if (tableText.length > 0) {
-            const withHeader =
-              chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
-            chunks.push(withHeader)
-          }
-
-          tableLines = []
-
-          // Start new chunk if current line has content
-          if (line.trim() !== '') {
-            currentChunk = [line]
-          }
-        }
-      } else {
-        currentChunk.push(line)
-
-        // If chunk is getting large, save it
-        if (this.estimateTokens(currentChunk.join('\n')) > 250) {
-          const chunkText = currentChunk.join('\n').trim()
-          if (chunkText.length > 50) {
-            const withHeader =
-              chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
-            chunks.push(withHeader)
-          }
-          currentChunk = []
-        }
-      }
-    }
-
-    // Handle remaining content
-    if (inTable && tableLines.length > 0) {
-      const tableText = tableLines
-        .filter((l) => l.trim())
-        .join('\n')
-        .trim()
-      if (tableText.length > 0) {
-        const withHeader = chunks.length === 0 && header ? `${header}\n\n${tableText}` : tableText
-        chunks.push(withHeader)
-      }
-    } else if (currentChunk.length > 0) {
-      const chunkText = currentChunk.join('\n').trim()
-      if (chunkText.length > 50) {
-        const withHeader = chunks.length === 0 && header ? `${header}\n\n${chunkText}` : chunkText
-        chunks.push(withHeader)
-      }
-    }
-
-    return chunks.filter((chunk) => chunk.trim().length > 50)
-  }
-
   /**
    * Detect table boundaries in markdown content to avoid splitting them
    */
 
@@ -0,0 +1,7 @@
+// Re-export all chunkers and types for easy importing
+
+export { DocsChunker } from './docs-chunker'
+export { JsonYamlChunker } from './json-yaml-chunker'
+export { StructuredDataChunker } from './structured-data-chunker'
+export { TextChunker } from './text-chunker'
+export * from './types'
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@ export function UploadModal({`
`64`	`64`	`` return `File "${file.name}" is too large. Maximum size is 100MB.` ``
`65`	`65`	`}`
`66`	`66`	`if (!ACCEPTED_FILE_TYPES.includes(file.type)) {`
`67`		`` - return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, or HTML files.` ``
	`67`	`` + return `File "${file.name}" has an unsupported format. Please use PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML, or YML files.` ``
`68`	`68`	`}`
`69`	`69`	`return null`
`70`	`70`	`}`
`@@ -193,8 +193,8 @@ export function UploadModal({`
`193`	`193`	`{isDragging ? 'Drop files here!' : 'Drop files here or click to browse'}`
`194`	`194`	`</p>`
`195`	`195`	`<p className='text-muted-foreground text-xs'>`
`196`		`- Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML (max 100MB`
`197`		`- each)`
	`196`	`+ Supports PDF, DOC, DOCX, TXT, CSV, XLS, XLSX, MD, PPT, PPTX, HTML, JSON, YAML,`
	`197`	`+ YML (max 100MB each)`
`198`	`198`	`</p>`
`199`	`199`	`</div>`
`200`	`200`	`</div>`