Documentation ¶
Index ¶
- type CSVLoader
- type CSVReaderLoader
- type CharacterChunker
- type Chunk
- type Chunker
- type DirectoryLoader
- type Document
- type HTMLLoader
- type HTMLReaderLoader
- type JSONLoader
- type JSONReaderLoader
- type Loader
- type MultiURLLoader
- type PDFDirectoryLoader
- type PDFLoader
- type PDFReaderLoader
- type ParagraphChunker
- type ReaderLoader
- type SentenceChunker
- type TextLoader
- type URLLoader
- type WebCrawler
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CSVLoader ¶ added in v1.1.0
type CSVLoader struct {
FilePath string
Delimiter rune // Default: ','
HasHeader bool // Whether first row is header
TextColumns []int // Indices of columns to include (nil = all)
RowsPerDoc int // Number of rows per document (0 = all in one doc)
ContentFormat string // "json" or "text" (default: "text")
}
CSVLoader loads documents from CSV files
func NewCSVLoader ¶ added in v1.1.0
NewCSVLoader creates a new CSV loader
type CSVReaderLoader ¶ added in v1.1.0
type CSVReaderLoader struct {
Reader io.Reader
ID string
Delimiter rune
HasHeader bool
RowsPerDoc int
ContentFormat string
Metadata map[string]interface{}
}
CSVReaderLoader loads CSV from an io.Reader
func NewCSVReaderLoader ¶ added in v1.1.0
func NewCSVReaderLoader(reader io.Reader, id string, metadata map[string]interface{}) *CSVReaderLoader
NewCSVReaderLoader creates a new CSV reader loader
func (*CSVReaderLoader) Load ¶ added in v1.1.0
func (l *CSVReaderLoader) Load() ([]Document, error)
Load loads CSV content from a reader
type CharacterChunker ¶
type CharacterChunker struct {
ChunkSize int // Number of characters per chunk
ChunkOverlap int // Number of characters to overlap between chunks
Separator string
}
CharacterChunker splits documents by character count
func NewCharacterChunker ¶
func NewCharacterChunker(chunkSize, chunkOverlap int) *CharacterChunker
NewCharacterChunker creates a new character-based chunker
type Chunk ¶
type Chunk struct {
ID string `json:"id"`
Content string `json:"content"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
Index int `json:"index"` // Position in original document
}
Chunk represents a chunk of a document
type DirectoryLoader ¶
type DirectoryLoader struct {
DirPath string
Pattern string // File pattern to match (e.g., "*.txt", "*.md")
Recursive bool // Whether to search subdirectories
// contains filtered or unexported fields
}
DirectoryLoader loads documents from a directory
func NewDirectoryLoader ¶
func NewDirectoryLoader(dirPath string, pattern string, recursive bool) *DirectoryLoader
NewDirectoryLoader creates a new directory loader
func (*DirectoryLoader) Load ¶
func (l *DirectoryLoader) Load() ([]Document, error)
Load loads all matching files from a directory
type Document ¶
type Document struct {
ID string `json:"id"`
Content string `json:"content"`
Metadata map[string]interface{} `json:"metadata,omitempty"`
Source string `json:"source,omitempty"` // File path or URL
}
Document represents a document with metadata
type HTMLLoader ¶ added in v1.1.0
type HTMLLoader struct {
FilePath string
RemoveScripts bool // Remove <script> tags
RemoveStyles bool // Remove <style> tags
ExtractMetaTags bool // Extract meta tags as metadata
Selectors []string // CSS selectors to extract specific content (nil = extract all)
PreserveLinks bool // Keep links in content
}
HTMLLoader loads documents from HTML files
func NewHTMLLoader ¶ added in v1.1.0
func NewHTMLLoader(filePath string) *HTMLLoader
NewHTMLLoader creates a new HTML loader
func (*HTMLLoader) Load ¶ added in v1.1.0
func (l *HTMLLoader) Load() ([]Document, error)
Load loads an HTML file
type HTMLReaderLoader ¶ added in v1.1.0
type HTMLReaderLoader struct {
Reader io.Reader
ID string
RemoveScripts bool
RemoveStyles bool
ExtractMetaTags bool
Selectors []string
PreserveLinks bool
Metadata map[string]interface{}
}
HTMLReaderLoader loads HTML from an io.Reader
func NewHTMLReaderLoader ¶ added in v1.1.0
func NewHTMLReaderLoader(reader io.Reader, id string, metadata map[string]interface{}) *HTMLReaderLoader
NewHTMLReaderLoader creates a new HTML reader loader
func (*HTMLReaderLoader) Load ¶ added in v1.1.0
func (l *HTMLReaderLoader) Load() ([]Document, error)
Load loads HTML content from a reader
type JSONLoader ¶ added in v1.1.0
type JSONLoader struct {
FilePath string
JSONPath string // JSONPath expression to extract content (optional)
ContentFields []string // Fields to use as content (if JSON is object/array of objects)
MetadataFields []string // Fields to include in metadata
TextTemplate string // Template for formatting content (e.g., "{title}: {body}")
}
JSONLoader loads documents from JSON files
func NewJSONLoader ¶ added in v1.1.0
func NewJSONLoader(filePath string) *JSONLoader
NewJSONLoader creates a new JSON loader
func (*JSONLoader) Load ¶ added in v1.1.0
func (l *JSONLoader) Load() ([]Document, error)
Load loads a JSON file
type JSONReaderLoader ¶ added in v1.1.0
type JSONReaderLoader struct {
Reader io.Reader
ID string
ContentFields []string
MetadataFields []string
Metadata map[string]interface{}
}
JSONReaderLoader loads JSON from an io.Reader
func NewJSONReaderLoader ¶ added in v1.1.0
func NewJSONReaderLoader(reader io.Reader, id string, metadata map[string]interface{}) *JSONReaderLoader
NewJSONReaderLoader creates a new JSON reader loader
func (*JSONReaderLoader) Load ¶ added in v1.1.0
func (l *JSONReaderLoader) Load() ([]Document, error)
Load loads JSON content from a reader
type MultiURLLoader ¶ added in v1.1.0
type MultiURLLoader struct {
URLs []string
Timeout time.Duration
MaxConcurrent int // Maximum concurrent requests (default: 5)
ContinueOnErr bool
CommonHeaders map[string]string
CommonMetadata map[string]interface{}
}
MultiURLLoader loads documents from multiple URLs
func NewMultiURLLoader ¶ added in v1.1.0
func NewMultiURLLoader(urls []string) *MultiURLLoader
NewMultiURLLoader creates a new multi-URL loader
func (*MultiURLLoader) Load ¶ added in v1.1.0
func (l *MultiURLLoader) Load() ([]Document, error)
Load fetches content from multiple URLs concurrently
type PDFDirectoryLoader ¶ added in v1.1.0
PDFDirectoryLoader loads all PDF files from a directory
func NewPDFDirectoryLoader ¶ added in v1.1.0
func NewPDFDirectoryLoader(dirPath string, recursive bool) *PDFDirectoryLoader
NewPDFDirectoryLoader creates a new PDF directory loader
func (*PDFDirectoryLoader) Load ¶ added in v1.1.0
func (l *PDFDirectoryLoader) Load() ([]Document, error)
Load loads all PDF files from a directory
type PDFLoader ¶ added in v1.1.0
type PDFLoader struct {
FilePath string
ExtractImages bool // Future: extract images from PDF
PageSeparator string
PreserveLayout bool // Try to preserve text layout
}
PDFLoader loads documents from PDF files
func NewPDFLoader ¶ added in v1.1.0
NewPDFLoader creates a new PDF loader
type PDFReaderLoader ¶ added in v1.1.0
PDFReaderLoader loads PDF from a byte slice (in-memory PDF)
func NewPDFReaderLoader ¶ added in v1.1.0
func NewPDFReaderLoader(data []byte, id string, metadata map[string]interface{}) *PDFReaderLoader
NewPDFReaderLoader creates a new PDF reader loader from bytes
func (*PDFReaderLoader) Load ¶ added in v1.1.0
func (l *PDFReaderLoader) Load() ([]Document, error)
Load loads PDF content from bytes
type ParagraphChunker ¶
type ParagraphChunker struct {
MaxChunkSize int // Maximum characters per chunk
}
ParagraphChunker splits documents by paragraphs
func NewParagraphChunker ¶
func NewParagraphChunker(maxChunkSize int) *ParagraphChunker
NewParagraphChunker creates a new paragraph-based chunker
type ReaderLoader ¶
ReaderLoader loads documents from an io.Reader
func NewReaderLoader ¶
func NewReaderLoader(reader io.Reader, id string, metadata map[string]interface{}) *ReaderLoader
NewReaderLoader creates a new reader loader
func (*ReaderLoader) Load ¶
func (l *ReaderLoader) Load() ([]Document, error)
Load loads content from a reader
type SentenceChunker ¶
type SentenceChunker struct {
MaxChunkSize int // Maximum characters per chunk
MinChunkSize int // Minimum characters per chunk
}
SentenceChunker splits documents by sentences
func NewSentenceChunker ¶
func NewSentenceChunker(maxChunkSize, minChunkSize int) *SentenceChunker
NewSentenceChunker creates a new sentence-based chunker
type TextLoader ¶
type TextLoader struct {
FilePath string
}
TextLoader loads documents from text files
func NewTextLoader ¶
func NewTextLoader(filePath string) *TextLoader
NewTextLoader creates a new text file loader
type URLLoader ¶ added in v1.1.0
type URLLoader struct {
URL string
Method string // HTTP method (default: GET)
Headers map[string]string // Custom headers
Timeout time.Duration // Request timeout (default: 30s)
FollowRedirect bool // Follow redirects (default: true)
UserAgent string // User agent string
ContentType string // Expected content type (html, json, pdf, text)
AutoDetect bool // Auto-detect content type from response
}
URLLoader loads documents from URLs (web pages, APIs, etc.)
func NewURLLoader ¶ added in v1.1.0
NewURLLoader creates a new URL loader
type WebCrawler ¶ added in v1.1.0
type WebCrawler struct {
StartURL string
MaxDepth int // Maximum crawl depth (default: 2)
MaxPages int // Maximum pages to crawl (default: 10)
SameDomain bool // Only crawl same domain (default: true)
IncludeFilter []string // URL patterns to include
ExcludeFilter []string // URL patterns to exclude
Timeout time.Duration // Request timeout per page
Headers map[string]string // Custom headers
}
WebCrawler crawls web pages starting from a URL
func NewWebCrawler ¶ added in v1.1.0
func NewWebCrawler(startURL string) *WebCrawler
NewWebCrawler creates a new web crawler
func (*WebCrawler) Load ¶ added in v1.1.0
func (c *WebCrawler) Load() ([]Document, error)
Load crawls web pages and loads them as documents. Note: This is a basic implementation. For production, consider using a dedicated crawler library.