Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ComplexityScore ¶
type ComplexityScore struct {
HasTables bool
HasImages bool
IsMultiCol bool
FontVariety int // number of distinct fonts
Score float64 // 0.0 = simple text, 1.0 = highly complex
}
ComplexityScore represents the structural complexity of a PDF file, as returned by DetectComplexity.
func DetectComplexity ¶
func DetectComplexity(path string) (*ComplexityScore, error)
DetectComplexity analyzes a PDF file for structural complexity.
func (*ComplexityScore) IsComplex ¶
func (cs *ComplexityScore) IsComplex() bool
IsComplex reports whether the PDF should be routed to vision processing.
type DOCXParser ¶
type DOCXParser struct{}
func (*DOCXParser) Parse ¶
func (p *DOCXParser) Parse(ctx context.Context, path string) (*ParseResult, error)
func (*DOCXParser) SupportedFormats ¶
func (p *DOCXParser) SupportedFormats() []string
type LegacyParser ¶
type LegacyParser struct{}
LegacyParser routes legacy binary formats to an external service.
func (*LegacyParser) Parse ¶
func (p *LegacyParser) Parse(ctx context.Context, path string) (*ParseResult, error)
func (*LegacyParser) SupportedFormats ¶
func (p *LegacyParser) SupportedFormats() []string
type LlamaParseConfig ¶
type LlamaParseParser ¶
type LlamaParseParser struct {
// contains filtered or unexported fields
}
func NewLlamaParseParser ¶
func NewLlamaParseParser(cfg LlamaParseConfig) *LlamaParseParser
func (*LlamaParseParser) Parse ¶
func (p *LlamaParseParser) Parse(ctx context.Context, path string) (*ParseResult, error)
func (*LlamaParseParser) SupportedFormats ¶
func (p *LlamaParseParser) SupportedFormats() []string
type PDFParser ¶
type PDFParser struct{}
func (*PDFParser) SupportedFormats ¶
type PDFVisionParser ¶
type PDFVisionParser struct {
// contains filtered or unexported fields
}
PDFVisionParser uses a vision LLM to extract text from complex PDF pages (tables, diagrams, multi-column layouts).
func NewPDFVisionParser ¶
func NewPDFVisionParser(provider llm.VisionProvider) *PDFVisionParser
func (*PDFVisionParser) Parse ¶
func (p *PDFVisionParser) Parse(ctx context.Context, path string) (*ParseResult, error)
func (*PDFVisionParser) SupportedFormats ¶
func (p *PDFVisionParser) SupportedFormats() []string
type PPTXParser ¶
type PPTXParser struct{}
func (*PPTXParser) Parse ¶
func (p *PPTXParser) Parse(ctx context.Context, path string) (*ParseResult, error)
func (*PPTXParser) SupportedFormats ¶
func (p *PPTXParser) SupportedFormats() []string
type ParseResult ¶
type ParseResult struct {
Sections []Section // Ordered sections extracted from the document
Method string // "native", "llamaparse", "vision"
Metadata map[string]string
}
ParseResult is what a parser produces from a document file.
type Parser ¶
type Parser interface {
Parse(ctx context.Context, path string) (*ParseResult, error)
SupportedFormats() []string
}
Parser parses document files in the formats listed by SupportedFormats.
type Registry ¶
type Registry struct {
// contains filtered or unexported fields
}
func NewRegistry ¶
func NewRegistry() *Registry
func (*Registry) SetLlamaParse ¶
func (r *Registry) SetLlamaParse(cfg LlamaParseConfig)
type Section ¶
type Section struct {
Heading string
Content string
Level int // Heading level (1=top, 2=sub, etc.)
PageNumber int
Type string // "section", "table", "definition", "requirement", "paragraph"
Children []Section
Metadata map[string]string
}
Section represents a logical section of a parsed document.
type XLSXParser ¶
type XLSXParser struct{}
func (*XLSXParser) Parse ¶
func (p *XLSXParser) Parse(ctx context.Context, path string) (*ParseResult, error)
func (*XLSXParser) SupportedFormats ¶
func (p *XLSXParser) SupportedFormats() []string
Click to show internal directories.
Click to hide internal directories.