parser

package
v0.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 4, 2026 License: Apache-2.0 Imports: 18 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ComplexityScore

type ComplexityScore struct {
	HasTables   bool
	HasImages   bool
	IsMultiCol  bool
	FontVariety int     // number of distinct fonts
	Score       float64 // 0.0 = simple text, 1.0 = highly complex
}

ComplexityScore represents the structural complexity of a PDF page.

func DetectComplexity

func DetectComplexity(path string) (*ComplexityScore, error)

DetectComplexity analyzes a PDF file for structural complexity.

func (*ComplexityScore) IsComplex

func (cs *ComplexityScore) IsComplex() bool

IsComplex reports whether the PDF should be routed to vision processing.

type DOCXParser

type DOCXParser struct{}

func (*DOCXParser) Parse

func (p *DOCXParser) Parse(ctx context.Context, path string) (*ParseResult, error)

func (*DOCXParser) SupportedFormats

func (p *DOCXParser) SupportedFormats() []string

type LegacyParser

type LegacyParser struct{}

LegacyParser routes legacy binary formats to an external service.

func (*LegacyParser) Parse

func (p *LegacyParser) Parse(ctx context.Context, path string) (*ParseResult, error)

func (*LegacyParser) SupportedFormats

func (p *LegacyParser) SupportedFormats() []string

type LlamaParseConfig

type LlamaParseConfig struct {
	APIKey  string
	BaseURL string
}

type LlamaParseParser

type LlamaParseParser struct {
	// contains filtered or unexported fields
}

func NewLlamaParseParser

func NewLlamaParseParser(cfg LlamaParseConfig) *LlamaParseParser

func (*LlamaParseParser) Parse

func (p *LlamaParseParser) Parse(ctx context.Context, path string) (*ParseResult, error)

func (*LlamaParseParser) SupportedFormats

func (p *LlamaParseParser) SupportedFormats() []string

type PDFParser

type PDFParser struct{}

func (*PDFParser) Parse

func (p *PDFParser) Parse(ctx context.Context, path string) (*ParseResult, error)

func (*PDFParser) SupportedFormats

func (p *PDFParser) SupportedFormats() []string

type PDFVisionParser

type PDFVisionParser struct {
	// contains filtered or unexported fields
}

PDFVisionParser uses a vision LLM to extract text from complex PDF pages (tables, diagrams, multi-column layouts).

func NewPDFVisionParser

func NewPDFVisionParser(provider llm.VisionProvider) *PDFVisionParser

func (*PDFVisionParser) Parse

func (p *PDFVisionParser) Parse(ctx context.Context, path string) (*ParseResult, error)

func (*PDFVisionParser) SupportedFormats

func (p *PDFVisionParser) SupportedFormats() []string

type PPTXParser

type PPTXParser struct{}

func (*PPTXParser) Parse

func (p *PPTXParser) Parse(ctx context.Context, path string) (*ParseResult, error)

func (*PPTXParser) SupportedFormats

func (p *PPTXParser) SupportedFormats() []string

type ParseResult

type ParseResult struct {
	Sections []Section // Ordered sections extracted from the document
	Method   string    // "native", "llamaparse", "vision"
	Metadata map[string]string
}

ParseResult is what a parser produces from a document file.

type Parser

type Parser interface {
	Parse(ctx context.Context, path string) (*ParseResult, error)
	SupportedFormats() []string
}

Parser can parse a specific document format.

type Registry

type Registry struct {
	// contains filtered or unexported fields
}

func NewRegistry

func NewRegistry() *Registry

func (*Registry) Get

func (r *Registry) Get(format string) (Parser, error)

func (*Registry) Register

func (r *Registry) Register(format string, p Parser)

func (*Registry) SetLlamaParse

func (r *Registry) SetLlamaParse(cfg LlamaParseConfig)

type Section

type Section struct {
	Heading    string
	Content    string
	Level      int // Heading level (1=top, 2=sub, etc.)
	PageNumber int
	Type       string // "section", "table", "definition", "requirement", "paragraph"
	Children   []Section
	Metadata   map[string]string
}

Section represents a logical section of a parsed document.

type XLSXParser

type XLSXParser struct{}

func (*XLSXParser) Parse

func (p *XLSXParser) Parse(ctx context.Context, path string) (*ParseResult, error)

func (*XLSXParser) SupportedFormats

func (p *XLSXParser) SupportedFormats() []string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL