parser

package
v0.0.0-...-d54656b Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 7, 2026 | License: MIT | Imports: 23 | Imported by: 0

Documentation

Index

Constants

View Source
// File type identifiers declared by this package. Every value is a short
// lowercase string; apart from FileTypeUnknown, each one is spelled like the
// conventional file extension without the leading dot.
// NOTE(review): presumably these are the values carried in
// ParseRequest.FileType and returned by DetectFileType — confirm against the
// implementation, which is not visible here.
const (
	// FileTypeUnknown is the identifier "unknown".
	FileTypeUnknown = "unknown"
	// Text-like formats.
	FileTypeTXT     = "txt"
	FileTypeMD      = "md"
	FileTypeCSV     = "csv"
	FileTypeHTML    = "html"
	FileTypeJSON    = "json"
	// YAML has one identifier per spelling ("yaml" and "yml").
	FileTypeYAML    = "yaml"
	FileTypeYML     = "yml"
	FileTypeEML     = "eml"
	FileTypeRTF     = "rtf"
	FileTypePDF     = "pdf"
	// Image formats; JPEG has one identifier per spelling ("jpg" and "jpeg").
	FileTypePNG     = "png"
	FileTypeJPG     = "jpg"
	FileTypeJPEG    = "jpeg"
	// Office document formats.
	FileTypeDOC     = "doc"
	FileTypeDOCX    = "docx"
	FileTypePPTX    = "pptx"
	FileTypeXLSX    = "xlsx"
)

Variables

View Source
// Package-level error values, each created with errors.New.
// NOTE(review): the functions that return them are not visible here;
// presumably callers are expected to match them with errors.Is — confirm.
var (
	// ErrUnsupportedFileType carries the message "unsupported file type".
	ErrUnsupportedFileType = errors.New("unsupported file type")
	// ErrEmptyInput carries the message "empty input".
	ErrEmptyInput          = errors.New("empty input")
)

Functions

func DetectFileType

func DetectFileType(req *ParseRequest) string

Types

type CSVParser

// CSVParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeCSV — confirm via SupportedTypes.
type CSVParser struct{}

func (*CSVParser) Parse

func (p *CSVParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*CSVParser) Provider

func (p *CSVParser) Provider() string

func (*CSVParser) SupportedTypes

func (p *CSVParser) SupportedTypes() []string

type DOCParser

// DOCParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeDOC — confirm via SupportedTypes.
type DOCParser struct{}

func (*DOCParser) Parse

func (p *DOCParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*DOCParser) Provider

func (p *DOCParser) Provider() string

func (*DOCParser) SupportedTypes

func (p *DOCParser) SupportedTypes() []string

type DOCXParser

// DOCXParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeDOCX — confirm via SupportedTypes.
type DOCXParser struct{}

func (*DOCXParser) Parse

func (p *DOCXParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*DOCXParser) Provider

func (p *DOCXParser) Provider() string

func (*DOCXParser) SupportedTypes

func (p *DOCXParser) SupportedTypes() []string

type EMLParser

// EMLParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeEML — confirm via SupportedTypes.
type EMLParser struct{}

func (*EMLParser) Parse

func (p *EMLParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*EMLParser) Provider

func (p *EMLParser) Provider() string

func (*EMLParser) SupportedTypes

func (p *EMLParser) SupportedTypes() []string

type HTMLParser

// HTMLParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeHTML — confirm via SupportedTypes.
type HTMLParser struct{}

func (*HTMLParser) Parse

func (p *HTMLParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*HTMLParser) Provider

func (p *HTMLParser) Provider() string

func (*HTMLParser) SupportedTypes

func (p *HTMLParser) SupportedTypes() []string

type JSONParser

// JSONParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeJSON — confirm via SupportedTypes.
type JSONParser struct{}

func (*JSONParser) Parse

func (p *JSONParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*JSONParser) Provider

func (p *JSONParser) Provider() string

func (*JSONParser) SupportedTypes

func (p *JSONParser) SupportedTypes() []string

type OCRParser

// OCRParser is a type whose pointer method set (Provider, SupportedTypes,
// Parse) matches the Parser interface. Unlike the other parser types listed
// on this page it has a field.
// NOTE(review): presumably handles the image types (FileTypePNG, FileTypeJPG,
// FileTypeJPEG) — confirm via SupportedTypes.
type OCRParser struct {
	// Language is a string setting on the parser.
	// NOTE(review): presumably selects the OCR recognition language; the
	// accepted values and the zero-value behavior are not shown — confirm.
	Language string
}

func (*OCRParser) Parse

func (p *OCRParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*OCRParser) Provider

func (p *OCRParser) Provider() string

func (*OCRParser) SupportedTypes

func (p *OCRParser) SupportedTypes() []string

type PDFParser

// PDFParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypePDF — confirm via SupportedTypes.
type PDFParser struct{}

func (*PDFParser) Parse

func (p *PDFParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*PDFParser) Provider

func (p *PDFParser) Provider() string

func (*PDFParser) SupportedTypes

func (p *PDFParser) SupportedTypes() []string

type PPTXParser

// PPTXParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypePPTX — confirm via SupportedTypes.
type PPTXParser struct{}

func (*PPTXParser) Parse

func (p *PPTXParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*PPTXParser) Provider

func (p *PPTXParser) Provider() string

func (*PPTXParser) SupportedTypes

func (p *PPTXParser) SupportedTypes() []string

type ParseOptions

// ParseOptions is the options value passed (by pointer, as opts) to every
// Parse method and to ParseAuto, ParseBytes and ParsePath.
// NOTE(review): the field notes below are inferred from names only; whether a
// nil *ParseOptions is accepted and what defaults apply is not shown — confirm.
type ParseOptions struct {
	// MaxTextLength is an integer limit.
	// NOTE(review): presumably caps the extracted text; unit (bytes vs runes)
	// and the meaning of 0 are not shown — confirm.
	MaxTextLength      int
	// NOTE(review): presumably toggles extraction of table content — confirm.
	IncludeTables      bool
	// NOTE(review): presumably toggles extraction of hidden content — confirm.
	IncludeHidden      bool
	// NOTE(review): presumably keeps line breaks in the extracted text — confirm.
	PreserveLineBreaks bool
}

type ParseRequest

// ParseRequest is the input value passed (by pointer, as req) to
// DetectFileType, ParseAuto, Router.Parse and every Parse method.
// NOTE(review): Path, Content and Reader look like alternative input sources;
// which one takes precedence when several are set is not shown — confirm.
type ParseRequest struct {
	// FileType is a string.
	// NOTE(review): presumably one of the FileType* constants — confirm.
	FileType    string
	// FileName is a string; ParseResult has a field of the same name.
	FileName    string
	// Path is a string.
	// NOTE(review): presumably a filesystem path to the input — confirm.
	Path        string
	// ContentType is a string.
	// NOTE(review): presumably a MIME type — confirm.
	ContentType string
	// Content holds input as a byte slice.
	Content     []byte
	// Reader holds input as an io.Reader.
	Reader      io.Reader
	// Metadata is a free-form map keyed by string.
	Metadata    map[string]any
}

type ParseResult

// ParseResult is the value returned (by pointer, alongside an error) from
// ParseAuto, ParseBytes, ParsePath, Router.Parse and every Parse method.
type ParseResult struct {
	// FileType is a string.
	// NOTE(review): presumably one of the FileType* constants — confirm.
	FileType string
	// FileName is a string; ParseRequest has a field of the same name.
	FileName string
	// Text is a string.
	// NOTE(review): presumably the full extracted text — confirm.
	Text     string
	// Sections is a slice of Section values.
	Sections []Section
	// Metadata is a free-form map keyed by string.
	Metadata map[string]any
	// ParsedAt is a timestamp.
	// NOTE(review): presumably when parsing finished; time zone not shown — confirm.
	ParsedAt time.Time
}

func ParseAuto

func ParseAuto(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func ParseBytes

func ParseBytes(ctx context.Context, fileName string, content []byte, opts *ParseOptions) (*ParseResult, error)

func ParsePath

func ParsePath(ctx context.Context, path string, opts *ParseOptions) (*ParseResult, error)

type Parser

// Parser is the interface accepted by NewRouter and Router.Register. Every
// *XxxParser type listed on this page declares these three methods.
type Parser interface {
	// Provider returns a string.
	// NOTE(review): presumably a name identifying the implementation — confirm.
	Provider() string

	// SupportedTypes returns a slice of strings.
	// NOTE(review): presumably FileType* identifiers the parser accepts — confirm.
	SupportedTypes() []string

	// Parse takes a context, a request and options and returns a
	// *ParseResult or an error.
	Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)
}

type RTFParser

// RTFParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeRTF — confirm via SupportedTypes.
type RTFParser struct{}

func (*RTFParser) Parse

func (p *RTFParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*RTFParser) Provider

func (p *RTFParser) Provider() string

func (*RTFParser) SupportedTypes

func (p *RTFParser) SupportedTypes() []string

type Router

// Router is an opaque type (no exported fields) obtained from DefaultRouter
// or NewRouter. It accepts Parser values through Register, which can return
// an error, and exposes a Parse method with the same signature as
// Parser.Parse.
// NOTE(review): presumably dispatches each request to a registered Parser by
// file type; the selection rule and the conditions under which Register fails
// are not shown — confirm.
type Router struct {
	// contains filtered or unexported fields
}

func DefaultRouter

func DefaultRouter() *Router

func NewRouter

func NewRouter(parsers ...Parser) *Router

func (*Router) Parse

func (r *Router) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*Router) Register

func (r *Router) Register(p Parser) error

type Section

// Section is the element type of ParseResult.Sections.
type Section struct {
	// Type is the section's SectionType.
	Type     SectionType
	// Index is an integer.
	// NOTE(review): presumably the section's position; whether it is 0- or
	// 1-based is not shown — confirm.
	Index    int
	// Title is a string.
	Title    string
	// Text is a string.
	// NOTE(review): presumably the text extracted for this section — confirm.
	Text     string
	// Metadata is a free-form map keyed by string.
	Metadata map[string]any
}

type SectionType

// SectionType is a string type used for the Section.Type field.
type SectionType string
// The SectionType values declared by this package.
// NOTE(review): which parser emits which value is not shown — confirm.
const (
	SectionTypeUnknown  SectionType = "unknown"
	SectionTypeDocument SectionType = "document"
	SectionTypePage     SectionType = "page"
	SectionTypeSheet    SectionType = "sheet"
	SectionTypeSlide    SectionType = "slide"
)

type TXTParser

// TXTParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeTXT — confirm via SupportedTypes.
type TXTParser struct{}

func (*TXTParser) Parse

func (p *TXTParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*TXTParser) Provider

func (p *TXTParser) Provider() string

func (*TXTParser) SupportedTypes

func (p *TXTParser) SupportedTypes() []string

type XLSXParser

// XLSXParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeXLSX — confirm via SupportedTypes.
type XLSXParser struct{}

func (*XLSXParser) Parse

func (p *XLSXParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*XLSXParser) Provider

func (p *XLSXParser) Provider() string

func (*XLSXParser) SupportedTypes

func (p *XLSXParser) SupportedTypes() []string

type YAMLParser

// YAMLParser is a field-less type whose pointer method set (Provider,
// SupportedTypes, Parse) matches the Parser interface.
// NOTE(review): presumably handles FileTypeYAML and FileTypeYML — confirm via
// SupportedTypes.
type YAMLParser struct{}

func (*YAMLParser) Parse

func (p *YAMLParser) Parse(ctx context.Context, req *ParseRequest, opts *ParseOptions) (*ParseResult, error)

func (*YAMLParser) Provider

func (p *YAMLParser) Provider() string

func (*YAMLParser) SupportedTypes

func (p *YAMLParser) SupportedTypes() []string

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL