parser

package
v0.0.18 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 15, 2025 License: AGPL-3.0 Imports: 23 Imported by: 0

Documentation

Index

Constants

View Source
const FeaturePDFEnabled = false

Variables

View Source
var ErrBadFile = errors.New("bad file or corrupted")
View Source
var ErrParserDisabled = errors.New("parser disabled")

Functions

This section is empty.

Types

type BMPParser

type BMPParser struct {
	// contains filtered or unexported fields
}

Parses `image/bmp` files

func NewBMPParser

func NewBMPParser(ocrProvider ocr.Provider) *BMPParser

func (*BMPParser) Parse

func (p *BMPParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*BMPParser) ParseStream added in v0.0.10

func (p *BMPParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*BMPParser) SupportedMimeTypes

func (p *BMPParser) SupportedMimeTypes() []string

type CompositeParser

type CompositeParser struct {
	// contains filtered or unexported fields
}

func NewCompositeParser

func NewCompositeParser(parsers ...Parser) *CompositeParser

func (*CompositeParser) AddParsers

func (p *CompositeParser) AddParsers(parsers ...Parser)

func (*CompositeParser) Parse

func (p *CompositeParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*CompositeParser) ParseStream added in v0.0.10

func (p *CompositeParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*CompositeParser) SupportedMimeTypes

func (p *CompositeParser) SupportedMimeTypes() []string

type CompositeParserResult

type CompositeParserResult struct {
	FullPath string `json:"path"`
	Err      error  `json:"error"`
	MimeType string `json:"mimeType"`
	Inner    Result `json:"inner"`
}

func (*CompositeParserResult) Error

func (r *CompositeParserResult) Error() error

func (*CompositeParserResult) Path added in v0.0.10

func (r *CompositeParserResult) Path() string

func (*CompositeParserResult) String

func (r *CompositeParserResult) String() string

func (*CompositeParserResult) Subfiles added in v0.0.10

func (r *CompositeParserResult) Subfiles() []Result

type CompositeParserStreamResult added in v0.0.10

type CompositeParserStreamResult struct {
	FullPath        string             `json:"path"`
	Text            string             `json:"text"`
	MimeType        string             `json:"mimeType"`
	Inner           StreamResult       `json:"inner"`
	CurrentStage    ParseProgressStage `json:"stage"`
	CurrentProgress uint8              `json:"progress"`
	Err             error              `json:"error"`
}

func (*CompositeParserStreamResult) Error added in v0.0.10

func (*CompositeParserStreamResult) Path added in v0.0.10

func (*CompositeParserStreamResult) Progress added in v0.0.10

func (r *CompositeParserStreamResult) Progress() uint8

func (*CompositeParserStreamResult) Stage added in v0.0.10

func (*CompositeParserStreamResult) String added in v0.0.10

func (r *CompositeParserStreamResult) String() string

func (*CompositeParserStreamResult) SubResult added in v0.0.10

type CompositeStreamResultIterator added in v0.0.18

type CompositeStreamResultIterator struct {
	// contains filtered or unexported fields
}

func (*CompositeStreamResultIterator) Close added in v0.0.18

func (i *CompositeStreamResultIterator) Close()

func (*CompositeStreamResultIterator) Current added in v0.0.18

func (*CompositeStreamResultIterator) Next added in v0.0.18

type EMLParser added in v0.0.10

type EMLParser struct {
	// contains filtered or unexported fields
}

Parses `message/rfc822` files (.eml)

func NewEMLParser added in v0.0.10

func NewEMLParser(innerParser Parser) *EMLParser

func (*EMLParser) Parse added in v0.0.10

func (p *EMLParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*EMLParser) ParseStream added in v0.0.10

func (p *EMLParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*EMLParser) SupportedMimeTypes added in v0.0.10

func (p *EMLParser) SupportedMimeTypes() []string

type EMLParserResult added in v0.0.10

type EMLParserResult struct {
	FullPath    string              `json:"path"`
	Headers     map[string][]string `json:"headers"`
	Text        string              `json:"text"`
	Err         error               `json:"error"`
	Attachments []Result            `json:"attachments"`
}

func (*EMLParserResult) Error added in v0.0.10

func (r *EMLParserResult) Error() error

func (*EMLParserResult) Path added in v0.0.10

func (r *EMLParserResult) Path() string

func (*EMLParserResult) String added in v0.0.10

func (r *EMLParserResult) String() string

func (*EMLParserResult) Subfiles added in v0.0.10

func (r *EMLParserResult) Subfiles() []Result

type EMLParserStreamResult added in v0.0.10

type EMLParserStreamResult struct {
	FullPath          string              `json:"path"`
	Text              string              `json:"text"`
	CurrentStage      ParseProgressStage  `json:"stage"`
	Headers           map[string][]string `json:"headers"`
	CurrentPartHeader mail.PartHeader     `json:"subResultHeader"`
	CurrentPart       StreamResult        `json:"subResult"`
	Err               error               `json:"error"`
}

func (*EMLParserStreamResult) Error added in v0.0.10

func (r *EMLParserStreamResult) Error() error

func (*EMLParserStreamResult) Path added in v0.0.10

func (r *EMLParserStreamResult) Path() string

func (*EMLParserStreamResult) Progress added in v0.0.10

func (r *EMLParserStreamResult) Progress() uint8

func (*EMLParserStreamResult) Stage added in v0.0.10

func (*EMLParserStreamResult) String added in v0.0.10

func (r *EMLParserStreamResult) String() string

func (*EMLParserStreamResult) SubResult added in v0.0.10

func (r *EMLParserStreamResult) SubResult() StreamResult

type EMLStreamResultIterator added in v0.0.18

type EMLStreamResultIterator struct {
	// contains filtered or unexported fields
}

func (*EMLStreamResultIterator) Close added in v0.0.18

func (i *EMLStreamResultIterator) Close()

func (*EMLStreamResultIterator) Current added in v0.0.18

func (*EMLStreamResultIterator) Next added in v0.0.18

type ErrMimeTypeNotSupported

type ErrMimeTypeNotSupported struct {
	MimeType *mimetype.MIME
}

func (*ErrMimeTypeNotSupported) Error

func (e *ErrMimeTypeNotSupported) Error() string

type GIFParser

type GIFParser struct {
	// contains filtered or unexported fields
}

Parses `image/gif` files. Only the first frame is decoded.

func NewGIFParser

func NewGIFParser(ocrProvider ocr.Provider) *GIFParser

func (*GIFParser) Parse

func (p *GIFParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*GIFParser) ParseStream added in v0.0.10

func (p *GIFParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*GIFParser) SupportedMimeTypes

func (p *GIFParser) SupportedMimeTypes() []string

type ImageParserResult added in v0.0.18

type ImageParserResult struct {
	FullPath string `json:"path"`
	Text     string `json:"text"`
	Err      error  `json:"error"`
}

func (*ImageParserResult) Error added in v0.0.18

func (r *ImageParserResult) Error() error

func (*ImageParserResult) Path added in v0.0.18

func (r *ImageParserResult) Path() string

func (*ImageParserResult) String added in v0.0.18

func (r *ImageParserResult) String() string

func (*ImageParserResult) Subfiles added in v0.0.18

func (r *ImageParserResult) Subfiles() []Result

type ImageParserStreamResult added in v0.0.18

type ImageParserStreamResult struct {
	FullPath        string             `json:"path"`
	Text            string             `json:"text"`
	CurrentStage    ParseProgressStage `json:"stage"`
	CurrentProgress uint8              `json:"progress"`
	Err             error              `json:"error"`
}

func (*ImageParserStreamResult) Error added in v0.0.18

func (r *ImageParserStreamResult) Error() error

func (*ImageParserStreamResult) Path added in v0.0.18

func (r *ImageParserStreamResult) Path() string

func (*ImageParserStreamResult) Progress added in v0.0.18

func (r *ImageParserStreamResult) Progress() uint8

func (*ImageParserStreamResult) Stage added in v0.0.18

func (*ImageParserStreamResult) String added in v0.0.18

func (r *ImageParserStreamResult) String() string

func (*ImageParserStreamResult) SubResult added in v0.0.18

func (r *ImageParserStreamResult) SubResult() StreamResult

type ImageStreamResultIterator added in v0.0.18

type ImageStreamResultIterator struct {
	// contains filtered or unexported fields
}

func (*ImageStreamResultIterator) Close added in v0.0.18

func (i *ImageStreamResultIterator) Close()

func (*ImageStreamResultIterator) Current added in v0.0.18

func (*ImageStreamResultIterator) Next added in v0.0.18

type JPEGParser

type JPEGParser struct {
	// contains filtered or unexported fields
}

Parses `image/jpeg` files

func NewJPEGParser

func NewJPEGParser(ocrProvider ocr.Provider) *JPEGParser

func (*JPEGParser) Parse

func (p *JPEGParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*JPEGParser) ParseStream added in v0.0.10

func (p *JPEGParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*JPEGParser) SupportedMimeTypes

func (p *JPEGParser) SupportedMimeTypes() []string

type PDFParser

type PDFParser struct {
}

Parses `application/pdf` files

func NewPDFParser

func NewPDFParser(innerParser Parser) *PDFParser

func (*PDFParser) Parse

func (p *PDFParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*PDFParser) ParseStream added in v0.0.10

func (p *PDFParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*PDFParser) SupportedMimeTypes

func (p *PDFParser) SupportedMimeTypes() []string

type PDFParserResult

type PDFParserResult struct {
	FullPath string   `json:"path"`
	Metadata string   `json:"metadata"`
	Pages    []string `json:"pages"`
	Err      error    `json:"error"`
}

func (*PDFParserResult) Error

func (r *PDFParserResult) Error() error

func (*PDFParserResult) Path added in v0.0.10

func (r *PDFParserResult) Path() string

func (*PDFParserResult) String

func (r *PDFParserResult) String() string

func (*PDFParserResult) Subfiles added in v0.0.10

func (r *PDFParserResult) Subfiles() []Result

type PDFParserStreamResult added in v0.0.10

type PDFParserStreamResult struct {
	FullPath        string             `json:"path"`
	CurrentStage    ParseProgressStage `json:"stage"`
	CurrentProgress uint8              `json:"progress"`
	Text            string             `json:"text"`
	Err             error              `json:"error"`
}

func (*PDFParserStreamResult) Error added in v0.0.10

func (r *PDFParserStreamResult) Error() error

func (*PDFParserStreamResult) Path added in v0.0.10

func (r *PDFParserStreamResult) Path() string

func (*PDFParserStreamResult) Progress added in v0.0.10

func (r *PDFParserStreamResult) Progress() uint8

func (*PDFParserStreamResult) Stage added in v0.0.10

func (*PDFParserStreamResult) String added in v0.0.10

func (r *PDFParserStreamResult) String() string

func (*PDFParserStreamResult) SubResult added in v0.0.10

func (r *PDFParserStreamResult) SubResult() StreamResult

type PDFStreamResultIterator added in v0.0.18

type PDFStreamResultIterator struct {
	// contains filtered or unexported fields
}

func (*PDFStreamResultIterator) Close added in v0.0.18

func (i *PDFStreamResultIterator) Close()

func (*PDFStreamResultIterator) Current added in v0.0.18

func (*PDFStreamResultIterator) Next added in v0.0.18

type PNGParser

type PNGParser struct {
	// contains filtered or unexported fields
}

Parses `image/png` files

func NewPNGParser

func NewPNGParser(ocrProvider ocr.Provider) *PNGParser

func (*PNGParser) Parse

func (p *PNGParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*PNGParser) ParseStream added in v0.0.10

func (p *PNGParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*PNGParser) SupportedMimeTypes

func (p *PNGParser) SupportedMimeTypes() []string

type ParseProgressStage added in v0.0.10

type ParseProgressStage string
const ProgressCompleted ParseProgressStage = "COMPLETED"

Raised at the end of file parsing.

const ProgressNew ParseProgressStage = "NEW"
const ProgressUpdate ParseProgressStage = "UPDATE"

Indicates that

type Parser

type Parser interface {
	// Returns the list of MIME types supported by this parser.
	SupportedMimeTypes() []string
	// Parse file. Thread safe. Use path to track subfiles or use file name as hint for mime type detection.
	Parse(ctx context.Context, file io.Reader, path string) Result
	// Parse file. Thread safe. Use path to track subfiles or use file name as hint for mime type detection. Returns an iterator that streams results.
	ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator
}

func New

func New(ocrProvider ocr.Provider) Parser

Builds a parser with all possible file types included.

type RAWBGRAParser added in v0.0.8

type RAWBGRAParser struct {
	// contains filtered or unexported fields
}

Parses internal `image/file2llm-raw-bgra` streams

func NewRAWBGRAParser added in v0.0.8

func NewRAWBGRAParser(ocrProvider ocr.Provider) *RAWBGRAParser

func (*RAWBGRAParser) Parse added in v0.0.8

func (p *RAWBGRAParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*RAWBGRAParser) ParseStream added in v0.0.10

func (p *RAWBGRAParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*RAWBGRAParser) SupportedMimeTypes added in v0.0.8

func (p *RAWBGRAParser) SupportedMimeTypes() []string

type Result

type Result interface {
	// Get full path to the file
	Path() string
	// Convert entire result to LLM readable string
	String() string
	// Non-nil if there was an error
	Error() error
	// Parsed subfiles. For example files inside archives
	Subfiles() []Result
}

Parsing result

type StreamResult added in v0.0.10

type StreamResult interface {
	// Get full path to the file
	Path() string
	// Current file processing progress
	Stage() ParseProgressStage
	// Progress as a percentage, from 0 to 100
	Progress() uint8
	// Underlying result
	SubResult() StreamResult
	// Convert entire result to LLM readable string
	String() string
	// Non-nil if there was an error
	Error() error
}

type StreamResultIterator added in v0.0.18

type StreamResultIterator interface {
	// Blocks until the next stream result is available or the context is done. Returns false if no result is available.
	Next(ctx context.Context) bool
	// Return current stream result
	Current() StreamResult
	// Free all the associated resources
	Close()
}

type TARParser added in v0.0.10

type TARParser struct {
	// contains filtered or unexported fields
}

func NewTARParser added in v0.0.10

func NewTARParser(innerParser Parser) *TARParser

func (*TARParser) Parse added in v0.0.10

func (p *TARParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*TARParser) ParseStream added in v0.0.10

func (p *TARParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*TARParser) SupportedMimeTypes added in v0.0.10

func (p *TARParser) SupportedMimeTypes() []string

type TARParserResult added in v0.0.10

type TARParserResult struct {
	FullPath        string   `json:"path"`
	SubfilesResults []Result `json:"subfiles"`
	Err             error    `json:"error"`
}

func (*TARParserResult) Error added in v0.0.10

func (r *TARParserResult) Error() error

func (*TARParserResult) Path added in v0.0.10

func (r *TARParserResult) Path() string

func (*TARParserResult) String added in v0.0.10

func (r *TARParserResult) String() string

func (*TARParserResult) Subfiles added in v0.0.10

func (r *TARParserResult) Subfiles() []Result

type TARParserStreamResult added in v0.0.10

type TARParserStreamResult struct {
	FullPath       string             `json:"path"`
	CurrentStage   ParseProgressStage `json:"stage"`
	CurrentSubfile StreamResult       `json:"subResult"`
	Err            error              `json:"error"`
}

func (*TARParserStreamResult) Error added in v0.0.10

func (r *TARParserStreamResult) Error() error

func (*TARParserStreamResult) Path added in v0.0.10

func (r *TARParserStreamResult) Path() string

func (*TARParserStreamResult) Progress added in v0.0.10

func (r *TARParserStreamResult) Progress() uint8

func (*TARParserStreamResult) Stage added in v0.0.10

func (*TARParserStreamResult) String added in v0.0.10

func (r *TARParserStreamResult) String() string

func (*TARParserStreamResult) SubResult added in v0.0.10

func (r *TARParserStreamResult) SubResult() StreamResult

type TARStreamResultIterator added in v0.0.18

type TARStreamResultIterator struct {
	// contains filtered or unexported fields
}

func (*TARStreamResultIterator) Close added in v0.0.18

func (i *TARStreamResultIterator) Close()

func (*TARStreamResultIterator) Current added in v0.0.18

func (*TARStreamResultIterator) Next added in v0.0.18

type TiffParser

type TiffParser struct {
	// contains filtered or unexported fields
}

Parses `image/tiff` files

func NewTiffParser

func NewTiffParser(ocrProvider ocr.Provider) *TiffParser

func (*TiffParser) Parse

func (p *TiffParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*TiffParser) ParseStream added in v0.0.10

func (p *TiffParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*TiffParser) SupportedMimeTypes

func (p *TiffParser) SupportedMimeTypes() []string

type WebPParser

type WebPParser struct {
	// contains filtered or unexported fields
}

Parses `image/webp` files

func NewWebPParser

func NewWebPParser(ocrProvider ocr.Provider) *WebPParser

func (*WebPParser) Parse

func (p *WebPParser) Parse(ctx context.Context, file io.Reader, path string) Result

func (*WebPParser) ParseStream added in v0.0.10

func (p *WebPParser) ParseStream(ctx context.Context, file io.Reader, path string) StreamResultIterator

func (*WebPParser) SupportedMimeTypes

func (p *WebPParser) SupportedMimeTypes() []string

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL