Documentation
¶
Index ¶
- type HtmlStreamParser
- func DefaultHtmlStreamParser() *HtmlStreamParser
- func (p *HtmlStreamParser) GetSupportedTypes() []string
- func (p *HtmlStreamParser) Parse(ctx context.Context, content []byte, metadata map[string]any) (*core.Document, error)
- func (p *HtmlStreamParser) ParseStream(ctx context.Context, r io.Reader, metadata map[string]any) (<-chan *core.Document, error)
- func (p *HtmlStreamParser) SetChunkOverlap(overlap int)
- func (p *HtmlStreamParser) SetChunkSize(size int)
- func (p *HtmlStreamParser) SetCleanScripts(clean bool)
- func (p *HtmlStreamParser) SetCleanStyles(clean bool)
- func (p *HtmlStreamParser) SetExtractLinks(extract bool)
- func (p *HtmlStreamParser) Supports(contentType string) bool
- type Parser
- func DefaultParser() *Parser
- func (p *Parser) Parse(ctx context.Context, r io.Reader) ([]core.Chunk, error)
- func (p *Parser) ParseWithCallback(ctx context.Context, r io.Reader, callback func(core.Chunk) error) error
- func (p *Parser) SetChunkOverlap(overlap int)
- func (p *Parser) SetChunkSize(size int)
- func (p *Parser) SetCleanScripts(clean bool)
- func (p *Parser) SetCleanStyles(clean bool)
- func (p *Parser) SetExtractLinks(extract bool)
- func (p *Parser) SupportedFormats() []string
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type HtmlStreamParser ¶
type HtmlStreamParser struct {
// contains filtered or unexported fields
}
func DefaultHtmlStreamParser ¶ added in v1.1.3
func DefaultHtmlStreamParser() *HtmlStreamParser
DefaultHtmlStreamParser creates a new HTML stream parser
func (*HtmlStreamParser) GetSupportedTypes ¶
func (p *HtmlStreamParser) GetSupportedTypes() []string
GetSupportedTypes returns the supported file formats
func (*HtmlStreamParser) Parse ¶
func (p *HtmlStreamParser) Parse(ctx context.Context, content []byte, metadata map[string]any) (*core.Document, error)
Parse implements the core.Parser interface.
func (*HtmlStreamParser) ParseStream ¶
func (p *HtmlStreamParser) ParseStream(ctx context.Context, r io.Reader, metadata map[string]any) (<-chan *core.Document, error)
ParseStream implements the core.Parser interface
func (*HtmlStreamParser) SetChunkOverlap ¶
func (p *HtmlStreamParser) SetChunkOverlap(overlap int)
SetChunkOverlap sets the chunk overlap
func (*HtmlStreamParser) SetChunkSize ¶
func (p *HtmlStreamParser) SetChunkSize(size int)
SetChunkSize sets the chunk size
func (*HtmlStreamParser) SetCleanScripts ¶
func (p *HtmlStreamParser) SetCleanScripts(clean bool)
SetCleanScripts sets whether to remove <script> tags
func (*HtmlStreamParser) SetCleanStyles ¶
func (p *HtmlStreamParser) SetCleanStyles(clean bool)
SetCleanStyles sets whether to remove <style> tags
func (*HtmlStreamParser) SetExtractLinks ¶
func (p *HtmlStreamParser) SetExtractLinks(extract bool)
SetExtractLinks sets whether to extract links
func (*HtmlStreamParser) Supports ¶
func (p *HtmlStreamParser) Supports(contentType string) bool
type Parser ¶
type Parser struct {
// contains filtered or unexported fields
}
Parser implements an HTML document parser
func DefaultParser ¶ added in v1.1.3
func DefaultParser() *Parser
DefaultParser creates a new HTML parser
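func (*Parser) Parse ¶
func (p *Parser) Parse(ctx context.Context, r io.Reader) ([]core.Chunk, error)
func (*Parser) ParseWithCallback ¶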
func (p *Parser) ParseWithCallback(ctx context.Context, r io.Reader, callback func(core.Chunk) error) error
ParseWithCallback parses HTML and calls the callback for each chunk
func (*Parser) SetChunkOverlap ¶
func (p *Parser) SetChunkOverlap(overlap int)
SetChunkOverlap sets the chunk overlap
func (*Parser) SetChunkSize ¶
func (p *Parser) SetChunkSize(size int)
SetChunkSize sets the chunk size
func (*Parser) SetCleanScripts ¶
func (p *Parser) SetCleanScripts(clean bool)
SetCleanScripts sets whether to remove <script> tags
func (*Parser) SetCleanStyles ¶
func (p *Parser) SetCleanStyles(clean bool)
SetCleanStyles sets whether to remove <style> tags
func (*Parser) SetExtractLinks ¶
func (p *Parser) SetExtractLinks(extract bool)
SetExtractLinks sets whether to extract links
func (*Parser) SupportedFormats ¶
func (p *Parser) SupportedFormats() []string
SupportedFormats returns supported formats