processor

package
v0.0.0-...-2e8b532 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 28, 2025 | License: MIT | Imports: 11 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ContentProcessor

type ContentProcessor struct{}

ContentProcessor handles HTML content cleaning and processing.

func NewContentProcessor

func NewContentProcessor() *ContentProcessor

func (*ContentProcessor) ConvertRelativeURLs

func (cp *ContentProcessor) ConvertRelativeURLs(html, baseURL string) (string, error)

func (*ContentProcessor) ExtractContent

func (cp *ContentProcessor) ExtractContent(html string, selectors []string) (string, error)

ExtractContent extracts the main content from html using the given selectors.

func (*ContentProcessor) ExtractImages

func (cp *ContentProcessor) ExtractImages(html, baseURL string) ([]string, error)

func (*ContentProcessor) ExtractMetadata

func (cp *ContentProcessor) ExtractMetadata(html string) (string, *models.PageMeta, error)

func (*ContentProcessor) ProcessPage

func (cp *ContentProcessor) ProcessPage(page *models.Page, baseURL string) error

func (*ContentProcessor) RemoveUnwantedElements

func (cp *ContentProcessor) RemoveUnwantedElements(html string) (string, error)

type ContentValidator

type ContentValidator struct{}

func NewContentValidator

func NewContentValidator() *ContentValidator

func (*ContentValidator) ValidateContent

func (cv *ContentValidator) ValidateContent(page *models.Page) *ValidationResult

type HTMLToMarkdownConverter

type HTMLToMarkdownConverter struct{}

func NewHTMLToMarkdownConverter

func NewHTMLToMarkdownConverter() *HTMLToMarkdownConverter

func (*HTMLToMarkdownConverter) Convert

func (c *HTMLToMarkdownConverter) Convert(html string) (string, error)

type ValidationResult

type ValidationResult struct {
	IsValid     bool     `json:"is_valid"`
	Score       float64  `json:"score"`
	Issues      []string `json:"issues"`
	WordCount   int      `json:"word_count"`
	HasContent  bool     `json:"has_content"`
	HasHeadings bool     `json:"has_headings"`
}

type WorkerPool

type WorkerPool struct {
	// contains filtered or unexported fields
}

func NewWorkerPool

func NewWorkerPool(workerCount int) *WorkerPool

func (*WorkerPool) ProcessPages

func (wp *WorkerPool) ProcessPages(pages []*models.Page, baseURL string) <-chan *models.CrawlResult

func (*WorkerPool) Stop

func (wp *WorkerPool) Stop()

func (*WorkerPool) WorkerCount

func (wp *WorkerPool) WorkerCount() int

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL