Documentation ¶
Index ¶
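- type ContentProcessor
- func NewContentProcessor() *ContentProcessor
- func (cp *ContentProcessor) ConvertRelativeURLs(html, baseURL string) (string, error)
- func (cp *ContentProcessor) ExtractContent(html string, selectors []string) (string, error)
- func (cp *ContentProcessor) ExtractImages(html, baseURL string) ([]string, error)
- func (cp *ContentProcessor) ExtractMetadata(html string) (string, *models.PageMeta, error)
- func (cp *ContentProcessor) ProcessPage(page *models.Page, baseURL string) error
- func (cp *ContentProcessor) RemoveUnwantedElements(html string) (string, error)
- type ContentValidator
- func NewContentValidator() *ContentValidator
- func (cv *ContentValidator) ValidateContent(page *models.Page) *ValidationResult
- type HTMLToMarkdownConverter
- func NewHTMLToMarkdownConverter() *HTMLToMarkdownConverter
- type ValidationResult
- type WorkerPool
- func NewWorkerPool(workerCount int) *WorkerPool
- func (wp *WorkerPool) ProcessPages(pages []*models.Page, baseURL string) <-chan *models.CrawlResult
- func (wp *WorkerPool) Stop()
- func (wp *WorkerPool) WorkerCount() int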
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ContentProcessor ¶
type ContentProcessor struct{}
ContentProcessor handles HTML content cleaning and processing.
func NewContentProcessor ¶
func NewContentProcessor() *ContentProcessor
func (*ContentProcessor) ConvertRelativeURLs ¶
func (cp *ContentProcessor) ConvertRelativeURLs(html, baseURL string) (string, error)
func (*ContentProcessor) ExtractContent ¶
func (cp *ContentProcessor) ExtractContent(html string, selectors []string) (string, error)
ExtractContent extracts the main content from html using the given selectors.
func (*ContentProcessor) ExtractImages ¶
func (cp *ContentProcessor) ExtractImages(html, baseURL string) ([]string, error)
func (*ContentProcessor) ExtractMetadata ¶
func (cp *ContentProcessor) ExtractMetadata(html string) (string, *models.PageMeta, error)
func (*ContentProcessor) ProcessPage ¶
func (cp *ContentProcessor) ProcessPage(page *models.Page, baseURL string) error
func (*ContentProcessor) RemoveUnwantedElements ¶
func (cp *ContentProcessor) RemoveUnwantedElements(html string) (string, error)
type ContentValidator ¶
type ContentValidator struct{}
func NewContentValidator ¶
func NewContentValidator() *ContentValidator
func (*ContentValidator) ValidateContent ¶
func (cv *ContentValidator) ValidateContent(page *models.Page) *ValidationResult
type HTMLToMarkdownConverter ¶
type HTMLToMarkdownConverter struct{}
func NewHTMLToMarkdownConverter ¶
func NewHTMLToMarkdownConverter() *HTMLToMarkdownConverter
type ValidationResult ¶
type WorkerPool ¶
type WorkerPool struct {
// contains filtered or unexported fields
}
func NewWorkerPool ¶
func NewWorkerPool(workerCount int) *WorkerPool
func (*WorkerPool) ProcessPages ¶
func (wp *WorkerPool) ProcessPages(pages []*models.Page, baseURL string) <-chan *models.CrawlResult
func (*WorkerPool) Stop ¶
func (wp *WorkerPool) Stop()
func (*WorkerPool) WorkerCount ¶
func (wp *WorkerPool) WorkerCount() int
Click to show internal directories.
Click to hide internal directories.