Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GetFrameworkSelector ¶
GetFrameworkSelector returns the recommended selector for a known framework
func IsAutoSelector ¶
IsAutoSelector returns true if the selector value indicates auto-detection
Types ¶
type ContentDetector ¶
type ContentDetector struct {
// contains filtered or unexported fields
}
ContentDetector detects the appropriate content selector for a page
func NewContentDetector ¶
func NewContentDetector(log *logrus.Entry) *ContentDetector
NewContentDetector creates a new content detector with caching
func (*ContentDetector) Detect ¶
func (d *ContentDetector) Detect(doc *goquery.Document, pageURL *url.URL) DetectionResult
Detect determines the best content selector for the given document. It first checks the cache, then tries framework detection, and falls back to readability.
type DetectionResult ¶
type DetectionResult struct {
Framework Framework // Detected framework (or unknown)
Selector string // CSS selector for main content
Fallback bool // True if using readability fallback
}
DetectionResult contains the result of content selector detection
type FrameworkSignature ¶
type FrameworkSignature struct {
Framework Framework
Selector string // CSS selector for main content
Attributes []string // HTML attributes to look for (e.g., "data-docusaurus")
Classes []string // CSS classes to look for
Scripts []string // Script src patterns to look for
HTMLPatterns []string // Substring patterns to look for in raw HTML
}
FrameworkSignature defines detection patterns for a documentation framework
type ReadabilityExtractor ¶
type ReadabilityExtractor struct{}
ReadabilityExtractor extracts main content using Mozilla's Readability algorithm
func NewReadabilityExtractor ¶
func NewReadabilityExtractor() *ReadabilityExtractor
NewReadabilityExtractor creates a new readability-based content extractor
func (*ReadabilityExtractor) Extract ¶
func (r *ReadabilityExtractor) Extract(doc *goquery.Document, pageURL *url.URL) (*goquery.Selection, string, error)
Extract extracts the main content from an HTML document using readability. It returns the extracted content as a goquery Selection that can be processed like regular content.
func (*ReadabilityExtractor) ExtractText ¶
func (r *ReadabilityExtractor) ExtractText(doc *goquery.Document, pageURL *url.URL) (string, string, error)
ExtractText extracts just the text content (no HTML) using readability. Useful for search indexing or simple text output.
type SelectorCache ¶
type SelectorCache struct {
// contains filtered or unexported fields
}
SelectorCache caches detected selectors per domain to avoid repeated detection
func NewSelectorCache ¶
func NewSelectorCache() *SelectorCache
NewSelectorCache creates a new selector cache
func (*SelectorCache) Get ¶
func (c *SelectorCache) Get(domain string) (DetectionResult, bool)
Get retrieves a cached detection result for a domain
func (*SelectorCache) Set ¶
func (c *SelectorCache) Set(domain string, result DetectionResult)
Set stores a detection result for a domain
func (*SelectorCache) Size ¶
func (c *SelectorCache) Size() int
Size returns the number of cached entries