detect

package

v1.3.3 Latest Go to latest Published: Feb 21, 2026 License: Apache-2.0 Imports: 8 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/Sriram-PR/doc-scraper

Links

Open Source Insights

Documentation ¶

Index ¶

func GetFrameworkSelector(fw Framework) string
func IsAutoSelector(selector string) bool
type ContentDetector
- func NewContentDetector(log *logrus.Entry) *ContentDetector
- func (d *ContentDetector) Detect(doc *goquery.Document, pageURL *url.URL) DetectionResult
type DetectionResult
type Framework
type FrameworkSignature
- func (sig *FrameworkSignature) Matches(doc *goquery.Document, html string) bool
type ReadabilityExtractor
- func NewReadabilityExtractor() *ReadabilityExtractor
- func (r *ReadabilityExtractor) Extract(doc *goquery.Document, pageURL *url.URL) (*goquery.Selection, string, error)
- func (r *ReadabilityExtractor) ExtractText(doc *goquery.Document, pageURL *url.URL) (string, string, error)
type SelectorCache
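- func NewSelectorCache() *SelectorCache
- func (c *SelectorCache) Clear()
- func (c *SelectorCache) Get(domain string) (DetectionResult, bool)
- func (c *SelectorCache) Set(domain string, result DetectionResult)
- func (c *SelectorCache) Size() int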

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func GetFrameworkSelector ¶

func GetFrameworkSelector(fw Framework) string

GetFrameworkSelector returns the recommended selector for a known framework

func IsAutoSelector ¶

func IsAutoSelector(selector string) bool

IsAutoSelector returns true if the selector value indicates auto-detection

Types ¶

type ContentDetector ¶

type ContentDetector struct {
	// contains filtered or unexported fields
}

ContentDetector detects the appropriate content selector for a page

func NewContentDetector ¶

func NewContentDetector(log *logrus.Entry) *ContentDetector

NewContentDetector creates a new content detector with caching

func (*ContentDetector) Detect ¶

func (d *ContentDetector) Detect(doc *goquery.Document, pageURL *url.URL) DetectionResult

Detect determines the best content selector for the given document. It first checks the cache, then tries framework detection, and falls back to readability.

type DetectionResult ¶

type DetectionResult struct {
	Framework Framework // Detected framework (or unknown)
	Selector  string    // CSS selector for main content
	Fallback  bool      // True if using readability fallback
}

DetectionResult contains the result of content selector detection

type Framework ¶

type Framework string

Framework represents a detected documentation framework

const (
	FrameworkUnknown     Framework = "unknown"
	FrameworkDocusaurus  Framework = "docusaurus"
	FrameworkMkDocs      Framework = "mkdocs"
	FrameworkSphinx      Framework = "sphinx"
	FrameworkGitBook     Framework = "gitbook"
	FrameworkReadTheDocs Framework = "readthedocs"
)

type FrameworkSignature ¶

type FrameworkSignature struct {
	Framework    Framework
	Selector     string   // CSS selector for main content
	Attributes   []string // HTML attributes to look for (e.g., "data-docusaurus")
	Classes      []string // CSS classes to look for
	Scripts      []string // Script src patterns to look for
	HTMLPatterns []string // Substring patterns to look for in raw HTML
}

FrameworkSignature defines detection patterns for a documentation framework

func (*FrameworkSignature) Matches ¶

func (sig *FrameworkSignature) Matches(doc *goquery.Document, html string) bool

Matches returns true if the document matches this framework's signature

type ReadabilityExtractor ¶

type ReadabilityExtractor struct{}

ReadabilityExtractor extracts main content using Mozilla's Readability algorithm

func NewReadabilityExtractor ¶

func NewReadabilityExtractor() *ReadabilityExtractor

NewReadabilityExtractor creates a new readability-based content extractor

func (*ReadabilityExtractor) Extract ¶

func (r *ReadabilityExtractor) Extract(doc *goquery.Document, pageURL *url.URL) (*goquery.Selection, string, error)

Extract extracts the main content from an HTML document using readability. It returns the extracted content as a goquery Selection that can be processed like regular content.

func (*ReadabilityExtractor) ExtractText ¶

func (r *ReadabilityExtractor) ExtractText(doc *goquery.Document, pageURL *url.URL) (string, string, error)

ExtractText extracts just the text content (no HTML) using readability. Useful for search indexing or simple text output.

type SelectorCache ¶

type SelectorCache struct {
	// contains filtered or unexported fields
}

SelectorCache caches detected selectors per domain to avoid repeated detection

func NewSelectorCache ¶

func NewSelectorCache() *SelectorCache

NewSelectorCache creates a new selector cache

func (*SelectorCache) Clear ¶

func (c *SelectorCache) Clear()

Clear removes all cached entries

func (*SelectorCache) Get ¶

func (c *SelectorCache) Get(domain string) (DetectionResult, bool)

Get retrieves a cached detection result for a domain

func (*SelectorCache) Set ¶

func (c *SelectorCache) Set(domain string, result DetectionResult)

Set stores a detection result for a domain

func (*SelectorCache) Size ¶

func (c *SelectorCache) Size() int

Size returns the number of cached entries

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL