Documentation ¶
Index ¶
- func StructuredDataToItem(results []StructuredData, sourceURL string) *types.Item
- type AutoSelectorGenerator
- type CSSParser
- type CompositeParser
- type DOMTraverser
- func (dt *DOMTraverser) ExtractList(resp *types.Response, listSelector string) ([]string, error)
- func (dt *DOMTraverser) ExtractTable(resp *types.Response, tableSelector string) ([][]string, error)
- func (dt *DOMTraverser) FindChildren(resp *types.Response, selector, childSelector string) ([]TraversalResult, error)
- func (dt *DOMTraverser) FindClosest(resp *types.Response, startSelector, ancestorSelector string) ([]TraversalResult, error)
- func (dt *DOMTraverser) FindParent(resp *types.Response, selector string, levels int) ([]TraversalResult, error)
- func (dt *DOMTraverser) FindSiblings(resp *types.Response, selector string, direction string) ([]TraversalResult, error)
- type Parser
- type RegexParser
- type SelectorCandidate
- type SimilarElement
- type SmartTracker
- func (st *SmartTracker) FindSimilar(resp *types.Response, selector string, maxResults int) ([]SimilarElement, error)
- func (st *SmartTracker) Relocate(resp *types.Response, name string) (string, *goquery.Selection, error)
- func (st *SmartTracker) TakeSnapshot(resp *types.Response, selector string, name string) error
- type StructuredData
- type StructuredDataExtractor
- type StructuredDataType
- type TraversalResult
- type XPathParser
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func StructuredDataToItem ¶
func StructuredDataToItem(results []StructuredData, sourceURL string) *types.Item
StructuredDataToItem converts structured data results into a types.Item.
Types ¶
type AutoSelectorGenerator ¶
type AutoSelectorGenerator struct {
// contains filtered or unexported fields
}
AutoSelectorGenerator automatically generates CSS selectors for elements.
func NewAutoSelectorGenerator ¶
func NewAutoSelectorGenerator(logger *slog.Logger) *AutoSelectorGenerator
NewAutoSelectorGenerator creates a new auto-selector generator.
func (*AutoSelectorGenerator) GenerateForElement ¶
func (asg *AutoSelectorGenerator) GenerateForElement(resp *types.Response, basicSelector string) ([]SelectorCandidate, error)
GenerateForElement generates selectors for a specific element matched by a basic selector.
func (*AutoSelectorGenerator) GenerateForText ¶
func (asg *AutoSelectorGenerator) GenerateForText(resp *types.Response, text string) ([]SelectorCandidate, error)
GenerateForText finds elements containing the given text and generates selectors for them.
type CSSParser ¶
type CSSParser struct {
// contains filtered or unexported fields
}
CSSParser extracts data using CSS selectors via goquery.
func NewCSSParser ¶
NewCSSParser creates a new CSS selector parser.
type CompositeParser ¶
type CompositeParser struct {
// contains filtered or unexported fields
}
CompositeParser combines multiple parser implementations. It delegates to the appropriate parser based on rule type.
func NewCompositeParser ¶
func NewCompositeParser(logger *slog.Logger) *CompositeParser
NewCompositeParser creates a parser that handles CSS, regex, and XPath rules.
type DOMTraverser ¶
type DOMTraverser struct {
// contains filtered or unexported fields
}
DOMTraverser provides parent/child/sibling DOM navigation.
func NewDOMTraverser ¶
func NewDOMTraverser(logger *slog.Logger) *DOMTraverser
NewDOMTraverser creates a new DOM traversal helper.
func (*DOMTraverser) ExtractList ¶
func (dt *DOMTraverser) ExtractList(resp *types.Response, listSelector string) ([]string, error)
ExtractList extracts list items (li) from a list (ul/ol).
func (*DOMTraverser) ExtractTable ¶
func (dt *DOMTraverser) ExtractTable(resp *types.Response, tableSelector string) ([][]string, error)
ExtractTable parses an HTML table into a 2D string array.
func (*DOMTraverser) FindChildren ¶
func (dt *DOMTraverser) FindChildren(resp *types.Response, selector, childSelector string) ([]TraversalResult, error)
FindChildren navigates to direct children of matched elements.
func (*DOMTraverser) FindClosest ¶
func (dt *DOMTraverser) FindClosest(resp *types.Response, startSelector, ancestorSelector string) ([]TraversalResult, error)
FindClosest traverses up the tree finding the first ancestor matching the selector.
func (*DOMTraverser) FindParent ¶
func (dt *DOMTraverser) FindParent(resp *types.Response, selector string, levels int) ([]TraversalResult, error)
FindParent navigates to the parent element of matches.
func (*DOMTraverser) FindSiblings ¶
func (dt *DOMTraverser) FindSiblings(resp *types.Response, selector string, direction string) ([]TraversalResult, error)
FindSiblings finds sibling elements of matched elements.
type Parser ¶
type Parser interface {
// Parse extracts items and follow-up URLs from a response.
// It returns scraped items, discovered links, and any error.
Parse(resp *types.Response, rules []config.ParseRule) ([]*types.Item, []string, error)
}
Parser extracts data and links from a fetched response.
type RegexParser ¶
type RegexParser struct {
// contains filtered or unexported fields
}
RegexParser extracts data using regular expressions.
func NewRegexParser ¶
func NewRegexParser(logger *slog.Logger) *RegexParser
NewRegexParser creates a new regex parser.
type SelectorCandidate ¶
type SelectorCandidate struct {
Selector string `json:"selector"`
Specificity int `json:"specificity"` // Higher = more specific
MatchCount int `json:"match_count"` // How many elements this matches
Score float64 `json:"score"` // Confidence score (0-1)
}
SelectorCandidate represents a generated selector with a confidence score.
type SimilarElement ¶
type SimilarElement struct {
Selector string `json:"selector"`
Text string `json:"text"`
Similarity float64 `json:"similarity"` // 0 to 1
Tag string `json:"tag"`
}
SimilarElement represents an element found via similarity matching.
type SmartTracker ¶
type SmartTracker struct {
// contains filtered or unexported fields
}
SmartTracker tracks elements across page changes using multiple strategies. It can relocate content even when a site's HTML structure changes.
func NewSmartTracker ¶
func NewSmartTracker(logger *slog.Logger) *SmartTracker
NewSmartTracker creates a new smart element tracker.
func (*SmartTracker) FindSimilar ¶
func (st *SmartTracker) FindSimilar(resp *types.Response, selector string, maxResults int) ([]SimilarElement, error)
FindSimilar finds elements with similar structure/content to a given element.
func (*SmartTracker) Relocate ¶
func (st *SmartTracker) Relocate(resp *types.Response, name string) (string, *goquery.Selection, error)
Relocate tries to find a previously snapshotted element on a new page. It uses multiple strategies: original selector, ID, class, text content, path.
func (*SmartTracker) TakeSnapshot ¶
func (st *SmartTracker) TakeSnapshot(resp *types.Response, selector string, name string) error
TakeSnapshot saves the current state of an element for later relocation.
type StructuredData ¶
type StructuredData struct {
Type StructuredDataType `json:"type"`
Data map[string]any `json:"data"`
Raw string `json:"raw,omitempty"`
}
StructuredData represents extracted structured data from a page.
type StructuredDataExtractor ¶
type StructuredDataExtractor struct {
// contains filtered or unexported fields
}
StructuredDataExtractor extracts JSON-LD, Microdata, OpenGraph, etc.
func NewStructuredDataExtractor ¶
func NewStructuredDataExtractor(logger *slog.Logger) *StructuredDataExtractor
NewStructuredDataExtractor creates a new structured data extractor.
func (*StructuredDataExtractor) Extract ¶
func (sde *StructuredDataExtractor) Extract(resp *types.Response) ([]StructuredData, error)
Extract finds and parses all structured data in a response.
type StructuredDataType ¶
type StructuredDataType string
StructuredDataType identifies the type of structured data.
const (
	JSONLD      StructuredDataType = "json-ld"
	Microdata   StructuredDataType = "microdata"
	OpenGraph   StructuredDataType = "opengraph"
	TwitterCard StructuredDataType = "twitter_card"
	RDFa        StructuredDataType = "rdfa"
	MetaTags    StructuredDataType = "meta"
)
type TraversalResult ¶
type TraversalResult struct {
Text string
HTML string
Attribute string
Tag string
Children []TraversalResult
}
TraversalResult holds the result of a DOM traversal operation.
type XPathParser ¶
type XPathParser struct {
// contains filtered or unexported fields
}
XPathParser extracts data using XPath expressions.
func NewXPathParser ¶
func NewXPathParser(logger *slog.Logger) *XPathParser
NewXPathParser creates a new XPath parser.