parser

package
v0.0.0-...-50e7da2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 5, 2026 License: GPL-3.0 Imports: 12 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func StructuredDataToItem

func StructuredDataToItem(results []StructuredData, sourceURL string) *types.Item

StructuredDataToItem converts structured data results into a types.Item.

Types

type AutoSelectorGenerator

type AutoSelectorGenerator struct {
	// contains filtered or unexported fields
}

AutoSelectorGenerator automatically generates CSS selectors for elements.

func NewAutoSelectorGenerator

func NewAutoSelectorGenerator(logger *slog.Logger) *AutoSelectorGenerator

NewAutoSelectorGenerator creates a new auto-selector generator.

func (*AutoSelectorGenerator) GenerateForElement

func (asg *AutoSelectorGenerator) GenerateForElement(resp *types.Response, basicSelector string) ([]SelectorCandidate, error)

GenerateForElement generates selectors for a specific element matched by a basic selector.

func (*AutoSelectorGenerator) GenerateForText

func (asg *AutoSelectorGenerator) GenerateForText(resp *types.Response, text string) ([]SelectorCandidate, error)

GenerateForText finds elements containing the given text and generates selectors for them.

type CSSParser

type CSSParser struct {
	// contains filtered or unexported fields
}

CSSParser extracts data using CSS selectors via goquery.

func NewCSSParser

func NewCSSParser(logger *slog.Logger) *CSSParser

NewCSSParser creates a new CSS selector parser.

func (*CSSParser) Parse

func (p *CSSParser) Parse(resp *types.Response, rules []config.ParseRule) ([]*types.Item, []string, error)

Parse implements Parser.

type CompositeParser

type CompositeParser struct {
	// contains filtered or unexported fields
}

CompositeParser combines multiple parser implementations. It delegates to the appropriate parser based on rule type.

func NewCompositeParser

func NewCompositeParser(logger *slog.Logger) *CompositeParser

NewCompositeParser creates a parser that handles CSS, regex, and XPath rules.

func (*CompositeParser) Parse

func (p *CompositeParser) Parse(resp *types.Response, rules []config.ParseRule) ([]*types.Item, []string, error)

Parse implements Parser by delegating to sub-parsers.

type DOMTraverser

type DOMTraverser struct {
	// contains filtered or unexported fields
}

DOMTraverser provides parent/child/sibling DOM navigation.

func NewDOMTraverser

func NewDOMTraverser(logger *slog.Logger) *DOMTraverser

NewDOMTraverser creates a new DOM traversal helper.

func (*DOMTraverser) ExtractList

func (dt *DOMTraverser) ExtractList(resp *types.Response, listSelector string) ([]string, error)

ExtractList extracts list items (li) from a list (ul/ol).

func (*DOMTraverser) ExtractTable

func (dt *DOMTraverser) ExtractTable(resp *types.Response, tableSelector string) ([][]string, error)

ExtractTable parses an HTML table into a 2D string array.

func (*DOMTraverser) FindChildren

func (dt *DOMTraverser) FindChildren(resp *types.Response, selector, childSelector string) ([]TraversalResult, error)

FindChildren navigates to direct children of matched elements.

func (*DOMTraverser) FindClosest

func (dt *DOMTraverser) FindClosest(resp *types.Response, startSelector, ancestorSelector string) ([]TraversalResult, error)

FindClosest traverses up the tree finding the first ancestor matching the selector.

func (*DOMTraverser) FindParent

func (dt *DOMTraverser) FindParent(resp *types.Response, selector string, levels int) ([]TraversalResult, error)

FindParent navigates to the parent element of matches.

func (*DOMTraverser) FindSiblings

func (dt *DOMTraverser) FindSiblings(resp *types.Response, selector string, direction string) ([]TraversalResult, error)

FindSiblings finds sibling elements of matched elements.

type Parser

type Parser interface {
	// Parse extracts items and follow-up URLs from a response.
	// It returns scraped items, discovered links, and any error.
	Parse(resp *types.Response, rules []config.ParseRule) ([]*types.Item, []string, error)
}

Parser extracts data and links from a fetched response.

type RegexParser

type RegexParser struct {
	// contains filtered or unexported fields
}

RegexParser extracts data using regular expressions.

func NewRegexParser

func NewRegexParser(logger *slog.Logger) *RegexParser

NewRegexParser creates a new regex parser.

func (*RegexParser) Parse

func (p *RegexParser) Parse(resp *types.Response, rules []config.ParseRule) ([]*types.Item, []string, error)

Parse implements Parser for regex rules.

type SelectorCandidate

type SelectorCandidate struct {
	Selector    string  `json:"selector"`
	Specificity int     `json:"specificity"` // Higher = more specific
	MatchCount  int     `json:"match_count"` // How many elements this matches
	Score       float64 `json:"score"`       // Confidence score (0-1)
}

SelectorCandidate represents a generated selector with a confidence score.

type SimilarElement

type SimilarElement struct {
	Selector   string  `json:"selector"`
	Text       string  `json:"text"`
	Similarity float64 `json:"similarity"` // 0 to 1
	Tag        string  `json:"tag"`
}

SimilarElement represents an element found via similarity matching.

type SmartTracker

type SmartTracker struct {
	// contains filtered or unexported fields
}

SmartTracker tracks elements across page changes using multiple strategies. It can relocate content even when a site's HTML structure changes.

func NewSmartTracker

func NewSmartTracker(logger *slog.Logger) *SmartTracker

NewSmartTracker creates a new smart element tracker.

func (*SmartTracker) FindSimilar

func (st *SmartTracker) FindSimilar(resp *types.Response, selector string, maxResults int) ([]SimilarElement, error)

FindSimilar finds elements with similar structure/content to a given element.

func (*SmartTracker) Relocate

func (st *SmartTracker) Relocate(resp *types.Response, name string) (string, *goquery.Selection, error)

Relocate tries to find a previously snapshotted element on a new page. It uses multiple strategies: original selector, ID, class, text content, path.

func (*SmartTracker) TakeSnapshot

func (st *SmartTracker) TakeSnapshot(resp *types.Response, selector string, name string) error

TakeSnapshot saves the current state of an element for later relocation.

type StructuredData

type StructuredData struct {
	Type StructuredDataType `json:"type"`
	Data map[string]any     `json:"data"`
	Raw  string             `json:"raw,omitempty"`
}

StructuredData represents extracted structured data from a page.

type StructuredDataExtractor

type StructuredDataExtractor struct {
	// contains filtered or unexported fields
}

StructuredDataExtractor extracts JSON-LD, Microdata, OpenGraph, etc.

func NewStructuredDataExtractor

func NewStructuredDataExtractor(logger *slog.Logger) *StructuredDataExtractor

NewStructuredDataExtractor creates a new structured data extractor.

func (*StructuredDataExtractor) Extract

func (sde *StructuredDataExtractor) Extract(resp *types.Response) ([]StructuredData, error)

Extract finds and parses all structured data in a response.

type StructuredDataType

type StructuredDataType string

StructuredDataType identifies the type of structured data.

const (
	JSONLD      StructuredDataType = "json-ld"
	Microdata   StructuredDataType = "microdata"
	OpenGraph   StructuredDataType = "opengraph"
	TwitterCard StructuredDataType = "twitter_card"
	RDFa        StructuredDataType = "rdfa"
	MetaTags    StructuredDataType = "meta"
)

type TraversalResult

type TraversalResult struct {
	Text      string
	HTML      string
	Attribute string
	Tag       string
	Children  []TraversalResult
}

TraversalResult holds the result of a DOM traversal operation.

type XPathParser

type XPathParser struct {
	// contains filtered or unexported fields
}

XPathParser extracts data using XPath expressions.

func NewXPathParser

func NewXPathParser(logger *slog.Logger) *XPathParser

NewXPathParser creates a new XPath parser.

func (*XPathParser) Parse

func (p *XPathParser) Parse(resp *types.Response, rules []config.ParseRule) ([]*types.Item, []string, error)

Parse implements Parser for XPath rules.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL