scrape

package
v0.5.42
Published: Feb 24, 2026 License: GPL-3.0 Imports: 33 Imported by: 0

Documentation

Index

Constants

const (
	// UnitSeparator separates siblings within a single matched element (entire_subtree).
	UnitSeparator = "\x1f"
	// RecordSeparator separates values from multiple matched elements (all_nodes).
	RecordSeparator = "\x1e"
	// GroupSeparator reserved for future use (groups of records).
	GroupSeparator = "\x1d"
)

ASCII separator characters for unambiguous field/record delimiting. These never appear in HTML content, unlike \n and \t.
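As a sketch of how these delimiters compose, the following hypothetical consumer-side helper (not part of the package) splits a raw extracted value back into records and siblings:

```go
package main

import (
	"fmt"
	"strings"
)

// Mirror of the package's separator constants.
const (
	UnitSeparator   = "\x1f" // siblings within one matched element
	RecordSeparator = "\x1e" // values from distinct matched elements
)

// splitRecords decomposes a raw extracted value into a list of
// records, each a list of sibling values.
func splitRecords(raw string) [][]string {
	var out [][]string
	for _, rec := range strings.Split(raw, RecordSeparator) {
		out = append(out, strings.Split(rec, UnitSeparator))
	}
	return out
}

func main() {
	// Two matched elements; the first had two sibling text nodes.
	raw := "Alice" + UnitSeparator + "Bob" + RecordSeparator + "Carol"
	fmt.Println(splitRecords(raw)) // [[Alice Bob] [Carol]]
}
```

Because \x1e and \x1f never occur in HTML text, the two splits are unambiguous, which \n and \t could not guarantee.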

const FieldNameFormat = "F<hash>[-<attr>]-<textnode>"

FieldNameFormat documents the goskyr field naming convention.

Format: F<hash>[-<attr>]-<textnode>

Components:

F           - Literal prefix identifying as goskyr field
<hash>      - 8-char hex hash of normalized selector path (CRC32)
<attr>      - Optional: attribute name (href, src, datetime, etc.)
              Empty string for text content, shown as double hyphen "--"
<textnode>  - 0-based text node index within element

Examples:

Fa1b2c3d4--0       - Text content, first text node
Fa1b2c3d4--1       - Text content, second text node
Fa1b2c3d4-href-0   - href attribute value
Fa1b2c3d4-src-0    - src attribute value
Fa1b2c3d4-datetime-0 - datetime attribute value

Stability Guarantee:

Same selector path always produces same hash.
The hash is computed using CRC32-IEEE on the normalized path string.

Note: The text node index is the index of the text node within the element, NOT the position of the item in a list. All items on a list page will have identical field keys.
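A minimal sketch of the naming scheme, assuming the hash is CRC32-IEEE over the (already normalized) selector path; the package's actual normalization step is internal and not reproduced here:

```go
package main

import (
	"fmt"
	"hash/crc32"
)

// fieldName builds F<hash>[-<attr>]-<textnode>. An empty attr yields
// the documented double hyphen "--".
func fieldName(selectorPath, attr string, textNodeIndex int) string {
	h := crc32.ChecksumIEEE([]byte(selectorPath))
	return fmt.Sprintf("F%08x-%s-%d", h, attr, textNodeIndex)
}

func main() {
	fmt.Println(fieldName("div.event > span.title", "", 0)) // text content
	fmt.Println(fieldName("div.event > a", "href", 0))      // attribute value
}
```

Because CRC32 is deterministic, rerunning a scraper against the same selector path always reproduces the same field key, which is what makes the stability guarantee below possible.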

Variables

var DateRE = regexp.MustCompile(`(?i)\b(2024|2025|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b`)
var DateTimeFieldName = "Pdate_time_tz_ranges"
var DateTimeFieldSuffix = "__" + DateTimeFieldName
var DebugGQFind = true
var DoDebug = true
var FieldPartSeparator = "\n"

FieldPartSeparator is used to join multiple parts when extracting field values. Using newline preserves text structure and helps with date parsing.

var KeepSubURLScheme = map[string]bool{
	"http":  true,
	"https": true,
}
var SkipSubURLExt = map[string]bool{
	".gif":  true,
	".jfif": true,
	".jpeg": true,
	".jpg":  true,
	".mp4":  true,
	".pdf":  true,
	".png":  true,
	".webp": true,
	".zip":  true,
}
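A sketch of how these maps might gate which sub-URLs are followed to detail pages; followable is hypothetical, and only a subset of the extensions is repeated here:

```go
package main

import (
	"fmt"
	"net/url"
	"path"
	"strings"
)

// Subsets of the package's KeepSubURLScheme and SkipSubURLExt maps.
var keepScheme = map[string]bool{"http": true, "https": true}
var skipExt = map[string]bool{".gif": true, ".jpg": true, ".pdf": true, ".png": true, ".zip": true}

// followable reports whether a sub-URL looks worth following to a
// detail page: an http(s) scheme and no binary/media file extension.
func followable(raw string) bool {
	u, err := url.Parse(raw)
	if err != nil || !keepScheme[u.Scheme] {
		return false
	}
	return !skipExt[strings.ToLower(path.Ext(u.Path))]
}

func main() {
	fmt.Println(followable("https://example.com/event/1"))    // true
	fmt.Println(followable("https://example.com/poster.pdf")) // false
	fmt.Println(followable("mailto:hi@example.com"))          // false
}
```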
var SkipTag = map[string]bool{
	"noscript": true,
	"script":   true,
	"style":    true,
}
var TitleFieldName = "Atitle"
var TitleFieldSuffix = "__" + TitleFieldName
var URLFieldName = "Aurl"
var URLFieldSuffix = "__" + URLFieldName

Functions

func ApplyDerivedFields added in v0.5.39

func ApplyDerivedFields(derivedFields []DerivedField, rec map[string]interface{}) error

ApplyDerivedFields processes all derived fields for a record

func ComputeFieldHash added in v0.5.39

func ComputeFieldHash(selectorPath string) string

ComputeFieldHash returns a stable 8-char hex hash for a selector path. Uses CRC32-IEEE which produces 32-bit (8 hex char) hashes.

The hash is deterministic: same selector path always produces same hash.

func DebugDateTime added in v0.5.39

func DebugDateTime(args ...any)

func DetailPages added in v0.5.39

func DetailPages(ctx context.Context, cache fetch.Cache, c *Config, s *Scraper, recs output.Records, domain string) error

DetailPages follows URL fields in records to scrape detail pages and merge the extracted data back into the original records.

func GQDocument

func GQDocument(ctx context.Context, c *Config, s *Scraper, gqdoc *fetch.Document) (output.Records, error)

GQDocument fetches and returns all records from a website according to the Scraper's parameters. When rawDyn is set to true, the records returned are not processed according to their type; instead the raw values, based only on the location, are returned (ignore regex_extract??). Only dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not detail pages). This is used by the ML feature generation.

func GQSelection

func GQSelection(ctx context.Context, c *Config, s *Scraper, sel *fetch.Selection, baseURL string) (output.Record, error)

GQSelection fetches and returns a record from a website according to the Scraper's parameters. When rawDyn is set to true, the record returned is not processed according to its type; instead the raw value, based only on the location, is returned (ignore regex_extract??). Only dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not detail pages). This is used by the ML feature generation.

func GenerateFieldName added in v0.5.39

func GenerateFieldName(selectorPath, attr string, textNodeIndex int) string

GenerateFieldName creates a field name from selector path, attribute, and text node index.

Parameters:

  • selectorPath: The DOM selector path (e.g., "div.event > span.title")
  • attr: Attribute name ("href", "src", etc.) or empty string for text content
  • textNodeIndex: 0-based index of text node within element

Returns field name in format: F<hash>[-<attr>]-<textnode>

func GetTextStringAndURL added in v0.5.39

func GetTextStringAndURL(e *ElementLocation, sel *fetch.Selection, baseURL string) (string, *url.URL, error)

GetTextStringAndURL extracts text or attribute value from an element and resolves it as a URL relative to the base URL.

func IsGoskyrFieldName added in v0.5.39

func IsGoskyrFieldName(name string) bool

IsGoskyrFieldName checks if a string is a valid goskyr field name.

func Page

func Page(ctx context.Context, cache fetch.Cache, c *Config, s *Scraper, globalConfig *GlobalConfig, rawDyn bool, path string) (output.Records, error)

Page fetches and returns all records from a webpage according to the Scraper's parameters. When rawDyn is set to true, the records returned are not processed according to their type; instead the raw values, based only on the location, are returned (ignore regex_extract??). Only dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not detail pages). This is used by the ML feature generation.

func SubGQDocument

func SubGQDocument(ctx context.Context, c *Config, s *Scraper, rec output.Record, fname string, gqdoc *fetch.Document) error

SubGQDocument scrapes a detail page document and merges the extracted fields into the parent record with field names prefixed by the source field name.

Types

type Config

type Config struct {
	ID       ConfigID
	Writer   output.WriterConfig `yaml:"writer,omitempty"`
	Scrapers []Scraper           `yaml:"scrapers,omitempty"`
	Global   GlobalConfig        `yaml:"global,omitempty"`
	Records  output.Records
}

Config defines the overall structure of the scraper configuration. Values will be taken from a config yml file or environment variables or both.

func ReadConfig

func ReadConfig(configPath string) (*Config, error)

ReadConfig reads a scraper configuration from a YAML file or directory of YAML files.

func (Config) Copy

func (c Config) Copy() *Config

Copy creates a deep copy of the Config including all records.

func (Config) String

func (c Config) String() string

String converts a Config to its YAML representation, excluding records.

func (Config) WriteToFile

func (c Config) WriteToFile(dir string) error

WriteToFile writes the Config to a YAML file and optionally writes records to a JSON file in the specified directory.

type ConfigID

type ConfigID struct {
	Slug  string
	ID    string
	Field string
	SubID string
}

func (ConfigID) String

func (cid ConfigID) String() string

String converts a ConfigID to its string representation by joining its components with underscores, creating a hierarchical identifier.

type DateComponent

type DateComponent struct {
	Covers          date.CoveredDateParts `yaml:"covers"`
	ElementLocation ElementLocation       `yaml:"location"`
	Layout          []string              `yaml:"layout"`
	Transform       []TransformConfig     `yaml:"transform,omitempty"`
}

A DateComponent is used to find a specific part of a date within an HTML document.

type DerivedField added in v0.5.39

type DerivedField struct {
	Source   string          `yaml:"source"`   // source field name
	Template string          `yaml:"template"` // "{name} | {date}" - delimiter-based
	Regex    string          `yaml:"regex"`    // fallback: named capture groups (?P<name>...)
	Outputs  []DerivedOutput `yaml:"outputs"`
	// contains filtered or unexported fields
}

DerivedField creates new fields from existing ones via templates or regex

func (*DerivedField) Extract added in v0.5.39

func (df *DerivedField) Extract(input string) (map[string]string, error)

Extract parses the input string and returns symbol values

func (*DerivedField) Initialize added in v0.5.39

func (df *DerivedField) Initialize() error

Initialize prepares the DerivedField for use by compiling patterns

type DerivedOutput added in v0.5.39

type DerivedOutput struct {
	Symbol    string           `yaml:"symbol"`
	Target    string           `yaml:"target"`
	Condition *OutputCondition `yaml:"condition,omitempty"`
	Value     string           `yaml:"value,omitempty"` // override extracted value
}

DerivedOutput maps a template symbol to a target field
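Put together, a derived_fields entry might look like the following in a scraper config. This is illustrative only, built from the YAML tags above; the field names and values are made up:

```yaml
derived_fields:
  - source: Fa1b2c3d4--0        # e.g. "Open Mic | Fri 7pm"
    template: "{name} | {when}"
    outputs:
      - symbol: name
        target: Atitle
      - symbol: when
        target: raw_when
        condition:
          not_equals: "TBA"
```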

type ElementLocation

type ElementLocation struct {
	Selector      string      `yaml:"selector,omitempty"`
	JsonSelector  string      `yaml:"json_selector,omitempty"`
	ChildIndex    int         `yaml:"child_index,omitempty"`
	RegexExtract  RegexConfig `yaml:"regex_extract,omitempty"`
	Attr          string      `yaml:"attr,omitempty"`
	MaxLength     int         `yaml:"max_length,omitempty"`
	EntireSubtree bool        `yaml:"entire_subtree,omitempty"`
	AllNodes      bool        `yaml:"all_nodes,omitempty"`
	Separator     string      `yaml:"separator,omitempty"`      // Intra-node sibling separator (default: \x1F)
	NodeSeparator string      `yaml:"node_separator,omitempty"` // Inter-node separator (default: \x1E)
}

ElementLocation is used to find a specific string in an HTML document.
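For example, an attribute extraction with a post-extraction regex might be configured as follows (illustrative values, built from the YAML tags above):

```yaml
location:
  selector: div.event > a.more
  attr: href              # omit attr to extract text content
  regex_extract:
    exp: "[0-9]+"
    index: 0
  max_length: 200
```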

type ElementLocations

type ElementLocations []ElementLocation

func (*ElementLocations) UnmarshalYAML

func (e *ElementLocations) UnmarshalYAML(value *yaml.Node) error

UnmarshalYAML handles YAML unmarshalling for ElementLocations, accepting either a single ElementLocation or a list of ElementLocations.

type FetchConfig added in v0.5.39

type FetchConfig struct {
	UseJavascript          bool   `yaml:"use_javascript,omitempty"`           // Enable headless browser
	WaitSelector           string `yaml:"wait_selector,omitempty"`            // CSS selector to wait for
	WaitTimeoutMs          int    `yaml:"wait_timeout_ms,omitempty"`          // Timeout for wait (default 30000)
	FerretQL               string `yaml:"ferret_ql,omitempty"`                // Custom FerretQL script (advanced)
	Script                 string `yaml:"script,omitempty"`                   // JavaScript to run after page load (Rod only, runs before scraping)
	InfiniteScrollSelector string `yaml:"infinite_scroll_selector,omitempty"` // CSS selector for "Load More" button (Rod clicks it repeatedly)
}

FetchConfig controls how pages are fetched (JS rendering, waits, etc.)

type Field

type Field struct {
	Name             string           `yaml:"name"`
	Value            string           `yaml:"value,omitempty"`
	Type             string           `yaml:"type,omitempty"`     // can be text (default), html, url, or date_time_tz_ranges
	ElementLocations ElementLocations `yaml:"location,omitempty"` // elements are extracted strings joined with newlines
	Default          string           `yaml:"default,omitempty"`  // the default for a dynamic field (text or url) if no value is found
	// If a field can be found on a detail page the following variable has to
	// contain a field name of a field of type 'url' that is located on the main
	// page.
	OnDetailPage string            `yaml:"on_detail_page,omitempty"` // applies to text, url, date
	Required     bool              `yaml:"required,omitempty"`       // applies to text, url - if true, skip record when field is empty
	Components   []DateComponent   `yaml:"components,omitempty"`     // applies to date
	DateLocation string            `yaml:"date_location,omitempty"`  // applies to date
	DateLanguage string            `yaml:"date_language,omitempty"`  // applies to date
	Hide         bool              `yaml:"hide,omitempty"`           // applies to text, url, date
	GuessYear    bool              `yaml:"guess_year,omitempty"`     // applies to date
	Transform    []TransformConfig `yaml:"transform,omitempty"`      // applies to text
}

A Field contains all the information necessary to scrape a dynamic field from a website, i.e. a field whose value changes for each record.

type FieldNameComponents added in v0.5.39

type FieldNameComponents struct {
	Hash          string // 8-char hex hash
	Attribute     string // Attribute name, or empty for text content
	TextNodeIndex int    // 0-based text node index
}

FieldNameComponents contains the parsed components of a goskyr field name.

func ParseFieldName added in v0.5.39

func ParseFieldName(name string) (FieldNameComponents, bool)

ParseFieldName extracts components from a field name. Returns the components and true if parsing succeeded, or zero values and false if not.

Examples:

ParseFieldName("Fa1b2c3d4--0") → {Hash: "a1b2c3d4", Attribute: "", TextNodeIndex: 0}, true
ParseFieldName("Fa1b2c3d4-href-0") → {Hash: "a1b2c3d4", Attribute: "href", TextNodeIndex: 0}, true
ParseFieldName("invalid") → {}, false
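The parse can be approximated with a regular expression. This is a sketch only; the package's actual parser may differ, and attribute names containing hyphens would need a richer pattern:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// fieldNameRE mirrors the documented F<hash>[-<attr>]-<textnode> format.
var fieldNameRE = regexp.MustCompile(`^F([0-9a-f]{8})-([A-Za-z]*)-([0-9]+)$`)

func parseFieldName(name string) (hash, attr string, idx int, ok bool) {
	m := fieldNameRE.FindStringSubmatch(name)
	if m == nil {
		return "", "", 0, false
	}
	idx, _ = strconv.Atoi(m[3]) // group is all digits; error ignored in this sketch
	return m[1], m[2], idx, true
}

func main() {
	fmt.Println(parseFieldName("Fa1b2c3d4--0"))     // text-content form
	fmt.Println(parseFieldName("Fa1b2c3d4-href-0")) // attribute form
	fmt.Println(parseFieldName("invalid"))          // not a goskyr field name
}
```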

type Filter

type Filter struct {
	Field           string `yaml:"field"`
	Type            string
	Expression      string `yaml:"exp"` // changed from 'regex' to 'exp' in version 0.5.7
	RegexComp       *regexp.Regexp
	DateComp        time.Time
	DateOp          string
	Match           bool   `yaml:"match"`
	Condition       string `yaml:"condition,omitempty"`        // matches, not_matches, missing, missing_or_matches, exists
	CaseInsensitive bool   `yaml:"case_insensitive,omitempty"` // for regex matching
}

A Filter is used to filter certain records from the result list.
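An illustrative filter entry built from the YAML tags above; whether a matching record is kept or dropped follows FilterMatch/FilterMatchWithCondition below:

```yaml
filters:
  - field: Atitle
    exp: "cancelled"
    case_insensitive: true
    match: false
```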

func (*Filter) FilterMatch

func (f *Filter) FilterMatch(value interface{}) bool

FilterMatch checks if a value matches the filter's criteria based on regex or date comparison.

func (*Filter) FilterMatchWithCondition added in v0.5.39

func (f *Filter) FilterMatchWithCondition(rec map[string]interface{}) bool

FilterMatchWithCondition checks the filter against record with extended conditions. Returns true if the record should be KEPT.

func (*Filter) Initialize

func (f *Filter) Initialize(fieldType string) error

Initialize compiles the filter's regex pattern or parses date comparison expressions.

type GlobalConfig

type GlobalConfig struct {
	UserAgent string `yaml:"user-agent"`
}

GlobalConfig is used for storing global configuration parameters that are needed across all scrapers.

type OutputCondition added in v0.5.39

type OutputCondition struct {
	Equals          string `yaml:"equals,omitempty"`
	NotEquals       string `yaml:"not_equals,omitempty"`
	Matches         string `yaml:"matches,omitempty"`
	NotMatches      string `yaml:"not_matches,omitempty"`
	CaseInsensitive bool   `yaml:"case_insensitive,omitempty"`
	// contains filtered or unexported fields
}

OutputCondition controls conditional field mapping

func (*OutputCondition) Evaluate added in v0.5.39

func (c *OutputCondition) Evaluate(value string) bool

Evaluate checks if the condition is satisfied by the value

func (*OutputCondition) Initialize added in v0.5.39

func (c *OutputCondition) Initialize() error

Initialize compiles regex patterns in the condition

type Pagination added in v0.5.39

type Pagination struct {
	Type           string `yaml:"type"`            // query_param, scroll, next_button
	ParamName      string `yaml:"param_name"`      // for query_param: e.g. "start", "page"
	StartValue     int    `yaml:"start_value"`     // starting value (usually 0)
	Increment      int    `yaml:"increment"`       // increment per page
	MaxPages       int    `yaml:"max_pages"`       // safety limit
	ButtonSelector string `yaml:"button_selector"` // for scroll/next_button types
	WaitMs         int    `yaml:"wait_ms"`         // delay between pages (milliseconds)
}

Pagination configures multi-page extraction
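An illustrative query-parameter pagination block, built from the YAML tags above:

```yaml
pagination:
  type: query_param   # or: scroll, next_button
  param_name: page
  start_value: 0
  increment: 1
  max_pages: 10
  wait_ms: 500
```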

type Paginator

type Paginator struct {
	Location ElementLocation `yaml:"location,omitempty"`
	MaxPages int             `yaml:"max_pages,omitempty"`
}

A Paginator is used to paginate through a website

type RegexConfig

type RegexConfig struct {
	RegexPattern string `yaml:"exp"`
	Index        int    `yaml:"index"`
}

RegexConfig is used for extracting a substring from a string based on the given RegexPattern and Index

type Scraper

type Scraper struct {
	Interaction  []*fetch.Interaction `yaml:"interaction,omitempty"`
	Name         string               `yaml:"name"`
	PageLoadWait int                  `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
	RenderJs     bool                 `yaml:"render_js,omitempty"`
	Selector     string               `yaml:"selector"`
	Strategy     string               `yaml:"strategy,omitempty"` // "nested" (default) or "sequential"
	URL          string               `yaml:"url"`
	Validation   *ValidationConfig    `yaml:"validation,omitempty"`
	Fields       []Field              `yaml:"fields,omitempty"`
	Filters      []*Filter            `yaml:"filters,omitempty"`
	Paginators   []Paginator          `yaml:"paginators,omitempty"`

	// New declarative config fields
	Fetch         *FetchConfig   `yaml:"fetch,omitempty"`          // Fetch configuration
	Pagination    *Pagination    `yaml:"pagination,omitempty"`     // Declarative pagination
	DerivedFields []DerivedField `yaml:"derived_fields,omitempty"` // Template-based field derivation

	// MergeKey makes this an independent scraper (not a detail-page follower).
	// When set, this scraper is run separately via Page() and its records are
	// merged into the primary scraper's records by matching on this field name.
	// The field must exist in both scrapers' output.
	MergeKey string `yaml:"merge_key,omitempty"`
}

A Scraper contains all the necessary config parameters and structs needed to extract the desired information from a website.
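A minimal illustrative scraper configuration tying the pieces together; the URL and selectors are made up:

```yaml
scrapers:
  - name: example-events
    url: https://example.com/events
    selector: div.event
    fields:
      - name: Atitle
        location:
          selector: h3.title
      - name: Aurl
        type: url
        location:
          selector: a
          attr: href
```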

func FindByName added in v0.5.39

func FindByName(config *Config, scraperName string) *Scraper

FindByName returns a pointer to the named scraper within config. Returns nil if config is nil, has no scrapers, or no scraper matches the name. Exact name match only — no fallback.

func (*Scraper) GetDetailPageURLFields added in v0.5.39

func (c *Scraper) GetDetailPageURLFields() []Field

GetDetailPageURLFields returns all URL-type fields that can be used to navigate to detail pages.

func (Scraper) HostSlug added in v0.5.39

func (s Scraper) HostSlug() string

HostSlug extracts and returns a URL slug from the scraper's URL host.

type TransformConfig

type TransformConfig struct {
	TransformType string `yaml:"type,omitempty"`    // only regex-replace for now
	RegexPattern  string `yaml:"regex,omitempty"`   // a container for the pattern
	Replacement   string `yaml:"replace,omitempty"` // a plain string for replacement
}

TransformConfig is used to replace an existing substring with some other string. Transforms are applied before dates are extracted.
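An illustrative transform that collapses runs of whitespace before date parsing, using the only supported type noted above:

```yaml
transform:
  - type: regex-replace
    regex: "\\s+"
    replace: " "
```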

type ValidationConfig added in v0.5.39

type ValidationConfig struct {
	RequiresCTASelector string `yaml:"requires_cta_selector,omitempty"`
}
