Documentation
¶
Index ¶
- Constants
- Variables
- func ApplyDerivedFields(derivedFields []DerivedField, rec map[string]interface{}) error
- func ComputeFieldHash(selectorPath string) string
- func DebugDateTime(args ...any)
- func DetailPages(ctx context.Context, cache fetch.Cache, c *Config, s *Scraper, ...) error
- func GQDocument(ctx context.Context, c *Config, s *Scraper, gqdoc *fetch.Document) (output.Records, error)
- func GQSelection(ctx context.Context, c *Config, s *Scraper, sel *fetch.Selection, ...) (output.Record, error)
- func GenerateFieldName(selectorPath, attr string, textNodeIndex int) string
- func GetTextStringAndURL(e *ElementLocation, sel *fetch.Selection, baseURL string) (string, *url.URL, error)
- func IsGoskyrFieldName(name string) bool
- func Page(ctx context.Context, cache fetch.Cache, c *Config, s *Scraper, ...) (output.Records, error)
- func SubGQDocument(ctx context.Context, c *Config, s *Scraper, rec output.Record, fname string, ...) error
- type Config
- type ConfigID
- type DateComponent
- type DerivedField
- type DerivedOutput
- type ElementLocation
- type ElementLocations
- type FetchConfig
- type Field
- type FieldNameComponents
- type Filter
- type GlobalConfig
- type OutputCondition
- type Pagination
- type Paginator
- type RegexConfig
- type Scraper
- type TransformConfig
- type ValidationConfig
Constants ¶
const ( // UnitSeparator separates siblings within a single matched element (entire_subtree). UnitSeparator = "\x1f" // RecordSeparator separates values from multiple matched elements (all_nodes). RecordSeparator = "\x1e" // GroupSeparator reserved for future use (groups of records). GroupSeparator = "\x1d" )
ASCII separator characters for unambiguous field/record delimiting. These never appear in HTML content, unlike \n and \t.
const FieldNameFormat = "F<hash>[-<attr>]-<textnode>"
FieldNameFormat documents the goskyr field naming convention.
Format: F<hash>[-<attr>]-<textnode>
Components:
F - Literal prefix identifying as goskyr field
<hash> - 8-char hex hash of normalized selector path (CRC32)
<attr> - Optional: attribute name (href, src, datetime, etc.)
Empty string for text content, shown as double hyphen "--"
<textnode> - 0-based text node index within element
Examples:
Fa1b2c3d4--0 - Text content, first text node
Fa1b2c3d4--1 - Text content, second text node
Fa1b2c3d4-href-0 - href attribute value
Fa1b2c3d4-src-0 - src attribute value
Fa1b2c3d4-datetime-0 - datetime attribute value
Stability Guarantee:
Same selector path always produces same hash. The hash is computed using CRC32-IEEE on the normalized path string.
Note: The text node index is the index of the text node within the element, NOT the position of the item in a list. All items on a list page will have identical field keys.
Variables ¶
var DateRE = regexp.MustCompile(`(?i)\b(2024|2025|January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|Mon|Tue|Wed|Thu|Fri|Sat|Sun)\b`)
var DateTimeFieldName = "Pdate_time_tz_ranges"
var DateTimeFieldSuffix = "__" + DateTimeFieldName
var DebugGQFind = true
var DoDebug = true
var FieldPartSeparator = "\n"
FieldPartSeparator is used to join multiple parts when extracting field values. Using newline preserves text structure and helps with date parsing.
var KeepSubURLScheme = map[string]bool{ "http": true, "https": true, }
var SkipSubURLExt = map[string]bool{ ".gif": true, ".jfif": true, ".jpeg": true, ".jpg": true, ".mp4": true, ".pdf": true, ".png": true, ".webp": true, ".zip": true, }
var TitleFieldName = "Atitle"
var TitleFieldSuffix = "__" + TitleFieldName
var URLFieldName = "Aurl"
var URLFieldSuffix = "__" + URLFieldName
Functions ¶
func ApplyDerivedFields ¶ added in v0.5.39
func ApplyDerivedFields(derivedFields []DerivedField, rec map[string]interface{}) error
ApplyDerivedFields processes all derived fields for a record
func ComputeFieldHash ¶ added in v0.5.39
ComputeFieldHash returns a stable 8-char hex hash for a selector path. Uses CRC32-IEEE which produces 32-bit (8 hex char) hashes.
The hash is deterministic: same selector path always produces same hash.
func DebugDateTime ¶ added in v0.5.39
func DebugDateTime(args ...any)
func DetailPages ¶ added in v0.5.39
func DetailPages(ctx context.Context, cache fetch.Cache, c *Config, s *Scraper, recs output.Records, domain string) error
DetailPages follows URL fields in records to scrape detail pages and merge the extracted data back into the original records.
func GQDocument ¶
func GQDocument(ctx context.Context, c *Config, s *Scraper, gqdoc *fetch.Document) (output.Records, error)
GQDocument fetches and returns all records from a website according to the Scraper's parameters. When rawDyn is set to true the records returned are not processed according to their type but instead the raw values based only on the location are returned (ignore regex_extract??). In that case only the values of dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not detail pages). This is used by the ML feature generation.
func GQSelection ¶
func GQSelection(ctx context.Context, c *Config, s *Scraper, sel *fetch.Selection, baseURL string) (output.Record, error)
GQSelection fetches and returns a record from a website according to the Scraper's parameters. When rawDyn is set to true the record returned is not processed according to its type but instead the raw value based only on the location is returned (ignore regex_extract??). In that case only the values of dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not detail pages). This is used by the ML feature generation.
func GenerateFieldName ¶ added in v0.5.39
GenerateFieldName creates a field name from selector path, attribute, and text node index.
Parameters:
- selectorPath: The DOM selector path (e.g., "div.event > span.title")
- attr: Attribute name ("href", "src", etc.) or empty string for text content
- textNodeIndex: 0-based index of text node within element
Returns field name in format: F<hash>[-<attr>]-<textnode>
func GetTextStringAndURL ¶ added in v0.5.39
func GetTextStringAndURL(e *ElementLocation, sel *fetch.Selection, baseURL string) (string, *url.URL, error)
GetTextStringAndURL extracts text or attribute value from an element and resolves it as a URL relative to the base URL.
func IsGoskyrFieldName ¶ added in v0.5.39
IsGoskyrFieldName checks if a string is a valid goskyr field name.
func Page ¶
func Page(ctx context.Context, cache fetch.Cache, c *Config, s *Scraper, globalConfig *GlobalConfig, rawDyn bool, path string) (output.Records, error)
Page fetches and returns all records from a webpage according to the Scraper's parameters. When rawDyn is set to true the records returned are not processed according to their type but instead the raw values based only on the location are returned (ignore regex_extract??). In that case only the values of dynamic fields are returned, i.e. fields that don't have a predefined value and that are present on the main page (not detail pages). This is used by the ML feature generation.
Types ¶
type Config ¶
type Config struct {
ID ConfigID
Writer output.WriterConfig `yaml:"writer,omitempty"`
Scrapers []Scraper `yaml:"scrapers,omitempty"`
Global GlobalConfig `yaml:"global,omitempty"`
Records output.Records
}
Config defines the overall structure of the scraper configuration. Values will be taken from a config yml file or environment variables or both.
func ReadConfig ¶
ReadConfig reads a scraper configuration from a YAML file or directory of YAML files.
func (Config) WriteToFile ¶
WriteToFile writes the Config to a YAML file and optionally writes records to a JSON file in the specified directory.
type DateComponent ¶
type DateComponent struct {
Covers date.CoveredDateParts `yaml:"covers"`
ElementLocation ElementLocation `yaml:"location"`
Layout []string `yaml:"layout"`
Transform []TransformConfig `yaml:"transform,omitempty"`
}
A DateComponent is used to find a specific part of a date within an HTML document.
type DerivedField ¶ added in v0.5.39
type DerivedField struct {
Source string `yaml:"source"` // source field name
Template string `yaml:"template"` // "{name} | {date}" - delimiter-based
Regex string `yaml:"regex"` // fallback: named capture groups (?P<name>...)
Outputs []DerivedOutput `yaml:"outputs"`
// contains filtered or unexported fields
}
DerivedField creates new fields from existing ones via templates or regex
func (*DerivedField) Extract ¶ added in v0.5.39
func (df *DerivedField) Extract(input string) (map[string]string, error)
Extract parses the input string and returns symbol values
func (*DerivedField) Initialize ¶ added in v0.5.39
func (df *DerivedField) Initialize() error
Initialize prepares the DerivedField for use by compiling patterns
type DerivedOutput ¶ added in v0.5.39
type DerivedOutput struct {
Symbol string `yaml:"symbol"`
Target string `yaml:"target"`
Condition *OutputCondition `yaml:"condition,omitempty"`
Value string `yaml:"value,omitempty"` // override extracted value
}
DerivedOutput maps a template symbol to a target field
type ElementLocation ¶
type ElementLocation struct {
Selector string `yaml:"selector,omitempty"`
JsonSelector string `yaml:"json_selector,omitempty"`
ChildIndex int `yaml:"child_index,omitempty"`
RegexExtract RegexConfig `yaml:"regex_extract,omitempty"`
Attr string `yaml:"attr,omitempty"`
MaxLength int `yaml:"max_length,omitempty"`
EntireSubtree bool `yaml:"entire_subtree,omitempty"`
AllNodes bool `yaml:"all_nodes,omitempty"`
Separator string `yaml:"separator,omitempty"` // Intra-node sibling separator (default: \x1F)
NodeSeparator string `yaml:"node_separator,omitempty"` // Inter-node separator (default: \x1E)
}
ElementLocation is used to find a specific string in an HTML document.
type ElementLocations ¶
type ElementLocations []ElementLocation
func (*ElementLocations) UnmarshalYAML ¶
func (e *ElementLocations) UnmarshalYAML(value *yaml.Node) error
UnmarshalYAML handles YAML unmarshalling for ElementLocations, accepting either a single ElementLocation or a list of ElementLocations.
type FetchConfig ¶ added in v0.5.39
type FetchConfig struct {
UseJavascript bool `yaml:"use_javascript,omitempty"` // Enable headless browser
WaitSelector string `yaml:"wait_selector,omitempty"` // CSS selector to wait for
WaitTimeoutMs int `yaml:"wait_timeout_ms,omitempty"` // Timeout for wait (default 30000)
FerretQL string `yaml:"ferret_ql,omitempty"` // Custom FerretQL script (advanced)
Script string `yaml:"script,omitempty"` // JavaScript to run after page load (Rod only, runs before scraping)
InfiniteScrollSelector string `yaml:"infinite_scroll_selector,omitempty"` // CSS selector for "Load More" button (Rod clicks it repeatedly)
}
FetchConfig controls how pages are fetched (JS rendering, waits, etc.)
type Field ¶
type Field struct {
Name string `yaml:"name"`
Value string `yaml:"value,omitempty"`
Type string `yaml:"type,omitempty"` // can be text (default), html, url, or date_time_tz_ranges
ElementLocations ElementLocations `yaml:"location,omitempty"` // elements are extracted strings joined with newlines
Default string `yaml:"default,omitempty"` // the default for a dynamic field (text or url) if no value is found
// If a field can be found on a detail page the following variable has to
// contain a field name of a field of type 'url' that is located on the main
// page.
OnDetailPage string `yaml:"on_detail_page,omitempty"` // applies to text, url, date
Required bool `yaml:"required,omitempty"` // applies to text, url - if true, skip record when field is empty
Components []DateComponent `yaml:"components,omitempty"` // applies to date
DateLocation string `yaml:"date_location,omitempty"` // applies to date
DateLanguage string `yaml:"date_language,omitempty"` // applies to date
Hide bool `yaml:"hide,omitempty"` // applies to text, url, date
GuessYear bool `yaml:"guess_year,omitempty"` // applies to date
Transform []TransformConfig `yaml:"transform,omitempty"` // applies to text
}
A Field contains all the information necessary to scrape a dynamic field from a website, i.e. a field whose value changes for each record.
type FieldNameComponents ¶ added in v0.5.39
type FieldNameComponents struct {
Hash string // 8-char hex hash
Attribute string // Attribute name, or empty for text content
TextNodeIndex int // 0-based text node index
}
FieldNameComponents contains the parsed components of a goskyr field name.
func ParseFieldName ¶ added in v0.5.39
func ParseFieldName(name string) (FieldNameComponents, bool)
ParseFieldName extracts components from a field name. Returns the components and true if parsing succeeded, or zero values and false if not.
Examples:
ParseFieldName("Fa1b2c3d4--0") → {Hash: "a1b2c3d4", Attribute: "", TextNodeIndex: 0}, true
ParseFieldName("Fa1b2c3d4-href-0") → {Hash: "a1b2c3d4", Attribute: "href", TextNodeIndex: 0}, true
ParseFieldName("invalid") → {}, false
type Filter ¶
type Filter struct {
Field string `yaml:"field"`
Type string
Expression string `yaml:"exp"` // changed from 'regex' to 'exp' in version 0.5.7
RegexComp *regexp.Regexp
DateComp time.Time
DateOp string
Match bool `yaml:"match"`
Condition string `yaml:"condition,omitempty"` // matches, not_matches, missing, missing_or_matches, exists
CaseInsensitive bool `yaml:"case_insensitive,omitempty"` // for regex matching
}
A Filter is used to filter certain records from the result list.
func (*Filter) FilterMatch ¶
FilterMatch checks if a value matches the filter's criteria based on regex or date comparison.
func (*Filter) FilterMatchWithCondition ¶ added in v0.5.39
FilterMatchWithCondition checks the filter against record with extended conditions. Returns true if the record should be KEPT.
func (*Filter) Initialize ¶
Initialize compiles the filter's regex pattern or parses date comparison expressions.
type GlobalConfig ¶
type GlobalConfig struct {
UserAgent string `yaml:"user-agent"`
}
GlobalConfig is used for storing global configuration parameters that are needed across all scrapers
type OutputCondition ¶ added in v0.5.39
type OutputCondition struct {
Equals string `yaml:"equals,omitempty"`
NotEquals string `yaml:"not_equals,omitempty"`
Matches string `yaml:"matches,omitempty"`
NotMatches string `yaml:"not_matches,omitempty"`
CaseInsensitive bool `yaml:"case_insensitive,omitempty"`
// contains filtered or unexported fields
}
OutputCondition controls conditional field mapping
func (*OutputCondition) Evaluate ¶ added in v0.5.39
func (c *OutputCondition) Evaluate(value string) bool
Evaluate checks if the condition is satisfied by the value
func (*OutputCondition) Initialize ¶ added in v0.5.39
func (c *OutputCondition) Initialize() error
Initialize compiles regex patterns in the condition
type Pagination ¶ added in v0.5.39
type Pagination struct {
Type string `yaml:"type"` // query_param, scroll, next_button
ParamName string `yaml:"param_name"` // for query_param: e.g. "start", "page"
StartValue int `yaml:"start_value"` // starting value (usually 0)
Increment int `yaml:"increment"` // increment per page
MaxPages int `yaml:"max_pages"` // safety limit
ButtonSelector string `yaml:"button_selector"` // for scroll/next_button types
WaitMs int `yaml:"wait_ms"` // delay between pages (milliseconds)
}
Pagination configures multi-page extraction
type Paginator ¶
type Paginator struct {
Location ElementLocation `yaml:"location,omitempty"`
MaxPages int `yaml:"max_pages,omitempty"`
}
A Paginator is used to paginate through a website
type RegexConfig ¶
RegexConfig is used for extracting a substring from a string based on the given RegexPattern and Index
type Scraper ¶
type Scraper struct {
Interaction []*fetch.Interaction `yaml:"interaction,omitempty"`
Name string `yaml:"name"`
PageLoadWait int `yaml:"page_load_wait,omitempty"` // milliseconds. Only taken into account when render_js = true
RenderJs bool `yaml:"render_js,omitempty"`
Selector string `yaml:"selector"`
Strategy string `yaml:"strategy,omitempty"` // "nested" (default) or "sequential"
URL string `yaml:"url"`
Validation *ValidationConfig `yaml:"validation,omitempty"`
Fields []Field `yaml:"fields,omitempty"`
Filters []*Filter `yaml:"filters,omitempty"`
Paginators []Paginator `yaml:"paginators,omitempty"`
// New declarative config fields
Fetch *FetchConfig `yaml:"fetch,omitempty"` // Fetch configuration
Pagination *Pagination `yaml:"pagination,omitempty"` // Declarative pagination
DerivedFields []DerivedField `yaml:"derived_fields,omitempty"` // Template-based field derivation
// MergeKey makes this an independent scraper (not a detail-page follower).
// When set, this scraper is run separately via Page() and its records are
// merged into the primary scraper's records by matching on this field name.
// The field must exist in both scrapers' output.
MergeKey string `yaml:"merge_key,omitempty"`
}
A Scraper contains all the necessary config parameters and structs needed to extract the desired information from a website
func FindByName ¶ added in v0.5.39
FindByName returns a pointer to the named scraper within config. Returns nil if config is nil, has no scrapers, or no scraper matches the name. Exact name match only — no fallback.
func (*Scraper) GetDetailPageURLFields ¶ added in v0.5.39
GetDetailPageURLFields returns all URL-type fields that can be used to navigate to detail pages.
type TransformConfig ¶
type TransformConfig struct {
TransformType string `yaml:"type,omitempty"` // only regex-replace for now
RegexPattern string `yaml:"regex,omitempty"` // a container for the pattern
Replacement string `yaml:"replace,omitempty"` // a plain string for replacement
}
TransformConfig is used to replace an existing substring with some other kind of string. Processing needs to happen before extracting dates.
type ValidationConfig ¶ added in v0.5.39
type ValidationConfig struct {
RequiresCTASelector string `yaml:"requires_cta_selector,omitempty"`
}