extractors

package

v1.0.6 Latest Latest Go to latest Published: Aug 31, 2025 License: MIT Imports: 15 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/BumpyClock/hermes

Links

Open Source Insights

Documentation ¶

Overview ¶

ABOUTME: Advanced extractor loader with LRU caching and dynamic loading. Reduces startup memory by 90% through lazy loading and automatic cache management.

Index ¶

Variables
func AddExtractor(extractor *FullExtractor) interface{}
func CleanBySelectors(content *goquery.Selection, doc *goquery.Document, opts map[string][]string) *goquery.Selection
func CleanBySelectorsList(content *goquery.Selection, doc *goquery.Document, clean []string) *goquery.Selection
func ClearAPIExtractors()
func CollectAllPages(opts CollectAllPagesOptions) map[string]interface{}
func FindMatchingSelector(doc *goquery.Document, selectors []interface{}, extractHTML bool, ...) interface{}
func FindMatchingSelectorFromList(doc *goquery.Document, selectors []interface{}, extractHTML bool, ...) interface{}
func GetAPIExtractors() map[string]Extractor
func GetAPIExtractorsImpl() map[string]*FullExtractor
func GetAllExtractors() map[string]Extractor
func GetExtractorCount() int
func GetExtractorWithStats(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, *ExtractorStats, error)
func GetExtractorWithStatsContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, ...) (Extractor, *ExtractorStats, error)
func GetRegistryStats() (primary int, total int)
func HasExtractor(domain string) bool
func InitializeGlobalLoader(config *LoaderConfig)
func ListRegisteredDomains() []string
func RegisterAllCustomExtractors() error
func Select(opts SelectOptions) interface{}
func SelectExtendedFields(extend map[string]interface{}, opts SelectOpts) map[string]interface{}
func SelectExtendedTypes(extend map[string]interface{}, opts SelectOptions) map[string]interface{}
func SelectField(opts SelectOpts) interface{}
func TransformElements(content *goquery.Selection, doc *goquery.Document, ...) *goquery.Selection
func TransformElementsList(content *goquery.Selection, doc *goquery.Document, ...) *goquery.Selection
type BasicGenericExtractor
- func (bge *BasicGenericExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
- func (bge *BasicGenericExtractor) GetDomain() string
type BloggerExtractor
- func (b *BloggerExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
- func (b *BloggerExtractor) GetDomain() string
type CacheEntry
type CollectAllPagesOptions
type ContentExtractor
- func (ce *ContentExtractor) GetSelectors() parser.SelectorList
- func (ce *ContentExtractor) GetTransforms() parser.TransformRegistry
- func (ce *ContentExtractor) MigrateSelectors()
- func (ce *ContentExtractor) MigrateTransforms()
- func (ce *ContentExtractor) SetSelectors(selectors parser.SelectorList)
- func (ce *ContentExtractor) SetTransforms(transforms parser.TransformRegistry)
type CustomExtractorAdapter
- func GetCustomExtractorByDomain(domain string) (*CustomExtractorAdapter, bool)
- func NewCustomExtractorAdapter(customExtractor *custom.CustomExtractor) *CustomExtractorAdapter
- func (c *CustomExtractorAdapter) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
- func (c *CustomExtractorAdapter) GetCustomExtractor() *custom.CustomExtractor
- func (c *CustomExtractorAdapter) GetDomain() string
- func (c *CustomExtractorAdapter) GetSupportedDomains() []string
type DetectByHTMLFunc
type ExtractOptions
type ExtractOpts
type Extractor
- func CreateCustomExtractorAdapters() []Extractor
- func DetectByHTML(doc *goquery.Document) Extractor
- func GenericExtractor() Extractor
- func GetExtractor(urlStr string, parsedURL *url.URL, doc *goquery.Document) (Extractor, error)
- func GetExtractorParallel(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, error)
- func GetExtractorParallelWithContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, ...) (Extractor, error)
- func GetExtractorSimple(urlStr string, parsedURL *url.URL, doc *goquery.Document) (Extractor, error)
type ExtractorCandidate
type ExtractorChecker
type ExtractorError
type ExtractorLoader
- func NewExtractorLoader(config *LoaderConfig, registry *custom.RegistryManager) *ExtractorLoader
- func (el *ExtractorLoader) ClearCache()
- func (el *ExtractorLoader) Close() error
- func (el *ExtractorLoader) GetCacheStats() (int, int, float64)
- func (el *ExtractorLoader) GetMetrics() *LoaderMetrics
- func (el *ExtractorLoader) LoadExtractor(domain string) (*custom.CustomExtractor, error)
- func (el *ExtractorLoader) LoadExtractorByHTML(doc *goquery.Document) (*custom.CustomExtractor, error)
- func (el *ExtractorLoader) WarmupCache(domains []string) error
type ExtractorOptions
type ExtractorStats
type FieldExtractor
- func (fe *FieldExtractor) GetSelectors() parser.SelectorList
- func (fe *FieldExtractor) MigrateSelectors()
- func (fe *FieldExtractor) SetSelectors(selectors parser.SelectorList)
type FullExtractor
- func GetExtractorByDomain(domain string) (*FullExtractor, bool)
- func (f *FullExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
- func (f *FullExtractor) GetDomain() string
type LoaderConfig
- func DefaultLoaderConfig() *LoaderConfig
type LoaderMetrics
type MediumExtractor
- func (m *MediumExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
- func (m *MediumExtractor) GetDomain() string
type ParallelExtractorConfig
- func DefaultParallelConfig() *ParallelExtractorConfig
type ParallelExtractorResult
type ResourceInterface
type RootExtractorInterface
- func (r *RootExtractorInterface) Extract(extractor interface{}, opts ExtractOptions) interface{}
type SelectOptions
type SelectOpts
type SelectorEntry
type SimpleExtractor
- func JavaScriptCompatibleGetExtractor(urlStr string, parsedURL *url.URL, doc *goquery.Document) (SimpleExtractor, error)
- func (s SimpleExtractor) GetDomain() string
type SimpleRootExtractor
- func (r *SimpleRootExtractor) Extract(extractor interface{}, opts ExtractOpts) interface{}
type TransformFunc

Constants ¶

This section is empty.

Variables ¶

View Source

var (
	// All contains all static custom extractors, populated from custom registry
	// JavaScript equivalent: The result of all.js processing
	All = make(map[string]Extractor)

	// CustomRegistry manages all custom extractors with domain mapping
	CustomRegistry = custom.GlobalRegistryManager
)

Registry variables for extractor storage. These are now properly integrated with the custom extractor framework.

View Source

var NewSimpleRootExtractor = &SimpleRootExtractor{}

NewSimpleRootExtractor is a shared SimpleRootExtractor instance. Note that, despite its name, it is a package-level variable rather than a constructor function.

View Source

var RootExtractor = &RootExtractorInterface{}

RootExtractor is the singleton instance

Functions ¶

func AddExtractor ¶

func AddExtractor(extractor *FullExtractor) interface{}

AddExtractor adds a custom extractor to the runtime registry. Direct port of the JavaScript addExtractor function with 100% behavioral compatibility.

func CleanBySelectors ¶

func CleanBySelectors(content *goquery.Selection, doc *goquery.Document, opts map[string][]string) *goquery.Selection

CleanBySelectors removes elements by an array of selectors. Direct port of the JavaScript cleanBySelectors function.

func CleanBySelectorsList ¶

func CleanBySelectorsList(content *goquery.Selection, doc *goquery.Document, clean []string) *goquery.Selection

CleanBySelectorsList removes elements by an array of selectors. Direct port of the JavaScript cleanBySelectors function.

func ClearAPIExtractors ¶

func ClearAPIExtractors()

ClearAPIExtractors clears all registered extractors (useful for testing)

func CollectAllPages ¶

func CollectAllPages(opts CollectAllPagesOptions) map[string]interface{}

CollectAllPages collects and merges content from multiple pages of an article. This is a faithful 1:1 port of the JavaScript collectAllPages function with:

- Page counter starting at 1 (first page already fetched)
- 26-page safety limit to prevent infinite loops
- URL deduplication using the RemoveAnchor utility
- Progressive content concatenation with <hr><h4>Page N</h4> separators
- Final word count calculation for the combined content

func FindMatchingSelector ¶

func FindMatchingSelector(doc *goquery.Document, selectors []interface{}, extractHTML bool, allowMultiple bool) interface{}

FindMatchingSelector finds the first selector that matches content. Direct port of the JavaScript findMatchingSelector function.

func FindMatchingSelectorFromList ¶

func FindMatchingSelectorFromList(doc *goquery.Document, selectors []interface{}, extractHTML bool, allowMultiple bool) interface{}

FindMatchingSelectorFromList finds the first selector that matches content. Direct port of the JavaScript findMatchingSelector function.

func GetAPIExtractors ¶

func GetAPIExtractors() map[string]Extractor

GetAPIExtractors returns all runtime-registered extractors as Extractor interface

func GetAPIExtractorsImpl ¶

func GetAPIExtractorsImpl() map[string]*FullExtractor

GetAPIExtractorsImpl returns a copy of all runtime-registered extractors as FullExtractor

func GetAllExtractors ¶

func GetAllExtractors() map[string]Extractor

GetAllExtractors returns a map of all custom extractors keyed by domain. JavaScript equivalent: export default Object.keys(CustomExtractors).reduce((acc, key) => { ... }, {});

func GetExtractorCount ¶

func GetExtractorCount() int

GetExtractorCount returns the number of registered extractors

func GetExtractorWithStats ¶

func GetExtractorWithStats(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, *ExtractorStats, error)

GetExtractorWithStats performs parallel lookup and returns performance statistics.

Deprecated: This function uses context.Background(), which prevents proper cancellation. Use GetExtractorWithStatsContext instead.

func GetExtractorWithStatsContext ¶

func GetExtractorWithStatsContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, *ExtractorStats, error)

GetExtractorWithStatsContext performs parallel lookup with context and returns performance statistics

func GetRegistryStats ¶

func GetRegistryStats() (primary int, total int)

GetRegistryStats returns statistics about the registered extractors

func HasExtractor ¶

func HasExtractor(domain string) bool

HasExtractor checks if an extractor is registered for the given domain

func InitializeGlobalLoader ¶

func InitializeGlobalLoader(config *LoaderConfig)

InitializeGlobalLoader sets up the global loader with custom configuration

func ListRegisteredDomains ¶

func ListRegisteredDomains() []string

ListRegisteredDomains returns all domains registered in the global registry

func RegisterAllCustomExtractors ¶

func RegisterAllCustomExtractors() error

RegisterAllCustomExtractors registers all available custom extractors with the global registry. This function iterates through all extractors from GetAllCustomExtractors() and registers them. Returns an error if any registration fails.

func Select ¶

func Select(opts SelectOptions) interface{}

Select performs field extraction with selector processing. Direct port of the JavaScript select function.

func SelectExtendedFields ¶

func SelectExtendedFields(extend map[string]interface{}, opts SelectOpts) map[string]interface{}

SelectExtendedFields processes extended field types. Direct port of the JavaScript selectExtendedTypes function.

func SelectExtendedTypes ¶

func SelectExtendedTypes(extend map[string]interface{}, opts SelectOptions) map[string]interface{}

SelectExtendedTypes processes extended field types. Direct port of the JavaScript selectExtendedTypes function.

func SelectField ¶

func SelectField(opts SelectOpts) interface{}

SelectField performs field extraction with selector processing. Direct port of the JavaScript select function.

func TransformElements ¶

func TransformElements(content *goquery.Selection, doc *goquery.Document, opts map[string]map[string]interface{}) *goquery.Selection

TransformElements transforms matching elements based on transformation rules. Direct port of the JavaScript transformElements function.

func TransformElementsList ¶

func TransformElementsList(content *goquery.Selection, doc *goquery.Document, transforms map[string]interface{}) *goquery.Selection

TransformElementsList transforms matching elements based on transformation rules. Direct port of the JavaScript transformElements function.

Types ¶

type BasicGenericExtractor ¶

type BasicGenericExtractor struct {
	// contains filtered or unexported fields
}

BasicGenericExtractor implements the Extractor interface for generic extraction

func (*BasicGenericExtractor) Extract ¶

func (bge *BasicGenericExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)

Extract performs generic extraction by delegating to the generic package

func (*BasicGenericExtractor) GetDomain ¶

func (bge *BasicGenericExtractor) GetDomain() string

GetDomain returns the domain this extractor handles

type BloggerExtractor ¶

type BloggerExtractor struct{}

BloggerExtractor represents the Blogger/Blogspot custom extractor

func (*BloggerExtractor) Extract ¶

func (b *BloggerExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)

func (*BloggerExtractor) GetDomain ¶

func (b *BloggerExtractor) GetDomain() string

type CacheEntry ¶

type CacheEntry struct {
	Extractor   *custom.CustomExtractor
	LoadTime    time.Time
	AccessTime  time.Time
	AccessCount int64
	Element     *list.Element // For LRU tracking
}

CacheEntry represents a cached extractor with metadata

type CollectAllPagesOptions ¶

type CollectAllPagesOptions struct {
	NextPageURL   string
	HTML          string
	Doc           *goquery.Document
	MetaCache     map[string]interface{}
	Result        map[string]interface{}
	Extractor     interface{}
	Title         interface{}
	URL           string
	Resource      ResourceInterface
	RootExtractor *RootExtractorInterface
}

CollectAllPagesOptions contains all parameters needed for multi-page collection. This matches the JavaScript function signature exactly.

type ContentExtractor ¶

type ContentExtractor struct {
	Selectors        parser.SelectorList      `json:"-"`                   // Type-safe selectors
	SelectorsLegacy  []interface{}            `json:"selectors,omitempty"` // Deprecated: JSON compatibility
	AllowMultiple    bool                     `json:"allowMultiple,omitempty"`
	DefaultCleaner   bool                     `json:"defaultCleaner"`
	Clean            []string                 `json:"clean,omitempty"`      // Selectors to remove
	Transforms       parser.TransformRegistry `json:"-"`                    // Type-safe transforms
	TransformsLegacy map[string]interface{}   `json:"transforms,omitempty"` // Deprecated: JSON compatibility
}

ContentExtractor configuration for content extraction with transforms and cleaning

func (*ContentExtractor) GetSelectors ¶

func (ce *ContentExtractor) GetSelectors() parser.SelectorList

GetSelectors returns the type-safe selectors, migrating from legacy if needed

func (*ContentExtractor) GetTransforms ¶

func (ce *ContentExtractor) GetTransforms() parser.TransformRegistry

GetTransforms returns the type-safe transforms, migrating from legacy if needed

func (*ContentExtractor) MigrateSelectors ¶

func (ce *ContentExtractor) MigrateSelectors()

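MigrateSelectors converts ContentExtractor's legacy []interface{} selectors to the type-safe SelectorList; it is the ContentExtractor counterpart of FieldExtractor.MigrateSelectors.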

func (*ContentExtractor) MigrateTransforms ¶

func (ce *ContentExtractor) MigrateTransforms()

MigrateTransforms converts legacy map[string]interface{} transforms to type-safe TransformRegistry

func (*ContentExtractor) SetSelectors ¶

func (ce *ContentExtractor) SetSelectors(selectors parser.SelectorList)

SetSelectors sets the type-safe selectors and updates legacy for JSON compatibility

func (*ContentExtractor) SetTransforms ¶

func (ce *ContentExtractor) SetTransforms(transforms parser.TransformRegistry)

SetTransforms sets the type-safe transforms and updates legacy for JSON compatibility

type CustomExtractorAdapter ¶

type CustomExtractorAdapter struct {
	// contains filtered or unexported fields
}

CustomExtractorAdapter wraps a CustomExtractor to implement the main Extractor interface. This allows all 160 custom extractors to be used by the main parser system.

func GetCustomExtractorByDomain ¶

func GetCustomExtractorByDomain(domain string) (*CustomExtractorAdapter, bool)

GetCustomExtractorByDomain retrieves a custom extractor adapter for a specific domain

func NewCustomExtractorAdapter ¶

func NewCustomExtractorAdapter(customExtractor *custom.CustomExtractor) *CustomExtractorAdapter

NewCustomExtractorAdapter creates a new adapter for a CustomExtractor

func (*CustomExtractorAdapter) Extract ¶

func (c *CustomExtractorAdapter) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)

Extract performs extraction using the custom extractor's configuration. Implements the parser.Extractor interface.

func (*CustomExtractorAdapter) GetCustomExtractor ¶

func (c *CustomExtractorAdapter) GetCustomExtractor() *custom.CustomExtractor

GetCustomExtractor returns the underlying CustomExtractor. This allows access to the full extractor configuration when needed.

func (*CustomExtractorAdapter) GetDomain ¶

func (c *CustomExtractorAdapter) GetDomain() string

GetDomain returns the primary domain this extractor handles. Implements the Extractor interface.

func (*CustomExtractorAdapter) GetSupportedDomains ¶

func (c *CustomExtractorAdapter) GetSupportedDomains() []string

GetSupportedDomains returns all domains this extractor supports

type DetectByHTMLFunc ¶

type DetectByHTMLFunc func(*goquery.Document) Extractor

DetectByHTMLFunc type for HTML-based extractor detection

type ExtractOptions ¶

type ExtractOptions struct {
	Doc            *goquery.Document
	URL            string
	Extractor      interface{}
	ContentOnly    bool
	ExtractedTitle interface{}
	Fallback       bool
}

ExtractOptions contains parameters for the root extractor

type ExtractOpts ¶

type ExtractOpts struct {
	Doc            *goquery.Document
	URL            string
	Extractor      interface{}
	ContentOnly    bool
	ExtractedTitle interface{}
	Fallback       bool
}

ExtractOpts contains parameters for the root extractor

type Extractor ¶

type Extractor = parser.Extractor

Extractor is an alias for the standard parser.Extractor interface.

func CreateCustomExtractorAdapters ¶

func CreateCustomExtractorAdapters() []Extractor

CreateCustomExtractorAdapters converts all custom extractors to adapter instances. This is the key function that bridges the custom extractor system to the main parser.

func DetectByHTML ¶

func DetectByHTML(doc *goquery.Document) Extractor

DetectByHTML identifies an appropriate extractor based on HTML meta tags. JavaScript equivalent: export default function detectByHtml($)

func GenericExtractor ¶

func GenericExtractor() Extractor

GenericExtractor creates a generic fallback extractor. Returns a basic implementation that satisfies the Extractor interface.

func GetExtractor ¶

func GetExtractor(urlStr string, parsedURL *url.URL, doc *goquery.Document) (Extractor, error)

GetExtractor returns the appropriate extractor for a given URL. Direct 1:1 port of the JavaScript getExtractor function with identical behavior.

JavaScript signature: getExtractor(url, parsedUrl, $)
Go signature: GetExtractor(url, parsedUrl, doc) - $ becomes doc for goquery compatibility.

func GetExtractorParallel ¶

func GetExtractorParallel(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, error)

GetExtractorParallel performs parallel extractor lookup with priority ordering. This is the high-performance version of GetExtractor that uses goroutines.

Deprecated: This function uses context.Background(), which prevents proper cancellation. Use GetExtractorParallelWithContext instead.

func GetExtractorParallelWithContext ¶

func GetExtractorParallelWithContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, error)

GetExtractorParallelWithContext performs parallel extractor lookup with context for cancellation

func GetExtractorSimple ¶

func GetExtractorSimple(urlStr string, parsedURL *url.URL, doc *goquery.Document) (Extractor, error)

GetExtractorSimple returns the appropriate extractor for a given URL. Direct 1:1 port of the JavaScript getExtractor function with identical behavior. Simplified version that works with the existing type system.

type ExtractorCandidate ¶

type ExtractorCandidate struct {
	Extractor Extractor
	Priority  int    // Lower number = higher priority
	Source    string // "api_hostname", "api_domain", "static_hostname", etc.
	Key       string // The key used to find this extractor
}

ExtractorCandidate represents a potential extractor with its priority

type ExtractorChecker ¶

type ExtractorChecker interface {
	CanHandle(doc *goquery.Document, url string) (bool, error)
	GetPriority() int
	GetSource() string
}

ExtractorChecker defines the interface for checking if an extractor can handle a URL

type ExtractorError ¶

type ExtractorError struct {
	Error   bool   `json:"error"`
	Message string `json:"message"`
}

ExtractorError represents error response from addExtractor

type ExtractorLoader ¶

type ExtractorLoader struct {
	// contains filtered or unexported fields
}

ExtractorLoader provides advanced loading and caching for custom extractors

var GlobalExtractorLoader *ExtractorLoader

Global loader instance with optimized configuration

func NewExtractorLoader ¶

func NewExtractorLoader(config *LoaderConfig, registry *custom.RegistryManager) *ExtractorLoader

NewExtractorLoader creates a new loader with the given configuration

func (*ExtractorLoader) ClearCache ¶

func (el *ExtractorLoader) ClearCache()

ClearCache removes all entries from the cache

func (*ExtractorLoader) Close ¶

func (el *ExtractorLoader) Close() error

Close stops background processes and cleans up resources

func (*ExtractorLoader) GetCacheStats ¶

func (el *ExtractorLoader) GetCacheStats() (int, int, float64)

GetCacheStats returns information about the current cache state

func (*ExtractorLoader) GetMetrics ¶

func (el *ExtractorLoader) GetMetrics() *LoaderMetrics

GetMetrics returns a copy of current performance metrics

func (*ExtractorLoader) LoadExtractor ¶

func (el *ExtractorLoader) LoadExtractor(domain string) (*custom.CustomExtractor, error)

LoadExtractor loads an extractor by domain with advanced caching

func (*ExtractorLoader) LoadExtractorByHTML ¶

func (el *ExtractorLoader) LoadExtractorByHTML(doc *goquery.Document) (*custom.CustomExtractor, error)

LoadExtractorByHTML tries to detect an extractor using HTML content

func (*ExtractorLoader) WarmupCache ¶

func (el *ExtractorLoader) WarmupCache(domains []string) error

WarmupCache preloads specific domains into cache

type ExtractorOptions ¶

type ExtractorOptions struct {
	StripUnlikelyCandidates bool
	WeightNodes             bool
	CleanConditionally      bool
	URL                     string
	Content                 string
	Title                   string
}

ExtractorOptions contains options for field extraction

type ExtractorStats ¶

type ExtractorStats struct {
	TotalCandidates   int
	CheckedInParallel int
	FastestMatch      time.Duration
	SlowestMatch      time.Duration
	AverageCheckTime  time.Duration
	WinningExtractor  string
	WinningPriority   int
}

ExtractorStats provides statistics about parallel extractor checking

type FieldExtractor ¶

type FieldExtractor struct {
	Selectors       parser.SelectorList `json:"-"`                   // Type-safe selectors
	SelectorsLegacy []interface{}       `json:"selectors,omitempty"` // Deprecated: JSON compatibility
	AllowMultiple   bool                `json:"allowMultiple,omitempty"`
	DefaultCleaner  bool                `json:"defaultCleaner"` // defaults to true in JavaScript
}

FieldExtractor configuration for extracting a field

func (*FieldExtractor) GetSelectors ¶

func (fe *FieldExtractor) GetSelectors() parser.SelectorList

GetSelectors returns the type-safe selectors, migrating from legacy if needed

func (*FieldExtractor) MigrateSelectors ¶

func (fe *FieldExtractor) MigrateSelectors()

MigrateSelectors converts legacy []interface{} selectors to the type-safe SelectorList. This enables gradual migration from JavaScript patterns to Go idioms.

func (*FieldExtractor) SetSelectors ¶

func (fe *FieldExtractor) SetSelectors(selectors parser.SelectorList)

SetSelectors sets the type-safe selectors and updates legacy for JSON compatibility

type FullExtractor ¶

type FullExtractor struct {
	Domain           string   `json:"domain"`
	SupportedDomains []string `json:"supportedDomains,omitempty"`

	// Field extractors for standard fields
	Title         *FieldExtractor   `json:"title,omitempty"`
	Author        *FieldExtractor   `json:"author,omitempty"`
	Content       *ContentExtractor `json:"content,omitempty"`
	DatePublished *FieldExtractor   `json:"date_published,omitempty"`
	LeadImageURL  *FieldExtractor   `json:"lead_image_url,omitempty"`
	Dek           *FieldExtractor   `json:"dek,omitempty"`
	NextPageURL   *FieldExtractor   `json:"next_page_url,omitempty"`
	Excerpt       *FieldExtractor   `json:"excerpt,omitempty"`
	WordCount     *FieldExtractor   `json:"word_count,omitempty"`
	Direction     *FieldExtractor   `json:"direction,omitempty"`
	URL           *FieldExtractor   `json:"url,omitempty"`

	// Extended types for custom fields
	Extend map[string]*FieldExtractor `json:"extend,omitempty"`
}

FullExtractor represents a complete custom extractor with all field definitions

func GetExtractorByDomain ¶

func GetExtractorByDomain(domain string) (*FullExtractor, bool)

GetExtractorByDomain retrieves a specific extractor by domain from API registry

func (*FullExtractor) Extract ¶

func (f *FullExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)

Extract implements the parser.Extractor interface for FullExtractor

func (*FullExtractor) GetDomain ¶

func (f *FullExtractor) GetDomain() string

GetDomain implements the Extractor interface for FullExtractor

type LoaderConfig ¶

type LoaderConfig struct {
	MaxCacheSize       int           // Maximum number of cached extractors
	CacheExpiration    time.Duration // How long to keep extractors in cache
	PreloadCommonSites bool          // Whether to preload popular extractors
	EnableMetrics      bool          // Whether to track performance metrics
	MaxLoadAttempts    int           // Maximum attempts to load an extractor
	LoadTimeout        time.Duration // Timeout for loading operations
}

LoaderConfig contains configuration for the extractor loader

func DefaultLoaderConfig ¶

func DefaultLoaderConfig() *LoaderConfig

DefaultLoaderConfig returns sensible defaults for the loader

type LoaderMetrics ¶

type LoaderMetrics struct {
	CacheHits       int64
	CacheMisses     int64
	LoadSuccesses   int64
	LoadFailures    int64
	TotalLoadTime   time.Duration
	AverageLoadTime time.Duration
	EvictionCount   int64
}

LoaderMetrics tracks performance statistics

type MediumExtractor ¶

type MediumExtractor struct{}

MediumExtractor represents the Medium.com custom extractor

func (*MediumExtractor) Extract ¶

func (m *MediumExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)

func (*MediumExtractor) GetDomain ¶

func (m *MediumExtractor) GetDomain() string

type ParallelExtractorConfig ¶

type ParallelExtractorConfig struct {
	MaxConcurrentChecks int           // Maximum number of goroutines
	CheckTimeout        time.Duration // Timeout for individual extractor checks
	EnableStats         bool          // Whether to collect detailed statistics
	EnableCaching       bool          // Whether to cache extractor decisions
}

ParallelExtractorConfig configures parallel extractor behavior

func DefaultParallelConfig ¶

func DefaultParallelConfig() *ParallelExtractorConfig

DefaultParallelConfig returns a sensible default configuration

type ParallelExtractorResult ¶

type ParallelExtractorResult struct {
	Candidate *ExtractorCandidate
	Error     error
	Duration  time.Duration
}

ParallelExtractorResult holds the result of parallel extractor checking

type ResourceInterface ¶

type ResourceInterface interface {
	Create(url string, preparedResponse string, parsedURL interface{}, headers map[string]string) (*goquery.Document, error)
}

ResourceInterface defines the interface for resource fetching

type RootExtractorInterface ¶

type RootExtractorInterface struct{}

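RootExtractorInterface is the root extractor type. Despite its name, it is declared as an empty struct rather than a Go interface; its behavior is provided by its Extract method.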

func (*RootExtractorInterface) Extract ¶

func (r *RootExtractorInterface) Extract(extractor interface{}, opts ExtractOptions) interface{}

Extract is the main orchestration method. Direct port of the JavaScript RootExtractor.extract function.

type SelectOptions ¶

type SelectOptions struct {
	Doc            *goquery.Document
	Type           string
	ExtractionOpts interface{}
	ExtractHTML    bool
	URL            string
}

SelectOptions contains parameters for field selection

type SelectOpts ¶

type SelectOpts struct {
	Doc            *goquery.Document
	Type           string
	ExtractionOpts interface{}
	ExtractHTML    bool
	URL            string
}

SelectOpts contains parameters for the select function

type SelectorEntry ¶

type SelectorEntry struct {
	Selector        string
	Attribute       string
	TransformFunc   func(string) string
	IsMultiSelector bool
	IsAttribute     bool
}

SelectorEntry represents a parsed selector with metadata

type SimpleExtractor ¶

type SimpleExtractor struct {
	Domain string
}

SimpleExtractor represents a basic extractor for demonstration

func JavaScriptCompatibleGetExtractor ¶

func JavaScriptCompatibleGetExtractor(urlStr string, parsedURL *url.URL, doc *goquery.Document) (SimpleExtractor, error)

JavaScriptCompatibleGetExtractor demonstrates the exact JavaScript getExtractor logic. This is a faithful 1:1 port showing the correct behavior without type system conflicts.

func (SimpleExtractor) GetDomain ¶

func (s SimpleExtractor) GetDomain() string

GetDomain returns the domain this extractor handles

type SimpleRootExtractor ¶

type SimpleRootExtractor struct{}

SimpleRootExtractor implements a simplified root extractor to avoid conflicts

func (*SimpleRootExtractor) Extract ¶

func (r *SimpleRootExtractor) Extract(extractor interface{}, opts ExtractOpts) interface{}

Extract is the main orchestration method. Direct port of the JavaScript RootExtractor.extract function.

type TransformFunc ¶

type TransformFunc func(*goquery.Selection, *goquery.Document) interface{}

TransformFunc is a function type for DOM transformations

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
custom
fields
generic
validation	Package validation provides a comprehensive field validation framework for extracted fields and extended field support.

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL