Documentation
¶
Overview ¶
ABOUTME: Advanced extractor loader with LRU caching and dynamic loading. Reduces startup memory by 90% through lazy loading and automatic cache management.
Index ¶
- Variables
- func AddExtractor(extractor *FullExtractor) interface{}
- func CleanBySelectors(content *goquery.Selection, doc *goquery.Document, opts map[string][]string) *goquery.Selection
- func CleanBySelectorsList(content *goquery.Selection, doc *goquery.Document, clean []string) *goquery.Selection
- func ClearAPIExtractors()
- func CollectAllPages(opts CollectAllPagesOptions) map[string]interface{}
- func FindMatchingSelector(doc *goquery.Document, selectors []interface{}, extractHTML bool, ...) interface{}
- func FindMatchingSelectorFromList(doc *goquery.Document, selectors []interface{}, extractHTML bool, ...) interface{}
- func GetAPIExtractors() map[string]Extractor
- func GetAPIExtractorsImpl() map[string]*FullExtractor
- func GetAllExtractors() map[string]Extractor
- func GetExtractorCount() int
- func GetExtractorWithStats(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, *ExtractorStats, error)
- func GetExtractorWithStatsContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, ...) (Extractor, *ExtractorStats, error)
- func GetRegistryStats() (primary int, total int)
- func HasExtractor(domain string) bool
- func InitializeGlobalLoader(config *LoaderConfig)
- func ListRegisteredDomains() []string
- func RegisterAllCustomExtractors() error
- func Select(opts SelectOptions) interface{}
- func SelectExtendedFields(extend map[string]interface{}, opts SelectOpts) map[string]interface{}
- func SelectExtendedTypes(extend map[string]interface{}, opts SelectOptions) map[string]interface{}
- func SelectField(opts SelectOpts) interface{}
- func TransformElements(content *goquery.Selection, doc *goquery.Document, ...) *goquery.Selection
- func TransformElementsList(content *goquery.Selection, doc *goquery.Document, ...) *goquery.Selection
- type BasicGenericExtractor
- type BloggerExtractor
- type CacheEntry
- type CollectAllPagesOptions
- type ContentExtractor
- func (ce *ContentExtractor) GetSelectors() parser.SelectorList
- func (ce *ContentExtractor) GetTransforms() parser.TransformRegistry
- func (ce *ContentExtractor) MigrateSelectors()
- func (ce *ContentExtractor) MigrateTransforms()
- func (ce *ContentExtractor) SetSelectors(selectors parser.SelectorList)
- func (ce *ContentExtractor) SetTransforms(transforms parser.TransformRegistry)
- type CustomExtractorAdapter
- func (c *CustomExtractorAdapter) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
- func (c *CustomExtractorAdapter) GetCustomExtractor() *custom.CustomExtractor
- func (c *CustomExtractorAdapter) GetDomain() string
- func (c *CustomExtractorAdapter) GetSupportedDomains() []string
- type DetectByHTMLFunc
- type ExtractOptions
- type ExtractOpts
- type Extractor
- func CreateCustomExtractorAdapters() []Extractor
- func DetectByHTML(doc *goquery.Document) Extractor
- func GenericExtractor() Extractor
- func GetExtractor(urlStr string, parsedURL *url.URL, doc *goquery.Document) (Extractor, error)
- func GetExtractorParallel(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, error)
- func GetExtractorParallelWithContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, ...) (Extractor, error)
- func GetExtractorSimple(urlStr string, parsedURL *url.URL, doc *goquery.Document) (Extractor, error)
- type ExtractorCandidate
- type ExtractorChecker
- type ExtractorError
- type ExtractorLoader
- func (el *ExtractorLoader) ClearCache()
- func (el *ExtractorLoader) Close() error
- func (el *ExtractorLoader) GetCacheStats() (int, int, float64)
- func (el *ExtractorLoader) GetMetrics() *LoaderMetrics
- func (el *ExtractorLoader) LoadExtractor(domain string) (*custom.CustomExtractor, error)
- func (el *ExtractorLoader) LoadExtractorByHTML(doc *goquery.Document) (*custom.CustomExtractor, error)
- func (el *ExtractorLoader) WarmupCache(domains []string) error
- type ExtractorOptions
- type ExtractorStats
- type FieldExtractor
- type FullExtractor
- type LoaderConfig
- type LoaderMetrics
- type MediumExtractor
- type ParallelExtractorConfig
- type ParallelExtractorResult
- type ResourceInterface
- type RootExtractorInterface
- type SelectOptions
- type SelectOpts
- type SelectorEntry
- type SimpleExtractor
- type SimpleRootExtractor
- type TransformFunc
Constants ¶
This section is empty.
Variables ¶
var ( // All contains all static custom extractors, populated from custom registry // JavaScript equivalent: The result of all.js processing All = make(map[string]Extractor) // CustomRegistry manages all custom extractors with domain mapping CustomRegistry = custom.GlobalRegistryManager )
Registry variables for extractor storage. These are now properly integrated with the custom extractor framework.
var NewSimpleRootExtractor = &SimpleRootExtractor{}
NewSimpleRootExtractor is a package-level SimpleRootExtractor instance. Despite the New prefix, it is a variable, not a constructor function.
var RootExtractor = &RootExtractorInterface{}
RootExtractor is the singleton instance
Functions ¶
func AddExtractor ¶
func AddExtractor(extractor *FullExtractor) interface{}
AddExtractor adds a custom extractor to the runtime registry. Direct port of the JavaScript addExtractor function with 100% behavioral compatibility.
func CleanBySelectors ¶
func CleanBySelectors(content *goquery.Selection, doc *goquery.Document, opts map[string][]string) *goquery.Selection
CleanBySelectors removes elements by an array of selectors. Direct port of the JavaScript cleanBySelectors function.
func CleanBySelectorsList ¶
func CleanBySelectorsList(content *goquery.Selection, doc *goquery.Document, clean []string) *goquery.Selection
CleanBySelectorsList removes elements by an array of selectors. Direct port of the JavaScript cleanBySelectors function.
func ClearAPIExtractors ¶
func ClearAPIExtractors()
ClearAPIExtractors clears all registered extractors (useful for testing)
func CollectAllPages ¶
func CollectAllPages(opts CollectAllPagesOptions) map[string]interface{}
CollectAllPages collects and merges content from multiple pages of an article. This is a faithful 1:1 port of the JavaScript collectAllPages function with:
- Page counter starting at 1 (first page already fetched)
- 26-page safety limit to prevent infinite loops
- URL deduplication using the RemoveAnchor utility
- Progressive content concatenation with <hr><h4>Page N</h4> separators
- Final word count calculation for combined content
func FindMatchingSelector ¶
func FindMatchingSelector(doc *goquery.Document, selectors []interface{}, extractHTML bool, allowMultiple bool) interface{}
FindMatchingSelector finds the first selector that matches content. Direct port of the JavaScript findMatchingSelector function.
func FindMatchingSelectorFromList ¶
func FindMatchingSelectorFromList(doc *goquery.Document, selectors []interface{}, extractHTML bool, allowMultiple bool) interface{}
FindMatchingSelectorFromList finds the first selector that matches content. Direct port of the JavaScript findMatchingSelector function.
func GetAPIExtractors ¶
GetAPIExtractors returns all runtime-registered extractors as Extractor interface
func GetAPIExtractorsImpl ¶
func GetAPIExtractorsImpl() map[string]*FullExtractor
GetAPIExtractorsImpl returns a copy of all runtime-registered extractors as FullExtractor
func GetAllExtractors ¶
GetAllExtractors returns a map of all custom extractors keyed by domain. JavaScript equivalent: export default Object.keys(CustomExtractors).reduce((acc, key) => { ... }, {});
func GetExtractorCount ¶
func GetExtractorCount() int
GetExtractorCount returns the number of registered extractors
func GetExtractorWithStats ¶
func GetExtractorWithStats(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, *ExtractorStats, error)
GetExtractorWithStats performs parallel lookup and returns performance statistics.
Deprecated: This function uses context.Background(), which prevents proper cancellation. Use GetExtractorWithStatsContext instead.
func GetExtractorWithStatsContext ¶
func GetExtractorWithStatsContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, *ExtractorStats, error)
GetExtractorWithStatsContext performs parallel lookup with context and returns performance statistics
func GetRegistryStats ¶
GetRegistryStats returns statistics about the registered extractors
func HasExtractor ¶
HasExtractor checks if an extractor is registered for the given domain
func InitializeGlobalLoader ¶
func InitializeGlobalLoader(config *LoaderConfig)
InitializeGlobalLoader sets up the global loader with custom configuration
func ListRegisteredDomains ¶
func ListRegisteredDomains() []string
ListRegisteredDomains returns all domains registered in the global registry
func RegisterAllCustomExtractors ¶
func RegisterAllCustomExtractors() error
RegisterAllCustomExtractors registers all available custom extractors with the global registry. This function iterates through all extractors from GetAllCustomExtractors() and registers them. Returns an error if any registration fails.
func Select ¶
func Select(opts SelectOptions) interface{}
Select performs field extraction with selector processing. Direct port of the JavaScript select function.
func SelectExtendedFields ¶
func SelectExtendedFields(extend map[string]interface{}, opts SelectOpts) map[string]interface{}
SelectExtendedFields processes extended field types. Direct port of the JavaScript selectExtendedTypes function.
func SelectExtendedTypes ¶
func SelectExtendedTypes(extend map[string]interface{}, opts SelectOptions) map[string]interface{}
SelectExtendedTypes processes extended field types. Direct port of the JavaScript selectExtendedTypes function.
func SelectField ¶
func SelectField(opts SelectOpts) interface{}
SelectField performs field extraction with selector processing. Direct port of the JavaScript select function.
Types ¶
type BasicGenericExtractor ¶
type BasicGenericExtractor struct {
// contains filtered or unexported fields
}
BasicGenericExtractor implements the Extractor interface for generic extraction
func (*BasicGenericExtractor) Extract ¶
func (bge *BasicGenericExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
Extract performs generic extraction by delegating to the generic package
func (*BasicGenericExtractor) GetDomain ¶
func (bge *BasicGenericExtractor) GetDomain() string
GetDomain returns the domain this extractor handles
type BloggerExtractor ¶
type BloggerExtractor struct{}
BloggerExtractor represents the Blogger/Blogspot custom extractor
func (*BloggerExtractor) Extract ¶
func (b *BloggerExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
func (*BloggerExtractor) GetDomain ¶
func (b *BloggerExtractor) GetDomain() string
type CacheEntry ¶
type CacheEntry struct {
Extractor *custom.CustomExtractor
LoadTime time.Time
AccessTime time.Time
AccessCount int64
Element *list.Element // For LRU tracking
}
CacheEntry represents a cached extractor with metadata
type CollectAllPagesOptions ¶
type CollectAllPagesOptions struct {
NextPageURL string
HTML string
Doc *goquery.Document
MetaCache map[string]interface{}
Result map[string]interface{}
Extractor interface{}
Title interface{}
URL string
Resource ResourceInterface
RootExtractor *RootExtractorInterface
}
CollectAllPagesOptions contains all parameters needed for multi-page collection. This matches the JavaScript function signature exactly.
type ContentExtractor ¶
type ContentExtractor struct {
Selectors parser.SelectorList `json:"-"` // Type-safe selectors
SelectorsLegacy []interface{} `json:"selectors,omitempty"` // Deprecated: JSON compatibility
AllowMultiple bool `json:"allowMultiple,omitempty"`
DefaultCleaner bool `json:"defaultCleaner"`
Clean []string `json:"clean,omitempty"` // Selectors to remove
Transforms parser.TransformRegistry `json:"-"` // Type-safe transforms
TransformsLegacy map[string]interface{} `json:"transforms,omitempty"` // Deprecated: JSON compatibility
}
ContentExtractor configuration for content extraction with transforms and cleaning
func (*ContentExtractor) GetSelectors ¶
func (ce *ContentExtractor) GetSelectors() parser.SelectorList
GetSelectors returns the type-safe selectors, migrating from legacy if needed
func (*ContentExtractor) GetTransforms ¶
func (ce *ContentExtractor) GetTransforms() parser.TransformRegistry
GetTransforms returns the type-safe transforms, migrating from legacy if needed
func (*ContentExtractor) MigrateSelectors ¶
func (ce *ContentExtractor) MigrateSelectors()
MigrateSelectors for ContentExtractor
func (*ContentExtractor) MigrateTransforms ¶
func (ce *ContentExtractor) MigrateTransforms()
MigrateTransforms converts legacy map[string]interface{} transforms to type-safe TransformRegistry
func (*ContentExtractor) SetSelectors ¶
func (ce *ContentExtractor) SetSelectors(selectors parser.SelectorList)
SetSelectors sets the type-safe selectors and updates legacy for JSON compatibility
func (*ContentExtractor) SetTransforms ¶
func (ce *ContentExtractor) SetTransforms(transforms parser.TransformRegistry)
SetTransforms sets the type-safe transforms and updates legacy for JSON compatibility
type CustomExtractorAdapter ¶
type CustomExtractorAdapter struct {
// contains filtered or unexported fields
}
CustomExtractorAdapter wraps a CustomExtractor to implement the main Extractor interface. This allows all 160 custom extractors to be used by the main parser system.
func GetCustomExtractorByDomain ¶
func GetCustomExtractorByDomain(domain string) (*CustomExtractorAdapter, bool)
GetCustomExtractorByDomain retrieves a custom extractor adapter for a specific domain
func NewCustomExtractorAdapter ¶
func NewCustomExtractorAdapter(customExtractor *custom.CustomExtractor) *CustomExtractorAdapter
NewCustomExtractorAdapter creates a new adapter for a CustomExtractor
func (*CustomExtractorAdapter) Extract ¶
func (c *CustomExtractorAdapter) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
Extract performs extraction using the custom extractor's configuration. Implements the parser.Extractor interface.
func (*CustomExtractorAdapter) GetCustomExtractor ¶
func (c *CustomExtractorAdapter) GetCustomExtractor() *custom.CustomExtractor
GetCustomExtractor returns the underlying CustomExtractor. This allows access to the full extractor configuration when needed.
func (*CustomExtractorAdapter) GetDomain ¶
func (c *CustomExtractorAdapter) GetDomain() string
GetDomain returns the primary domain this extractor handles. Implements the Extractor interface.
func (*CustomExtractorAdapter) GetSupportedDomains ¶
func (c *CustomExtractorAdapter) GetSupportedDomains() []string
GetSupportedDomains returns all domains this extractor supports
type DetectByHTMLFunc ¶
DetectByHTMLFunc type for HTML-based extractor detection
type ExtractOptions ¶
type ExtractOptions struct {
Doc *goquery.Document
URL string
Extractor interface{}
ContentOnly bool
ExtractedTitle interface{}
Fallback bool
}
ExtractOptions contains parameters for the root extractor
type ExtractOpts ¶
type ExtractOpts struct {
Doc *goquery.Document
URL string
Extractor interface{}
ContentOnly bool
ExtractedTitle interface{}
Fallback bool
}
ExtractOpts contains parameters for the root extractor
type Extractor ¶
Use the standard parser.Extractor interface
func CreateCustomExtractorAdapters ¶
func CreateCustomExtractorAdapters() []Extractor
CreateCustomExtractorAdapters converts all custom extractors to adapter instances. This is the key function that bridges the custom extractor system to the main parser.
func DetectByHTML ¶
DetectByHTML identifies an appropriate extractor based on HTML meta tags. JavaScript equivalent: export default function detectByHtml($)
func GenericExtractor ¶
func GenericExtractor() Extractor
GenericExtractor creates a generic fallback extractor. Returns a basic implementation that satisfies the Extractor interface.
func GetExtractor ¶
GetExtractor returns the appropriate extractor for a given URL. Direct 1:1 port of the JavaScript getExtractor function with identical behavior.
JavaScript signature: getExtractor(url, parsedUrl, $). Go signature: GetExtractor(url, parsedUrl, doc); $ becomes doc for goquery compatibility.
func GetExtractorParallel ¶
func GetExtractorParallel(urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, error)
GetExtractorParallel performs parallel extractor lookup with priority ordering. This is the high-performance version of GetExtractor that uses goroutines.
Deprecated: This function uses context.Background(), which prevents proper cancellation. Use GetExtractorParallelWithContext instead.
func GetExtractorParallelWithContext ¶
func GetExtractorParallelWithContext(ctx context.Context, urlStr string, parsedURL *neturl.URL, doc *goquery.Document) (Extractor, error)
GetExtractorParallelWithContext performs parallel extractor lookup with context for cancellation
func GetExtractorSimple ¶
func GetExtractorSimple(urlStr string, parsedURL *url.URL, doc *goquery.Document) (Extractor, error)
GetExtractorSimple returns the appropriate extractor for a given URL. Direct 1:1 port of the JavaScript getExtractor function with identical behavior. Simplified version that works with the existing type system.
type ExtractorCandidate ¶
type ExtractorCandidate struct {
Extractor Extractor
Priority int // Lower number = higher priority
Source string // "api_hostname", "api_domain", "static_hostname", etc.
Key string // The key used to find this extractor
}
ExtractorCandidate represents a potential extractor with its priority
type ExtractorChecker ¶
type ExtractorChecker interface {
CanHandle(doc *goquery.Document, url string) (bool, error)
GetPriority() int
GetSource() string
}
ExtractorChecker defines the interface for checking if an extractor can handle a URL
type ExtractorError ¶
ExtractorError represents error response from addExtractor
type ExtractorLoader ¶
type ExtractorLoader struct {
// contains filtered or unexported fields
}
ExtractorLoader provides advanced loading and caching for custom extractors
var GlobalExtractorLoader *ExtractorLoader
Global loader instance with optimized configuration
func NewExtractorLoader ¶
func NewExtractorLoader(config *LoaderConfig, registry *custom.RegistryManager) *ExtractorLoader
NewExtractorLoader creates a new loader with the given configuration
func (*ExtractorLoader) ClearCache ¶
func (el *ExtractorLoader) ClearCache()
ClearCache removes all entries from the cache
func (*ExtractorLoader) Close ¶
func (el *ExtractorLoader) Close() error
Close stops background processes and cleans up resources
func (*ExtractorLoader) GetCacheStats ¶
func (el *ExtractorLoader) GetCacheStats() (int, int, float64)
GetCacheStats returns information about the current cache state
func (*ExtractorLoader) GetMetrics ¶
func (el *ExtractorLoader) GetMetrics() *LoaderMetrics
GetMetrics returns a copy of current performance metrics
func (*ExtractorLoader) LoadExtractor ¶
func (el *ExtractorLoader) LoadExtractor(domain string) (*custom.CustomExtractor, error)
LoadExtractor loads an extractor by domain with advanced caching
func (*ExtractorLoader) LoadExtractorByHTML ¶
func (el *ExtractorLoader) LoadExtractorByHTML(doc *goquery.Document) (*custom.CustomExtractor, error)
LoadExtractorByHTML tries to detect an extractor using HTML content
func (*ExtractorLoader) WarmupCache ¶
func (el *ExtractorLoader) WarmupCache(domains []string) error
WarmupCache preloads specific domains into cache
type ExtractorOptions ¶
type ExtractorOptions struct {
StripUnlikelyCandidates bool
WeightNodes bool
CleanConditionally bool
URL string
Content string
Title string
}
ExtractorOptions contains options for field extraction
type ExtractorStats ¶
type ExtractorStats struct {
TotalCandidates int
CheckedInParallel int
FastestMatch time.Duration
SlowestMatch time.Duration
AverageCheckTime time.Duration
WinningExtractor string
WinningPriority int
}
ExtractorStats provides statistics about parallel extractor checking
type FieldExtractor ¶
type FieldExtractor struct {
Selectors parser.SelectorList `json:"-"` // Type-safe selectors
SelectorsLegacy []interface{} `json:"selectors,omitempty"` // Deprecated: JSON compatibility
AllowMultiple bool `json:"allowMultiple,omitempty"`
DefaultCleaner bool `json:"defaultCleaner"` // defaults to true in JavaScript
}
FieldExtractor configuration for extracting a field
func (*FieldExtractor) GetSelectors ¶
func (fe *FieldExtractor) GetSelectors() parser.SelectorList
GetSelectors returns the type-safe selectors, migrating from legacy if needed
func (*FieldExtractor) MigrateSelectors ¶
func (fe *FieldExtractor) MigrateSelectors()
MigrateSelectors converts legacy []interface{} selectors to a type-safe SelectorList. This enables gradual migration from JavaScript patterns to Go idioms.
func (*FieldExtractor) SetSelectors ¶
func (fe *FieldExtractor) SetSelectors(selectors parser.SelectorList)
SetSelectors sets the type-safe selectors and updates legacy for JSON compatibility
type FullExtractor ¶
type FullExtractor struct {
Domain string `json:"domain"`
SupportedDomains []string `json:"supportedDomains,omitempty"`
// Field extractors for standard fields
Title *FieldExtractor `json:"title,omitempty"`
Author *FieldExtractor `json:"author,omitempty"`
Content *ContentExtractor `json:"content,omitempty"`
DatePublished *FieldExtractor `json:"date_published,omitempty"`
LeadImageURL *FieldExtractor `json:"lead_image_url,omitempty"`
Dek *FieldExtractor `json:"dek,omitempty"`
NextPageURL *FieldExtractor `json:"next_page_url,omitempty"`
Excerpt *FieldExtractor `json:"excerpt,omitempty"`
WordCount *FieldExtractor `json:"word_count,omitempty"`
Direction *FieldExtractor `json:"direction,omitempty"`
URL *FieldExtractor `json:"url,omitempty"`
// Extended types for custom fields
Extend map[string]*FieldExtractor `json:"extend,omitempty"`
}
FullExtractor represents a complete custom extractor with all field definitions
func GetExtractorByDomain ¶
func GetExtractorByDomain(domain string) (*FullExtractor, bool)
GetExtractorByDomain retrieves a specific extractor by domain from API registry
func (*FullExtractor) Extract ¶
func (f *FullExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
Extract implements the parser.Extractor interface for FullExtractor
func (*FullExtractor) GetDomain ¶
func (f *FullExtractor) GetDomain() string
GetDomain implements the Extractor interface for FullExtractor
type LoaderConfig ¶
type LoaderConfig struct {
MaxCacheSize int // Maximum number of cached extractors
CacheExpiration time.Duration // How long to keep extractors in cache
PreloadCommonSites bool // Whether to preload popular extractors
EnableMetrics bool // Whether to track performance metrics
MaxLoadAttempts int // Maximum attempts to load an extractor
LoadTimeout time.Duration // Timeout for loading operations
}
LoaderConfig contains configuration for the extractor loader
func DefaultLoaderConfig ¶
func DefaultLoaderConfig() *LoaderConfig
DefaultLoaderConfig returns sensible defaults for the loader
type LoaderMetrics ¶
type LoaderMetrics struct {
CacheHits int64
CacheMisses int64
LoadSuccesses int64
LoadFailures int64
TotalLoadTime time.Duration
AverageLoadTime time.Duration
EvictionCount int64
}
LoaderMetrics tracks performance statistics
type MediumExtractor ¶
type MediumExtractor struct{}
MediumExtractor represents the Medium.com custom extractor
func (*MediumExtractor) Extract ¶
func (m *MediumExtractor) Extract(doc *goquery.Document, url string, opts *parser.ExtractorOptions) (*parser.Result, error)
func (*MediumExtractor) GetDomain ¶
func (m *MediumExtractor) GetDomain() string
type ParallelExtractorConfig ¶
type ParallelExtractorConfig struct {
MaxConcurrentChecks int // Maximum number of goroutines
CheckTimeout time.Duration // Timeout for individual extractor checks
EnableStats bool // Whether to collect detailed statistics
EnableCaching bool // Whether to cache extractor decisions
}
ParallelExtractorConfig configures parallel extractor behavior
func DefaultParallelConfig ¶
func DefaultParallelConfig() *ParallelExtractorConfig
DefaultParallelConfig returns a sensible default configuration
type ParallelExtractorResult ¶
type ParallelExtractorResult struct {
Candidate *ExtractorCandidate
Error error
Duration time.Duration
}
ParallelExtractorResult holds the result of parallel extractor checking
type ResourceInterface ¶
type ResourceInterface interface {
Create(url string, preparedResponse string, parsedURL interface{}, headers map[string]string) (*goquery.Document, error)
}
ResourceInterface defines the interface for resource fetching
type RootExtractorInterface ¶
type RootExtractorInterface struct{}
RootExtractorInterface is the root extractor type. Despite its name, it is declared as an empty struct, not a Go interface.
func (*RootExtractorInterface) Extract ¶
func (r *RootExtractorInterface) Extract(extractor interface{}, opts ExtractOptions) interface{}
Extract is the main orchestration method. Direct port of the JavaScript RootExtractor.extract function.
type SelectOptions ¶
type SelectOptions struct {
Doc *goquery.Document
Type string
ExtractionOpts interface{}
ExtractHTML bool
URL string
}
SelectOptions contains parameters for field selection
type SelectOpts ¶
type SelectOpts struct {
Doc *goquery.Document
Type string
ExtractionOpts interface{}
ExtractHTML bool
URL string
}
SelectOpts contains parameters for the select function
type SelectorEntry ¶
type SelectorEntry struct {
Selector string
Attribute string
TransformFunc func(string) string
IsMultiSelector bool
IsAttribute bool
}
SelectorEntry represents a parsed selector with metadata
type SimpleExtractor ¶
type SimpleExtractor struct {
Domain string
}
SimpleExtractor represents a basic extractor for demonstration
func JavaScriptCompatibleGetExtractor ¶
func JavaScriptCompatibleGetExtractor(urlStr string, parsedURL *url.URL, doc *goquery.Document) (SimpleExtractor, error)
JavaScriptCompatibleGetExtractor demonstrates the exact JavaScript getExtractor logic. This is a faithful 1:1 port showing the correct behavior without type system conflicts.
func (SimpleExtractor) GetDomain ¶
func (s SimpleExtractor) GetDomain() string
GetDomain returns the domain this extractor handles
type SimpleRootExtractor ¶
type SimpleRootExtractor struct{}
SimpleRootExtractor implements a simplified root extractor to avoid conflicts
func (*SimpleRootExtractor) Extract ¶
func (r *SimpleRootExtractor) Extract(extractor interface{}, opts ExtractOpts) interface{}
Extract is the main orchestration method. Direct port of the JavaScript RootExtractor.extract function.
Source Files
¶
Directories
¶
| Path | Synopsis |
|---|---|
| validation | Package validation provides a comprehensive field validation framework for extracted fields and extended field support. |