Documentation
¶
Index ¶
- func ApplyTransforms(selection *goquery.Selection, transforms TransformRegistry) *goquery.Selection
- func ClassifyErrorCode(err error, ctx context.Context, op string) int
- func WrapError(err error, phase string, url string) error
- type AttributeRemoveTransform
- type AttributeSetTransform
- type ClassAddTransform
- type ClassRemoveTransform
- type ContentExtractor
- type CustomExtractor
- type CustomFunctionTransform
- type ErrorCollection
- func (ec *ErrorCollection) Add(err *ParseError)
- func (ec *ErrorCollection) Clear()
- func (ec *ErrorCollection) Count() int
- func (ec *ErrorCollection) Error() string
- func (ec *ErrorCollection) First() *ParseError
- func (ec *ErrorCollection) GetByPhase(phase string) []*ParseError
- func (ec *ErrorCollection) GetByURL(url string) []*ParseError
- func (ec *ErrorCollection) HasErrors() bool
- func (ec *ErrorCollection) HasPhaseErrors(phase string) bool
- func (ec *ErrorCollection) Last() *ParseError
- type Extractor
- type ExtractorFunc
- type ExtractorOptions
- type FieldExtractor
- type Hermes
- func (h *Hermes) GetStats() *PoolStats
- func (h *Hermes) Parse(targetURL string, opts *ParserOptions) (*Result, error)
- func (h *Hermes) ParseHTML(html string, targetURL string, opts *ParserOptions) (*Result, error)
- func (h *Hermes) ParseHTMLWithContext(ctx context.Context, html string, targetURL string, opts *ParserOptions) (*Result, error)
- func (h *Hermes) ParseWithContext(ctx context.Context, targetURL string, opts *ParserOptions) (*Result, error)
- func (h *Hermes) ResetStats()
- func (h *Hermes) ReturnResult(result *Result)
- type ParseError
- func ConvertError(err error) *ParseError
- func NewExtractionError(url string, field string, selector string, err error) *ParseError
- func NewFetchError(url string, err error) *ParseError
- func NewParseError(phase string, url string, err error) *ParseError
- func NewTimeoutError(url string, phase string, duration time.Duration) *ParseError
- func NewValidationError(url string, message string, err error) *ParseError
- func (pe *ParseError) Error() string
- func (pe *ParseError) GetDomain() string
- func (pe *ParseError) Is(target error) bool
- func (pe *ParseError) IsExtractionError() bool
- func (pe *ParseError) IsNetworkError() bool
- func (pe *ParseError) IsTimeoutError() bool
- func (pe *ParseError) IsValidationError() bool
- func (pe *ParseError) Unwrap() error
- func (pe *ParseError) WithField(field string) *ParseError
- func (pe *ParseError) WithMessage(message string) *ParseError
- func (pe *ParseError) WithSelector(selector string) *ParseError
- type ParseErrorType
- type Parser
- type ParserOptions
- type PoolStats
- type Result
- type SelectorConfig
- type SelectorList
- type TagRenameTransform
- type TextReplaceTransform
- type Transform
- type TransformFunc
- type TransformRegistry
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ApplyTransforms ¶
func ApplyTransforms(selection *goquery.Selection, transforms TransformRegistry) *goquery.Selection
ApplyTransforms applies all transforms in the registry to a selection
func ClassifyErrorCode ¶
ClassifyErrorCode determines the appropriate error code based on the error type and context. This replaces string-based error classification with proper type checking. Returns an int that corresponds to the public ErrorCode values.
Types ¶
type AttributeRemoveTransform ¶
type AttributeRemoveTransform struct {
AttributeName string
}
AttributeRemoveTransform removes an attribute
func (*AttributeRemoveTransform) Name ¶
func (t *AttributeRemoveTransform) Name() string
type AttributeSetTransform ¶
AttributeSetTransform sets an attribute value
func (*AttributeSetTransform) Name ¶
func (t *AttributeSetTransform) Name() string
type ClassAddTransform ¶
type ClassAddTransform struct {
ClassName string
}
ClassAddTransform adds a CSS class to elements
func (*ClassAddTransform) Name ¶
func (t *ClassAddTransform) Name() string
type ClassRemoveTransform ¶
type ClassRemoveTransform struct {
ClassName string
}
ClassRemoveTransform removes a CSS class from elements
func (*ClassRemoveTransform) Name ¶
func (t *ClassRemoveTransform) Name() string
type ContentExtractor ¶
type ContentExtractor struct {
FieldExtractor
Clean []string // Selectors to remove
Transforms map[string]TransformFunc // Element transformations
}
ContentExtractor extends FieldExtractor with cleaning options
type CustomExtractor ¶
type CustomExtractor struct {
Domain string
Title FieldExtractor
Author FieldExtractor
Content ContentExtractor
DatePublished FieldExtractor
LeadImageURL FieldExtractor
Dek FieldExtractor
NextPageURL FieldExtractor
Excerpt FieldExtractor
Extend map[string]FieldExtractor
}
CustomExtractor defines site-specific extraction rules
type CustomFunctionTransform ¶
type CustomFunctionTransform struct {
TransformName string
TransformFunction func(*goquery.Selection) *goquery.Selection
}
CustomFunctionTransform wraps a custom function for compatibility. Used during the migration period when some transforms are still functions.
func (*CustomFunctionTransform) Name ¶
func (t *CustomFunctionTransform) Name() string
type ErrorCollection ¶
type ErrorCollection struct {
Errors []*ParseError `json:"errors"`
}
ErrorCollection holds multiple parse errors
func (*ErrorCollection) Add ¶
func (ec *ErrorCollection) Add(err *ParseError)
Add adds a new error to the collection
func (*ErrorCollection) Clear ¶
func (ec *ErrorCollection) Clear()
Clear removes all errors from the collection
func (*ErrorCollection) Count ¶
func (ec *ErrorCollection) Count() int
Count returns the number of errors
func (*ErrorCollection) Error ¶
func (ec *ErrorCollection) Error() string
Error implements the error interface for ErrorCollection
func (*ErrorCollection) First ¶
func (ec *ErrorCollection) First() *ParseError
First returns the first error or nil if no errors
func (*ErrorCollection) GetByPhase ¶
func (ec *ErrorCollection) GetByPhase(phase string) []*ParseError
GetByPhase returns all errors from a specific phase
func (*ErrorCollection) GetByURL ¶
func (ec *ErrorCollection) GetByURL(url string) []*ParseError
GetByURL returns all errors from a specific URL
func (*ErrorCollection) HasErrors ¶
func (ec *ErrorCollection) HasErrors() bool
HasErrors returns true if there are any errors
func (*ErrorCollection) HasPhaseErrors ¶
func (ec *ErrorCollection) HasPhaseErrors(phase string) bool
HasPhaseErrors checks if there are errors in a specific phase
func (*ErrorCollection) Last ¶
func (ec *ErrorCollection) Last() *ParseError
Last returns the last error or nil if no errors
type Extractor ¶
type Extractor interface {
Extract(doc *goquery.Document, url string, opts *ExtractorOptions) (*Result, error)
GetDomain() string
}
Extractor defines the interface for content extractors
type ExtractorFunc ¶
ExtractorFunc is a custom extraction function
type ExtractorOptions ¶
type ExtractorOptions struct {
URL string
HTML string
MetaCache map[string]string
Fallback bool
ContentType string
}
ExtractorOptions configures individual extractors
func DefaultExtractorOptions ¶
func DefaultExtractorOptions() *ExtractorOptions
DefaultExtractorOptions returns default extractor options
type FieldExtractor ¶
type FieldExtractor struct {
Selectors SelectorList // Type-safe CSS selectors (replaces []interface{})
SelectorsLegacy []interface{} `json:"selectors,omitempty"` // Deprecated: use Selectors instead
AllowMultiple bool
DefaultCleaner bool
}
FieldExtractor defines extraction rules for a specific field
type Hermes ¶
type Hermes struct {
// contains filtered or unexported fields
}
Hermes (formerly Mercury) is the main parser implementation
func NewParser ¶
func NewParser() *Hermes
NewParser creates a new parser instance (convenience function)
func (*Hermes) Parse ¶
func (h *Hermes) Parse(targetURL string, opts *ParserOptions) (*Result, error)
Parse extracts content from a URL
func (*Hermes) ParseHTMLWithContext ¶
func (h *Hermes) ParseHTMLWithContext(ctx context.Context, html string, targetURL string, opts *ParserOptions) (*Result, error)
ParseHTMLWithContext extracts content from provided HTML with context support
func (*Hermes) ParseWithContext ¶
func (h *Hermes) ParseWithContext(ctx context.Context, targetURL string, opts *ParserOptions) (*Result, error)
ParseWithContext extracts content from a URL with context support
func (*Hermes) ResetStats ¶
func (h *Hermes) ResetStats()
ResetStats is deprecated - no longer tracks statistics
func (*Hermes) ReturnResult ¶
ReturnResult is deprecated - no longer needed without object pooling
type ParseError ¶
type ParseError struct {
URL string `json:"url"` // URL being parsed when error occurred
Phase string `json:"phase"` // Parse phase: "fetch", "extract", "clean", etc.
Err error `json:"error"` // Underlying error
Timestamp time.Time `json:"timestamp"` // When the error occurred
Field string `json:"field,omitempty"` // Specific field being extracted (if applicable)
Selector string `json:"selector,omitempty"` // CSS selector being processed (if applicable)
Message string `json:"message,omitempty"` // Additional context message
}
ParseError represents an error that occurred during parsing
func ConvertError ¶
func ConvertError(err error) *ParseError
ConvertError converts any error to a ParseError for consistent error handling
func NewExtractionError ¶
func NewExtractionError(url string, field string, selector string, err error) *ParseError
NewExtractionError creates an error for content extraction issues
func NewFetchError ¶
func NewFetchError(url string, err error) *ParseError
NewFetchError creates an error for HTTP/network issues
func NewParseError ¶
func NewParseError(phase string, url string, err error) *ParseError
NewParseError creates a new ParseError with context
func NewTimeoutError ¶
func NewTimeoutError(url string, phase string, duration time.Duration) *ParseError
NewTimeoutError creates an error for timeout issues
func NewValidationError ¶
func NewValidationError(url string, message string, err error) *ParseError
NewValidationError creates an error for input validation issues
func (*ParseError) Error ¶
func (pe *ParseError) Error() string
Error implements the error interface
func (*ParseError) GetDomain ¶
func (pe *ParseError) GetDomain() string
GetDomain extracts the domain from the URL
func (*ParseError) Is ¶
func (pe *ParseError) Is(target error) bool
Is supports error checking with errors.Is()
func (*ParseError) IsExtractionError ¶
func (pe *ParseError) IsExtractionError() bool
IsExtractionError checks if the error is extraction-related
func (*ParseError) IsNetworkError ¶
func (pe *ParseError) IsNetworkError() bool
IsNetworkError checks if the error is network-related
func (*ParseError) IsTimeoutError ¶
func (pe *ParseError) IsTimeoutError() bool
IsTimeoutError checks if the error is timeout-related
func (*ParseError) IsValidationError ¶
func (pe *ParseError) IsValidationError() bool
IsValidationError checks if the error is validation-related
func (*ParseError) Unwrap ¶
func (pe *ParseError) Unwrap() error
Unwrap returns the underlying error for error unwrapping
func (*ParseError) WithField ¶
func (pe *ParseError) WithField(field string) *ParseError
WithField adds field context to an existing error
func (*ParseError) WithMessage ¶
func (pe *ParseError) WithMessage(message string) *ParseError
WithMessage adds additional context message
func (*ParseError) WithSelector ¶
func (pe *ParseError) WithSelector(selector string) *ParseError
WithSelector adds selector context to an existing error
type ParseErrorType ¶
type ParseErrorType string
ParseErrorType represents different categories of parse errors
const (
ErrorTypeFetch ParseErrorType = "fetch" // Network/HTTP errors
ErrorTypeExtract ParseErrorType = "extract" // Content extraction errors
ErrorTypeClean ParseErrorType = "clean" // Content cleaning errors
ErrorTypeValidate ParseErrorType = "validate" // Input validation errors
ErrorTypeTransform ParseErrorType = "transform" // Content transformation errors
ErrorTypeTimeout ParseErrorType = "timeout" // Timeout errors
ErrorTypeResource ParseErrorType = "resource" // Resource loading errors
)
type Parser ¶
type Parser interface {
Parse(url string, opts *ParserOptions) (*Result, error)
ParseHTML(html string, url string, opts *ParserOptions) (*Result, error)
}
Parser is the main interface for content extraction
type ParserOptions ¶
type ParserOptions struct {
FetchAllPages bool // Fetch and merge multi-page articles
Fallback bool // Use generic extractor as fallback
ContentType string // Output format: "html", "markdown", "text"
Headers map[string]string // Custom HTTP headers
CustomExtractor *CustomExtractor // Custom extraction rules
Extend map[string]ExtractorFunc // Extended fields
HTTPClient *http.Client // HTTP client to use for requests
AllowPrivateNetworks bool // Allow requests to private networks, bypassing SSRF protection (default: false)
}
ParserOptions configures the parser behavior
func DefaultParserOptions ¶
func DefaultParserOptions() *ParserOptions
DefaultParserOptions returns default parser options
type PoolStats ¶
type PoolStats struct {
// All fields are deprecated and return zero values
ResultsCreated int64
ResultsReused int64
BuffersCreated int64
BuffersReused int64
ParsersCreated int64
ParsersReused int64
LastReset time.Time
}
PoolStats is deprecated - kept for backward compatibility. Object pooling has been removed in favor of simplicity.
type Result ¶
type Result struct {
Title string `json:"title"`
Content string `json:"content"`
Author string `json:"author"`
DatePublished *time.Time `json:"date_published"`
LeadImageURL string `json:"lead_image_url"`
Dek string `json:"dek"`
NextPageURL string `json:"next_page_url"`
URL string `json:"url"`
Domain string `json:"domain"`
Excerpt string `json:"excerpt"`
WordCount int `json:"word_count"`
Direction string `json:"direction"`
TotalPages int `json:"total_pages"`
RenderedPages int `json:"rendered_pages"`
ExtractorUsed string `json:"extractor_used,omitempty"`
Extended map[string]interface{} `json:"extended,omitempty"`
// Site metadata fields
SiteName string `json:"site_name"`
SiteTitle string `json:"site_title"`
SiteImage string `json:"site_image"`
Favicon string `json:"favicon"`
Description string `json:"description"`
Language string `json:"language"`
ThemeColor string `json:"theme_color,omitempty"`
// Video metadata fields
VideoURL string `json:"video_url,omitempty"`
VideoMetadata map[string]interface{} `json:"video_metadata,omitempty"`
// Error handling fields for JS compatibility
Error bool `json:"error,omitempty"`
Message string `json:"message,omitempty"`
}
Result contains the extracted article data
func (*Result) FormatMarkdown ¶
FormatMarkdown formats the result as markdown with metadata header
type SelectorConfig ¶
type SelectorConfig struct {
// CSS selector string (e.g., "h1.title", ".article-body")
Selector string
// Optional attribute to extract (e.g., "content", "datetime", "href")
// If empty, extracts text content
Attribute string
// Optional index for multiple matches (0-based, -1 for all)
Index int
}
SelectorConfig represents a type-safe CSS selector configuration. Replaces []interface{} patterns with proper Go types for massive performance gains.
func FastAttributeSelector ¶
func FastAttributeSelector(selector, attribute string) SelectorConfig
FastAttributeSelector creates a selector with attribute extraction. Optimized for common meta tag patterns.
func FastStringSelector ¶
func FastStringSelector(selector string) SelectorConfig
FastStringSelector creates a simple string selector (the most common case). Optimized for performance with minimal allocations.
func NewSelectorConfig ¶
func NewSelectorConfig(selector interface{}) SelectorConfig
NewSelectorConfig creates a selector config from various input types. Handles the conversion from JavaScript-style patterns to Go types.
func (SelectorConfig) IsAttributeSelector ¶
func (sc SelectorConfig) IsAttributeSelector() bool
IsAttributeSelector returns true if this selector extracts an attribute
func (SelectorConfig) IsTextSelector ¶
func (sc SelectorConfig) IsTextSelector() bool
IsTextSelector returns true if this selector extracts text content
func (SelectorConfig) String ¶
func (sc SelectorConfig) String() string
String returns a human-readable representation
func (SelectorConfig) ToLegacyInterface ¶
func (sc SelectorConfig) ToLegacyInterface() interface{}
ToLegacyInterface converts back to []interface{} for compatibility. Used during the migration period when some code still expects the old format.
func (SelectorConfig) Validate ¶
func (sc SelectorConfig) Validate() error
Validate ensures the selector configuration is valid
type SelectorList ¶
type SelectorList []SelectorConfig
SelectorList is a type-safe replacement for []interface{} selector arrays
func NewSelectorList ¶
func NewSelectorList(selectors []interface{}) SelectorList
NewSelectorList converts []interface{} to a type-safe SelectorList. This is the main conversion function for migrating JavaScript patterns.
func (SelectorList) GetFirstSelector ¶
func (sl SelectorList) GetFirstSelector() SelectorConfig
GetFirstSelector returns the first selector or empty config if list is empty
func (SelectorList) HasMultipleSelectors ¶
func (sl SelectorList) HasMultipleSelectors() bool
HasMultipleSelectors returns true if any selector in the list could match multiple elements
func (SelectorList) ToLegacyInterfaceSlice ¶
func (sl SelectorList) ToLegacyInterfaceSlice() []interface{}
ToLegacyInterfaceSlice converts SelectorList to []interface{} for compatibility
type TagRenameTransform ¶
TagRenameTransform renames HTML tags (e.g., h1 -> h2)
func (*TagRenameTransform) Name ¶
func (t *TagRenameTransform) Name() string
type TextReplaceTransform ¶
TextReplaceTransform replaces text content using string replacement
func (*TextReplaceTransform) Name ¶
func (t *TextReplaceTransform) Name() string
type Transform ¶
type Transform interface {
// Transform applies the transformation to the given selection
Transform(selection *goquery.Selection) *goquery.Selection
// Name returns the name/type of this transform
Name() string
}
Transform defines the interface for content transformation operations. Replaces JavaScript transform callback functions with proper Go interfaces.
type TransformFunc ¶
TransformFunc modifies extracted elements
type TransformRegistry ¶
TransformRegistry holds type-safe transform implementations. Replaces map[string]interface{} with a properly typed registry.
func ConvertLegacyTransforms ¶
func ConvertLegacyTransforms(legacy map[string]interface{}) TransformRegistry
ConvertLegacyTransforms converts map[string]interface{} to a TransformRegistry. This enables gradual migration from JavaScript patterns to Go interfaces.
func NewTransformRegistry ¶
func NewTransformRegistry() TransformRegistry
NewTransformRegistry creates a registry with common transforms
func (TransformRegistry) GetTransform ¶
func (tr TransformRegistry) GetTransform(name string) (Transform, bool)
GetTransform retrieves a specific transform by name
func (TransformRegistry) GetTransformNames ¶
func (tr TransformRegistry) GetTransformNames() []string
GetTransformNames returns all transform names in the registry
func (TransformRegistry) HasTransform ¶
func (tr TransformRegistry) HasTransform(name string) bool
HasTransform checks if a transform exists in the registry