parser

package
v1.0.6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 31, 2025 | License: MIT | Imports: 18 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ApplyTransforms

func ApplyTransforms(selection *goquery.Selection, transforms TransformRegistry) *goquery.Selection

ApplyTransforms applies all transforms in the registry to a selection

func ClassifyErrorCode

func ClassifyErrorCode(err error, ctx context.Context, op string) int

ClassifyErrorCode determines the appropriate error code based on the error type and context. This replaces string-based error classification with proper type checking. It returns an int that corresponds to the public ErrorCode values.

func WrapError

func WrapError(err error, phase string, url string) error

WrapError wraps a regular error as a ParseError if it isn't already one

Types

type AttributeRemoveTransform

type AttributeRemoveTransform struct {
	AttributeName string
}

AttributeRemoveTransform removes an attribute

func (*AttributeRemoveTransform) Name

func (t *AttributeRemoveTransform) Name() string

func (*AttributeRemoveTransform) Transform

func (t *AttributeRemoveTransform) Transform(selection *goquery.Selection) *goquery.Selection

type AttributeSetTransform

type AttributeSetTransform struct {
	AttributeName  string
	AttributeValue string
}

AttributeSetTransform sets an attribute value

func (*AttributeSetTransform) Name

func (t *AttributeSetTransform) Name() string

func (*AttributeSetTransform) Transform

func (t *AttributeSetTransform) Transform(selection *goquery.Selection) *goquery.Selection

type ClassAddTransform

type ClassAddTransform struct {
	ClassName string
}

ClassAddTransform adds a CSS class to elements

func (*ClassAddTransform) Name

func (t *ClassAddTransform) Name() string

func (*ClassAddTransform) Transform

func (t *ClassAddTransform) Transform(selection *goquery.Selection) *goquery.Selection

type ClassRemoveTransform

type ClassRemoveTransform struct {
	ClassName string
}

ClassRemoveTransform removes a CSS class from elements

func (*ClassRemoveTransform) Name

func (t *ClassRemoveTransform) Name() string

func (*ClassRemoveTransform) Transform

func (t *ClassRemoveTransform) Transform(selection *goquery.Selection) *goquery.Selection

type ContentExtractor

type ContentExtractor struct {
	FieldExtractor
	Clean      []string                 // Selectors to remove
	Transforms map[string]TransformFunc // Element transformations
}

ContentExtractor extends FieldExtractor with cleaning options

type CustomExtractor

type CustomExtractor struct {
	Domain        string
	Title         FieldExtractor
	Author        FieldExtractor
	Content       ContentExtractor
	DatePublished FieldExtractor
	LeadImageURL  FieldExtractor
	Dek           FieldExtractor
	NextPageURL   FieldExtractor
	Excerpt       FieldExtractor
	Extend        map[string]FieldExtractor
}

CustomExtractor defines site-specific extraction rules

type CustomFunctionTransform

type CustomFunctionTransform struct {
	TransformName     string
	TransformFunction func(*goquery.Selection) *goquery.Selection
}

CustomFunctionTransform wraps a custom function for compatibility. It is used during the migration period while some transforms are still functions.

func (*CustomFunctionTransform) Name

func (t *CustomFunctionTransform) Name() string

func (*CustomFunctionTransform) Transform

func (t *CustomFunctionTransform) Transform(selection *goquery.Selection) *goquery.Selection

type ErrorCollection

type ErrorCollection struct {
	Errors []*ParseError `json:"errors"`
}

ErrorCollection holds multiple parse errors

func (*ErrorCollection) Add

func (ec *ErrorCollection) Add(err *ParseError)

Add adds a new error to the collection

func (*ErrorCollection) Clear

func (ec *ErrorCollection) Clear()

Clear removes all errors from the collection

func (*ErrorCollection) Count

func (ec *ErrorCollection) Count() int

Count returns the number of errors

func (*ErrorCollection) Error

func (ec *ErrorCollection) Error() string

Error implements the error interface for ErrorCollection

func (*ErrorCollection) First

func (ec *ErrorCollection) First() *ParseError

First returns the first error or nil if no errors

func (*ErrorCollection) GetByPhase

func (ec *ErrorCollection) GetByPhase(phase string) []*ParseError

GetByPhase returns all errors from a specific phase

func (*ErrorCollection) GetByURL

func (ec *ErrorCollection) GetByURL(url string) []*ParseError

GetByURL returns all errors from a specific URL

func (*ErrorCollection) HasErrors

func (ec *ErrorCollection) HasErrors() bool

HasErrors returns true if there are any errors

func (*ErrorCollection) HasPhaseErrors

func (ec *ErrorCollection) HasPhaseErrors(phase string) bool

HasPhaseErrors checks if there are errors in a specific phase

func (*ErrorCollection) Last

func (ec *ErrorCollection) Last() *ParseError

Last returns the last error or nil if no errors

type Extractor

type Extractor interface {
	Extract(doc *goquery.Document, url string, opts *ExtractorOptions) (*Result, error)
	GetDomain() string
}

Extractor defines the interface for content extractors

type ExtractorFunc

type ExtractorFunc func(*goquery.Document, string) (interface{}, error)

ExtractorFunc is a custom extraction function

type ExtractorOptions

type ExtractorOptions struct {
	URL         string
	HTML        string
	MetaCache   map[string]string
	Fallback    bool
	ContentType string
}

ExtractorOptions configures individual extractors

func DefaultExtractorOptions

func DefaultExtractorOptions() *ExtractorOptions

DefaultExtractorOptions returns default extractor options

type FieldExtractor

type FieldExtractor struct {
	Selectors       SelectorList  // Type-safe CSS selectors (replaces []interface{})
	SelectorsLegacy []interface{} `json:"selectors,omitempty"` // Deprecated: use Selectors instead
	AllowMultiple   bool
	DefaultCleaner  bool
}

FieldExtractor defines extraction rules for a specific field

type Hermes

type Hermes struct {
	// contains filtered or unexported fields
}

Hermes (formerly Mercury) is the main parser implementation

func New

func New(opts ...*ParserOptions) *Hermes

New creates a new Hermes parser instance

func NewParser

func NewParser() *Hermes

NewParser creates a new parser instance (convenience function)

func (*Hermes) GetStats

func (h *Hermes) GetStats() *PoolStats

GetStats is deprecated - no longer tracks statistics

func (*Hermes) Parse

func (h *Hermes) Parse(targetURL string, opts *ParserOptions) (*Result, error)

Parse extracts content from a URL

func (*Hermes) ParseHTML

func (h *Hermes) ParseHTML(html string, targetURL string, opts *ParserOptions) (*Result, error)

ParseHTML extracts content from provided HTML

func (*Hermes) ParseHTMLWithContext

func (h *Hermes) ParseHTMLWithContext(ctx context.Context, html string, targetURL string, opts *ParserOptions) (*Result, error)

ParseHTMLWithContext extracts content from provided HTML with context support

func (*Hermes) ParseWithContext

func (h *Hermes) ParseWithContext(ctx context.Context, targetURL string, opts *ParserOptions) (*Result, error)

ParseWithContext extracts content from a URL with context support

func (*Hermes) ResetStats

func (h *Hermes) ResetStats()

ResetStats is deprecated - no longer tracks statistics

func (*Hermes) ReturnResult

func (h *Hermes) ReturnResult(result *Result)

ReturnResult is deprecated - no longer needed without object pooling

type ParseError

type ParseError struct {
	URL       string    `json:"url"`                // URL being parsed when error occurred
	Phase     string    `json:"phase"`              // Parse phase: "fetch", "extract", "clean", etc.
	Err       error     `json:"error"`              // Underlying error
	Timestamp time.Time `json:"timestamp"`          // When the error occurred
	Field     string    `json:"field,omitempty"`    // Specific field being extracted (if applicable)
	Selector  string    `json:"selector,omitempty"` // CSS selector being processed (if applicable)
	Message   string    `json:"message,omitempty"`  // Additional context message
}

ParseError represents an error that occurred during parsing

func ConvertError

func ConvertError(err error) *ParseError

ConvertError converts any error to a ParseError for consistent error handling

func NewExtractionError

func NewExtractionError(url string, field string, selector string, err error) *ParseError

NewExtractionError creates an error for content extraction issues

func NewFetchError

func NewFetchError(url string, err error) *ParseError

NewFetchError creates an error for HTTP/network issues

func NewParseError

func NewParseError(phase string, url string, err error) *ParseError

NewParseError creates a new ParseError with context

func NewTimeoutError

func NewTimeoutError(url string, phase string, duration time.Duration) *ParseError

NewTimeoutError creates an error for timeout issues

func NewValidationError

func NewValidationError(url string, message string, err error) *ParseError

NewValidationError creates an error for input validation issues

func (*ParseError) Error

func (pe *ParseError) Error() string

Error implements the error interface

func (*ParseError) GetDomain

func (pe *ParseError) GetDomain() string

GetDomain extracts the domain from the URL

func (*ParseError) Is

func (pe *ParseError) Is(target error) bool

Is supports error checking with errors.Is()

func (*ParseError) IsExtractionError

func (pe *ParseError) IsExtractionError() bool

IsExtractionError checks if the error is extraction-related

func (*ParseError) IsNetworkError

func (pe *ParseError) IsNetworkError() bool

IsNetworkError checks if the error is network-related

func (*ParseError) IsTimeoutError

func (pe *ParseError) IsTimeoutError() bool

IsTimeoutError checks if the error is timeout-related

func (*ParseError) IsValidationError

func (pe *ParseError) IsValidationError() bool

IsValidationError checks if the error is validation-related

func (*ParseError) Unwrap

func (pe *ParseError) Unwrap() error

Unwrap returns the underlying error for error unwrapping

func (*ParseError) WithField

func (pe *ParseError) WithField(field string) *ParseError

WithField adds field context to an existing error

func (*ParseError) WithMessage

func (pe *ParseError) WithMessage(message string) *ParseError

WithMessage adds additional context message

func (*ParseError) WithSelector

func (pe *ParseError) WithSelector(selector string) *ParseError

WithSelector adds selector context to an existing error

type ParseErrorType

type ParseErrorType string

ParseErrorType represents different categories of parse errors

const (
	ErrorTypeFetch     ParseErrorType = "fetch"     // Network/HTTP errors
	ErrorTypeExtract   ParseErrorType = "extract"   // Content extraction errors
	ErrorTypeClean     ParseErrorType = "clean"     // Content cleaning errors
	ErrorTypeValidate  ParseErrorType = "validate"  // Input validation errors
	ErrorTypeTransform ParseErrorType = "transform" // Content transformation errors
	ErrorTypeTimeout   ParseErrorType = "timeout"   // Timeout errors
	ErrorTypeResource  ParseErrorType = "resource"  // Resource loading errors
)

type Parser

type Parser interface {
	Parse(url string, opts *ParserOptions) (*Result, error)
	ParseHTML(html string, url string, opts *ParserOptions) (*Result, error)
}

Parser is the main interface for content extraction

type ParserOptions

type ParserOptions struct {
	FetchAllPages        bool                     // Fetch and merge multi-page articles
	Fallback             bool                     // Use generic extractor as fallback
	ContentType          string                   // Output format: "html", "markdown", "text"
	Headers              map[string]string        // Custom HTTP headers
	CustomExtractor      *CustomExtractor         // Custom extraction rules
	Extend               map[string]ExtractorFunc // Extended fields
	HTTPClient           *http.Client             // HTTP client to use for requests
	AllowPrivateNetworks bool                     // Allow requests to private networks (SSRF risk; default: false)
}

ParserOptions configures the parser behavior

func DefaultParserOptions

func DefaultParserOptions() *ParserOptions

DefaultParserOptions returns default parser options

type PoolStats

type PoolStats struct {
	// All fields are deprecated and return zero values
	ResultsCreated int64
	ResultsReused  int64
	BuffersCreated int64
	BuffersReused  int64
	ParsersCreated int64
	ParsersReused  int64
	LastReset      time.Time
}

PoolStats is deprecated and kept only for backward compatibility. Object pooling has been removed in favor of simplicity.

type Result

type Result struct {
	Title         string                 `json:"title"`
	Content       string                 `json:"content"`
	Author        string                 `json:"author"`
	DatePublished *time.Time             `json:"date_published"`
	LeadImageURL  string                 `json:"lead_image_url"`
	Dek           string                 `json:"dek"`
	NextPageURL   string                 `json:"next_page_url"`
	URL           string                 `json:"url"`
	Domain        string                 `json:"domain"`
	Excerpt       string                 `json:"excerpt"`
	WordCount     int                    `json:"word_count"`
	Direction     string                 `json:"direction"`
	TotalPages    int                    `json:"total_pages"`
	RenderedPages int                    `json:"rendered_pages"`
	ExtractorUsed string                 `json:"extractor_used,omitempty"`
	Extended      map[string]interface{} `json:"extended,omitempty"`

	// Site metadata fields
	SiteName    string `json:"site_name"`
	SiteTitle   string `json:"site_title"`
	SiteImage   string `json:"site_image"`
	Favicon     string `json:"favicon"`
	Description string `json:"description"`
	Language    string `json:"language"`
	ThemeColor  string `json:"theme_color,omitempty"`

	// Video metadata fields
	VideoURL      string                 `json:"video_url,omitempty"`
	VideoMetadata map[string]interface{} `json:"video_metadata,omitempty"`

	// Error handling fields for JS compatibility
	Error   bool   `json:"error,omitempty"`
	Message string `json:"message,omitempty"`
}

Result contains the extracted article data

func (*Result) FormatMarkdown

func (r *Result) FormatMarkdown() string

FormatMarkdown formats the result as markdown with metadata header

func (*Result) IsError

func (r *Result) IsError() bool

IsError checks if result contains an error

func (*Result) SetError

func (r *Result) SetError(message string)

SetError sets error state for JS compatibility

type SelectorConfig

type SelectorConfig struct {
	// CSS selector string (e.g., "h1.title", ".article-body")
	Selector string

	// Optional attribute to extract (e.g., "content", "datetime", "href")
	// If empty, extracts text content
	Attribute string

	// Optional index for multiple matches (0-based, -1 for all)
	Index int
}

SelectorConfig represents a type-safe CSS selector configuration. It replaces []interface{} patterns with proper Go types for massive performance gains.

func FastAttributeSelector

func FastAttributeSelector(selector, attribute string) SelectorConfig

FastAttributeSelector creates a selector with attribute extraction. It is optimized for common meta tag patterns.

func FastStringSelector

func FastStringSelector(selector string) SelectorConfig

FastStringSelector creates a simple string selector (the most common case). It is optimized for performance with minimal allocations.

func NewSelectorConfig

func NewSelectorConfig(selector interface{}) SelectorConfig

NewSelectorConfig creates a selector config from various input types. It handles the conversion from JavaScript-style patterns to Go types.

func (SelectorConfig) IsAttributeSelector

func (sc SelectorConfig) IsAttributeSelector() bool

IsAttributeSelector returns true if this selector extracts an attribute

func (SelectorConfig) IsTextSelector

func (sc SelectorConfig) IsTextSelector() bool

IsTextSelector returns true if this selector extracts text content

func (SelectorConfig) String

func (sc SelectorConfig) String() string

String returns a human-readable representation

func (SelectorConfig) ToLegacyInterface

func (sc SelectorConfig) ToLegacyInterface() interface{}

ToLegacyInterface converts back to []interface{} for compatibility. It is used during the migration period while some code still expects the old format.

func (SelectorConfig) Validate

func (sc SelectorConfig) Validate() error

Validate ensures the selector configuration is valid

type SelectorList

type SelectorList []SelectorConfig

SelectorList is a type-safe replacement for []interface{} selector arrays

func NewSelectorList

func NewSelectorList(selectors []interface{}) SelectorList

NewSelectorList converts []interface{} to a type-safe SelectorList. This is the main conversion function for migrating JavaScript patterns.

func (SelectorList) GetFirstSelector

func (sl SelectorList) GetFirstSelector() SelectorConfig

GetFirstSelector returns the first selector or empty config if list is empty

func (SelectorList) HasMultipleSelectors

func (sl SelectorList) HasMultipleSelectors() bool

HasMultipleSelectors returns true if any selector in the list could match multiple elements

func (SelectorList) ToLegacyInterfaceSlice

func (sl SelectorList) ToLegacyInterfaceSlice() []interface{}

ToLegacyInterfaceSlice converts SelectorList to []interface{} for compatibility

type TagRenameTransform

type TagRenameTransform struct {
	OriginalTag string
	NewTag      string
}

TagRenameTransform renames HTML tags (e.g., h1 -> h2)

func (*TagRenameTransform) Name

func (t *TagRenameTransform) Name() string

func (*TagRenameTransform) Transform

func (t *TagRenameTransform) Transform(selection *goquery.Selection) *goquery.Selection

type TextReplaceTransform

type TextReplaceTransform struct {
	OldText string
	NewText string
}

TextReplaceTransform replaces text content using string replacement

func (*TextReplaceTransform) Name

func (t *TextReplaceTransform) Name() string

func (*TextReplaceTransform) Transform

func (t *TextReplaceTransform) Transform(selection *goquery.Selection) *goquery.Selection

type Transform

type Transform interface {
	// Transform applies the transformation to the given selection
	Transform(selection *goquery.Selection) *goquery.Selection

	// Name returns the name/type of this transform
	Name() string
}

Transform defines the interface for content transformation operations. It replaces JavaScript transform callback functions with proper Go interfaces.

type TransformFunc

type TransformFunc func(*goquery.Selection) string

TransformFunc modifies extracted elements

type TransformRegistry

type TransformRegistry map[string]Transform

TransformRegistry holds type-safe transform implementations. It replaces map[string]interface{} with a properly typed registry.

func ConvertLegacyTransforms

func ConvertLegacyTransforms(legacy map[string]interface{}) TransformRegistry

ConvertLegacyTransforms converts map[string]interface{} to a TransformRegistry. This enables gradual migration from JavaScript patterns to Go interfaces.

func NewTransformRegistry

func NewTransformRegistry() TransformRegistry

NewTransformRegistry creates a registry with common transforms

func (TransformRegistry) GetTransform

func (tr TransformRegistry) GetTransform(name string) (Transform, bool)

GetTransform retrieves a specific transform by name

func (TransformRegistry) GetTransformNames

func (tr TransformRegistry) GetTransformNames() []string

GetTransformNames returns all transform names in the registry

func (TransformRegistry) HasTransform

func (tr TransformRegistry) HasTransform(name string) bool

HasTransform checks if a transform exists in the registry

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL