Documentation ¶
Index ¶
- func CreateArticleFromURL(url, dir string) (*models.Article, error)
- type ArticleParser
- func (p *ArticleParser) AddRule(domain string, rule *ParsingRule)
- func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error)
- func (p *ArticleParser) GetSupportedDomains() []string
- func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error)
- func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error)
- func (p *ArticleParser) SaveArticle(content *ParsedContent, dir string) (markdownPath, htmlPath string, err error)
- func (p *ArticleParser) SetHTTPClient(client *http.Client)
- type ContentScore
- type ExtractionResult
- type HeuristicExtractor
- type MetadataExtractor
- func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string
- func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string
- func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult
- func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string
- func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string
- func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string
- type ParsedContent
- type Parser
- type ParsingRule
- type Scorer
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CreateArticleFromURL ¶
func CreateArticleFromURL(url, dir string) (*models.Article, error)
CreateArticleFromURL is a convenience function that parses a URL and creates an instance of models.Article.
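A minimal usage sketch (not an official package example; the import path example.com/article/parser, the URL, and the target directory are assumptions for illustration):

package main

import (
	"fmt"
	"log"

	"example.com/article/parser" // assumed import path
)

func main() {
	// Fetch, parse, and persist an article under ./articles (URL is illustrative).
	article, err := parser.CreateArticleFromURL("https://example.com/some-post", "./articles")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("created article: %+v\n", article)
}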
Types ¶
type ArticleParser ¶
type ArticleParser struct {
// contains filtered or unexported fields
}
ArticleParser implements the Parser interface
func NewArticleParser ¶
func NewArticleParser(client *http.Client) (*ArticleParser, error)
NewArticleParser creates a new ArticleParser with the specified HTTP client and loaded rules
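A short sketch of constructing a parser with a timeout-bound client (the import path is assumed and the timeout value is arbitrary):

package main

import (
	"log"
	"net/http"
	"time"

	"example.com/article/parser" // assumed import path
)

func main() {
	// A client with a timeout keeps slow sites from blocking fetches indefinitely.
	client := &http.Client{Timeout: 15 * time.Second}

	p, err := parser.NewArticleParser(client)
	if err != nil {
		log.Fatal(err)
	}
	_ = p // use p.ParseURL, p.Convert, p.SaveArticle, etc.
}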
func (*ArticleParser) AddRule ¶
func (p *ArticleParser) AddRule(domain string, rule *ParsingRule)
AddRule adds or replaces a parsing rule for a specific domain
func (*ArticleParser) Convert ¶
func (p *ArticleParser) Convert(htmlContent, domain, sourceURL string) (string, error)
Convert converts HTML content directly to markdown using domain-specific rules.
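A sketch of converting raw HTML to markdown (the import path, HTML snippet, domain, and source URL are all illustrative):

package main

import (
	"fmt"
	"log"
	"net/http"

	"example.com/article/parser" // assumed import path
)

func main() {
	p, err := parser.NewArticleParser(http.DefaultClient)
	if err != nil {
		log.Fatal(err)
	}

	htmlContent := `<html><body><article><h1>Hello</h1><p>Some body text.</p></article></body></html>`
	md, err := p.Convert(htmlContent, "example.com", "https://example.com/hello")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(md)
}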
func (*ArticleParser) GetSupportedDomains ¶
func (p *ArticleParser) GetSupportedDomains() []string
GetSupportedDomains returns a list of domains that have parsing rules
func (*ArticleParser) Parse ¶
func (p *ArticleParser) Parse(htmlContent, domain, sourceURL string) (*ParsedContent, error)
Parse extracts article content from an HTML string using domain-specific rules, with a heuristic fallback. It implements dual validation: when rules exist, XPath results are compared with heuristic extraction.
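A sketch of parsing an HTML string and inspecting the extraction metadata on the result (the import path and the inputs are illustrative):

package main

import (
	"fmt"
	"log"
	"net/http"

	"example.com/article/parser" // assumed import path
)

func main() {
	p, err := parser.NewArticleParser(http.DefaultClient)
	if err != nil {
		log.Fatal(err)
	}

	htmlContent := `<html><body><article><h1>Hello</h1><p>Some body text.</p></article></body></html>`
	content, err := p.Parse(htmlContent, "example.com", "https://example.com/hello")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(content.Title)
	fmt.Printf("method=%s confidence=%.2f\n", content.ExtractionMethod, content.Confidence)
}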
func (*ArticleParser) ParseURL ¶
func (p *ArticleParser) ParseURL(s string) (*ParsedContent, error)
ParseURL extracts article content from a given URL
func (*ArticleParser) SaveArticle ¶
func (p *ArticleParser) SaveArticle(content *ParsedContent, dir string) (markdownPath, htmlPath string, err error)
SaveArticle saves the parsed content to filesystem and returns file paths
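A sketch of the fetch-then-save flow combining ParseURL and SaveArticle (the import path, URL, and directory are assumptions):

package main

import (
	"fmt"
	"log"
	"net/http"

	"example.com/article/parser" // assumed import path
)

func main() {
	p, err := parser.NewArticleParser(http.DefaultClient)
	if err != nil {
		log.Fatal(err)
	}

	content, err := p.ParseURL("https://example.com/some-post") // illustrative URL
	if err != nil {
		log.Fatal(err)
	}

	mdPath, htmlPath, err := p.SaveArticle(content, "./articles")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("markdown:", mdPath)
	fmt.Println("html:", htmlPath)
}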
func (*ArticleParser) SetHTTPClient ¶
func (p *ArticleParser) SetHTTPClient(client *http.Client)
SetHTTPClient overrides the HTTP client used for fetching article content.
type ContentScore ¶
type ContentScore struct {
Node *html.Node
Score float64
TextLength int
LinkDensity float64
ParagraphCount int
AncestorDepth int
ConfidenceLevel float64
}
ContentScore represents the score and metadata for a content node.
type ExtractionResult ¶
type ExtractionResult struct {
Content string
Title string
Author string
PublishedDate string
SiteName string
Language string
Confidence float64
ExtractionMethod string // "heuristic" or "xpath" or "dual"
}
ExtractionResult contains the results of heuristic content extraction.
type HeuristicExtractor ¶
type HeuristicExtractor struct {
// contains filtered or unexported fields
}
HeuristicExtractor implements Readability-style content extraction.
func NewHeuristicExtractor ¶
func NewHeuristicExtractor() *HeuristicExtractor
NewHeuristicExtractor creates a new extractor with default scoring.
func (*HeuristicExtractor) CompareWithXPath ¶
func (e *HeuristicExtractor) CompareWithXPath(doc *html.Node, xpathNode *html.Node) *ExtractionResult
CompareWithXPath compares heuristic extraction with XPath-based extraction.
func (*HeuristicExtractor) ExtractContent ¶
func (e *HeuristicExtractor) ExtractContent(doc *html.Node) *ExtractionResult
ExtractContent performs heuristic-based content extraction from an HTML document.
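A sketch of running heuristic extraction over a document parsed with golang.org/x/net/html (the import path and the sample HTML are assumptions):

package main

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/net/html"

	"example.com/article/parser" // assumed import path
)

func main() {
	page := `<html><body>
		<nav>Home | About</nav>
		<article><p>A reasonably long paragraph of real article text goes here.</p></article>
	</body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}

	e := parser.NewHeuristicExtractor()
	res := e.ExtractContent(doc)
	fmt.Printf("method=%s confidence=%.2f\n", res.ExtractionMethod, res.Confidence)
	fmt.Println(res.Content)
}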
func (*HeuristicExtractor) ExtractWithSemanticHTML ¶
func (e *HeuristicExtractor) ExtractWithSemanticHTML(doc *html.Node) *ExtractionResult
ExtractWithSemanticHTML attempts extraction using semantic HTML5 elements first. Falls back to heuristic scoring if semantic elements aren't found.
type MetadataExtractor ¶
type MetadataExtractor struct{}
MetadataExtractor implements multi-strategy metadata extraction from HTML documents. It attempts to extract article metadata using OpenGraph, Schema.org, meta tags, and semantic HTML5 elements, with fallback chains for each field.
func NewMetadataExtractor ¶
func NewMetadataExtractor() *MetadataExtractor
NewMetadataExtractor creates a new metadata extractor.
func (*MetadataExtractor) ExtractAuthor ¶
func (m *MetadataExtractor) ExtractAuthor(doc *html.Node) string
ExtractAuthor extracts the article author using multiple strategies. Tries in order: OpenGraph, Schema.org, meta tags, rel=author, byline elements.
func (*MetadataExtractor) ExtractLanguage ¶
func (m *MetadataExtractor) ExtractLanguage(doc *html.Node) string
ExtractLanguage extracts the document language. Tries in order: html lang attribute, OpenGraph, meta tags.
func (*MetadataExtractor) ExtractMetadata ¶
func (m *MetadataExtractor) ExtractMetadata(doc *html.Node) *ExtractionResult
ExtractMetadata extracts all available metadata from an HTML document. Returns an ExtractionResult with populated metadata fields.
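A sketch of extracting metadata from a document carrying OpenGraph tags (the import path and sample markup are illustrative):

package main

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/net/html"

	"example.com/article/parser" // assumed import path
)

func main() {
	page := `<html lang="en"><head>
		<meta property="og:title" content="An OpenGraph Title">
		<meta property="og:site_name" content="Example Site">
		<meta property="article:published_time" content="2024-01-02T15:04:05Z">
	</head><body><h1>Fallback Heading</h1></body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}

	m := parser.NewMetadataExtractor()
	meta := m.ExtractMetadata(doc)
	fmt.Println(meta.Title, meta.SiteName, meta.PublishedDate, meta.Language)
}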
func (*MetadataExtractor) ExtractPublishedDate ¶
func (m *MetadataExtractor) ExtractPublishedDate(doc *html.Node) string
ExtractPublishedDate extracts the publication date using multiple strategies. Tries in order: OpenGraph, Schema.org, article:published_time, time elements.
func (*MetadataExtractor) ExtractSiteName ¶
func (m *MetadataExtractor) ExtractSiteName(doc *html.Node) string
ExtractSiteName extracts the site name using multiple strategies. Tries in order: OpenGraph, Schema.org, meta tags.
func (*MetadataExtractor) ExtractTitle ¶
func (m *MetadataExtractor) ExtractTitle(doc *html.Node) string
ExtractTitle extracts the article title using multiple strategies. Tries in order: OpenGraph, Schema.org, meta tags, h1, title tag.
type ParsedContent ¶
type ParsedContent struct {
Title string
Author string
Date string
Content string
URL string
Confidence float64 // 0-1 scale, confidence in extraction quality
ExtractionMethod string // "xpath", "heuristic", "dual-validated", etc.
}
ParsedContent represents the extracted content from a web page
type Parser ¶
type Parser interface {
// ParseURL extracts article content from a given URL
ParseURL(url string) (*ParsedContent, error)
// Convert HTML content directly to markdown using domain-specific rules
Convert(htmlContent, domain, sourceURL string) (string, error)
// GetSupportedDomains returns a list of domains that have parsing rules
GetSupportedDomains() []string
// SaveArticle saves the parsed content to filesystem and returns file paths
SaveArticle(content *ParsedContent, storageDir string) (markdownPath, htmlPath string, err error)
}
Parser interface defines methods for parsing articles from URLs
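A sketch of code written against the Parser interface rather than the concrete *ArticleParser, which makes it straightforward to substitute a fake in tests (the package name and import path are assumed):

package archive

import "example.com/article/parser" // assumed import path

// Archive fetches an article through any Parser implementation and
// saves it under dir, returning the markdown path.
func Archive(p parser.Parser, url, dir string) (string, error) {
	content, err := p.ParseURL(url)
	if err != nil {
		return "", err
	}
	mdPath, _, err := p.SaveArticle(content, dir)
	if err != nil {
		return "", err
	}
	return mdPath, nil
}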
type ParsingRule ¶
type ParsingRule struct {
Domain string
Title string
Author string
Date string
Body string
Strip []string // XPath selectors for elements to remove
StripIDsOrClasses []string
TestURLs []string
Headers map[string]string
Prune bool
Tidy bool
}
ParsingRule represents XPath rules for extracting content from a specific domain
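A sketch of registering a custom rule via AddRule (the XPath expressions, header values, and import path are illustrative, not shipped defaults):

package main

import (
	"log"
	"net/http"

	"example.com/article/parser" // assumed import path
)

func main() {
	p, err := parser.NewArticleParser(http.DefaultClient)
	if err != nil {
		log.Fatal(err)
	}

	// Hypothetical rule for example.com; adjust the XPath to the target site's markup.
	p.AddRule("example.com", &parser.ParsingRule{
		Domain:  "example.com",
		Title:   "//h1[@class='headline']",
		Author:  "//span[contains(@class,'byline')]",
		Date:    "//time/@datetime",
		Body:    "//article",
		Strip:   []string{"//aside", "//nav"},
		Headers: map[string]string{"User-Agent": "my-reader/1.0"},
		Tidy:    true,
	})
}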
type Scorer ¶
type Scorer struct {
// contains filtered or unexported fields
}
Scorer implements Readability-style heuristic scoring for content extraction.
func NewScorer ¶
func NewScorer() *Scorer
NewScorer creates a new Scorer with default Readability.js-inspired weights.
func (*Scorer) FindTopCandidates ¶
func (s *Scorer) FindTopCandidates(root *html.Node, n int) []*ContentScore
FindTopCandidates identifies the N highest-scoring content candidates.
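A sketch of scoring candidate nodes in a parsed document (the import path and sample HTML are assumptions):

package main

import (
	"fmt"
	"log"
	"strings"

	"golang.org/x/net/html"

	"example.com/article/parser" // assumed import path
)

func main() {
	page := `<html><body>
		<div id="sidebar"><a href="/a">link</a><a href="/b">link</a></div>
		<div id="main"><p>Long article text with several sentences of real content.</p>
		<p>Another paragraph of content to score.</p></div>
	</body></html>`

	doc, err := html.Parse(strings.NewReader(page))
	if err != nil {
		log.Fatal(err)
	}

	s := parser.NewScorer()
	for _, c := range s.FindTopCandidates(doc, 3) {
		fmt.Printf("score=%.1f textLen=%d linkDensity=%.2f paragraphs=%d\n",
			c.Score, c.TextLength, c.LinkDensity, c.ParagraphCount)
	}
}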
func (*Scorer) IsProbablyReadable ¶
IsProbablyReadable determines if a document is likely to have extractable content. This is inspired by Readability.js's isProbablyReaderable function.
func (*Scorer) ScoreAncestors ¶
func (s *Scorer) ScoreAncestors(scores map[*html.Node]*ContentScore, node *html.Node, baseScore float64)
ScoreAncestors propagates scores up the DOM tree with decay. This implements the Readability algorithm's ancestor scoring.