Documentation
¶
Index ¶
- Constants
- Variables
- func CleanContent(article *goquery.Selection, opts CleanContentOptions) *goquery.Selection
- func DirectionExtractor(params ExtractorParams) (string, error)
- func ExtractBestNode(doc *goquery.Document, opts ExtractBestNodeOptions) *goquery.Selection
- func GetDirection(input interface{}) (string, error)
- func NodeIsSufficient(node *goquery.Selection) bool
- type CleanContentOptions
- type ExtractBestNodeOptions
- type ExtractionOptions
- type ExtractionResult
- type ExtractorImageParams
- type ExtractorOptions
- type ExtractorParams
- type GenericAuthorExtractor
- type GenericContentExtractor
- func (e *GenericContentExtractor) CleanAndReturnNode(node *goquery.Selection, doc *goquery.Document) string
- func (e *GenericContentExtractor) Extract(params ExtractorParams, opts ExtractorOptions) string
- func (e *GenericContentExtractor) GetContentNode(doc *goquery.Document, title, url string, opts ExtractorOptions) *goquery.Selection
- type GenericDateExtractorType
- type GenericDekExtractor
- type GenericDescriptionExtractor
- type GenericExcerptExtractor
- type GenericExtractor
- type GenericFaviconExtractor
- type GenericLanguageExtractor
- type GenericLeadImageExtractor
- type GenericNextPageUrlExtractor
- type GenericSiteImageExtractor
- type GenericSiteNameExtractor
- type GenericSiteTitleExtractor
- type GenericThemeColorExtractor
- type GenericVideoExtractor
- type RTLScriptRange
- type URLResult
- type VideoMetadata
Constants ¶
const ( LTRMark = "\u200e" // Left-to-right mark RTLMark = "\u200f" // Right-to-left mark LTR = "ltr" // Left to right direction content RTL = "rtl" // Right to left direction content BIDI = "bidi" // Both directions - bidirectional content NODI = "" // No direction - empty string for no detectable direction )
Direction constants matching JavaScript string-direction library exactly
const AUTHOR_MAX_LENGTH = 300
AUTHOR_MAX_LENGTH - maximum length for valid author names
Variables ¶
var ( MS_DATE_STRING = regexp.MustCompile(`^\d{13}$`) SEC_DATE_STRING = regexp.MustCompile(`^\d{10}$`) CLEAN_DATE_STRING_RE = regexp.MustCompile(`^\s*published\s*:?\s*(.*)`) TIME_MERIDIAN_SPACE_RE = regexp.MustCompile(`(.*\d)(am|pm)(.*)`) TIME_MERIDIAN_DOTS_RE = regexp.MustCompile(`\.m\.`) TIME_NOW_STRING = regexp.MustCompile(`^\s*(just|right)?\s*now\s*`) TIME_WITH_OFFSET_RE = regexp.MustCompile(`-\d{3,4}$`) )
JavaScript date cleaner constants (ported from cleaners/constants.js)
var ( POSITIVE_LEAD_IMAGE_URL_HINTS_RE = regexp.MustCompile("(?i)" + strings.Join(POSITIVE_LEAD_IMAGE_URL_HINTS, "|")) NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = regexp.MustCompile("(?i)" + strings.Join(NEGATIVE_LEAD_IMAGE_URL_HINTS, "|")) GIF_RE = regexp.MustCompile(`(?i)\.gif(\?.*)?$`) JPG_RE = regexp.MustCompile(`(?i)\.jpe?g(\?.*)?$`) PHOTO_HINTS_RE = regexp.MustCompile(`(?i)figure|photo|image|caption`) // From constants.go )
Compiled regexes for URL scoring
var ( // DIGIT_RE matches any digit character DIGIT_RE = regexp.MustCompile(`\d`) // EXTRANEOUS_LINK_HINTS are words that indicate a link is probably not a next page EXTRANEOUS_LINK_HINTS = []string{ "print", "archive", "comment", "discuss", "e-mail", "email", "share", "reply", "all", "login", "sign", "single", "adx", "entry-unrelated", } EXTRANEOUS_LINK_HINTS_RE = regexp.MustCompile(`(?i)` + strings.Join(EXTRANEOUS_LINK_HINTS, "|")) // NEXT_LINK_TEXT_RE matches text that likely indicates a next page link NEXT_LINK_TEXT_RE = regexp.MustCompile(`(?i)(next|weiter|continue|>([^|]|$)|»([^|]|$))`) // CAP_LINK_TEXT_RE matches text that indicates end links (first, last, etc.) CAP_LINK_TEXT_RE = regexp.MustCompile(`(?i)(first|last|end)`) // PREV_LINK_TEXT_RE matches text that indicates previous page links PREV_LINK_TEXT_RE = regexp.MustCompile(`(?i)(prev|earl|old|new|<|«)`) // PAGE_RE matches pagination-related text PAGE_RE = regexp.MustCompile(`(?i)pag(e|ing|inat)`) )
Constants from JavaScript implementation
var ( // An ordered list of meta tag names that denote likely article titles. // All attributes should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. STRONG_TITLE_META_TAGS = []string{ "tweetmeme-title", "dc.title", "rbtitle", "headline", "title", } // og:title is weak because it typically contains context that we don't like, // for example the source site's name. Gotta get that brand into facebook! WEAK_TITLE_META_TAGS = []string{ "og:title", } // An ordered list of CSS Selectors to find likely article titles. // From most explicit to least explicit. // // Note - this does not use classes like CSS. This checks to see if the string // exists in the className, which is not as accurate as .className (which // splits on spaces/endlines), but for our purposes it's close enough. STRONG_TITLE_SELECTORS = []string{ ".hentry .entry-title", "h1#articleHeader", "h1.articleHeader", "h1.article", ".instapaper_title", "#meebo-title", } WEAK_TITLE_SELECTORS = []string{ "article h1", "#entry-title", ".entry-title", "#entryTitle", "#entrytitle", ".entryTitle", ".entrytitle", "#articleTitle", ".articleTitle", "post post-title", "h1.title", "h2.article", "h1", "html head title", "title", } // Regular expression for title separators TITLE_SPLITTERS_RE = regexp.MustCompile(`(: | - | \| )`) // Domain endings regex for cleaning DOMAIN_ENDINGS_RE = regexp.MustCompile(`\.com$|\.net$|\.org$|\.co\.uk$`) )
Title extraction constants matching JavaScript behavior exactly
var AUTHOR_META_TAGS = []string{
"byl",
"clmst",
"dc.author",
"dcsext.author",
"dc.creator",
"rbauthors",
"authors",
}
AUTHOR_META_TAGS - ordered list of meta tag names that denote likely article authors From most distinct to least distinct. Note: "author" is too often the developer of the page, so it is not included here.
var AUTHOR_SELECTORS = []string{
".entry .entry-author",
".author.vcard .fn",
".author .vcard .fn",
".byline.vcard .fn",
".byline .vcard .fn",
".byline .by .author",
".byline .by",
".byline .author",
".post-author.vcard",
".post-author .vcard",
"a[rel=author]",
"#by_author",
".by_author",
"#entryAuthor",
".entryAuthor",
".byline a[href*=author]",
"#author .authorname",
".author .authorname",
"#author",
".author",
".articleauthor",
".ArticleAuthor",
".byline",
}
AUTHOR_SELECTORS - ordered list of CSS selectors to find likely article authors From most explicit to least explicit. Uses class substring matching like JavaScript.
var BYLINE_SELECTORS_RE = [][2]interface{}{
{"#byline", bylineRe},
{".byline", bylineRe},
}
var ( // CANONICAL_META_SELECTORS - meta tag names for canonical URL extraction // From JavaScript: export const CANONICAL_META_SELECTORS = ['og:url']; CANONICAL_META_SELECTORS = []string{ "og:url", } )
URL extraction constants matching JavaScript behavior exactly
var CLEAN_AUTHOR_RE = regexp.MustCompile(`(?i)^\s*(posted |written )?by\s*:?\s*(.*)`)
CLEAN_AUTHOR_RE - regex for cleaning author prefixes Matches /^\s*(posted |written )?by\s*:?\s*(.*)/i from JavaScript
var DATE_PUBLISHED_META_TAGS = []string{
"article:published_time",
"displaydate",
"dc.date",
"dc.date.issued",
"rbpubdate",
"publish_date",
"pub_date",
"pagedate",
"pubdate",
"revision_date",
"doc_date",
"date_created",
"content_create_date",
"lastmodified",
"created",
"date",
}
DATE_PUBLISHED_META_TAGS - Ordered list of meta tag names that denote likely date published dates All attributes should be lowercase for faster case-insensitive matching From most distinct to least distinct (matches JavaScript exactly)
var DATE_PUBLISHED_SELECTORS = []string{
".hentry .dtstamp.published",
".hentry .published",
".hentry .dtstamp.updated",
".hentry .updated",
".single .published",
".meta .published",
".meta .postDate",
".entry-date",
".byline .date",
".postmetadata .date",
".article_datetime",
".date-header",
".story-date",
".dateStamp",
"#story .datetime",
".dateline",
".pubdate",
}
DATE_PUBLISHED_SELECTORS - Ordered list of CSS selectors to find likely date published dates From most explicit to least explicit (matches JavaScript exactly)
var DATE_PUBLISHED_URL_RES = []*regexp.Regexp{ regexp.MustCompile(`/(20\d{2}/\d{2}/\d{2})/`), regexp.MustCompile(`(20\d{2}-[01]\d-[0-3]\d)`), regexp.MustCompile(`/(20\d{2}/(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/[0-3]\d)/`), }
DATE_PUBLISHED_URL_RES - Ordered list of compiled regular expressions to find likely date published dates from the URL. These should always have the first reference be a date string that is parseable. Matches JavaScript exactly.
var EXCERPT_META_SELECTORS = []string{"og:description", "twitter:description"}
EXCERPT_META_SELECTORS defines the meta tag names to search for excerpt content This matches the JavaScript constants exactly: ['og:description', 'twitter:description']
var GenericDateExtractor = GenericDateExtractorType{}
var GenericTitleExtractor = struct { Extract func(doc *goquery.Selection, url string, metaCache []string) string }{ Extract: func(doc *goquery.Selection, url string, metaCache []string) string { html := "<html></html>" if doc.Length() > 0 { if fullHtml, err := doc.Html(); err == nil && fullHtml != "" { html = "<html>" + fullHtml + "</html>" } else { if doc.Parent().Length() > 0 { if parentHtml, err := doc.Parent().Html(); err == nil { html = "<html>" + parentHtml + "</html>" } } } } else { return "" } document, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return "" } title := dom.ExtractFromMeta(document, STRONG_TITLE_META_TAGS, metaCache, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } title = dom.ExtractFromSelectors(doc, STRONG_TITLE_SELECTORS, 1, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } title = dom.ExtractFromMeta(document, WEAK_TITLE_META_TAGS, metaCache, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } title = dom.ExtractFromSelectors(doc, WEAK_TITLE_SELECTORS, 1, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } return "" }, }
GenericTitleExtractor extracts article titles using multiple fallback strategies
var GenericUrlExtractor = struct { Extract func(doc *goquery.Selection, url string, metaCache []string) URLResult }{ Extract: func(doc *goquery.Selection, url string, metaCache []string) URLResult { canonical := doc.Find("link[rel=canonical]") if canonical.Length() != 0 { href, exists := canonical.Attr("href") if exists && href != "" { return result(href) } } // Second, check for canonical URL in meta tags // Need to convert selection to document for meta tag extraction var document *goquery.Document if doc.Is("html") { if docNode := doc.Get(0); docNode != nil { document = goquery.NewDocumentFromNode(docNode) } } if document == nil { if html, err := doc.Html(); err == nil { fullHTML := html if !containsHTML(html) { fullHTML = "<html>" + html + "</html>" } if tempDoc, err := goquery.NewDocumentFromReader(strings.NewReader(fullHTML)); err == nil { document = tempDoc } } } if document != nil { metaURL := dom.ExtractFromMeta(document, CANONICAL_META_SELECTORS, metaCache, false) if metaURL != nil && *metaURL != "" { return result(*metaURL) } } return result(url) }, }
GenericUrlExtractor provides URL extraction functionality matching JavaScript exactly
var GenericWordCountExtractor = struct { Extract func(options map[string]interface{}) int }{ Extract: func(options map[string]interface{}) int { if options == nil { return 1 } contentInterface, exists := options["content"] if !exists { return 1 } content, ok := contentInterface.(string) if !ok { return 1 } count := getWordCount(content) if count == 1 { count = getWordCountAlt(content) } return count }, }
GenericWordCountExtractor extracts word count from content using JavaScript-compatible logic
var LEAD_IMAGE_URL_META_TAGS = []string{
"og:image",
"twitter:image",
"image_src",
}
Lead image URL meta tags in priority order (most distinct first)
var LEAD_IMAGE_URL_SELECTORS = []string{
"link[rel=image_src]",
}
Fallback selectors for lead image extraction
var NEGATIVE_LEAD_IMAGE_URL_HINTS = []string{
"spacer", "sprite", "blank", "throbber", "gradient", "tile", "bg",
"background", "icon", "social", "header", "hdr", "advert", "spinner",
"loader", "loading", "default", "rating", "share", "facebook",
"twitter", "theme", "promo", "ads", "wp-includes",
}
Negative hints that decrease image score
var POSITIVE_LEAD_IMAGE_URL_HINTS = []string{
"upload",
"wp-content",
"large",
"photo",
"wp-image",
}
Positive hints that increase image score
var SPLIT_DATE_STRING = regexp.MustCompile(`(?i)([0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?)|([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})|(-[0-9]{3,4}$)|([0-9]{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)`)
SPLIT_DATE_STRING regex for splitting date components (matches JavaScript exactly with case-insensitive)
var TIME_AGO_STRING = regexp.MustCompile(`(\d+)\s+(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s+ago`)
TIME_AGO_STRING regex for parsing relative dates (X minutes ago, etc.)
Functions ¶
func CleanContent ¶
func CleanContent(article *goquery.Selection, opts CleanContentOptions) *goquery.Selection
CleanContent cleans article content, returning a new, cleaned node This adapts the JavaScript extractCleanNode function to work with Go's document-based DOM functions
func DirectionExtractor ¶
func DirectionExtractor(params ExtractorParams) (string, error)
DirectionExtractor extracts text direction from title field only Matches JavaScript: direction: ({ title }) => stringDirection.getDirection(title)
func ExtractBestNode ¶
func ExtractBestNode(doc *goquery.Document, opts ExtractBestNodeOptions) *goquery.Selection
ExtractBestNode extracts the content most likely to be article text using a variety of scoring techniques.
The function orchestrates the complete extraction pipeline: 1. Optionally strips unlikely candidates (comments, ads, etc.) 2. Converts elements to paragraphs for better scoring 3. Scores all content based on various signals 4. Finds and returns the top candidate element
This is a direct port of the JavaScript extractBestNode function with 100% compatibility.
Parameters:
- doc: A goquery Document representing the DOM to extract from
- opts: ExtractBestNodeOptions with configuration flags
- StripUnlikelyCandidates: If true, remove elements that match exclusion criteria
- WeightNodes: If true, use classNames and IDs to determine node worthiness
Returns:
- *goquery.Selection: The top candidate element, or nil if no suitable content found
func GetDirection ¶
GetDirection analyzes string direction and returns 'ltr', 'rtl', 'bidi', or '' (empty string). Direct port of JavaScript stringDirection.getDirection() function
func NodeIsSufficient ¶
NodeIsSufficient determines if a node has enough content to be considered article-like Given a node, determine if it's article-like enough to return Direct port of JavaScript nodeIsSufficient function
Types ¶
type CleanContentOptions ¶
type CleanContentOptions struct {
Doc *goquery.Document
CleanConditionally bool
Title string
URL string
DefaultCleaner bool
}
CleanContentOptions represents options for content cleaning
type ExtractBestNodeOptions ¶
ExtractBestNodeOptions represents configuration options for content extraction
type ExtractionOptions ¶
type ExtractionOptions struct {
URL string
HTML string
Doc *goquery.Document
MetaCache []string
Fallback bool
ContentType string
}
ExtractionOptions contains all parameters needed for extraction Matches JavaScript options object structure
type ExtractionResult ¶
type ExtractionResult struct {
Title string `json:"title"`
Author string `json:"author"`
DatePublished *time.Time `json:"date_published"` // null if not found
Dek string `json:"dek"`
LeadImageURL string `json:"lead_image_url"`
Content string `json:"content"`
NextPageURL string `json:"next_page_url"`
URL string `json:"url"`
Domain string `json:"domain"`
Excerpt string `json:"excerpt"`
WordCount int `json:"word_count"`
Direction string `json:"direction"`
SiteName string `json:"site_name"`
SiteTitle string `json:"site_title"`
SiteImage string `json:"site_image"`
Favicon string `json:"favicon"`
}
ExtractionResult represents the complete result from generic extraction Matches JavaScript extraction result structure exactly
type ExtractorImageParams ¶
type ExtractorImageParams struct {
Doc *goquery.Document
Content string
MetaCache map[string]string
HTML string
}
ExtractorImageParams contains parameters for image extraction
type ExtractorOptions ¶
type ExtractorOptions struct {
StripUnlikelyCandidates bool
WeightNodes bool
CleanConditionally bool
}
ExtractorOptions represents configuration options for content extraction
type ExtractorParams ¶
ExtractorParams contains all the parameters needed for extraction
type GenericAuthorExtractor ¶
type GenericAuthorExtractor struct{}
GenericAuthorExtractor provides author extraction functionality
type GenericContentExtractor ¶
type GenericContentExtractor struct {
DefaultOpts ExtractorOptions
}
GenericContentExtractor implements the main content extraction logic
func NewGenericContentExtractor ¶
func NewGenericContentExtractor() *GenericContentExtractor
NewGenericContentExtractor creates a new extractor with default options
func (*GenericContentExtractor) CleanAndReturnNode ¶
func (e *GenericContentExtractor) CleanAndReturnNode(node *goquery.Selection, doc *goquery.Document) string
CleanAndReturnNode finalizes the content by ensuring we have something and normalizing spaces Once we got here, either we're at our last-resort node, or we broke early. Make sure we at least have -something- before we move forward.
func (*GenericContentExtractor) Extract ¶
func (e *GenericContentExtractor) Extract(params ExtractorParams, opts ExtractorOptions) string
Extract extracts the content for this resource - initially, pass in the most restrictive opts which will return the highest quality content. On each failure, retry with slightly more lax opts.
The function implements the JavaScript extraction strategy: 1. Try with default strict options 2. If content is insufficient, cascade through options, disabling them one by one 3. Return the best content found
This matches the JavaScript behavior exactly for option cascading and content validation.
func (*GenericContentExtractor) GetContentNode ¶
func (e *GenericContentExtractor) GetContentNode(doc *goquery.Document, title, url string, opts ExtractorOptions) *goquery.Selection
GetContentNode gets the content node given current options This orchestrates the extraction pipeline: extract best node -> clean content
type GenericDateExtractorType ¶
type GenericDateExtractorType struct{}
GenericDateExtractor - Extractor for publication dates with 100% JavaScript compatibility
type GenericDekExtractor ¶
type GenericDekExtractor struct{}
GenericDekExtractor extracts article subtitles/descriptions (deks)
type GenericDescriptionExtractor ¶
type GenericDescriptionExtractor struct{}
GenericDescriptionExtractor extracts site descriptions
type GenericExcerptExtractor ¶
type GenericExcerptExtractor struct{}
GenericExcerptExtractor implements excerpt extraction logic
func NewGenericExcerptExtractor ¶
func NewGenericExcerptExtractor() *GenericExcerptExtractor
NewGenericExcerptExtractor creates a new excerpt extractor
type GenericExtractor ¶
type GenericExtractor struct {
Domain string
}
GenericExtractor coordinates individual field extractors This is NOT an implementation of parser.Extractor interface It's used internally by the parser package for generic extraction
func NewGenericExtractor ¶
func NewGenericExtractor() *GenericExtractor
NewGenericExtractor creates a new generic extractor instance
func (*GenericExtractor) ExtractGeneric ¶
func (ge *GenericExtractor) ExtractGeneric(options *ExtractionOptions) (*ExtractionResult, error)
ExtractGeneric performs the main generic extraction with full options
func (*GenericExtractor) GetDomain ¶
func (ge *GenericExtractor) GetDomain() string
GetDomain returns the domain this extractor handles
type GenericFaviconExtractor ¶
type GenericFaviconExtractor struct{}
GenericFaviconExtractor extracts the favicon URL
type GenericLanguageExtractor ¶
type GenericLanguageExtractor struct{}
GenericLanguageExtractor extracts content language information
type GenericLeadImageExtractor ¶
type GenericLeadImageExtractor struct{}
GenericLeadImageExtractor implements lead image extraction logic
func NewGenericLeadImageExtractor ¶
func NewGenericLeadImageExtractor() *GenericLeadImageExtractor
NewGenericLeadImageExtractor creates a new lead image extractor
func (*GenericLeadImageExtractor) Extract ¶
func (e *GenericLeadImageExtractor) Extract(params ExtractorImageParams) *string
Extract finds the lead image URL from the document using scoring and fallback strategies Matches JavaScript behavior: meta tags → content images → fallback selectors
type GenericNextPageUrlExtractor ¶
type GenericNextPageUrlExtractor struct{}
GenericNextPageUrlExtractor extracts next page URLs for multi-page articles
func NewGenericNextPageUrlExtractor ¶
func NewGenericNextPageUrlExtractor() *GenericNextPageUrlExtractor
NewGenericNextPageUrlExtractor creates a new instance
type GenericSiteImageExtractor ¶
type GenericSiteImageExtractor struct{}
GenericSiteImageExtractor extracts the main site image
type GenericSiteNameExtractor ¶
type GenericSiteNameExtractor struct{}
GenericSiteNameExtractor extracts the site name from meta tags
type GenericSiteTitleExtractor ¶
type GenericSiteTitleExtractor struct{}
GenericSiteTitleExtractor extracts the site title
type GenericThemeColorExtractor ¶
type GenericThemeColorExtractor struct{}
GenericThemeColorExtractor extracts the theme color from meta tags
type GenericVideoExtractor ¶
type GenericVideoExtractor struct{}
GenericVideoExtractor extracts video metadata from Open Graph and other meta tags
func (*GenericVideoExtractor) Extract ¶
func (extractor *GenericVideoExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) *VideoMetadata
Extract extracts video metadata from the page
func (*GenericVideoExtractor) ExtractVideoURL ¶
func (extractor *GenericVideoExtractor) ExtractVideoURL(selection *goquery.Selection, pageURL string, metaCache []string) string
ExtractVideoURL is a convenience function that returns just the primary video URL
type RTLScriptRange ¶
type RTLScriptRange struct {
From int // Starting Unicode code point
To int // Ending Unicode code point
}
RTLScriptRange represents a Unicode block range for RTL scripts
type VideoMetadata ¶
type VideoMetadata struct {
URL string `json:"url,omitempty"`
Type string `json:"type,omitempty"`
Width int `json:"width,omitempty"`
Height int `json:"height,omitempty"`
Duration int `json:"duration,omitempty"`
SecureURL string `json:"secure_url,omitempty"`
}
VideoMetadata contains structured video metadata