Documentation
¶
Overview ¶
ABOUTME: Cleans H1 tags from article content based on count-threshold analysis. ABOUTME: Removes H1s if there are fewer than 3; converts them to H2s if there are 3 or more, to preserve content.
ABOUTME: Rewrites top-level DOM elements (html, body) to avoid multiple-body-tag complications. ABOUTME: Converts html and body elements to div tags while preserving their content and attributes.
ABOUTME: Single-attribute setter utility that mirrors the JavaScript setAttr behavior. ABOUTME: Handles goquery Selection objects for DOM manipulation.
Index ¶
- Constants
- Variables
- func ArticleBaseURL(rawURL string) string
- func BrsToPs(doc *goquery.Document) *goquery.Document
- func CleanAttributes(doc *goquery.Document) *goquery.Document
- func CleanHOnes(doc *goquery.Document) *goquery.Document
- func CleanHeaders(doc *goquery.Document, title string) *goquery.Document
- func CleanHeadersWithoutTitle(doc *goquery.Document) *goquery.Document
- func CleanImages(doc *goquery.Document) *goquery.Document
- func CleanTags(doc *goquery.Document) *goquery.Document
- func ConvertNodeTo(node *goquery.Selection, tag string)
- func ConvertToParagraphs(doc *goquery.Document) *goquery.Document
- func CountSentences(text string) int
- func CountWords(text string) int
- func DetectTextDirection(text string) string
- func ExtractFromMeta(doc *goquery.Document, metaNames []string, cachedNames []string, ...) *string
- func ExtractFromSelectors(doc *goquery.Selection, selectors []string, maxChildren int, textOnly bool) *string
- func FindTopCandidate(doc *goquery.Document) *goquery.Selection
- func GetAttr(selection *goquery.Selection, attrName string) (string, bool)
- func GetAttrs(selection *goquery.Selection) map[string]string
- func GetBaseDomain(rawURL string) string
- func GetContentScore(element *goquery.Selection) float64
- func GetDomain(rawURL string) string
- func GetRemoveAttrSelectors() []string
- func GetRemoveEmptySelectors() []string
- func GetWeight(element *goquery.Selection) int
- func HasAttr(selection *goquery.Selection, attrName string) bool
- func HasSentenceEnd(text string) bool
- func IsLikelyArticleElement(element *goquery.Selection) bool
- func IsWordpress(doc *goquery.Document) bool
- func LinkDensity(element *goquery.Selection) float64
- func MakeLinksAbsolute(doc *goquery.Document, rootURL string) *goquery.Document
- func MarkToKeep(doc *goquery.Document) *goquery.Document
- func MergeSiblings(candidate *goquery.Selection, topScore int, doc *goquery.Document) *goquery.Selection
- func NodeIsSufficient(element *goquery.Selection) bool
- func RemoveAnchor(rawURL string) string
- func RemoveAttr(selection *goquery.Selection, attrName string) *goquery.Selection
- func RemoveEmpty(doc *goquery.Document) *goquery.Document
- func RewriteTopLevel(doc *goquery.Document) *goquery.Document
- func SanitizeURL(rawURL string) string
- func ScoreContent(doc *goquery.Document, weightNodes bool)
- func SetAttr(selection *goquery.Selection, attr, val string) *goquery.Selection
- func StripJunkTags(doc *goquery.Document) *goquery.Document
- func StripTags(text string) string
- func StripUnlikelyCandidates(doc *goquery.Document) *goquery.Document
- func ValidateURL(rawURL string) bool
- func WithinComment(element *goquery.Selection) bool
Constants ¶
const IS_WP_SELECTOR = `meta[name="generator"][value^="WordPress"]`
CSS selector used to try to determine whether a page is WordPress-based. Not always successful.
const KEEP_CLASS = "hermes-parser-keep"
The class we will use to mark elements we want to keep but would normally remove
Variables ¶
var BAD_TAGS = regexp.MustCompile(`(?i)^(address|form)$`)
var BLOCK_LEVEL_TAGS = []string{
"article",
"aside",
"blockquote",
"body",
"br",
"button",
"canvas",
"caption",
"col",
"colgroup",
"dd",
"div",
"dl",
"dt",
"embed",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hgroup",
"hr",
"li",
"map",
"object",
"ol",
"output",
"p",
"pre",
"progress",
"section",
"table",
"tbody",
"textarea",
"tfoot",
"th",
"thead",
"tr",
"ul",
"video",
}
A list of all of the block level tags known in HTML5 and below. Taken from http://bit.ly/qneNIT
var BLOCK_LEVEL_TAGS_RE = regexp.MustCompile(`(?i)^(article|aside|blockquote|body|br|button|canvas|caption|col|colgroup|dd|div|dl|dt|embed|fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|map|object|ol|output|p|pre|progress|section|table|tbody|textarea|tfoot|th|thead|tr|ul|video)$`)
var BR_TAGS_RE = regexp.MustCompile(`(?i)(<br[^>]*>[ \n\r\t]*){2,}`)
Match 2 or more consecutive <br> tags
var BR_TAG_RE = regexp.MustCompile(`(?i)<br[^>]*>`)
Match 1 BR tag.
var CANDIDATES_BLACKLIST = regexp.MustCompile(`(?i)(` + candidatesBlacklist + `)`)
var CANDIDATES_WHITELIST = regexp.MustCompile(`(?i)(` + candidatesWhitelist + `)`)
var CAP_LINK_TEXT_RE = regexp.MustCompile(`(?i)(first|last|end)`)
Match any link text/classname/id that looks like it is an end link: things like "first", "last", "end", etc.
var CHILD_CONTENT_TAGS = regexp.MustCompile(`(?i)^(td|blockquote|ol|ul|dl)$`)
var CLEAN_CONDITIONALLY_TAGS = []string{
"ul",
"ol",
"table",
"div",
"button",
"form",
}
cleanTags
var CLEAN_CONDITIONALLY_TAGS_LIST = "ul,ol,table,div,button,form"
var DIGIT_RE = regexp.MustCompile(`[0-9]`)
Match a digit. Pretty clear.
var DIV_TO_P_BLOCK_TAGS = []string{
"a",
"blockquote",
"dl",
"div",
"img",
"p",
"pre",
"table",
}
A list of tags which, if found inside, should cause a <div /> to NOT be turned into a paragraph tag. Shallow div tags without these elements should be turned into <p /> tags.
var DIV_TO_P_BLOCK_TAGS_LIST = "a,blockquote,dl,div,img,p,pre,table"
var EXTRANEOUS_LINK_HINTS = []string{
"print",
"archive",
"comment",
"discuss",
"e-mail",
"email",
"share",
"reply",
"all",
"login",
"sign",
"single",
"adx",
"entry-unrelated",
}
A list of words that, if found in link text or URLs, likely mean that this link is not a next page link.
var EXTRANEOUS_LINK_HINTS_RE = regexp.MustCompile(`(?i)print|archive|comment|discuss|e-mail|email|share|reply|all|login|sign|single|adx|entry-unrelated`)
var HEADER_TAGS = []string{"h2", "h3", "h4", "h5", "h6"}
cleanHeaders
var HEADER_TAG_LIST = "h2,h3,h4,h5,h6"
var HNEWS_CONTENT_SELECTORS = [][]string{
{".hentry", ".entry-content"},
{"entry", ".entry-content"},
{".entry", ".entry_content"},
{".post", ".postbody"},
{".post", ".post_body"},
{".post", ".post-body"},
}
A list of selectors that specify, very clearly, either hNews or other very content-specific style content, like Blogger templates. More examples here: http://microformats.org/wiki/blog-post-formats
var HTML_OR_BODY_RE = regexp.MustCompile(`(?i)^(html|body)$`)
var KEEP_SELECTORS = []string{
`iframe[src^="https://www.youtube.com"]`,
`iframe[src^="https://www.youtube-nocookie.com"]`,
`iframe[src^="http://www.youtube.com"]`,
`iframe[src^="https://player.vimeo"]`,
`iframe[src^="http://player.vimeo"]`,
`iframe[src^="https://www.redditmedia.com"]`,
}
var NEGATIVE_SCORE_HINTS = []string{
"adbox",
"advert",
"author",
"bio",
"bookmark",
"bottom",
"byline",
"clear",
"com-",
"combx",
"comment",
`comment\\B`,
"contact",
"copy",
"credit",
"crumb",
"date",
"deck",
"excerpt",
"featured",
"foot",
"footer",
"footnote",
"graf",
"head",
"info",
"infotext",
"instapaper_ignore",
"jump",
"linebreak",
"link",
"masthead",
"media",
"meta",
"modal",
"outbrain",
"promo",
"pr_",
"related",
"respond",
"roundcontent",
"scroll",
"secondary",
"share",
"shopping",
"shoutbox",
"side",
"sidebar",
"sponsor",
"stamp",
"sub",
"summary",
"tags",
"tools",
"widget",
}
A list of strings that denote a negative scoring for this content as being an article container. Checked against className and id.
TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_RE = regexp.MustCompile(`(?i)adbox|advert|author|bio|bookmark|bottom|byline|clear|com-|combx|comment|comment\\B|contact|copy|credit|crumb|date|deck|excerpt|featured|foot|footer|footnote|graf|head|info|infotext|instapaper_ignore|jump|linebreak|link|masthead|media|meta|modal|outbrain|promo|pr_|related|respond|roundcontent|scroll|secondary|share|shopping|shoutbox|side|sidebar|sponsor|stamp|sub|summary|tags|tools|widget`)
The above list, joined into a matching regular expression
var NEXT_LINK_TEXT_RE = regexp.MustCompile(`(?i)(next|weiter|continue|>([^|]|$)|»([^|]|$))`)
Match any link text/classname/id that looks like it could mean the next page. Things like: next, continue, >, >>, » but not >|, »| as those can mean last page.
var NON_TOP_CANDIDATE_TAGS = []string{
"br",
"b",
"i",
"label",
"hr",
"area",
"base",
"basefont",
"input",
"img",
"link",
"meta",
}
A list of tags that should be ignored when trying to find the top candidate for a document.
var NON_TOP_CANDIDATE_TAGS_RE = regexp.MustCompile(`(?i)^(br|b|i|label|hr|area|base|basefont|input|img|link|meta)$`)
var PAGE_RE = regexp.MustCompile(`(?i)pag(e|ing|inat)`)
Match any phrase that looks like it could be page, or paging, or pagination
var PARAGRAPH_SCORE_TAGS = regexp.MustCompile(`(?i)^(p|li|span|pre)$`)
var PHOTO_HINTS = []string{"figure", "photo", "image", "caption"}
var PHOTO_HINTS_RE = regexp.MustCompile(`(?i)figure|photo|image|caption`)
var POSITIVE_SCORE_HINTS = []string{
"article",
"articlecontent",
"instapaper_body",
"blog",
"body",
"content",
"entry-content-asset",
"entry",
"hentry",
"main",
"Normal",
"page",
"pagination",
"permalink",
"post",
"story",
"text",
"[-_]copy",
`\\Bcopy`,
}
A list of strings that denote a positive scoring for this content as being an article container. Checked against className and id.
TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_RE = regexp.MustCompile(`(?i)article|articlecontent|instapaper_body|blog|body|content|entry-content-asset|entry|hentry|main|Normal|page|pagination|permalink|post|story|text|[-_]copy|\\Bcopy`)
The above list, joined into a matching regular expression
var PREV_LINK_TEXT_RE = regexp.MustCompile(`(?i)(prev|earl|old|new|<|«)`)
Match any link text/classname/id that looks like it means the previous page.
var READABILITY_ASSET = regexp.MustCompile(`(?i)entry-content-asset`)
Readability publisher-specific guidelines
var REMOVE_ATTRS = []string{"style", "align"}
cleanAttributes
var REMOVE_ATTR_LIST = "style,align"
var REMOVE_EMPTY_SELECTORS = "p:empty"
var REMOVE_EMPTY_TAGS = []string{"p"}
removeEmpty
var SPACER_RE = regexp.MustCompile(`(?i)transparent|spacer|blank`)
Spacer images to be removed
var STRIP_OUTPUT_TAGS = []string{
"title",
"script",
"noscript",
"link",
"style",
"hr",
"embed",
"iframe",
"object",
}
A list of tags to strip from the output if we encounter them.
var UNLIKELY_CANDIDATES_BLACKLIST = []string{
"ad-break",
"adbox",
"advert",
"addthis",
"agegate",
"aux",
"blogger-labels",
"combx",
"comment",
"conversation",
"disqus",
"entry-unrelated",
"extra",
"foot",
"header",
"hidden",
"loader",
"login",
"menu",
"meta",
"nav",
"outbrain",
"pager",
"pagination",
"predicta",
"presence_control_external",
"popup",
"printfriendly",
"related",
"remove",
"remark",
"rss",
"share",
"shoutbox",
"sidebar",
"sociable",
"sponsor",
"taboola",
"tools",
}
A list of strings that can be considered unlikely candidates when extracting content from a resource. These strings are joined together and then tested for existence using regex, so may contain simple, non-pipe style regular expression queries if necessary.
var UNLIKELY_CANDIDATES_WHITELIST = []string{
"and",
"article",
"body",
"blogindex",
"column",
"content",
"entry-content-asset",
"format",
"hfeed",
"hentry",
"hatom",
"main",
"page",
"posts",
"shadow",
}
A list of strings that can be considered LIKELY candidates when extracting content from a resource. Essentially, the inverse of the blacklist above — if something matches both the blacklist and the whitelist, it is kept. This is useful, for example, if something has a className of "rss-content entry-content": it matches 'rss', so it would normally be removed; however, it is also the entry content, so it should be left alone.
These strings are joined together and then tested for existence using regex, so may contain simple, non-pipe style regular expression queries if necessary.
var UNLIKELY_RE = regexp.MustCompile(`(?i)!(` + candidatesWhitelist + `)|(` + candidatesBlacklist + `)`)
var WHITELIST_ATTRS = []string{
"src",
"srcset",
"sizes",
"type",
"href",
"class",
"id",
"alt",
"xlink:href",
"width",
"height",
}
var WHITELIST_ATTRS_RE = regexp.MustCompile(`(?i)^(src|srcset|sizes|type|href|class|id|alt|xlink:href|width|height)$`)
Functions ¶
func ArticleBaseURL ¶
ArticleBaseURL extracts the base URL for the article, removing fragments and query parameters
func BrsToPs ¶
BrsToPs converts consecutive <br /> tags into <p /> tags JavaScript implementation: src/utils/dom/brs-to-ps.js
Given goquery Document, convert consecutive <br /> tags into <p /> tags instead. The algorithm exactly matches JavaScript: 1. Iterate through all BR elements 2. If next element is also BR, set collapsing=true and remove current BR 3. If collapsing and current BR is NOT followed by another BR, call paragraphize on that last BR
func CleanAttributes ¶
CleanAttributes removes unwanted attributes from elements and keeps only whitelisted ones
func CleanHOnes ¶
CleanHOnes processes H1 tags in a document based on their count. H1 tags are typically the article title, which should be extracted by the title extractor instead. If there are fewer than 3 of them (<3), strip them. Otherwise, turn them into H2s.
This preserves content structure when there are multiple H1s that likely represent section headers rather than the main title.
:param doc: A goquery Document to process :return: The modified goquery Document (returned for convenience, mutation is in-place)
func CleanHeaders ¶
CleanHeaders removes headers that don't meet certain criteria This exactly matches the JavaScript implementation with 3 removal conditions: 1. Headers appearing before all <p> tags (likely title/subtitle) 2. Headers that exactly match the article title 3. Headers with negative content weight (likely ads/junk)
func CleanHeadersWithoutTitle ¶
CleanHeadersWithoutTitle is a convenience function for when title is not available
func CleanImages ¶
CleanImages removes images that are likely spacers, ads, or decorative This exactly matches the JavaScript implementation with proper size thresholds
func CleanTags ¶
CleanTags conditionally removes elements based on their content and context This exactly matches the JavaScript cleanTags implementation JavaScript: export default function cleanTags($article, $)
func ConvertNodeTo ¶
ConvertNodeTo converts a node to a different tag type while preserving attributes and content
func ConvertToParagraphs ¶
ConvertToParagraphs loops through the provided doc, and converts any p-like elements to actual paragraph tags.
Things fitting this criteria: * Multiple consecutive <br /> tags. * <div /> tags without block level elements inside of them * <span /> tags who are not children of <p /> or <div /> tags.
:param doc: A goquery Document to search :return: goquery Document with new p elements (By-reference mutation, though. Returned just for convenience.)
func CountSentences ¶
CountSentences estimates the number of sentences in text
func DetectTextDirection ¶
DetectTextDirection attempts to detect the text direction (LTR/RTL) of content
func ExtractFromMeta ¶
func ExtractFromMeta(doc *goquery.Document, metaNames []string, cachedNames []string, cleanTags bool) *string
ExtractFromMeta extracts content from HTML meta tags Given a list of meta tag names to search for, find a meta tag associated. This function provides 100% JavaScript compatibility.
func ExtractFromSelectors ¶
func ExtractFromSelectors(doc *goquery.Selection, selectors []string, maxChildren int, textOnly bool) *string
ExtractFromSelectors finds content that may be extractable from the document using CSS selectors. This is for flat meta-information, like author, title, date published, etc.
Parameters: - doc: The goquery document/selection to search within - selectors: List of CSS selectors to try in order - maxChildren: Maximum number of child elements allowed (default 1) - textOnly: If true, extract text content; if false, extract HTML (default true)
Returns: - *string: The extracted content, or nil if nothing suitable found
func FindTopCandidate ¶
FindTopCandidate finds the element with the highest score after calculating all scores After we've calculated scores, loop through all of the possible candidate nodes we found and find the one with the highest score. JavaScript: export default function findTopCandidate($)
func GetAttr ¶
GetAttr is a convenience function to get a single attribute It's equivalent to selection.Attr() but provides consistent behavior
func GetAttrs ¶
GetAttrs returns all attributes of a goquery node as a map This mimics the JavaScript getAttrs function that works with both cheerio's attribs and browser DOM attributes
func GetBaseDomain ¶
GetBaseDomain extracts the base domain (removing subdomains) from a URL
func GetContentScore ¶
GetContentScore calculates a basic content score for an element
func GetRemoveAttrSelectors ¶
func GetRemoveAttrSelectors() []string
func GetRemoveEmptySelectors ¶
func GetRemoveEmptySelectors() []string
func GetWeight ¶
GetWeight scores a node based on its className and id JavaScript: function getWeight(node)
func HasSentenceEnd ¶
HasSentenceEnd checks if text ends with proper sentence punctuation
func IsLikelyArticleElement ¶
IsLikelyArticleElement checks if an element is likely to contain article content
func IsWordpress ¶
IsWordpress detects if a page is likely WordPress-based
func LinkDensity ¶
LinkDensity calculates the density of links in an element Returns the ratio of link text length to total text length
func MakeLinksAbsolute ¶
MakeLinksAbsolute converts all relative URLs in the document to absolute URLs This exactly matches the JavaScript makeLinksAbsolute implementation JavaScript: export default function makeLinksAbsolute($content, $, url)
func MarkToKeep ¶
MarkToKeep marks important elements that should be preserved during cleaning
func MergeSiblings ¶
func MergeSiblings(candidate *goquery.Selection, topScore int, doc *goquery.Document) *goquery.Selection
MergeSiblings merges sibling elements that may be part of the main content Now that we have a top_candidate, look through the siblings of it to see if any of them are decently scored. JavaScript: export default function mergeSiblings($candidate, topScore, $)
func NodeIsSufficient ¶
NodeIsSufficient determines if a node has enough content to be considered sufficient This exactly matches the JavaScript nodeIsSufficient implementation JavaScript: export default function nodeIsSufficient($node) { return $node.text().trim().length >= 100; }
func RemoveAnchor ¶
RemoveAnchor removes the anchor/fragment from a URL
func RemoveAttr ¶
RemoveAttr is a convenience function to remove an attribute
func RemoveEmpty ¶
RemoveEmpty removes elements that are empty or contain only whitespace
func RewriteTopLevel ¶
RewriteTopLevel rewrites the tag name to div if it's a top level node like body or html to avoid later complications with multiple body tags. This is a faithful port of the JavaScript rewriteTopLevel function.
func SanitizeURL ¶
SanitizeURL cleans up a URL by removing tracking parameters and normalizing
func ScoreContent ¶
ScoreContent orchestrates the entire content scoring process JavaScript: export default function scoreContent($, weightNodes = true)
func SetAttr ¶
SetAttr sets a single attribute on a DOM node This function mirrors the JavaScript setAttr behavior, handling goquery selections which are equivalent to cheerio nodes in the original JavaScript implementation.
In JavaScript, this function handled two cases: 1. Cheerio nodes (with attribs property) - equivalent to our goquery selections 2. Browser DOM nodes (with setAttribute method) - not applicable in Go/server environment
Parameters:
- selection: The goquery selection to modify
- attr: The attribute name to set
- val: The attribute value to set
Returns:
- The modified goquery selection for method chaining
func StripJunkTags ¶
StripJunkTags removes unwanted elements like scripts, styles, etc.
func StripTags ¶
StripTags removes all HTML tags from a string of text Returns plain text content with all HTML tags removed Removes non-content elements (script, style, noscript, head, meta, link) and HTML comments If the result is empty, returns the original text (JavaScript behavior)
func StripUnlikelyCandidates ¶
StripUnlikelyCandidates loops through the provided document and removes any non-link nodes that are unlikely candidates for article content.
Links are ignored because there are very often links to content that are identified as non-body-content, but may be inside article-like content.
:param doc: a goquery Document to strip nodes from :return: the cleaned goquery Document
func ValidateURL ¶
ValidateURL checks if a URL is valid and well-formed
func WithinComment ¶
WithinComment checks if an element is within a comment section
Types ¶
This section is empty.