Documentation
¶
Overview ¶
ABOUTME: Cleans H1 tags from article content based on count-threshold analysis. ABOUTME: Removes H1s if there are fewer than 3; converts them to H2s if there are 3 or more, to preserve content.
ABOUTME: Rewrites top-level DOM elements (html, body) to avoid multiple-body-tag complications. ABOUTME: Converts html and body elements to div tags while preserving their content and attributes.
ABOUTME: Single-attribute setter utility that mirrors the JavaScript setAttr behavior. ABOUTME: Handles goquery Selection objects for DOM manipulation.
Index ¶
- Constants
- Variables
- func ArticleBaseURL(rawURL string) string
- func BrsToPs(doc *goquery.Document) *goquery.Document
- func CleanAttributes(doc *goquery.Document) *goquery.Document
- func CleanHOnes(doc *goquery.Document) *goquery.Document
- func CleanHeaders(doc *goquery.Document, title string) *goquery.Document
- func CleanHeadersWithoutTitle(doc *goquery.Document) *goquery.Document
- func CleanImages(doc *goquery.Document) *goquery.Document
- func CleanTags(doc *goquery.Document) *goquery.Document
- func ConvertNodeTo(node *goquery.Selection, tag string)
- func ConvertToParagraphs(doc *goquery.Document) *goquery.Document
- func CountSentences(text string) int
- func CountWords(text string) int
- func DetectTextDirection(text string) string
- func ExtractFromMeta(doc *goquery.Document, metaNames []string, cachedNames []string, ...) *string
- func ExtractFromSelectors(doc *goquery.Selection, selectors []string, maxChildren int, textOnly bool) *string
- func FindTopCandidate(doc *goquery.Document) *goquery.Selection
- func GetAttr(selection *goquery.Selection, attrName string) (string, bool)
- func GetAttrs(selection *goquery.Selection) map[string]string
- func GetBaseDomain(rawURL string) string
- func GetContentScore(element *goquery.Selection) float64
- func GetDomain(rawURL string) string
- func GetRemoveAttrSelectors() []string
- func GetRemoveEmptySelectors() []string
- func GetWeight(element *goquery.Selection) int
- func HasAttr(selection *goquery.Selection, attrName string) bool
- func HasSentenceEnd(text string) bool
- func IsLikelyArticleElement(element *goquery.Selection) bool
- func IsWordpress(doc *goquery.Document) bool
- func LinkDensity(element *goquery.Selection) float64
- func MakeLinksAbsolute(doc *goquery.Document, rootURL string) *goquery.Document
- func MarkToKeep(doc *goquery.Document) *goquery.Document
- func MergeSiblings(candidate *goquery.Selection, topScore int, doc *goquery.Document) *goquery.Selection
- func NodeIsSufficient(element *goquery.Selection) bool
- func RemoveAnchor(rawURL string) string
- func RemoveAttr(selection *goquery.Selection, attrName string) *goquery.Selection
- func RemoveEmpty(doc *goquery.Document) *goquery.Document
- func RewriteTopLevel(doc *goquery.Document) *goquery.Document
- func SanitizeURL(rawURL string) string
- func ScoreContent(doc *goquery.Document, weightNodes bool)
- func SetAttr(selection *goquery.Selection, attr, val string) *goquery.Selection
- func StripJunkTags(doc *goquery.Document) *goquery.Document
- func StripTags(text string) string
- func StripUnlikelyCandidates(doc *goquery.Document) *goquery.Document
- func ValidateURL(rawURL string) bool
- func WithinComment(element *goquery.Selection) bool
Constants ¶
const IS_WP_SELECTOR = `meta[name="generator"][value^="WordPress"]`
CSS selector used to try to determine whether a page is WordPress-based. Not always successful.
const KEEP_CLASS = "hermes-parser-keep"
The class we will use to mark elements we want to keep but would normally remove
Variables ¶
var BAD_TAGS = regexp.MustCompile(`(?i)^(address|form)$`)
var BLOCK_LEVEL_TAGS = []string{
"article",
"aside",
"blockquote",
"body",
"br",
"button",
"canvas",
"caption",
"col",
"colgroup",
"dd",
"div",
"dl",
"dt",
"embed",
"fieldset",
"figcaption",
"figure",
"footer",
"form",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hgroup",
"hr",
"li",
"map",
"object",
"ol",
"output",
"p",
"pre",
"progress",
"section",
"table",
"tbody",
"textarea",
"tfoot",
"th",
"thead",
"tr",
"ul",
"video",
}
A list of all of the block level tags known in HTML5 and below. Taken from http://bit.ly/qneNIT
var BLOCK_LEVEL_TAGS_RE = regexp.MustCompile(`(?i)^(article|aside|blockquote|body|br|button|canvas|caption|col|colgroup|dd|div|dl|dt|embed|fieldset|figcaption|figure|footer|form|h1|h2|h3|h4|h5|h6|header|hgroup|hr|li|map|object|ol|output|p|pre|progress|section|table|tbody|textarea|tfoot|th|thead|tr|ul|video)$`)
var BR_TAGS_RE = regexp.MustCompile(`(?i)(<br[^>]*>[ \n\r\t]*){2,}`)
Match 2 or more consecutive <br> tags
var BR_TAG_RE = regexp.MustCompile(`(?i)<br[^>]*>`)
Match 1 BR tag.
var CANDIDATES_BLACKLIST = regexp.MustCompile(`(?i)(` + candidatesBlacklist + `)`)
var CANDIDATES_WHITELIST = regexp.MustCompile(`(?i)(` + candidatesWhitelist + `)`)
var CAP_LINK_TEXT_RE = regexp.MustCompile(`(?i)(first|last|end)`)
Match any link text/classname/id that looks like it is an end link: things like "first", "last", "end", etc.
var CHILD_CONTENT_TAGS = regexp.MustCompile(`(?i)^(td|blockquote|ol|ul|dl)$`)
var CLEAN_CONDITIONALLY_TAGS = []string{
"ul",
"ol",
"table",
"div",
"button",
"form",
}
cleanTags
var CLEAN_CONDITIONALLY_TAGS_LIST = "ul,ol,table,div,button,form"
var DIGIT_RE = regexp.MustCompile(`[0-9]`)
Match a digit. Pretty clear.
var DIV_TO_P_BLOCK_TAGS = []string{
"a",
"blockquote",
"dl",
"div",
"img",
"p",
"pre",
"table",
}
A list of tags which, if found inside, should cause a <div /> to NOT be turned into a paragraph tag. Shallow div tags without these elements should be turned into <p /> tags.
var DIV_TO_P_BLOCK_TAGS_LIST = "a,blockquote,dl,div,img,p,pre,table"
var EXTRANEOUS_LINK_HINTS = []string{
"print",
"archive",
"comment",
"discuss",
"e-mail",
"email",
"share",
"reply",
"all",
"login",
"sign",
"single",
"adx",
"entry-unrelated",
}
A list of words that, if found in link text or URLs, likely mean that this link is not a next page link.
var EXTRANEOUS_LINK_HINTS_RE = regexp.MustCompile(`(?i)print|archive|comment|discuss|e-mail|email|share|reply|all|login|sign|single|adx|entry-unrelated`)
var HEADER_TAGS = []string{"h2", "h3", "h4", "h5", "h6"}
cleanHeaders
var HEADER_TAG_LIST = "h2,h3,h4,h5,h6"
var HNEWS_CONTENT_SELECTORS = [][]string{
{".hentry", ".entry-content"},
{"entry", ".entry-content"},
{".entry", ".entry_content"},
{".post", ".postbody"},
{".post", ".post_body"},
{".post", ".post-body"},
}
A list of selectors that specify, very clearly, either hNews or other very content-specific style content, like Blogger templates. More examples here: http://microformats.org/wiki/blog-post-formats
var HTML_OR_BODY_RE = regexp.MustCompile(`(?i)^(html|body)$`)
var KEEP_SELECTORS = []string{
`iframe[src^="https://www.youtube.com"]`,
`iframe[src^="https://www.youtube-nocookie.com"]`,
`iframe[src^="http://www.youtube.com"]`,
`iframe[src^="https://player.vimeo"]`,
`iframe[src^="http://player.vimeo"]`,
`iframe[src^="https://www.redditmedia.com"]`,
}
var NEGATIVE_SCORE_HINTS = []string{
"adbox",
"advert",
"author",
"bio",
"bookmark",
"bottom",
"byline",
"clear",
"com-",
"combx",
"comment",
`comment\\B`,
"contact",
"copy",
"credit",
"crumb",
"date",
"deck",
"excerpt",
"featured",
"foot",
"footer",
"footnote",
"graf",
"head",
"info",
"infotext",
"instapaper_ignore",
"jump",
"linebreak",
"link",
"masthead",
"media",
"meta",
"modal",
"outbrain",
"promo",
"pr_",
"related",
"respond",
"roundcontent",
"scroll",
"secondary",
"share",
"shopping",
"shoutbox",
"side",
"sidebar",
"sponsor",
"stamp",
"sub",
"summary",
"tags",
"tools",
"widget",
}
A list of strings that denote a negative scoring for this content as being an article container. Checked against className and id.
TODO: Perhaps have these scale based on their odds of being quality?
var NEGATIVE_SCORE_RE = regexp.MustCompile(`(?i)adbox|advert|author|bio|bookmark|bottom|byline|clear|com-|combx|comment|comment\\B|contact|copy|credit|crumb|date|deck|excerpt|featured|foot|footer|footnote|graf|head|info|infotext|instapaper_ignore|jump|linebreak|link|masthead|media|meta|modal|outbrain|promo|pr_|related|respond|roundcontent|scroll|secondary|share|shopping|shoutbox|side|sidebar|sponsor|stamp|sub|summary|tags|tools|widget`)
The above list, joined into a matching regular expression
var NEXT_LINK_TEXT_RE = regexp.MustCompile(`(?i)(next|weiter|continue|>([^|]|$)|»([^|]|$))`)
Match any link text/classname/id that looks like it could mean the next page. Things like: next, continue, >, >>, » but not >|, »| as those can mean last page.
var NON_TOP_CANDIDATE_TAGS = []string{
"br",
"b",
"i",
"label",
"hr",
"area",
"base",
"basefont",
"input",
"img",
"link",
"meta",
}
A list of tags that should be ignored when trying to find the top candidate for a document.
var NON_TOP_CANDIDATE_TAGS_RE = regexp.MustCompile(`(?i)^(br|b|i|label|hr|area|base|basefont|input|img|link|meta)$`)
var PAGE_RE = regexp.MustCompile(`(?i)pag(e|ing|inat)`)
Match any phrase that looks like it could be page, or paging, or pagination
var PARAGRAPH_SCORE_TAGS = regexp.MustCompile(`(?i)^(p|li|span|pre)$`)
var PHOTO_HINTS = []string{"figure", "photo", "image", "caption"}
var PHOTO_HINTS_RE = regexp.MustCompile(`(?i)figure|photo|image|caption`)
var POSITIVE_SCORE_HINTS = []string{
"article",
"articlecontent",
"instapaper_body",
"blog",
"body",
"content",
"entry-content-asset",
"entry",
"hentry",
"main",
"Normal",
"page",
"pagination",
"permalink",
"post",
"story",
"text",
"[-_]copy",
`\\Bcopy`,
}
A list of strings that denote a positive scoring for this content as being an article container. Checked against className and id.
TODO: Perhaps have these scale based on their odds of being quality?
var POSITIVE_SCORE_RE = regexp.MustCompile(`(?i)article|articlecontent|instapaper_body|blog|body|content|entry-content-asset|entry|hentry|main|Normal|page|pagination|permalink|post|story|text|[-_]copy|\\Bcopy`)
The above list, joined into a matching regular expression
var PREV_LINK_TEXT_RE = regexp.MustCompile(`(?i)(prev|earl|old|new|<|«)`)
Match any link text/classname/id that looks like it means the previous page.
var READABILITY_ASSET = regexp.MustCompile(`(?i)entry-content-asset`)
Readability publisher-specific guidelines
var REMOVE_ATTRS = []string{"style", "align"}
cleanAttributes
var REMOVE_ATTR_LIST = "style,align"
var REMOVE_EMPTY_SELECTORS = "p:empty"
var REMOVE_EMPTY_TAGS = []string{"p"}
removeEmpty
var SPACER_RE = regexp.MustCompile(`(?i)transparent|spacer|blank`)
Spacer images to be removed
var STRIP_OUTPUT_TAGS = []string{
"title",
"script",
"noscript",
"link",
"style",
"hr",
"embed",
"iframe",
"object",
}
A list of tags to strip from the output if we encounter them.
var UNLIKELY_CANDIDATES_BLACKLIST = []string{
"ad-break",
"adbox",
"advert",
"addthis",
"agegate",
"aux",
"blogger-labels",
"combx",
"comment",
"conversation",
"disqus",
"entry-unrelated",
"extra",
"foot",
"header",
"hidden",
"loader",
"login",
"menu",
"meta",
"nav",
"outbrain",
"pager",
"pagination",
"predicta",
"presence_control_external",
"popup",
"printfriendly",
"related",
"remove",
"remark",
"rss",
"share",
"shoutbox",
"sidebar",
"sociable",
"sponsor",
"taboola",
"tools",
}
A list of strings that can be considered unlikely candidates when extracting content from a resource. These strings are joined together and then tested for existence using regex, so may contain simple, non-pipe style regular expression queries if necessary.
var UNLIKELY_CANDIDATES_WHITELIST = []string{
"and",
"article",
"body",
"blogindex",
"column",
"content",
"entry-content-asset",
"format",
"hfeed",
"hentry",
"hatom",
"main",
"page",
"posts",
"shadow",
}
A list of strings that can be considered LIKELY candidates when extracting content from a resource. Essentially, the inverse of the blacklist above — if something matches both the blacklist and the whitelist, it is kept. This is useful, for example, if something has a className of "rss-content entry-content": it matches 'rss', so it would normally be removed; however, it is also the entry content, so it should be left alone.
These strings are joined together and then tested for existence using regex, so may contain simple, non-pipe style regular expression queries if necessary.
var UNLIKELY_RE = regexp.MustCompile(`(?i)!(` + candidatesWhitelist + `)|(` + candidatesBlacklist + `)`)
var WHITELIST_ATTRS = []string{
"src",
"srcset",
"sizes",
"type",
"href",
"class",
"id",
"alt",
"xlink:href",
"width",
"height",
}
var WHITELIST_ATTRS_RE = regexp.MustCompile(`(?i)^(src|srcset|sizes|type|href|class|id|alt|xlink:href|width|height)$`)
Functions ¶
func ArticleBaseURL ¶
ArticleBaseURL extracts the base URL for the article, removing fragments and query parameters
func BrsToPs ¶
BrsToPs converts consecutive <br /> tags into <p /> tags JavaScript implementation: src/utils/dom/brs-to-ps.js
Given goquery Document, convert consecutive <br /> tags into <p /> tags instead. The algorithm exactly matches JavaScript: 1. Iterate through all BR elements 2. If next element is also BR, set collapsing=true and remove current BR 3. If collapsing and current BR is NOT followed by another BR, call paragraphize on that last BR
func CleanAttributes ¶
CleanAttributes removes unwanted attributes from elements and keeps only whitelisted ones
func CleanHOnes ¶
CleanHOnes processes H1 tags in a document based on their count. H1 tags are typically the article title, which should be extracted by the title extractor instead. If there are fewer than 3 of them (<3), strip them. Otherwise, turn them into H2s.
This preserves content structure when there are multiple H1s that likely represent section headers rather than the main title.
:param doc: A goquery Document to process :return: The modified goquery Document (returned for convenience, mutation is in-place)
func CleanHeaders ¶
CleanHeaders removes headers that don't meet certain criteria This exactly matches the JavaScript implementation with 3 removal conditions: 1. Headers appearing before all <p> tags (likely title/subtitle) 2. Headers that exactly match the article title 3. Headers with negative content weight (likely ads/junk)
func CleanHeadersWithoutTitle ¶
CleanHeadersWithoutTitle is a convenience function for when title is not available
func CleanImages ¶
CleanImages removes images that are likely spacers, ads, or decorative This exactly matches the JavaScript implementation with proper size thresholds
func CleanTags ¶
CleanTags conditionally removes elements based on their content and context This exactly matches the JavaScript cleanTags implementation JavaScript: export default function cleanTags($article, $)
func ConvertNodeTo ¶
ConvertNodeTo converts a node to a different tag type while preserving attributes and content
func ConvertToParagraphs ¶
ConvertToParagraphs loops through the provided doc, and converts any p-like elements to actual paragraph tags.
Things fitting this criteria: * Multiple consecutive <br /> tags. * <div /> tags without block level elements inside of them * <span /> tags who are not children of <p /> or <div /> tags.
:param doc: A goquery Document to search :return: goquery Document with new p elements (By-reference mutation, though. Returned just for convenience.)
func CountSentences ¶
CountSentences estimates the number of sentences in text
func DetectTextDirection ¶
DetectTextDirection attempts to detect the text direction (LTR/RTL) of content
func ExtractFromMeta ¶
func ExtractFromMeta(doc *goquery.Document, metaNames []string, cachedNames []string, cleanTags bool) *string
ExtractFromMeta extracts content from HTML meta tags Given a list of meta tag names to search for, find a meta tag associated. This function provides 100% JavaScript compatibility.
func ExtractFromSelectors ¶
func ExtractFromSelectors(doc *goquery.Selection, selectors []string, maxChildren int, textOnly bool) *string
ExtractFromSelectors finds content that may be extractable from the document using CSS selectors. This is for flat meta-information, like author, title, date published, etc.
Parameters: - doc: The goquery document/selection to search within - selectors: List of CSS selectors to try in order - maxChildren: Maximum number of child elements allowed (default 1) - textOnly: If true, extract text content; if false, extract HTML (default true)
Returns: - *string: The extracted content, or nil if nothing suitable found
func FindTopCandidate ¶
FindTopCandidate finds the element with the highest score after calculating all scores After we've calculated scores, loop through all of the possible candidate nodes we found and find the one with the highest score. JavaScript: export default function findTopCandidate($)
func GetAttr ¶
GetAttr is a convenience function to get a single attribute It's equivalent to selection.Attr() but provides consistent behavior
func GetAttrs ¶
GetAttrs returns all attributes of a goquery node as a map This mimics the JavaScript getAttrs function that works with both cheerio's attribs and browser DOM attributes
func GetBaseDomain ¶
GetBaseDomain extracts the base domain (removing subdomains) from a URL
func GetContentScore ¶
GetContentScore calculates a basic content score for an element
func GetRemoveAttrSelectors ¶
func GetRemoveAttrSelectors() []string
func GetRemoveEmptySelectors ¶
func GetRemoveEmptySelectors() []string
func GetWeight ¶
GetWeight scores a node based on its className and id JavaScript: function getWeight(node)
func HasSentenceEnd ¶
HasSentenceEnd checks if text ends with proper sentence punctuation
func IsLikelyArticleElement ¶
IsLikelyArticleElement checks if an element is likely to contain article content
func IsWordpress ¶
IsWordpress detects if a page is likely WordPress-based
func LinkDensity ¶
LinkDensity calculates the density of links in an element Returns the ratio of link text length to total text length
func MakeLinksAbsolute ¶
MakeLinksAbsolute converts all relative URLs in the document to absolute URLs This exactly matches the JavaScript makeLinksAbsolute implementation JavaScript: export default function makeLinksAbsolute($content, $, url)
func MarkToKeep ¶
MarkToKeep marks important elements that should be preserved during cleaning
func MergeSiblings ¶
func MergeSiblings(candidate *goquery.Selection, topScore int, doc *goquery.Document) *goquery.Selection
MergeSiblings merges sibling elements that may be part of the main content Now that we have a top_candidate, look through the siblings of it to see if any of them are decently scored. JavaScript: export default function mergeSiblings($candidate, topScore, $)
func NodeIsSufficient ¶
NodeIsSufficient determines if a node has enough content to be considered sufficient This exactly matches the JavaScript nodeIsSufficient implementation JavaScript: export default function nodeIsSufficient($node) { return $node.text().trim().length >= 100; }
func RemoveAnchor ¶
RemoveAnchor removes the anchor/fragment from a URL
func RemoveAttr ¶
RemoveAttr is a convenience function to remove an attribute
func RemoveEmpty ¶
RemoveEmpty removes elements that are empty or contain only whitespace
func RewriteTopLevel ¶
RewriteTopLevel rewrites the tag name to div if it's a top level node like body or html to avoid later complications with multiple body tags. This is a faithful port of the JavaScript rewriteTopLevel function.
func SanitizeURL ¶
SanitizeURL cleans up a URL by removing tracking parameters and normalizing
func ScoreContent ¶
ScoreContent orchestrates the entire content scoring process JavaScript: export default function scoreContent($, weightNodes = true)
func SetAttr ¶
SetAttr sets a single attribute on a DOM node This function mirrors the JavaScript setAttr behavior, handling goquery selections which are equivalent to cheerio nodes in the original JavaScript implementation.
In JavaScript, this function handled two cases: 1. Cheerio nodes (with attribs property) - equivalent to our goquery selections 2. Browser DOM nodes (with setAttribute method) - not applicable in Go/server environment
Parameters:
- selection: The goquery selection to modify
- attr: The attribute name to set
- val: The attribute value to set
Returns:
- The modified goquery selection for method chaining
func StripJunkTags ¶
StripJunkTags removes unwanted elements like scripts, styles, etc.
func StripTags ¶
StripTags removes all HTML tags from a string of text Returns plain text content with all HTML tags removed Removes non-content elements (script, style, noscript, head, meta, link) and HTML comments If the result is empty, returns the original text (JavaScript behavior)
func StripUnlikelyCandidates ¶
StripUnlikelyCandidates loops through the provided document and removes any non-link nodes that are unlikely candidates for article content.
Links are ignored because there are very often links to content that are identified as non-body-content, but may be inside article-like content.
:param doc: a goquery Document to strip nodes from :return: the cleaned goquery Document
func ValidateURL ¶
ValidateURL checks if a URL is valid and well-formed
func WithinComment ¶
WithinComment checks if an element is within a comment section
Types ¶
This section is empty.