Documentation
¶
Index ¶
- Constants
- Variables
- func CleanContent(article *goquery.Selection, opts CleanContentOptions) *goquery.Selection
- func DirectionExtractor(params ExtractorParams) (string, error)
- func ExtractBestNode(doc *goquery.Document, opts ExtractBestNodeOptions) *goquery.Selection
- func GetDirection(input interface{}) (string, error)
- func NodeIsSufficient(node *goquery.Selection) bool
- type CleanContentOptions
- type ExtractBestNodeOptions
- type ExtractionOptions
- type ExtractionResult
- type ExtractorImageParams
- type ExtractorOptions
- type ExtractorParams
- type GenericAuthorExtractor
- type GenericContentExtractor
- func (e *GenericContentExtractor) CleanAndReturnNode(node *goquery.Selection, doc *goquery.Document) string
- func (e *GenericContentExtractor) Extract(params ExtractorParams, opts ExtractorOptions) string
- func (e *GenericContentExtractor) GetContentNode(doc *goquery.Document, title, url string, opts ExtractorOptions) *goquery.Selection
- type GenericDateExtractorType
- type GenericDekExtractor
- type GenericDescriptionExtractor
- type GenericExcerptExtractor
- type GenericExtractor
- type GenericFaviconExtractor
- type GenericLanguageExtractor
- type GenericLeadImageExtractor
- type GenericNextPageUrlExtractor
- type GenericSiteImageExtractor
- type GenericSiteNameExtractor
- type GenericSiteTitleExtractor
- type GenericThemeColorExtractor
- type GenericVideoExtractor
- type RTLScriptRange
- type URLResult
- type VideoMetadata
Constants ¶
const ( LTRMark = "\u200e" // Left-to-right mark RTLMark = "\u200f" // Right-to-left mark LTR = "ltr" // Left to right direction content RTL = "rtl" // Right to left direction content BIDI = "bidi" // Both directions - bidirectional content NODI = "" // No direction - empty string for no detectable direction )
Direction constants matching JavaScript string-direction library exactly
const AUTHOR_MAX_LENGTH = 300
AUTHOR_MAX_LENGTH - maximum length for valid author names
Variables ¶
var ( MS_DATE_STRING = regexp.MustCompile(`^\d{13}$`) SEC_DATE_STRING = regexp.MustCompile(`^\d{10}$`) CLEAN_DATE_STRING_RE = regexp.MustCompile(`^\s*published\s*:?\s*(.*)`) TIME_MERIDIAN_SPACE_RE = regexp.MustCompile(`(.*\d)(am|pm)(.*)`) TIME_MERIDIAN_DOTS_RE = regexp.MustCompile(`\.m\.`) TIME_NOW_STRING = regexp.MustCompile(`^\s*(just|right)?\s*now\s*`) TIME_WITH_OFFSET_RE = regexp.MustCompile(`-\d{3,4}$`) )
JavaScript date cleaner constants (ported from cleaners/constants.js)
var ( POSITIVE_LEAD_IMAGE_URL_HINTS_RE = regexp.MustCompile("(?i)" + strings.Join(POSITIVE_LEAD_IMAGE_URL_HINTS, "|")) NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = regexp.MustCompile("(?i)" + strings.Join(NEGATIVE_LEAD_IMAGE_URL_HINTS, "|")) GIF_RE = regexp.MustCompile(`(?i)\.gif(\?.*)?$`) JPG_RE = regexp.MustCompile(`(?i)\.jpe?g(\?.*)?$`) PHOTO_HINTS_RE = regexp.MustCompile(`(?i)figure|photo|image|caption`) // From constants.go )
Compiled regexes for URL scoring
var ( // DIGIT_RE matches any digit character DIGIT_RE = regexp.MustCompile(`\d`) // EXTRANEOUS_LINK_HINTS are words that indicate a link is probably not a next page EXTRANEOUS_LINK_HINTS = []string{ "print", "archive", "comment", "discuss", "e-mail", "email", "share", "reply", "all", "login", "sign", "single", "adx", "entry-unrelated", } EXTRANEOUS_LINK_HINTS_RE = regexp.MustCompile(`(?i)` + strings.Join(EXTRANEOUS_LINK_HINTS, "|")) // NEXT_LINK_TEXT_RE matches text that likely indicates a next page link NEXT_LINK_TEXT_RE = regexp.MustCompile(`(?i)(next|weiter|continue|>([^|]|$)|»([^|]|$))`) // CAP_LINK_TEXT_RE matches text that indicates end links (first, last, etc.) CAP_LINK_TEXT_RE = regexp.MustCompile(`(?i)(first|last|end)`) // PREV_LINK_TEXT_RE matches text that indicates previous page links PREV_LINK_TEXT_RE = regexp.MustCompile(`(?i)(prev|earl|old|new|<|«)`) // PAGE_RE matches pagination-related text PAGE_RE = regexp.MustCompile(`(?i)pag(e|ing|inat)`) )
Constants from JavaScript implementation
var ( // An ordered list of meta tag names that denote likely article titles. // All attributes should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. STRONG_TITLE_META_TAGS = []string{ "tweetmeme-title", "dc.title", "rbtitle", "headline", "title", } // og:title is weak because it typically contains context that we don't like, // for example the source site's name. Gotta get that brand into facebook! WEAK_TITLE_META_TAGS = []string{ "og:title", } // An ordered list of CSS Selectors to find likely article titles. // From most explicit to least explicit. // // Note - this does not use classes like CSS. This checks to see if the string // exists in the className, which is not as accurate as .className (which // splits on spaces/endlines), but for our purposes it's close enough. STRONG_TITLE_SELECTORS = []string{ ".hentry .entry-title", "h1#articleHeader", "h1.articleHeader", "h1.article", ".instapaper_title", "#meebo-title", } WEAK_TITLE_SELECTORS = []string{ "article h1", "#entry-title", ".entry-title", "#entryTitle", "#entrytitle", ".entryTitle", ".entrytitle", "#articleTitle", ".articleTitle", "post post-title", "h1.title", "h2.article", "h1", "html head title", "title", } // Regular expression for title separators TITLE_SPLITTERS_RE = regexp.MustCompile(`(: | - | \| )`) // Domain endings regex for cleaning DOMAIN_ENDINGS_RE = regexp.MustCompile(`\.com$|\.net$|\.org$|\.co\.uk$`) )
Title extraction constants matching JavaScript behavior exactly
var AUTHOR_META_TAGS = []string{
"byl",
"clmst",
"dc.author",
"dcsext.author",
"dc.creator",
"rbauthors",
"authors",
}
AUTHOR_META_TAGS - ordered list of meta tag names that denote likely article authors From most distinct to least distinct. Note: "author" is too often the developer of the page, so it is not included here.
var AUTHOR_SELECTORS = []string{
".entry .entry-author",
".author.vcard .fn",
".author .vcard .fn",
".byline.vcard .fn",
".byline .vcard .fn",
".byline .by .author",
".byline .by",
".byline .author",
".post-author.vcard",
".post-author .vcard",
"a[rel=author]",
"#by_author",
".by_author",
"#entryAuthor",
".entryAuthor",
".byline a[href*=author]",
"#author .authorname",
".author .authorname",
"#author",
".author",
".articleauthor",
".ArticleAuthor",
".byline",
}
AUTHOR_SELECTORS - ordered list of CSS selectors to find likely article authors From most explicit to least explicit. Uses class substring matching like JavaScript.
var BYLINE_SELECTORS_RE = [][2]interface{}{
{"#byline", bylineRe},
{".byline", bylineRe},
}
var ( // CANONICAL_META_SELECTORS - meta tag names for canonical URL extraction // From JavaScript: export const CANONICAL_META_SELECTORS = ['og:url']; CANONICAL_META_SELECTORS = []string{ "og:url", } )
URL extraction constants matching JavaScript behavior exactly
var CLEAN_AUTHOR_RE = regexp.MustCompile(`(?i)^\s*(posted |written )?by\s*:?\s*(.*)`)
CLEAN_AUTHOR_RE - regex for cleaning author prefixes Matches /^\s*(posted |written )?by\s*:?\s*(.*)/i from JavaScript
var DATE_PUBLISHED_META_TAGS = []string{
"article:published_time",
"displaydate",
"dc.date",
"dc.date.issued",
"rbpubdate",
"publish_date",
"pub_date",
"pagedate",
"pubdate",
"revision_date",
"doc_date",
"date_created",
"content_create_date",
"lastmodified",
"created",
"date",
}
DATE_PUBLISHED_META_TAGS - Ordered list of meta tag names that denote likely date published dates All attributes should be lowercase for faster case-insensitive matching From most distinct to least distinct (matches JavaScript exactly)
var DATE_PUBLISHED_SELECTORS = []string{
".hentry .dtstamp.published",
".hentry .published",
".hentry .dtstamp.updated",
".hentry .updated",
".single .published",
".meta .published",
".meta .postDate",
".entry-date",
".byline .date",
".postmetadata .date",
".article_datetime",
".date-header",
".story-date",
".dateStamp",
"#story .datetime",
".dateline",
".pubdate",
}
DATE_PUBLISHED_SELECTORS - Ordered list of CSS selectors to find likely date published dates From most explicit to least explicit (matches JavaScript exactly)
var DATE_PUBLISHED_URL_RES = []*regexp.Regexp{ regexp.MustCompile(`/(20\d{2}/\d{2}/\d{2})/`), regexp.MustCompile(`(20\d{2}-[01]\d-[0-3]\d)`), regexp.MustCompile(`/(20\d{2}/(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/[0-3]\d)/`), }
DATE_PUBLISHED_URL_RES - Ordered list of compiled regular expressions to find likely date published dates from the URL. These should always have the first reference be a date string that is parseable. Matches JavaScript exactly.
var EXCERPT_META_SELECTORS = []string{"og:description", "twitter:description"}
EXCERPT_META_SELECTORS defines the meta tag names to search for excerpt content This matches the JavaScript constants exactly: ['og:description', 'twitter:description']
var GenericDateExtractor = GenericDateExtractorType{}
var GenericTitleExtractor = struct { Extract func(doc *goquery.Selection, url string, metaCache []string) string }{ Extract: func(doc *goquery.Selection, url string, metaCache []string) string { html := "<html></html>" if doc.Length() > 0 { if fullHtml, err := doc.Html(); err == nil && fullHtml != "" { html = "<html>" + fullHtml + "</html>" } else { if doc.Parent().Length() > 0 { if parentHtml, err := doc.Parent().Html(); err == nil { html = "<html>" + parentHtml + "</html>" } } } } else { return "" } document, err := goquery.NewDocumentFromReader(strings.NewReader(html)) if err != nil { return "" } title := dom.ExtractFromMeta(document, STRONG_TITLE_META_TAGS, metaCache, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } title = dom.ExtractFromSelectors(doc, STRONG_TITLE_SELECTORS, 1, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } title = dom.ExtractFromMeta(document, WEAK_TITLE_META_TAGS, metaCache, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } title = dom.ExtractFromSelectors(doc, WEAK_TITLE_SELECTORS, 1, true) if title != nil && *title != "" { return cleanTitle(*title, url, doc) } return "" }, }
GenericTitleExtractor extracts article titles using multiple fallback strategies
var GenericUrlExtractor = struct { Extract func(doc *goquery.Selection, url string, metaCache []string) URLResult }{ Extract: func(doc *goquery.Selection, url string, metaCache []string) URLResult { canonical := doc.Find("link[rel=canonical]") if canonical.Length() != 0 { href, exists := canonical.Attr("href") if exists && href != "" { return result(href) } } // Second, check for canonical URL in meta tags // Need to convert selection to document for meta tag extraction var document *goquery.Document if doc.Is("html") { if docNode := doc.Get(0); docNode != nil { document = goquery.NewDocumentFromNode(docNode) } } if document == nil { if html, err := doc.Html(); err == nil { fullHTML := html if !containsHTML(html) { fullHTML = "<html>" + html + "</html>" } if tempDoc, err := goquery.NewDocumentFromReader(strings.NewReader(fullHTML)); err == nil { document = tempDoc } } } if document != nil { metaURL := dom.ExtractFromMeta(document, CANONICAL_META_SELECTORS, metaCache, false) if metaURL != nil && *metaURL != "" { return result(*metaURL) } } return result(url) }, }
GenericUrlExtractor provides URL extraction functionality matching JavaScript exactly
var GenericWordCountExtractor = struct { Extract func(options map[string]interface{}) int }{ Extract: func(options map[string]interface{}) int { if options == nil { return 1 } contentInterface, exists := options["content"] if !exists { return 1 } content, ok := contentInterface.(string) if !ok { return 1 } count := getWordCount(content) if count == 1 { count = getWordCountAlt(content) } return count }, }
GenericWordCountExtractor extracts word count from content using JavaScript-compatible logic
var LEAD_IMAGE_URL_META_TAGS = []string{
"og:image",
"twitter:image",
"image_src",
}
Lead image URL meta tags in priority order (most distinct first)
var LEAD_IMAGE_URL_SELECTORS = []string{
"link[rel=image_src]",
}
Fallback selectors for lead image extraction
var NEGATIVE_LEAD_IMAGE_URL_HINTS = []string{
"spacer", "sprite", "blank", "throbber", "gradient", "tile", "bg",
"background", "icon", "social", "header", "hdr", "advert", "spinner",
"loader", "loading", "default", "rating", "share", "facebook",
"twitter", "theme", "promo", "ads", "wp-includes",
}
Negative hints that decrease image score
var POSITIVE_LEAD_IMAGE_URL_HINTS = []string{
"upload",
"wp-content",
"large",
"photo",
"wp-image",
}
Positive hints that increase image score
var SPLIT_DATE_STRING = regexp.MustCompile(`(?i)([0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?)|([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})|(-[0-9]{3,4}$)|([0-9]{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)`)
SPLIT_DATE_STRING regex for splitting date components (matches JavaScript exactly with case-insensitive)
var TIME_AGO_STRING = regexp.MustCompile(`(\d+)\s+(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s+ago`)
TIME_AGO_STRING regex for parsing relative dates (X minutes ago, etc.)
Functions ¶
func CleanContent ¶
func CleanContent(article *goquery.Selection, opts CleanContentOptions) *goquery.Selection
CleanContent cleans article content, returning a new, cleaned node This adapts the JavaScript extractCleanNode function to work with Go's document-based DOM functions
func DirectionExtractor ¶
func DirectionExtractor(params ExtractorParams) (string, error)
DirectionExtractor extracts text direction from title field only Matches JavaScript: direction: ({ title }) => stringDirection.getDirection(title)
func ExtractBestNode ¶
func ExtractBestNode(doc *goquery.Document, opts ExtractBestNodeOptions) *goquery.Selection
ExtractBestNode extracts the content most likely to be article text using a variety of scoring techniques.
The function orchestrates the complete extraction pipeline: 1. Optionally strips unlikely candidates (comments, ads, etc.) 2. Converts elements to paragraphs for better scoring 3. Scores all content based on various signals 4. Finds and returns the top candidate element
This is a direct port of the JavaScript extractBestNode function with 100% compatibility.
Parameters:
- doc: A goquery Document representing the DOM to extract from
- opts: ExtractBestNodeOptions with configuration flags
- StripUnlikelyCandidates: If true, remove elements that match exclusion criteria
- WeightNodes: If true, use classNames and IDs to determine node worthiness
Returns:
- *goquery.Selection: The top candidate element, or nil if no suitable content found
func GetDirection ¶
GetDirection analyzes string direction and returns 'ltr', 'rtl', 'bidi', or '' (empty string). Direct port of JavaScript stringDirection.getDirection() function
func NodeIsSufficient ¶
NodeIsSufficient determines if a node has enough content to be considered article-like Given a node, determine if it's article-like enough to return Direct port of JavaScript nodeIsSufficient function
Types ¶
type CleanContentOptions ¶
type CleanContentOptions struct {
Doc *goquery.Document
CleanConditionally bool
Title string
URL string
DefaultCleaner bool
}
CleanContentOptions represents options for content cleaning
type ExtractBestNodeOptions ¶
ExtractBestNodeOptions represents configuration options for content extraction
type ExtractionOptions ¶
type ExtractionOptions struct {
URL string
HTML string
Doc *goquery.Document
MetaCache []string
Fallback bool
ContentType string
}
ExtractionOptions contains all parameters needed for extraction Matches JavaScript options object structure
type ExtractionResult ¶
type ExtractionResult struct {
Title string `json:"title"`
Author string `json:"author"`
DatePublished *time.Time `json:"date_published"` // null if not found
Dek string `json:"dek"`
LeadImageURL string `json:"lead_image_url"`
Content string `json:"content"`
NextPageURL string `json:"next_page_url"`
URL string `json:"url"`
Domain string `json:"domain"`
Excerpt string `json:"excerpt"`
WordCount int `json:"word_count"`
Direction string `json:"direction"`
SiteName string `json:"site_name"`
SiteTitle string `json:"site_title"`
SiteImage string `json:"site_image"`
Favicon string `json:"favicon"`
}
ExtractionResult represents the complete result from generic extraction Matches JavaScript extraction result structure exactly
type ExtractorImageParams ¶
type ExtractorImageParams struct {
Doc *goquery.Document
Content string
MetaCache map[string]string
HTML string
}
ExtractorImageParams contains parameters for image extraction
type ExtractorOptions ¶
type ExtractorOptions struct {
StripUnlikelyCandidates bool
WeightNodes bool
CleanConditionally bool
}
ExtractorOptions represents configuration options for content extraction
type ExtractorParams ¶
ExtractorParams contains all the parameters needed for extraction
type GenericAuthorExtractor ¶
type GenericAuthorExtractor struct{}
GenericAuthorExtractor provides author extraction functionality
type GenericContentExtractor ¶
type GenericContentExtractor struct {
DefaultOpts ExtractorOptions
}
GenericContentExtractor implements the main content extraction logic
func NewGenericContentExtractor ¶
func NewGenericContentExtractor() *GenericContentExtractor
NewGenericContentExtractor creates a new extractor with default options
func (*GenericContentExtractor) CleanAndReturnNode ¶
func (e *GenericContentExtractor) CleanAndReturnNode(node *goquery.Selection, doc *goquery.Document) string
CleanAndReturnNode finalizes the content by ensuring we have something and normalizing spaces Once we got here, either we're at our last-resort node, or we broke early. Make sure we at least have -something- before we move forward.
func (*GenericContentExtractor) Extract ¶
func (e *GenericContentExtractor) Extract(params ExtractorParams, opts ExtractorOptions) string
Extract extracts the content for this resource - initially, pass in the most restrictive opts which will return the highest quality content. On each failure, retry with slightly more lax opts.
The function implements the JavaScript extraction strategy: 1. Try with default strict options 2. If content is insufficient, cascade through options, disabling them one by one 3. Return the best content found
This matches the JavaScript behavior exactly for option cascading and content validation.
func (*GenericContentExtractor) GetContentNode ¶
func (e *GenericContentExtractor) GetContentNode(doc *goquery.Document, title, url string, opts ExtractorOptions) *goquery.Selection
GetContentNode gets the content node given current options This orchestrates the extraction pipeline: extract best node -> clean content
type GenericDateExtractorType ¶
type GenericDateExtractorType struct{}
GenericDateExtractor - Extractor for publication dates with 100% JavaScript compatibility
type GenericDekExtractor ¶
type GenericDekExtractor struct{}
GenericDekExtractor extracts article subtitles/descriptions (deks)
type GenericDescriptionExtractor ¶
type GenericDescriptionExtractor struct{}
GenericDescriptionExtractor extracts site descriptions
type GenericExcerptExtractor ¶
type GenericExcerptExtractor struct{}
GenericExcerptExtractor implements excerpt extraction logic
func NewGenericExcerptExtractor ¶
func NewGenericExcerptExtractor() *GenericExcerptExtractor
NewGenericExcerptExtractor creates a new excerpt extractor
type GenericExtractor ¶
type GenericExtractor struct {
Domain string
}
GenericExtractor coordinates individual field extractors This is NOT an implementation of parser.Extractor interface It's used internally by the parser package for generic extraction
func NewGenericExtractor ¶
func NewGenericExtractor() *GenericExtractor
NewGenericExtractor creates a new generic extractor instance
func (*GenericExtractor) ExtractGeneric ¶
func (ge *GenericExtractor) ExtractGeneric(options *ExtractionOptions) (*ExtractionResult, error)
ExtractGeneric performs the main generic extraction with full options
func (*GenericExtractor) GetDomain ¶
func (ge *GenericExtractor) GetDomain() string
GetDomain returns the domain this extractor handles
type GenericFaviconExtractor ¶
type GenericFaviconExtractor struct{}
GenericFaviconExtractor extracts the favicon URL
type GenericLanguageExtractor ¶
type GenericLanguageExtractor struct{}
GenericLanguageExtractor extracts content language information
type GenericLeadImageExtractor ¶
type GenericLeadImageExtractor struct{}
GenericLeadImageExtractor implements lead image extraction logic
func NewGenericLeadImageExtractor ¶
func NewGenericLeadImageExtractor() *GenericLeadImageExtractor
NewGenericLeadImageExtractor creates a new lead image extractor
func (*GenericLeadImageExtractor) Extract ¶
func (e *GenericLeadImageExtractor) Extract(params ExtractorImageParams) *string
Extract finds the lead image URL from the document using scoring and fallback strategies Matches JavaScript behavior: meta tags → content images → fallback selectors
type GenericNextPageUrlExtractor ¶
type GenericNextPageUrlExtractor struct{}
GenericNextPageUrlExtractor extracts next page URLs for multi-page articles
func NewGenericNextPageUrlExtractor ¶
func NewGenericNextPageUrlExtractor() *GenericNextPageUrlExtractor
NewGenericNextPageUrlExtractor creates a new instance
type GenericSiteImageExtractor ¶
type GenericSiteImageExtractor struct{}
GenericSiteImageExtractor extracts the main site image
type GenericSiteNameExtractor ¶
type GenericSiteNameExtractor struct{}
GenericSiteNameExtractor extracts the site name from meta tags
type GenericSiteTitleExtractor ¶
type GenericSiteTitleExtractor struct{}
GenericSiteTitleExtractor extracts the site title
type GenericThemeColorExtractor ¶
type GenericThemeColorExtractor struct{}
GenericThemeColorExtractor extracts the theme color from meta tags
type GenericVideoExtractor ¶
type GenericVideoExtractor struct{}
GenericVideoExtractor extracts video metadata from Open Graph and other meta tags
func (*GenericVideoExtractor) Extract ¶
func (extractor *GenericVideoExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) *VideoMetadata
Extract extracts video metadata from the page
func (*GenericVideoExtractor) ExtractVideoURL ¶
func (extractor *GenericVideoExtractor) ExtractVideoURL(selection *goquery.Selection, pageURL string, metaCache []string) string
ExtractVideoURL is a convenience function that returns just the primary video URL
type RTLScriptRange ¶
type RTLScriptRange struct {
From int // Starting Unicode code point
To int // Ending Unicode code point
}
RTLScriptRange represents a Unicode block range for RTL scripts
type VideoMetadata ¶
type VideoMetadata struct {
URL string `json:"url,omitempty"`
Type string `json:"type,omitempty"`
Width int `json:"width,omitempty"`
Height int `json:"height,omitempty"`
Duration int `json:"duration,omitempty"`
SecureURL string `json:"secure_url,omitempty"`
}
VideoMetadata contains structured video metadata