generic

package
v1.0.6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 31, 2025 License: MIT Imports: 13 Imported by: 0

Documentation

Index

Constants

View Source
// Direction marks and direction labels used when classifying text direction.
const (
	// LTRMark is the Unicode left-to-right mark.
	LTRMark = "\u200e"
	// RTLMark is the Unicode right-to-left mark.
	RTLMark = "\u200f"

	// LTR labels content that reads left to right.
	LTR = "ltr"
	// RTL labels content that reads right to left.
	RTL = "rtl"
	// BIDI labels bidirectional content (both directions present).
	BIDI = "bidi"
	// NODI is the empty label used when no direction is detectable.
	NODI = ""
)

Direction constants matching JavaScript string-direction library exactly

View Source
// AUTHOR_MAX_LENGTH is the maximum length for a valid author name.
// NOTE(review): whether this is counted in bytes or runes is decided by the
// caller, which is not visible here — confirm.
const AUTHOR_MAX_LENGTH = 300

AUTHOR_MAX_LENGTH - maximum length for valid author names

Variables

View Source
// Date-cleaner patterns ported from the JavaScript cleaners/constants.js.
//
// The patterns that match words or meridian markers carry the (?i) flag so
// that inputs such as "Published: ...", "8:30PM", "P.M." and "Just now" are
// recognised regardless of letter case, in line with the other text patterns
// in this package (CLEAN_AUTHOR_RE, SPLIT_DATE_STRING). Capture-group numbering
// is unaffected by the flag.
var (
	// MS_DATE_STRING matches a 13-digit string (a millisecond Unix timestamp).
	MS_DATE_STRING = regexp.MustCompile(`^\d{13}$`)

	// SEC_DATE_STRING matches a 10-digit string (a second-resolution Unix timestamp).
	SEC_DATE_STRING = regexp.MustCompile(`^\d{10}$`)

	// CLEAN_DATE_STRING_RE matches a leading "published" prefix with an
	// optional colon; group 1 captures the remainder of the string.
	CLEAN_DATE_STRING_RE = regexp.MustCompile(`(?i)^\s*published\s*:?\s*(.*)`)

	// TIME_MERIDIAN_SPACE_RE matches an am/pm marker that directly follows a
	// digit. Groups: 1 = text up to and including the digit, 2 = the marker,
	// 3 = the rest.
	TIME_MERIDIAN_SPACE_RE = regexp.MustCompile(`(?i)(.*\d)(am|pm)(.*)`)

	// TIME_MERIDIAN_DOTS_RE matches the ".m." tail of a dotted meridian such
	// as "a.m." or "P.M.".
	TIME_MERIDIAN_DOTS_RE = regexp.MustCompile(`(?i)\.m\.`)

	// TIME_NOW_STRING matches strings that start with "now", "just now" or
	// "right now".
	TIME_NOW_STRING = regexp.MustCompile(`(?i)^\s*(just|right)?\s*now\s*`)

	// TIME_WITH_OFFSET_RE matches a trailing numeric offset such as "-0400".
	TIME_WITH_OFFSET_RE = regexp.MustCompile(`-\d{3,4}$`)
)

JavaScript date cleaner constants (ported from cleaners/constants.js)

View Source
// Compiled, case-insensitive patterns used when scoring candidate image URLs.
var (
	// POSITIVE_LEAD_IMAGE_URL_HINTS_RE is the alternation of every entry in
	// POSITIVE_LEAD_IMAGE_URL_HINTS.
	POSITIVE_LEAD_IMAGE_URL_HINTS_RE = regexp.MustCompile(`(?i)` + strings.Join(POSITIVE_LEAD_IMAGE_URL_HINTS, `|`))

	// NEGATIVE_LEAD_IMAGE_URL_HINTS_RE is the alternation of every entry in
	// NEGATIVE_LEAD_IMAGE_URL_HINTS.
	NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = regexp.MustCompile(`(?i)` + strings.Join(NEGATIVE_LEAD_IMAGE_URL_HINTS, `|`))

	// GIF_RE matches a ".gif" ending, optionally followed by a query string.
	GIF_RE = regexp.MustCompile(`(?i)\.gif(\?.*)?$`)

	// JPG_RE matches a ".jpg" or ".jpeg" ending, optionally followed by a
	// query string.
	JPG_RE = regexp.MustCompile(`(?i)\.jpe?g(\?.*)?$`)

	// PHOTO_HINTS_RE matches words that hint at photographic content
	// (from constants.go).
	PHOTO_HINTS_RE = regexp.MustCompile(`(?i)figure|photo|image|caption`)
)

Compiled regexes for URL scoring

View Source
// Patterns and word lists used to classify pagination links.
var (
	// DIGIT_RE finds a single digit anywhere in the input.
	DIGIT_RE = regexp.MustCompile(`\d`)

	// EXTRANEOUS_LINK_HINTS holds words suggesting that a link is probably
	// not a next-page link.
	EXTRANEOUS_LINK_HINTS = []string{
		"print",
		"archive",
		"comment",
		"discuss",
		"e-mail",
		"email",
		"share",
		"reply",
		"all",
		"login",
		"sign",
		"single",
		"adx",
		"entry-unrelated",
	}

	// EXTRANEOUS_LINK_HINTS_RE is the case-insensitive alternation of
	// EXTRANEOUS_LINK_HINTS.
	EXTRANEOUS_LINK_HINTS_RE = regexp.MustCompile("(?i)" + strings.Join(EXTRANEOUS_LINK_HINTS, "|"))

	// NEXT_LINK_TEXT_RE recognises link text that likely points at the next
	// page.
	NEXT_LINK_TEXT_RE = regexp.MustCompile(`(?i)(next|weiter|continue|>([^|]|$)|»([^|]|$))`)

	// CAP_LINK_TEXT_RE recognises "end cap" link text (first, last, end).
	CAP_LINK_TEXT_RE = regexp.MustCompile(`(?i)(first|last|end)`)

	// PREV_LINK_TEXT_RE recognises link text that points at a previous page.
	PREV_LINK_TEXT_RE = regexp.MustCompile(`(?i)(prev|earl|old|new|<|«)`)

	// PAGE_RE recognises pagination-related words (page, paging, pagination).
	PAGE_RE = regexp.MustCompile(`(?i)pag(e|ing|inat)`)
)

Constants from JavaScript implementation

View Source
// Lookup tables and patterns used for title extraction.
var (
	// STRONG_TITLE_META_TAGS lists, from most to least distinct, the meta tag
	// names that likely carry the article title. Entries are lowercase so
	// they can be matched case-insensitively without extra work.
	STRONG_TITLE_META_TAGS = []string{
		"tweetmeme-title", "dc.title", "rbtitle", "headline", "title",
	}

	// WEAK_TITLE_META_TAGS holds og:title, which is considered weak because
	// it typically carries extra context such as the source site's name.
	WEAK_TITLE_META_TAGS = []string{"og:title"}

	// STRONG_TITLE_SELECTORS lists, from most to least explicit, the CSS
	// selectors that likely contain the article title.
	//
	// Note: classes are not treated the way CSS treats them. The check is
	// whether the string occurs in the className, which is less accurate
	// than a real class match (split on spaces/line ends) but close enough
	// for this purpose.
	STRONG_TITLE_SELECTORS = []string{
		".hentry .entry-title",
		"h1#articleHeader", "h1.articleHeader", "h1.article",
		".instapaper_title",
		"#meebo-title",
	}

	// WEAK_TITLE_SELECTORS lists the lower-confidence title selectors, in
	// the order they are tried.
	WEAK_TITLE_SELECTORS = []string{
		"article h1",
		"#entry-title", ".entry-title",
		"#entryTitle", "#entrytitle", ".entryTitle", ".entrytitle",
		"#articleTitle", ".articleTitle",
		"post post-title",
		"h1.title", "h2.article", "h1",
		"html head title", "title",
	}

	// TITLE_SPLITTERS_RE matches the separators ": ", " - " and " | ".
	TITLE_SPLITTERS_RE = regexp.MustCompile(`(: | - | \| )`)

	// DOMAIN_ENDINGS_RE matches a trailing .com, .net, .org or .co.uk.
	DOMAIN_ENDINGS_RE = regexp.MustCompile(`\.com$|\.net$|\.org$|\.co\.uk$`)
)

Title extraction constants matching JavaScript behavior exactly

View Source
// AUTHOR_META_TAGS lists, from most to least distinct, the meta tag names
// that likely carry the article author. The plain "author" tag is left out
// on purpose because it too often names the page's developer.
var AUTHOR_META_TAGS = []string{
	"byl", "clmst",
	"dc.author", "dcsext.author", "dc.creator",
	"rbauthors", "authors",
}

AUTHOR_META_TAGS - ordered list of meta tag names that denote likely article authors, from most distinct to least distinct. Note: "author" is too often the developer of the page, so it is not included here.

View Source
// AUTHOR_SELECTORS lists, from most to least explicit, the CSS selectors
// that likely contain the article author.
var AUTHOR_SELECTORS = []string{
	// hentry / vcard microformat markup
	".entry .entry-author",
	".author.vcard .fn", ".author .vcard .fn",
	".byline.vcard .fn", ".byline .vcard .fn",

	// byline containers with nested author markup
	".byline .by .author", ".byline .by", ".byline .author",
	".post-author.vcard", ".post-author .vcard",

	// explicit author links, ids and classes
	"a[rel=author]",
	"#by_author", ".by_author",
	"#entryAuthor", ".entryAuthor",
	".byline a[href*=author]",
	"#author .authorname", ".author .authorname",
	"#author", ".author",
	".articleauthor", ".ArticleAuthor",

	// least explicit: the bare byline
	".byline",
}

AUTHOR_SELECTORS - ordered list of CSS selectors to find likely article authors, from most explicit to least explicit. Uses class substring matching like JavaScript.

View Source
// BYLINE_SELECTORS_RE pairs a CSS selector (element 0, a string) with a
// value to apply to it (element 1, bylineRe) for the "#byline" and ".byline"
// selectors.
// NOTE(review): bylineRe is declared elsewhere in the package; it is
// presumably a compiled regular expression that candidate text must match —
// confirm against its definition and the consumer of this table.
var BYLINE_SELECTORS_RE = [][2]interface{}{
	{"#byline", bylineRe},
	{".byline", bylineRe},
}
View Source
// CANONICAL_META_SELECTORS lists the meta tag names consulted for the
// canonical URL. Mirrors the JavaScript constant:
//
//	export const CANONICAL_META_SELECTORS = ['og:url'];
var CANONICAL_META_SELECTORS = []string{"og:url"}

URL extraction constants matching JavaScript behavior exactly

View Source
// CLEAN_AUTHOR_RE matches, case-insensitively, a leading "by", "posted by" or
// "written by" prefix with an optional colon. Group 1 captures the optional
// "posted "/"written " word and group 2 captures the remainder of the string.
var CLEAN_AUTHOR_RE = regexp.MustCompile(`(?i)^\s*(posted |written )?by\s*:?\s*(.*)`)

CLEAN_AUTHOR_RE - regex for cleaning author prefixes Matches /^\s*(posted |written )?by\s*:?\s*(.*)/i from JavaScript

View Source
// DATE_PUBLISHED_META_TAGS lists, from most to least distinct, the meta tag
// names that likely carry the publication date. Entries are lowercase so
// they can be matched case-insensitively without extra work.
var DATE_PUBLISHED_META_TAGS = []string{
	"article:published_time",
	"displaydate",
	"dc.date", "dc.date.issued",
	"rbpubdate",
	"publish_date", "pub_date",
	"pagedate", "pubdate",
	"revision_date", "doc_date",
	"date_created", "content_create_date",
	"lastmodified",
	"created",
	"date",
}

DATE_PUBLISHED_META_TAGS - Ordered list of meta tag names that denote likely publication dates. All attributes should be lowercase for faster case-insensitive matching. Ordered from most distinct to least distinct (matches JavaScript exactly).

View Source
// DATE_PUBLISHED_SELECTORS lists, from most to least explicit, the CSS
// selectors that likely contain the publication date.
var DATE_PUBLISHED_SELECTORS = []string{
	// hentry microformat
	".hentry .dtstamp.published", ".hentry .published",
	".hentry .dtstamp.updated", ".hentry .updated",

	// dates nested inside a known container
	".single .published",
	".meta .published", ".meta .postDate",
	".entry-date",
	".byline .date",
	".postmetadata .date",

	// stand-alone date classes
	".article_datetime",
	".date-header",
	".story-date",
	".dateStamp",
	"#story .datetime",
	".dateline",
	".pubdate",
}

DATE_PUBLISHED_SELECTORS - Ordered list of CSS selectors to find likely publication dates, from most explicit to least explicit (matches JavaScript exactly).

View Source
// DATE_PUBLISHED_URL_RES is the ordered list of patterns used to find a
// likely publication date inside a URL. In every pattern capture group 1 is
// the date string itself.
//
// The month-abbreviation pattern is case-insensitive so that paths such as
// "/2015/Jan/05/" are recognised as well as "/2015/jan/05/".
var DATE_PUBLISHED_URL_RES = []*regexp.Regexp{
	// /YYYY/MM/DD/
	regexp.MustCompile(`/(20\d{2}/\d{2}/\d{2})/`),
	// YYYY-MM-DD anywhere in the URL
	regexp.MustCompile(`(20\d{2}-[01]\d-[0-3]\d)`),
	// /YYYY/mon/DD/ with a three-letter month abbreviation
	regexp.MustCompile(`(?i)/(20\d{2}/(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/[0-3]\d)/`),
}

DATE_PUBLISHED_URL_RES - Ordered list of compiled regular expressions to find likely date published dates from the URL. These should always have the first reference be a date string that is parseable. Matches JavaScript exactly.

View Source
// EXCERPT_META_SELECTORS lists, in the order they are declared, the meta tag
// names searched for excerpt content.
var EXCERPT_META_SELECTORS = []string{
	"og:description",
	"twitter:description",
}

EXCERPT_META_SELECTORS defines the meta tag names to search for excerpt content This matches the JavaScript constants exactly: ['og:description', 'twitter:description']

View Source
// GenericDateExtractor is the package-level, zero-value instance of
// GenericDateExtractorType; call its Extract method to extract a
// publication date.
var GenericDateExtractor = GenericDateExtractorType{}
View Source
// GenericTitleExtractor extracts an article title by trying, in order:
// strong meta tags, strong selectors, weak meta tags and weak selectors.
// The first non-empty candidate is passed through cleanTitle and returned;
// an empty string is returned when the selection is empty, when the
// temporary document cannot be built, or when no candidate is found.
var GenericTitleExtractor = struct {
	Extract func(doc *goquery.Selection, url string, metaCache []string) string
}{
	Extract: func(doc *goquery.Selection, url string, metaCache []string) string {
		// An empty selection has nothing to extract from.
		if doc.Length() == 0 {
			return ""
		}

		// The meta-tag lookups need a *goquery.Document, so re-parse the
		// selection's markup. Prefer the selection's own HTML; fall back to
		// the parent's HTML when that is unavailable or empty.
		markup := "<html></html>"
		ownHTML, ownErr := doc.Html()
		switch {
		case ownErr == nil && ownHTML != "":
			markup = "<html>" + ownHTML + "</html>"
		case doc.Parent().Length() > 0:
			if parentHTML, parentErr := doc.Parent().Html(); parentErr == nil {
				markup = "<html>" + parentHTML + "</html>"
			}
		}

		metaDoc, parseErr := goquery.NewDocumentFromReader(strings.NewReader(markup))
		if parseErr != nil {
			return ""
		}

		// 1. Strong meta tags.
		if found := dom.ExtractFromMeta(metaDoc, STRONG_TITLE_META_TAGS, metaCache, true); found != nil && *found != "" {
			return cleanTitle(*found, url, doc)
		}

		// 2. Strong selectors.
		if found := dom.ExtractFromSelectors(doc, STRONG_TITLE_SELECTORS, 1, true); found != nil && *found != "" {
			return cleanTitle(*found, url, doc)
		}

		// 3. Weak meta tags.
		if found := dom.ExtractFromMeta(metaDoc, WEAK_TITLE_META_TAGS, metaCache, true); found != nil && *found != "" {
			return cleanTitle(*found, url, doc)
		}

		// 4. Weak selectors.
		if found := dom.ExtractFromSelectors(doc, WEAK_TITLE_SELECTORS, 1, true); found != nil && *found != "" {
			return cleanTitle(*found, url, doc)
		}

		return ""
	},
}

GenericTitleExtractor extracts article titles using multiple fallback strategies

View Source
// GenericUrlExtractor resolves the canonical URL of a page. It tries a
// <link rel="canonical"> href first, then the CANONICAL_META_SELECTORS meta
// tags, and finally falls back to the URL that was passed in. Whichever URL
// wins is handed to result to build the URLResult.
var GenericUrlExtractor = struct {
	Extract func(doc *goquery.Selection, url string, metaCache []string) URLResult
}{
	Extract: func(doc *goquery.Selection, url string, metaCache []string) URLResult {
		// First choice: a non-empty href on <link rel="canonical">.
		if link := doc.Find("link[rel=canonical]"); link.Length() != 0 {
			if href, ok := link.Attr("href"); ok && href != "" {
				return result(href)
			}
		}

		// Second choice: canonical URL from meta tags. That lookup needs a
		// *goquery.Document, so derive one from the selection.
		var metaDoc *goquery.Document

		// When the selection is the <html> element, wrap its node directly.
		if doc.Is("html") {
			if root := doc.Get(0); root != nil {
				metaDoc = goquery.NewDocumentFromNode(root)
			}
		}

		// Otherwise re-parse the selection's markup, wrapping it in <html>
		// when containsHTML reports that it is not already wrapped.
		if metaDoc == nil {
			if markup, err := doc.Html(); err == nil {
				if !containsHTML(markup) {
					markup = "<html>" + markup + "</html>"
				}
				if parsed, err := goquery.NewDocumentFromReader(strings.NewReader(markup)); err == nil {
					metaDoc = parsed
				}
			}
		}

		// No document could be built: fall back to the supplied URL.
		if metaDoc == nil {
			return result(url)
		}

		if fromMeta := dom.ExtractFromMeta(metaDoc, CANONICAL_META_SELECTORS, metaCache, false); fromMeta != nil && *fromMeta != "" {
			return result(*fromMeta)
		}

		return result(url)
	},
}

GenericUrlExtractor provides URL extraction functionality matching JavaScript exactly

View Source
// GenericWordCountExtractor computes the word count of options["content"].
// It returns 1 when options is nil, when the "content" key is missing, or
// when its value is not a string. Otherwise it returns getWordCount's
// result, retrying with getWordCountAlt when that result is exactly 1.
var GenericWordCountExtractor = struct {
	Extract func(options map[string]interface{}) int
}{
	Extract: func(options map[string]interface{}) int {
		// Indexing a nil map yields the zero value, and asserting a nil
		// interface to string fails, so this single check covers a nil map,
		// a missing key and a non-string value.
		text, isString := options["content"].(string)
		if !isString {
			return 1
		}

		if words := getWordCount(text); words != 1 {
			return words
		}

		// A count of exactly 1 triggers the alternative counter.
		return getWordCountAlt(text)
	},
}

GenericWordCountExtractor extracts word count from content using JavaScript-compatible logic

View Source
// LEAD_IMAGE_URL_META_TAGS lists, most distinct first, the meta tag names
// that may carry the lead image URL.
var LEAD_IMAGE_URL_META_TAGS = []string{"og:image", "twitter:image", "image_src"}

Lead image URL meta tags in priority order (most distinct first)

View Source
// LEAD_IMAGE_URL_SELECTORS lists the fallback CSS selectors for lead image
// extraction.
var LEAD_IMAGE_URL_SELECTORS = []string{"link[rel=image_src]"}

Fallback selectors for lead image extraction

View Source
// NEGATIVE_LEAD_IMAGE_URL_HINTS lists URL fragments that lower an image's
// score as a lead-image candidate.
var NEGATIVE_LEAD_IMAGE_URL_HINTS = []string{
	"spacer", "sprite", "blank", "throbber", "gradient",
	"tile", "bg", "background", "icon", "social",
	"header", "hdr", "advert", "spinner", "loader",
	"loading", "default", "rating", "share", "facebook",
	"twitter", "theme", "promo", "ads", "wp-includes",
}

Negative hints that decrease image score

View Source
// POSITIVE_LEAD_IMAGE_URL_HINTS lists URL fragments that raise an image's
// score as a lead-image candidate.
var POSITIVE_LEAD_IMAGE_URL_HINTS = []string{"upload", "wp-content", "large", "photo", "wp-image"}

Positive hints that increase image score

View Source
// SPLIT_DATE_STRING matches one date component at a time, case-insensitively.
// The alternatives, in order, are:
//   - a clock time "H:MM" or "HH:MM" with an optional am/pm style suffix,
//   - a numeric date with "/" or "-" separators (e.g. 1/2/2015),
//   - a trailing "-NNN"/"-NNNN" offset,
//   - a bare run of one to four digits,
//   - a month name or three-letter abbreviation.
//
// NOTE(review): the abbreviations are listed before the full month names, so
// with Go's leftmost-first alternation "january" presumably matches only as
// "jan" — confirm that callers expect this.
var SPLIT_DATE_STRING = regexp.MustCompile(`(?i)([0-9]{1,2}:[0-9]{2,2}( ?[ap].?m.?)?)|([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})|(-[0-9]{3,4}$)|([0-9]{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)`)

SPLIT_DATE_STRING regex for splitting date components (matches JavaScript exactly with case-insensitive)

View Source
// TIME_AGO_STRING matches relative dates of the form "<number> <unit> ago".
// Group 1 captures the number and group 2 the unit (second, minute, hour,
// day, week, month or year, singular or plural).
// NOTE(review): unlike SPLIT_DATE_STRING and CLEAN_AUTHOR_RE this pattern has
// no (?i) flag, so "5 Minutes Ago" does not match — confirm that callers
// lowercase their input first.
var TIME_AGO_STRING = regexp.MustCompile(`(\d+)\s+(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s+ago`)

TIME_AGO_STRING regex for parsing relative dates (X minutes ago, etc.)

Functions

func CleanContent

func CleanContent(article *goquery.Selection, opts CleanContentOptions) *goquery.Selection

CleanContent cleans article content, returning a new, cleaned node. This adapts the JavaScript extractCleanNode function to work with Go's document-based DOM functions.

func DirectionExtractor

func DirectionExtractor(params ExtractorParams) (string, error)

DirectionExtractor extracts text direction from title field only Matches JavaScript: direction: ({ title }) => stringDirection.getDirection(title)

func ExtractBestNode

func ExtractBestNode(doc *goquery.Document, opts ExtractBestNodeOptions) *goquery.Selection

ExtractBestNode extracts the content most likely to be article text using a variety of scoring techniques.

The function orchestrates the complete extraction pipeline: 1. Optionally strips unlikely candidates (comments, ads, etc.) 2. Converts elements to paragraphs for better scoring 3. Scores all content based on various signals 4. Finds and returns the top candidate element

This is a direct port of the JavaScript extractBestNode function with 100% compatibility.

Parameters:

  • doc: A goquery Document representing the DOM to extract from
  • opts: ExtractBestNodeOptions with configuration flags
  • StripUnlikelyCandidates: If true, remove elements that match exclusion criteria
  • WeightNodes: If true, use classNames and IDs to determine node worthiness

Returns:

  • *goquery.Selection: The top candidate element, or nil if no suitable content found

func GetDirection

func GetDirection(input interface{}) (string, error)

GetDirection analyzes string direction and returns 'ltr', 'rtl', 'bidi', or '' (the empty string). Direct port of the JavaScript stringDirection.getDirection() function.

func NodeIsSufficient

func NodeIsSufficient(node *goquery.Selection) bool

NodeIsSufficient determines whether a node has enough content to be considered article-like, i.e. whether it is article-like enough to return. Direct port of the JavaScript nodeIsSufficient function.

Types

type CleanContentOptions

// CleanContentOptions carries the options accepted by CleanContent.
// NOTE(review): the field notes below are inferred from names only, because
// CleanContent's body is not visible here — confirm before relying on them.
type CleanContentOptions struct {
	// Doc is presumably the document that owns the article node.
	Doc                *goquery.Document
	// CleanConditionally presumably toggles conditional cleaning.
	CleanConditionally bool
	// Title is presumably the already-extracted article title.
	Title              string
	// URL is presumably the page URL.
	URL                string
	// DefaultCleaner presumably toggles the default cleaning pass.
	DefaultCleaner     bool
}

CleanContentOptions represents options for content cleaning

type ExtractBestNodeOptions

// ExtractBestNodeOptions holds the configuration flags for ExtractBestNode.
type ExtractBestNodeOptions struct {
	// StripUnlikelyCandidates: if true, remove elements that match the
	// exclusion criteria before scoring.
	StripUnlikelyCandidates bool
	// WeightNodes: if true, use classNames and IDs to determine node
	// worthiness.
	WeightNodes             bool
}

ExtractBestNodeOptions represents configuration options for content extraction

type ExtractionOptions

// ExtractionOptions bundles the parameters passed to
// GenericExtractor.ExtractGeneric.
// NOTE(review): the field notes below are inferred from names and types only
// — confirm against ExtractGeneric.
type ExtractionOptions struct {
	// URL is presumably the URL of the page being extracted.
	URL         string
	// HTML is presumably the raw page markup.
	HTML        string
	// Doc is presumably the parsed form of the page.
	Doc         *goquery.Document
	// MetaCache is presumably the list of meta tag names found on the page.
	MetaCache   []string
	// Fallback presumably toggles fallback extraction.
	Fallback    bool
	// ContentType is presumably the desired content output type.
	ContentType string
}

ExtractionOptions contains all parameters needed for extraction Matches JavaScript options object structure

type ExtractionResult

// ExtractionResult is the complete result of a generic extraction, as
// returned by GenericExtractor.ExtractGeneric. Each field is serialized
// under the snake_case JSON key given in its struct tag.
type ExtractionResult struct {
	Title         string     `json:"title"`
	Author        string     `json:"author"`
	// DatePublished is a pointer so that a missing date encodes as JSON null.
	DatePublished *time.Time `json:"date_published"` // null if not found
	Dek           string     `json:"dek"`
	LeadImageURL  string     `json:"lead_image_url"`
	Content       string     `json:"content"`
	NextPageURL   string     `json:"next_page_url"`
	URL           string     `json:"url"`
	Domain        string     `json:"domain"`
	Excerpt       string     `json:"excerpt"`
	WordCount     int        `json:"word_count"`
	// Direction presumably holds one of the direction labels ("ltr", "rtl",
	// "bidi" or "") — TODO confirm.
	Direction     string     `json:"direction"`
	SiteName      string     `json:"site_name"`
	SiteTitle     string     `json:"site_title"`
	SiteImage     string     `json:"site_image"`
	Favicon       string     `json:"favicon"`
}

ExtractionResult represents the complete result from generic extraction Matches JavaScript extraction result structure exactly

type ExtractorImageParams

// ExtractorImageParams contains the parameters for image extraction.
// NOTE(review): MetaCache is a map[string]string here, whereas the other
// extractors in this package take metaCache as []string — confirm the
// difference is intentional.
type ExtractorImageParams struct {
	Doc       *goquery.Document
	Content   string
	MetaCache map[string]string
	HTML      string
}

ExtractorImageParams contains parameters for image extraction

type ExtractorOptions

// ExtractorOptions holds the configuration flags for content extraction. It
// is the type of GenericContentExtractor.DefaultOpts and of the opts
// parameter of GenericContentExtractor.GetContentNode.
// NOTE(review): StripUnlikelyCandidates and WeightNodes presumably carry the
// same meaning as the like-named ExtractBestNodeOptions fields, and
// CleanConditionally the same as in CleanContentOptions — confirm.
type ExtractorOptions struct {
	StripUnlikelyCandidates bool
	WeightNodes             bool
	CleanConditionally      bool
}

ExtractorOptions represents configuration options for content extraction

type ExtractorParams

// ExtractorParams contains the parameters needed for extraction; it is the
// argument type of DirectionExtractor.
type ExtractorParams struct {
	Doc   *goquery.Document
	HTML  string
	Title string
	URL   string
}

ExtractorParams contains all the parameters needed for extraction

type GenericAuthorExtractor

// GenericAuthorExtractor provides author extraction functionality. It has no
// fields; behavior lives in its Extract method.
type GenericAuthorExtractor struct{}

GenericAuthorExtractor provides author extraction functionality

func (*GenericAuthorExtractor) Extract

func (e *GenericAuthorExtractor) Extract(doc *goquery.Selection, metaCache []string) *string

Extract extracts author information from HTML using the three-tier strategy. Returns *string to allow nil when no author is found (matching JavaScript behavior).

type GenericContentExtractor

// GenericContentExtractor implements the main content extraction logic.
type GenericContentExtractor struct {
	// DefaultOpts presumably holds the options an extraction starts with
	// before they are relaxed on retry — TODO confirm against Extract.
	DefaultOpts ExtractorOptions
}

GenericContentExtractor implements the main content extraction logic

func NewGenericContentExtractor

func NewGenericContentExtractor() *GenericContentExtractor

NewGenericContentExtractor creates a new extractor with default options

func (*GenericContentExtractor) CleanAndReturnNode

func (e *GenericContentExtractor) CleanAndReturnNode(node *goquery.Selection, doc *goquery.Document) string

CleanAndReturnNode finalizes the content by ensuring we have something and normalizing spaces. Once we get here, either we're at our last-resort node or we broke early, so make sure we at least have something before moving forward.

func (*GenericContentExtractor) Extract

Extract extracts the content for this resource - initially, pass in the most restrictive opts which will return the highest quality content. On each failure, retry with slightly more lax opts.

The function implements the JavaScript extraction strategy: 1. Try with default strict options 2. If content is insufficient, cascade through options, disabling them one by one 3. Return the best content found

This matches the JavaScript behavior exactly for option cascading and content validation.

func (*GenericContentExtractor) GetContentNode

func (e *GenericContentExtractor) GetContentNode(doc *goquery.Document, title, url string, opts ExtractorOptions) *goquery.Selection

GetContentNode gets the content node given current options This orchestrates the extraction pipeline: extract best node -> clean content

type GenericDateExtractorType

// GenericDateExtractorType is the extractor type for publication dates. It
// has no fields; the package-level GenericDateExtractor variable is an
// instance of it.
type GenericDateExtractorType struct{}

GenericDateExtractorType - Extractor type for publication dates with 100% JavaScript compatibility. The package-level GenericDateExtractor variable is an instance of it.

func (GenericDateExtractorType) Extract

func (e GenericDateExtractorType) Extract(doc *goquery.Selection, url string, metaCache []string) *string

Extract publication date from document using meta tags, selectors, and URL patterns

type GenericDekExtractor

// GenericDekExtractor extracts article subtitles/descriptions (deks). It has
// no fields; behavior lives in its Extract method.
type GenericDekExtractor struct{}

GenericDekExtractor extracts article subtitles/descriptions (deks)

func (*GenericDekExtractor) Extract

func (e *GenericDekExtractor) Extract(doc *goquery.Document, opts map[string]interface{}) string

Extract extracts dek from meta tags and selectors with validation and cleaning

type GenericDescriptionExtractor

// GenericDescriptionExtractor extracts site descriptions. It has no fields;
// behavior lives in its Extract method.
type GenericDescriptionExtractor struct{}

GenericDescriptionExtractor extracts site descriptions

func (*GenericDescriptionExtractor) Extract

func (extractor *GenericDescriptionExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) string

Extract extracts site description using priority-based strategies

type GenericExcerptExtractor

// GenericExcerptExtractor implements excerpt extraction logic. It has no
// fields; construct one with NewGenericExcerptExtractor.
type GenericExcerptExtractor struct{}

GenericExcerptExtractor implements excerpt extraction logic

func NewGenericExcerptExtractor

func NewGenericExcerptExtractor() *GenericExcerptExtractor

NewGenericExcerptExtractor creates a new excerpt extractor

func (*GenericExcerptExtractor) Extract

func (e *GenericExcerptExtractor) Extract(doc *goquery.Document, content string, metaCache []string) string

Extract extracts excerpt from meta tags or falls back to content This is a faithful port of the JavaScript GenericExcerptExtractor.extract method

type GenericExtractor

// GenericExtractor coordinates the individual field extractors.
type GenericExtractor struct {
	// Domain is presumably the value reported by GetDomain — TODO confirm.
	Domain string
}

GenericExtractor coordinates individual field extractors. This is NOT an implementation of the parser.Extractor interface; it is used internally by the parser package for generic extraction.

func NewGenericExtractor

func NewGenericExtractor() *GenericExtractor

NewGenericExtractor creates a new generic extractor instance

func (*GenericExtractor) ExtractGeneric

func (ge *GenericExtractor) ExtractGeneric(options *ExtractionOptions) (*ExtractionResult, error)

ExtractGeneric performs the main generic extraction with full options

func (*GenericExtractor) GetDomain

func (ge *GenericExtractor) GetDomain() string

GetDomain returns the domain this extractor handles

type GenericFaviconExtractor

// GenericFaviconExtractor extracts the favicon URL. It has no fields;
// behavior lives in its Extract method.
type GenericFaviconExtractor struct{}

GenericFaviconExtractor extracts the favicon URL

func (*GenericFaviconExtractor) Extract

func (extractor *GenericFaviconExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) string

Extract extracts the favicon URL from the page

type GenericLanguageExtractor

// GenericLanguageExtractor extracts content language information. It has no
// fields; behavior lives in its Extract method.
type GenericLanguageExtractor struct{}

GenericLanguageExtractor extracts content language information

func (*GenericLanguageExtractor) Extract

func (extractor *GenericLanguageExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) string

Extract extracts content language using priority-based strategies

type GenericLeadImageExtractor

// GenericLeadImageExtractor implements lead image extraction logic. It has
// no fields; construct one with NewGenericLeadImageExtractor.
type GenericLeadImageExtractor struct{}

GenericLeadImageExtractor implements lead image extraction logic

func NewGenericLeadImageExtractor

func NewGenericLeadImageExtractor() *GenericLeadImageExtractor

NewGenericLeadImageExtractor creates a new lead image extractor

func (*GenericLeadImageExtractor) Extract

Extract finds the lead image URL from the document using scoring and fallback strategies. Matches JavaScript behavior: meta tags → content images → fallback selectors.

type GenericNextPageUrlExtractor

// GenericNextPageUrlExtractor extracts next page URLs for multi-page
// articles. It has no fields; construct one with
// NewGenericNextPageUrlExtractor.
type GenericNextPageUrlExtractor struct{}

GenericNextPageUrlExtractor extracts next page URLs for multi-page articles

func NewGenericNextPageUrlExtractor

func NewGenericNextPageUrlExtractor() *GenericNextPageUrlExtractor

NewGenericNextPageUrlExtractor creates a new instance

func (*GenericNextPageUrlExtractor) Extract

func (e *GenericNextPageUrlExtractor) Extract(doc *goquery.Document, articleURL string, parsedURL *url.URL, previousUrls []string) string

Extract finds and returns the most likely next page URL

type GenericSiteImageExtractor

// GenericSiteImageExtractor extracts the main site image. It has no fields;
// behavior lives in its Extract method.
type GenericSiteImageExtractor struct{}

GenericSiteImageExtractor extracts the main site image

func (*GenericSiteImageExtractor) Extract

func (extractor *GenericSiteImageExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) string

Extract extracts the site's main image from meta tags

type GenericSiteNameExtractor

// GenericSiteNameExtractor extracts the site name from meta tags. It has no
// fields; behavior lives in its Extract method.
type GenericSiteNameExtractor struct{}

GenericSiteNameExtractor extracts the site name from meta tags

func (*GenericSiteNameExtractor) Extract

func (extractor *GenericSiteNameExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) string

Extract extracts the site name from various meta tags

type GenericSiteTitleExtractor

// GenericSiteTitleExtractor extracts the site title. It has no fields;
// behavior lives in its Extract method.
type GenericSiteTitleExtractor struct{}

GenericSiteTitleExtractor extracts the site title

func (*GenericSiteTitleExtractor) Extract

func (extractor *GenericSiteTitleExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) string

Extract extracts the site title from the page

type GenericThemeColorExtractor

// GenericThemeColorExtractor extracts the theme color from meta tags. It has
// no fields; behavior lives in its Extract method.
type GenericThemeColorExtractor struct{}

GenericThemeColorExtractor extracts the theme color from meta tags

func (*GenericThemeColorExtractor) Extract

func (extractor *GenericThemeColorExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) string

Extract extracts the theme color from the page

type GenericVideoExtractor

// GenericVideoExtractor extracts video metadata from Open Graph and other
// meta tags. It has no fields; behavior lives in its Extract and
// ExtractVideoURL methods.
type GenericVideoExtractor struct{}

GenericVideoExtractor extracts video metadata from Open Graph and other meta tags

func (*GenericVideoExtractor) Extract

func (extractor *GenericVideoExtractor) Extract(selection *goquery.Selection, pageURL string, metaCache []string) *VideoMetadata

Extract extracts video metadata from the page

func (*GenericVideoExtractor) ExtractVideoURL

func (extractor *GenericVideoExtractor) ExtractVideoURL(selection *goquery.Selection, pageURL string, metaCache []string) string

ExtractVideoURL is a convenience function that returns just the primary video URL

type RTLScriptRange

// RTLScriptRange represents a Unicode block range for an RTL script.
// NOTE(review): whether To is inclusive is decided by the code that consumes
// the range, which is not visible here — confirm.
type RTLScriptRange struct {
	From int // Starting Unicode code point
	To   int // Ending Unicode code point
}

RTLScriptRange represents a Unicode block range for RTL scripts

type URLResult

// URLResult represents the extracted URL and domain information; it is the
// return type of GenericUrlExtractor.Extract.
type URLResult struct {
	// URL is serialized under the JSON key "url".
	URL    string `json:"url"`
	// Domain is serialized under the JSON key "domain".
	Domain string `json:"domain"`
}

URLResult represents the extracted URL and domain information

type VideoMetadata

// VideoMetadata contains structured video metadata, as returned by
// GenericVideoExtractor.Extract. Every field is tagged omitempty, so
// zero-valued fields are left out of the JSON encoding.
// NOTE(review): the units of Width, Height and Duration are not visible here
// (presumably pixels and seconds) — confirm against the extractor.
type VideoMetadata struct {
	URL       string `json:"url,omitempty"`
	Type      string `json:"type,omitempty"`
	Width     int    `json:"width,omitempty"`
	Height    int    `json:"height,omitempty"`
	Duration  int    `json:"duration,omitempty"`
	SecureURL string `json:"secure_url,omitempty"`
}

VideoMetadata contains structured video metadata

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL