Documentation
¶
Index ¶
- Variables
- func CleanAuthor(author string) string
- func CleanDatePublished(dateString, timezone, format string) *string
- func CleanDek(dek string, doc *goquery.Document, excerpt string) *string
- func CleanDomainFromTitle(splitTitle []string, urlStr string) string
- func CleanLeadImageURL(imageURL, baseURL string) string
- func CleanLeadImageURLString(leadImageURL string) string
- func CleanLeadImageURLValidated(leadImageURL string) *string
- func CleanTitle(title string, url string, doc *goquery.Document) string
- func CleanTitleSimple(title, targetURL string) string
- func ExtractBreadcrumbTitle(splitTitle []string, text string) string
- func ExtractCleanNode(article *goquery.Selection, doc *goquery.Document, opts ContentCleanOptions) *goquery.Selection
- func LevenshteinRatio(s1, s2 string) float64
- func RegisterCleaner(fieldType string, cleaner FieldCleaner)
- func ResolveSplitTitle(title, url string) string
- func SplitTitleWithSeparators(title string) []string
- type CleanerOptions
- type ContentCleanOptions
- type ContentCleanOptionsStruct
- type ContentCleaner
- type FieldCleaner
- type LeadImageURLCleaner
- type ResolveSplitTitleCleaner
Constants ¶
This section is empty.
Variables ¶
var CLEAN_AUTHOR_RE = regexp.MustCompile(`(?i)^\s*(posted |written )?by\s*:?\s*([^\r\n]*)`)
CLEAN_AUTHOR_RE matches "by" prefixes in author strings. It mirrors the JavaScript regex /^\s*(posted |written )?by\s*:?\s*(.*)/i. Note: in JavaScript, .* does not match newlines by default, so the Go pattern uses [^\r\n]* for the final group instead.
var CLEAN_DATE_STRING_RE = regexp.MustCompile(`(?i)^\s*published\s*:?\s*(.*)`)
CLEAN_DATE_STRING_RE matches "published:" prefixes in date strings Matches the JavaScript regex: /^\s*published\s*:?\s*(.*)/i
var DEK_META_TAGS = []string{}
DEK_META_TAGS is an ordered list of meta tag names for article deks, from most distinct to least distinct. NOTE: The list is currently empty because no meta tags provide consistent dek content.
var DEK_SELECTORS = []string{".entry-summary"}
DEK_SELECTORS is an ordered list of CSS selectors for article deks From most explicit to least explicit
var DOMAIN_ENDINGS_RE = regexp.MustCompile(`\.com$|\.net$|\.org$|\.co\.uk$`)
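DOMAIN_ENDINGS_RE matches common domain endings (.com, .net, .org, .co.uk) at the end of a string. It is ported from the JavaScript regex /.com$|.net$|.org$|.co.uk$/g; unlike that original, the Go pattern escapes the dots so they match only a literal period.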
var ExtractCleanNodeFunc = ExtractCleanNode
ExtractCleanNodeFunc is a package-level variable bound to ExtractCleanNode, the main content cleaning function. It can be used as a standalone utility or integrated with content extractors.
var MS_DATE_STRING = regexp.MustCompile(`(?i)^\d{13}$`)
MS_DATE_STRING matches 13-digit millisecond timestamps Matches the JavaScript regex: /^\d{13}$/i
var SEC_DATE_STRING = regexp.MustCompile(`(?i)^\d{10}$`)
SEC_DATE_STRING matches 10-digit second timestamps Matches the JavaScript regex: /^\d{10}$/i
var SPLIT_DATE_STRING = regexp.MustCompile(`(?i)([0-9]{1,2}:[0-9]{2,2}( ?[ap]\.?m\.?)?)|([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})|(-[0-9]{3,4}$)|([0-9]{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)`)
SPLIT_DATE_STRING matches various date/time components Complex regex built from multiple timestamp patterns
var TEXT_LINK_RE = regexp.MustCompile(`(?i)http(s)?:`)
TEXT_LINK_RE matches HTTP/HTTPS URLs in text Matches the JavaScript regex: /http(s)?:/i
var TIME_AGO_STRING = regexp.MustCompile(`(?i)(\d+)\s+(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s+ago`)
TIME_AGO_STRING matches relative time expressions like "5 minutes ago". The JavaScript version builds this pattern dynamically from its timeUnits list; here the unit alternation is written out as a literal pattern.
var TIME_MERIDIAN_DOTS_RE = regexp.MustCompile(`(?i)\.m\.`)
TIME_MERIDIAN_DOTS_RE matches ".m." in time strings Matches the JavaScript regex: /\.m\./i
var TIME_MERIDIAN_SPACE_RE = regexp.MustCompile(`(?i)(.*\d)(a|p)(\s*m.*)`)
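TIME_MERIDIAN_SPACE_RE matches time strings in which a digit is immediately followed by an AM/PM marker. It is adapted from the JavaScript regex /(.*\d)(am|pm)(.*)/i; the Go pattern captures only the a/p letter in its second group and leaves any whitespace, the "m", and the remainder of the string in the third.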
var TIME_NOW_STRING = regexp.MustCompile(`(?i)^\s*(just|right)?\s*now\s*`)
TIME_NOW_STRING matches "now" time indicators Matches the JavaScript regex: /^\s*(just|right)?\s*now\s*/i
var TIME_WITH_OFFSET_RE = regexp.MustCompile(`[+-]\d{3,4}$`)
TIME_WITH_OFFSET_RE checks if datetime string has timezone offset at end Matches the JavaScript regex: /-\d{3,4}$/ but also handles positive offsets
var TITLE_SPLITTERS_RE = regexp.MustCompile(`(: | - | \| )`)
TITLE_SPLITTERS_RE matches title separating characters Matches the JavaScript regex: /(: | - | \| )/g
Functions ¶
func CleanAuthor ¶
CleanAuthor takes an author string (like 'By David Smith ') and cleans it to just the name(s): 'David Smith'.
This is a faithful 1:1 port of the JavaScript cleanAuthor function: - Removes "By", "Posted by", "Written by" prefixes (case insensitive) - Handles optional colons after prefixes - Normalizes all whitespace to single spaces - Trims leading and trailing whitespace
JavaScript equivalent:
export default function cleanAuthor(author) {
return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim());
}
func CleanDatePublished ¶
CleanDatePublished takes a date published string and returns a clean ISO date string. Returns nil if the date cannot be parsed or is invalid.
This is a faithful 1:1 port of the JavaScript cleanDatePublished function: - Handles millisecond/second timestamps - Supports relative time expressions ("5 minutes ago") - Handles "now" time indicators - Supports timezone and format parameters - Cleans date strings by removing "published:" prefixes - Returns ISO 8601 formatted string or nil for invalid dates
JavaScript equivalent:
export default function cleanDatePublished(dateString, { timezone, format } = {}) {
// Timestamp handling, date cleaning, and parsing logic
return date.isValid() ? date.toISOString() : null;
}
func CleanDek ¶
CleanDek takes a dek HTML fragment and returns the cleaned version of it. Returns nil if the dek wasn't good enough (too short, too long, has URLs, matches excerpt).
This is a faithful 1:1 port of the JavaScript cleanDek function: - Validates length between 5 and 1000 characters - Checks that dek isn't the same as excerpt (first 10 words) - Strips HTML tags using stripTags function - Rejects deks containing plain text URLs (http/https) - Normalizes whitespace using normalizeSpaces
JavaScript equivalent:
export default function cleanDek(dek, { $, excerpt }) {
if (dek.length > 1000 || dek.length < 5) return null;
if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null;
const dekText = stripTags(dek, $);
if (TEXT_LINK_RE.test(dekText)) return null;
return normalizeSpaces(dekText.trim());
}
func CleanDomainFromTitle ¶
CleanDomainFromTitle removes domain name matches from title segments. It searches the ends of the title for segments that fuzzy-match the URL too closely; if one is found, it is discarded and the rest of the title is returned.
func CleanLeadImageURL ¶
CleanLeadImageURL ensures image URLs are properly formatted and absolute
func CleanLeadImageURLString ¶
CleanLeadImageURLString provides a string-returning version for backward compatibility Returns empty string if URL is invalid, cleaned URL if valid
func CleanLeadImageURLValidated ¶
CleanLeadImageURLValidated validates and cleans a lead image URL Returns nil if the URL is invalid, cleaned URL string if valid Matches JavaScript behavior: trim whitespace and validate as web URI
func CleanTitle ¶
CleanTitle cleans and normalizes title text by removing site names, HTML tags, and extra whitespace. This is a faithful port of the JavaScript cleanTitle function.
func CleanTitleSimple ¶
CleanTitleSimple provides a simple title cleaner that doesn't need a document
func ExtractBreadcrumbTitle ¶
ExtractBreadcrumbTitle extracts the most relevant title from breadcrumb-style titles. The input must be a heavily breadcrumbed title, such as "The Best Gadgets on Earth : Bits : Blogs : NYTimes.com" or "NYTimes - Blogs - Bits - The Best Gadgets on Earth".
func ExtractCleanNode ¶
func ExtractCleanNode(article *goquery.Selection, doc *goquery.Document, opts ContentCleanOptions) *goquery.Selection
ExtractCleanNode cleans article content, returning a new, cleaned node. It is a direct port of the JavaScript extractCleanNode function, with an identical cleaning pipeline:
1. rewriteTopLevel - Convert HTML/BODY tags to DIV to avoid complications
2. cleanImages - Remove small/spacer images (if defaultCleaner enabled)
3. makeLinksAbsolute - Convert relative URLs to absolute URLs
4. markToKeep - Mark video iframes and important elements for preservation
5. stripJunkTags - Remove script, style, title and other junk tags
6. cleanHOnes - Remove or convert H1 tags based on count
7. cleanHeaders - Clean headers that match article title
8. cleanTags - Remove low-quality tags with high link density (if defaultCleaner enabled)
9. removeEmpty - Remove empty paragraph and other empty elements
10. cleanAttributes - Remove unnecessary attributes
This function matches the JavaScript implementation exactly, including the same cleaning order and logic, the same conditional cleaning based on options, and the same default behaviors for aggressive vs. conservative cleaning.
func LevenshteinRatio ¶
LevenshteinRatio calculates the Levenshtein similarity ratio between two strings This is compatible with the JavaScript wuzzy.levenshtein function
func RegisterCleaner ¶
func RegisterCleaner(fieldType string, cleaner FieldCleaner)
RegisterCleaner registers a new cleaner for a field type
func ResolveSplitTitle ¶
ResolveSplitTitle takes a title with separators in it (colons, dashes, etc.) and resolves whether any of the segments should be removed.
func SplitTitleWithSeparators ¶
SplitTitleWithSeparators splits title while preserving separators This mimics JavaScript's split behavior with capturing groups
Types ¶
type CleanerOptions ¶
type CleanerOptions struct {
URL string
Title string
Content string
Excerpt string
DefaultCleaner bool
}
CleanerOptions represents unified options for all cleaner types
type ContentCleanOptions ¶
type ContentCleanOptions struct {
CleanConditionally bool
Title string
URL string
DefaultCleaner *bool // Use pointer to distinguish between unset and explicitly false
}
ContentCleanOptions represents configuration options for content cleaning
type ContentCleanOptionsStruct ¶
type ContentCleanOptionsStruct = ContentCleanOptions
ContentCleanOptionsStruct is an alias for ContentCleanOptions, which represents the configuration options for content cleaning.
type ContentCleaner ¶
type ContentCleaner struct{}
ContentCleaner implements FieldCleaner for content fields
func (*ContentCleaner) Clean ¶
func (c *ContentCleaner) Clean(value interface{}, opts CleanerOptions) interface{}
func (*ContentCleaner) CleanSelection ¶
func (c *ContentCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection
type FieldCleaner ¶
type FieldCleaner interface {
// Clean cleans a field value (string, []string, etc.)
Clean(value interface{}, opts CleanerOptions) interface{}
// CleanSelection cleans a goquery selection (for HTML content)
CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection
}
FieldCleaner represents the interface for field-specific cleaners
func GetCleaner ¶
func GetCleaner(fieldType string) FieldCleaner
GetCleaner retrieves a cleaner by field type
type LeadImageURLCleaner ¶
type LeadImageURLCleaner struct{}
LeadImageURLCleaner implements FieldCleaner for lead image URL fields
func (*LeadImageURLCleaner) Clean ¶
func (c *LeadImageURLCleaner) Clean(value interface{}, opts CleanerOptions) interface{}
func (*LeadImageURLCleaner) CleanSelection ¶
func (c *LeadImageURLCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection
type ResolveSplitTitleCleaner ¶
type ResolveSplitTitleCleaner struct{}
ResolveSplitTitleCleaner implements FieldCleaner for title fields with split resolution
func (*ResolveSplitTitleCleaner) Clean ¶
func (c *ResolveSplitTitleCleaner) Clean(value interface{}, opts CleanerOptions) interface{}
func (*ResolveSplitTitleCleaner) CleanSelection ¶
func (c *ResolveSplitTitleCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection