cleaners

package

v1.0.6 Latest Latest Go to latest Published: Aug 31, 2025 License: MIT Imports: 8 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/BumpyClock/hermes

Links

Open Source Insights

Documentation ¶

Index ¶

Variables
func CleanAuthor(author string) string
func CleanDatePublished(dateString, timezone, format string) *string
func CleanDek(dek string, doc *goquery.Document, excerpt string) *string
func CleanDomainFromTitle(splitTitle []string, urlStr string) string
func CleanLeadImageURL(imageURL, baseURL string) string
func CleanLeadImageURLString(leadImageURL string) string
func CleanLeadImageURLValidated(leadImageURL string) *string
func CleanTitle(title string, url string, doc *goquery.Document) string
func CleanTitleSimple(title, targetURL string) string
func ExtractBreadcrumbTitle(splitTitle []string, text string) string
func ExtractCleanNode(article *goquery.Selection, doc *goquery.Document, opts ContentCleanOptions) *goquery.Selection
func LevenshteinRatio(s1, s2 string) float64
func RegisterCleaner(fieldType string, cleaner FieldCleaner)
func ResolveSplitTitle(title, url string) string
func SplitTitleWithSeparators(title string) []string
type CleanerOptions
type ContentCleanOptions
type ContentCleanOptionsStruct
type ContentCleaner
- func (c *ContentCleaner) Clean(value interface{}, opts CleanerOptions) interface{}
- func (c *ContentCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection
type FieldCleaner
- func GetCleaner(fieldType string) FieldCleaner
type LeadImageURLCleaner
- func (c *LeadImageURLCleaner) Clean(value interface{}, opts CleanerOptions) interface{}
- func (c *LeadImageURLCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection
type ResolveSplitTitleCleaner
- func (c *ResolveSplitTitleCleaner) Clean(value interface{}, opts CleanerOptions) interface{}
- func (c *ResolveSplitTitleCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection

Constants ¶

This section is empty.

Variables ¶

View Source

var CLEAN_AUTHOR_RE = regexp.MustCompile(`(?i)^\s*(posted |written )?by\s*:?\s*([^\r\n]*)`)

CLEAN_AUTHOR_RE matches "by" prefixes in author strings. It mirrors the JavaScript regex /^\s*(posted |written )?by\s*:?\s*(.*)/i. Note: in JavaScript, .* does NOT match newlines by default, so the Go pattern uses [^\r\n]* instead.

View Source

var CLEAN_DATE_STRING_RE = regexp.MustCompile(`(?i)^\s*published\s*:?\s*(.*)`)

CLEAN_DATE_STRING_RE matches "published:" prefixes in date strings Matches the JavaScript regex: /^\s*published\s*:?\s*(.*)/i

View Source

var DEK_META_TAGS = []string{}

DEK_META_TAGS is an ordered list of meta tag names for article deks, from most distinct to least distinct. NOTE: It is currently empty, as no meta tags provide consistent dek content.

View Source

var DEK_SELECTORS = []string{".entry-summary"}

DEK_SELECTORS is an ordered list of CSS selectors for article deks, from most explicit to least explicit.

View Source

var DOMAIN_ENDINGS_RE = regexp.MustCompile(`\.com$|\.net$|\.org$|\.co\.uk$`)

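DOMAIN_ENDINGS_RE matches common domain endings (.com, .net, .org, .co.uk) at the end of a string. It corresponds to the JavaScript regex /.com$|.net$|.org$|.co.uk$/g; note that the Go pattern escapes each dot, so it matches only a literal ".", whereas the unescaped dots in the JavaScript pattern as written match any character.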

View Source

var ExtractCleanNodeFunc = ExtractCleanNode

ExtractCleanNodeFunc is a package-level variable holding ExtractCleanNode, the main content cleaning function. It can be used as a standalone utility or integrated with content extractors.

View Source

var MS_DATE_STRING = regexp.MustCompile(`(?i)^\d{13}$`)

MS_DATE_STRING matches 13-digit millisecond timestamps Matches the JavaScript regex: /^\d{13}$/i

View Source

var SEC_DATE_STRING = regexp.MustCompile(`(?i)^\d{10}$`)

SEC_DATE_STRING matches 10-digit second timestamps Matches the JavaScript regex: /^\d{10}$/i

View Source

var SPLIT_DATE_STRING = regexp.MustCompile(`(?i)([0-9]{1,2}:[0-9]{2,2}( ?[ap]\.?m\.?)?)|([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})|(-[0-9]{3,4}$)|([0-9]{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)`)

SPLIT_DATE_STRING matches various date/time components Complex regex built from multiple timestamp patterns

View Source

var TEXT_LINK_RE = regexp.MustCompile(`(?i)http(s)?:`)

TEXT_LINK_RE matches HTTP/HTTPS URLs in text Matches the JavaScript regex: /http(s)?:/i

View Source

var TIME_AGO_STRING = regexp.MustCompile(`(?i)(\d+)\s+(seconds?|minutes?|hours?|days?|weeks?|months?|years?)\s+ago`)

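TIME_AGO_STRING matches relative time expressions like "5 minutes ago". The JavaScript version builds this pattern dynamically from timeUnits; here the unit alternation (seconds, minutes, hours, days, weeks, months, years) is written out as a literal.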

View Source

var TIME_MERIDIAN_DOTS_RE = regexp.MustCompile(`(?i)\.m\.`)

TIME_MERIDIAN_DOTS_RE matches ".m." in time strings Matches the JavaScript regex: /\.m\./i

View Source

var TIME_MERIDIAN_SPACE_RE = regexp.MustCompile(`(?i)(.*\d)(a|p)(\s*m.*)`)

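TIME_MERIDIAN_SPACE_RE matches time strings with an AM/PM marker directly after a digit. It corresponds to the JavaScript regex /(.*\d)(am|pm)(.*)/i, but the Go pattern is not identical: the second group captures only the "a" or "p", and the third group captures optional whitespace, the "m", and the rest of the string.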

View Source

var TIME_NOW_STRING = regexp.MustCompile(`(?i)^\s*(just|right)?\s*now\s*`)

TIME_NOW_STRING matches "now" time indicators Matches the JavaScript regex: /^\s*(just|right)?\s*now\s*/i

View Source

var TIME_WITH_OFFSET_RE = regexp.MustCompile(`[+-]\d{3,4}$`)

TIME_WITH_OFFSET_RE checks if datetime string has timezone offset at end Matches the JavaScript regex: /-\d{3,4}$/ but also handles positive offsets

View Source

var TITLE_SPLITTERS_RE = regexp.MustCompile(`(: | - | \| )`)

TITLE_SPLITTERS_RE matches title separating characters Matches the JavaScript regex: /(: | - | \| )/g

Functions ¶

func CleanAuthor ¶

func CleanAuthor(author string) string

CleanAuthor takes an author string (like 'By David Smith ') and cleans it to just the name(s): 'David Smith'.

This is a faithful 1:1 port of the JavaScript cleanAuthor function:

- Removes "By", "Posted by", "Written by" prefixes (case insensitive)
- Handles optional colons after prefixes
- Normalizes all whitespace to single spaces
- Trims leading and trailing whitespace

JavaScript equivalent:

export default function cleanAuthor(author) {
  return normalizeSpaces(author.replace(CLEAN_AUTHOR_RE, '$2').trim());
}

func CleanDatePublished ¶

func CleanDatePublished(dateString, timezone, format string) *string

CleanDatePublished takes a date published string and returns a clean ISO date string. Returns nil if the date cannot be parsed or is invalid.

This is a faithful 1:1 port of the JavaScript cleanDatePublished function:

- Handles millisecond/second timestamps
- Supports relative time expressions ("5 minutes ago")
- Handles "now" time indicators
- Supports timezone and format parameters
- Cleans date strings by removing "published:" prefixes
- Returns an ISO 8601 formatted string, or nil for invalid dates

JavaScript equivalent:

export default function cleanDatePublished(dateString, { timezone, format } = {}) {
  // Timestamp handling, date cleaning, and parsing logic
  return date.isValid() ? date.toISOString() : null;
}

func CleanDek ¶

func CleanDek(dek string, doc *goquery.Document, excerpt string) *string

CleanDek takes a dek HTML fragment and returns the cleaned version of it. Returns nil if the dek wasn't good enough (too short, too long, has URLs, matches excerpt).

This is a faithful 1:1 port of the JavaScript cleanDek function:

- Validates length between 5 and 1000 characters
- Checks that dek isn't the same as excerpt (first 10 words)
- Strips HTML tags using stripTags function
- Rejects deks containing plain text URLs (http/https)
- Normalizes whitespace using normalizeSpaces

JavaScript equivalent:

export default function cleanDek(dek, { $, excerpt }) {
  if (dek.length > 1000 || dek.length < 5) return null;
  if (excerpt && excerptContent(excerpt, 10) === excerptContent(dek, 10)) return null;
  const dekText = stripTags(dek, $);
  if (TEXT_LINK_RE.test(dekText)) return null;
  return normalizeSpaces(dekText.trim());
}

func CleanDomainFromTitle ¶

func CleanDomainFromTitle(splitTitle []string, urlStr string) string

CleanDomainFromTitle removes domain name matches from title segments. It searches the ends of the title, looking for bits that fuzzy match the URL too closely. If one is found, it is discarded and the rest is returned.

func CleanLeadImageURL ¶

func CleanLeadImageURL(imageURL, baseURL string) string

CleanLeadImageURL ensures image URLs are properly formatted and absolute

func CleanLeadImageURLString ¶

func CleanLeadImageURLString(leadImageURL string) string

CleanLeadImageURLString provides a string-returning version for backward compatibility. It returns an empty string if the URL is invalid, or the cleaned URL if it is valid.

func CleanLeadImageURLValidated ¶

func CleanLeadImageURLValidated(leadImageURL string) *string

CleanLeadImageURLValidated validates and cleans a lead image URL. It returns nil if the URL is invalid, or the cleaned URL string if it is valid. This matches the JavaScript behavior: trim whitespace and validate as a web URI.

func CleanTitle ¶

func CleanTitle(title string, url string, doc *goquery.Document) string

CleanTitle cleans and normalizes title text by removing site names, HTML tags, and extra whitespace. This is a faithful port of the JavaScript cleanTitle function.

func CleanTitleSimple ¶

func CleanTitleSimple(title, targetURL string) string

CleanTitleSimple provides a simple title cleaner that doesn't need a document

func ExtractBreadcrumbTitle ¶

func ExtractBreadcrumbTitle(splitTitle []string, text string) string

ExtractBreadcrumbTitle extracts the most relevant title from breadcrumb-style titles. This must be a very breadcrumbed title, like:

The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
NYTimes - Blogs - Bits - The Best Gadgets on Earth

func ExtractCleanNode ¶

func ExtractCleanNode(article *goquery.Selection, doc *goquery.Document, opts ContentCleanOptions) *goquery.Selection

ExtractCleanNode cleans article content, returning a new, cleaned node. It is a direct port of the JavaScript extractCleanNode function with an identical cleaning pipeline:

1. rewriteTopLevel - Convert HTML/BODY tags to DIV to avoid complications
2. cleanImages - Remove small/spacer images (if defaultCleaner enabled)
3. makeLinksAbsolute - Convert relative URLs to absolute URLs
4. markToKeep - Mark video iframes and important elements for preservation
5. stripJunkTags - Remove script, style, title and other junk tags
6. cleanHOnes - Remove or convert H1 tags based on count
7. cleanHeaders - Clean headers that match article title
8. cleanTags - Remove low-quality tags with high link density (if defaultCleaner enabled)
9. removeEmpty - Remove empty paragraph and other empty elements
10. cleanAttributes - Remove unnecessary attributes

This function matches the JavaScript implementation exactly, including:

- Same cleaning order and logic
- Same conditional cleaning based on options
- Same default behaviors for aggressive vs conservative cleaning

func LevenshteinRatio ¶

func LevenshteinRatio(s1, s2 string) float64

LevenshteinRatio calculates the Levenshtein similarity ratio between two strings This is compatible with the JavaScript wuzzy.levenshtein function

func RegisterCleaner ¶

func RegisterCleaner(fieldType string, cleaner FieldCleaner)

RegisterCleaner registers a new cleaner for a field type

func ResolveSplitTitle ¶

func ResolveSplitTitle(title, url string) string

ResolveSplitTitle takes a title with separators in it (colons, dashes, etc.) and resolves whether any of the segments should be removed.

func SplitTitleWithSeparators ¶

func SplitTitleWithSeparators(title string) []string

SplitTitleWithSeparators splits a title while preserving the separators. This mimics JavaScript's split behavior with capturing groups.

Types ¶

type CleanerOptions ¶

type CleanerOptions struct {
	URL            string
	Title          string
	Content        string
	Excerpt        string
	DefaultCleaner bool
}

CleanerOptions represents unified options for all cleaner types

type ContentCleanOptions ¶

type ContentCleanOptions struct {
	CleanConditionally bool
	Title              string
	URL                string
	DefaultCleaner     *bool // Use pointer to distinguish between unset and explicitly false
}

ContentCleanOptions represents configuration options for content cleaning

type ContentCleanOptionsStruct ¶

type ContentCleanOptionsStruct = ContentCleanOptions

ContentCleanOptionsStruct is an alias for ContentCleanOptions, which represents the configuration options for content cleaning.

type ContentCleaner ¶

type ContentCleaner struct{}

ContentCleaner implements FieldCleaner for content fields

func (*ContentCleaner) Clean ¶

func (c *ContentCleaner) Clean(value interface{}, opts CleanerOptions) interface{}

func (*ContentCleaner) CleanSelection ¶

func (c *ContentCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection

type FieldCleaner ¶

type FieldCleaner interface {
	// Clean cleans a field value (string, []string, etc.)
	Clean(value interface{}, opts CleanerOptions) interface{}

	// CleanSelection cleans a goquery selection (for HTML content)
	CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection
}

FieldCleaner represents the interface for field-specific cleaners

func GetCleaner ¶

func GetCleaner(fieldType string) FieldCleaner

GetCleaner retrieves a cleaner by field type

type LeadImageURLCleaner ¶

type LeadImageURLCleaner struct{}

LeadImageURLCleaner implements FieldCleaner for lead image URL fields

func (*LeadImageURLCleaner) Clean ¶

func (c *LeadImageURLCleaner) Clean(value interface{}, opts CleanerOptions) interface{}

func (*LeadImageURLCleaner) CleanSelection ¶

func (c *LeadImageURLCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection

type ResolveSplitTitleCleaner ¶

type ResolveSplitTitleCleaner struct{}

ResolveSplitTitleCleaner implements FieldCleaner for title fields with split resolution

func (*ResolveSplitTitleCleaner) Clean ¶

func (c *ResolveSplitTitleCleaner) Clean(value interface{}, opts CleanerOptions) interface{}

func (*ResolveSplitTitleCleaner) CleanSelection ¶

func (c *ResolveSplitTitleCleaner) CleanSelection(selection *goquery.Selection, doc *goquery.Document, opts CleanerOptions) *goquery.Selection

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL