scoring

package
v0.1.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 29, 2025 License: MIT Imports: 7 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func FindBestElement

func FindBestElement(elements []*goquery.Selection, minScore float64) *goquery.Selection

FindBestElement finds the best-scoring element from a list. JavaScript original code:

/**
 * Picks the highest-scoring element out of `elements`.
 *
 * Every candidate is scored with `scoreElement`. The running best starts at 0
 * and is only replaced by a strictly greater score, so a candidate needs a
 * positive score to be considered and the earliest candidate wins a tie.
 *
 * @param elements - Candidates to evaluate.
 * @param minScore - Score the winner must strictly exceed (defaults to 50).
 * @returns The winning element, or `null` when the best score does not exceed `minScore`.
 */
static findBestElement(elements: Element[], minScore: number = 50): Element | null {
	const leader = elements.reduce<{ element: Element | null; score: number }>(
		(current, candidate) => {
			const candidateScore = this.scoreElement(candidate);
			return candidateScore > current.score
				? { element: candidate, score: candidateScore }
				: current;
		},
		{ element: null, score: 0 }
	);

	if (leader.score > minScore) {
		return leader.element;
	}
	return null;
}

func ScoreAndRemove

func ScoreAndRemove(doc *goquery.Document, debug bool)

ScoreAndRemove scores blocks and removes those that are likely not content. JavaScript original code:

/**
 * Scores the block-level elements of `doc` and removes the ones judged to be non-content.
 *
 * Elements matching the `BLOCK_ELEMENTS` selectors are visited in document order.
 * An element is left alone when `ContentScorer.isLikelyContent` accepts it;
 * otherwise it is removed when `ContentScorer.scoreNonContentBlock` returns a
 * negative score. Removals are collected first and applied after scoring finishes.
 *
 * @param doc - Document to prune in place.
 * @param debug - When true, logs the number of removed blocks and the elapsed time.
 */
public static scoreAndRemove(doc: Document, debug: boolean = false) {
	const startedAt = Date.now();
	const doomed = new Set<Element>();
	let removedCount = 0;

	const selector = BLOCK_ELEMENTS.join(',');
	for (const block of Array.from(doc.querySelectorAll(selector))) {
		// Nothing to do for blocks already queued or recognised as content.
		if (doomed.has(block) || ContentScorer.isLikelyContent(block)) {
			continue;
		}

		// A negative score marks the block as non-content.
		if (ContentScorer.scoreNonContentBlock(block) < 0) {
			doomed.add(block);
			removedCount += 1;
		}
	}

	// Detach everything that was queued, after scoring is complete.
	for (const block of doomed) {
		block.remove();
	}

	const finishedAt = Date.now();
	if (!debug) {
		return;
	}
	console.log('Defuddle', 'Removed non-content blocks:', {
		count: removedCount,
		processingTime: `${(finishedAt - startedAt).toFixed(2)}ms`
	});
}

func ScoreElement

func ScoreElement(element *goquery.Selection) float64

ScoreElement scores an element based on various content indicators. JavaScript original code:

/**
 * Scores an element by how much it looks like main content; higher is better.
 *
 * The score is the sum of:
 * - one point per whitespace-separated word of text content;
 * - +10 per descendant `<p>`;
 * - a penalty of 5 x (links per word) and 3 x (images per word);
 * - +5 when the element is floated or aligned to the right;
 * - +10 each for a date-like string, an author byline, an inline footnote
 *   reference and a footnote list;
 * - +15 when the class attribute contains "content", "article" or "post";
 * - -5 per descendant `<table>`;
 * - +10 for a `<td>` that is neither the first nor the last cell of a table
 *   that looks like a layout table.
 *
 * @param element - Element to score.
 * @returns The accumulated score; it may be fractional or negative.
 */
static scoreElement(element: Element): number {
	let score = 0;

	// Text density. Count runs of non-whitespace rather than splitting on
	// whitespace: split() yields [''] for empty text and extra empty tokens
	// for leading/trailing whitespace, which would inflate the word count and
	// make the `words || 1` guards below unreachable.
	const text = element.textContent || '';
	const words = (text.match(/\S+/g) || []).length;
	score += words;

	// Paragraph ratio
	const paragraphs = element.getElementsByTagName('p').length;
	score += paragraphs * 10;

	// Link density (penalize high link density)
	const links = element.getElementsByTagName('a').length;
	const linkDensity = links / (words || 1);
	score -= linkDensity * 5;

	// Image ratio (penalize high image density)
	const images = element.getElementsByTagName('img').length;
	const imageDensity = images / (words || 1);
	score -= imageDensity * 3;

	// Position bonus (right-floated or right-aligned elements)
	try {
		const style = element.getAttribute('style') || '';
		const align = element.getAttribute('align') || '';
		const isRightSide = style.includes('float: right') ||
			style.includes('text-align: right') ||
			align === 'right';
		if (isRightSide) score += 5;
	} catch (e) {
		// Ignore position if we can't get style
	}

	// Content indicators
	const hasDate = /\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{1,2},?\s+\d{4}\b/i.test(text);
	if (hasDate) score += 10;

	const hasAuthor = /\b(?:by|written by|author:)\s+[A-Za-z\s]+\b/i.test(text);
	if (hasAuthor) score += 10;

	// Check for common content classes/attributes. Read the attribute instead
	// of `className`: on SVG elements `className` is an SVGAnimatedString, so
	// calling toLowerCase() on it would throw.
	const className = (element.getAttribute('class') || '').toLowerCase();
	if (className.includes('content') || className.includes('article') || className.includes('post')) {
		score += 15;
	}

	// Check for footnotes/references
	const hasFootnotes = element.querySelector(FOOTNOTE_INLINE_REFERENCES);
	if (hasFootnotes) score += 10;

	const hasFootnotesList = element.querySelector(FOOTNOTE_LIST_SELECTORS);
	if (hasFootnotesList) score += 10;

	// Check for nested tables (penalize)
	const nestedTables = element.getElementsByTagName('table').length;
	score -= nestedTables * 5;

	// Additional scoring for table cells
	if (element.tagName.toLowerCase() === 'td') {
		// Table cells get a bonus for being in the main content area
		const parentTable = element.closest('table');
		if (parentTable) {
			// Only favor cells in tables that look like old-style content layouts.
			// The width is parsed as decimal explicitly; a non-numeric value gives
			// NaN, which fails the comparison below.
			const tableWidth = parseInt(parentTable.getAttribute('width') || '0', 10);
			const tableAlign = parentTable.getAttribute('align') || '';
			const tableClass = (parentTable.getAttribute('class') || '').toLowerCase();
			const isTableLayout =
				tableWidth > 400 || // Common width for main content tables
				tableAlign === 'center' ||
				tableClass.includes('content') ||
				tableClass.includes('article');

			if (isTableLayout) {
				// Additional checks to ensure this is likely the main content cell:
				// it must be neither the first nor the last <td> under the table.
				const allCells = Array.from(parentTable.getElementsByTagName('td'));
				const cellIndex = allCells.indexOf(element as HTMLTableCellElement);
				const isCenterCell = cellIndex > 0 && cellIndex < allCells.length - 1;

				if (isCenterCell) {
					score += 10;
				}
			}
		}
	}

	return score;
}

Types

type ContentScore

type ContentScore struct {
	// Score is the numeric score assigned to Element.
	Score   float64
	// Element is the selection that was scored.
	Element *goquery.Selection
}

ContentScore represents a scored element. JavaScript original code:

/** Pairs an element with the score assigned to it. */
export interface ContentScore {
  /** Numeric score for `element`. */
  score: number;
  /** The element the score belongs to. */
  element: Element;
}

type ContentScorer

type ContentScorer struct {
	// contains filtered or unexported fields
}

ContentScorer provides content scoring functionality. JavaScript original code:

/**
 * Content scorer that stores a document and a debug flag.
 */
export class ContentScorer {
	/**
	 * @param doc - Document stored on the instance.
	 * @param debug - Debug flag stored on the instance; defaults to `false`.
	 */
	constructor(
		private doc: Document,
		private debug: boolean = false
	) {}
}

func NewContentScorer

func NewContentScorer(doc *goquery.Document, debug bool) *ContentScorer

NewContentScorer creates a new ContentScorer instance.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL