standardize

package
v0.1.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 29, 2025 License: MIT Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ELEMENT_STANDARDIZATION_RULES lists the role-based conversions applied to
// content: each rule pairs a selector with the HTML element its matches are
// turned into, plus the transform that performs the conversion.
var ELEMENT_STANDARDIZATION_RULES = []StandardizationRule{

	// Convert divs with a paragraph role (or paragraph test id) to real <p> elements.
	{
		Selector: `div[data-testid^="paragraph"], div[role="paragraph"]`,
		Element:  "p",
		// Transform replaces each matched div with a <p> that keeps the div's
		// children and its allowed attributes (minus "role"). The replacement
		// is done directly in the DOM, so the function always returns nil.
		// doc is unused.
		Transform: func(el *goquery.Selection, doc *goquery.Document) *goquery.Selection {
			el.Each(func(_ int, div *goquery.Selection) {
				node := div.Get(0)
				if node.Parent == nil {
					// Detached node: there is no tree position to put a replacement in.
					return
				}

				// Create the empty <p> right before the div, then pick it up
				// as a selection so it can be filled through the DOM API.
				div.BeforeHtml("<p></p>")
				para := div.Prev()
				if para.Length() == 0 || para.Get(0) != node.PrevSibling || para.Get(0).Data != "p" {
					// The parser did not yield a <p> immediately before the
					// div (unusual parent context); leave the div untouched.
					return
				}

				// Copy attributes with SetAttr rather than string-building the
				// tag: attr.Val is already unescaped, so concatenating it into
				// markup would break on values containing quotes, '&' or '<'.
				for _, attr := range node.Attr {
					if constants.IsAllowedAttribute(attr.Key) && attr.Key != "role" {
						para.SetAttr(attr.Key, attr.Val)
					}
				}

				// Move the children instead of serializing and re-parsing them,
				// which keeps node identity and avoids the ignored Html() error.
				para.AppendSelection(div.Contents())
				div.Remove()
			})

			return nil
		},
	},

	// Convert divs with list roles to real list elements.
	{
		Selector:  `div[role="list"]`,
		Element:   "ul",
		Transform: transformListElement,
	},
	{
		Selector:  `div[role="listitem"]`,
		Element:   "li",
		Transform: transformListItemElement,
	},
}

ELEMENT_STANDARDIZATION_RULES maps selectors to their target HTML element names. JavaScript original code:

// Ordered list of element standardization rules: the spread-in math, code
// block, heading and image rule sets come first, followed by the three
// role-based rules below. Transform bodies are elided in this excerpt.
const ELEMENT_STANDARDIZATION_RULES: StandardizationRule[] = [
	...mathRules,
	...codeBlockRules,
	...headingRules,
	...imageRules,
	// Convert divs with paragraph role to actual paragraphs
	{
		selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
		element: 'p',
		transform: (el: Element, doc: Document): Element => { ... }
	},
	// Convert divs with list roles to actual lists
	{
		selector: 'div[role="list"]',
		element: 'ul',
		transform: (el: Element, doc: Document): Element => { ... }
	},
	// Convert divs with listitem role to actual list items
	{
		selector: 'div[role="listitem"]',
		element: 'li',
		transform: (el: Element, doc: Document): Element => { ... }
	}
];

Functions

func StandardizeContent

func StandardizeContent(element *goquery.Selection, metadata *metadata.Metadata, doc *goquery.Document, debug bool)

StandardizeContent standardizes and cleans up the main content element. JavaScript original code:

/**
 * Standardizes and cleans up the main content element.
 *
 * Always runs, in order: space standardization, HTML comment removal,
 * heading standardization (using `metadata.title`), footnote
 * standardization and element standardization. The remaining cleanup
 * depends on `debug` (see below). Returns nothing; every helper is invoked
 * on `element` directly.
 *
 * @param element  Root content element to clean up.
 * @param metadata Page metadata; only `title` is read here.
 * @param doc      Document passed through to the helpers that take one.
 * @param debug    When true, wrapper flattening, empty-element removal and
 *                 empty-line removal are skipped; attribute stripping,
 *                 trailing-heading removal and br cleanup still run.
 *                 Defaults to false.
 */
export function standardizeContent(element: Element, metadata: DefuddleMetadata, doc: Document, debug: boolean = false): void {
	standardizeSpaces(element);

	// Remove HTML comments
	removeHtmlComments(element);

	// Handle H1 elements - remove first one and convert others to H2
	standardizeHeadings(element, metadata.title, doc);

	// Standardize footnotes and citations
	standardizeFootnotes(element);

	// Convert embedded content to standard formats
	standardizeElements(element, doc);

	// If not debug mode, do the full cleanup
	if (!debug) {
		// First pass of div flattening
		flattenWrapperElements(element, doc);

		// Strip unwanted attributes
		stripUnwantedAttributes(element, debug);

		// Remove empty elements
		removeEmptyElements(element);

		// Remove trailing headings
		removeTrailingHeadings(element);

		// Final pass of div flattening after cleanup operations
		flattenWrapperElements(element, doc);

		// Standardize consecutive br elements
		stripExtraBrElements(element);

		// Clean up empty lines
		removeEmptyLines(element, doc);
	} else {
		// In debug mode, still do basic cleanup but preserve structure
		stripUnwantedAttributes(element, debug);
		removeTrailingHeadings(element);
		stripExtraBrElements(element);
		logDebug('Debug mode: Skipping div flattening to preserve structure');
	}
}

Types

type StandardizationRule

// StandardizationRule describes one element conversion: elements matching
// Selector are turned into Element, optionally via Transform.
type StandardizationRule struct {
	// Selector is the selector string used to find the elements to convert
	// (the rules in ELEMENT_STANDARDIZATION_RULES use CSS selectors).
	Selector  string
	// Element is the tag name of the target HTML element, e.g. "p" or "ul".
	Element   string
	// Transform performs the conversion for one matched selection.
	// NOTE(review): the TypeScript original marks transform as optional, so a
	// nil Transform presumably means "no custom transform" — confirm against
	// the caller. The meaning of the returned selection is also caller-defined;
	// the paragraph rule in this package returns nil.
	Transform func(el *goquery.Selection, doc *goquery.Document) *goquery.Selection
}

StandardizationRule represents an element standardization rule. JavaScript original code:

// One element standardization rule: which elements to match, the tag name
// they are converted to, and an optional custom conversion.
interface StandardizationRule {
	// Selector string for the elements this rule applies to.
	selector: string;
	// Tag name of the target element (e.g. 'p', 'ul', 'li').
	element: string;
	// Optional conversion taking the matched element and its document and
	// returning an Element.
	transform?: (el: Element, doc: Document) => Element;
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL