Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var ELEMENT_STANDARDIZATION_RULES = []StandardizationRule{ { Selector: `div[data-testid^="paragraph"], div[role="paragraph"]`, Element: "p", Transform: func(el *goquery.Selection, doc *goquery.Document) *goquery.Selection { html, _ := el.Html() newHtml := "<p" if el.Length() > 0 { node := el.Get(0) for _, attr := range node.Attr { if constants.IsAllowedAttribute(attr.Key) && attr.Key != "role" { newHtml += ` ` + attr.Key + `="` + attr.Val + `"` } } } newHtml += ">" + html + "</p>" el.ReplaceWithHtml(newHtml) return nil }, }, { Selector: `div[role="list"]`, Element: "ul", Transform: transformListElement, }, { Selector: `div[role="listitem"]`, Element: "li", Transform: transformListItemElement, }, }
ELEMENT_STANDARDIZATION_RULES maps selectors to their target HTML element name. JavaScript original code:
const ELEMENT_STANDARDIZATION_RULES: StandardizationRule[] = [
...mathRules,
...codeBlockRules,
...headingRules,
...imageRules,
// Convert divs with paragraph role to actual paragraphs
{
selector: 'div[data-testid^="paragraph"], div[role="paragraph"]',
element: 'p',
transform: (el: Element, doc: Document): Element => { ... }
},
// Convert divs with list roles to actual lists
{
selector: 'div[role="list"]',
element: 'ul',
transform: (el: Element, doc: Document): Element => { ... }
},
{
selector: 'div[role="listitem"]',
element: 'li',
transform: (el: Element, doc: Document): Element => { ... }
}
];
Functions ¶
func StandardizeContent ¶
func StandardizeContent(element *goquery.Selection, metadata *metadata.Metadata, doc *goquery.Document, debug bool)
StandardizeContent standardizes and cleans up the main content element. JavaScript original code:
export function standardizeContent(element: Element, metadata: DefuddleMetadata, doc: Document, debug: boolean = false): void {
standardizeSpaces(element);
// Remove HTML comments
removeHtmlComments(element);
// Handle H1 elements - remove first one and convert others to H2
standardizeHeadings(element, metadata.title, doc);
// Standardize footnotes and citations
standardizeFootnotes(element);
// Convert embedded content to standard formats
standardizeElements(element, doc);
// If not debug mode, do the full cleanup
if (!debug) {
// First pass of div flattening
flattenWrapperElements(element, doc);
// Strip unwanted attributes
stripUnwantedAttributes(element, debug);
// Remove empty elements
removeEmptyElements(element);
// Remove trailing headings
removeTrailingHeadings(element);
// Final pass of div flattening after cleanup operations
flattenWrapperElements(element, doc);
// Standardize consecutive br elements
stripExtraBrElements(element);
// Clean up empty lines
removeEmptyLines(element, doc);
} else {
// In debug mode, still do basic cleanup but preserve structure
stripUnwantedAttributes(element, debug);
removeTrailingHeadings(element);
stripExtraBrElements(element);
logDebug('Debug mode: Skipping div flattening to preserve structure');
}
}
Types ¶
type StandardizationRule ¶
// StandardizationRule represents one element standardization rule: which
// elements to match, which HTML element they should become, and the function
// that performs the conversion.
type StandardizationRule struct {
// Selector is the selector string for elements the rule applies to,
// e.g. `div[role="list"]`.
Selector string
// Element is the name of the target HTML element, e.g. "ul".
Element string
// Transform receives the matched selection and the owning document and
// returns a selection.
// NOTE(review): the TypeScript original declares transform as optional, so a
// nil Transform is presumably permitted here - confirm against the caller.
Transform func(el *goquery.Selection, doc *goquery.Document) *goquery.Selection
}
StandardizationRule represents element standardization rules. JavaScript original code:
interface StandardizationRule {
selector: string;
element: string;
transform?: (el: Element, doc: Document) => Element;
}
Click to show internal directories.
Click to hide internal directories.