Documentation
¶
Overview ¶
Package constants provides configuration constants and selectors for the defuddle content extraction system. It includes CSS selectors for finding main content, removing unwanted elements, and processing footnotes.
Index ¶
- Constants
- Variables
- func GetAllowedEmptyElements() []string
- func GetBlockElements() []string
- func GetEntryPointElements() []string
- func GetExactSelectors() []string
- func GetFootnoteInlineReferences() []string
- func GetFootnoteListSelectors() []string
- func GetInlineElements() []string
- func GetMobileWidth() int
- func GetPartialSelectors() []string
- func GetTestAttributes() []string
- func IsAllowedAttribute(attrName string) bool
- func IsAllowedAttributeDebug(attrName string) bool
- func IsAllowedEmptyElement(tagName string) bool
- func IsInlineElement(tagName string) bool
- func IsPreserveElement(tagName string) bool
Constants ¶
const MobileWidth = 600
MobileWidth is the width threshold for mobile styles. JavaScript original code: export const MOBILE_WIDTH = 600;
Variables ¶
var AllowedAttributes = map[string]bool{ "alt": true, "allow": true, "allowfullscreen": true, "aria-label": true, "checked": true, "colspan": true, "controls": true, "data-latex": true, "data-src": true, "data-srcset": true, "data-lang": true, "dir": true, "display": true, "frameborder": true, "headers": true, "height": true, "href": true, "lang": true, "role": true, "rowspan": true, "src": true, "srcset": true, "title": true, "type": true, "width": true, "accent": true, "accentunder": true, "align": true, "columnalign": true, "columnlines": true, "columnspacing": true, "columnspan": true, "data-mjx-texclass": true, "depth": true, "displaystyle": true, "fence": true, "frame": true, "framespacing": true, "linethickness": true, "lspace": true, "mathsize": true, "mathvariant": true, "maxsize": true, "minsize": true, "movablelimits": true, "notation": true, "rowalign": true, "rowlines": true, "rowspacing": true, "rspace": true, "scriptlevel": true, "separator": true, "stretchy": true, "symmetric": true, "voffset": true, "xmlns": true, }
AllowedAttributes are attributes to keep. JavaScript original code: export const ALLOWED_ATTRIBUTES = new Set([
'alt', 'allow', 'allowfullscreen', 'aria-label', 'checked', 'colspan', 'controls', 'data-latex', 'data-src', 'data-srcset', 'data-lang', 'dir', 'display', 'frameborder', 'headers', 'height', 'href', 'lang', 'role', 'rowspan', 'src', 'srcset', 'title', 'type', 'width',
// MathML attributes
'accent', 'accentunder', 'align', 'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'data-mjx-texclass', 'depth', 'displaystyle', 'fence', 'frame', 'framespacing', 'linethickness', 'lspace', 'mathsize', 'mathvariant', 'maxsize', 'minsize', 'movablelimits', 'notation', 'rowalign', 'rowlines', 'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'separator', 'stretchy', 'symmetric', 'voffset', 'xmlns'
]);
var AllowedAttributesDebug = map[string]bool{ "class": true, "id": true, }
AllowedAttributesDebug are additional attributes to keep in debug mode. JavaScript original code: export const ALLOWED_ATTRIBUTES_DEBUG = new Set([
'class', 'id',
]);
var AllowedEmptyElements = map[string]bool{ "area": true, "audio": true, "base": true, "br": true, "circle": true, "col": true, "defs": true, "ellipse": true, "embed": true, "figure": true, "g": true, "hr": true, "iframe": true, "img": true, "input": true, "line": true, "link": true, "mask": true, "meta": true, "object": true, "param": true, "path": true, "pattern": true, "picture": true, "polygon": true, "polyline": true, "rect": true, "source": true, "stop": true, "svg": true, "td": true, "th": true, "track": true, "use": true, "video": true, "wbr": true, }
AllowedEmptyElements are elements that are allowed to be empty. These are not removed even if they have no content. JavaScript original code: export const ALLOWED_EMPTY_ELEMENTS = new Set([
'area', 'audio', 'base', 'br', 'circle', 'col', 'defs', 'ellipse', 'embed', 'figure', 'g', 'hr', 'iframe', 'img', 'input', 'line', 'link', 'mask', 'meta', 'object', 'param', 'path', 'pattern', 'picture', 'polygon', 'polyline', 'rect', 'source', 'stop', 'svg', 'td', 'th', 'track', 'use', 'video', 'wbr'
]);
var BlockElements = []string{
"div", "section", "article", "main", "aside", "header", "footer", "nav", "content",
}
BlockElements are HTML block-level elements. JavaScript original code: export const BLOCK_ELEMENTS = ['div', 'section', 'article', 'main', 'aside', 'header', 'footer', 'nav', 'content'];
var EntryPointElements = []string{
"#post",
".post-content",
".article-content",
"#article-content",
".article_post",
".article-wrapper",
".entry-content",
".content-article",
".post",
".markdown-body",
"article",
`[role="article"]`,
"main",
`[role="main"]`,
"body",
}
EntryPointElements are the elements that will be used to find the main content. JavaScript original code: export const ENTRY_POINT_ELEMENTS = [
'#post', '.post-content', '.article-content', '#article-content', '.article_post', '.article-wrapper', '.entry-content', '.content-article', '.post', '.markdown-body', 'article', '[role="article"]', 'main', '[role="main"]', 'body' // ensures there is always a match
];
var ExactSelectors = []string{}/* 144 elements not displayed */
ExactSelectors are selectors to be removed exactly. JavaScript original code: (first part of EXACT_SELECTORS array)
var FootnoteInlineReferences = []string{
"sup.reference",
"cite.ltx_cite",
`sup[id^="fnr"]`,
`span[id^="fnr"]`,
`span[class*="footnote_ref"]`,
"span.footnote-link",
"a.citation",
`a[id^="ref-link"]`,
`a[href^="#fn"]`,
`a[href^="#cite"]`,
`a[href^="#reference"]`,
`a[href^="#footnote"]`,
`a[href^="#r"]`,
`a[href^="#b"]`,
`a[href*="cite_note"]`,
`a[href*="cite_ref"]`,
"a.footnote-anchor",
"span.footnote-hovercard-target a",
`a[role="doc-biblioref"]`,
`a[id^="fnref"]`,
`a[id^="ref-link"]`,
}
FootnoteInlineReferences are selectors for footnotes and citations. JavaScript original code: export const FOOTNOTE_INLINE_REFERENCES = [
'sup.reference', 'cite.ltx_cite', 'sup[id^="fnr"]', 'span[id^="fnr"]', 'span[class*="footnote_ref"]', 'span.footnote-link', 'a.citation', 'a[id^="ref-link"]', 'a[href^="#fn"]', 'a[href^="#cite"]', 'a[href^="#reference"]', 'a[href^="#footnote"]',
'a[href^="#r"]', // Common in academic papers
'a[href^="#b"]', // Common for bibliography references
'a[href*="cite_note"]', 'a[href*="cite_ref"]',
'a.footnote-anchor', // Substack
'span.footnote-hovercard-target a', // Substack
'a[role="doc-biblioref"]', // Science.org
'a[id^="fnref"]',
'a[id^="ref-link"]', // Nature.com
].join(',');
var FootnoteListSelectors = []string{
"div.footnote ol",
"div.footnotes ol",
`div[role="doc-endnotes"]`,
`div[role="doc-footnotes"]`,
"ol.footnotes-list",
"ol.footnotes",
"ol.references",
`ol[class*="article-references"]`,
"section.footnotes ol",
`section[role="doc-endnotes"]`,
`section[role="doc-footnotes"]`,
`section[role="doc-bibliography"]`,
"ul.footnotes-list",
"ul.ltx_biblist",
`div.footnote[data-component-name="FootnoteToDOM"]`,
}
FootnoteListSelectors are selectors for footnote lists. JavaScript original code: export const FOOTNOTE_LIST_SELECTORS = [
'div.footnote ol', 'div.footnotes ol', 'div[role="doc-endnotes"]', 'div[role="doc-footnotes"]', 'ol.footnotes-list', 'ol.footnotes', 'ol.references', 'ol[class*="article-references"]', 'section.footnotes ol', 'section[role="doc-endnotes"]', 'section[role="doc-footnotes"]', 'section[role="doc-bibliography"]', 'ul.footnotes-list', 'ul.ltx_biblist', 'div.footnote[data-component-name="FootnoteToDOM"]' // Substack
].join(',');
var InlineElements = map[string]bool{ "a": true, "span": true, "strong": true, "em": true, "i": true, "b": true, "u": true, "code": true, "br": true, "small": true, "sub": true, "sup": true, "mark": true, "date": true, "del": true, "ins": true, "q": true, "abbr": true, "cite": true, "relative-time": true, "time": true, "font": true, }
InlineElements are inline elements that should not be unwrapped. JavaScript original code: export const INLINE_ELEMENTS = new Set([
'a', 'span', 'strong', 'em', 'i', 'b', 'u', 'code', 'br', 'small', 'sub', 'sup', 'mark', 'date', 'del', 'ins', 'q', 'abbr', 'cite', 'relative-time', 'time', 'font'
]);
var PartialSelectors = []string{}/* 475 elements not displayed */
PartialSelectors are removal patterns tested against the attributes listed in TestAttributes. Matching is case-insensitive, and partial matches are allowed. JavaScript original code: (first part of PARTIAL_SELECTORS array)
var PreserveElements = map[string]bool{ "pre": true, "code": true, "table": true, "thead": true, "tbody": true, "tr": true, "td": true, "th": true, "ul": true, "ol": true, "li": true, "dl": true, "dt": true, "dd": true, "figure": true, "figcaption": true, "picture": true, "details": true, "summary": true, "blockquote": true, "form": true, "fieldset": true, }
PreserveElements are elements that should not be unwrapped. JavaScript original code: export const PRESERVE_ELEMENTS = new Set([
'pre', 'code', 'table', 'thead', 'tbody', 'tr', 'td', 'th', 'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'figure', 'figcaption', 'picture', 'details', 'summary', 'blockquote', 'form', 'fieldset'
]);
var TestAttributes = []string{
"class",
"id",
"data-test",
"data-testid",
"data-test-id",
"data-qa",
"data-cy",
}
TestAttributes are attributes to test against for partial matches. JavaScript original code: export const TEST_ATTRIBUTES = [
'class', 'id', 'data-test', 'data-testid', 'data-test-id', 'data-qa', 'data-cy'
];
Functions ¶
func GetAllowedEmptyElements ¶
func GetAllowedEmptyElements() []string
GetAllowedEmptyElements returns a slice of allowed empty element names
func GetBlockElements ¶
func GetBlockElements() []string
GetBlockElements returns the block elements slice
func GetEntryPointElements ¶
func GetEntryPointElements() []string
GetEntryPointElements returns the entry point elements slice
func GetExactSelectors ¶
func GetExactSelectors() []string
GetExactSelectors returns the exact selectors slice
func GetFootnoteInlineReferences ¶
func GetFootnoteInlineReferences() []string
GetFootnoteInlineReferences returns the footnote inline reference selectors
func GetFootnoteListSelectors ¶
func GetFootnoteListSelectors() []string
GetFootnoteListSelectors returns the footnote list selectors
func GetInlineElements ¶
func GetInlineElements() []string
GetInlineElements returns a slice of inline element names
func GetPartialSelectors ¶
func GetPartialSelectors() []string
GetPartialSelectors returns the partial selectors slice
func GetTestAttributes ¶
func GetTestAttributes() []string
GetTestAttributes returns the test attributes slice
func IsAllowedAttribute ¶
func IsAllowedAttribute(attrName string) bool
IsAllowedAttribute checks if an attribute is allowed
func IsAllowedAttributeDebug ¶
func IsAllowedAttributeDebug(attrName string) bool
IsAllowedAttributeDebug checks if an attribute is allowed in debug mode
func IsAllowedEmptyElement ¶
func IsAllowedEmptyElement(tagName string) bool
IsAllowedEmptyElement checks if an element is allowed to be empty
func IsInlineElement ¶
func IsInlineElement(tagName string) bool
IsInlineElement checks if an element is inline
func IsPreserveElement ¶
func IsPreserveElement(tagName string) bool
IsPreserveElement checks if an element should be preserved
Types ¶
This section is empty.