Documentation
¶
Index ¶
- Variables
- func BuildAllExtractorsMap(extractors []*CustomExtractor) map[string]*CustomExtractor
- func CountCustomExtractors() int
- func DeobfuscateAbendblattText(selection *goquery.Selection) *goquery.Selection
- func GetAllCustomExtractors() map[string]*CustomExtractor
- func GetAllCustomExtractorsList() []string
- func GetCustomExtractorDomains() []string
- func MergeSupportedDomains(extractor *CustomExtractor) map[string]*CustomExtractor
- type ContentExtractor
- type CustomExtractor
- func GetABCNewsExtractor() *CustomExtractor
- func GetArstechnicaComExtractor() *CustomExtractor
- func GetBiorxivOrgExtractor() *CustomExtractor
- func GetBloggerExtractor() *CustomExtractor
- func GetBlogspotExtractor() *CustomExtractor
- func GetBloombergExtractor() *CustomExtractor
- func GetBookwalkerJpExtractor() *CustomExtractor
- func GetBustleExtractor() *CustomExtractor
- func GetBuzzFeedExtractor() *CustomExtractor
- func GetBuzzapJpExtractor() *CustomExtractor
- func GetCNBCExtractor() *CustomExtractor
- func GetCNNExtractor() *CustomExtractor
- func GetChicagoTribuneExtractor() *CustomExtractor
- func GetClinicaltrialsGovExtractor() *CustomExtractor
- func GetCustomExtractorByDomain(domain string) (*CustomExtractor, bool)
- func GetDaringFireballExtractor() *CustomExtractor
- func GetDeadlineExtractor() *CustomExtractor
- func GetDeadspinComExtractor() *CustomExtractor
- func GetEOnlineExtractor() *CustomExtractor
- func GetEpaperZeitDeExtractor() *CustomExtractor
- func GetFandomWikiaExtractor() *CustomExtractor
- func GetFortuneComExtractor() *CustomExtractor
- func GetGeniusExtractor() *CustomExtractor
- func GetGetnewsJpExtractor() *CustomExtractor
- func GetGithubComExtractor() *CustomExtractor
- func GetGothamistComExtractor() *CustomExtractor
- func GetHuffingtonPostExtractor() *CustomExtractor
- func GetIciRadioCanadaCaExtractor() *CustomExtractor
- func GetJapanCnetComExtractor() *CustomExtractor
- func GetJapanZdnetComExtractor() *CustomExtractor
- func GetJvndbJvnJpExtractor() *CustomExtractor
- func GetLATimesExtractor() *CustomExtractor
- func GetLinkedInExtractor() *CustomExtractor
- func GetLittleThingsExtractor() *CustomExtractor
- func GetMaTtiasBeExtractor() *CustomExtractor
- func GetMashableComExtractor() *CustomExtractor
- func GetMediumExtractor() *CustomExtractor
- func GetMediumExtractorFixed() *CustomExtractor
- func GetMiamiHeraldExtractor() *CustomExtractor
- func GetMoneyCNNExtractor() *CustomExtractor
- func GetNBCNewsExtractor() *CustomExtractor
- func GetNPRExtractor() *CustomExtractor
- func GetNYDailyNewsExtractor() *CustomExtractor
- func GetNYMagExtractor() *CustomExtractor
- func GetNYTimesExtractor() *CustomExtractor
- func GetNewYorkerExtractor() *CustomExtractor
- func GetNewsMynaviJpExtractor() *CustomExtractor
- func GetNewsNationalgeographicComExtractor() *CustomExtractor
- func GetPastebinExtractor() *CustomExtractor
- func GetPeopleExtractor() *CustomExtractor
- func GetPhpspotOrgExtractor() *CustomExtractor
- func GetPitchforkExtractor() *CustomExtractor
- func GetPoliticoExtractor() *CustomExtractor
- func GetPolygonExtractor() *CustomExtractor
- func GetPopSugarExtractor() *CustomExtractor
- func GetQdailyExtractor() *CustomExtractor
- func GetRedditExtractor() *CustomExtractor
- func GetReutersExtractor() *CustomExtractor
- func GetRollingStoneExtractor() *CustomExtractor
- func GetScanNetsecurityNeJpExtractor() *CustomExtractor
- func GetScienceflyComExtractor() *CustomExtractor
- func GetSectIijAdJpExtractor() *CustomExtractor
- func GetTMZExtractor() *CustomExtractor
- func GetTakagihiromitsuJpExtractor() *CustomExtractor
- func GetTechlogIijAdJpExtractor() *CustomExtractor
- func GetTheAtlanticExtractor() *CustomExtractor
- func GetTheGuardianExtractor() *CustomExtractor
- func GetThoughtCatalogExtractor() *CustomExtractor
- func GetTimesofindiaIndiatimesComExtractor() *CustomExtractor
- func GetTwitterExtractor() *CustomExtractor
- func GetTwofortysevensportsComExtractor() *CustomExtractor
- func GetUSMagazineExtractor() *CustomExtractor
- func GetUproxxExtractor() *CustomExtractor
- func GetVoxExtractor() *CustomExtractor
- func GetWashingtonPostExtractor() *CustomExtractor
- func GetWeeklyAsciiJpExtractor() *CustomExtractor
- func GetWikipediaExtractor() *CustomExtractor
- func GetWiredJpExtractor() *CustomExtractor
- func GetWwwAbendblattDeExtractor() *CustomExtractor
- func GetWwwAlComExtractor() *CustomExtractor
- func GetWwwAmericanowComExtractor() *CustomExtractor
- func GetWwwAndroidcentralComExtractor() *CustomExtractor
- func GetWwwAolComExtractor() *CustomExtractor
- func GetWwwApartmenttherapyComExtractor() *CustomExtractor
- func GetWwwAsahiComExtractor() *CustomExtractor
- func GetWwwBroadwayworldComExtractor() *CustomExtractor
- func GetWwwCbcCaExtractor() *CustomExtractor
- func GetWwwCbssportsComExtractor() *CustomExtractor
- func GetWwwCnetComExtractor() *CustomExtractor
- func GetWwwDmagazineComExtractor() *CustomExtractor
- func GetWwwElecomCoJpExtractor() *CustomExtractor
- func GetWwwEngadgetComExtractor() *CustomExtractor
- func GetWwwFastcompanyComExtractor() *CustomExtractor
- func GetWwwFoolComExtractor() *CustomExtractor
- func GetWwwFortinetComExtractor() *CustomExtractor
- func GetWwwGizmodoJpExtractor() *CustomExtractor
- func GetWwwGrueneDeExtractor() *CustomExtractor
- func GetWwwInfoqComExtractor() *CustomExtractor
- func GetWwwInquisitrComExtractor() *CustomExtractor
- func GetWwwIpaGoJpExtractor() *CustomExtractor
- func GetWwwItmediaCoJpExtractor() *CustomExtractor
- func GetWwwJnsaOrgExtractor() *CustomExtractor
- func GetWwwLadbibleComExtractor() *CustomExtractor
- func GetWwwLemondeFrExtractor() *CustomExtractor
- func GetWwwLifehackerJpExtractor() *CustomExtractor
- func GetWwwMacrumorsComExtractor() *CustomExtractor
- func GetWwwMentalflossComExtractor() *CustomExtractor
- func GetWwwMoongiftJpExtractor() *CustomExtractor
- func GetWwwMsnComExtractor() *CustomExtractor
- func GetWwwNationalgeographicComExtractor() *CustomExtractor
- func GetWwwNdtvComExtractor() *CustomExtractor
- func GetWwwOpposingviewsComExtractor() *CustomExtractor
- func GetWwwOssnewsJpExtractor() *CustomExtractor
- func GetWwwPhoronixComExtractor() *CustomExtractor
- func GetWwwProspectmagazineCoUkExtractor() *CustomExtractor
- func GetWwwPublickey1JpExtractor() *CustomExtractor
- func GetWwwRawstoryComExtractor() *CustomExtractor
- func GetWwwRbbtodayComExtractor() *CustomExtractor
- func GetWwwRockpapershotgunComExtractor() *CustomExtractor
- func GetWwwSbnationComExtractor() *CustomExtractor
- func GetWwwSiComExtractor() *CustomExtractor
- func GetWwwSlateComExtractor() *CustomExtractor
- func GetWwwSpektrumDeExtractor() *CustomExtractor
- func GetWwwThevergeComExtractor() *CustomExtractor
- func GetWwwTodayComExtractor() *CustomExtractor
- func GetWwwWesternjournalismComExtractor() *CustomExtractor
- func GetWwwWiredComExtractor() *CustomExtractor
- func GetWwwYahooComExtractor() *CustomExtractor
- func GetWwwYomiuriCoJpExtractor() *CustomExtractor
- func GetYouTubeExtractor() *CustomExtractor
- type ExtractorFactory
- type ExtractorOptions
- type ExtractorRegistry
- type FieldExtractor
- type FunctionTransform
- type RegistryManager
- func (rm *RegistryManager) Clear()
- func (rm *RegistryManager) Clone() *RegistryManager
- func (rm *RegistryManager) Count() (int, int)
- func (rm *RegistryManager) GetAll() map[string]*CustomExtractor
- func (rm *RegistryManager) GetBaseDomain(hostname string) string
- func (rm *RegistryManager) GetByDomain(domain string) (*CustomExtractor, bool)
- func (rm *RegistryManager) GetByDomainWithFallback(hostname string) (*CustomExtractor, bool)
- func (rm *RegistryManager) GetByHTML(doc *goquery.Document) *CustomExtractor
- func (rm *RegistryManager) GetDomainMapping() map[string]*CustomExtractor
- func (rm *RegistryManager) ListDomains() []string
- func (rm *RegistryManager) ListPrimaryDomains() []string
- func (rm *RegistryManager) Register(extractor *CustomExtractor) error
- func (rm *RegistryManager) RegisterFactory(domain string, factory ExtractorFactory) error
- func (rm *RegistryManager) RegisterHTMLDetector(selector string, extractor *CustomExtractor) error
- func (rm *RegistryManager) Remove(domain string) bool
- type SelectorEntry
- type StringTransform
- type TransformFunction
Constants ¶
This section is empty.
Variables ¶
var ArstechnicaComExtractor = &CustomExtractor{ Domain: "arstechnica.com", Title: &FieldExtractor{ Selectors: []interface{}{"title"}, }, Author: &FieldExtractor{ Selectors: []interface{}{ "*[rel=\"author\"] *[itemprop=\"name\"]", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{".byline time", "datetime"}, }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ "h2[itemprop=\"description\"]", }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "div[itemprop=\"articleBody\"]", }, }, Transforms: map[string]TransformFunction{ "h2": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { selection.BeforeHtml("<p></p>") return nil }, }, }, Clean: []string{ "figcaption .enlarge-link", "figcaption .sep", "figure.video", ".gallery", "aside", ".sidebar", }, }, }
ArstechnicaComExtractor provides the custom extraction rules for arstechnica.com. JavaScript equivalent: export const ArstechnicaComExtractor = { ... }
// BiorxivOrgExtractor defines the field selectors used for biorxiv.org
// pages. Only title, author and content are configured.
var BiorxivOrgExtractor = &CustomExtractor{
	Domain: "biorxiv.org",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1#page-title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div#abstract-1"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
BiorxivOrgExtractor provides the custom extraction rules for biorxiv.org. JavaScript equivalent: export const BiorxivOrgExtractor = { ... }
// BloggerCustomExtractor defines the field selectors used for Blogger
// pages: blogspot.com plus the regional blogspot domains listed in
// SupportedDomains.
//
// NOTE(review): BlogspotCustomExtractor declares the same Domain
// ("blogspot.com") with the same selectors — confirm which of the two is
// meant to be looked up for that domain.
var BloggerCustomExtractor = &CustomExtractor{
	Domain: "blogspot.com",
	SupportedDomains: []string{
		"www.blogspot.com",
		"blogspot.co.uk",
		"blogspot.ca",
		"blogspot.de",
		"blogspot.fr",
		"blogspot.jp",
		"blogspot.in",
		"blogspot.com.au",
		"blogspot.com.br",
		"blogspot.mx",
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{".post h2.title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{".post-author-name"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"span.publishdate"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".post-content noscript"},
		},
		Transforms: map[string]TransformFunction{
			"noscript": &StringTransform{
				TargetTag: "div",
			},
		},
		Clean: []string{},
	},
}
BloggerCustomExtractor provides the custom extraction rules for Blogger/Blogspot. JavaScript equivalent: export const BloggerExtractor = { ... }
// BlogspotCustomExtractor defines the field selectors used for
// blogspot.com pages. LeadImageURL, Dek, NextPageURL and Excerpt are left
// nil.
var BlogspotCustomExtractor = &CustomExtractor{
	Domain: "blogspot.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{".post h2.title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{".post-author-name"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"span.publishdate"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".post-content noscript"},
		},
		Transforms: map[string]TransformFunction{
			"noscript": &StringTransform{TargetTag: "div"},
		},
		Clean: []string{},
	},
}
BlogspotCustomExtractor provides the custom extraction rules for blogspot.com. JavaScript equivalent: export const BloggerExtractor = { ... }
// BookwalkerJpExtractor defines the field selectors used for
// bookwalker.jp pages. Dates are configured with the Asia/Tokyo timezone;
// Dek is left nil.
var BookwalkerJpExtractor = &CustomExtractor{
	Domain: "bookwalker.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.p-main__title",
			"h1.main-heading",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"div.p-author__list",
			"div.authors",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"dl.p-information__data dd:nth-of-type(7)",
			".work-info .work-detail:first-of-type .work-detail-contents:last-of-type",
		},
		Timezone: "Asia/Tokyo",
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.p-main__information",
				[]interface{}{"div.main-info", "div.main-cover-inner"},
			},
		},
		DefaultCleaner: false,
		Transforms:     map[string]TransformFunction{},
		Clean: []string{
			"span.label.label--trial",
			"dt.info-head.info-head--coin",
			"dd.info-contents.info-contents--coin",
			"div.info-notice.fn-toggleClass",
		},
	},
}
BookwalkerJpExtractor provides the custom extraction rules for bookwalker.jp. JavaScript equivalent: export const BookwalkerJpExtractor = { ... }
// BustleCustomExtractor defines the field selectors used for
// www.bustle.com pages. Dek, NextPageURL and Excerpt are present but carry
// empty selector lists.
var BustleCustomExtractor = &CustomExtractor{
	Domain: "www.bustle.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
			"h1.post-page__title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			`a[href*="profile"]`,
			"div.content-meta__author",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{"time", "datetime"}},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"article",
				".post-page__body",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},

	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
BustleCustomExtractor provides the custom extraction rules for www.bustle.com. JavaScript equivalent: export const WwwBustleComExtractor = { ... }
// BuzzFeedCustomExtractor defines the field selectors used for
// www.buzzfeed.com and www.buzzfeednews.com pages. NextPageURL and Excerpt
// are left nil.
var BuzzFeedCustomExtractor = &CustomExtractor{
	Domain: "www.buzzfeed.com",
	SupportedDomains: []string{
		"www.buzzfeednews.com",
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.embed-headline-title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			`a[data-action="user/username"]`,
			// Class selector. The previous value, "byline__author", had no
			// leading dot and was therefore a type selector for an element
			// named <byline__author>, which cannot match an HTML page.
			".byline__author",
			[]string{`meta[name="author"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{"time[datetime]", "datetime"}},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{".embed-headline-description"},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				[]string{`div[class^="featureimage_featureImageWrapper"]`, ".js-subbuzz-wrapper"},
				[]string{".js-subbuzz-wrapper"},
			},
			// NOTE(review): set on the inner FieldExtractor here, whereas
			// BookwalkerJpExtractor and BuzzapJpExtractor set it on the
			// ContentExtractor — confirm which one the cleaner reads.
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{
			"h2": &StringTransform{
				TargetTag: "b",
			},
			"div.longform_custom_header_media": &FunctionTransform{
				Fn: transformBuzzFeedHeaderMedia,
			},
			"figure.longform_custom_header_media .longform_header_image_source": &StringTransform{
				TargetTag: "figcaption",
			},
		},
		Clean: []string{
			".instapaper_ignore",
			".suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline",
			".share-box",
			".print",
			".js-inline-share-bar",
			".js-ad-placement",
		},
	},

	NextPageURL: nil,
	Excerpt:     nil,
}
BuzzFeedCustomExtractor provides the custom extraction rules for www.buzzfeed.com. JavaScript equivalent: export const BuzzfeedExtractor = { ... }
// BuzzapJpExtractor defines the field selectors used for buzzap.jp pages.
// Author and Dek are left nil.
var BuzzapJpExtractor = &CustomExtractor{
	Domain: "buzzap.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.entry-title"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{"time.entry-date", "datetime"}},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.ctiframe"},
		},
		DefaultCleaner: false,
		Transforms:     map[string]TransformFunction{},
		Clean:          []string{},
	},
}
BuzzapJpExtractor provides the custom extraction rules for buzzap.jp. JavaScript equivalent: export const BuzzapJpExtractor = { ... }
// ClinicaltrialsGovExtractor defines the field selectors used for
// clinicaltrials.gov pages.
var ClinicaltrialsGovExtractor = &CustomExtractor{
	Domain: "clinicaltrials.gov",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.tr-solo_record"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{"div#sponsor.tr-info-text"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			`div:has(> span.term[data-term="Last Update Posted"])`,
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div#tab-body"},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".usa-alert> img",
		},
	},
}
ClinicaltrialsGovExtractor provides the custom extraction rules for clinicaltrials.gov. JavaScript equivalent: export const ClinicaltrialsGovExtractor = { ... }
var DaringFireballExtractor = &CustomExtractor{ Domain: "daringfireball.net", SupportedDomains: []string{ "www.daringfireball.net", }, Title: &FieldExtractor{ Selectors: []interface{}{ "title", "h1", "h2.entry-title", "h1.entry-title", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ "[name='author']", ".author", ".byline", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"time", "datetime"}, []string{"[datetime]", "datetime"}, "p.smallprint em", }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "div#Main", ".main-content", "main", "article", "body", }, }, Clean: []string{ "div#Banner", "div#Sidebar", "div#Footer", "#SidebarMartini", "nav", "ul", "div#Sidebar ul", "a[title*='Daring Fireball']", "img[alt*='Daring Fireball']", "p:contains('By John Gruber')", ".smallprint", "div#Footer", "[href='/preferences/']", "a[href='/preferences/']", "em", "p:last-child", "div#Main > p:last-child", "div#Main > p:last-of-type", "[href*='apps.apple.com']", "img[src*='/martini/']", "a:contains('Walk the World')", "script", "style", "noscript", ".ads", ".advertisement", ".sponsored", }, Transforms: map[string]TransformFunction{ "p": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { text := selection.Text() if strings.Contains(text, "★ _") || strings.Contains(text, "Display Preferences") || strings.Contains(text, "Copyright ©") { selection.Remove() } return nil }, }, "a": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { href, exists := selection.Attr("href") if exists && strings.Contains(href, "/preferences/") { selection.Remove() } return nil }, }, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[property='og:image']", "content"}, []string{"meta[name='twitter:image']", "content"}, []string{"meta[name='og:image']", "content"}, }, }, }
DaringFireballExtractor provides the custom extraction rules for daringfireball.net
// DeadlineCustomExtractor defines the field selectors used for
// deadline.com pages. Dek is left nil; NextPageURL and Excerpt carry empty
// selector lists.
var DeadlineCustomExtractor = &CustomExtractor{
	Domain: "deadline.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{"section.author h2"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item",
			},
		},
		Transforms: map[string]TransformFunction{
			".embed-twitter": &FunctionTransform{Fn: transformDeadlineTwitterEmbed},
		},
		Clean: []string{"figcaption"},
	},

	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
DeadlineCustomExtractor provides the custom extraction rules for deadline.com. JavaScript equivalent: export const DeadlineComExtractor = { ... }
var DeadspinComExtractor = &CustomExtractor{ Domain: "deadspin.com", SupportedDomains: []string{ "jezebel.com", "lifehacker.com", "kotaku.com", "gizmodo.com", "jalopnik.com", "kinja.com", "avclub.com", "clickhole.com", "splinternews.com", "theonion.com", "theroot.com", "thetakeout.com", "theinventory.com", }, Title: &FieldExtractor{ Selectors: []interface{}{ "header h1", "h1.headline", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ "a[data-ga*=\"Author\"]", ".author", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, []string{"time.updated[datetime]", "datetime"}, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ ".js_post-content", ".post-content", ".entry-content", }, }, Transforms: map[string]TransformFunction{ "iframe.lazyload[data-recommend-id^=\"youtube://\"]": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { id, exists := selection.Attr("id") if exists && strings.HasPrefix(id, "youtube-") { youtubeId := strings.TrimPrefix(id, "youtube-") selection.SetAttr("src", "https://www.youtube.com/embed/"+youtubeId) } return nil }, }, }, Clean: []string{ ".magnifier", ".lightbox", }, }, Dek: &FieldExtractor{ Selectors: []interface{}{}, }, NextPageURL: nil, Excerpt: nil, }
DeadspinComExtractor provides the custom extraction rules for deadspin.com and supported domains. JavaScript equivalent: export const DeadspinExtractor = { ... }
// EOnlineCustomExtractor defines the field selectors used for
// www.eonline.com pages. Dek, NextPageURL and Excerpt carry empty selector
// lists.
var EOnlineCustomExtractor = &CustomExtractor{
	Domain: "www.eonline.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.article-detail__title",
			"h1.article__title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".article-detail__meta__author",
			".entry-meta__author a",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			[]string{`meta[itemprop="datePublished"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article-detail__main-content section",
				".post-content section, .post-content div.post-content__image",
			},
		},
		Transforms: map[string]TransformFunction{
			"div.post-content__image": &StringTransform{
				TargetTag: "figure",
			},
			"div.post-content__image .image__credits": &StringTransform{
				TargetTag: "figcaption",
			},
		},
		Clean: []string{},
	},

	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
EOnlineCustomExtractor provides the custom extraction rules for www.eonline.com. JavaScript equivalent: export const WwwEonlineComExtractor = { ... }
// EpaperZeitDeExtractor defines the field selectors used for
// epaper.zeit.de pages. DatePublished, LeadImageURL, Dek and NextPageURL
// are left nil.
var EpaperZeitDeExtractor = &CustomExtractor{
	Domain: "epaper.zeit.de",

	Title: &FieldExtractor{
		Selectors: []interface{}{"p.title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{".article__author"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".article"},
		},
		// Keyed literals, matching how StringTransform is written in the
		// other extractors of this package.
		Transforms: map[string]TransformFunction{
			"p.title":          &StringTransform{TargetTag: "h1"},
			".article__author": &StringTransform{TargetTag: "p"},
			"byline":           &StringTransform{TargetTag: "p"},
			"linkbox":          &StringTransform{TargetTag: "p"},
		},
		Clean: []string{
			"image-credits",
			"box[type=citation]",
		},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{"subtitle"},
	},
}
EpaperZeitDeExtractor provides the custom extraction rules for epaper.zeit.de. JavaScript equivalent: export const EpaperZeitDeExtractor = { ... }
// FandomWikiaCustomExtractor defines the field selectors used for
// fandom.wikia.com pages. Dek carries an empty selector list; NextPageURL
// and Excerpt are left nil.
var FandomWikiaCustomExtractor = &CustomExtractor{
	Domain: "fandom.wikia.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.entry-title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			// Compound class selector. The previous value, ".author vcard",
			// was a descendant selector looking for a <vcard> element
			// inside .author, which cannot match an HTML page.
			// NOTE(review): this entry now takes priority over ".fn" when
			// both match — confirm the resulting author text is the
			// desired one.
			".author.vcard",
			".fn",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".grid-content",
				".entry-content",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},

	NextPageURL: nil,
	Excerpt:     nil,
}
FandomWikiaCustomExtractor provides the custom extraction rules for fandom.wikia.com. JavaScript equivalent: export const WikiaExtractor = { ... }
// GeniusCustomExtractor defines the field selectors used for genius.com
// pages. The date and lead image entries each name the page_data meta tag
// together with a transform function. Dek carries an empty selector list;
// NextPageURL and Excerpt are left nil.
var GeniusCustomExtractor = &CustomExtractor{
	Domain: "genius.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{"h2 a"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]interface{}{"meta[itemprop=page_data]", "value", transformGeniusDateFromJSON},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]interface{}{"meta[itemprop=page_data]", "value", transformGeniusImageFromJSON},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".lyrics"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
GeniusCustomExtractor provides the custom extraction rules for genius.com. JavaScript equivalent: export const GeniusComExtractor = { ... }
// GetnewsJpExtractor defines the field selectors used for getnews.jp
// pages. Dek is left nil.
var GetnewsJpExtractor = &CustomExtractor{
	Domain: "getnews.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"article h1"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			"span.prof",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			[]string{"ul.cattag-top time", "datetime"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.post-bodycopy"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
GetnewsJpExtractor provides the custom extraction rules for getnews.jp. JavaScript equivalent: export const GetnewsJpExtractor = { ... }
// GithubComExtractor defines the field selectors used for github.com
// pages. Author carries an empty selector list.
var GithubComExtractor = &CustomExtractor{
	Domain: "github.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:title"]`, "value"}},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"relative-time[datetime]", "datetime"},
			[]string{`span[itemprop="dateModified"] relative-time`, "datetime"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
			`span[itemprop="about"]`,
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			// A one-element nested list, kept exactly as declared.
			Selectors: []interface{}{
				[]interface{}{"#readme article"},
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
GithubComExtractor provides the custom extraction rules for github.com. JavaScript equivalent: export const GithubComExtractor = { ... }
// GlobalRegistryManager is a package-level registry, created by calling
// NewRegistryManager when the package is initialized.
var GlobalRegistryManager = NewRegistryManager()
GlobalRegistryManager is the default global registry instance. JavaScript equivalent: the implicit global registry used throughout the codebase.
// HuffingtonPostCustomExtractor defines the field selectors used for
// www.huffingtonpost.com pages. NextPageURL and Excerpt are left nil.
var HuffingtonPostCustomExtractor = &CustomExtractor{
	Domain: "www.huffingtonpost.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.headline__title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{"span.author-card__details__name"},
	},
	// The modified time is listed ahead of the published time.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:modified_time"]`, "value"},
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{"h2.headline__subtitle"},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.entry__body"},
			// NOTE(review): set on the inner FieldExtractor here, whereas
			// BookwalkerJpExtractor and BuzzapJpExtractor set it on the
			// ContentExtractor — confirm which one the cleaner reads.
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".pull-quote",
			".tag-cloud",
			".embed-asset",
			".below-entry",
			".entry-corrections",
			"#suggested-story",
		},
	},
}
HuffingtonPostCustomExtractor provides the custom extraction rules for www.huffingtonpost.com. JavaScript equivalent: export const WwwHuffingtonpostComExtractor = { ... }
// IciRadioCanadaCaExtractor defines the field selectors used for
// ici.radio-canada.ca pages. NextPageURL and Excerpt are left nil.
var IciRadioCanadaCaExtractor = &CustomExtractor{
	Domain: "ici.radio-canada.ca",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="dc.creator"]`, "value"}},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="dc.date.created"]`, "value"}},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			"div.lead-container",
			".bunker-component.lead",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"section.document-content-style",
				[]string{".main-multimedia-item", ".news-story-content"},
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
IciRadioCanadaCaExtractor provides the custom extraction rules for ici.radio-canada.ca. JavaScript equivalent: export const IciRadioCanadaCaExtractor = { ... }
// JapanCnetComExtractor defines the field selectors used for
// japan.cnet.com pages. Dek is left nil.
var JapanCnetComExtractor = &CustomExtractor{
	Domain: "japan.cnet.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{".leaf-headline-ttl"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{".writer"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".date"},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.article_body"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
JapanCnetComExtractor provides the custom extraction rules for japan.cnet.com. JavaScript equivalent: export const JapanCnetComExtractor = { ... }
// JapanZdnetComExtractor defines the field selectors used for
// japan.zdnet.com pages. Dek is left nil.
var JapanZdnetComExtractor = &CustomExtractor{
	Domain: "japan.zdnet.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="cXenseParse:author"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.article_body"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
JapanZdnetComExtractor provides the custom extraction rules for japan.zdnet.com. JavaScript equivalent: export const JapanZdnetComExtractor = { ... }
// JvndbJvnJpExtractor defines the field selectors used for jvndb.jvn.jp
// pages. Author, Dek and LeadImageURL are left nil.
var JvndbJvnJpExtractor = &CustomExtractor{
	Domain: "jvndb.jvn.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"title"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"div.modifytxt:nth-child(2)"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#news-list"},
			// NOTE(review): set on the inner FieldExtractor here, whereas
			// BookwalkerJpExtractor and BuzzapJpExtractor set it on the
			// ContentExtractor — confirm which one the cleaner reads.
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
JvndbJvnJpExtractor provides the custom extraction rules for jvndb.jvn.jp. JavaScript equivalent: export const JvndbJvnJpExtractor = { ... }
// LinkedInCustomExtractor is the selector configuration for www.linkedin.com.
// Dek carries an empty selector list; NextPageURL and Excerpt are left nil.
var LinkedInCustomExtractor = &CustomExtractor{
	Domain: "www.linkedin.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article-content__body",
				[]string{"header figure", ".prose"},
				".prose",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{".entity-image"},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{".article-title", "h1"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".main-author-card h3",
			[]string{`meta[name="article:author"]`, "value"},
			".entity-name a[rel=author]",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".base-main-card__metadata",
			[]string{"time[itemprop=\"datePublished\"]", "datetime"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	// No site-specific rules.
	NextPageURL: nil,
	Excerpt:     nil,
}
LinkedInCustomExtractor provides the custom extraction rules for www.linkedin.com JavaScript equivalent: export const WwwLinkedinComExtractor = { ... }
// LittleThingsCustomExtractor is the selector configuration for
// www.littlethings.com. DatePublished, Dek, NextPageURL and Excerpt are
// present but carry empty selector lists.
var LittleThingsCustomExtractor = &CustomExtractor{
	Domain: "www.littlethings.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				`section[class*="PostMainArticle"]`,
				".mainContentIntro",
				".content-wrapper",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			`h1[class*="PostHeader"]`,
			"h1.post-title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			`div[class^="PostHeader__ScAuthorNameSection"]`,
			[]string{`meta[name="author"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// Fields configured with an empty selector list.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
LittleThingsCustomExtractor provides the custom extraction rules for www.littlethings.com JavaScript equivalent: export const LittleThingsExtractor = { ... }
var MaTtiasBeExtractor = &CustomExtractor{ Domain: "ma.ttias.be", Title: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"twitter:title\"]", "value"}, }, }, Author: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"author\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ ".content", }, }, Transforms: map[string]TransformFunction{ "h2": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { selection.RemoveAttr("id") selection.Get(0).Data = "h3" return nil }, }, "h1": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { selection.RemoveAttr("id") selection.AfterHtml("<p></p>") return nil }, }, "ul": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { selection.AddClass("entry-content-asset") return nil }, }, }, Clean: []string{}, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, LeadImageURL: nil, Dek: nil, NextPageURL: nil, Excerpt: nil, }
MaTtiasBeExtractor provides the custom extraction rules for ma.ttias.be JavaScript equivalent: export const MaTtiasBeExtractor = { ... }
// MashableComExtractor is the selector configuration for mashable.com.
// Elements matching .image-credit are mapped to figcaption through a
// StringTransform.
var MashableComExtractor = &CustomExtractor{
	Domain: "mashable.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#article",
				"section.article-content.blueprint",
			},
		},
		Transforms: map[string]TransformFunction{
			".image-credit": &StringTransform{TargetTag: "figcaption"},
		},
		Clean: []string{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"header h1", "h1.title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			"span.author_name a",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}
MashableComExtractor provides the custom extraction rules for mashable.com JavaScript equivalent: export const MashableComExtractor = { ... }
var MediumCustomExtractor = &CustomExtractor{ Domain: "medium.com", Title: &FieldExtractor{ Selectors: []interface{}{ "h1", []string{"meta[name=\"og:title\"]", "value"}, }, }, Author: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"author\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{"article"}, }, Clean: []string{"span a", "svg"}, Transforms: map[string]TransformFunction{ "section span:first-of-type": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { text := selection.Text() if len(text) == 1 && regexp.MustCompile(`^[a-zA-Z()]+$`).MatchString(text) { selection.ReplaceWith(text) } return nil }, }, "iframe": &FunctionTransform{ Fn: transformMediumIframe, }, "figure": &FunctionTransform{ Fn: transformMediumFigure, }, "img": &FunctionTransform{ Fn: transformMediumImage, }, }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Dek: nil, NextPageURL: &FieldExtractor{ Selectors: []interface{}{}, }, Excerpt: &FieldExtractor{ Selectors: []interface{}{}, }, }
MediumCustomExtractor provides the custom extraction rules for Medium.com JavaScript equivalent: export const MediumExtractor = { ... }
var MediumCustomExtractorFixed = &CustomExtractor{ Domain: "medium.com", Title: &FieldExtractor{ Selectors: []interface{}{ "h1", []string{"meta[name=\"og:title\"]", "value"}, }, }, Author: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"author\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{"article"}, }, Clean: []string{"span a", "svg"}, Transforms: map[string]TransformFunction{ "section span:first-of-type": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { text := selection.Text() if len(text) == 1 && regexp.MustCompile(`^[a-zA-Z()]+$`).MatchString(text) { selection.ReplaceWith(text) } return nil }, }, "img": &FunctionTransform{ Fn: transformMediumImageFixed, }, }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Dek: nil, NextPageURL: &FieldExtractor{ Selectors: []interface{}{}, }, Excerpt: &FieldExtractor{ Selectors: []interface{}{}, }, }
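MediumCustomExtractorFixed provides an alternative set of custom extraction rules for Medium.com; it registers only the span and img transforms, the latter using transformMediumImageFixed. JavaScript equivalent: export const MediumExtractor = { ... }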
// NYMagCustomExtractor is the selector configuration for nymag.com.
// h1 elements inside the content are mapped to h2, and noscript elements are
// handed to transformNYMagNoscript. LeadImageURL, NextPageURL and Excerpt
// carry empty selector lists.
var NYMagCustomExtractor = &CustomExtractor{
	Domain: "nymag.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.lede-feature-title",
			"h1.headline-primary",
			"h1",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".by-authors",
			".lede-feature-author",
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{".lede-feature-teaser"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time.article-timestamp[datetime]", "datetime"},
			"time.article-timestamp",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.article-content",
				"section.body",
				"article.article",
			},
		},
		Transforms: map[string]TransformFunction{
			"h1":       &StringTransform{TargetTag: "h2"},
			"noscript": &FunctionTransform{Fn: transformNYMagNoscript},
		},
		Clean: []string{
			".ad",
			".single-related-story",
		},
	},

	// Fields configured with an empty selector list.
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
NYMagCustomExtractor provides the custom extraction rules for nymag.com JavaScript equivalent: export const NYMagExtractor = { ... }
// NewYorkerCustomExtractor is the selector configuration for
// www.newyorker.com. Caption text and credit elements are mapped to
// figcaption; article footers and aside elements are cleaned from the
// content. NextPageURL and Excerpt carry empty selector lists.
var NewYorkerCustomExtractor = &CustomExtractor{
	Domain: "www.newyorker.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article__body",
				"article.article.main-content",
				`main[class^="Layout__content"]`,
			},
		},
		Transforms: map[string]TransformFunction{
			".caption__text":   &StringTransform{TargetTag: "figcaption"},
			".caption__credit": &StringTransform{TargetTag: "figcaption"},
		},
		Clean: []string{
			`footer[class^="ArticleFooter__footer"]`,
			"aside",
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			`h1[class^="content-header"]`,
			`h1[class^="ArticleHeader__hed"]`,
			`h1[class*="ContentHeaderHed"]`,
			[]string{`meta[name="og:title"]`, "value"},
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			`article header div[class^="BylinesWrapper"]`,
			[]string{`meta[name="article:author"]`, "value"},
			`div[class^="ArticleContributors"] a[rel="author"]`,
			`article header div[class*="Byline__multipleContributors"]`,
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			`div[class^="ContentHeaderDek"]`,
			"div.content-header__dek",
			`h2[class^="ArticleHeader__dek"]`,
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			"time.content-header__publish-date",
			[]string{`meta[name="pubdate"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// Fields configured with an empty selector list.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
NewYorkerCustomExtractor provides the custom extraction rules for www.newyorker.com JavaScript equivalent: export const NewYorkerExtractor = { ... }
Domain: "news.mynavi.jp", Title: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:title\"]", "value"}, }, }, Author: &FieldExtractor{ Selectors: []interface{}{ "a.articleHeader_name", "main div.article-author a.article-author__name", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:description\"]", "value"}, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "div.article-body", "main article div", }, }, Transforms: map[string]TransformFunction{ "img": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { dataOriginal, exists := selection.Attr("data-original") if exists && dataOriginal != "" { selection.SetAttr("src", dataOriginal) } return nil }, }, }, Clean: []string{}, }, }
NewsMynaviJpExtractor provides the custom extraction rules for news.mynavi.jp JavaScript equivalent: export const NewsMynaviJpExtractor = { ... }
var NewsNationalgeographicComExtractor = &CustomExtractor{ Domain: "news.nationalgeographic.com", Title: &FieldExtractor{ Selectors: []interface{}{ "h1", "h1.main-title", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ ".byline-component__contributors b span", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ ".article__deck", }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ []string{".parsys.content", ".__image-lead__"}, ".content", }, }, Transforms: map[string]TransformFunction{ ".parsys.content": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { imgSrc, exists := selection.Find(".image.parbase.section").Find(".picturefill").First().Attr("data-platform-src") if exists && imgSrc != "" { imageHTML := fmt.Sprintf(`<img class="__image-lead__" src="%s"/>`, imgSrc) selection.PrependHtml(imageHTML) } return nil }, }, }, Clean: []string{ ".pull-quote.pull-quote--large", }, }, }
NewsNationalgeographicComExtractor provides the custom extraction rules for news.nationalgeographic.com JavaScript equivalent: export const NewsNationalgeographicComExtractor = { ... }
// PastebinCustomExtractor is the selector configuration for pastebin.com.
// Inside the content, ol elements are mapped to div and li elements to p.
// Dek, NextPageURL and Excerpt are left nil.
var PastebinCustomExtractor = &CustomExtractor{
	Domain: "pastebin.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".source", "#selectable .text"},
		},
		Transforms: map[string]TransformFunction{
			"ol": &StringTransform{TargetTag: "div"},
			"li": &StringTransform{TargetTag: "p"},
		},
		Clean: []string{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".username",
			".paste_box_line2 .t_us + a",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".date",
			".paste_box_line2 .t_da + span",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific rules.
	Dek:         nil,
	NextPageURL: nil,
	Excerpt:     nil,
}
PastebinCustomExtractor provides the custom extraction rules for pastebin.com JavaScript equivalent: export const PastebinComExtractor = { ... }
// PeopleCustomExtractor is the selector configuration for people.com.
// NextPageURL and Excerpt carry empty selector lists.
var PeopleCustomExtractor = &CustomExtractor{
	Domain: "people.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				`div[class^="loc article-content"]`,
				"div.article-body__inner",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			".article-header h1",
			[]string{`meta[name="og:title"]`, "value"},
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="sailthru.author"]`, "value"},
			"a.author.url.fn",
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{".article-header h2"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".mntl-attribution__item-date",
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// Fields configured with an empty selector list.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
PeopleCustomExtractor provides the custom extraction rules for people.com JavaScript equivalent: export const PeopleComExtractor = { ... }
// PhpspotOrgExtractor is the selector configuration for phpspot.org.
// The publication date selector is accompanied by an explicit Format and
// Timezone. Author, Dek and LeadImageURL are left nil.
var PhpspotOrgExtractor = &CustomExtractor{
	Domain: "phpspot.org",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.entrybody"},
		},
		DefaultCleaner: false,
		Transforms:     map[string]TransformFunction{},
		Clean:          []string{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h3.hl"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"h4.hl"},
		Format:    "YYYY年MM月DD日",
		Timezone:  "Asia/Tokyo",
	},

	// No site-specific rules.
	Author:       nil,
	Dek:          nil,
	LeadImageURL: nil,
}
PhpspotOrgExtractor provides the custom extraction rules for phpspot.org JavaScript equivalent: export const PhpspotOrgExtractor = { ... }
// PitchforkCustomExtractor is the selector configuration for pitchfork.com.
// Besides the standard fields it declares one extended field, "score".
// NextPageURL and Excerpt carry empty selector lists.
var PitchforkCustomExtractor = &CustomExtractor{
	Domain: "pitchfork.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.body__inner-container",
				".review-detail__text",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},

	Extend: map[string]*FieldExtractor{
		"score": {
			Selectors: []interface{}{
				`p[class*="Rating"]`,
				".score",
			},
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:title"]`, "value"},
			"title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			".authors-detail__display-name",
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
			".review-detail__abstract",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			`div[class^="InfoSliceWrapper-"]`,
			[]string{".pub-date", "datetime"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
			[]string{".single-album-tombstone__art img", "src"},
		},
	},

	// Fields configured with an empty selector list.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
PitchforkCustomExtractor provides the custom extraction rules for pitchfork.com JavaScript equivalent: export const PitchforkComExtractor = { ... }
var PolygonExtractor = &CustomExtractor{ Domain: "www.polygon.com", Title: &FieldExtractor{ Selectors: []interface{}{ "h1.article-header-title", "h1[class*='article']", "h1", []string{"meta[property=\"og:title\"]", "content"}, []string{"meta[name=\"twitter:title\"]", "content"}, }, }, Author: &FieldExtractor{ Selectors: []interface{}{ ".article-author", ".meta_txt.article-author", ".w-author-name .article-author", []string{"meta[name=\"author\"]", "content"}, []string{"meta[property=\"article:author\"]", "content"}, }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[property=\"article:published_time\"]", "content"}, []string{"meta[property=\"og:published_time\"]", "content"}, ".article-date", ".meta_txt.article-date", }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ "header p", ".article-excerpt", []string{"meta[name=\"description\"]", "content"}, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[property=\"og:image\"]", "content"}, []string{"meta[name=\"twitter:image\"]", "content"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ []interface{}{ "#article-body .content-block-regular", "#article-body > p", "#article-body > h1", "#article-body > h2", "#article-body > h3", "#article-body > h4", "#article-body > figure", "#article-body > blockquote", "#article-body > img", }, "#article-body", ".article-body", "article.w-article .article-body", "main .w-article", ".entry-content", ".post-content", }, }, Transforms: map[string]TransformFunction{ "noscript": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { html, err := selection.Html() if err != nil { return err } if html != "" { selection.ReplaceWithHtml("<span>" + html + "</span>") } return nil }, }, }, Clean: []string{ ".w-directory-warning", "nav.article-directory-sidenav", ".sidenav-level", ".sidenav-item", ".directory-warning", "a.directory-warning", ".article-footer-nav", ".pagination-nav", 
".article-nav", "[class*='ad-']", "[id*='ad-']", ".advertisement", ".promo", ".social-share", ".share-buttons", ".w-sharing-copy", ".follow-container", ".w-follow-btn", ".w-like-btn", ".option-btn", ".btn-fab", ".disqus-load-btn", ".w-related-content", ".w-header-related-feed", ".section-header", ".section-title", ".display-card-title", ".display-card", ".w-display-card-content", ".article-header-complementary", ".sidebar-tabs", ".tabs-ul", ".tabs-header", ".tab-content", ".sidebar-el-content", ".related-articles", ".newsletter-signup", ".email-signup", ".w-heading-options", ".w-header-user-box", ".user-box-title", ".article-header-data", ".comments-section", ".comment-form", "#disqus_thread", ".article-comments", "[class*='trending']", "[class*='popular']", "[id*='trending']", "[id*='popular']", ".article-header-complementary", ".w-login", ".valnet-login", ".w-valnet-login", "[id*='login']", ".article-header-bg", ".thread-option", ".fab-label", "script", "noscript", "style", "img.c-dynamic-image", }, }, }
PolygonExtractor provides the custom extraction rules for www.polygon.com
// PopSugarCustomExtractor is the selector configuration for www.popsugar.com.
// Dek, NextPageURL and Excerpt carry empty selector lists.
var PopSugarCustomExtractor = &CustomExtractor{
	Domain: "www.popsugar.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#content"},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".share-copy-title",
			".post-tags",
			".reactions",
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h2.post-title", "title-text"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// Fields configured with an empty selector list.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
PopSugarCustomExtractor provides the custom extraction rules for www.popsugar.com JavaScript equivalent: export const WwwPopsugarComExtractor = { ... }
// QdailyCustomExtractor is the selector configuration for www.qdaily.com.
// The publication date is read from the data-origindate attribute and the
// lead image from an img src. NextPageURL and Excerpt are left nil.
var QdailyCustomExtractor = &CustomExtractor{
	Domain: "www.qdaily.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".detail"},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".lazyload",
			".lazylad",
			".lazylood",
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h2", "h2.title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{".name"},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{".excerpt"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{".date.smart-date", "data-origindate"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{".article-detail-hd img", "src"},
		},
	},

	// No site-specific rules.
	NextPageURL: nil,
	Excerpt:     nil,
}
QdailyCustomExtractor provides the custom extraction rules for www.qdaily.com JavaScript equivalent: export const WwwQdailyComExtractor = { ... }
// RedditCustomExtractor is the selector configuration for www.reddit.com.
// Every field selector is scoped to div[data-test-id="post-content"].
// div[role="img"] elements are handed to transformRedditImagePreview.
// Dek, NextPageURL and Excerpt are left nil.
var RedditCustomExtractor = &CustomExtractor{
	Domain: "www.reddit.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				[]string{
					`div[data-test-id="post-content"] p`,
				},
				[]string{
					`div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])`,
					`div[data-test-id="post-content"] div[data-click-id="media"]`,
				},
				[]string{
					`div[data-test-id="post-content"] div[data-click-id="media"]`,
				},
				[]string{
					`div[data-test-id="post-content"] a`,
				},
				`div[data-test-id="post-content"]`,
			},
		},
		Transforms: map[string]TransformFunction{
			`div[role="img"]`: &FunctionTransform{Fn: transformRedditImagePreview},
		},
		Clean: []string{
			".icon",
			`span[id^="PostAwardBadges"]`,
			`div a[data-test-id="comments-page-link-num-comments"]`,
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			`div[data-test-id="post-content"] h1`,
			`div[data-test-id="post-content"] h2`,
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			`div[data-test-id="post-content"] a[href*="user/"]`,
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			`div[data-test-id="post-content"] span[data-click-id="timestamp"]`,
			`div[data-test-id="post-content"] a[data-click-id="timestamp"]`,
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific rules.
	Dek:         nil,
	NextPageURL: nil,
	Excerpt:     nil,
}
RedditCustomExtractor provides the custom extraction rules for www.reddit.com JavaScript equivalent: export const WwwRedditComExtractor = { ... }
// RollingStoneCustomExtractor is the selector configuration for
// www.rollingstone.com. NextPageURL and Excerpt carry empty selector lists.
var RollingStoneCustomExtractor = &CustomExtractor{
	Domain: "www.rollingstone.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".l-article-content",
				[]string{".lead-container", ".article-content"},
				".article-content",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".c-related-links-wrapper",
			".module-related",
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.l-article-header__row--title",
			"h1.content-title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"a.c-byline__link",
			"a.content-author.tracked-offpage",
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			"h2.l-article-header__row--lead",
			".content-description",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			"time.content-published-date",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// Fields configured with an empty selector list.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
RollingStoneCustomExtractor provides the custom extraction rules for www.rollingstone.com JavaScript equivalent: export const WwwRollingstoneComExtractor = { ... }
// ScanNetsecurityNeJpExtractor is the selector configuration for
// scan.netsecurity.ne.jp. DatePublished reads the article:modified_time meta
// tag, and Author is left nil.
var ScanNetsecurityNeJpExtractor = &CustomExtractor{
	Domain: "scan.netsecurity.ne.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors:      []interface{}{"div.arti-content.arti-content--thumbnail"},
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{"aside.arti-giga"},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"header.arti-header h1.head"},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{"header.arti-header p.arti-summary"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:modified_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific rule.
	Author: nil,
}
ScanNetsecurityNeJpExtractor provides the custom extraction rules for scan.netsecurity.ne.jp JavaScript equivalent: export const ScanNetsecurityNeJpExtractor = { ... }
// ScienceflyComExtractor is the selector configuration for sciencefly.com.
// Content and lead image both come from div.theiaPostSlider_slides; Dek
// carries an empty selector list.
var ScienceflyComExtractor = &CustomExtractor{
	Domain: "sciencefly.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.theiaPostSlider_slides"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			".entry-title",
			".cb-entry-title",
			".cb-single-title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"div.cb-author",
			"div.cb-author-title",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"div.theiaPostSlider_slides img", "src"},
		},
	},

	// Configured with an empty selector list.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
ScienceflyComExtractor provides the custom extraction rules for sciencefly.com JavaScript equivalent: export const ScienceflyComExtractor = { ... }
// SectIijAdJpExtractor is the selector configuration for sect.iij.ad.jp.
// dl.entrydate is both an author selector source and removed from the
// content via Clean. Dek is left nil.
var SectIijAdJpExtractor = &CustomExtractor{
	Domain: "sect.iij.ad.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".entry-inner", "#article"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{"dl.entrydate"},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"div.title-box-inner h1",
			"h3",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"p.post-author a",
			"dl.entrydate dd",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"time"},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific rule.
	Dek: nil,
}
SectIijAdJpExtractor provides the custom extraction rules for sect.iij.ad.jp JavaScript equivalent: export const SectIijAdJpExtractor = { ... }
// TMZCustomExtractor is the selector configuration for www.tmz.com.
// Dek, NextPageURL and Excerpt carry empty selector lists.
var TMZCustomExtractor = &CustomExtractor{
	Domain: "www.tmz.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article__blocks",
				".article-content",
				".all-post-body",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{".lightbox-link"},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			".post-title-breadcrumb",
			"h1",
			".headline",
		},
	},
	// NOTE(review): "TMZ STAFF" sits in the selector list but reads like a
	// fixed author name rather than a CSS selector; confirm how the
	// extractor treats this entry.
	Author: &FieldExtractor{
		Selectors: []interface{}{"TMZ STAFF"},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".article__published-at",
			".article-posted-date",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// Fields configured with an empty selector list.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
TMZCustomExtractor provides the custom extraction rules for www.tmz.com JavaScript equivalent: export const WwwTmzComExtractor = { ... }
// TakagihiromitsuJpExtractor is the selector configuration for
// takagi-hiromitsu.jp. DatePublished reads the Last-Modified http-equiv meta
// tag; Dek and LeadImageURL are left nil.
var TakagihiromitsuJpExtractor = &CustomExtractor{
	Domain: "takagi-hiromitsu.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors:      []interface{}{"div.body"},
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h3"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[http-equiv="Last-Modified"]`, "value"},
		},
	},

	// No site-specific rules.
	Dek:          nil,
	LeadImageURL: nil,
}
TakagihiromitsuJpExtractor provides the custom extraction rules for takagi-hiromitsu.jp JavaScript equivalent: export const TakagihiromitsuJpExtractor = { ... }
// TechlogIijAdJpExtractor is the selector configuration for
// techlog.iij.ad.jp. The publication date is read from the datetime
// attribute of time.entry-date; Dek is left nil.
var TechlogIijAdJpExtractor = &CustomExtractor{
	Domain: "techlog.iij.ad.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors:      []interface{}{"div.entry-content"},
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{".wp_social_bookmarking_light"},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.entry-title"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{`a[rel="author"]`},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time.entry-date", "datetime"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific rule.
	Dek: nil,
}
TechlogIijAdJpExtractor provides the custom extraction rules for techlog.iij.ad.jp JavaScript equivalent: export const TechlogIijAdJpExtractor = { ... }
// TheAtlanticCustomExtractor is the selector configuration for
// www.theatlantic.com. NextPageURL and Excerpt carry empty selector lists.
var TheAtlanticCustomExtractor = &CustomExtractor{
	Domain: "www.theatlantic.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article", ".article-body"},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".partner-box",
			".callout",
			".c-article-writer__image",
			".c-article-writer__content",
			".c-letters-cta__text",
			".c-footer__logo",
			".c-recirculation-link",
			".twitter-tweet",
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1", ".c-article-header__hed"},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
			".c-byline__author",
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`time[itemprop="datePublished"]`, "datetime"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// Fields configured with an empty selector list.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
TheAtlanticCustomExtractor provides the custom extraction rules for www.theatlantic.com JavaScript equivalent: export const TheAtlanticExtractor = { ... }
// ThoughtCatalogCustomExtractor is the selector configuration for
// thoughtcatalog.com. .tc_mark and figcaption elements are cleaned from the
// content. Dek, NextPageURL and Excerpt are left nil.
var ThoughtCatalogCustomExtractor = &CustomExtractor{
	Domain: "thoughtcatalog.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".entry.post"},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".tc_mark",
			"figcaption",
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.title",
			[]string{`meta[name="og:title"]`, "value"},
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"cite a",
			"div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name",
			"h1.writer-name",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific rules.
	Dek:         nil,
	NextPageURL: nil,
	Excerpt:     nil,
}
ThoughtCatalogCustomExtractor provides the custom extraction rules for thoughtcatalog.com JavaScript equivalent: export const ThoughtcatalogComExtractor = { ... }
// TimesofindiaIndiatimesComExtractor is the CustomExtractor configuration
// whose Domain is "timesofindia.indiatimes.com".
//
// Author is nil. The ".byline" element is used three ways here: as the
// DatePublished selector, as the "reporter" entry of Extend (as "div.byline"),
// and as one of the Content.Clean selectors.
var TimesofindiaIndiatimesComExtractor = &CustomExtractor{
	Domain: "timesofindia.indiatimes.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
		},
	},
	Author: nil,
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.contentwrapper:has(section)",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			"section",
			"h1",
			".byline",
			".img_cptn",
			".icon_share_wrap",
			`ul[itemtype="https://schema.org/BreadcrumbList"]`,
		},
		DefaultCleaner: false,
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".byline",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Dek:         nil,
	NextPageURL: nil,
	Excerpt:     nil,
	// Extra, non-standard field keyed by name.
	Extend: map[string]*FieldExtractor{
		"reporter": {
			Selectors: []interface{}{
				"div.byline",
			},
		},
	},
}
TimesofindiaIndiatimesComExtractor provides the custom extraction rules for timesofindia.indiatimes.com. JavaScript equivalent: export const TimesofindiaIndiatimesComExtractor = { ... }
// TwitterCustomExtractor is the CustomExtractor configuration whose Domain
// is "twitter.com".
//
// Title, LeadImageURL, Dek, NextPageURL and Excerpt are nil. Content selects
// `.permalink[role=main]`, runs transformTwitterPermalink on that same
// selector, and maps "s" elements to the "span" target tag.
var TwitterCustomExtractor = &CustomExtractor{
	Domain: "twitter.com",
	Title:  nil,
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".tweet.permalink-tweet .username",
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				`.permalink[role=main]`,
			},
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{
			`.permalink[role=main]`: &FunctionTransform{
				Fn: transformTwitterPermalink,
			},
			"s": &StringTransform{
				TargetTag: "span",
			},
		},
		Clean: []string{
			".stream-item-footer",
			"button",
			".tweet-details-fixer",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`.permalink-tweet ._timestamp[data-time-ms]`, "data-time-ms"},
		},
	},
	LeadImageURL: nil,
	Dek:          nil,
	NextPageURL:  nil,
	Excerpt:      nil,
}
TwitterCustomExtractor provides the custom extraction rules for twitter.com. JavaScript equivalent: export const TwitterExtractor = { ... }
// TwofortysevensportsComExtractor is the CustomExtractor configuration whose
// Domain is "247sports.com".
//
// It configures selectors for Title, Author, DatePublished, LeadImageURL and
// Content; Content has no transforms and an empty Clean list.
var TwofortysevensportsComExtractor = &CustomExtractor{
	Domain: "247sports.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"title",
			"article header h1",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".article-cnt__author",
			".author",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time[data-published]", "data-published"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article-body",
				"section.body.article",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
TwofortysevensportsComExtractor provides the custom extraction rules for 247sports.com. JavaScript equivalent: export const twofortysevensportsComExtractor = { ... }
// USMagazineCustomExtractor is the CustomExtractor configuration whose
// Domain is "www.usmagazine.com".
//
// It configures selectors for Title, Author, DatePublished, LeadImageURL and
// Content. Dek, NextPageURL and Excerpt are present but carry empty selector
// lists.
var USMagazineCustomExtractor = &CustomExtractor{
	Domain: "www.usmagazine.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"header h1",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"a.author",
			"a.article-byline.tracked-offpage",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.article-content",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".module-related",
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
USMagazineCustomExtractor provides the custom extraction rules for www.usmagazine.com. JavaScript equivalent: export const WwwUsmagazineComExtractor = { ... }
// UproxxCustomExtractor is the CustomExtractor configuration whose Domain is
// "uproxx.com".
//
// Content maps "div.image" to the "figure" target tag and
// "div.image .wp-media-credit" to "figcaption". Dek, NextPageURL and Excerpt
// are present but carry empty selector lists.
var UproxxCustomExtractor = &CustomExtractor{
	Domain: "uproxx.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"div.entry-header h1",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="qc:author"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".entry-content",
			},
		},
		Transforms: map[string]TransformFunction{
			"div.image":                  &StringTransform{TargetTag: "figure"},
			"div.image .wp-media-credit": &StringTransform{TargetTag: "figcaption"},
		},
		Clean: []string{},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}
UproxxCustomExtractor provides the custom extraction rules for uproxx.com. JavaScript equivalent: export const UproxxComExtractor = { ... }
// VoxCustomExtractor is the CustomExtractor configuration whose Domain is
// "www.vox.com".
//
// Content lists five selector entries (one of them a two-element []string),
// runs transformVoxNoscriptImage on "figure .e-image__image noscript", and
// maps "figure .e-image__meta" to the "figcaption" target tag. NextPageURL
// and Excerpt are nil.
var VoxCustomExtractor = &CustomExtractor{
	Domain: "www.vox.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			`h1[class*="h74scy"]`,
			"h1.c-page-title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".duet--article--article-body-component",
				"div[id*='zephr-anchor']",
				".duet--layout--entry-body",
				// NOTE(review): inside Content a []string presumably means
				// "match all of these selectors" rather than
				// (selector, attribute) — confirm against the selector engine.
				[]string{"figure.e-image--hero", ".c-entry-content"},
				".c-entry-content",
			},
		},
		Transforms: map[string]TransformFunction{
			"figure .e-image__image noscript": &FunctionTransform{
				Fn: transformVoxNoscriptImage,
			},
			"figure .e-image__meta": &StringTransform{
				TargetTag: "figcaption",
			},
		},
		Clean: []string{
			".duet--article--block-placement",
			".duet--article--related",
			".duet--cta--newsletter",
			"form",
			".duet--article--share-buttons",
			".duet--article--article-pullquote",
			".duet--media--caption",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			`p[class*="h74scyi"]`,
			".p-dek",
		},
	},
	NextPageURL: nil,
	Excerpt:     nil,
}
VoxCustomExtractor provides the custom extraction rules for www.vox.com. JavaScript equivalent: export const WwwVoxComExtractor = { ... }
// WeeklyAsciiJpExtractor is the CustomExtractor configuration whose Domain is
// "weekly.ascii.jp".
//
// DatePublished carries an explicit Format ("YYYY年MM月DD日 HH:mm") and
// Timezone ("Asia/Tokyo"). Dek is nil.
var WeeklyAsciiJpExtractor = &CustomExtractor{
	Domain: "weekly.ascii.jp",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"article h1",
			`h1[itemprop="headline"]`,
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"p.author",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"p.date",
			[]string{`meta[name="odate"]`, "value"},
		},
		Format:   "YYYY年MM月DD日 HH:mm",
		Timezone: "Asia/Tokyo",
	},
	Dek: nil,
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div#contents_detail",
				"div.article",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WeeklyAsciiJpExtractor provides the custom extraction rules for weekly.ascii.jp. JavaScript equivalent: export const WeeklyAsciiJpExtractor = { ... }
// WikipediaCustomExtractor is the CustomExtractor configuration whose Domain
// is "wikipedia.org".
//
// Author is present with an empty selector list. Content selects
// "#mw-content-text", runs transformWikipediaInfoboxImg on ".infobox img",
// and maps ".infobox caption" / ".infobox" to the "figcaption" / "figure"
// target tags. LeadImageURL, Dek, NextPageURL and Excerpt are nil.
var WikipediaCustomExtractor = &CustomExtractor{
	Domain: "wikipedia.org",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h2.title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#mw-content-text",
			},
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{
			".infobox img": &FunctionTransform{
				Fn: transformWikipediaInfoboxImg,
			},
			".infobox caption": &StringTransform{
				TargetTag: "figcaption",
			},
			".infobox": &StringTransform{
				TargetTag: "figure",
			},
		},
		Clean: []string{
			".mw-editsection",
			"figure tr, figure td, figure tbody",
			"#toc",
			".navbox",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"#footer-info-lastmod",
		},
	},
	LeadImageURL: nil,
	Dek:          nil,
	NextPageURL:  nil,
	Excerpt:      nil,
}
WikipediaCustomExtractor provides the custom extraction rules for wikipedia.org. JavaScript equivalent: export const WikipediaExtractor = { ... }
var WiredJpExtractor = &CustomExtractor{ Domain: "wired.jp", Title: &FieldExtractor{ Selectors: []interface{}{ "h1[data-testid=\"ContentHeaderHed\"]", "h1.post-title", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:author\"]", "value"}, "p[itemprop=\"author\"]", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, []string{"time", "datetime"}, }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ "div[class^=\"ContentHeaderDek\"]", ".post-intro", }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "div[data-attribute-verso-pattern=\"article-body\"]", "article.article-detail", }, }, Transforms: map[string]TransformFunction{ "img[data-original]": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { dataOriginal, hasDataOriginal := selection.Attr("data-original") src, hasSrc := selection.Attr("src") if hasDataOriginal && hasSrc { base, err := url.Parse(src) if err != nil { return err } ref, err := url.Parse(dataOriginal) if err != nil { return err } resolved := base.ResolveReference(ref) selection.SetAttr("src", resolved.String()) } return nil }, }, }, Clean: []string{ ".post-category", "time", "h1.post-title", ".social-area-syncer", }, }, }
WiredJpExtractor provides the custom extraction rules for wired.jp. JavaScript equivalent: export const WiredJpExtractor = { ... }
var WwwAbendblattDeExtractor = &CustomExtractor{ Domain: "www.abendblatt.de", Title: &FieldExtractor{ Selectors: []interface{}{ "h2.article__header__headline", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ "span.author-info__name-text", }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "div.article__body", }, }, Transforms: map[string]TransformFunction{ "p": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { DeobfuscateAbendblattText(selection) return nil }, }, "div": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { DeobfuscateAbendblattText(selection) return nil }, }, }, Clean: []string{}, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"time.teaser-stream-time", "datetime"}, []string{"time.article__header__date", "datetime"}, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"description\"]", "value"}, }, }, NextPageURL: nil, Excerpt: nil, }
WwwAbendblattDeExtractor provides the custom extraction rules for www.abendblatt.de. JavaScript equivalent: export const WwwAbendblattDeExtractor = { ... }
// WwwAndroidcentralComExtractor is the CustomExtractor configuration whose
// Domain is "www.androidcentral.com".
//
// It configures selectors for Title, Author, DatePublished, Dek, LeadImageURL
// and Content; Content.Clean lists ".intro" and "blockquote".
var WwwAndroidcentralComExtractor = &CustomExtractor{
	Domain: "www.androidcentral.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
			"h1.main-title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="parsely-author"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#article-body",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			".intro",
			"blockquote",
		},
	},
}
WwwAndroidcentralComExtractor provides the custom extraction rules for www.androidcentral.com. JavaScript equivalent: export const WwwAndroidcentralComExtractor = { ... }
// WwwAsahiComExtractor is the CustomExtractor configuration whose Domain is
// "www.asahi.com".
//
// Dek is nil; Excerpt reads the og:description meta tag. Content selects
// "main" with DefaultCleaner set to false on the ContentExtractor.
var WwwAsahiComExtractor = &CustomExtractor{
	Domain: "www.asahi.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"main h1",
			".ArticleTitle h1",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="pubdate"]`, "value"},
		},
	},
	Dek: nil,
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"main",
			},
		},
		DefaultCleaner: false,
		Transforms:     map[string]TransformFunction{},
		Clean: []string{
			"div.AdMod",
			"div.LoginSelectArea",
			"time",
			"div.notPrint",
		},
	},
}
WwwAsahiComExtractor provides the custom extraction rules for www.asahi.com. JavaScript equivalent: export const WwwAsahiComExtractor = { ... }
// WwwCbcCaExtractor is the CustomExtractor configuration whose Domain is
// "www.cbc.ca".
//
// It configures selectors for Title, Author, Content, DatePublished,
// LeadImageURL and Dek; NextPageURL and Excerpt are nil.
var WwwCbcCaExtractor = &CustomExtractor{
	Domain: "www.cbc.ca",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".authorText",
			".bylineDetails",
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".story",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{".timeStamp[datetime]", "datetime"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			".deck",
		},
	},
	NextPageURL: nil,
	Excerpt:     nil,
}
WwwCbcCaExtractor provides the custom extraction rules for www.cbc.ca. JavaScript equivalent: export const WwwCbcCaExtractor = { ... }
// WwwCbssportsComExtractor is the CustomExtractor configuration whose Domain
// is "www.cbssports.com".
//
// DatePublished reads a meta[itemprop="datePublished"] tag and sets Timezone
// to "UTC". Content selects ".article" with no transforms and an empty Clean
// list.
var WwwCbssportsComExtractor = &CustomExtractor{
	Domain: "www.cbssports.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			".Article-headline",
			".article-headline",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".ArticleAuthor-nameText",
			".author-name",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[itemprop="datePublished"]`, "value"},
		},
		Timezone: "UTC",
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			".Article-subline",
			".article-subline",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WwwCbssportsComExtractor provides the custom extraction rules for www.cbssports.com. JavaScript equivalent: export const WwwCbssportsComExtractor = { ... }
var WwwCnetComExtractor = &CustomExtractor{ Domain: "www.cnet.com", Title: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:title\"]", "value"}, }, }, Author: &FieldExtractor{ Selectors: []interface{}{ "span.author", "a.author", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ "time", }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ ".c-head_dek", ".article-dek", }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ []interface{}{"img.__image-lead__", ".article-main-body"}, ".article-main-body", }, }, Transforms: map[string]TransformFunction{ "figure.image": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { img := selection.Find("img") if img.Length() > 0 { img.SetAttr("width", "100%") img.SetAttr("height", "100%") img.AddClass("__image-lead__") selection.Find(".imgContainer").Remove() selection.PrependSelection(img) } return nil }, }, }, Clean: []string{}, }, }
WwwCnetComExtractor provides the custom extraction rules for www.cnet.com. JavaScript equivalent: export const WwwCnetComExtractor = { ... }
// WwwEngadgetComExtractor is the CustomExtractor configuration whose Domain
// is "www.engadget.com".
//
// DatePublished and LeadImageURL are present but carry empty selector lists.
// Content has a single, two-part selector entry.
var WwwEngadgetComExtractor = &CustomExtractor{
	Domain: "www.engadget.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:title"]`, "value"},
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			`a.th-meta[data-ylk*="subsec:author"]`,
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			`div[class*="o-title_mark"] div`,
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				// NOTE(review): this multi-selector entry is []interface{},
				// while other extractors in this package use []string for the
				// same shape — confirm the selector engine accepts both.
				[]interface{}{
					"#page_body figure:not(div.article-text figure)",
					"div.article-text",
				},
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WwwEngadgetComExtractor provides the custom extraction rules for www.engadget.com. JavaScript equivalent: export const WwwEngadgetComExtractor = { ... }
var WwwFortinetComExtractor = &CustomExtractor{ Domain: "www.fortinet.com", Title: &FieldExtractor{ Selectors: []interface{}{ "h1", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ ".b15-blog-meta__author", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12", }, }, Transforms: map[string]TransformFunction{ "noscript": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { children := selection.Children() if children.Length() == 1 { firstChild := children.First() if firstChild.Is("img") { selection.ReplaceWithSelection(firstChild.WrapInner("<figure>").Parent()) } } return nil }, }, }, Clean: []string{}, }, }
WwwFortinetComExtractor provides the custom extraction rules for www.fortinet.com. JavaScript equivalent: export const WwwFortinetComExtractor = { ... }
var WwwGizmodoJpExtractor = &CustomExtractor{ Domain: "www.gizmodo.jp", Title: &FieldExtractor{ Selectors: []interface{}{ "h1.p-post-title", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ "li.p-post-AssistAuthor", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"li.p-post-AssistTime time", "datetime"}, }, }, Dek: nil, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "article.p-post", }, }, Transforms: map[string]TransformFunction{ "img.p-post-thumbnailImage": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { src, exists := selection.Attr("src") if exists { src = strings.ReplaceAll(src, "%27", "'") if idx := strings.LastIndex(src, "='"); idx >= 0 { src = src[idx+2:] } src = strings.TrimSuffix(src, "';") selection.SetAttr("src", src) } return nil }, }, }, Clean: []string{ "h1.p-post-title", "ul.p-post-Assist", }, }, }
WwwGizmodoJpExtractor provides the custom extraction rules for www.gizmodo.jp. JavaScript equivalent: export const WwwGizmodoJpExtractor = { ... }
// WwwGrueneDeExtractor is the CustomExtractor configuration whose Domain is
// "www.gruene.de".
//
// Author, DatePublished, Dek, NextPageURL and Excerpt are nil. Content has a
// single four-part selector entry.
var WwwGrueneDeExtractor = &CustomExtractor{
	Domain: "www.gruene.de",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"header h1",
		},
	},
	Author: nil,
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				[]string{"section header", "section h2", "section p", "section ol"},
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			"figcaption",
			"p[class]",
		},
	},
	DatePublished: nil,
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			// NOTE(review): this uses meta[property=...] / "content", whereas
			// most extractors in this package use meta[name=...] / "value" —
			// confirm this selector still matches if meta tags are normalized
			// before extraction.
			[]string{`meta[property="og:image"]`, "content"},
		},
	},
	Dek:         nil,
	NextPageURL: nil,
	Excerpt:     nil,
}
WwwGrueneDeExtractor provides the custom extraction rules for www.gruene.de. JavaScript equivalent: export const WwwGrueneDeExtractor = { ... }
// WwwInfoqComExtractor is the CustomExtractor configuration whose Domain is
// "www.infoq.com".
//
// Content selects "div.article__data" with DefaultCleaner set to false on the
// inner FieldExtractor, no transforms, and an empty Clean list.
var WwwInfoqComExtractor = &CustomExtractor{
	Domain: "www.infoq.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.heading",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"div.widget.article__authors",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".article__readTime.date",
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.article__data",
			},
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WwwInfoqComExtractor provides the custom extraction rules for www.infoq.com. JavaScript equivalent: export const WwwInfoqComExtractor = { ... }
// WwwIpaGoJpExtractor is the CustomExtractor configuration whose Domain is
// "www.ipa.go.jp".
//
// Author, Dek and LeadImageURL are nil. "p.ipar_text_right" is both the
// DatePublished selector and the single Content.Clean entry.
var WwwIpaGoJpExtractor = &CustomExtractor{
	Domain: "www.ipa.go.jp",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
		},
	},
	Author: nil,
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"p.ipar_text_right",
		},
	},
	Dek:          nil,
	LeadImageURL: nil,
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#ipar_main",
			},
			DefaultCleaner: false,
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			"p.ipar_text_right",
		},
	},
}
WwwIpaGoJpExtractor provides the custom extraction rules for www.ipa.go.jp. JavaScript equivalent: export const WwwIpaGoJpExtractor = { ... }
// WwwItmediaCoJpExtractor is the CustomExtractor configuration whose Domain
// is "www.itmedia.co.jp"; SupportedDomains additionally lists
// "www.atmarkit.co.jp", "techtarget.itmedia.co.jp" and "nlab.itmedia.co.jp".
//
// DatePublished reads the article:modified_time meta tag. Content selects
// "#cmsBody" with DefaultCleaner set to false on the ContentExtractor.
var WwwItmediaCoJpExtractor = &CustomExtractor{
	Domain: "www.itmedia.co.jp",
	SupportedDomains: []string{
		"www.atmarkit.co.jp",
		"techtarget.itmedia.co.jp",
		"nlab.itmedia.co.jp",
	},
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"#cmsTitle h1",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			"#byline",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:modified_time"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			"#cmsAbstract h2",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#cmsBody",
			},
		},
		DefaultCleaner: false,
		Transforms:     map[string]TransformFunction{},
		Clean: []string{
			"#snsSharebox",
		},
	},
}
WwwItmediaCoJpExtractor provides the custom extraction rules for www.itmedia.co.jp and related domains. JavaScript equivalent: export const WwwItmediaCoJpExtractor = { ... }
// WwwJnsaOrgExtractor is the CustomExtractor configuration whose Domain is
// "www.jnsa.org".
//
// Author, DatePublished and Dek are nil; Excerpt reads the og:description
// meta tag. Content selects "#main_area" and lists "#pankuzu" and "#side" in
// Clean.
var WwwJnsaOrgExtractor = &CustomExtractor{
	Domain: "www.jnsa.org",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"#wgtitle h2",
		},
	},
	Author:        nil,
	DatePublished: nil,
	Dek:           nil,
	Excerpt: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#main_area",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			"#pankuzu",
			"#side",
		},
	},
}
WwwJnsaOrgExtractor provides the custom extraction rules for www.jnsa.org. JavaScript equivalent: export const WwwJnsaOrgExtractor = { ... }
// WwwLemondeFrExtractor is the CustomExtractor configuration whose Domain is
// "www.lemonde.fr".
//
// It configures selectors for Title, Author, Content, DatePublished,
// LeadImageURL and Dek; NextPageURL and Excerpt are nil.
var WwwLemondeFrExtractor = &CustomExtractor{
	Domain: "www.lemonde.fr",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.article__title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".author__name",
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article__content",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			"figcaption",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:article:published_time"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			".article__desc",
		},
	},
	NextPageURL: nil,
	Excerpt:     nil,
}
WwwLemondeFrExtractor provides the custom extraction rules for www.lemonde.fr. JavaScript equivalent: export const WwwLemondeFrExtractor = { ... }
var WwwLifehackerJpExtractor = &CustomExtractor{ Domain: "www.lifehacker.jp", Title: &FieldExtractor{ Selectors: []interface{}{ "h1[class^=\"article_pArticle_Title\"]", "h1.lh-summary-title", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"author\"]", "value"}, "p.lh-entryDetailInner--credit", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, []string{"div.lh-entryDetail-header time", "datetime"}, }, }, Dek: nil, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "div[class^=\"article_pArticle_Body__\"]", "div.lh-entryDetail-body", }, }, Transforms: map[string]TransformFunction{ "img.lazyload": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { src, exists := selection.Attr("src") if exists { src = strings.ReplaceAll(src, "%27", "'") if idx := strings.LastIndex(src, "='"); idx >= 0 { src = src[idx+2:] } src = strings.TrimSuffix(src, "';") selection.SetAttr("src", src) } return nil }, }, }, Clean: []string{ "p.lh-entryDetailInner--credit", }, }, }
WwwLifehackerJpExtractor provides the custom extraction rules for www.lifehacker.jp. JavaScript equivalent: export const WwwLifehackerJpExtractor = { ... }
// WwwMacrumorsComExtractor is the CustomExtractor configuration whose Domain
// is "www.macrumors.com".
//
// It configures selectors for Title, Author, DatePublished, Dek, LeadImageURL
// and Content; Content has no transforms and an empty Clean list.
var WwwMacrumorsComExtractor = &CustomExtractor{
	Domain: "www.macrumors.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
			"h1.title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			`article a[rel="author"]`,
			".author-url",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time", "datetime"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"article",
				".article",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WwwMacrumorsComExtractor provides the custom extraction rules for www.macrumors.com. JavaScript equivalent: export const WwwMacrumorsComExtractor = { ... }
// WwwMoongiftJpExtractor is the CustomExtractor configuration whose Domain is
// "www.moongift.jp".
//
// Author is nil. DatePublished sets Timezone to "Asia/Tokyo" without a
// Format. Content selects "#main" and lists "ul.mg_service.cf" in Clean.
var WwwMoongiftJpExtractor = &CustomExtractor{
	Domain: "www.moongift.jp",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.title a",
		},
	},
	Author: nil,
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"ul.meta li:not(.social):first-of-type",
		},
		Timezone: "Asia/Tokyo",
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#main",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean: []string{
			"ul.mg_service.cf",
		},
	},
}
WwwMoongiftJpExtractor provides the custom extraction rules for www.moongift.jp. JavaScript equivalent: export const WwwMoongiftJpExtractor = { ... }
var WwwNationalgeographicComExtractor = &CustomExtractor{ Domain: "www.nationalgeographic.com", Title: &FieldExtractor{ Selectors: []interface{}{ "h1", "h1.main-title", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ ".byline-component__contributors b span", }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ ".Article__Headline__Desc", ".article__deck", }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ "section.Article__Content", []string{".parsys.content", ".__image-lead__"}, ".content", }, }, Transforms: map[string]TransformFunction{ ".parsys.content": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { imageParent := selection.Children().First() if imageParent.HasClass("imageGroup") { dataAttrContainer := imageParent.Find(".media--medium__container").Children().First() imgPath1, exists1 := dataAttrContainer.Attr("data-platform-image1-path") imgPath2, exists2 := dataAttrContainer.Attr("data-platform-image2-path") if exists1 && exists2 && imgPath1 != "" && imgPath2 != "" { imageHTML := fmt.Sprintf(`<div class="__image-lead__"> <img src="%s"/> <img src="%s"/> </div>`, imgPath1, imgPath2) selection.PrependHtml(imageHTML) } } else { imgSrc, exists := selection.Find(".image.parbase.section").Find(".picturefill").First().Attr("data-platform-src") if exists && imgSrc != "" { imageHTML := fmt.Sprintf(`<img class="__image-lead__" src="%s"/>`, imgSrc) selection.PrependHtml(imageHTML) } } return nil }, }, }, Clean: []string{ ".pull-quote.pull-quote--small", }, }, }
WwwNationalgeographicComExtractor provides the custom extraction rules for www.nationalgeographic.com. JavaScript equivalent: export const WwwNationalgeographicComExtractor = { ... }
// WwwOssnewsJpExtractor is the CustomExtractor configuration whose Domain is
// "www.ossnews.jp".
//
// Author and Dek are nil. DatePublished carries an explicit Format
// ("YYYY年MM月DD日 HH:mm") and Timezone ("Asia/Tokyo"). Content sets
// DefaultCleaner to false on the ContentExtractor.
var WwwOssnewsJpExtractor = &CustomExtractor{
	Domain: "www.ossnews.jp",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"#alpha-block h1.hxnewstitle",
		},
	},
	Author: nil,
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"p.fs12",
		},
		Format:   "YYYY年MM月DD日 HH:mm",
		Timezone: "Asia/Tokyo",
	},
	Dek: nil,
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#alpha-block .section:has(h1.hxnewstitle)",
			},
		},
		DefaultCleaner: false,
		Transforms:     map[string]TransformFunction{},
		Clean:          []string{},
	},
}
WwwOssnewsJpExtractor provides the custom extraction rules for www.ossnews.jp. JavaScript equivalent: export const WwwOssnewsJpExtractor = { ... }
// WwwPhoronixComExtractor is the CustomExtractor configuration whose Domain
// is "www.phoronix.com".
//
// Dek and LeadImageURL are nil. Author and DatePublished both read from the
// ".author" element (Author from its first-child link).
var WwwPhoronixComExtractor = &CustomExtractor{
	Domain: "www.phoronix.com",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"article h1",
			"article header",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".author a:first-child",
		},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".author",
		},
	},
	Dek:          nil,
	LeadImageURL: nil,
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".content",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WwwPhoronixComExtractor provides the custom extraction rules for www.phoronix.com. JavaScript equivalent: export const WwwPhoronixComExtractor = { ... }
// WwwProspectmagazineCoUkExtractor is the CustomExtractor configuration whose
// Domain is "www.prospectmagazine.co.uk".
//
// It configures selectors for Title, Author, Content, DatePublished,
// LeadImageURL and Dek; NextPageURL and Excerpt are nil.
var WwwProspectmagazineCoUkExtractor = &CustomExtractor{
	Domain: "www.prospectmagazine.co.uk",
	Title: &FieldExtractor{
		Selectors: []interface{}{
			".blog-header__title",
			".page-title",
		},
	},
	Author: &FieldExtractor{
		Selectors: []interface{}{
			".blog-header__author-link",
			".aside_author .title",
		},
	},
	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".blog__container",
				"article .post_content",
			},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			".post-info",
		},
	},
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			".blog-header__description",
			".page-subtitle",
		},
	},
	NextPageURL: nil,
	Excerpt:     nil,
}
WwwProspectmagazineCoUkExtractor provides the custom extraction rules for www.prospectmagazine.co.uk. JavaScript equivalent: export const WwwProspectmagazineCoUkExtractor = { ... }
// WwwPublickey1JpExtractor holds the site-specific selectors for www.publickey1.jp.
// Dek is left unset (nil).
var WwwPublickey1JpExtractor = &CustomExtractor{
	Domain: "www.publickey1.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			".bloggerinchief p:first-of-type",
			"#subcol p:has(img)",
		},
	},

	// The date is taken from div.pubdate; Format and Timezone describe how
	// that text is written on the page.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"div.pubdate"},
		Format:    "YYYY年MM月DD日",
		Timezone:  "Asia/Tokyo",
	},

	// Two-element []string entries are (selector, attribute) pairs.
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#maincol"},
		},
		DefaultCleaner: false,
		Transforms:     map[string]TransformFunction{},
		// Selectors removed from the extracted content.
		Clean: []string{
			"#breadcrumbs",
			"div.sbm",
			"div.ad_footer",
		},
	},
}
WwwPublickey1JpExtractor provides the custom extraction rules for www.publickey1.jp. JavaScript equivalent: export const WwwPublickey1JpExtractor = { ... }
// WwwRbbtodayComExtractor holds the site-specific selectors for www.rbbtoday.com.
// Two-element []string entries below are (selector, attribute) pairs.
var WwwRbbtodayComExtractor = &CustomExtractor{
	Domain: "www.rbbtoday.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".writer.writer-name"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"header time", "datetime"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
			".arti-summary",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".arti-content"},
		},
		Transforms: map[string]TransformFunction{},
		// Selectors removed from the extracted content.
		Clean: []string{".arti-giga"},
	},
}
WwwRbbtodayComExtractor provides the custom extraction rules for www.rbbtoday.com. JavaScript equivalent: export const WwwRbbtodayComExtractor = { ... }
// WwwRockpapershotgunComExtractor holds the site-specific selectors for
// www.rockpapershotgun.com. Unlike most sibling extractors it reads meta tags
// through their property/content attributes. Content.Transforms is left unset.
var WwwRockpapershotgunComExtractor = &CustomExtractor{
	Domain: "www.rockpapershotgun.com",

	// Two-element []string entries are (selector, attribute) pairs.
	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.title",
			"h1",
			[]string{`meta[property="og:title"]`, "content"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			".byline .author a",
			".byline .author",
			[]string{`meta[name="author"]`, "content"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time", "datetime"},
			[]string{`meta[property="article:published_time"]`, "content"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{"p.strapline"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[property="og:image"]`, "content"},
			".headline_image",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article_body_content.article-styling",
				".article_body_content",
				".article-content",
				"article .article_body",
			},
		},
		// Selectors removed from the extracted content.
		Clean: []string{
			".inlinead",
			".desktop_mpu",
			".mpu_container",
			".advert_container",
			".leaderboard_container",
			".injection_placeholder",
			"span.injection_placeholder",
			"[data-position]",
			".read-next",
			".article_footer",
			".comments__link",
			".load-comments",
			".smart-slot",
			".sign-in-buttons",
			".byline",
			".metadata",
			".avatar",
			".published_at",
			".tagged_with",
			".author-inline",
			"button",
			"form",
			".social-sign-in-button",
			".tagged_with_item",
			".comments-bubble",
		},
	},
}
WwwRockpapershotgunComExtractor provides the custom extraction rules for www.rockpapershotgun.com
// WwwSbnationComExtractor holds the site-specific selectors for www.sbnation.com.
// Two-element []string entries below are (selector, attribute) pairs.
var WwwSbnationComExtractor = &CustomExtractor{
	Domain: "www.sbnation.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.c-page-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	// The dek may be rendered as either a <p> or an <h2>.
	Dek: &FieldExtractor{
		Selectors: []interface{}{
			"p.c-entry-summary.p-dek",
			"h2.c-entry-summary.p-dek",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.c-entry-content"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WwwSbnationComExtractor provides the custom extraction rules for www.sbnation.com. JavaScript equivalent: export const WwwSbnationComExtractor = { ... }
var WwwSiComExtractor = &CustomExtractor{ Domain: "www.si.com", Title: &FieldExtractor{ Selectors: []interface{}{ "h1", "h1.headline", }, }, Author: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"author\"]", "value"}, }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"published\"]", "value"}, }, Timezone: "America/New_York", }, Dek: &FieldExtractor{ Selectors: []interface{}{ ".m-detail-header--dek", }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ ".m-detail--body", []interface{}{"p", ".marquee_large_2x", ".component.image"}, }, }, Transforms: map[string]TransformFunction{ "noscript": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { children := selection.Children() if children.Length() == 1 { firstChild := children.First() if goquery.NodeName(firstChild) == "img" { html, _ := children.Html() selection.ReplaceWithHtml("<figure>" + html + "</figure>") } } return nil }, }, }, Clean: []string{ ".inline-thumb", ".primary-message", ".description", ".instructions", }, }, }
WwwSiComExtractor provides the custom extraction rules for www.si.com. JavaScript equivalent: export const WwwSiComExtractor = { ... }
// WwwSpektrumDeExtractor holds the site-specific selectors for www.spektrum.de.
// NextPageURL and Excerpt are left unset (nil).
var WwwSpektrumDeExtractor = &CustomExtractor{
	Domain: "www.spektrum.de",

	Title: &FieldExtractor{
		Selectors: []interface{}{".content__title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".content__author__info__name"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".content__meta__date"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".content__intro"},
	},

	// Two-element []string entries are (selector, attribute) pairs. Both the
	// name/value and property/content meta forms are listed, followed by the
	// top article image.
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
			[]string{`meta[property="og:image"]`, "content"},
			".image__article__top img",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article.content"},
		},
		Transforms: map[string]TransformFunction{},
		// Selectors removed from the extracted content.
		Clean: []string{
			".breadcrumbs",
			".hide-for-print",
			"aside",
			"header h2",
			".image__article__top",
			".content__author",
			".copyright",
			".callout-box",
		},
	},
}
WwwSpektrumDeExtractor provides the custom extraction rules for www.spektrum.de. JavaScript equivalent: export const SpektrumExtractor = { ... }
var WwwThevergeComExtractor = &CustomExtractor{ Domain: "www.theverge.com", Title: &FieldExtractor{ Selectors: []interface{}{"h1"}, }, Author: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"author\"]", "value"}, []string{"meta[name=\"parsely-author\"]", "value"}, []string{"meta[name=\"cse-authors\"]", "value"}, }, }, DatePublished: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"article:published_time\"]", "value"}, }, }, Dek: &FieldExtractor{ Selectors: []interface{}{ ".p-dek", }, }, LeadImageURL: &FieldExtractor{ Selectors: []interface{}{ []string{"meta[name=\"og:image\"]", "value"}, }, }, Content: &ContentExtractor{ FieldExtractor: &FieldExtractor{ Selectors: []interface{}{ ".duet--article--article-body-component", "div[id*='zephr-anchor']", "article", ".article-content", []interface{}{".c-entry-hero .e-image", ".c-entry-intro", ".c-entry-content"}, []interface{}{".e-image--hero", ".c-entry-content"}, ".l-wrapper .l-feature", "div.c-entry-content", }, }, Transforms: map[string]TransformFunction{ "noscript": &FunctionTransform{ Fn: func(selection *goquery.Selection) error { children := selection.Children() if children.Length() == 1 { firstChild := children.First() if goquery.NodeName(firstChild) == "img" { html, _ := children.Html() selection.ReplaceWithHtml("<span>" + html + "</span>") } } return nil }, }, }, Clean: []string{ ".aside", "img.c-dynamic-image", ".duet--article--image-gallery-two-up .kqz8fh5 .kqz8fh8 .kqz8fh7", ".duet--article--image-gallery-two-up .kqz8fha .kqz8fh9", "div[class*='image-gallery'] img[srcset]", ".duet--media--content-warning", "._1etxtj1", ".c-related-list", ".c-entry-group-labels", ".c-follow-button", ".tly2fw0", "button", ".c-image-gallery__nav", "[class*='follow']", }, }, }
WwwThevergeComExtractor provides the custom extraction rules for www.theverge.com. JavaScript equivalent: export const WwwThevergeComExtractor = { ... }
// WwwWiredComExtractor holds the site-specific selectors for www.wired.com.
// NextPageURL and Excerpt are left unset (nil).
var WwwWiredComExtractor = &CustomExtractor{
	Domain: "www.wired.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{`h1[data-testId="ContentHeaderHed"]`},
	},

	// Two-element []string entries are (selector, attribute) pairs.
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			`a[rel="author"]`,
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	// Dek is present but declares no selectors.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"article.article.main-content",
				"article.content",
			},
		},
		Transforms: map[string]TransformFunction{},
		// Selectors removed from the extracted content.
		Clean: []string{
			".visually-hidden",
			"figcaption img.photo",
			".alert-message",
		},
	},
}
WwwWiredComExtractor provides the custom extraction rules for www.wired.com. JavaScript equivalent: export const WiredExtractor = { ... }
// WwwYomiuriCoJpExtractor holds the site-specific selectors for www.yomiuri.co.jp.
// Author and Dek are left unset (nil).
var WwwYomiuriCoJpExtractor = &CustomExtractor{
	Domain: "www.yomiuri.co.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.title-article.c-article-title"},
	},

	// Two-element []string entries are (selector, attribute) pairs.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.p-main-contents"},
		},
		Transforms: map[string]TransformFunction{},
		Clean:      []string{},
	},
}
WwwYomiuriCoJpExtractor provides the custom extraction rules for www.yomiuri.co.jp. JavaScript equivalent: export const WwwYomiuriCoJpExtractor = { ... }
// YouTubeCustomExtractor holds the site-specific selectors for www.youtube.com.
// NextPageURL and Excerpt are left unset (nil).
var YouTubeCustomExtractor = &CustomExtractor{
	Domain: "www.youtube.com",

	// Two-element []string entries are (selector, attribute) pairs.
	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="title"]`, "value"},
			".watch-title",
			"h1.watch-title-container",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`link[itemprop="name"]`, "content"},
			".yt-user-info",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[itemProp="datePublished"]`, "value"},
		},
	},

	// Dek is present but declares no selectors.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#player-container-outer",
				"ytd-expandable-video-description-body-renderer #description",
				// Player and description are two elements to be matched
				// together, so they use the []interface{} grouping form that
				// sibling extractors use for multi-element content. A
				// two-element []string would read as (selector, attribute),
				// and "#description" is not an attribute name.
				[]interface{}{"#player-api", "#description"},
			},
		},
		// Set on ContentExtractor, matching the sibling extractors.
		DefaultCleaner: false,
		// Per-selector rewrites; the transform functions are defined elsewhere
		// in this package.
		Transforms: map[string]TransformFunction{
			"#player-api": &FunctionTransform{
				Fn: transformYouTubePlayerAPI,
			},
			"#player-container-outer": &FunctionTransform{
				Fn: transformYouTubePlayerContainer,
			},
		},
		Clean: []string{},
	},
}
YouTubeCustomExtractor provides the custom extraction rules for www.youtube.com. JavaScript equivalent: export const WwwYoutubeComExtractor = { ... }
Functions ¶
func BuildAllExtractorsMap ¶
func BuildAllExtractorsMap(extractors []*CustomExtractor) map[string]*CustomExtractor
BuildAllExtractorsMap creates the complete domain mapping. JavaScript equivalent: the result of all.js processing.
func CountCustomExtractors ¶
func CountCustomExtractors() int
CountCustomExtractors returns the total number of custom extractors
func DeobfuscateAbendblattText ¶
func DeobfuscateAbendblattText(selection *goquery.Selection) *goquery.Selection
DeobfuscateAbendblattText handles the complex obfuscation transform for Abendblatt.de. JavaScript equivalent: the complex function in transforms.p and transforms.div.
func GetAllCustomExtractors ¶
func GetAllCustomExtractors() map[string]*CustomExtractor
GetAllCustomExtractors returns all registered custom extractors. JavaScript equivalent: export * from './blogspot.com'; export * from './medium.com'; etc.
func GetAllCustomExtractorsList ¶
func GetAllCustomExtractorsList() []string
GetAllCustomExtractorsList returns a list of all custom extractor names
func GetCustomExtractorDomains ¶
func GetCustomExtractorDomains() []string
GetCustomExtractorDomains returns all domains covered by custom extractors
func MergeSupportedDomains ¶
func MergeSupportedDomains(extractor *CustomExtractor) map[string]*CustomExtractor
MergeSupportedDomains creates domain mappings for an extractor. JavaScript equivalent: the mergeSupportedDomains function in utils/merge-supported-domains.js.
Types ¶
type ContentExtractor ¶
// ContentExtractor pairs the selector configuration for the main content with
// the clean-up and transform rules declared for it.
type ContentExtractor struct {
// Embedded selector configuration; extractor literals populate it through the
// FieldExtractor key (e.g. its Selectors list).
*FieldExtractor
Clean []string `json:"clean"` // Selectors to remove from content
Transforms map[string]TransformFunction `json:"transforms"` // Element transformations
DefaultCleaner bool `json:"defaultCleaner"` // Apply default content cleaner
}
ContentExtractor defines how to extract and clean main content. JavaScript equivalent: { selectors: [...], clean: [...], transforms: {...} }
type CustomExtractor ¶
// CustomExtractor groups the per-field extraction rules declared for one site.
// Any field pointer may be nil, in which case no custom rule is declared for
// that field.
type CustomExtractor struct {
// Hostname the rules are written for, e.g. "www.wired.com".
Domain string `json:"domain"`
// NOTE(review): presumably further hostnames served by the same rules — confirm against MergeSupportedDomains.
SupportedDomains []string `json:"supportedDomains,omitempty"`
// Rules for the article title.
Title *FieldExtractor `json:"title,omitempty"`
// Rules for the author / byline.
Author *FieldExtractor `json:"author,omitempty"`
// Rules for the main content, including clean-up and transforms.
Content *ContentExtractor `json:"content,omitempty"`
// Rules for the publication date.
DatePublished *FieldExtractor `json:"date_published,omitempty"`
// Rules for the lead image URL.
LeadImageURL *FieldExtractor `json:"lead_image_url,omitempty"`
// Rules for the dek.
Dek *FieldExtractor `json:"dek,omitempty"`
// Rules for the next-page URL.
NextPageURL *FieldExtractor `json:"next_page_url,omitempty"`
// Rules for the excerpt.
Excerpt *FieldExtractor `json:"excerpt,omitempty"`
// NOTE(review): appears to hold extra named fields beyond the standard set — confirm against the code that reads it.
Extend map[string]*FieldExtractor `json:"extend,omitempty"`
}
CustomExtractor represents a site-specific content extractor. JavaScript equivalent: each extractor export in custom/[domain]/index.js.
func GetABCNewsExtractor ¶
func GetABCNewsExtractor() *CustomExtractor
GetABCNewsExtractor returns the custom extractor for abcnews.go.com
func GetArstechnicaComExtractor ¶
func GetArstechnicaComExtractor() *CustomExtractor
GetArstechnicaComExtractor returns the Ars Technica custom extractor
func GetBiorxivOrgExtractor ¶
func GetBiorxivOrgExtractor() *CustomExtractor
GetBiorxivOrgExtractor returns the BioRxiv custom extractor
func GetBloggerExtractor ¶
func GetBloggerExtractor() *CustomExtractor
GetBloggerExtractor returns the Blogger custom extractor
func GetBlogspotExtractor ¶
func GetBlogspotExtractor() *CustomExtractor
GetBlogspotExtractor returns the Blogspot custom extractor
func GetBloombergExtractor ¶
func GetBloombergExtractor() *CustomExtractor
GetBloombergExtractor returns the custom extractor for www.bloomberg.com
func GetBookwalkerJpExtractor ¶
func GetBookwalkerJpExtractor() *CustomExtractor
GetBookwalkerJpExtractor returns the BookWalker Japan custom extractor
func GetBustleExtractor ¶
func GetBustleExtractor() *CustomExtractor
GetBustleExtractor returns the Bustle custom extractor
func GetBuzzFeedExtractor ¶
func GetBuzzFeedExtractor() *CustomExtractor
GetBuzzFeedExtractor returns the BuzzFeed custom extractor
func GetBuzzapJpExtractor ¶
func GetBuzzapJpExtractor() *CustomExtractor
GetBuzzapJpExtractor returns the BuzzAP Japan custom extractor
func GetCNBCExtractor ¶
func GetCNBCExtractor() *CustomExtractor
GetCNBCExtractor returns the custom extractor for www.cnbc.com
func GetCNNExtractor ¶
func GetCNNExtractor() *CustomExtractor
GetCNNExtractor returns the custom extractor for www.cnn.com
func GetChicagoTribuneExtractor ¶
func GetChicagoTribuneExtractor() *CustomExtractor
GetChicagoTribuneExtractor returns the custom extractor for www.chicagotribune.com
func GetClinicaltrialsGovExtractor ¶
func GetClinicaltrialsGovExtractor() *CustomExtractor
GetClinicaltrialsGovExtractor returns the ClinicalTrials.gov custom extractor
func GetCustomExtractorByDomain ¶
func GetCustomExtractorByDomain(domain string) (*CustomExtractor, bool)
GetCustomExtractorByDomain returns a custom extractor for a specific domain
func GetDaringFireballExtractor ¶
func GetDaringFireballExtractor() *CustomExtractor
GetDaringFireballExtractor returns the Daring Fireball custom extractor
func GetDeadlineExtractor ¶
func GetDeadlineExtractor() *CustomExtractor
GetDeadlineExtractor returns the Deadline.com custom extractor
func GetDeadspinComExtractor ¶
func GetDeadspinComExtractor() *CustomExtractor
GetDeadspinComExtractor returns the Deadspin custom extractor
func GetEOnlineExtractor ¶
func GetEOnlineExtractor() *CustomExtractor
GetEOnlineExtractor returns the E! Online custom extractor
func GetEpaperZeitDeExtractor ¶
func GetEpaperZeitDeExtractor() *CustomExtractor
GetEpaperZeitDeExtractor returns the Zeit.de e-paper custom extractor
func GetFandomWikiaExtractor ¶
func GetFandomWikiaExtractor() *CustomExtractor
GetFandomWikiaExtractor returns the Fandom Wikia custom extractor
func GetFortuneComExtractor ¶
func GetFortuneComExtractor() *CustomExtractor
GetFortuneComExtractor returns the custom extractor for fortune.com
func GetGeniusExtractor ¶
func GetGeniusExtractor() *CustomExtractor
GetGeniusExtractor returns the Genius custom extractor
func GetGetnewsJpExtractor ¶
func GetGetnewsJpExtractor() *CustomExtractor
GetGetnewsJpExtractor returns the GetNews Japan custom extractor
func GetGithubComExtractor ¶
func GetGithubComExtractor() *CustomExtractor
GetGithubComExtractor returns the GitHub custom extractor
func GetGothamistComExtractor ¶
func GetGothamistComExtractor() *CustomExtractor
GetGothamistComExtractor returns the custom extractor for gothamist.com and related city sites
func GetHuffingtonPostExtractor ¶
func GetHuffingtonPostExtractor() *CustomExtractor
GetHuffingtonPostExtractor returns the HuffingtonPost custom extractor
func GetIciRadioCanadaCaExtractor ¶
func GetIciRadioCanadaCaExtractor() *CustomExtractor
GetIciRadioCanadaCaExtractor returns the ICI Radio-Canada custom extractor
func GetJapanCnetComExtractor ¶
func GetJapanCnetComExtractor() *CustomExtractor
GetJapanCnetComExtractor returns the CNET Japan custom extractor
func GetJapanZdnetComExtractor ¶
func GetJapanZdnetComExtractor() *CustomExtractor
GetJapanZdnetComExtractor returns the ZDNet Japan custom extractor
func GetJvndbJvnJpExtractor ¶
func GetJvndbJvnJpExtractor() *CustomExtractor
GetJvndbJvnJpExtractor returns the JVNDB custom extractor
func GetLATimesExtractor ¶
func GetLATimesExtractor() *CustomExtractor
GetLATimesExtractor returns the custom extractor for www.latimes.com
func GetLinkedInExtractor ¶
func GetLinkedInExtractor() *CustomExtractor
GetLinkedInExtractor returns the LinkedIn custom extractor
func GetLittleThingsExtractor ¶
func GetLittleThingsExtractor() *CustomExtractor
GetLittleThingsExtractor returns the LittleThings custom extractor
func GetMaTtiasBeExtractor ¶
func GetMaTtiasBeExtractor() *CustomExtractor
GetMaTtiasBeExtractor returns the ma.ttias.be custom extractor
func GetMashableComExtractor ¶
func GetMashableComExtractor() *CustomExtractor
GetMashableComExtractor returns the Mashable custom extractor
func GetMediumExtractor ¶
func GetMediumExtractor() *CustomExtractor
GetMediumExtractor returns the Medium custom extractor
func GetMediumExtractorFixed ¶
func GetMediumExtractorFixed() *CustomExtractor
GetMediumExtractorFixed returns the Medium custom extractor
func GetMiamiHeraldExtractor ¶
func GetMiamiHeraldExtractor() *CustomExtractor
GetMiamiHeraldExtractor returns the custom extractor for www.miamiherald.com
func GetMoneyCNNExtractor ¶
func GetMoneyCNNExtractor() *CustomExtractor
GetMoneyCNNExtractor returns the custom extractor for money.cnn.com
func GetNBCNewsExtractor ¶
func GetNBCNewsExtractor() *CustomExtractor
GetNBCNewsExtractor returns the custom extractor for www.nbcnews.com
func GetNPRExtractor ¶
func GetNPRExtractor() *CustomExtractor
GetNPRExtractor returns the custom extractor for www.npr.org
func GetNYDailyNewsExtractor ¶
func GetNYDailyNewsExtractor() *CustomExtractor
GetNYDailyNewsExtractor returns the custom extractor for www.nydailynews.com
func GetNYMagExtractor ¶
func GetNYMagExtractor() *CustomExtractor
GetNYMagExtractor returns the NY Magazine custom extractor
func GetNYTimesExtractor ¶
func GetNYTimesExtractor() *CustomExtractor
GetNYTimesExtractor returns the custom extractor for www.nytimes.com
func GetNewYorkerExtractor ¶
func GetNewYorkerExtractor() *CustomExtractor
GetNewYorkerExtractor returns the New Yorker custom extractor
func GetNewsMynaviJpExtractor ¶
func GetNewsMynaviJpExtractor() *CustomExtractor
GetNewsMynaviJpExtractor returns the MyNavi News Japan custom extractor
func GetNewsNationalgeographicComExtractor ¶
func GetNewsNationalgeographicComExtractor() *CustomExtractor
GetNewsNationalgeographicComExtractor returns the News National Geographic custom extractor
func GetPastebinExtractor ¶
func GetPastebinExtractor() *CustomExtractor
GetPastebinExtractor returns the Pastebin custom extractor
func GetPeopleExtractor ¶
func GetPeopleExtractor() *CustomExtractor
GetPeopleExtractor returns the People.com custom extractor
func GetPhpspotOrgExtractor ¶
func GetPhpspotOrgExtractor() *CustomExtractor
GetPhpspotOrgExtractor returns the PHPSpot Japan custom extractor
func GetPitchforkExtractor ¶
func GetPitchforkExtractor() *CustomExtractor
GetPitchforkExtractor returns the Pitchfork custom extractor
func GetPoliticoExtractor ¶
func GetPoliticoExtractor() *CustomExtractor
GetPoliticoExtractor returns the custom extractor for www.politico.com
func GetPolygonExtractor ¶
func GetPolygonExtractor() *CustomExtractor
GetPolygonExtractor returns the Polygon custom extractor
func GetPopSugarExtractor ¶
func GetPopSugarExtractor() *CustomExtractor
GetPopSugarExtractor returns the PopSugar custom extractor
func GetQdailyExtractor ¶
func GetQdailyExtractor() *CustomExtractor
GetQdailyExtractor returns the Qdaily custom extractor
func GetRedditExtractor ¶
func GetRedditExtractor() *CustomExtractor
GetRedditExtractor returns the Reddit custom extractor
func GetReutersExtractor ¶
func GetReutersExtractor() *CustomExtractor
GetReutersExtractor returns the custom extractor for www.reuters.com
func GetRollingStoneExtractor ¶
func GetRollingStoneExtractor() *CustomExtractor
GetRollingStoneExtractor returns the Rolling Stone custom extractor
func GetScanNetsecurityNeJpExtractor ¶
func GetScanNetsecurityNeJpExtractor() *CustomExtractor
GetScanNetsecurityNeJpExtractor returns the ScanNetSecurity custom extractor
func GetScienceflyComExtractor ¶
func GetScienceflyComExtractor() *CustomExtractor
GetScienceflyComExtractor returns the ScienceFly custom extractor
func GetSectIijAdJpExtractor ¶
func GetSectIijAdJpExtractor() *CustomExtractor
GetSectIijAdJpExtractor returns the SECT IIJ custom extractor
func GetTMZExtractor ¶
func GetTMZExtractor() *CustomExtractor
GetTMZExtractor returns the TMZ custom extractor
func GetTakagihiromitsuJpExtractor ¶
func GetTakagihiromitsuJpExtractor() *CustomExtractor
GetTakagihiromitsuJpExtractor returns the Takagi Hiromitsu custom extractor
func GetTechlogIijAdJpExtractor ¶
func GetTechlogIijAdJpExtractor() *CustomExtractor
GetTechlogIijAdJpExtractor returns the TechLog IIJ custom extractor
func GetTheAtlanticExtractor ¶
func GetTheAtlanticExtractor() *CustomExtractor
GetTheAtlanticExtractor returns The Atlantic custom extractor
func GetTheGuardianExtractor ¶
func GetTheGuardianExtractor() *CustomExtractor
GetTheGuardianExtractor returns the custom extractor for www.theguardian.com
func GetThoughtCatalogExtractor ¶
func GetThoughtCatalogExtractor() *CustomExtractor
GetThoughtCatalogExtractor returns the ThoughtCatalog custom extractor
func GetTimesofindiaIndiatimesComExtractor ¶
func GetTimesofindiaIndiatimesComExtractor() *CustomExtractor
GetTimesofindiaIndiatimesComExtractor returns the Times of India custom extractor
func GetTwitterExtractor ¶
func GetTwitterExtractor() *CustomExtractor
GetTwitterExtractor returns the Twitter custom extractor
func GetTwofortysevensportsComExtractor ¶
func GetTwofortysevensportsComExtractor() *CustomExtractor
GetTwofortysevensportsComExtractor returns the 247Sports custom extractor
func GetUSMagazineExtractor ¶
func GetUSMagazineExtractor() *CustomExtractor
GetUSMagazineExtractor returns the US Magazine custom extractor
func GetUproxxExtractor ¶
func GetUproxxExtractor() *CustomExtractor
GetUproxxExtractor returns the Uproxx custom extractor
func GetVoxExtractor ¶
func GetVoxExtractor() *CustomExtractor
GetVoxExtractor returns the Vox custom extractor
func GetWashingtonPostExtractor ¶
func GetWashingtonPostExtractor() *CustomExtractor
GetWashingtonPostExtractor returns the custom extractor for www.washingtonpost.com
func GetWeeklyAsciiJpExtractor ¶
func GetWeeklyAsciiJpExtractor() *CustomExtractor
GetWeeklyAsciiJpExtractor returns the Weekly ASCII Japan custom extractor
func GetWikipediaExtractor ¶
func GetWikipediaExtractor() *CustomExtractor
GetWikipediaExtractor returns the Wikipedia custom extractor
func GetWiredJpExtractor ¶
func GetWiredJpExtractor() *CustomExtractor
GetWiredJpExtractor returns the Wired Japan custom extractor
func GetWwwAbendblattDeExtractor ¶
func GetWwwAbendblattDeExtractor() *CustomExtractor
GetWwwAbendblattDeExtractor returns the Abendblatt.de custom extractor
func GetWwwAlComExtractor ¶
func GetWwwAlComExtractor() *CustomExtractor
GetWwwAlComExtractor returns the custom extractor for www.al.com
func GetWwwAmericanowComExtractor ¶
func GetWwwAmericanowComExtractor() *CustomExtractor
GetWwwAmericanowComExtractor returns the custom extractor for www.americanow.com
func GetWwwAndroidcentralComExtractor ¶
func GetWwwAndroidcentralComExtractor() *CustomExtractor
GetWwwAndroidcentralComExtractor returns the Android Central custom extractor
func GetWwwAolComExtractor ¶
func GetWwwAolComExtractor() *CustomExtractor
GetWwwAolComExtractor returns the custom extractor for www.aol.com
func GetWwwApartmenttherapyComExtractor ¶
func GetWwwApartmenttherapyComExtractor() *CustomExtractor
GetWwwApartmenttherapyComExtractor returns the custom extractor for www.apartmenttherapy.com
func GetWwwAsahiComExtractor ¶
func GetWwwAsahiComExtractor() *CustomExtractor
GetWwwAsahiComExtractor returns the Asahi Shimbun custom extractor
func GetWwwBroadwayworldComExtractor ¶
func GetWwwBroadwayworldComExtractor() *CustomExtractor
GetWwwBroadwayworldComExtractor returns the custom extractor for www.broadwayworld.com
func GetWwwCbcCaExtractor ¶
func GetWwwCbcCaExtractor() *CustomExtractor
GetWwwCbcCaExtractor returns the CBC custom extractor
func GetWwwCbssportsComExtractor ¶
func GetWwwCbssportsComExtractor() *CustomExtractor
GetWwwCbssportsComExtractor returns the CBS Sports custom extractor
func GetWwwCnetComExtractor ¶
func GetWwwCnetComExtractor() *CustomExtractor
GetWwwCnetComExtractor returns the CNET custom extractor
func GetWwwDmagazineComExtractor ¶
func GetWwwDmagazineComExtractor() *CustomExtractor
GetWwwDmagazineComExtractor returns the custom extractor for www.dmagazine.com
func GetWwwElecomCoJpExtractor ¶
func GetWwwElecomCoJpExtractor() *CustomExtractor
GetWwwElecomCoJpExtractor returns the custom extractor for www.elecom.co.jp
func GetWwwEngadgetComExtractor ¶
func GetWwwEngadgetComExtractor() *CustomExtractor
GetWwwEngadgetComExtractor returns the Engadget custom extractor
func GetWwwFastcompanyComExtractor ¶
func GetWwwFastcompanyComExtractor() *CustomExtractor
GetWwwFastcompanyComExtractor returns the custom extractor for www.fastcompany.com
func GetWwwFoolComExtractor ¶
func GetWwwFoolComExtractor() *CustomExtractor
GetWwwFoolComExtractor returns the custom extractor for www.fool.com
func GetWwwFortinetComExtractor ¶
func GetWwwFortinetComExtractor() *CustomExtractor
GetWwwFortinetComExtractor returns the Fortinet custom extractor
func GetWwwGizmodoJpExtractor ¶
func GetWwwGizmodoJpExtractor() *CustomExtractor
GetWwwGizmodoJpExtractor returns the Gizmodo Japan custom extractor
func GetWwwGrueneDeExtractor ¶
func GetWwwGrueneDeExtractor() *CustomExtractor
GetWwwGrueneDeExtractor returns the Gruene.de custom extractor
func GetWwwInfoqComExtractor ¶
func GetWwwInfoqComExtractor() *CustomExtractor
GetWwwInfoqComExtractor returns the InfoQ custom extractor
func GetWwwInquisitrComExtractor ¶
func GetWwwInquisitrComExtractor() *CustomExtractor
GetWwwInquisitrComExtractor returns the custom extractor for www.inquisitr.com
func GetWwwIpaGoJpExtractor ¶
func GetWwwIpaGoJpExtractor() *CustomExtractor
GetWwwIpaGoJpExtractor returns the IPA Japan custom extractor
func GetWwwItmediaCoJpExtractor ¶
func GetWwwItmediaCoJpExtractor() *CustomExtractor
GetWwwItmediaCoJpExtractor returns the ITmedia Japan custom extractor
func GetWwwJnsaOrgExtractor ¶
func GetWwwJnsaOrgExtractor() *CustomExtractor
GetWwwJnsaOrgExtractor returns the JNSA custom extractor
func GetWwwLadbibleComExtractor ¶
func GetWwwLadbibleComExtractor() *CustomExtractor
GetWwwLadbibleComExtractor returns the custom extractor for www.ladbible.com
func GetWwwLemondeFrExtractor ¶
func GetWwwLemondeFrExtractor() *CustomExtractor
GetWwwLemondeFrExtractor returns the Le Monde custom extractor
func GetWwwLifehackerJpExtractor ¶
func GetWwwLifehackerJpExtractor() *CustomExtractor
GetWwwLifehackerJpExtractor returns the Lifehacker Japan custom extractor
func GetWwwMacrumorsComExtractor ¶
func GetWwwMacrumorsComExtractor() *CustomExtractor
GetWwwMacrumorsComExtractor returns the MacRumors custom extractor
func GetWwwMentalflossComExtractor ¶
func GetWwwMentalflossComExtractor() *CustomExtractor
GetWwwMentalflossComExtractor returns the custom extractor for www.mentalfloss.com
func GetWwwMoongiftJpExtractor ¶
func GetWwwMoongiftJpExtractor() *CustomExtractor
GetWwwMoongiftJpExtractor returns the MOONGIFT Japan custom extractor
func GetWwwMsnComExtractor ¶
func GetWwwMsnComExtractor() *CustomExtractor
GetWwwMsnComExtractor returns the custom extractor for www.msn.com
func GetWwwNationalgeographicComExtractor ¶
func GetWwwNationalgeographicComExtractor() *CustomExtractor
GetWwwNationalgeographicComExtractor returns the National Geographic custom extractor
func GetWwwNdtvComExtractor ¶
func GetWwwNdtvComExtractor() *CustomExtractor
GetWwwNdtvComExtractor returns the custom extractor for www.ndtv.com
func GetWwwOpposingviewsComExtractor ¶
func GetWwwOpposingviewsComExtractor() *CustomExtractor
GetWwwOpposingviewsComExtractor returns the custom extractor for www.opposingviews.com
func GetWwwOssnewsJpExtractor ¶
func GetWwwOssnewsJpExtractor() *CustomExtractor
GetWwwOssnewsJpExtractor returns the OSS News Japan custom extractor
func GetWwwPhoronixComExtractor ¶
func GetWwwPhoronixComExtractor() *CustomExtractor
GetWwwPhoronixComExtractor returns the Phoronix custom extractor
func GetWwwProspectmagazineCoUkExtractor ¶
func GetWwwProspectmagazineCoUkExtractor() *CustomExtractor
GetWwwProspectmagazineCoUkExtractor returns the Prospect Magazine UK custom extractor
func GetWwwPublickey1JpExtractor ¶
func GetWwwPublickey1JpExtractor() *CustomExtractor
GetWwwPublickey1JpExtractor returns the Publickey1 Japan custom extractor
func GetWwwRawstoryComExtractor ¶
func GetWwwRawstoryComExtractor() *CustomExtractor
GetWwwRawstoryComExtractor returns the custom extractor for www.rawstory.com
func GetWwwRbbtodayComExtractor ¶
func GetWwwRbbtodayComExtractor() *CustomExtractor
GetWwwRbbtodayComExtractor returns the RBB TODAY Japan custom extractor
func GetWwwRockpapershotgunComExtractor ¶
func GetWwwRockpapershotgunComExtractor() *CustomExtractor
GetWwwRockpapershotgunComExtractor returns the Rock Paper Shotgun custom extractor
func GetWwwSbnationComExtractor ¶
func GetWwwSbnationComExtractor() *CustomExtractor
GetWwwSbnationComExtractor returns the SB Nation custom extractor
func GetWwwSiComExtractor ¶
func GetWwwSiComExtractor() *CustomExtractor
GetWwwSiComExtractor returns the Sports Illustrated custom extractor
func GetWwwSlateComExtractor ¶
func GetWwwSlateComExtractor() *CustomExtractor
GetWwwSlateComExtractor returns the custom extractor for www.slate.com
func GetWwwSpektrumDeExtractor ¶
func GetWwwSpektrumDeExtractor() *CustomExtractor
GetWwwSpektrumDeExtractor returns the Spektrum.de custom extractor
func GetWwwThevergeComExtractor ¶
func GetWwwThevergeComExtractor() *CustomExtractor
GetWwwThevergeComExtractor returns The Verge custom extractor
func GetWwwTodayComExtractor ¶
func GetWwwTodayComExtractor() *CustomExtractor
GetWwwTodayComExtractor returns the custom extractor for www.today.com
func GetWwwWesternjournalismComExtractor ¶
func GetWwwWesternjournalismComExtractor() *CustomExtractor
GetWwwWesternjournalismComExtractor returns the custom extractor for www.westernjournalism.com
func GetWwwWiredComExtractor ¶
func GetWwwWiredComExtractor() *CustomExtractor
GetWwwWiredComExtractor returns the Wired.com custom extractor
func GetWwwYahooComExtractor ¶
func GetWwwYahooComExtractor() *CustomExtractor
GetWwwYahooComExtractor returns the custom extractor for www.yahoo.com
func GetWwwYomiuriCoJpExtractor ¶
func GetWwwYomiuriCoJpExtractor() *CustomExtractor
GetWwwYomiuriCoJpExtractor returns the Yomiuri Shimbun custom extractor
func GetYouTubeExtractor ¶
func GetYouTubeExtractor() *CustomExtractor
GetYouTubeExtractor returns the YouTube custom extractor
type ExtractorFactory ¶
type ExtractorFactory func() *CustomExtractor
ExtractorFactory creates custom extractors. It is used for lazy loading and dynamic creation of extractors.
type ExtractorOptions ¶
ExtractorOptions provides configuration for extraction operations
type ExtractorRegistry ¶
type ExtractorRegistry struct {
// contains filtered or unexported fields
}
ExtractorRegistry holds all custom extractors
func NewExtractorRegistry ¶
func NewExtractorRegistry() *ExtractorRegistry
NewExtractorRegistry creates a new registry
func (*ExtractorRegistry) Count ¶
func (r *ExtractorRegistry) Count() int
Count returns the number of registered extractors
func (*ExtractorRegistry) Get ¶
func (r *ExtractorRegistry) Get(domain string) (*CustomExtractor, bool)
Get retrieves an extractor by domain
func (*ExtractorRegistry) GetAll ¶
func (r *ExtractorRegistry) GetAll() map[string]*CustomExtractor
GetAll returns all extractors (deduplicated by primary domain)
func (*ExtractorRegistry) List ¶
func (r *ExtractorRegistry) List() []string
List returns all registered domains
func (*ExtractorRegistry) Register ¶
func (r *ExtractorRegistry) Register(extractor *CustomExtractor)
Register adds a custom extractor to the registry
type FieldExtractor ¶
type FieldExtractor struct {
Selectors []interface{} `json:"selectors"` // Can be string or [string, string] for [selector, attribute]
AllowMultiple bool `json:"allowMultiple"` // Allow multiple values
DefaultCleaner bool `json:"defaultCleaner"` // Apply default field cleaner
Format string `json:"format"` // Date format (for date fields)
Timezone string `json:"timezone"` // Timezone (for date fields)
}
FieldExtractor defines how to extract a specific field from a document. JavaScript equivalent: { selectors: [...], allowMultiple: bool }
type FunctionTransform ¶
FunctionTransform is a custom function transform. JavaScript equivalent: 'selector': $node => { custom logic }
type RegistryManager ¶
type RegistryManager struct {
// contains filtered or unexported fields
}
RegistryManager provides thread-safe management of custom extractors. JavaScript equivalent: a combination of all.js, mergeSupportedDomains, and runtime extractor management.
func NewRegistryManager ¶
func NewRegistryManager() *RegistryManager
NewRegistryManager creates a new registry manager
func (*RegistryManager) Clear ¶
func (rm *RegistryManager) Clear()
Clear removes all extractors from the registry. Useful for testing.
func (*RegistryManager) Clone ¶
func (rm *RegistryManager) Clone() *RegistryManager
Clone creates a copy of the registry. Useful for testing and isolated environments.
func (*RegistryManager) Count ¶
func (rm *RegistryManager) Count() (int, int)
Count returns statistics about registered extractors
func (*RegistryManager) GetAll ¶
func (rm *RegistryManager) GetAll() map[string]*CustomExtractor
GetAll returns all registered extractors (deduplicated by primary domain). JavaScript equivalent: the Object.keys(CustomExtractors) processing in all.js.
func (*RegistryManager) GetBaseDomain ¶
func (rm *RegistryManager) GetBaseDomain(hostname string) string
GetBaseDomain calculates the base domain from a hostname. JavaScript equivalent: hostname.split('.').slice(-2).join('.') in get-extractor.js.
func (*RegistryManager) GetByDomain ¶
func (rm *RegistryManager) GetByDomain(domain string) (*CustomExtractor, bool)
GetByDomain retrieves an extractor by domain, with lazy loading support. JavaScript equivalent: the Extractors[hostname] || Extractors[baseDomain] lookup in get-extractor.js.
func (*RegistryManager) GetByDomainWithFallback ¶
func (rm *RegistryManager) GetByDomainWithFallback(hostname string) (*CustomExtractor, bool)
GetByDomainWithFallback tries the hostname first, then the base domain. JavaScript equivalent: the Extractors[hostname] || Extractors[baseDomain] logic.
func (*RegistryManager) GetByHTML ¶
func (rm *RegistryManager) GetByHTML(doc *goquery.Document) *CustomExtractor
GetByHTML detects an extractor using HTML selectors. JavaScript equivalent: the detectByHtml($) function in detect-by-html.js.
func (*RegistryManager) GetDomainMapping ¶
func (rm *RegistryManager) GetDomainMapping() map[string]*CustomExtractor
GetDomainMapping returns the complete domain-to-extractor mapping. JavaScript equivalent: the flattened domain mapping created by all.js + mergeSupportedDomains.
func (*RegistryManager) ListDomains ¶
func (rm *RegistryManager) ListDomains() []string
ListDomains returns all registered domains (including supported domains)
func (*RegistryManager) ListPrimaryDomains ¶
func (rm *RegistryManager) ListPrimaryDomains() []string
ListPrimaryDomains returns only primary domains (not supported domains)
func (*RegistryManager) Register ¶
func (rm *RegistryManager) Register(extractor *CustomExtractor) error
Register adds a custom extractor to the registry. JavaScript equivalent: building the registry in all.js with mergeSupportedDomains.
func (*RegistryManager) RegisterFactory ¶
func (rm *RegistryManager) RegisterFactory(domain string, factory ExtractorFactory) error
RegisterFactory adds a factory for lazy loading of extractors. Useful for reducing memory usage when not all extractors are needed.
func (*RegistryManager) RegisterHTMLDetector ¶
func (rm *RegistryManager) RegisterHTMLDetector(selector string, extractor *CustomExtractor) error
RegisterHTMLDetector adds an HTML-based extractor detector. JavaScript equivalent: entries in the Detectors map in detect-by-html.js.
func (*RegistryManager) Remove ¶
func (rm *RegistryManager) Remove(domain string) bool
Remove removes an extractor from the registry. Useful for testing and dynamic management.
type SelectorEntry ¶
SelectorEntry represents a parsed selector with optional attribute extraction
type StringTransform ¶
type StringTransform struct {
TargetTag string
}
StringTransform is a simple transform that changes tag names. JavaScript equivalent: 'noscript': 'div'
type TransformFunction ¶
TransformFunction represents a function that transforms DOM elements. JavaScript equivalent: 'selector': $node => { ... } or 'selector': 'tag'
Source Files
¶
- 247sports_com.go
- abcnews_go_com.go
- arstechnica_com.go
- biorxiv_org.go
- blogger.go
- blogspot_com.go
- bookwalker_jp.go
- buzzap_jp.go
- clinicaltrials_gov.go
- daringfireball_net.go
- deadline_com.go
- deadspin_com.go
- epaper_zeit_de.go
- extractor_interface.go
- fandom_wikia_com.go
- fortune_com.go
- genius_com.go
- getnews_jp.go
- github_com.go
- gothamist_com.go
- ici_radio_canada_ca.go
- index.go
- japan_cnet_com.go
- japan_zdnet_com.go
- jvndb_jvn_jp.go
- ma_ttias_be.go
- mashable_com.go
- medium.go
- medium_fixed.go
- money_cnn_com.go
- news_mynavi_jp.go
- news_nationalgeographic_com.go
- nymag_com.go
- pastebin_com.go
- people_com.go
- phpspot_org.go
- pitchfork_com.go
- registry.go
- scan_netsecurity_ne_jp.go
- sciencefly_com.go
- sect_iij_ad_jp.go
- takagi_hiromitsu_jp.go
- techlog_iij_ad_jp.go
- thoughtcatalog_com.go
- timesofindia_indiatimes_com.go
- twitter_com.go
- uproxx_com.go
- weekly_ascii_jp.go
- wikipedia_org.go
- wired_jp.go
- www_abendblatt_de.go
- www_al_com.go
- www_americanow_com.go
- www_androidcentral_com.go
- www_aol_com.go
- www_apartmenttherapy_com.go
- www_asahi_com.go
- www_bloomberg_com.go
- www_broadwayworld_com.go
- www_bustle_com.go
- www_buzzfeed_com.go
- www_cbc_ca.go
- www_cbssports_com.go
- www_chicagotribune_com.go
- www_cnbc_com.go
- www_cnet_com.go
- www_cnn_com.go
- www_dmagazine_com.go
- www_elecom_co_jp.go
- www_engadget_com.go
- www_eonline_com.go
- www_fastcompany_com.go
- www_fool_com.go
- www_fortinet_com.go
- www_gizmodo_jp.go
- www_gruene_de.go
- www_huffingtonpost_com.go
- www_infoq_com.go
- www_inquisitr_com.go
- www_ipa_go_jp.go
- www_itmedia_co_jp.go
- www_jnsa_org.go
- www_ladbible_com.go
- www_latimes_com.go
- www_lemonde_fr.go
- www_lifehacker_jp.go
- www_linkedin_com.go
- www_littlethings_com.go
- www_macrumors_com.go
- www_mentalfloss_com.go
- www_miamiherald_com.go
- www_moongift_jp.go
- www_msn_com.go
- www_nationalgeographic_com.go
- www_nbcnews_com.go
- www_ndtv_com.go
- www_newyorker_com.go
- www_npr_org.go
- www_nydailynews_com.go
- www_nytimes_com.go
- www_opposingviews_com.go
- www_ossnews_jp.go
- www_phoronix_com.go
- www_politico_com.go
- www_polygon_com.go
- www_popsugar_com.go
- www_prospectmagazine_co_uk.go
- www_publickey1_jp.go
- www_qdaily_com.go
- www_rawstory_com.go
- www_rbbtoday_com.go
- www_reddit_com.go
- www_reuters_com.go
- www_rockpapershotgun_com.go
- www_rollingstone_com.go
- www_sbnation_com.go
- www_si_com.go
- www_slate_com.go
- www_spektrum_de.go
- www_theatlantic_com.go
- www_theguardian_com.go
- www_theverge_com.go
- www_tmz_com.go
- www_today_com.go
- www_usmagazine_com.go
- www_vox_com.go
- www_washingtonpost_com.go
- www_westernjournalism_com.go
- www_wired_com.go
- www_yahoo_com.go
- www_yomiuri_co_jp.go
- www_youtube_com.go