custom

package
v1.0.6 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 31, 2025 License: MIT Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// ArstechnicaComExtractor provides the custom extraction rules for
// arstechnica.com.
var ArstechnicaComExtractor = &CustomExtractor{
	Domain: "arstechnica.com",

	Title: &FieldExtractor{Selectors: []interface{}{"title"}},

	Author: &FieldExtractor{
		Selectors: []interface{}{`*[rel="author"] *[itemprop="name"]`},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{".byline time", "datetime"}},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{`h2[itemprop="description"]`},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{`div[itemprop="articleBody"]`},
		},

		// Selectors registered for cleaning: caption chrome, video figures,
		// galleries and sidebars.
		Clean: []string{
			"figcaption .enlarge-link",
			"figcaption .sep",
			"figure.video",
			".gallery",
			"aside",
			".sidebar",
		},

		Transforms: map[string]TransformFunction{
			// Insert an empty paragraph in front of each matched h2.
			"h2": &FunctionTransform{
				Fn: func(heading *goquery.Selection) error {
					heading.BeforeHtml("<p></p>")
					return nil
				},
			},
		},
	},
}

ArstechnicaComExtractor provides the custom extraction rules for arstechnica.com. JavaScript equivalent: `export const ArstechnicaComExtractor = { ... }`.

View Source
// BiorxivOrgExtractor provides the custom extraction rules for biorxiv.org.
// Only the title, author and content fields are configured.
var BiorxivOrgExtractor = &CustomExtractor{
	Domain: "biorxiv.org",

	Title: &FieldExtractor{Selectors: []interface{}{"h1#page-title"}},

	Author: &FieldExtractor{
		Selectors: []interface{}{"div.highwire-citation-biorxiv-article-top > div.highwire-cite-authors"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{Selectors: []interface{}{"div#abstract-1"}},

		// No clean selectors or transforms are registered; both are kept as
		// empty, non-nil values.
		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

BiorxivOrgExtractor provides the custom extraction rules for biorxiv.org. JavaScript equivalent: `export const BiorxivOrgExtractor = { ... }`.

View Source
// BloggerCustomExtractor provides the custom extraction rules for
// Blogger/Blogspot: blogspot.com plus the regional domains listed in
// SupportedDomains.
var BloggerCustomExtractor = &CustomExtractor{
	Domain: "blogspot.com",

	SupportedDomains: []string{
		"www.blogspot.com",
		"blogspot.co.uk",
		"blogspot.ca",
		"blogspot.de",
		"blogspot.fr",
		"blogspot.jp",
		"blogspot.in",
		"blogspot.com.au",
		"blogspot.com.br",
		"blogspot.mx",
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			".post h2.title",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			".post-author-name",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"span.publishdate",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".post-content noscript",
			},
		},

		// noscript elements are given the target tag "div".
		Transforms: map[string]TransformFunction{
			"noscript": &StringTransform{
				TargetTag: "div",
			},
		},

		Clean: []string{},
	},
}

BloggerCustomExtractor provides the custom extraction rules for Blogger/Blogspot. JavaScript equivalent: `export const BloggerExtractor = { ... }`.

View Source
// BlogspotCustomExtractor provides the custom extraction rules for
// blogspot.com. LeadImageURL, Dek, NextPageURL and Excerpt are left unset
// (nil).
var BlogspotCustomExtractor = &CustomExtractor{
	Domain: "blogspot.com",

	Title: &FieldExtractor{Selectors: []interface{}{".post h2.title"}},

	Author: &FieldExtractor{Selectors: []interface{}{".post-author-name"}},

	DatePublished: &FieldExtractor{Selectors: []interface{}{"span.publishdate"}},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".post-content noscript"},
		},

		Clean: []string{},

		// noscript elements are given the target tag "div".
		Transforms: map[string]TransformFunction{
			"noscript": &StringTransform{TargetTag: "div"},
		},
	},
}

BlogspotCustomExtractor provides the custom extraction rules for blogspot.com. JavaScript equivalent: `export const BloggerExtractor = { ... }`.

View Source
// BookwalkerJpExtractor provides the custom extraction rules for
// bookwalker.jp. Dek is left unset (nil).
var BookwalkerJpExtractor = &CustomExtractor{
	Domain: "bookwalker.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.p-main__title", "h1.main-heading"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"div.p-author__list", "div.authors"},
	},

	DatePublished: &FieldExtractor{
		Timezone: "Asia/Tokyo",

		Selectors: []interface{}{
			"dl.p-information__data dd:nth-of-type(7)",
			".work-info .work-detail:first-of-type .work-detail-contents:last-of-type",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		// NOTE(review): some other extractors in this package set
		// DefaultCleaner on the inner FieldExtractor instead; confirm which
		// field the engine actually reads.
		DefaultCleaner: false,

		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.p-main__information",
				[]interface{}{"div.main-info", "div.main-cover-inner"},
			},
		},

		Clean: []string{
			"span.label.label--trial",
			"dt.info-head.info-head--coin",
			"dd.info-contents.info-contents--coin",
			"div.info-notice.fn-toggleClass",
		},

		Transforms: map[string]TransformFunction{},
	},
}

BookwalkerJpExtractor provides the custom extraction rules for bookwalker.jp. JavaScript equivalent: `export const BookwalkerJpExtractor = { ... }`.

View Source
// BustleCustomExtractor provides the custom extraction rules for
// www.bustle.com. Dek, NextPageURL and Excerpt are present but carry empty
// selector lists.
var BustleCustomExtractor = &CustomExtractor{
	Domain: "www.bustle.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1", "h1.post-page__title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{`a[href*="profile"]`, "div.content-meta__author"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{"time", "datetime"}},
	},

	Dek: &FieldExtractor{Selectors: []interface{}{}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article", ".post-page__body"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	NextPageURL: &FieldExtractor{Selectors: []interface{}{}},

	Excerpt: &FieldExtractor{Selectors: []interface{}{}},
}

BustleCustomExtractor provides the custom extraction rules for www.bustle.com. JavaScript equivalent: `export const WwwBustleComExtractor = { ... }`.

View Source
// BuzzFeedCustomExtractor provides the custom extraction rules for
// www.buzzfeed.com and www.buzzfeednews.com. NextPageURL and Excerpt are
// explicitly unset.
var BuzzFeedCustomExtractor = &CustomExtractor{
	Domain: "www.buzzfeed.com",

	SupportedDomains: []string{"www.buzzfeednews.com"},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.embed-headline-title",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			`a[data-action="user/username"]`,
			// Class selector. Without the leading dot this was a type
			// selector for a <byline__author> element, which no HTML
			// document contains, so the entry could never match.
			".byline__author",
			[]string{"meta[name=\"author\"]", "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				[]string{`div[class^="featureimage_featureImageWrapper"]`, ".js-subbuzz-wrapper"},
				[]string{".js-subbuzz-wrapper"},
			},
			// NOTE(review): some other extractors in this package set
			// DefaultCleaner on the ContentExtractor itself; confirm which
			// field the engine actually reads.
			DefaultCleaner: false,
		},

		Transforms: map[string]TransformFunction{
			// h2 elements are given the target tag "b".
			"h2": &StringTransform{
				TargetTag: "b",
			},

			// Header media blocks are handled by a dedicated function
			// defined elsewhere in the package.
			"div.longform_custom_header_media": &FunctionTransform{
				Fn: transformBuzzFeedHeaderMedia,
			},

			// The image source line inside a header figure is given the
			// target tag "figcaption".
			"figure.longform_custom_header_media .longform_header_image_source": &StringTransform{
				TargetTag: "figcaption",
			},
		},

		Clean: []string{
			".instapaper_ignore",
			".suplist_list_hide .buzz_superlist_item .buzz_superlist_number_inline",
			".share-box",
			".print",
			".js-inline-share-bar",
			".js-ad-placement",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time[datetime]", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"og:image\"]", "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			".embed-headline-description",
		},
	},

	NextPageURL: nil,

	Excerpt: nil,
}

BuzzFeedCustomExtractor provides the custom extraction rules for www.buzzfeed.com. JavaScript equivalent: `export const BuzzfeedExtractor = { ... }`.

View Source
// BuzzapJpExtractor provides the custom extraction rules for buzzap.jp.
// Author and Dek are left unset (nil).
var BuzzapJpExtractor = &CustomExtractor{
	Domain: "buzzap.jp",

	Title: &FieldExtractor{Selectors: []interface{}{"h1.entry-title"}},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{"time.entry-date", "datetime"}},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		// NOTE(review): some other extractors in this package set
		// DefaultCleaner on the inner FieldExtractor instead; confirm which
		// field the engine actually reads.
		DefaultCleaner: false,

		FieldExtractor: &FieldExtractor{Selectors: []interface{}{"div.ctiframe"}},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

BuzzapJpExtractor provides the custom extraction rules for buzzap.jp. JavaScript equivalent: `export const BuzzapJpExtractor = { ... }`.

View Source
// ClinicaltrialsGovExtractor provides the custom extraction rules for
// clinicaltrials.gov.
var ClinicaltrialsGovExtractor = &CustomExtractor{
	Domain: "clinicaltrials.gov",

	Title: &FieldExtractor{Selectors: []interface{}{"h1.tr-solo_record"}},

	Author: &FieldExtractor{Selectors: []interface{}{"div#sponsor.tr-info-text"}},

	// Matches the div whose direct child span is the "Last Update Posted"
	// term.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{`div:has(> span.term[data-term="Last Update Posted"])`},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{Selectors: []interface{}{"div#tab-body"}},

		Clean: []string{".usa-alert> img"},

		Transforms: map[string]TransformFunction{},
	},
}

ClinicaltrialsGovExtractor provides the custom extraction rules for clinicaltrials.gov. JavaScript equivalent: `export const ClinicaltrialsGovExtractor = { ... }`.

View Source
// DaringFireballExtractor provides the custom extraction rules for
// daringfireball.net and www.daringfireball.net.
var DaringFireballExtractor = &CustomExtractor{
	Domain: "daringfireball.net",
	SupportedDomains: []string{
		"www.daringfireball.net",
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"title",
			"h1",
			"h2.entry-title",
			"h1.entry-title",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			"[name='author']",
			".author",
			".byline",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time", "datetime"},
			[]string{"[datetime]", "datetime"},
			"p.smallprint em",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			// Ordered from the site-specific container down to the whole
			// body.
			Selectors: []interface{}{
				"div#Main",
				".main-content",
				"main",
				"article",
				"body",
			},
		},

		// NOTE(review): several entries are very broad ("ul", "em",
		// "p:last-child") and others are subsumed by earlier ones
		// ("div#Sidebar ul", "a[href='/preferences/']",
		// "div#Main > p:last-child"); confirm they are intentional. The
		// exact duplicate of "div#Footer" has been dropped.
		Clean: []string{
			// Page chrome.
			"div#Banner",
			"div#Sidebar",
			"div#Footer",
			"#SidebarMartini",

			// Navigation and lists.
			"nav",
			"ul",
			"div#Sidebar ul",

			// Site branding.
			"a[title*='Daring Fireball']",
			"img[alt*='Daring Fireball']",

			"p:contains('By John Gruber')",

			// Footer and preference links.
			".smallprint",
			"[href='/preferences/']",
			"a[href='/preferences/']",
			"em",
			"p:last-child",
			"div#Main > p:last-child",
			"div#Main > p:last-of-type",

			// Sponsorship and app links.
			"[href*='apps.apple.com']",
			"img[src*='/martini/']",
			"a:contains('Walk the World')",

			// Non-content elements.
			"script",
			"style",
			"noscript",

			// Advertising containers.
			".ads",
			".advertisement",
			".sponsored",
		},

		Transforms: map[string]TransformFunction{
			// Remove paragraphs whose text contains one of the footer
			// markers.
			"p": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					text := selection.Text()

					if strings.Contains(text, "★ _") ||
						strings.Contains(text, "Display Preferences") ||
						strings.Contains(text, "Copyright ©") {
						selection.Remove()
					}
					return nil
				},
			},

			// Remove links whose href contains "/preferences/".
			"a": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					href, exists := selection.Attr("href")
					if exists && strings.Contains(href, "/preferences/") {
						selection.Remove()
					}
					return nil
				},
			},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[property='og:image']", "content"},
			[]string{"meta[name='twitter:image']", "content"},
			[]string{"meta[name='og:image']", "content"},
			// Fallbacks in the name/value form that the other extractors in
			// this package use for meta tags. They are only consulted after
			// the entries above.
			// NOTE(review): presumably meta tags are normalised to
			// name/value before extraction; confirm.
			[]string{"meta[name='og:image']", "value"},
			[]string{"meta[name='twitter:image']", "value"},
		},
	},
}

DaringFireballExtractor provides the custom extraction rules for daringfireball.net.

View Source
// DeadlineCustomExtractor provides the custom extraction rules for
// deadline.com. Dek is left unset (nil); NextPageURL and Excerpt carry empty
// selector lists.
var DeadlineCustomExtractor = &CustomExtractor{
	Domain: "deadline.com",

	Title: &FieldExtractor{Selectors: []interface{}{"h1"}},

	Author: &FieldExtractor{Selectors: []interface{}{"section.author h2"}},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="article:published_time"]`, "value"}},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.a-article-grid__main.pmc-a-grid article.pmc-a-grid-item"},
		},

		Clean: []string{"figcaption"},

		// Twitter embeds are handled by a dedicated function defined
		// elsewhere in the package.
		Transforms: map[string]TransformFunction{
			".embed-twitter": &FunctionTransform{Fn: transformDeadlineTwitterEmbed},
		},
	},

	NextPageURL: &FieldExtractor{Selectors: []interface{}{}},

	Excerpt: &FieldExtractor{Selectors: []interface{}{}},
}

DeadlineCustomExtractor provides the custom extraction rules for deadline.com. JavaScript equivalent: `export const DeadlineComExtractor = { ... }`.

View Source
// DeadspinComExtractor provides the custom extraction rules for deadspin.com
// and the sibling domains listed in SupportedDomains. NextPageURL and
// Excerpt are left unset (nil).
var DeadspinComExtractor = &CustomExtractor{
	Domain: "deadspin.com",

	SupportedDomains: []string{
		"jezebel.com",
		"lifehacker.com",
		"kotaku.com",
		"gizmodo.com",
		"jalopnik.com",
		"kinja.com",
		"avclub.com",
		"clickhole.com",
		"splinternews.com",
		"theonion.com",
		"theroot.com",
		"thetakeout.com",
		"theinventory.com",
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"header h1", "h1.headline"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{`a[data-ga*="Author"]`, ".author"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			[]string{"time.updated[datetime]", "datetime"},
		},
	},

	Dek: &FieldExtractor{Selectors: []interface{}{}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".js_post-content", ".post-content", ".entry-content"},
		},

		Clean: []string{".magnifier", ".lightbox"},

		Transforms: map[string]TransformFunction{
			// Lazy-loaded YouTube iframes: when the element id has the form
			// "youtube-<id>", set src to the matching embed URL.
			`iframe.lazyload[data-recommend-id^="youtube://"]`: &FunctionTransform{
				Fn: func(frame *goquery.Selection) error {
					elementID, ok := frame.Attr("id")
					if !ok || !strings.HasPrefix(elementID, "youtube-") {
						return nil
					}
					videoID := strings.TrimPrefix(elementID, "youtube-")
					frame.SetAttr("src", "https://www.youtube.com/embed/"+videoID)
					return nil
				},
			},
		},
	},
}

DeadspinComExtractor provides the custom extraction rules for deadspin.com and its supported domains. JavaScript equivalent: `export const DeadspinExtractor = { ... }`.

View Source
// EOnlineCustomExtractor provides the custom extraction rules for
// www.eonline.com. Dek, NextPageURL and Excerpt carry empty selector lists.
var EOnlineCustomExtractor = &CustomExtractor{
	Domain: "www.eonline.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.article-detail__title", "h1.article__title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".article-detail__meta__author", ".entry-meta__author a"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			[]string{`meta[itemprop="datePublished"]`, "value"},
		},
	},

	Dek: &FieldExtractor{Selectors: []interface{}{}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article-detail__main-content section",
				".post-content section, .post-content div.post-content__image",
			},
		},

		Clean: []string{},

		// Image wrappers get the target tag "figure" and their credits the
		// target tag "figcaption".
		Transforms: map[string]TransformFunction{
			"div.post-content__image": &StringTransform{
				TargetTag: "figure",
			},
			"div.post-content__image .image__credits": &StringTransform{
				TargetTag: "figcaption",
			},
		},
	},

	NextPageURL: &FieldExtractor{Selectors: []interface{}{}},

	Excerpt: &FieldExtractor{Selectors: []interface{}{}},
}

EOnlineCustomExtractor provides the custom extraction rules for www.eonline.com. JavaScript equivalent: `export const WwwEonlineComExtractor = { ... }`.

View Source
// EpaperZeitDeExtractor provides the custom extraction rules for
// epaper.zeit.de. DatePublished, LeadImageURL, Dek and NextPageURL are left
// unset (nil).
var EpaperZeitDeExtractor = &CustomExtractor{
	Domain: "epaper.zeit.de",

	Title: &FieldExtractor{Selectors: []interface{}{"p.title"}},

	Author: &FieldExtractor{Selectors: []interface{}{".article__author"}},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{Selectors: []interface{}{".article"}},

		Clean: []string{"image-credits", "box[type=citation]"},

		// Each entry maps a selector to its target tag.
		Transforms: map[string]TransformFunction{
			"p.title":          &StringTransform{TargetTag: "h1"},
			".article__author": &StringTransform{TargetTag: "p"},
			"byline":           &StringTransform{TargetTag: "p"},
			"linkbox":          &StringTransform{TargetTag: "p"},
		},
	},

	Excerpt: &FieldExtractor{Selectors: []interface{}{"subtitle"}},
}

EpaperZeitDeExtractor provides the custom extraction rules for epaper.zeit.de. JavaScript equivalent: `export const EpaperZeitDeExtractor = { ... }`.

View Source
// FandomWikiaCustomExtractor provides the custom extraction rules for
// fandom.wikia.com. NextPageURL and Excerpt are left unset (nil).
var FandomWikiaCustomExtractor = &CustomExtractor{
	Domain: "fandom.wikia.com",

	Title: &FieldExtractor{Selectors: []interface{}{"h1.entry-title"}},

	// NOTE(review): ".author vcard" selects a <vcard> element nested under
	// .author; this looks like it was meant to be a class selector
	// (".author.vcard" or ".author .vcard"). Confirm before changing.
	Author: &FieldExtractor{
		Selectors: []interface{}{".author vcard", ".fn"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="article:published_time"]`, "value"}},
	},

	Dek: &FieldExtractor{Selectors: []interface{}{}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".grid-content", ".entry-content"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

FandomWikiaCustomExtractor provides the custom extraction rules for fandom.wikia.com. JavaScript equivalent: `export const WikiaExtractor = { ... }`.

View Source
// GeniusCustomExtractor provides the custom extraction rules for genius.com.
// NextPageURL and Excerpt are left unset (nil).
var GeniusCustomExtractor = &CustomExtractor{
	Domain: "genius.com",

	Title: &FieldExtractor{Selectors: []interface{}{"h1"}},

	Author: &FieldExtractor{Selectors: []interface{}{"h2 a"}},

	// Selector, attribute name and a function defined elsewhere in the
	// package, all applied to the page_data meta tag.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]interface{}{"meta[itemprop=page_data]", "value", transformGeniusDateFromJSON},
		},
	},

	Dek: &FieldExtractor{Selectors: []interface{}{}},

	// Same page_data meta tag as DatePublished, with a different function.
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]interface{}{"meta[itemprop=page_data]", "value", transformGeniusImageFromJSON},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{Selectors: []interface{}{".lyrics"}},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

GeniusCustomExtractor provides the custom extraction rules for genius.com. JavaScript equivalent: `export const GeniusComExtractor = { ... }`.

View Source
// GetnewsJpExtractor provides the custom extraction rules for getnews.jp.
// Dek is left unset (nil).
var GetnewsJpExtractor = &CustomExtractor{
	Domain: "getnews.jp",

	Title: &FieldExtractor{Selectors: []interface{}{"article h1"}},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			"span.prof",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			[]string{"ul.cattag-top time", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{Selectors: []interface{}{"div.post-bodycopy"}},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

GetnewsJpExtractor provides the custom extraction rules for getnews.jp. JavaScript equivalent: `export const GetnewsJpExtractor = { ... }`.

View Source
// GithubComExtractor provides the custom extraction rules for github.com.
// Author is present but carries an empty selector list.
var GithubComExtractor = &CustomExtractor{
	Domain: "github.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:title"]`, "value"}},
	},

	Author: &FieldExtractor{Selectors: []interface{}{}},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"relative-time[datetime]", "datetime"},
			[]string{`span[itemprop="dateModified"] relative-time`, "datetime"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
			`span[itemprop="about"]`,
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			// The single entry is itself a one-element list.
			Selectors: []interface{}{[]interface{}{"#readme article"}},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

GithubComExtractor provides the custom extraction rules for github.com. JavaScript equivalent: `export const GithubComExtractor = { ... }`.

View Source
// GlobalRegistryManager is the package-level registry instance, created by
// NewRegistryManager during package initialisation.
var GlobalRegistryManager = NewRegistryManager()

GlobalRegistryManager is the default global registry instance. JavaScript equivalent: the implicit global registry used throughout the codebase.

View Source
// HuffingtonPostCustomExtractor provides the custom extraction rules for
// www.huffingtonpost.com. NextPageURL and Excerpt are left unset (nil).
var HuffingtonPostCustomExtractor = &CustomExtractor{
	Domain: "www.huffingtonpost.com",

	Title: &FieldExtractor{Selectors: []interface{}{"h1.headline__title"}},

	Author: &FieldExtractor{Selectors: []interface{}{"span.author-card__details__name"}},

	// The modified time is listed ahead of the published time.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:modified_time"]`, "value"},
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	Dek: &FieldExtractor{Selectors: []interface{}{"h2.headline__subtitle"}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			// NOTE(review): some other extractors in this package set
			// DefaultCleaner on the ContentExtractor itself; confirm which
			// field the engine actually reads.
			DefaultCleaner: false,

			Selectors: []interface{}{"div.entry__body"},
		},

		Clean: []string{
			".pull-quote",
			".tag-cloud",
			".embed-asset",
			".below-entry",
			".entry-corrections",
			"#suggested-story",
		},

		Transforms: map[string]TransformFunction{},
	},
}

HuffingtonPostCustomExtractor provides the custom extraction rules for www.huffingtonpost.com. JavaScript equivalent: `export const WwwHuffingtonpostComExtractor = { ... }`.

View Source
// IciRadioCanadaCaExtractor provides the custom extraction rules for
// ici.radio-canada.ca. NextPageURL and Excerpt are left unset (nil).
var IciRadioCanadaCaExtractor = &CustomExtractor{
	Domain: "ici.radio-canada.ca",

	Title: &FieldExtractor{Selectors: []interface{}{"h1"}},

	Author: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="dc.creator"]`, "value"}},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="dc.date.created"]`, "value"}},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{"div.lead-container", ".bunker-component.lead"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"section.document-content-style",
				[]string{".main-multimedia-item", ".news-story-content"},
			},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

IciRadioCanadaCaExtractor provides the custom extraction rules for ici.radio-canada.ca. JavaScript equivalent: `export const IciRadioCanadaCaExtractor = { ... }`.

View Source
// JapanCnetComExtractor provides the custom extraction rules for
// japan.cnet.com. Dek is left unset (nil).
var JapanCnetComExtractor = &CustomExtractor{
	Domain: "japan.cnet.com",

	Title: &FieldExtractor{Selectors: []interface{}{".leaf-headline-ttl"}},

	Author: &FieldExtractor{Selectors: []interface{}{".writer"}},

	DatePublished: &FieldExtractor{Selectors: []interface{}{".date"}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{Selectors: []interface{}{"div.article_body"}},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

JapanCnetComExtractor provides the custom extraction rules for japan.cnet.com. JavaScript equivalent: `export const JapanCnetComExtractor = { ... }`.

View Source
// JapanZdnetComExtractor provides the custom extraction rules for
// japan.zdnet.com. Dek is left unset (nil).
var JapanZdnetComExtractor = &CustomExtractor{
	Domain: "japan.zdnet.com",

	Title: &FieldExtractor{Selectors: []interface{}{"h1"}},

	Author: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="cXenseParse:author"]`, "value"}},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="article:published_time"]`, "value"}},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{Selectors: []interface{}{"div.article_body"}},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

JapanZdnetComExtractor provides the custom extraction rules for japan.zdnet.com. JavaScript equivalent: `export const JapanZdnetComExtractor = { ... }`.

View Source
// JvndbJvnJpExtractor provides the custom extraction rules for jvndb.jvn.jp.
// Author, Dek and LeadImageURL are left unset (nil).
var JvndbJvnJpExtractor = &CustomExtractor{
	Domain: "jvndb.jvn.jp",

	Title: &FieldExtractor{Selectors: []interface{}{"title"}},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"div.modifytxt:nth-child(2)"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			// NOTE(review): some other extractors in this package set
			// DefaultCleaner on the ContentExtractor itself; confirm which
			// field the engine actually reads.
			DefaultCleaner: false,

			Selectors: []interface{}{"#news-list"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

JvndbJvnJpExtractor provides the custom extraction rules for jvndb.jvn.jp. JavaScript equivalent: `export const JvndbJvnJpExtractor = { ... }`.

View Source
// LinkedInCustomExtractor provides the custom extraction rules for
// www.linkedin.com. NextPageURL and Excerpt are left unset (nil).
var LinkedInCustomExtractor = &CustomExtractor{
	Domain: "www.linkedin.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{".article-title", "h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			".main-author-card h3",
			[]string{`meta[name="article:author"]`, "value"},
			".entity-name a[rel=author]",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".base-main-card__metadata",
			[]string{`time[itemprop="datePublished"]`, "datetime"},
		},
	},

	Dek: &FieldExtractor{Selectors: []interface{}{}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article-content__body",
				[]string{"header figure", ".prose"},
				".prose",
			},
		},

		Clean: []string{".entity-image"},

		Transforms: map[string]TransformFunction{},
	},
}

LinkedInCustomExtractor provides the custom extraction rules for www.linkedin.com. JavaScript equivalent: `export const WwwLinkedinComExtractor = { ... }`.

View Source
// LittleThingsCustomExtractor provides the custom extraction rules for
// www.littlethings.com. DatePublished, Dek, NextPageURL and Excerpt are
// present but carry empty selector lists.
var LittleThingsCustomExtractor = &CustomExtractor{
	Domain: "www.littlethings.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{`h1[class*="PostHeader"]`, "h1.post-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			`div[class^="PostHeader__ScAuthorNameSection"]`,
			[]string{`meta[name="author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{Selectors: []interface{}{}},

	Dek: &FieldExtractor{Selectors: []interface{}{}},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				`section[class*="PostMainArticle"]`,
				".mainContentIntro",
				".content-wrapper",
			},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	NextPageURL: &FieldExtractor{Selectors: []interface{}{}},

	Excerpt: &FieldExtractor{Selectors: []interface{}{}},
}

LittleThingsCustomExtractor provides the custom extraction rules for www.littlethings.com. JavaScript equivalent: `export const LittleThingsExtractor = { ... }`.

View Source
// MaTtiasBeExtractor provides the custom extraction rules for ma.ttias.be.
// LeadImageURL, Dek, NextPageURL and Excerpt are explicitly unset.
var MaTtiasBeExtractor = &CustomExtractor{
	Domain: "ma.ttias.be",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"twitter:title\"]", "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"author\"]", "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".content",
			},
		},

		Transforms: map[string]TransformFunction{
			// h2: drop the id attribute and rename the element to h3.
			"h2": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					selection.RemoveAttr("id")

					// Rename every matched node. Ranging over Nodes is a
					// no-op for an empty selection, whereas Get(0) would
					// panic there and only ever touches the first node.
					for _, node := range selection.Nodes {
						node.Data = "h3"
					}
					return nil
				},
			},
			// h1: drop the id attribute and insert an empty paragraph
			// after the heading.
			"h1": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					selection.RemoveAttr("id")

					selection.AfterHtml("<p></p>")
					return nil
				},
			},
			// ul: add the "entry-content-asset" class.
			"ul": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					selection.AddClass("entry-content-asset")
					return nil
				},
			},
		},

		Clean: []string{},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"article:published_time\"]", "value"},
		},
	},

	LeadImageURL: nil,
	Dek:          nil,
	NextPageURL:  nil,
	Excerpt:      nil,
}

MaTtiasBeExtractor provides the custom extraction rules for ma.ttias.be. JavaScript equivalent: `export const MaTtiasBeExtractor = { ... }`.

View Source
// MashableComExtractor provides the custom extraction rules for mashable.com.
var MashableComExtractor = &CustomExtractor{
	Domain: "mashable.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"header h1", "h1.title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			"span.author_name a",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="article:published_time"]`, "value"}},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{[]string{`meta[name="og:image"]`, "value"}},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#article", "section.article-content.blueprint"},
		},

		Clean: []string{},

		// Image credits are given the target tag "figcaption".
		Transforms: map[string]TransformFunction{
			".image-credit": &StringTransform{TargetTag: "figcaption"},
		},
	},
}

MashableComExtractor provides the custom extraction rules for mashable.com. JavaScript equivalent: `export const MashableComExtractor = { ... }`.

View Source
// mediumSpanTextPattern matches text made up solely of ASCII letters and
// parentheses. It is compiled once at package scope rather than on every
// transform call.
var mediumSpanTextPattern = regexp.MustCompile(`^[a-zA-Z()]+$`)

// MediumCustomExtractor provides the custom extraction rules for medium.com.
// Dek is explicitly unset; NextPageURL and Excerpt carry empty selector
// lists.
var MediumCustomExtractor = &CustomExtractor{
	Domain: "medium.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
			[]string{"meta[name=\"og:title\"]", "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"author\"]", "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article"},
		},

		Clean: []string{"span a", "svg"},

		Transforms: map[string]TransformFunction{
			// A span whose text is a single ASCII letter or parenthesis is
			// replaced by that bare character (presumably a drop cap;
			// confirm).
			"section span:first-of-type": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					text := selection.Text()
					if len(text) == 1 && mediumSpanTextPattern.MatchString(text) {
						// ReplaceWith interprets its argument as a CSS
						// selector, so text such as "a" or "p" would move
						// every matching element into the span's place.
						// ReplaceWithHtml inserts the text itself.
						selection.ReplaceWithHtml(text)
					}
					return nil
				},
			},

			// iframe, figure and img are handled by dedicated functions
			// defined elsewhere in the package.
			"iframe": &FunctionTransform{
				Fn: transformMediumIframe,
			},

			"figure": &FunctionTransform{
				Fn: transformMediumFigure,
			},

			"img": &FunctionTransform{
				Fn: transformMediumImage,
			},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"article:published_time\"]", "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"og:image\"]", "value"},
		},
	},

	Dek: nil,

	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

MediumCustomExtractor provides the custom extraction rules for medium.com. JavaScript equivalent: `export const MediumExtractor = { ... }`.

View Source
// MediumCustomExtractorFixed holds an alternative rule set for medium.com.
// Compared with MediumCustomExtractor it registers no iframe or figure
// transforms and routes img elements through transformMediumImageFixed.
var MediumCustomExtractorFixed = &CustomExtractor{
	Domain: "medium.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
			[]string{`meta[name="og:title"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article"},
		},

		Clean: []string{"span a", "svg"},

		Transforms: map[string]TransformFunction{
			// Unwraps a span whose entire text is a single ASCII letter or
			// parenthesis (presumably a styled drop cap), leaving the bare
			// character in its place.
			"section span:first-of-type": &FunctionTransform{
				Fn: func() func(*goquery.Selection) error {
					// Compile the pattern once, when the extractor value is
					// built, rather than on every transform call.
					singleChar := regexp.MustCompile(`^[a-zA-Z()]+$`)

					return func(selection *goquery.Selection) error {
						text := selection.Text()
						if len(text) == 1 && singleChar.MatchString(text) {
							// ReplaceWith expects a CSS selector and would
							// substitute whatever nodes match it (every <a>
							// for the letter "a", nothing for "(").
							// ReplaceWithHtml inserts the character itself;
							// the pattern admits no HTML metacharacters, so
							// no escaping is needed.
							selection.ReplaceWithHtml(text)
						}
						return nil
					}
				}(),
			},

			"img": &FunctionTransform{
				Fn: transformMediumImageFixed,
			},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific dek rule.
	Dek: nil,

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

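MediumCustomExtractorFixed provides an alternative set of custom extraction rules for Medium.com; it omits the iframe and figure transforms and uses transformMediumImageFixed for images. JavaScript equivalent: export const MediumExtractor = { ... }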

View Source
// NYMagCustomExtractor holds the custom extraction rules for nymag.com.
var NYMagCustomExtractor = &CustomExtractor{
	Domain: "nymag.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.lede-feature-title", "h1.headline-primary", "h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".by-authors", ".lede-feature-author"},
	},

	// The datetime attribute is listed ahead of the bare time element.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time.article-timestamp[datetime]", "datetime"},
			"time.article-timestamp",
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".lede-feature-teaser"},
	},

	// Present but with an empty selector list.
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.article-content", "section.body", "article.article"},
		},

		Transforms: map[string]TransformFunction{
			// h1 elements inside the body are mapped to h2.
			"h1": &StringTransform{
				TargetTag: "h2",
			},

			"noscript": &FunctionTransform{
				Fn: transformNYMagNoscript,
			},
		},

		Clean: []string{".ad", ".single-related-story"},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

NYMagCustomExtractor provides the custom extraction rules for nymag.com JavaScript equivalent: export const NYMagExtractor = { ... }

View Source
// NewYorkerCustomExtractor holds the custom extraction rules for
// www.newyorker.com.
var NewYorkerCustomExtractor = &CustomExtractor{
	Domain: "www.newyorker.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			`h1[class^="content-header"]`,
			`h1[class^="ArticleHeader__hed"]`,
			`h1[class*="ContentHeaderHed"]`,
			[]string{`meta[name="og:title"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			`article header div[class^="BylinesWrapper"]`,
			[]string{`meta[name="article:author"]`, "value"},
			`div[class^="ArticleContributors"] a[rel="author"]`,
			`article header div[class*="Byline__multipleContributors"]`,
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			"time.content-header__publish-date",
			[]string{`meta[name="pubdate"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			`div[class^="ContentHeaderDek"]`,
			"div.content-header__dek",
			`h2[class^="ArticleHeader__dek"]`,
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article__body",
				"article.article.main-content",
				`main[class^="Layout__content"]`,
			},
		},

		// Both caption parts are mapped to figcaption.
		Transforms: map[string]TransformFunction{
			".caption__text": &StringTransform{
				TargetTag: "figcaption",
			},

			".caption__credit": &StringTransform{
				TargetTag: "figcaption",
			},
		},

		Clean: []string{`footer[class^="ArticleFooter__footer"]`, "aside"},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

NewYorkerCustomExtractor provides the custom extraction rules for www.newyorker.com JavaScript equivalent: export const NewYorkerExtractor = { ... }

View Source
// NewsMynaviJpExtractor holds the custom extraction rules for news.mynavi.jp.
var NewsMynaviJpExtractor = &CustomExtractor{
	Domain: "news.mynavi.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:title"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			"a.articleHeader_name",
			"main div.article-author a.article-author__name",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.article-body", "main article div"},
		},

		Transforms: map[string]TransformFunction{
			// Copies a non-empty data-original attribute into src.
			"img": &FunctionTransform{
				Fn: func(img *goquery.Selection) error {
					original, ok := img.Attr("data-original")
					if !ok || original == "" {
						return nil
					}
					img.SetAttr("src", original)
					return nil
				},
			},
		},

		Clean: []string{},
	},
}

NewsMynaviJpExtractor provides the custom extraction rules for news.mynavi.jp JavaScript equivalent: export const NewsMynaviJpExtractor = { ... }

View Source
// NewsNationalgeographicComExtractor holds the custom extraction rules for
// news.nationalgeographic.com.
var NewsNationalgeographicComExtractor = &CustomExtractor{
	Domain: "news.nationalgeographic.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1", "h1.main-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".byline-component__contributors b span"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".article__deck"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				// .__image-lead__ is the class given to the image that the
				// transform below prepends.
				[]string{".parsys.content", ".__image-lead__"},
				".content",
			},
		},

		Transforms: map[string]TransformFunction{
			// Prepends an <img> built from the first picturefill element's
			// data-platform-src attribute, when that attribute is non-empty.
			".parsys.content": &FunctionTransform{
				Fn: func(body *goquery.Selection) error {
					picture := body.Find(".image.parbase.section").Find(".picturefill").First()

					src, ok := picture.Attr("data-platform-src")
					if !ok || src == "" {
						return nil
					}

					// NOTE(review): src is interpolated into the markup
					// without HTML escaping.
					body.PrependHtml(fmt.Sprintf(`<img class="__image-lead__" src="%s"/>`, src))
					return nil
				},
			},
		},

		Clean: []string{".pull-quote.pull-quote--large"},
	},
}

NewsNationalgeographicComExtractor provides the custom extraction rules for news.nationalgeographic.com JavaScript equivalent: export const NewsNationalgeographicComExtractor = { ... }

View Source
// PastebinCustomExtractor holds the custom extraction rules for pastebin.com.
var PastebinCustomExtractor = &CustomExtractor{
	Domain: "pastebin.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".username", ".paste_box_line2 .t_us + a"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".date", ".paste_box_line2 .t_da + span"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".source", "#selectable .text"},
		},

		// List markup is mapped to generic blocks: ol to div, li to p.
		Transforms: map[string]TransformFunction{
			"ol": &StringTransform{TargetTag: "div"},
			"li": &StringTransform{TargetTag: "p"},
		},

		Clean: []string{},
	},

	// Fields with no site-specific rule.
	Dek: nil,

	NextPageURL: nil,

	Excerpt: nil,
}

PastebinCustomExtractor provides the custom extraction rules for pastebin.com JavaScript equivalent: export const PastebinComExtractor = { ... }

View Source
// PeopleCustomExtractor holds the custom extraction rules for people.com.
var PeopleCustomExtractor = &CustomExtractor{
	Domain: "people.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			".article-header h1",
			[]string{`meta[name="og:title"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="sailthru.author"]`, "value"},
			"a.author.url.fn",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			".mntl-attribution__item-date",
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".article-header h2"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				`div[class^="loc article-content"]`,
				"div.article-body__inner",
			},
		},

		// No transforms or extra clean-up selectors for this site.
		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

PeopleCustomExtractor provides the custom extraction rules for people.com JavaScript equivalent: export const PeopleComExtractor = { ... }

View Source
// PhpspotOrgExtractor holds the custom extraction rules for phpspot.org.
var PhpspotOrgExtractor = &CustomExtractor{
	Domain: "phpspot.org",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h3.hl"},
	},

	Author: nil,

	// The date is taken from an h4 heading; Format and Timezone accompany
	// the selector.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"h4.hl"},

		Format: "YYYY年MM月DD日",

		Timezone: "Asia/Tokyo",
	},

	Dek: nil,

	LeadImageURL: nil,

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.entrybody"},
		},

		// DefaultCleaner is set explicitly to false on the content extractor.
		DefaultCleaner: false,

		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},
}

PhpspotOrgExtractor provides the custom extraction rules for phpspot.org JavaScript equivalent: export const PhpspotOrgExtractor = { ... }

View Source
// PitchforkCustomExtractor holds the custom extraction rules for
// pitchfork.com, including an extra "score" field under Extend.
var PitchforkCustomExtractor = &CustomExtractor{
	Domain: "pitchfork.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:title"]`, "value"},
			"title",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			".authors-detail__display-name",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			`div[class^="InfoSliceWrapper-"]`,
			[]string{".pub-date", "datetime"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
			".review-detail__abstract",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
			[]string{".single-album-tombstone__art img", "src"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.body__inner-container", ".review-detail__text"},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},

	// Additional field beyond the standard set.
	Extend: map[string]*FieldExtractor{
		"score": {
			Selectors: []interface{}{`p[class*="Rating"]`, ".score"},
		},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

PitchforkCustomExtractor provides the custom extraction rules for pitchfork.com JavaScript equivalent: export const PitchforkComExtractor = { ... }

View Source
// PolygonExtractor holds the custom extraction rules for www.polygon.com.
//
// Unlike most extractors in this package, its meta selectors use the
// property/content attribute names rather than name/value.
var PolygonExtractor = &CustomExtractor{
	Domain: "www.polygon.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.article-header-title",
			"h1[class*='article']",
			"h1",

			[]string{`meta[property="og:title"]`, "content"},
			[]string{`meta[name="twitter:title"]`, "content"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			".article-author",
			".meta_txt.article-author",
			".w-author-name .article-author",

			[]string{`meta[name="author"]`, "content"},
			[]string{`meta[property="article:author"]`, "content"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[property="article:published_time"]`, "content"},
			[]string{`meta[property="og:published_time"]`, "content"},
			".article-date",
			".meta_txt.article-date",
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			"header p",
			".article-excerpt",
			[]string{`meta[name="description"]`, "content"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[property="og:image"]`, "content"},
			[]string{`meta[name="twitter:image"]`, "content"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				// First candidate: a grouped list of #article-body children.
				[]interface{}{
					"#article-body .content-block-regular",
					"#article-body > p",
					"#article-body > h1",
					"#article-body > h2",
					"#article-body > h3",
					"#article-body > h4",
					"#article-body > figure",
					"#article-body > blockquote",
					"#article-body > img",
				},

				"#article-body",
				".article-body",

				"article.w-article .article-body",
				"main .w-article",

				".entry-content",
				".post-content",
			},
		},

		Transforms: map[string]TransformFunction{
			// Rewraps non-empty noscript content in a span.
			// NOTE(review): Clean below also lists "noscript"; which rule
			// wins depends on whether transforms run before Clean — confirm.
			"noscript": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					html, err := selection.Html()
					if err != nil {
						return err
					}

					if html != "" {
						selection.ReplaceWithHtml("<span>" + html + "</span>")
					}
					return nil
				},
			},
		},

		// Each selector appears exactly once.
		Clean: []string{

			".w-directory-warning",
			"nav.article-directory-sidenav",
			".sidenav-level",
			".sidenav-item",
			".directory-warning",
			"a.directory-warning",

			".article-footer-nav",
			".pagination-nav",
			".article-nav",

			"[class*='ad-']",
			"[id*='ad-']",
			".advertisement",
			".promo",

			".social-share",
			".share-buttons",
			".w-sharing-copy",

			".follow-container",
			".w-follow-btn",
			".w-like-btn",
			".option-btn",
			".btn-fab",
			".disqus-load-btn",

			".w-related-content",
			".w-header-related-feed",
			".section-header",
			".section-title",
			".display-card-title",
			".display-card",
			".w-display-card-content",
			".article-header-complementary",
			".sidebar-tabs",
			".tabs-ul",
			".tabs-header",
			".tab-content",
			".sidebar-el-content",
			".related-articles",
			".newsletter-signup",
			".email-signup",

			".w-heading-options",
			".w-header-user-box",
			".user-box-title",
			".article-header-data",

			".comments-section",
			".comment-form",
			"#disqus_thread",
			".article-comments",

			"[class*='trending']",
			"[class*='popular']",
			"[id*='trending']",
			"[id*='popular']",

			".w-login",
			".valnet-login",
			".w-valnet-login",
			"[id*='login']",

			".article-header-bg",
			".thread-option",
			".fab-label",

			"script",
			"noscript",
			"style",

			"img.c-dynamic-image",
		},
	},
}

PolygonExtractor provides the custom extraction rules for www.polygon.com

View Source
// PopSugarCustomExtractor holds the custom extraction rules for
// www.popsugar.com.
var PopSugarCustomExtractor = &CustomExtractor{
	Domain: "www.popsugar.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h2.post-title", "title-text"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	// Present but with an empty selector list.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#content"},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{".share-copy-title", ".post-tags", ".reactions"},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

PopSugarCustomExtractor provides the custom extraction rules for www.popsugar.com JavaScript equivalent: export const WwwPopsugarComExtractor = { ... }

View Source
// QdailyCustomExtractor holds the custom extraction rules for www.qdaily.com.
var QdailyCustomExtractor = &CustomExtractor{
	Domain: "www.qdaily.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h2", "h2.title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".name"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{".date.smart-date", "data-origindate"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".excerpt"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{".article-detail-hd img", "src"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".detail"},
		},

		Transforms: map[string]TransformFunction{},

		// All three spellings are listed as separate class selectors.
		Clean: []string{".lazyload", ".lazylad", ".lazylood"},
	},

	// Fields with no site-specific rule.
	NextPageURL: nil,

	Excerpt: nil,
}

QdailyCustomExtractor provides the custom extraction rules for www.qdaily.com JavaScript equivalent: export const WwwQdailyComExtractor = { ... }

View Source
// RedditCustomExtractor holds the custom extraction rules for www.reddit.com.
// Every page selector is scoped to div[data-test-id="post-content"].
var RedditCustomExtractor = &CustomExtractor{
	Domain: "www.reddit.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			`div[data-test-id="post-content"] h1`,
			`div[data-test-id="post-content"] h2`,
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{`div[data-test-id="post-content"] a[href*="user/"]`},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			`div[data-test-id="post-content"] span[data-click-id="timestamp"]`,
			`div[data-test-id="post-content"] a[data-click-id="timestamp"]`,
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			// Candidates in order: paragraphs, outbound link plus media,
			// media alone, any link, then the whole post container.
			Selectors: []interface{}{
				[]string{`div[data-test-id="post-content"] p`},
				[]string{
					`div[data-test-id="post-content"] a[target="_blank"]:not([data-click-id="timestamp"])`,
					`div[data-test-id="post-content"] div[data-click-id="media"]`,
				},
				[]string{`div[data-test-id="post-content"] div[data-click-id="media"]`},
				[]string{`div[data-test-id="post-content"] a`},
				`div[data-test-id="post-content"]`,
			},
		},

		Transforms: map[string]TransformFunction{
			`div[role="img"]`: &FunctionTransform{Fn: transformRedditImagePreview},
		},

		Clean: []string{
			".icon",
			`span[id^="PostAwardBadges"]`,
			`div a[data-test-id="comments-page-link-num-comments"]`,
		},
	},

	// Fields with no site-specific rule.
	Dek: nil,

	NextPageURL: nil,

	Excerpt: nil,
}

RedditCustomExtractor provides the custom extraction rules for www.reddit.com JavaScript equivalent: export const WwwRedditComExtractor = { ... }

View Source
// RollingStoneCustomExtractor holds the custom extraction rules for
// www.rollingstone.com.
var RollingStoneCustomExtractor = &CustomExtractor{
	Domain: "www.rollingstone.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.l-article-header__row--title", "h1.content-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"a.c-byline__link", "a.content-author.tracked-offpage"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			"time.content-published-date",
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{"h2.l-article-header__row--lead", ".content-description"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".l-article-content",
				// Two-element entry with no attribute name: both parts are
				// selectors.
				[]string{".lead-container", ".article-content"},
				".article-content",
			},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{".c-related-links-wrapper", ".module-related"},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

RollingStoneCustomExtractor provides the custom extraction rules for www.rollingstone.com JavaScript equivalent: export const WwwRollingstoneComExtractor = { ... }

View Source
// ScanNetsecurityNeJpExtractor holds the custom extraction rules for
// scan.netsecurity.ne.jp.
var ScanNetsecurityNeJpExtractor = &CustomExtractor{
	Domain: "scan.netsecurity.ne.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"header.arti-header h1.head"},
	},

	Author: nil,

	// The publication date is read from the article:modified_time meta tag.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:modified_time"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{"header.arti-header p.arti-summary"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.arti-content.arti-content--thumbnail"},

			// Set explicitly to false on the inner field extractor.
			DefaultCleaner: false,
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{"aside.arti-giga"},
	},
}

ScanNetsecurityNeJpExtractor provides the custom extraction rules for scan.netsecurity.ne.jp JavaScript equivalent: export const ScanNetsecurityNeJpExtractor = { ... }

View Source
// ScienceflyComExtractor holds the custom extraction rules for sciencefly.com.
var ScienceflyComExtractor = &CustomExtractor{
	Domain: "sciencefly.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{".entry-title", ".cb-entry-title", ".cb-single-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"div.cb-author", "div.cb-author-title"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	// Present but with an empty selector list.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	// The lead image comes from inside the same container used for content.
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"div.theiaPostSlider_slides img", "src"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.theiaPostSlider_slides"},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},
}

ScienceflyComExtractor provides the custom extraction rules for sciencefly.com JavaScript equivalent: export const ScienceflyComExtractor = { ... }

View Source
// SectIijAdJpExtractor holds the custom extraction rules for sect.iij.ad.jp.
var SectIijAdJpExtractor = &CustomExtractor{
	Domain: "sect.iij.ad.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"div.title-box-inner h1", "h3"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"p.post-author a", "dl.entrydate dd"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"time"},
	},

	Dek: nil,

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".entry-inner", "#article"},
		},

		Transforms: map[string]TransformFunction{},

		// dl.entrydate is also the second Author selector's container.
		Clean: []string{"dl.entrydate"},
	},
}

SectIijAdJpExtractor provides the custom extraction rules for sect.iij.ad.jp JavaScript equivalent: export const SectIijAdJpExtractor = { ... }

View Source
// TMZCustomExtractor holds the custom extraction rules for www.tmz.com.
var TMZCustomExtractor = &CustomExtractor{
	Domain: "www.tmz.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{".post-title-breadcrumb", "h1", ".headline"},
	},

	// NOTE(review): "TMZ STAFF" sits in the selector list, so it is
	// presumably evaluated as a CSS selector rather than used as a literal
	// byline — confirm the intended behavior.
	Author: &FieldExtractor{
		Selectors: []interface{}{"TMZ STAFF"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".article__published-at", ".article-posted-date"},
	},

	// Present but with an empty selector list.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".article__blocks", ".article-content", ".all-post-body"},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{".lightbox-link"},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

TMZCustomExtractor provides the custom extraction rules for www.tmz.com JavaScript equivalent: export const WwwTmzComExtractor = { ... }

View Source
// TakagihiromitsuJpExtractor holds the custom extraction rules for
// takagi-hiromitsu.jp.
var TakagihiromitsuJpExtractor = &CustomExtractor{
	Domain: "takagi-hiromitsu.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h3"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
		},
	},

	// The date is read from the Last-Modified http-equiv meta tag.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[http-equiv="Last-Modified"]`, "value"},
		},
	},

	Dek: nil,

	LeadImageURL: nil,

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.body"},

			// Set explicitly to false on the inner field extractor.
			DefaultCleaner: false,
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},
}

TakagihiromitsuJpExtractor provides the custom extraction rules for takagi-hiromitsu.jp JavaScript equivalent: export const TakagihiromitsuJpExtractor = { ... }

View Source
// TechlogIijAdJpExtractor holds the custom extraction rules for
// techlog.iij.ad.jp.
var TechlogIijAdJpExtractor = &CustomExtractor{
	Domain: "techlog.iij.ad.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.entry-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{`a[rel="author"]`},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time.entry-date", "datetime"},
		},
	},

	Dek: nil,

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.entry-content"},

			// Set explicitly to false on the inner field extractor.
			DefaultCleaner: false,
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{".wp_social_bookmarking_light"},
	},
}

TechlogIijAdJpExtractor provides the custom extraction rules for techlog.iij.ad.jp JavaScript equivalent: export const TechlogIijAdJpExtractor = { ... }

View Source
// TheAtlanticCustomExtractor holds the custom extraction rules for
// www.theatlantic.com.
var TheAtlanticCustomExtractor = &CustomExtractor{
	Domain: "www.theatlantic.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1", ".c-article-header__hed"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
			".c-byline__author",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`time[itemprop="datePublished"]`, "datetime"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article", ".article-body"},
		},

		Transforms: map[string]TransformFunction{},

		// Selectors stripped from the extracted content.
		Clean: []string{
			".partner-box",
			".callout",
			".c-article-writer__image",
			".c-article-writer__content",
			".c-letters-cta__text",
			".c-footer__logo",
			".c-recirculation-link",
			".twitter-tweet",
		},
	},

	// Present but with empty selector lists.
	NextPageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

TheAtlanticCustomExtractor provides the custom extraction rules for www.theatlantic.com JavaScript equivalent: export const TheAtlanticExtractor = { ... }

View Source
// ThoughtCatalogCustomExtractor holds the custom extraction rules for
// thoughtcatalog.com.
var ThoughtCatalogCustomExtractor = &CustomExtractor{
	Domain: "thoughtcatalog.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.title",
			[]string{`meta[name="og:title"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			"cite a",
			"div.col-xs-12.article_header div.writer-container.writer-container-inline.writer-no-avatar h4.writer-name",
			"h1.writer-name",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".entry.post"},
		},

		Transforms: map[string]TransformFunction{},

		// Captions are removed along with .tc_mark.
		Clean: []string{".tc_mark", "figcaption"},
	},

	// Fields with no site-specific rule.
	Dek: nil,

	NextPageURL: nil,

	Excerpt: nil,
}

ThoughtCatalogCustomExtractor provides the custom extraction rules for thoughtcatalog.com JavaScript equivalent: export const ThoughtcatalogComExtractor = { ... }

View Source
// TimesofindiaIndiatimesComExtractor holds the custom extraction rules for
// timesofindia.indiatimes.com, including an extra "reporter" field under
// Extend.
var TimesofindiaIndiatimesComExtractor = &CustomExtractor{
	Domain: "timesofindia.indiatimes.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Author: nil,

	// .byline is the date source here and is also removed from the content
	// by Clean below.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".byline"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.contentwrapper:has(section)"},
		},

		// DefaultCleaner is set explicitly to false on the content extractor.
		DefaultCleaner: false,

		Transforms: map[string]TransformFunction{},

		Clean: []string{
			"section",
			"h1",
			".byline",
			".img_cptn",
			".icon_share_wrap",
			`ul[itemtype="https://schema.org/BreadcrumbList"]`,
		},
	},

	// Additional field beyond the standard set.
	Extend: map[string]*FieldExtractor{
		"reporter": {
			Selectors: []interface{}{"div.byline"},
		},
	},

	// Fields with no site-specific rule.
	Dek: nil,

	NextPageURL: nil,

	Excerpt: nil,
}

TimesofindiaIndiatimesComExtractor provides the custom extraction rules for timesofindia.indiatimes.com JavaScript equivalent: export const TimesofindiaIndiatimesComExtractor = { ... }

View Source
// TwitterCustomExtractor holds the custom extraction rules for twitter.com.
var TwitterCustomExtractor = &CustomExtractor{
	Domain: "twitter.com",

	Title: nil,

	Author: &FieldExtractor{
		Selectors: []interface{}{".tweet.permalink-tweet .username"},
	},

	// The timestamp is read from the data-time-ms attribute.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`.permalink-tweet ._timestamp[data-time-ms]`, "data-time-ms"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{`.permalink[role=main]`},

			// Set explicitly to false on the inner field extractor.
			DefaultCleaner: false,
		},

		Transforms: map[string]TransformFunction{
			// The content root itself is handed to transformTwitterPermalink.
			`.permalink[role=main]`: &FunctionTransform{
				Fn: transformTwitterPermalink,
			},

			// s elements are mapped to span.
			"s": &StringTransform{
				TargetTag: "span",
			},
		},

		Clean: []string{".stream-item-footer", "button", ".tweet-details-fixer"},
	},

	// Fields with no site-specific rule.
	LeadImageURL: nil,

	Dek: nil,

	NextPageURL: nil,

	Excerpt: nil,
}

TwitterCustomExtractor provides the custom extraction rules for twitter.com JavaScript equivalent: export const TwitterExtractor = { ... }

View Source
// TwofortysevensportsComExtractor holds the custom extraction rules for
// 247sports.com.
var TwofortysevensportsComExtractor = &CustomExtractor{
	Domain: "247sports.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"title", "article header h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".article-cnt__author", ".author"},
	},

	// The date is read from the data-published attribute of a time element.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time[data-published]", "data-published"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".article-body", "section.body.article"},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},
}

TwofortysevensportsComExtractor provides the custom extraction rules for 247sports.com JavaScript equivalent: export const twofortysevensportsComExtractor = { ... }

View Source
// USMagazineCustomExtractor holds the custom extraction rules for
// www.usmagazine.com.
var USMagazineCustomExtractor = &CustomExtractor{
	Domain: "www.usmagazine.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.article-content"},
		},

		Clean: []string{".module-related"},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"header h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"a.author", "a.article-byline.tracked-offpage"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// The remaining fields are non-nil extractors with an empty selector list.
	Dek: &FieldExtractor{Selectors: []interface{}{}},

	Excerpt: &FieldExtractor{Selectors: []interface{}{}},

	NextPageURL: &FieldExtractor{Selectors: []interface{}{}},
}

USMagazineCustomExtractor provides the custom extraction rules for www.usmagazine.com. JavaScript equivalent: export const WwwUsmagazineComExtractor = { ... }

View Source
// UproxxCustomExtractor holds the custom extraction rules for uproxx.com.
var UproxxCustomExtractor = &CustomExtractor{
	Domain: "uproxx.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".entry-content"},
		},

		Clean: []string{},

		// Image wrappers and their credit lines are retagged as
		// figure/figcaption.
		Transforms: map[string]TransformFunction{
			"div.image .wp-media-credit": &StringTransform{
				TargetTag: "figcaption",
			},

			"div.image": &StringTransform{
				TargetTag: "figure",
			},
		},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"div.entry-header h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="qc:author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// The remaining fields are non-nil extractors with an empty selector list.
	Dek: &FieldExtractor{Selectors: []interface{}{}},

	Excerpt: &FieldExtractor{Selectors: []interface{}{}},

	NextPageURL: &FieldExtractor{Selectors: []interface{}{}},
}

UproxxCustomExtractor provides the custom extraction rules for uproxx.com. JavaScript equivalent: export const UproxxComExtractor = { ... }

View Source
// VoxCustomExtractor holds the custom extraction rules for www.vox.com.
//
// NextPageURL and Excerpt are not listed and so keep their nil zero value.
var VoxCustomExtractor = &CustomExtractor{
	Domain: "www.vox.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{`h1[class*="h74scy"]`, "h1.c-page-title"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{`p[class*="h74scyi"]`, ".p-dek"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".duet--article--article-body-component",
				"div[id*='zephr-anchor']",
				".duet--layout--entry-body",

				// Hero figure paired with the entry body, then the entry
				// body on its own.
				[]string{"figure.e-image--hero", ".c-entry-content"},
				".c-entry-content",
			},
		},

		Clean: []string{
			".duet--article--block-placement",
			".duet--article--related",
			".duet--cta--newsletter",
			"form",
			".duet--article--share-buttons",
			".duet--article--article-pullquote",
			".duet--media--caption",
		},

		Transforms: map[string]TransformFunction{
			"figure .e-image__meta": &StringTransform{TargetTag: "figcaption"},

			"figure .e-image__image noscript": &FunctionTransform{
				Fn: transformVoxNoscriptImage,
			},
		},
	},
}

VoxCustomExtractor provides the custom extraction rules for www.vox.com. JavaScript equivalent: export const WwwVoxComExtractor = { ... }

View Source
// WeeklyAsciiJpExtractor holds the custom extraction rules for
// weekly.ascii.jp.
//
// Dek is not listed and so keeps its nil zero value.
var WeeklyAsciiJpExtractor = &CustomExtractor{
	Domain: "weekly.ascii.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div#contents_detail", "div.article"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"article h1", `h1[itemprop="headline"]`},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"p.author"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			"p.date",
			[]string{`meta[name="odate"]`, "value"},
		},

		// Date layout and zone are supplied alongside the selectors.
		Timezone: "Asia/Tokyo",

		Format: "YYYY年MM月DD日 HH:mm",
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WeeklyAsciiJpExtractor provides the custom extraction rules for weekly.ascii.jp. JavaScript equivalent: export const WeeklyAsciiJpExtractor = { ... }

View Source
// WikipediaCustomExtractor holds the custom extraction rules for
// wikipedia.org.
//
// LeadImageURL, Dek, NextPageURL and Excerpt are not listed and so keep their
// nil zero value.
var WikipediaCustomExtractor = &CustomExtractor{
	Domain: "wikipedia.org",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h2.title"},
	},

	// Author is a non-nil extractor with an empty selector list.
	Author: &FieldExtractor{
		Selectors: []interface{}{},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"#footer-info-lastmod"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#mw-content-text"},

			// Explicitly false, which is also the bool zero value.
			DefaultCleaner: false,
		},

		Clean: []string{
			".mw-editsection",
			"figure tr, figure td, figure tbody",
			"#toc",
			".navbox",
		},

		// The infobox is retagged as a figure and its caption as a
		// figcaption; infobox images go through a dedicated function.
		Transforms: map[string]TransformFunction{
			".infobox": &StringTransform{TargetTag: "figure"},

			".infobox caption": &StringTransform{TargetTag: "figcaption"},

			".infobox img": &FunctionTransform{
				Fn: transformWikipediaInfoboxImg,
			},
		},
	},
}

WikipediaCustomExtractor provides the custom extraction rules for wikipedia.org. JavaScript equivalent: export const WikipediaExtractor = { ... }

View Source
// WiredJpExtractor holds the custom extraction rules for wired.jp.
var WiredJpExtractor = &CustomExtractor{
	Domain: "wired.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			`h1[data-testid="ContentHeaderHed"]`,
			"h1.post-title",
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			`div[class^="ContentHeaderDek"]`,
			".post-intro",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			`p[itemprop="author"]`,
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			[]string{"time", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				`div[data-attribute-verso-pattern="article-body"]`,
				"article.article-detail",
			},
		},

		Clean: []string{
			".post-category",
			"time",
			"h1.post-title",
			".social-area-syncer",
		},

		Transforms: map[string]TransformFunction{
			// Rewrites src to data-original resolved against the current
			// src. Images lacking either attribute are left untouched; a
			// URL that fails to parse is returned as an error.
			"img[data-original]": &FunctionTransform{
				Fn: func(img *goquery.Selection) error {
					lazySrc, ok := img.Attr("data-original")
					if !ok {
						return nil
					}
					currentSrc, ok := img.Attr("src")
					if !ok {
						return nil
					}

					baseURL, err := url.Parse(currentSrc)
					if err != nil {
						return err
					}
					target, err := url.Parse(lazySrc)
					if err != nil {
						return err
					}

					img.SetAttr("src", baseURL.ResolveReference(target).String())
					return nil
				},
			},
		},
	},
}

WiredJpExtractor provides the custom extraction rules for wired.jp. JavaScript equivalent: export const WiredJpExtractor = { ... }

View Source
// WwwAbendblattDeExtractor holds the custom extraction rules for
// www.abendblatt.de.
//
// NextPageURL and Excerpt are not listed and so keep their nil zero value.
var WwwAbendblattDeExtractor = &CustomExtractor{
	Domain: "www.abendblatt.de",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h2.article__header__headline"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"span.author-info__name-text"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time.teaser-stream-time", "datetime"},
			[]string{"time.article__header__date", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.article__body"},
		},

		Clean: []string{},

		// Both div and p matches are handed to DeobfuscateAbendblattText;
		// the transforms always report success.
		Transforms: map[string]TransformFunction{
			"div": &FunctionTransform{
				Fn: func(block *goquery.Selection) error {
					DeobfuscateAbendblattText(block)
					return nil
				},
			},
			"p": &FunctionTransform{
				Fn: func(paragraph *goquery.Selection) error {
					DeobfuscateAbendblattText(paragraph)
					return nil
				},
			},
		},
	},
}

WwwAbendblattDeExtractor provides the custom extraction rules for www.abendblatt.de. JavaScript equivalent: export const WwwAbendblattDeExtractor = { ... }

View Source
// WwwAndroidcentralComExtractor holds the custom extraction rules for
// www.androidcentral.com.
var WwwAndroidcentralComExtractor = &CustomExtractor{
	Domain: "www.androidcentral.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#article-body"},
		},

		Clean: []string{".intro", "blockquote"},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1", "h1.main-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="parsely-author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwAndroidcentralComExtractor provides the custom extraction rules for www.androidcentral.com. JavaScript equivalent: export const WwwAndroidcentralComExtractor = { ... }

View Source
// WwwAsahiComExtractor holds the custom extraction rules for www.asahi.com.
//
// Dek is not listed and so keeps its nil zero value.
var WwwAsahiComExtractor = &CustomExtractor{
	Domain: "www.asahi.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"main"},
		},

		// Set on the ContentExtractor itself, not on the nested
		// FieldExtractor. Explicitly false, which is also the bool zero value.
		DefaultCleaner: false,

		Clean: []string{
			"div.AdMod",
			"div.LoginSelectArea",
			"time",
			"div.notPrint",
		},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"main h1", ".ArticleTitle h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="pubdate"]`, "value"},
		},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwAsahiComExtractor provides the custom extraction rules for www.asahi.com. JavaScript equivalent: export const WwwAsahiComExtractor = { ... }

View Source
// WwwCbcCaExtractor holds the custom extraction rules for www.cbc.ca.
//
// NextPageURL and Excerpt are not listed and so keep their nil zero value.
var WwwCbcCaExtractor = &CustomExtractor{
	Domain: "www.cbc.ca",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".deck"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".authorText", ".bylineDetails"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{".timeStamp[datetime]", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".story"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

WwwCbcCaExtractor provides the custom extraction rules for www.cbc.ca. JavaScript equivalent: export const WwwCbcCaExtractor = { ... }

View Source
// WwwCbssportsComExtractor holds the custom extraction rules for
// www.cbssports.com.
var WwwCbssportsComExtractor = &CustomExtractor{
	Domain: "www.cbssports.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".article"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{".Article-headline", ".article-headline"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".Article-subline", ".article-subline"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".ArticleAuthor-nameText", ".author-name"},
	},

	DatePublished: &FieldExtractor{
		Timezone: "UTC",

		Selectors: []interface{}{
			[]string{`meta[itemprop="datePublished"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwCbssportsComExtractor provides the custom extraction rules for www.cbssports.com. JavaScript equivalent: export const WwwCbssportsComExtractor = { ... }

View Source
// WwwCnetComExtractor holds the custom extraction rules for www.cnet.com.
var WwwCnetComExtractor = &CustomExtractor{
	Domain: "www.cnet.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:title"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".c-head_dek", ".article-dek"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"span.author", "a.author"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"time"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				// Lead image (tagged by the transform below) together with
				// the body, then the body on its own.
				[]interface{}{"img.__image-lead__", ".article-main-body"},
				".article-main-body",
			},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{
			// Sizes the figure's image(s) to 100%, tags them with the
			// __image-lead__ class, removes .imgContainer descendants and
			// prepends the image(s) to the figure. Figures without an img are
			// left untouched.
			"figure.image": &FunctionTransform{
				Fn: func(figure *goquery.Selection) error {
					lead := figure.Find("img")
					if lead.Length() == 0 {
						return nil
					}

					lead.SetAttr("width", "100%").
						SetAttr("height", "100%").
						AddClass("__image-lead__")

					figure.Find(".imgContainer").Remove()
					figure.PrependSelection(lead)
					return nil
				},
			},
		},
	},
}

WwwCnetComExtractor provides the custom extraction rules for www.cnet.com. JavaScript equivalent: export const WwwCnetComExtractor = { ... }

View Source
// WwwEngadgetComExtractor holds the custom extraction rules for
// www.engadget.com.
var WwwEngadgetComExtractor = &CustomExtractor{
	Domain: "www.engadget.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				// Figures in #page_body that sit outside div.article-text,
				// grouped with div.article-text itself.
				[]interface{}{
					"#page_body figure:not(div.article-text figure)",
					"div.article-text",
				},
			},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:title"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{`div[class*="o-title_mark"] div`},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{`a.th-meta[data-ylk*="subsec:author"]`},
	},

	// DatePublished and LeadImageURL are non-nil extractors with an empty
	// selector list.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{},
	},
}

WwwEngadgetComExtractor provides the custom extraction rules for www.engadget.com. JavaScript equivalent: export const WwwEngadgetComExtractor = { ... }

View Source
// WwwFortinetComExtractor holds the custom extraction rules for
// www.fortinet.com.
var WwwFortinetComExtractor = &CustomExtractor{
	Domain: "www.fortinet.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			".b15-blog-meta__author",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"article:published_time\"]", "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"og:image\"]", "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"div.responsivegrid.aem-GridColumn.aem-GridColumn--default--12",
			},
		},

		Transforms: map[string]TransformFunction{
			// A <noscript> whose only element child is an <img> is replaced
			// by <figure><img></figure>. Anything else is left untouched.
			"noscript": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					children := selection.Children()
					if children.Length() != 1 {
						return nil
					}

					img := children.First()
					if !img.Is("img") {
						return nil
					}

					// WrapHtml wraps the image itself in a new <figure>, so
					// img.Parent() is that figure. The previous WrapInner
					// call takes a selector rather than markup and targets
					// the element's contents, so Parent() was still the
					// <noscript> and no figure was produced.
					img.WrapHtml("<figure></figure>")

					// Moving the figure next to the <noscript> and removing
					// the <noscript> leaves only the figure in the document.
					selection.ReplaceWithSelection(img.Parent())
					return nil
				},
			},
		},

		Clean: []string{},
	},
}

WwwFortinetComExtractor provides the custom extraction rules for www.fortinet.com. JavaScript equivalent: export const WwwFortinetComExtractor = { ... }

View Source
// WwwGizmodoJpExtractor holds the custom extraction rules for
// www.gizmodo.jp.
//
// Dek is not listed and so keeps its nil zero value.
var WwwGizmodoJpExtractor = &CustomExtractor{
	Domain: "www.gizmodo.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.p-post-title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"li.p-post-AssistAuthor"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"li.p-post-AssistTime time", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article.p-post"},
		},

		Clean: []string{"h1.p-post-title", "ul.p-post-Assist"},

		Transforms: map[string]TransformFunction{
			// Rewrites the thumbnail's src: every %27 becomes an apostrophe,
			// everything up to and including the last "='" is dropped, and
			// a trailing "';" is trimmed. Images without src are untouched.
			//
			// NOTE(review): the %27 replacement covers the whole value, so
			// an encoded apostrophe inside the kept URL is decoded too.
			"img.p-post-thumbnailImage": &FunctionTransform{
				Fn: func(img *goquery.Selection) error {
					raw, ok := img.Attr("src")
					if !ok {
						return nil
					}

					cleaned := strings.ReplaceAll(raw, "%27", "'")
					if cut := strings.LastIndex(cleaned, "='"); cut >= 0 {
						cleaned = cleaned[cut+2:]
					}

					img.SetAttr("src", strings.TrimSuffix(cleaned, "';"))
					return nil
				},
			},
		},
	},
}

WwwGizmodoJpExtractor provides the custom extraction rules for www.gizmodo.jp. JavaScript equivalent: export const WwwGizmodoJpExtractor = { ... }

View Source
// WwwGrueneDeExtractor holds the custom extraction rules for www.gruene.de.
//
// Author, DatePublished, Dek, NextPageURL and Excerpt are explicitly nil.
var WwwGrueneDeExtractor = &CustomExtractor{
	Domain: "www.gruene.de",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"header h1",
		},
	},

	Author: nil,

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{

				[]string{"section header", "section h2", "section p", "section ol"},
			},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{
			"figcaption",
			"p[class]",
		},
	},

	DatePublished: nil,

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[property=\"og:image\"]", "content"},

			// Fallback in the name/value form that every other extractor
			// in this package uses for og:image.
			// NOTE(review): presumably meta tags are normalized to
			// name/value before selectors run, in which case only this
			// entry can match — confirm against the selector engine.
			[]string{"meta[name=\"og:image\"]", "value"},
		},
	},

	Dek: nil,

	NextPageURL: nil,

	Excerpt: nil,
}

WwwGrueneDeExtractor provides the custom extraction rules for www.gruene.de. JavaScript equivalent: export const WwwGrueneDeExtractor = { ... }

View Source
// WwwInfoqComExtractor holds the custom extraction rules for www.infoq.com.
var WwwInfoqComExtractor = &CustomExtractor{
	Domain: "www.infoq.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.article__data"},

			// Explicitly false, which is also the bool zero value.
			DefaultCleaner: false,
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.heading"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"div.widget.article__authors"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".article__readTime.date"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwInfoqComExtractor provides the custom extraction rules for www.infoq.com. JavaScript equivalent: export const WwwInfoqComExtractor = { ... }

View Source
// WwwIpaGoJpExtractor holds the custom extraction rules for www.ipa.go.jp.
//
// Author, Dek and LeadImageURL are not listed and so keep their nil zero
// value.
var WwwIpaGoJpExtractor = &CustomExtractor{
	Domain: "www.ipa.go.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#ipar_main"},

			// Explicitly false, which is also the bool zero value.
			DefaultCleaner: false,
		},

		// The element used for DatePublished is removed from the content.
		Clean: []string{"p.ipar_text_right"},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"p.ipar_text_right"},
	},
}

WwwIpaGoJpExtractor provides the custom extraction rules for www.ipa.go.jp. JavaScript equivalent: export const WwwIpaGoJpExtractor = { ... }

View Source
// WwwItmediaCoJpExtractor holds the custom extraction rules for
// www.itmedia.co.jp and the additional domains in SupportedDomains.
var WwwItmediaCoJpExtractor = &CustomExtractor{
	Domain: "www.itmedia.co.jp",

	SupportedDomains: []string{
		"www.atmarkit.co.jp",
		"techtarget.itmedia.co.jp",
		"nlab.itmedia.co.jp",
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#cmsBody"},
		},

		// Set on the ContentExtractor itself, not on the nested
		// FieldExtractor. Explicitly false, which is also the bool zero value.
		DefaultCleaner: false,

		Clean: []string{"#snsSharebox"},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"#cmsTitle h1"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{"#cmsAbstract h2"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{"#byline"},
	},

	// Reads the article:modified_time meta tag.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:modified_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwItmediaCoJpExtractor provides the custom extraction rules for www.itmedia.co.jp and related domains. JavaScript equivalent: export const WwwItmediaCoJpExtractor = { ... }

View Source
// WwwJnsaOrgExtractor holds the custom extraction rules for www.jnsa.org.
//
// Author, DatePublished and Dek are not listed and so keep their nil zero
// value.
var WwwJnsaOrgExtractor = &CustomExtractor{
	Domain: "www.jnsa.org",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#main_area"},
		},

		Clean: []string{"#pankuzu", "#side"},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"#wgtitle h2"},
	},

	Excerpt: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwJnsaOrgExtractor provides the custom extraction rules for www.jnsa.org. JavaScript equivalent: export const WwwJnsaOrgExtractor = { ... }

View Source
// WwwLemondeFrExtractor holds the custom extraction rules for
// www.lemonde.fr.
//
// NextPageURL and Excerpt are not listed and so keep their nil zero value.
var WwwLemondeFrExtractor = &CustomExtractor{
	Domain: "www.lemonde.fr",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.article__title"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".article__desc"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".author__name"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".article__content"},
		},

		Clean: []string{"figcaption"},

		Transforms: map[string]TransformFunction{},
	},
}

WwwLemondeFrExtractor provides the custom extraction rules for www.lemonde.fr. JavaScript equivalent: export const WwwLemondeFrExtractor = { ... }

View Source
// WwwLifehackerJpExtractor holds the custom extraction rules for
// www.lifehacker.jp.
//
// Dek is not listed and so keeps its nil zero value.
var WwwLifehackerJpExtractor = &CustomExtractor{
	Domain: "www.lifehacker.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			`h1[class^="article_pArticle_Title"]`,
			"h1.lh-summary-title",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
			"p.lh-entryDetailInner--credit",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			[]string{"div.lh-entryDetail-header time", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				`div[class^="article_pArticle_Body__"]`,
				"div.lh-entryDetail-body",
			},
		},

		Clean: []string{"p.lh-entryDetailInner--credit"},

		Transforms: map[string]TransformFunction{
			// Rewrites a lazy-loaded image's src: every %27 becomes an
			// apostrophe, everything up to and including the last "='" is
			// dropped, and a trailing "';" is trimmed. Images without src
			// are untouched.
			//
			// NOTE(review): the %27 replacement covers the whole value, so
			// an encoded apostrophe inside the kept URL is decoded too.
			"img.lazyload": &FunctionTransform{
				Fn: func(img *goquery.Selection) error {
					raw, ok := img.Attr("src")
					if !ok {
						return nil
					}

					cleaned := strings.ReplaceAll(raw, "%27", "'")
					if cut := strings.LastIndex(cleaned, "='"); cut >= 0 {
						cleaned = cleaned[cut+2:]
					}

					img.SetAttr("src", strings.TrimSuffix(cleaned, "';"))
					return nil
				},
			},
		},
	},
}

WwwLifehackerJpExtractor provides the custom extraction rules for www.lifehacker.jp. JavaScript equivalent: export const WwwLifehackerJpExtractor = { ... }

View Source
// WwwMacrumorsComExtractor holds the custom extraction rules for
// www.macrumors.com.
var WwwMacrumorsComExtractor = &CustomExtractor{
	Domain: "www.macrumors.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article", ".article"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1", "h1.title"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{`article a[rel="author"]`, ".author-url"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time", "datetime"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwMacrumorsComExtractor provides the custom extraction rules for www.macrumors.com. JavaScript equivalent: export const WwwMacrumorsComExtractor = { ... }

View Source
// WwwMoongiftJpExtractor holds the custom extraction rules for
// www.moongift.jp.
//
// Author is not listed and so keeps its nil zero value.
var WwwMoongiftJpExtractor = &CustomExtractor{
	Domain: "www.moongift.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#main"},
		},

		Clean: []string{"ul.mg_service.cf"},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.title a"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:description"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Timezone: "Asia/Tokyo",

		Selectors: []interface{}{"ul.meta li:not(.social):first-of-type"},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwMoongiftJpExtractor provides the custom extraction rules for www.moongift.jp. JavaScript equivalent: export const WwwMoongiftJpExtractor = { ... }

View Source
// WwwNationalgeographicComExtractor holds the custom extraction rules for
// www.nationalgeographic.com.
var WwwNationalgeographicComExtractor = &CustomExtractor{
	Domain: "www.nationalgeographic.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1", "h1.main-title"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".Article__Headline__Desc", ".article__deck"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".byline-component__contributors b span"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"section.Article__Content",
				[]string{".parsys.content", ".__image-lead__"},
				".content",
			},
		},

		Clean: []string{".pull-quote.pull-quote--small"},

		Transforms: map[string]TransformFunction{
			// Prepends lead-image markup tagged __image-lead__ to the
			// content. When the first child is an image group, both platform
			// image paths must be present and non-empty. Otherwise a single
			// image is built from the first .picturefill's data-platform-src.
			//
			// NOTE(review): attribute values are interpolated into the
			// markup without escaping.
			".parsys.content": &FunctionTransform{
				Fn: func(body *goquery.Selection) error {
					if lead := body.Children().First(); lead.HasClass("imageGroup") {
						holder := lead.Find(".media--medium__container").Children().First()
						first, hasFirst := holder.Attr("data-platform-image1-path")
						second, hasSecond := holder.Attr("data-platform-image2-path")
						if hasFirst && hasSecond && first != "" && second != "" {
							body.PrependHtml(fmt.Sprintf(`<div class="__image-lead__">
								<img src="%s"/>
								<img src="%s"/>
							</div>`, first, second))
						}
						return nil
					}

					picture := body.Find(".image.parbase.section").Find(".picturefill").First()
					if src, ok := picture.Attr("data-platform-src"); ok && src != "" {
						body.PrependHtml(fmt.Sprintf(`<img class="__image-lead__" src="%s"/>`, src))
					}
					return nil
				},
			},
		},
	},
}

WwwNationalgeographicComExtractor provides the custom extraction rules for www.nationalgeographic.com. JavaScript equivalent: export const WwwNationalgeographicComExtractor = { ... }

View Source
// WwwOssnewsJpExtractor holds the custom extraction rules for
// www.ossnews.jp.
//
// Author and Dek are not listed and so keep their nil zero value.
var WwwOssnewsJpExtractor = &CustomExtractor{
	Domain: "www.ossnews.jp",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#alpha-block .section:has(h1.hxnewstitle)"},
		},

		// Set on the ContentExtractor itself, not on the nested
		// FieldExtractor. Explicitly false, which is also the bool zero value.
		DefaultCleaner: false,

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"#alpha-block h1.hxnewstitle"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"p.fs12"},

		// Date layout and zone are supplied alongside the selector.
		Timezone: "Asia/Tokyo",

		Format: "YYYY年MM月DD日 HH:mm",
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},
}

WwwOssnewsJpExtractor provides the custom extraction rules for www.ossnews.jp. JavaScript equivalent: export const WwwOssnewsJpExtractor = { ... }

View Source
// WwwPhoronixComExtractor holds the custom extraction rules for
// www.phoronix.com.
//
// Dek and LeadImageURL are not listed and so keep their nil zero value.
var WwwPhoronixComExtractor = &CustomExtractor{
	Domain: "www.phoronix.com",

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".content"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},

	Title: &FieldExtractor{
		Selectors: []interface{}{"article h1", "article header"},
	},

	// Author and DatePublished both read from the .author element: the
	// author from its first-child link, the date from the element itself.
	Author: &FieldExtractor{
		Selectors: []interface{}{".author a:first-child"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".author"},
	},
}

WwwPhoronixComExtractor provides the custom extraction rules for www.phoronix.com. JavaScript equivalent: export const WwwPhoronixComExtractor = { ... }

View Source
// WwwProspectmagazineCoUkExtractor holds the custom extraction rules for
// www.prospectmagazine.co.uk.
//
// NextPageURL and Excerpt are not listed and so keep their nil zero value.
var WwwProspectmagazineCoUkExtractor = &CustomExtractor{
	Domain: "www.prospectmagazine.co.uk",

	Title: &FieldExtractor{
		Selectors: []interface{}{".blog-header__title", ".page-title"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".blog-header__description", ".page-subtitle"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".blog-header__author-link", ".aside_author .title"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
			".post-info",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".blog__container", "article .post_content"},
		},

		Clean: []string{},

		Transforms: map[string]TransformFunction{},
	},
}

WwwProspectmagazineCoUkExtractor provides the custom extraction rules for www.prospectmagazine.co.uk. JavaScript equivalent: export const WwwProspectmagazineCoUkExtractor = { ... }

View Source
// WwwPublickey1JpExtractor holds the site-specific extraction rules for
// www.publickey1.jp.
var WwwPublickey1JpExtractor = &CustomExtractor{
	Domain: "www.publickey1.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".bloggerinchief p:first-of-type", "#subcol p:has(img)"},
	},

	// The publication date is matched against an explicit date pattern and
	// carries an explicit time zone.
	DatePublished: &FieldExtractor{
		Selectors: []interface{}{"div.pubdate"},
		Format:    "YYYY年MM月DD日",
		Timezone:  "Asia/Tokyo",
	},

	// No site-specific dek rule is defined.
	Dek: nil,

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"#maincol"},
		},

		DefaultCleaner: false,

		Transforms: map[string]TransformFunction{},

		// Page chrome stripped from the extracted content.
		Clean: []string{"#breadcrumbs", "div.sbm", "div.ad_footer"},
	},
}

WwwPublickey1JpExtractor provides the custom extraction rules for www.publickey1.jp. JavaScript equivalent: export const WwwPublickey1JpExtractor = { ... }

View Source
// WwwRbbtodayComExtractor holds the site-specific extraction rules for
// www.rbbtoday.com.
var WwwRbbtodayComExtractor = &CustomExtractor{
	Domain: "www.rbbtoday.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".writer.writer-name"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"header time", "datetime"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="description"]`, "value"},
			".arti-summary",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{".arti-content"},
		},

		Transforms: map[string]TransformFunction{},

		// Removed from the extracted content.
		Clean: []string{".arti-giga"},
	},
}

WwwRbbtodayComExtractor provides the custom extraction rules for www.rbbtoday.com. JavaScript equivalent: export const WwwRbbtodayComExtractor = { ... }

View Source
// WwwRockpapershotgunComExtractor holds the site-specific extraction rules
// for www.rockpapershotgun.com.
//
// Every other extractor in this package reads metadata through the
// normalized form `meta[name="…"]` + "value". This one originally used only
// the raw `property`/`content` form. Both spellings are listed here (the same
// approach WwwSpektrumDeExtractor takes for og:image) so the metadata
// fallbacks match either way.
// NOTE(review): presumably meta tags are normalized before selectors run —
// confirm; if so the raw property/content entries can be dropped.
var WwwRockpapershotgunComExtractor = &CustomExtractor{
	Domain: "www.rockpapershotgun.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1.title",
			"h1",
			[]string{"meta[name=\"og:title\"]", "value"},
			[]string{"meta[property=\"og:title\"]", "content"},
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			".byline .author a",
			".byline .author",
			[]string{"meta[name=\"author\"]", "value"},
			[]string{"meta[name=\"author\"]", "content"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"time", "datetime"},
			[]string{"meta[name=\"article:published_time\"]", "value"},
			[]string{"meta[property=\"article:published_time\"]", "content"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			"p.strapline",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"og:image\"]", "value"},
			[]string{"meta[property=\"og:image\"]", "content"},
			".headline_image",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".article_body_content.article-styling",
				".article_body_content",
				".article-content",
				"article .article_body",
			},
		},

		Clean: []string{
			// Advertising slots and injected placeholders.
			".inlinead",
			".desktop_mpu",
			".mpu_container",
			".advert_container",
			".leaderboard_container",
			".injection_placeholder",
			"span.injection_placeholder",
			"[data-position]",

			// Footer, comments and sign-in furniture.
			".read-next",
			".article_footer",
			".comments__link",
			".load-comments",
			".smart-slot",
			".sign-in-buttons",

			// Byline/metadata blocks.
			".byline",
			".metadata",
			".avatar",
			".published_at",
			".tagged_with",
			".author-inline",

			// Interactive elements.
			"button",
			"form",

			".social-sign-in-button",

			".tagged_with_item",

			".comments-bubble",
		},
	},
}

WwwRockpapershotgunComExtractor provides the custom extraction rules for www.rockpapershotgun.com

View Source
// WwwSbnationComExtractor holds the site-specific extraction rules for
// www.sbnation.com.
var WwwSbnationComExtractor = &CustomExtractor{
	Domain: "www.sbnation.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.c-page-title"},
	},

	// Author, publication date and lead image all come from <meta> tags.
	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="author"]`, "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{"p.c-entry-summary.p-dek", "h2.c-entry-summary.p-dek"},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.c-entry-content"},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},
}

WwwSbnationComExtractor provides the custom extraction rules for www.sbnation.com. JavaScript equivalent: export const WwwSbnationComExtractor = { ... }

View Source
// WwwSiComExtractor holds the site-specific extraction rules for www.si.com.
var WwwSiComExtractor = &CustomExtractor{
	Domain: "www.si.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			"h1",
			"h1.headline",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"author\"]", "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"published\"]", "value"},
		},
		Timezone: "America/New_York",
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			".m-detail-header--dek",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"og:image\"]", "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				".m-detail--body",
				[]interface{}{"p", ".marquee_large_2x", ".component.image"},
			},
		},

		Transforms: map[string]TransformFunction{

			// A <noscript> whose only child element is an <img> is replaced
			// by a <figure> wrapping that image; any other <noscript> is
			// left untouched.
			"noscript": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					children := selection.Children()
					if children.Length() != 1 || goquery.NodeName(children) != "img" {
						return nil
					}

					// Selection.Html yields only the *inner* HTML of the
					// first element, which is always empty for a void
					// <img>. OuterHtml keeps the tag and its attributes so
					// the image survives the replacement.
					imgHTML, err := goquery.OuterHtml(children)
					if err != nil {
						return err
					}
					selection.ReplaceWithHtml("<figure>" + imgHTML + "</figure>")
					return nil
				},
			},
		},

		Clean: []string{
			".inline-thumb",
			".primary-message",
			".description",
			".instructions",
		},
	},
}

WwwSiComExtractor provides the custom extraction rules for www.si.com. JavaScript equivalent: export const WwwSiComExtractor = { ... }

View Source
// WwwSpektrumDeExtractor holds the site-specific extraction rules for
// www.spektrum.de.
var WwwSpektrumDeExtractor = &CustomExtractor{
	Domain: "www.spektrum.de",

	Title: &FieldExtractor{
		Selectors: []interface{}{".content__title"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{".content__author__info__name"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{".content__meta__date"},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{".content__intro"},
	},

	// The og:image tag is listed in both its name/value and its
	// property/content spelling, followed by the top-of-article image.
	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
			[]string{`meta[property="og:image"]`, "content"},
			".image__article__top img",
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article.content"},
		},

		Transforms: map[string]TransformFunction{},

		// Stripped from the extracted content.
		Clean: []string{
			".breadcrumbs",
			".hide-for-print",
			"aside",
			"header h2",
			".image__article__top",
			".content__author",
			".copyright",
			".callout-box",
		},
	},

	// No site-specific rule is defined for these two fields.
	NextPageURL: nil,
	Excerpt:     nil,
}

WwwSpektrumDeExtractor provides the custom extraction rules for www.spektrum.de. JavaScript equivalent: export const SpektrumExtractor = { ... }

View Source
// WwwThevergeComExtractor holds the site-specific extraction rules for
// www.theverge.com.
var WwwThevergeComExtractor = &CustomExtractor{
	Domain: "www.theverge.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1"},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"author\"]", "value"},
			[]string{"meta[name=\"parsely-author\"]", "value"},
			[]string{"meta[name=\"cse-authors\"]", "value"},
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"article:published_time\"]", "value"},
		},
	},

	Dek: &FieldExtractor{
		Selectors: []interface{}{
			".p-dek",
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[name=\"og:image\"]", "value"},
		},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{

				".duet--article--article-body-component",

				"div[id*='zephr-anchor']",

				"article",
				".article-content",

				[]interface{}{".c-entry-hero .e-image", ".c-entry-intro", ".c-entry-content"},
				[]interface{}{".e-image--hero", ".c-entry-content"},
				".l-wrapper .l-feature",
				"div.c-entry-content",
			},
		},

		Transforms: map[string]TransformFunction{

			// A <noscript> whose only child element is an <img> is replaced
			// by a <span> wrapping that image; any other <noscript> is left
			// untouched.
			"noscript": &FunctionTransform{
				Fn: func(selection *goquery.Selection) error {
					children := selection.Children()
					if children.Length() != 1 || goquery.NodeName(children) != "img" {
						return nil
					}

					// Selection.Html yields only the *inner* HTML of the
					// first element, which is always empty for a void
					// <img>. OuterHtml keeps the tag and its attributes so
					// the image survives the replacement.
					imgHTML, err := goquery.OuterHtml(children)
					if err != nil {
						return err
					}
					selection.ReplaceWithHtml("<span>" + imgHTML + "</span>")
					return nil
				},
			},
		},

		Clean: []string{
			".aside",
			"img.c-dynamic-image",

			".duet--article--image-gallery-two-up .kqz8fh5 .kqz8fh8 .kqz8fh7",
			".duet--article--image-gallery-two-up .kqz8fha .kqz8fh9",
			"div[class*='image-gallery'] img[srcset]",
			".duet--media--content-warning",
			"._1etxtj1",

			".c-related-list",
			".c-entry-group-labels",
			".c-follow-button",
			".tly2fw0",
			"button",

			".c-image-gallery__nav",
			"[class*='follow']",
		},
	},
}

WwwThevergeComExtractor provides the custom extraction rules for www.theverge.com. JavaScript equivalent: export const WwwThevergeComExtractor = { ... }

View Source
// WwwWiredComExtractor holds the site-specific extraction rules for
// www.wired.com.
var WwwWiredComExtractor = &CustomExtractor{
	Domain: "www.wired.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{`h1[data-testId="ContentHeaderHed"]`},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:author"]`, "value"},
			`a[rel="author"]`,
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// A dek extractor is present but lists no selectors.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"article.article.main-content", "article.content"},
		},

		Transforms: map[string]TransformFunction{},

		// Stripped from the extracted content.
		Clean: []string{".visually-hidden", "figcaption img.photo", ".alert-message"},
	},

	// No site-specific rule is defined for these two fields.
	NextPageURL: nil,
	Excerpt:     nil,
}

WwwWiredComExtractor provides the custom extraction rules for www.wired.com. JavaScript equivalent: export const WiredExtractor = { ... }

View Source
// WwwYomiuriCoJpExtractor holds the site-specific extraction rules for
// www.yomiuri.co.jp.
var WwwYomiuriCoJpExtractor = &CustomExtractor{
	Domain: "www.yomiuri.co.jp",

	Title: &FieldExtractor{
		Selectors: []interface{}{"h1.title-article.c-article-title"},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="article:published_time"]`, "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// No site-specific rule is defined for these two fields.
	Author: nil,
	Dek:    nil,

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{"div.p-main-contents"},
		},

		Transforms: map[string]TransformFunction{},

		Clean: []string{},
	},
}

WwwYomiuriCoJpExtractor provides the custom extraction rules for www.yomiuri.co.jp. JavaScript equivalent: export const WwwYomiuriCoJpExtractor = { ... }

View Source
// YouTubeCustomExtractor holds the site-specific extraction rules for
// www.youtube.com.
var YouTubeCustomExtractor = &CustomExtractor{
	Domain: "www.youtube.com",

	Title: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="title"]`, "value"},
			".watch-title",
			"h1.watch-title-container",
		},
	},

	Author: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"link[itemprop=\"name\"]", "content"},
			".yt-user-info",
		},
	},

	DatePublished: &FieldExtractor{
		Selectors: []interface{}{
			[]string{"meta[itemProp=\"datePublished\"]", "value"},
		},
	},

	LeadImageURL: &FieldExtractor{
		Selectors: []interface{}{
			[]string{`meta[name="og:image"]`, "value"},
		},
	},

	// A dek extractor is present but lists no selectors.
	Dek: &FieldExtractor{
		Selectors: []interface{}{},
	},

	Content: &ContentExtractor{
		FieldExtractor: &FieldExtractor{
			Selectors: []interface{}{
				"#player-container-outer",
				"ytd-expandable-video-description-body-renderer #description",
				// NOTE(review): the other content extractors in this package
				// write multi-element groups as []interface{}; this entry is
				// a []string — confirm it is not interpreted as a
				// selector/attribute pair.
				[]string{"#player-api", "#description"},
			},
			DefaultCleaner: false,
		},

		// Both player containers are rewritten by helper functions defined
		// elsewhere in the package.
		Transforms: map[string]TransformFunction{
			"#player-api":             &FunctionTransform{Fn: transformYouTubePlayerAPI},
			"#player-container-outer": &FunctionTransform{Fn: transformYouTubePlayerContainer},
		},

		Clean: []string{},
	},

	// No site-specific rule is defined for these two fields.
	NextPageURL: nil,
	Excerpt:     nil,
}

YouTubeCustomExtractor provides the custom extraction rules for www.youtube.com. JavaScript equivalent: export const WwwYoutubeComExtractor = { ... }

Functions

func BuildAllExtractorsMap

func BuildAllExtractorsMap(extractors []*CustomExtractor) map[string]*CustomExtractor

BuildAllExtractorsMap creates the complete domain mapping. JavaScript equivalent: the result of all.js processing.

func CountCustomExtractors

func CountCustomExtractors() int

CountCustomExtractors returns the total number of custom extractors

func DeobfuscateAbendblattText

func DeobfuscateAbendblattText(selection *goquery.Selection) *goquery.Selection

DeobfuscateAbendblattText handles the complex obfuscation transform for Abendblatt.de. JavaScript equivalent: the complex function in transforms.p and transforms.div.

func GetAllCustomExtractors

func GetAllCustomExtractors() map[string]*CustomExtractor

GetAllCustomExtractors returns all registered custom extractors. JavaScript equivalent: export * from './blogspot.com'; export * from './medium.com'; etc.

func GetAllCustomExtractorsList

func GetAllCustomExtractorsList() []string

GetAllCustomExtractorsList returns a list of all custom extractor names

func GetCustomExtractorDomains

func GetCustomExtractorDomains() []string

GetCustomExtractorDomains returns all domains covered by custom extractors

func MergeSupportedDomains

func MergeSupportedDomains(extractor *CustomExtractor) map[string]*CustomExtractor

MergeSupportedDomains creates domain mappings for an extractor. JavaScript equivalent: the mergeSupportedDomains function in utils/merge-supported-domains.js.

Types

type ContentExtractor

// ContentExtractor describes how the main content of a page is located and
// post-processed. The embedded FieldExtractor supplies the selectors; the
// remaining fields control clean-up of the matched content.
type ContentExtractor struct {
	// Embedded so that Selectors (and the other FieldExtractor fields) are
	// set through a nested FieldExtractor literal.
	*FieldExtractor
	Clean          []string                     `json:"clean"`          // Selectors whose matches are removed from the content
	Transforms     map[string]TransformFunction `json:"transforms"`     // Element transformations, keyed by selector
	DefaultCleaner bool                         `json:"defaultCleaner"` // Apply default content cleaner
}

ContentExtractor defines how to extract and clean main content. JavaScript equivalent: { selectors: [...], clean: [...], transforms: {...} }

type CustomExtractor

// CustomExtractor bundles the site-specific extraction rules for one domain.
// Each optional field holds the rule for one piece of article data; a nil
// field means no site-specific rule is defined for it.
type CustomExtractor struct {
	// Domain is the host name the rules are written for.
	Domain           string                     `json:"domain"`
	// SupportedDomains presumably lists further hosts served by the same
	// rules — see MergeSupportedDomains; confirm against its implementation.
	SupportedDomains []string                   `json:"supportedDomains,omitempty"`
	Title            *FieldExtractor            `json:"title,omitempty"`
	Author           *FieldExtractor            `json:"author,omitempty"`
	// Content additionally carries clean-up selectors and transforms.
	Content          *ContentExtractor          `json:"content,omitempty"`
	DatePublished    *FieldExtractor            `json:"date_published,omitempty"`
	LeadImageURL     *FieldExtractor            `json:"lead_image_url,omitempty"`
	Dek              *FieldExtractor            `json:"dek,omitempty"`
	NextPageURL      *FieldExtractor            `json:"next_page_url,omitempty"`
	Excerpt          *FieldExtractor            `json:"excerpt,omitempty"`
	// Extend maps a name to an additional field rule.
	// NOTE(review): presumably used for custom, non-standard output fields — confirm.
	Extend           map[string]*FieldExtractor `json:"extend,omitempty"`
}

CustomExtractor represents a site-specific content extractor. JavaScript equivalent: each extractor export in custom/[domain]/index.js.

func GetABCNewsExtractor

func GetABCNewsExtractor() *CustomExtractor

GetABCNewsExtractor returns the custom extractor for abcnews.go.com

func GetArstechnicaComExtractor

func GetArstechnicaComExtractor() *CustomExtractor

GetArstechnicaComExtractor returns the Ars Technica custom extractor

func GetBiorxivOrgExtractor

func GetBiorxivOrgExtractor() *CustomExtractor

GetBiorxivOrgExtractor returns the BioRxiv custom extractor

func GetBloggerExtractor

func GetBloggerExtractor() *CustomExtractor

GetBloggerExtractor returns the Blogger custom extractor

func GetBlogspotExtractor

func GetBlogspotExtractor() *CustomExtractor

GetBlogspotExtractor returns the Blogspot custom extractor

func GetBloombergExtractor

func GetBloombergExtractor() *CustomExtractor

GetBloombergExtractor returns the custom extractor for www.bloomberg.com

func GetBookwalkerJpExtractor

func GetBookwalkerJpExtractor() *CustomExtractor

GetBookwalkerJpExtractor returns the BookWalker Japan custom extractor

func GetBustleExtractor

func GetBustleExtractor() *CustomExtractor

GetBustleExtractor returns the Bustle custom extractor

func GetBuzzFeedExtractor

func GetBuzzFeedExtractor() *CustomExtractor

GetBuzzFeedExtractor returns the BuzzFeed custom extractor

func GetBuzzapJpExtractor

func GetBuzzapJpExtractor() *CustomExtractor

GetBuzzapJpExtractor returns the BuzzAP Japan custom extractor

func GetCNBCExtractor

func GetCNBCExtractor() *CustomExtractor

GetCNBCExtractor returns the custom extractor for www.cnbc.com

func GetCNNExtractor

func GetCNNExtractor() *CustomExtractor

GetCNNExtractor returns the custom extractor for www.cnn.com

func GetChicagoTribuneExtractor

func GetChicagoTribuneExtractor() *CustomExtractor

GetChicagoTribuneExtractor returns the custom extractor for www.chicagotribune.com

func GetClinicaltrialsGovExtractor

func GetClinicaltrialsGovExtractor() *CustomExtractor

GetClinicaltrialsGovExtractor returns the ClinicalTrials.gov custom extractor

func GetCustomExtractorByDomain

func GetCustomExtractorByDomain(domain string) (*CustomExtractor, bool)

GetCustomExtractorByDomain returns a custom extractor for a specific domain

func GetDaringFireballExtractor

func GetDaringFireballExtractor() *CustomExtractor

GetDaringFireballExtractor returns the Daring Fireball custom extractor

func GetDeadlineExtractor

func GetDeadlineExtractor() *CustomExtractor

GetDeadlineExtractor returns the Deadline.com custom extractor

func GetDeadspinComExtractor

func GetDeadspinComExtractor() *CustomExtractor

GetDeadspinComExtractor returns the Deadspin custom extractor

func GetEOnlineExtractor

func GetEOnlineExtractor() *CustomExtractor

GetEOnlineExtractor returns the E! Online custom extractor

func GetEpaperZeitDeExtractor

func GetEpaperZeitDeExtractor() *CustomExtractor

GetEpaperZeitDeExtractor returns the Zeit.de e-paper custom extractor

func GetFandomWikiaExtractor

func GetFandomWikiaExtractor() *CustomExtractor

GetFandomWikiaExtractor returns the Fandom Wikia custom extractor

func GetFortuneComExtractor

func GetFortuneComExtractor() *CustomExtractor

GetFortuneComExtractor returns the custom extractor for fortune.com

func GetGeniusExtractor

func GetGeniusExtractor() *CustomExtractor

GetGeniusExtractor returns the Genius custom extractor

func GetGetnewsJpExtractor

func GetGetnewsJpExtractor() *CustomExtractor

GetGetnewsJpExtractor returns the GetNews Japan custom extractor

func GetGithubComExtractor

func GetGithubComExtractor() *CustomExtractor

GetGithubComExtractor returns the GitHub custom extractor

func GetGothamistComExtractor

func GetGothamistComExtractor() *CustomExtractor

GetGothamistComExtractor returns the custom extractor for gothamist.com and related city sites

func GetHuffingtonPostExtractor

func GetHuffingtonPostExtractor() *CustomExtractor

GetHuffingtonPostExtractor returns the HuffingtonPost custom extractor

func GetIciRadioCanadaCaExtractor

func GetIciRadioCanadaCaExtractor() *CustomExtractor

GetIciRadioCanadaCaExtractor returns the ICI Radio-Canada custom extractor

func GetJapanCnetComExtractor

func GetJapanCnetComExtractor() *CustomExtractor

GetJapanCnetComExtractor returns the CNET Japan custom extractor

func GetJapanZdnetComExtractor

func GetJapanZdnetComExtractor() *CustomExtractor

GetJapanZdnetComExtractor returns the ZDNet Japan custom extractor

func GetJvndbJvnJpExtractor

func GetJvndbJvnJpExtractor() *CustomExtractor

GetJvndbJvnJpExtractor returns the JVNDB custom extractor

func GetLATimesExtractor

func GetLATimesExtractor() *CustomExtractor

GetLATimesExtractor returns the custom extractor for www.latimes.com

func GetLinkedInExtractor

func GetLinkedInExtractor() *CustomExtractor

GetLinkedInExtractor returns the LinkedIn custom extractor

func GetLittleThingsExtractor

func GetLittleThingsExtractor() *CustomExtractor

GetLittleThingsExtractor returns the LittleThings custom extractor

func GetMaTtiasBeExtractor

func GetMaTtiasBeExtractor() *CustomExtractor

GetMaTtiasBeExtractor returns the ma.ttias.be custom extractor

func GetMashableComExtractor

func GetMashableComExtractor() *CustomExtractor

GetMashableComExtractor returns the Mashable custom extractor

func GetMediumExtractor

func GetMediumExtractor() *CustomExtractor

GetMediumExtractor returns the Medium custom extractor

func GetMediumExtractorFixed

func GetMediumExtractorFixed() *CustomExtractor

GetMediumExtractorFixed returns the Medium custom extractor

func GetMiamiHeraldExtractor

func GetMiamiHeraldExtractor() *CustomExtractor

GetMiamiHeraldExtractor returns the custom extractor for www.miamiherald.com

func GetMoneyCNNExtractor

func GetMoneyCNNExtractor() *CustomExtractor

GetMoneyCNNExtractor returns the custom extractor for money.cnn.com

func GetNBCNewsExtractor

func GetNBCNewsExtractor() *CustomExtractor

GetNBCNewsExtractor returns the custom extractor for www.nbcnews.com

func GetNPRExtractor

func GetNPRExtractor() *CustomExtractor

GetNPRExtractor returns the custom extractor for www.npr.org

func GetNYDailyNewsExtractor

func GetNYDailyNewsExtractor() *CustomExtractor

GetNYDailyNewsExtractor returns the custom extractor for www.nydailynews.com

func GetNYMagExtractor

func GetNYMagExtractor() *CustomExtractor

GetNYMagExtractor returns the NY Magazine custom extractor

func GetNYTimesExtractor

func GetNYTimesExtractor() *CustomExtractor

GetNYTimesExtractor returns the custom extractor for www.nytimes.com

func GetNewYorkerExtractor

func GetNewYorkerExtractor() *CustomExtractor

GetNewYorkerExtractor returns the New Yorker custom extractor

func GetNewsMynaviJpExtractor

func GetNewsMynaviJpExtractor() *CustomExtractor

GetNewsMynaviJpExtractor returns the MyNavi News Japan custom extractor

func GetNewsNationalgeographicComExtractor

func GetNewsNationalgeographicComExtractor() *CustomExtractor

GetNewsNationalgeographicComExtractor returns the News National Geographic custom extractor

func GetPastebinExtractor

func GetPastebinExtractor() *CustomExtractor

GetPastebinExtractor returns the Pastebin custom extractor

func GetPeopleExtractor

func GetPeopleExtractor() *CustomExtractor

GetPeopleExtractor returns the People.com custom extractor

func GetPhpspotOrgExtractor

func GetPhpspotOrgExtractor() *CustomExtractor

GetPhpspotOrgExtractor returns the PHPSpot Japan custom extractor

func GetPitchforkExtractor

func GetPitchforkExtractor() *CustomExtractor

GetPitchforkExtractor returns the Pitchfork custom extractor

func GetPoliticoExtractor

func GetPoliticoExtractor() *CustomExtractor

GetPoliticoExtractor returns the custom extractor for www.politico.com

func GetPolygonExtractor

func GetPolygonExtractor() *CustomExtractor

GetPolygonExtractor returns the Polygon custom extractor

func GetPopSugarExtractor

func GetPopSugarExtractor() *CustomExtractor

GetPopSugarExtractor returns the PopSugar custom extractor

func GetQdailyExtractor

func GetQdailyExtractor() *CustomExtractor

GetQdailyExtractor returns the Qdaily custom extractor

func GetRedditExtractor

func GetRedditExtractor() *CustomExtractor

GetRedditExtractor returns the Reddit custom extractor

func GetReutersExtractor

func GetReutersExtractor() *CustomExtractor

GetReutersExtractor returns the custom extractor for www.reuters.com

func GetRollingStoneExtractor

func GetRollingStoneExtractor() *CustomExtractor

GetRollingStoneExtractor returns the Rolling Stone custom extractor

func GetScanNetsecurityNeJpExtractor

func GetScanNetsecurityNeJpExtractor() *CustomExtractor

GetScanNetsecurityNeJpExtractor returns the ScanNetSecurity custom extractor

func GetScienceflyComExtractor

func GetScienceflyComExtractor() *CustomExtractor

GetScienceflyComExtractor returns the ScienceFly custom extractor

func GetSectIijAdJpExtractor

func GetSectIijAdJpExtractor() *CustomExtractor

GetSectIijAdJpExtractor returns the SECT IIJ custom extractor

func GetTMZExtractor

func GetTMZExtractor() *CustomExtractor

GetTMZExtractor returns the TMZ custom extractor

func GetTakagihiromitsuJpExtractor

func GetTakagihiromitsuJpExtractor() *CustomExtractor

GetTakagihiromitsuJpExtractor returns the Takagi Hiromitsu custom extractor

func GetTechlogIijAdJpExtractor

func GetTechlogIijAdJpExtractor() *CustomExtractor

GetTechlogIijAdJpExtractor returns the TechLog IIJ custom extractor

func GetTheAtlanticExtractor

func GetTheAtlanticExtractor() *CustomExtractor

GetTheAtlanticExtractor returns The Atlantic custom extractor

func GetTheGuardianExtractor

func GetTheGuardianExtractor() *CustomExtractor

GetTheGuardianExtractor returns the custom extractor for www.theguardian.com

func GetThoughtCatalogExtractor

func GetThoughtCatalogExtractor() *CustomExtractor

GetThoughtCatalogExtractor returns the ThoughtCatalog custom extractor

func GetTimesofindiaIndiatimesComExtractor

func GetTimesofindiaIndiatimesComExtractor() *CustomExtractor

GetTimesofindiaIndiatimesComExtractor returns the Times of India custom extractor

func GetTwitterExtractor

func GetTwitterExtractor() *CustomExtractor

GetTwitterExtractor returns the Twitter custom extractor

func GetTwofortysevensportsComExtractor

func GetTwofortysevensportsComExtractor() *CustomExtractor

GetTwofortysevensportsComExtractor returns the 247Sports custom extractor

func GetUSMagazineExtractor

func GetUSMagazineExtractor() *CustomExtractor

GetUSMagazineExtractor returns the US Magazine custom extractor

func GetUproxxExtractor

func GetUproxxExtractor() *CustomExtractor

GetUproxxExtractor returns the Uproxx custom extractor

func GetVoxExtractor

func GetVoxExtractor() *CustomExtractor

GetVoxExtractor returns the Vox custom extractor

func GetWashingtonPostExtractor

func GetWashingtonPostExtractor() *CustomExtractor

GetWashingtonPostExtractor returns the custom extractor for www.washingtonpost.com

func GetWeeklyAsciiJpExtractor

func GetWeeklyAsciiJpExtractor() *CustomExtractor

GetWeeklyAsciiJpExtractor returns the Weekly ASCII Japan custom extractor

func GetWikipediaExtractor

func GetWikipediaExtractor() *CustomExtractor

GetWikipediaExtractor returns the Wikipedia custom extractor

func GetWiredJpExtractor

func GetWiredJpExtractor() *CustomExtractor

GetWiredJpExtractor returns the Wired Japan custom extractor

func GetWwwAbendblattDeExtractor

func GetWwwAbendblattDeExtractor() *CustomExtractor

GetWwwAbendblattDeExtractor returns the Abendblatt.de custom extractor

func GetWwwAlComExtractor

func GetWwwAlComExtractor() *CustomExtractor

GetWwwAlComExtractor returns the custom extractor for www.al.com

func GetWwwAmericanowComExtractor

func GetWwwAmericanowComExtractor() *CustomExtractor

GetWwwAmericanowComExtractor returns the custom extractor for www.americanow.com

func GetWwwAndroidcentralComExtractor

func GetWwwAndroidcentralComExtractor() *CustomExtractor

GetWwwAndroidcentralComExtractor returns the Android Central custom extractor

func GetWwwAolComExtractor

func GetWwwAolComExtractor() *CustomExtractor

GetWwwAolComExtractor returns the custom extractor for www.aol.com

func GetWwwApartmenttherapyComExtractor

func GetWwwApartmenttherapyComExtractor() *CustomExtractor

GetWwwApartmenttherapyComExtractor returns the custom extractor for www.apartmenttherapy.com

func GetWwwAsahiComExtractor

func GetWwwAsahiComExtractor() *CustomExtractor

GetWwwAsahiComExtractor returns the Asahi Shimbun custom extractor

func GetWwwBroadwayworldComExtractor

func GetWwwBroadwayworldComExtractor() *CustomExtractor

GetWwwBroadwayworldComExtractor returns the custom extractor for www.broadwayworld.com

func GetWwwCbcCaExtractor

func GetWwwCbcCaExtractor() *CustomExtractor

GetWwwCbcCaExtractor returns the CBC custom extractor

func GetWwwCbssportsComExtractor

func GetWwwCbssportsComExtractor() *CustomExtractor

GetWwwCbssportsComExtractor returns the CBS Sports custom extractor

func GetWwwCnetComExtractor

func GetWwwCnetComExtractor() *CustomExtractor

GetWwwCnetComExtractor returns the CNET custom extractor

func GetWwwDmagazineComExtractor

func GetWwwDmagazineComExtractor() *CustomExtractor

GetWwwDmagazineComExtractor returns the custom extractor for www.dmagazine.com

func GetWwwElecomCoJpExtractor

func GetWwwElecomCoJpExtractor() *CustomExtractor

GetWwwElecomCoJpExtractor returns the custom extractor for www.elecom.co.jp

func GetWwwEngadgetComExtractor

func GetWwwEngadgetComExtractor() *CustomExtractor

GetWwwEngadgetComExtractor returns the Engadget custom extractor

func GetWwwFastcompanyComExtractor

func GetWwwFastcompanyComExtractor() *CustomExtractor

GetWwwFastcompanyComExtractor returns the custom extractor for www.fastcompany.com

func GetWwwFoolComExtractor

func GetWwwFoolComExtractor() *CustomExtractor

GetWwwFoolComExtractor returns the custom extractor for www.fool.com

func GetWwwFortinetComExtractor

func GetWwwFortinetComExtractor() *CustomExtractor

GetWwwFortinetComExtractor returns the Fortinet custom extractor

func GetWwwGizmodoJpExtractor

func GetWwwGizmodoJpExtractor() *CustomExtractor

GetWwwGizmodoJpExtractor returns the Gizmodo Japan custom extractor

func GetWwwGrueneDeExtractor

func GetWwwGrueneDeExtractor() *CustomExtractor

GetWwwGrueneDeExtractor returns the Gruene.de custom extractor

func GetWwwInfoqComExtractor

func GetWwwInfoqComExtractor() *CustomExtractor

GetWwwInfoqComExtractor returns the InfoQ custom extractor

func GetWwwInquisitrComExtractor

func GetWwwInquisitrComExtractor() *CustomExtractor

GetWwwInquisitrComExtractor returns the custom extractor for www.inquisitr.com

func GetWwwIpaGoJpExtractor

func GetWwwIpaGoJpExtractor() *CustomExtractor

GetWwwIpaGoJpExtractor returns the IPA Japan custom extractor

func GetWwwItmediaCoJpExtractor

func GetWwwItmediaCoJpExtractor() *CustomExtractor

GetWwwItmediaCoJpExtractor returns the ITmedia Japan custom extractor

func GetWwwJnsaOrgExtractor

func GetWwwJnsaOrgExtractor() *CustomExtractor

GetWwwJnsaOrgExtractor returns the JNSA custom extractor

func GetWwwLadbibleComExtractor

func GetWwwLadbibleComExtractor() *CustomExtractor

GetWwwLadbibleComExtractor returns the custom extractor for www.ladbible.com

func GetWwwLemondeFrExtractor

func GetWwwLemondeFrExtractor() *CustomExtractor

GetWwwLemondeFrExtractor returns the Le Monde custom extractor

func GetWwwLifehackerJpExtractor

func GetWwwLifehackerJpExtractor() *CustomExtractor

GetWwwLifehackerJpExtractor returns the Lifehacker Japan custom extractor

func GetWwwMacrumorsComExtractor

func GetWwwMacrumorsComExtractor() *CustomExtractor

GetWwwMacrumorsComExtractor returns the MacRumors custom extractor

func GetWwwMentalflossComExtractor

func GetWwwMentalflossComExtractor() *CustomExtractor

GetWwwMentalflossComExtractor returns the custom extractor for www.mentalfloss.com

func GetWwwMoongiftJpExtractor

func GetWwwMoongiftJpExtractor() *CustomExtractor

GetWwwMoongiftJpExtractor returns the MOONGIFT Japan custom extractor

func GetWwwMsnComExtractor

func GetWwwMsnComExtractor() *CustomExtractor

GetWwwMsnComExtractor returns the custom extractor for www.msn.com

func GetWwwNationalgeographicComExtractor

func GetWwwNationalgeographicComExtractor() *CustomExtractor

GetWwwNationalgeographicComExtractor returns the National Geographic custom extractor

func GetWwwNdtvComExtractor

func GetWwwNdtvComExtractor() *CustomExtractor

GetWwwNdtvComExtractor returns the custom extractor for www.ndtv.com

func GetWwwOpposingviewsComExtractor

func GetWwwOpposingviewsComExtractor() *CustomExtractor

GetWwwOpposingviewsComExtractor returns the custom extractor for www.opposingviews.com

func GetWwwOssnewsJpExtractor

func GetWwwOssnewsJpExtractor() *CustomExtractor

GetWwwOssnewsJpExtractor returns the OSS News Japan custom extractor

func GetWwwPhoronixComExtractor

func GetWwwPhoronixComExtractor() *CustomExtractor

GetWwwPhoronixComExtractor returns the Phoronix custom extractor

func GetWwwProspectmagazineCoUkExtractor

func GetWwwProspectmagazineCoUkExtractor() *CustomExtractor

GetWwwProspectmagazineCoUkExtractor returns the Prospect Magazine UK custom extractor

func GetWwwPublickey1JpExtractor

func GetWwwPublickey1JpExtractor() *CustomExtractor

GetWwwPublickey1JpExtractor returns the Publickey1 Japan custom extractor

func GetWwwRawstoryComExtractor

func GetWwwRawstoryComExtractor() *CustomExtractor

GetWwwRawstoryComExtractor returns the custom extractor for www.rawstory.com

func GetWwwRbbtodayComExtractor

func GetWwwRbbtodayComExtractor() *CustomExtractor

GetWwwRbbtodayComExtractor returns the RBB TODAY Japan custom extractor

func GetWwwRockpapershotgunComExtractor

func GetWwwRockpapershotgunComExtractor() *CustomExtractor

GetWwwRockpapershotgunComExtractor returns the Rock Paper Shotgun custom extractor

func GetWwwSbnationComExtractor

func GetWwwSbnationComExtractor() *CustomExtractor

GetWwwSbnationComExtractor returns the SB Nation custom extractor

func GetWwwSiComExtractor

func GetWwwSiComExtractor() *CustomExtractor

GetWwwSiComExtractor returns the Sports Illustrated custom extractor

func GetWwwSlateComExtractor

func GetWwwSlateComExtractor() *CustomExtractor

GetWwwSlateComExtractor returns the custom extractor for www.slate.com

func GetWwwSpektrumDeExtractor

func GetWwwSpektrumDeExtractor() *CustomExtractor

GetWwwSpektrumDeExtractor returns the Spektrum.de custom extractor

func GetWwwThevergeComExtractor

func GetWwwThevergeComExtractor() *CustomExtractor

GetWwwThevergeComExtractor returns The Verge custom extractor

func GetWwwTodayComExtractor

func GetWwwTodayComExtractor() *CustomExtractor

GetWwwTodayComExtractor returns the custom extractor for www.today.com

func GetWwwWesternjournalismComExtractor

func GetWwwWesternjournalismComExtractor() *CustomExtractor

GetWwwWesternjournalismComExtractor returns the custom extractor for www.westernjournalism.com

func GetWwwWiredComExtractor

func GetWwwWiredComExtractor() *CustomExtractor

GetWwwWiredComExtractor returns the Wired.com custom extractor

func GetWwwYahooComExtractor

func GetWwwYahooComExtractor() *CustomExtractor

GetWwwYahooComExtractor returns the custom extractor for www.yahoo.com

func GetWwwYomiuriCoJpExtractor

func GetWwwYomiuriCoJpExtractor() *CustomExtractor

GetWwwYomiuriCoJpExtractor returns the Yomiuri Shimbun custom extractor

func GetYouTubeExtractor

func GetYouTubeExtractor() *CustomExtractor

GetYouTubeExtractor returns the YouTube custom extractor

type ExtractorFactory

// ExtractorFactory is a function type that takes no arguments and returns a
// *CustomExtractor.
type ExtractorFactory func() *CustomExtractor

ExtractorFactory creates custom extractors. It is used for lazy loading and dynamic creation of extractors.

type ExtractorOptions

// ExtractorOptions provides configuration for extraction operations.
type ExtractorOptions struct {
	// NOTE(review): presumably the content type requested for the extracted
	// output — confirm against callers.
	ContentType string
	// NOTE(review): presumably additional caller-defined fields keyed by
	// name — confirm against callers.
	Extend      map[string]interface{}
}

ExtractorOptions provides configuration for extraction operations

type ExtractorRegistry

// ExtractorRegistry holds custom extractors. All of its fields are
// unexported, so it is used only through its methods.
type ExtractorRegistry struct {
	// contains filtered or unexported fields
}

ExtractorRegistry holds all custom extractors

func NewExtractorRegistry

func NewExtractorRegistry() *ExtractorRegistry

NewExtractorRegistry creates a new registry

func (*ExtractorRegistry) Count

func (r *ExtractorRegistry) Count() int

Count returns the number of registered extractors

func (*ExtractorRegistry) Get

func (r *ExtractorRegistry) Get(domain string) (*CustomExtractor, bool)

Get retrieves an extractor by domain

func (*ExtractorRegistry) GetAll

func (r *ExtractorRegistry) GetAll() map[string]*CustomExtractor

GetAll returns all extractors (deduplicated by primary domain)

func (*ExtractorRegistry) List

func (r *ExtractorRegistry) List() []string

List returns all registered domains

func (*ExtractorRegistry) Register

func (r *ExtractorRegistry) Register(extractor *CustomExtractor)

Register adds a custom extractor to the registry

type FieldExtractor

// FieldExtractor defines how to extract a specific field from a document.
// Each entry in Selectors is either a plain selector string or a
// [selector, attribute] pair.
type FieldExtractor struct {
	Selectors      []interface{} `json:"selectors"`      // Can be string or [string, string] for [selector, attribute]
	AllowMultiple  bool          `json:"allowMultiple"`  // Allow multiple values
	DefaultCleaner bool          `json:"defaultCleaner"` // Apply default field cleaner
	Format         string        `json:"format"`         // Date format (for date fields)
	Timezone       string        `json:"timezone"`       // Timezone (for date fields)
}

FieldExtractor defines how to extract a specific field from a document. JavaScript equivalent: { selectors: [...], allowMultiple: bool }

type FunctionTransform

// FunctionTransform is a transform backed by a custom function.
type FunctionTransform struct {
	// Fn takes a *goquery.Selection and returns an error.
	// NOTE(review): presumably invoked by the Transform method — confirm in
	// the implementation.
	Fn func(*goquery.Selection) error
}

FunctionTransform is a custom function transform. JavaScript equivalent: 'selector': $node => { custom logic }

func (*FunctionTransform) Transform

func (ft *FunctionTransform) Transform(selection *goquery.Selection) error

type RegistryManager

// RegistryManager manages custom extractors. All of its fields are
// unexported, so it is used only through its methods.
type RegistryManager struct {
	// contains filtered or unexported fields
}

RegistryManager provides thread-safe management of custom extractors. JavaScript equivalent: combination of all.js, mergeSupportedDomains, and runtime extractor management.

func NewRegistryManager

func NewRegistryManager() *RegistryManager

NewRegistryManager creates a new registry manager

func (*RegistryManager) Clear

func (rm *RegistryManager) Clear()

Clear removes all extractors from the registry. Useful for testing.

func (*RegistryManager) Clone

func (rm *RegistryManager) Clone() *RegistryManager

Clone creates a copy of the registry. Useful for testing and isolated environments.

func (*RegistryManager) Count

func (rm *RegistryManager) Count() (int, int)

Count returns statistics about registered extractors

func (*RegistryManager) GetAll

func (rm *RegistryManager) GetAll() map[string]*CustomExtractor

GetAll returns all registered extractors (deduplicated by primary domain). JavaScript equivalent: Object.keys(CustomExtractors) processing in all.js.

func (*RegistryManager) GetBaseDomain

func (rm *RegistryManager) GetBaseDomain(hostname string) string

GetBaseDomain calculates the base domain from a hostname. JavaScript equivalent: hostname.split('.').slice(-2).join('.') in get-extractor.js.

func (*RegistryManager) GetByDomain

func (rm *RegistryManager) GetByDomain(domain string) (*CustomExtractor, bool)

GetByDomain retrieves an extractor by domain with lazy loading support. JavaScript equivalent: Extractors[hostname] || Extractors[baseDomain] lookup in get-extractor.js.

func (*RegistryManager) GetByDomainWithFallback

func (rm *RegistryManager) GetByDomainWithFallback(hostname string) (*CustomExtractor, bool)

GetByDomainWithFallback tries the hostname first, then the base domain. JavaScript equivalent: Extractors[hostname] || Extractors[baseDomain] logic.

func (*RegistryManager) GetByHTML

func (rm *RegistryManager) GetByHTML(doc *goquery.Document) *CustomExtractor

GetByHTML detects an extractor using HTML selectors. JavaScript equivalent: detectByHtml($) function in detect-by-html.js.

func (*RegistryManager) GetDomainMapping

func (rm *RegistryManager) GetDomainMapping() map[string]*CustomExtractor

GetDomainMapping returns the complete domain-to-extractor mapping. JavaScript equivalent: the flattened domain mapping created by all.js + mergeSupportedDomains.

func (*RegistryManager) ListDomains

func (rm *RegistryManager) ListDomains() []string

ListDomains returns all registered domains (including supported domains)

func (*RegistryManager) ListPrimaryDomains

func (rm *RegistryManager) ListPrimaryDomains() []string

ListPrimaryDomains returns only primary domains (not supported domains)

func (*RegistryManager) Register

func (rm *RegistryManager) Register(extractor *CustomExtractor) error

Register adds a custom extractor to the registry. JavaScript equivalent: building the registry in all.js with mergeSupportedDomains.

func (*RegistryManager) RegisterFactory

func (rm *RegistryManager) RegisterFactory(domain string, factory ExtractorFactory) error

RegisterFactory adds a factory for lazy loading of extractors. Useful for reducing memory usage when not all extractors are needed.

func (*RegistryManager) RegisterHTMLDetector

func (rm *RegistryManager) RegisterHTMLDetector(selector string, extractor *CustomExtractor) error

RegisterHTMLDetector adds an HTML-based extractor detector. JavaScript equivalent: entries in the Detectors map in detect-by-html.js.

func (*RegistryManager) Remove

func (rm *RegistryManager) Remove(domain string) bool

Remove removes an extractor from the registry. Useful for testing and dynamic management.

type SelectorEntry

// SelectorEntry represents a parsed selector with optional attribute
// extraction.
type SelectorEntry struct {
	// NOTE(review): presumably the selector string used to match elements —
	// confirm against the parser that builds these entries.
	Selector  string
	Attribute string // empty if not extracting attribute
}

SelectorEntry represents a parsed selector with optional attribute extraction

type StringTransform

// StringTransform is a simple transform that changes tag names.
type StringTransform struct {
	// NOTE(review): presumably the tag name that matched elements are changed
	// to — confirm in the Transform implementation.
	TargetTag string
}

StringTransform is a simple transform that changes tag names. JavaScript equivalent: 'noscript': 'div'

func (*StringTransform) Transform

func (st *StringTransform) Transform(selection *goquery.Selection) error

type TransformFunction

// TransformFunction is the interface implemented by transforms that operate
// on a goquery selection.
type TransformFunction interface {
	// Transform takes a *goquery.Selection and returns an error.
	Transform(selection *goquery.Selection) error
}

TransformFunction represents a function that transforms DOM elements. JavaScript equivalent: 'selector': $node => { ... } or 'selector': 'tag'

Source Files

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL