Versions in this module Expand all Collapse all v1 v1.0.1 Aug 16, 2020 Changes in this version + func NormaliseCharset(characterSet string) string + func OpenGraphResolver(doc *goquery.Document) string + func ReadLinesOfFile(filename string) []string + func UTF8encode(raw string, sourceCharset string) string + func WebPageImageResolver(doc *goquery.Document) ([]candidate, int) + func WebPageResolver(article *Article) string + type Article struct + AdditionalData map[string]string + CanonicalLink string + CleanedText string + Delta int64 + Doc *goquery.Document + Domain string + FinalURL string + LinkHash string + Links []string + MetaDescription string + MetaFavicon string + MetaKeywords string + MetaLang string + Movies *set.Set + PublishDate *time.Time + RawHTML string + Tags *set.Set + Title string + TitleUnmodified string + TopImage string + TopNode *goquery.Selection + func (article *Article) ToString() string + type Cleaner struct + func NewCleaner(config Configuration) Cleaner + func (c *Cleaner) Clean(docToClean *goquery.Document) *goquery.Document + type Configuration struct + func GetDefaultConfiguration(args ...string) Configuration + type ContentExtractor struct + func NewExtractor(config Configuration) ContentExtractor + func (extr *ContentExtractor) CalculateBestNode(document *goquery.Document) *goquery.Selection + func (extr *ContentExtractor) GetCanonicalLink(document *goquery.Document) string + func (extr *ContentExtractor) GetCleanTextAndLinks(topNode *goquery.Selection, lang string) (string, []string) + func (extr *ContentExtractor) GetDomain(canonicalLink string) string + func (extr *ContentExtractor) GetFavicon(document *goquery.Document) string + func (extr *ContentExtractor) GetMetaAuthor(document *goquery.Document) string + func (extr *ContentExtractor) GetMetaContent(document *goquery.Document, metaName string) string + func (extr *ContentExtractor) GetMetaContentLocation(document *goquery.Document) string + func (extr *ContentExtractor) GetMetaContentWithSelector(document *goquery.Document, selector string) string + func (extr *ContentExtractor) GetMetaContents(document *goquery.Document, metaNames *set.Set) map[string]string + func (extr *ContentExtractor) GetMetaDescription(document *goquery.Document) string + func (extr *ContentExtractor) GetMetaKeywords(document *goquery.Document) string + func (extr *ContentExtractor) GetMetaLanguage(document *goquery.Document) string + func (extr *ContentExtractor) GetPublishDate(document *goquery.Document) *time.Time + func (extr *ContentExtractor) GetTags(document *goquery.Document) *set.Set + func (extr *ContentExtractor) GetTitle(document *goquery.Document) string + func (extr *ContentExtractor) GetTitleFromUnmodifiedTitle(title string) string + func (extr *ContentExtractor) PostCleanup(targetNode *goquery.Selection) *goquery.Selection + type Crawler struct + Charset string + func NewCrawler(config Configuration) Crawler + func (c *Crawler) Preprocess(RawHTML string) (*goquery.Document, error) + func (c *Crawler) SetCharset(cs string) + func (c Crawler) Crawl(RawHTML string, url string) (*Article, error) + func (c Crawler) GetCharset(document *goquery.Document) string + func (c Crawler) GetContentType(document *goquery.Document) string + type CrawlerShort struct + Charset string + func NewCrawlerShort(config Configuration) CrawlerShort + func (c *CrawlerShort) Preprocess(RawHTML string) (*goquery.Document, error) + func (c *CrawlerShort) SetCharset(cs string) + func (c CrawlerShort) Crawl(RawHTML, url string) (*Article, error) + func (c CrawlerShort) GetCharset(document *goquery.Document) string + func (c CrawlerShort) GetContentType(document *goquery.Document) string + type Goose struct + func New(args ...string) Goose + func (g Goose) ExtractFromRawHTML(RawHTML string, url string) (*Article, error) + func (g Goose) ExtractFromURL(url string) (*Article, error) + type HtmlRequester interface + func NewHtmlRequester(config Configuration) HtmlRequester + type Parser struct + func NewParser() *Parser + type StopWords struct + func NewStopwords() StopWords + func (stop StopWords) SimpleLanguageDetector(text string) string + type VideoExtractor struct + func NewVideoExtractor() VideoExtractor + func (ve *VideoExtractor) GetVideos(doc *goquery.Document) *set.Set