Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Article ¶
type Article struct {
Title string `json:"title,omitempty"`
TitleUnmodified string `json:"titleunmodified,omitempty"`
CleanedText string `json:"content,omitempty"`
MetaDescription string `json:"description,omitempty"`
MetaLang string `json:"lang,omitempty"`
MetaFavicon string `json:"favicon,omitempty"`
MetaKeywords string `json:"keywords,omitempty"`
CanonicalLink string `json:"canonicalurl,omitempty"`
Domain string `json:"domain,omitempty"`
TopNode *goquery.Selection `json:"-"`
TopImage string `json:"image,omitempty"`
Tags *set.Set `json:"tags,omitempty"`
Movies *set.Set `json:"movies,omitempty"`
FinalURL string `json:"url,omitempty"`
LinkHash string `json:"linkhash,omitempty"`
RawHTML string `json:"rawhtml,omitempty"`
Doc *goquery.Document `json:"-"`
Links []string `json:"links,omitempty"`
PublishDate *time.Time `json:"publishdate,omitempty"`
AdditionalData map[string]string `json:"additionaldata,omitempty"`
Delta int64 `json:"delta,omitempty"`
}
Article is a collection of properties extracted from the HTML body
type Configuration ¶
type Configuration struct {
TargetLanguage string
BrowserUserAgent string
Debug bool
ExtractPublishDate bool
AdditionalDataExtractor bool
EnableImageFetching bool
UseMetaLanguage bool
StopWords StopWords
Parser *Parser
Timeout time.Duration
// contains filtered or unexported fields
}
Configuration is a wrapper for various config options
func GetDefaultConfiguration ¶
func GetDefaultConfiguration(args ...string) Configuration
GetDefaultConfiguration returns safe default configuration options
type Goose ¶
type Goose struct {
// contains filtered or unexported fields
}
Goose is the main entry point of the program
func NewWithConfig ¶
func NewWithConfig(config Configuration) Goose
NewWithConfig returns a new instance of the article extractor set up with the given configuration
func (Goose) ExtractFromRawHTML ¶
ExtractFromRawHTML returns an article object from the raw HTML content
type HtmlRequester ¶
type HtmlRequester struct {
// contains filtered or unexported fields
}
HtmlRequester is a simple HTTP client for fetching web pages
func NewHtmlRequester ¶
func NewHtmlRequester(config Configuration) HtmlRequester
NewHtmlRequester creates a new HTML requester
type Parser ¶
type Parser struct {
}
Parser is a simple HTML parser
func (Parser) DropTag ¶
func (p Parser) DropTag(selection interface{})
DropTag removes the tag but keeps its contents
func (Parser) RemoveNode ¶
func (p Parser) RemoveNode(selection interface{})
RemoveNode removes the entire node
type SimpleCrawler ¶
type SimpleCrawler struct {
// contains filtered or unexported fields
}
SimpleCrawler is a basic crawler that extracts text
func NewCrawler ¶
func NewCrawler(config Configuration) SimpleCrawler
NewCrawler creates a new crawler
type StopWords ¶
type StopWords struct {
}
StopWords is a simple stopwords implementation
func (StopWords) SimpleLanguageDetector ¶
SimpleLanguageDetector detects language (stub implementation)