goose

package
v0.0.0-...-7179273 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 3, 2025 License: Apache-2.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Article

// Article carries the values extracted from one HTML page. The struct tags
// control JSON encoding: every serialized field uses omitempty, several JSON
// keys differ from the Go field names, and TopNode and Doc are excluded.
type Article struct {
	Title           string             `json:"title,omitempty"`
	TitleUnmodified string             `json:"titleunmodified,omitempty"`
	// Serialized under the key "content", not "cleanedtext".
	CleanedText     string             `json:"content,omitempty"`
	// The Meta* fields drop the "Meta" prefix in their JSON keys.
	MetaDescription string             `json:"description,omitempty"`
	MetaLang        string             `json:"lang,omitempty"`
	MetaFavicon     string             `json:"favicon,omitempty"`
	MetaKeywords    string             `json:"keywords,omitempty"`
	// Serialized under the key "canonicalurl".
	CanonicalLink   string             `json:"canonicalurl,omitempty"`
	Domain          string             `json:"domain,omitempty"`
	// Tagged "-": skipped by encoding/json in both directions.
	TopNode         *goquery.Selection `json:"-"`
	// Serialized under the key "image".
	TopImage        string             `json:"image,omitempty"`
	// NOTE(review): set.Set is declared outside this view; confirm how it
	// marshals before relying on the "tags" and "movies" JSON output.
	Tags            *set.Set           `json:"tags,omitempty"`
	Movies          *set.Set           `json:"movies,omitempty"`
	// Serialized under the key "url".
	FinalURL        string             `json:"url,omitempty"`
	LinkHash        string             `json:"linkhash,omitempty"`
	RawHTML         string             `json:"rawhtml,omitempty"`
	// Tagged "-": skipped by encoding/json in both directions.
	Doc             *goquery.Document  `json:"-"`
	Links           []string           `json:"links,omitempty"`
	// A nil PublishDate is left out of the JSON output because of omitempty.
	PublishDate     *time.Time         `json:"publishdate,omitempty"`
	AdditionalData  map[string]string  `json:"additionaldata,omitempty"`
	// NOTE(review): the unit and meaning of Delta are not documented here;
	// confirm against the code that sets it.
	Delta           int64              `json:"delta,omitempty"`
}

Article is a collection of properties extracted from the HTML body

func (*Article) ToString

func (article *Article) ToString() string

ToString is a simple method that just shows the title.

TODO: add more fields and pretty-print the output.

type Configuration

// Configuration groups the options accepted by NewWithConfig, NewCrawler and
// NewHtmlRequester. Only the exported fields are listed; the type also has
// unexported fields that are not shown here.
type Configuration struct {
	// NOTE(review): the expected format (e.g. an ISO language code) is not
	// visible here; confirm against the code that reads it.
	TargetLanguage string

	// NOTE(review): presumably sent as the User-Agent header when fetching
	// pages; confirm in the requester.
	BrowserUserAgent        string
	// Boolean switches; their effect is defined by the code that reads them,
	// which is outside this view.
	Debug                   bool
	ExtractPublishDate      bool
	AdditionalDataExtractor bool
	EnableImageFetching     bool
	UseMetaLanguage         bool

	// StopWords is held by value and Parser by pointer, matching the return
	// types of NewStopwords and NewParser.
	StopWords StopWords
	Parser    *Parser

	// NOTE(review): presumably bounds page fetches; confirm where Timeout is
	// consumed.
	Timeout time.Duration
	// contains filtered or unexported fields
}

Configuration is a wrapper for various config options

func GetDefaultConfiguration

func GetDefaultConfiguration(args ...string) Configuration

GetDefaultConfiguration returns safe default configuration options

type Goose

type Goose struct {
	// contains filtered or unexported fields
}

Goose is the main entry point of the program

func New

func New(args ...string) Goose

New returns a new instance of the article extractor

func NewWithConfig

func NewWithConfig(config Configuration) Goose

NewWithConfig returns a new instance of the article extractor with configuration

func (Goose) ExtractFromRawHTML

func (g Goose) ExtractFromRawHTML(RawHTML string, url string) (*Article, error)

ExtractFromRawHTML returns an article object from the raw HTML content

func (Goose) ExtractFromURL

func (g Goose) ExtractFromURL(url string) (*Article, error)

ExtractFromURL follows the URL, fetches the HTML page and returns an article object

type HtmlRequester

type HtmlRequester struct {
	// contains filtered or unexported fields
}

HtmlRequester is a simple HTTP client for fetching web pages

func NewHtmlRequester

func NewHtmlRequester(config Configuration) HtmlRequester

NewHtmlRequester creates a new HTML requester

type Parser

type Parser struct {
}

Parser is a simple HTML parser

func NewParser

func NewParser() *Parser

NewParser creates a new parser

func (Parser) DelAttr

func (p Parser) DelAttr(selection interface{}, attr string)

DelAttr removes an attribute from the selection

func (Parser) DropTag

func (p Parser) DropTag(selection interface{})

DropTag removes the tag but keeps its contents

func (Parser) Name

func (p Parser) Name(selector string, selection interface{}) string

Name gets the value of an attribute

func (Parser) RemoveNode

func (p Parser) RemoveNode(selection interface{})

RemoveNode removes the entire node

func (Parser) SetAttr

func (p Parser) SetAttr(selection interface{}, attr string, value string)

SetAttr sets an attribute on the selection

type SimpleCrawler

type SimpleCrawler struct {
	// contains filtered or unexported fields
}

SimpleCrawler is a basic crawler that extracts text

func NewCrawler

func NewCrawler(config Configuration) SimpleCrawler

NewCrawler creates a new crawler

func (SimpleCrawler) Crawl

func (c SimpleCrawler) Crawl(rawHTML string, targetURL string) (*Article, error)

Crawl extracts article content from HTML

type StopWords

type StopWords struct {
}

StopWords is a simple stopwords implementation

func NewStopwords

func NewStopwords() StopWords

NewStopwords creates a new stopwords instance

func (StopWords) SimpleLanguageDetector

func (sw StopWords) SimpleLanguageDetector(text string) string

SimpleLanguageDetector detects language (stub implementation)

func (StopWords) StopWordsCount

func (sw StopWords) StopWordsCount(language string, text string) int

StopWordsCount counts stop words in the text (stub implementation)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL