Documentation
¶
Index ¶
Constants ¶
View Source
// Default configuration values applied to a new crawler.
const (
	// DefaultThrottlingRate is the default request throttling rate.
	DefaultThrottlingRate = 20
	// DefaultMaxCrawlDepth is the default maximum crawl depth.
	DefaultMaxCrawlDepth = 5
	// DefaultComplyWithRobotPolicy is the default compliance level with
	// the robots.txt policy.
	// See https://moz.com/learn/seo/robotstxt
	DefaultComplyWithRobotPolicy = true
	// DefaultUserAgent is the default user agent string in HTTPRequest.
	DefaultUserAgent = "GoCrawler/v0.1 (+https://github.com/q/gocrawler)"
)
constants
Variables ¶
View Source
// ErrDomainAlreadyRegistered is returned when a domain has already been
// registered with (and crawled by) the crawler.
var ErrDomainAlreadyRegistered = errors.New("domain is already registered/crawled")
ErrDomainAlreadyRegistered is returned when the domain is already registered
Functions ¶
This section is empty.
Types ¶
type Crawler ¶
// Crawler is a collection of workers that crawl their respective domains.
type Crawler struct {
// Mutex guards concurrent access to the crawler's shared state.
sync.Mutex
// UserAgent is the user agent string sent with HTTP requests;
// see DefaultUserAgent.
UserAgent string
// HTTPClient is the client used to perform HTTP fetches.
HTTPClient *http.Client
// Logger receives the crawler's diagnostic output.
Logger Logger
// contains filtered or unexported fields
}
Crawler is a collection of workers that crawl their respective domains
func (*Crawler) Close ¶
Close cancels the in-flight subscriptions, closes the Updates channel, and returns the last fetch error captured, if any
type Logger ¶
// Logger defines the logging interface used by the crawler.
// The standard library's *log.Logger satisfies it.
type Logger interface {
// SetOutput sets the destination for log output.
SetOutput(w io.Writer)
// SetPrefix sets the prefix prepended to each log line.
SetPrefix(prefix string)
// Fatal logs its arguments and halts execution (log.Fatal semantics).
Fatal(v ...interface{})
// Fatalf logs a formatted message and halts execution.
Fatalf(format string, v ...interface{})
// Panic logs its arguments and then panics.
Panic(v ...interface{})
// Panicf logs a formatted message and then panics.
Panicf(format string, v ...interface{})
// Print logs its arguments.
Print(v ...interface{})
// Printf logs a formatted message.
Printf(format string, v ...interface{})
}
Logger defines the logging interface
type Queue ¶
// Queue is a task queue for crawlers.
type Queue struct {
// contains filtered or unexported fields
}
Queue is a task queue for crawlers
type Resource ¶
type Resource struct {
// mutex
sync.Mutex
// resource URL
URL *url.URL `json:"_"`
// string version
URLString string `json:"url"`
// from meta
Title string `json:"title"`
// HTTP StatusCode
HTTPStatusCode int `json:"status"`
// root node
Root *url.URL `json:"_"`
// parent node ancestry
Parent []string `json:"_"`
// current depth
Depth int `json:"depth"`
// child nodes
Nodes []*Resource `json:"nodes"`
// last fetched timestamp
LastFetched time.Time `json:"_"`
}
Resource describes a web page and its nodes
type Worker ¶
// Worker is a crawler specific to a single domain.
type Worker struct {
// WaitGroup is embedded; presumably used to wait for the worker's
// crawl goroutines — confirm against the unexported implementation.
sync.WaitGroup
// Tree is the root of the crawled resource node tree.
Tree *Resource
// LastUpdated records when the tree was last updated.
LastUpdated time.Time
// contains filtered or unexported fields
}
Worker is a crawler specific to a domain
type WorkerStatus ¶
// WorkerStatus describes a worker's lifecycle state; see the Status* constants.
type WorkerStatus int
WorkerStatus is used to describe worker status
const ( StatusInitialised WorkerStatus = iota StatusFetchingInProgress StatusFetchingComplete StatusFetchingError )
worker status types
func (WorkerStatus) MarshalJSON ¶
func (s WorkerStatus) MarshalJSON() ([]byte, error)
MarshalJSON definition for WorkerStatus
Click to show internal directories.
Click to hide internal directories.