Documentation
¶
Index ¶
- Constants
- Variables
- type Closer
- type Config
- type CrawlSpider
- type Crawler
- type DupeFilter
- type Event
- type Fetcher
- type FetcherHandler
- type FetcherMiddleware
- type FetcherMiddlewareManager
- type FetchingErrorProcessor
- type FetchingRequestProcessor
- type FetchingResponseProcessor
- type FingerprintDupeFilter
- type FromConfiger
- type FromCrawlerer
- type GoquerySelector
- type HTMLLinkExtractor
- type IFetcher
- type IScheduler
- type ISpider
- type Item
- type ItemPipelineManager
- type ItemPipelineMiddleware
- type ItemProcessor
- type Link
- type LinkExtractor
- type Middleware
- type MiddlewareManager
- type MiddlewareManagerIterator
- type OnSpiderCloseder
- type OnSpiderOpeneder
- type Opener
- type Request
- type Response
- type Rule
- type Scheduler
- type Selector
- type Selectors
- type Spider
- type SpiderErrorProcessor
- type SpiderInputProcessor
- type SpiderMiddleware
- type SpiderMiddlewareManager
- func (smm *SpiderMiddlewareManager) ProcessStartRequests(startRequests []*Request, spider ISpider) ([]*Request, error)
- func (smm *SpiderMiddlewareManager) Register(middleware SpiderMiddleware)
- func (smm *SpiderMiddlewareManager) ScrapeResponse(request *Request, response *Response, spider ISpider) (*SpiderResult, error)
- type SpiderOutputProcessor
- type SpiderResult
- type StartRequestsProcessor
- type Stats
- func (stats *Stats) Clear()
- func (stats *Stats) Close(spider ISpider)
- func (stats *Stats) Del(key string)
- func (stats *Stats) Get(key string) uint64
- func (stats *Stats) GetStr(key string) string
- func (stats *Stats) Inc(key string)
- func (stats *Stats) Max(key string, value uint64)
- func (stats *Stats) Min(key string, value uint64)
- func (stats *Stats) Open(spider ISpider)
- func (stats *Stats) SetStr(key, value string)
Constants ¶
View Source
const ( MIMEJSON = "application/json" MIMEHTML = "text/html" MIMEXML = "application/xml" MIMEXMLText = "text/xml" MIMEPlain = "text/plain" MIMEPOSTForm = "application/x-www-form-urlencoded" MIMEMultipartPOSTForm = "multipart/form-data" MIMEPROTOBUF = "application/x-protobuf" )
View Source
const ItemPipelines = "ItemPipelines"
Variables ¶
View Source
var ( ErrSpiderClosed = errors.New("spider closed") ErrItemDropped = errors.New("item dropped") ErrIgnoreRequest = errors.New("request ignored") )
View Source
var IgnoredExtensions = []string{
"mng", "pct", "bmp", "gif", "jpg", "jpeg", "png", "pst", "psp", "tif",
"tiff", "ai", "drw", "dxf", "eps", "ps", "svg",
"mp3", "wma", "ogg", "wav", "ra", "aac", "mid", "au", "aiff",
"3gp", "asf", "asx", "avi", "mov", "mp4", "mpg", "qt", "rm", "swf", "wmv",
"m4a",
"xls", "xlsx", "ppt", "pptx", "pps", "doc", "docx", "odt", "ods", "odg",
"odp",
"css", "pdf", "exe", "bin", "rss", "zip", "rar",
}
Common file extensions that are not followed if they occur in links.
Functions ¶
This section is empty.
Types ¶
type CrawlSpider ¶
type CrawlSpider struct {
*Spider
}
func (*CrawlSpider) Parse ¶
func (s *CrawlSpider) Parse(response *Response) (*SpiderResult, error)
type Crawler ¶
type Crawler struct {
*Config
*logrus.Logger
*Stats
Concurrency int
Spider ISpider
Scheduler IScheduler
Fetcher IFetcher
*SpiderMiddlewareManager
*ItemPipelineManager
*tunny.WorkPool
*concurrency.Worker
// contains filtered or unexported fields
}
func NewCrawler ¶
func NewCrawler(spider ISpider, scheduler IScheduler) *Crawler
type Event ¶
type Event string
const ( CrawlerStarted Event = "CrawlerStarted" CrawlerStopped Event = "CrawlerStopped" SpiderOpened Event = "SpiderOpened" SpiderIdle Event = "SpiderIdle" SpiderClosed Event = "SpiderClosed" SpiderError Event = "SpiderError" RequestScheduled Event = "RequestScheduled" RequestDropped Event = "RequestDropped" ResponseReceived Event = "ResponseReceived" ResponseDownloaded Event = "ResponseDownloaded" ItemScraped Event = "ItemScraped" ItemDropped Event = "ItemDropped" )
type Fetcher ¶
type Fetcher struct {
TotalConcurrency int
DomainConcurrency int
IpConcurrency int
Delay time.Duration
RandomizeDelay bool
// contains filtered or unexported fields
}
func NewFetcher ¶
func NewFetcher() *Fetcher
func (*Fetcher) NeedsBackout ¶
type FetcherHandler ¶
type FetcherMiddleware ¶
type FetcherMiddleware interface{}
type FetcherMiddlewareManager ¶
type FetcherMiddlewareManager struct {
// contains filtered or unexported fields
}
func (*FetcherMiddlewareManager) Register ¶
func (fmm *FetcherMiddlewareManager) Register(middleware FetcherMiddleware)
type FetchingErrorProcessor ¶
type FetchingErrorProcessor interface {
/* ProcessError is called when a fetcher handler or a ProcessRequest (from a fetcher middleware) returns an error.
It should return either nil values, a Response object, or a Request object.
If it returns nil values, the middleware manager will continue processing this error, executing any other ProcessError
methods of installed middleware, until no middleware is left.
If it returns a Response object, the ProcessResponse methods chain of installed middleware is started,
and won’t bother calling any other ProcessError methods of middleware.
If it returns a Request object, the returned request is rescheduled to be performed in the future.
This stops the execution of ProcessError methods of the middleware the same as returning a response would.
*/
ProcessError(err error, request *Request, spider ISpider) (*Response, *Request)
}
type FetchingRequestProcessor ¶
type FetchingRequestProcessor interface {
/* ProcessRequest is called for each request that goes through the fetcher middleware.
It should either return nil values, a Response object, a Request object, or an IgnoreRequest error.
If it returns nil values, the middleware manager will continue processing this request, executing all other middlewares,
until, finally, the appropriate fetcher handler performs the request (and its response is downloaded).
If it returns a Response object, won’t bother calling any other ProcessRequest or ProcessError methods,
or the appropriate fetcher handler; it’ll return that response. The ProcessResponse methods of installed middleware
is always called on every response.
If it returns a Request object, will stop calling ProcessRequest methods and reschedule the returned request.
Once the newly returned request is performed, the appropriate middleware chain will again be called on the downloaded response.
If it returns an IgnoreRequest error, the ProcessError methods of installed fetcher middleware will be called.
If none of them handle the error, the callback function of the request (Request.Callback) is called.
If no code handles the returned error, it is ignored and not logged (unlike other errors).
*/
ProcessRequest(request *Request, spider ISpider) (*Response, *Request, error)
}
type FetchingResponseProcessor ¶
type FetchingResponseProcessor interface {
/* ProcessResponse should either: return a Response object, return a Request object or return an IgnoreRequest error.
If it returns a Response (it could be the same given response, or a brand-new one), that response will continue to be
processed with the ProcessResponse method of the next middleware in the chain.
If it returns a Request object, the middleware chain is halted and the returned request is rescheduled to be performed
in the future. This is the same behavior as if a request is returned from ProcessRequest.
If it returns an IgnoreRequest error, the callback function of the request (Request.Callback) is called.
If no code handles the returned error, it is ignored and not logged (unlike other errors).
*/
ProcessResponse(response *Response, request *Request, spider ISpider) (*Response, *Request, error)
}
type FingerprintDupeFilter ¶
func NewFingerprintDupeFilter ¶
func NewFingerprintDupeFilter(logger *logrus.Logger, filename ...string) *FingerprintDupeFilter
func (*FingerprintDupeFilter) Close ¶
func (f *FingerprintDupeFilter) Close(spider ISpider)
func (*FingerprintDupeFilter) Open ¶
func (f *FingerprintDupeFilter) Open(spider ISpider)
func (*FingerprintDupeFilter) SeenRequest ¶
func (f *FingerprintDupeFilter) SeenRequest(request *Request) bool
type FromConfiger ¶
type FromConfiger interface {
FromConfig(config Config)
}
type FromCrawlerer ¶
type FromCrawlerer interface {
FromCrawler(crawler Crawler)
}
type GoquerySelector ¶
func NewGoquerySelector ¶
func NewGoquerySelector(doc *goquery.Document) *GoquerySelector
func (*GoquerySelector) Attr ¶
func (gs *GoquerySelector) Attr(attrName string) (val string, exists bool)
func (*GoquerySelector) Extract ¶
func (gs *GoquerySelector) Extract() string
func (*GoquerySelector) Regex ¶
func (gs *GoquerySelector) Regex(regex interface{}) []string
func (*GoquerySelector) Select ¶
func (gs *GoquerySelector) Select(query string) Selectors
type HTMLLinkExtractor ¶
type HTMLLinkExtractor struct {
// Regular expressions that the (absolute) urls must match in order to be extracted.
// If empty, it will match all links.
Allows []string
// regular expressions that the (absolute) urls must match in order to be excluded.
// It has precedence over the Allows parameter.
// If empty, it won't exclude any links.
Denies []string
// Domains which will be considered for extracting the links.
AllowDomains []string
// Domains which won't be considered for extracting the links.
DenyDomains []string
// File extensions that should be ignored when extracting links.
// If empty, it will default to the IgnoredExtensions.
DenyExtensions []string
// Selectors which define regions inside the response where links should be extracted from.
// If given, only the text selected by those selectors will be scanned for links.
RestrictSelectors []string
// Whether duplicate filtering should be applied to extracted links.
// Defaults to false.
Unique bool
// Function which receives each url value extracted from the tag and attributes scanned
// and can modify the value and return a new one, or return nil to ignore the link altogether.
// If not given, defaults to the untouched link.
ProcessValue func(value *url.URL) *url.URL
// a list of tags to consider when extracting links.
// Defaults to {"a", "area"}.
Tags []string
// Attributes which should be considered when looking for links to extract.
// Only for those tags specified in the tags parameter.
// Defaults to {"href"}.
Attrs []string
// contains filtered or unexported fields
}
func (*HTMLLinkExtractor) ExtractLinks ¶
func (hle *HTMLLinkExtractor) ExtractLinks(response *Response) []*Link
func (*HTMLLinkExtractor) Init ¶
func (hle *HTMLLinkExtractor) Init()
type IScheduler ¶
type ItemPipelineManager ¶
type ItemPipelineManager struct {
// contains filtered or unexported fields
}
func (*ItemPipelineManager) Close ¶
func (ipm *ItemPipelineManager) Close(spider ISpider)
func (*ItemPipelineManager) Open ¶
func (ipm *ItemPipelineManager) Open(spider ISpider)
func (*ItemPipelineManager) ProcessItem ¶
func (ipm *ItemPipelineManager) ProcessItem(item *Item, spider ISpider) (*Item, error)
func (*ItemPipelineManager) Register ¶
func (ipm *ItemPipelineManager) Register(middleware ItemPipelineMiddleware)
type ItemPipelineMiddleware ¶
type ItemPipelineMiddleware interface{}
type ItemProcessor ¶
type Link ¶
type Link struct {
// contains filtered or unexported fields
}
Link represents an extracted link.
type LinkExtractor ¶
type Middleware ¶
type Middleware interface {
OnSpiderOpeneder
OnSpiderCloseder
}
type MiddlewareManager ¶
type MiddlewareManager struct {
// contains filtered or unexported fields
}
func NewMiddlewareManager ¶
func NewMiddlewareManager() *MiddlewareManager
func (*MiddlewareManager) OnSpiderClosed ¶
func (mm *MiddlewareManager) OnSpiderClosed(spider Spider)
func (*MiddlewareManager) OnSpiderOpened ¶
func (mm *MiddlewareManager) OnSpiderOpened(spider Spider)
func (*MiddlewareManager) Register ¶
func (mm *MiddlewareManager) Register(middleware Middleware)
type MiddlewareManagerIterator ¶
type MiddlewareManagerIterator struct {
// contains filtered or unexported fields
}
func (*MiddlewareManagerIterator) HasNext ¶
func (mmi *MiddlewareManagerIterator) HasNext() bool
func (*MiddlewareManagerIterator) Next ¶
func (mmi *MiddlewareManagerIterator) Next() interface{}
type OnSpiderCloseder ¶
type OnSpiderCloseder interface {
OnSpiderClosed(spider ISpider)
}
type OnSpiderOpeneder ¶
type OnSpiderOpeneder interface {
OnSpiderOpened(spider ISpider)
}
type Request ¶
type Request struct {
*http.Request
Error error
Meta map[string]interface{}
NotFilter bool
Callback func(response *Response, err error) (*SpiderResult, error)
}
func NewRequest ¶
func (*Request) Fingerprint ¶
Fingerprint returns a hash that uniquely identifies the request. Ignore all headers.
type Response ¶
type Response struct {
*http.Response
MediaType string
HTMLDoc *goquery.Document
/* Request which generated this response.
This attribute is assigned in the `Crawler`, after the response and the request have passed
through all `Fetcher Middlewares`. In particular, this means that:
- HTTP redirections will cause the original request (to the URL before
redirection) to be assigned to the redirected response (with the final
URL after redirection).
- Response.Request.URL doesn't always equal Response.Response.URL
- This attribute is only available in the spider code, and in the `Spider Middlewares`,
but not in `Fetcher Middlewares` (although you have the Request available there by
other means) and handlers of the `response_downloaded` signal.
*/
*Request
// contains filtered or unexported fields
}
func (*Response) ContentType ¶
type Scheduler ¶
type Scheduler struct {
// contains filtered or unexported fields
}
func NewScheduler ¶
func NewScheduler() *Scheduler
func (*Scheduler) EnqueueRequest ¶
func (*Scheduler) NextRequest ¶
type Selectors ¶
type Selectors []Selector
func (Selectors) ExtractFirst ¶
func (Selectors) RegexFirst ¶
type Spider ¶
func (*Spider) StartResusts ¶
type SpiderErrorProcessor ¶
type SpiderErrorProcessor interface {
ProcessSpiderError(err error, response *Response, spider ISpider) (*SpiderResult, error)
}
type SpiderInputProcessor ¶
type SpiderMiddleware ¶
type SpiderMiddleware interface{}
type SpiderMiddlewareManager ¶
type SpiderMiddlewareManager struct {
// contains filtered or unexported fields
}
func (*SpiderMiddlewareManager) ProcessStartRequests ¶
func (smm *SpiderMiddlewareManager) ProcessStartRequests(startRequests []*Request, spider ISpider) ([]*Request, error)
func (*SpiderMiddlewareManager) Register ¶
func (smm *SpiderMiddlewareManager) Register(middleware SpiderMiddleware)
func (*SpiderMiddlewareManager) ScrapeResponse ¶
func (smm *SpiderMiddlewareManager) ScrapeResponse(request *Request, response *Response, spider ISpider) (*SpiderResult, error)
type SpiderOutputProcessor ¶
type SpiderOutputProcessor interface {
ProcessSpiderOutput(result *SpiderResult, response *Response, spider ISpider) (*SpiderResult, error)
}
type SpiderResult ¶
func (*SpiderResult) Empty ¶
func (sr *SpiderResult) Empty() bool
type StartRequestsProcessor ¶
Click to show internal directories.
Click to hide internal directories.