Documentation
¶
Overview ¶
Package tegenaria is a crawler framework based on golang
tegenaria是一个基于golang开发的快速、高效率的网络爬虫框架
Index ¶
- Constants
- Variables
- func AbsFilePathTest(t *testing.T, path string) string
- func DefaultWatcher(ch chan EventType, hooker EventHooksInterface) error
- func GetEngineID() string
- func GetFunctionName(fn Parser) string
- func GetLogger(Name string) *logrus.Entry
- func GetMachineIP() (string, error)
- func GetParserByName(spider SpiderInterface, name string) reflect.Value
- func GetUUID() string
- func GoRunner(wg *conc.WaitGroup, funcs ...GoFunc) <-chan error
- func Interface2Uint(value interface{}) uint
- func MD5(s string) string
- func Map2String(m interface{}) string
- func NewTestProxyServer() *httptest.Server
- func NewTestServer() *httptest.Server
- func OptimalNumOfBits(n int, p float64) int
- func OptimalNumOfHashFunctions(n int, m int) int
- type BaseSpider
- type CacheInterface
- type CheckMasterLive
- type ComponentInterface
- type Configuration
- type Context
- type ContextOption
- type CrawlEngine
- func (e *CrawlEngine) EventsWatcherRunner() error
- func (e *CrawlEngine) Execute(spiderName string) StatisticInterface
- func (e *CrawlEngine) GetComponents() ComponentInterface
- func (e *CrawlEngine) GetCurrentSpider() SpiderInterface
- func (e *CrawlEngine) GetRuntimeStatus() *RuntimeStatus
- func (e *CrawlEngine) GetSpiders() *Spiders
- func (e *CrawlEngine) GetStatic() StatisticInterface
- func (e *CrawlEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)
- func (e *CrawlEngine) RegisterPipelines(pipeline PipelinesInterface)
- func (e *CrawlEngine) RegisterSpiders(spider SpiderInterface)
- func (e *CrawlEngine) Scheduler() error
- type DefaultComponents
- func (d *DefaultComponents) CheckWorkersStop() bool
- func (d *DefaultComponents) GetDupefilter() RFPDupeFilterInterface
- func (d *DefaultComponents) GetEventHooks() EventHooksInterface
- func (d *DefaultComponents) GetLimiter() LimitInterface
- func (d *DefaultComponents) GetQueue() CacheInterface
- func (d *DefaultComponents) GetStats() StatisticInterface
- func (d *DefaultComponents) SetCurrentSpider(spider SpiderInterface)
- func (d *DefaultComponents) SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error
- type DefaultComponentsOption
- func DefaultComponentsWithDefaultHooks(events *DefaultHooks) DefaultComponentsOption
- func DefaultComponentsWithDefaultLimiter(limiter *DefaultLimiter) DefaultComponentsOption
- func DefaultComponentsWithDefaultQueue(queue *DefaultQueue) DefaultComponentsOption
- func DefaultComponentsWithDefaultStatistic(statistic *DefaultStatistic) DefaultComponentsOption
- func DefaultComponentsWithDupefilter(dupefilter *DefaultRFPDupeFilter) DefaultComponentsOption
- type DefaultFieldHook
- type DefaultHooks
- func (d *DefaultHooks) Error(params ...interface{}) error
- func (d *DefaultHooks) EventsWatcher(ch chan EventType) error
- func (d *DefaultHooks) Exit(params ...interface{}) error
- func (d *DefaultHooks) Heartbeat(params ...interface{}) error
- func (d *DefaultHooks) Pause(params ...interface{}) error
- func (d *DefaultHooks) SetCurrentSpider(spider SpiderInterface)
- func (d *DefaultHooks) Start(params ...interface{}) error
- type DefaultLimiter
- type DefaultQueue
- type DefaultRFPDupeFilter
- type DefaultStatistic
- type DistributedWorkerInterface
- type Downloader
- type DownloaderOption
- func DownloadWithClient(client http.Client) DownloaderOption
- func DownloadWithH2(h2 bool) DownloaderOption
- func DownloadWithTLSConfig(tls *tls.Config) DownloaderOption
- func DownloadWithTimeout(timeout time.Duration) DownloaderOption
- func DownloaderWithtransport(transport *http.Transport) DownloaderOption
- type EngineOption
- type ErrorOption
- type EventHooksInterface
- type EventType
- type EventsWatcher
- type GoFunc
- type HandleError
- type Hook
- type ItemInterface
- type ItemMeta
- type ItemPipelines
- type LimitInterface
- type Middlewares
- type MiddlewaresBase
- type MiddlewaresInterface
- type Parser
- type PipelinesBase
- type PipelinesInterface
- type ProcessResponse
- type Proxy
- type RFPDupeFilterInterface
- type RedirectError
- type Request
- type RequestMethod
- type RequestOption
- func RequestWithAllowRedirects(allowRedirects bool) RequestOption
- func RequestWithAllowedStatusCode(allowStatusCode []uint64) RequestOption
- func RequestWithBodyReader(body io.Reader) RequestOption
- func RequestWithDoNotFilter(doNotFilter bool) RequestOption
- func RequestWithMaxConnsPerHost(maxConnsPerHost int) RequestOption
- func RequestWithMaxRedirects(maxRedirects int) RequestOption
- func RequestWithParser(parser Parser) RequestOption
- func RequestWithPostForm(payload url.Values) RequestOption
- func RequestWithRequestBody(body map[string]interface{}) RequestOption
- func RequestWithRequestBytesBody(body []byte) RequestOption
- func RequestWithRequestCookies(cookies map[string]string) RequestOption
- func RequestWithRequestHeader(headers map[string]string) RequestOption
- func RequestWithRequestMeta(meta map[string]interface{}) RequestOption
- func RequestWithRequestParams(params map[string]string) RequestOption
- func RequestWithRequestProxy(proxy Proxy) RequestOption
- func RequestWithTimeout(timeout time.Duration) RequestOption
- type Response
- type RuntimeStatus
- func (r *RuntimeStatus) GetDuration() float64
- func (r *RuntimeStatus) GetRestartAt() int64
- func (r *RuntimeStatus) GetStartAt() int64
- func (r *RuntimeStatus) GetStatusOn() StatusType
- func (r *RuntimeStatus) GetStopAt() int64
- func (r *RuntimeStatus) SetDuration(duration float64)
- func (r *RuntimeStatus) SetRestartAt(startAt int64)
- func (r *RuntimeStatus) SetStartAt(startAt int64)
- func (r *RuntimeStatus) SetStatus(status StatusType)
- func (r *RuntimeStatus) SetStopAt(stopAt int64)
- type Settings
- type SpiderDownloader
- type SpiderInterface
- type Spiders
- type StatisticInterface
- type StatsFieldType
- type StatusType
- type TestDownloadMiddler
- type TestDownloadMiddler2
- type TestItemPipeline
- type TestItemPipeline2
- type TestItemPipeline3
- type TestItemPipeline4
- type TestSpider
Constants ¶
const ( // RequestStats 发起的请求总数 RequestStats string = "requests" // ItemsStats 获取到的items总数 ItemsStats string = "items" // DownloadFailStats 请求失败总数 DownloadFailStats string = "download_fail" // ErrorStats 错误总数 ErrorStats string = "errors" )
Variables ¶
var ( // ErrSpiderMiddleware 下载中间件处理异常 ErrSpiderMiddleware error = errors.New("handle spider middleware error") // ErrSpiderCrawls 抓取流程错误 ErrSpiderCrawls error = errors.New("handle spider crawl error") // ErrDuplicateSpiderName 爬虫名重复错误 ErrDuplicateSpiderName error = errors.New("register a duplicate spider name error") // ErrEmptySpiderName 爬虫名不能为空 ErrEmptySpiderName error = errors.New("register a empty spider name error") // ErrSpiderNotExist 爬虫实例不存在 ErrSpiderNotExist error = errors.New("not found spider") // ErrNotAllowStatusCode 不允许的状态码 ErrNotAllowStatusCode error = errors.New("not allow handle status code") // ErrGetCacheItem 获取item 错误 ErrGetCacheItem error = errors.New("getting item from cache error") // ErrGetHttpProxy 获取http代理错误 ErrGetHttpProxy error = errors.New("getting http proxy ") // ErrGetHttpsProxy 获取https代理错误 ErrGetHttpsProxy error = errors.New("getting https proxy ") // ErrParseSocksProxy 解析socks代理错误 ErrParseSocksProxy error = errors.New("parse socks proxy ") // ErrResponseRead 响应读取失败 ErrResponseRead error = errors.New("read response to buffer error") // ErrResponseParse 响应解析失败 ErrResponseParse error = errors.New("parse response error") // ErrNoMaterNodeLive 找不到主节点 ErrNoMaterNodeLive error = errors.New("no any master node is active") )
var ProcessId string = uuid.New().String()
Functions ¶
func DefaultWatcher ¶ added in v0.5.0
func DefaultWatcher(ch chan EventType, hooker EventHooksInterface) error
DefaultWatcher 默认的事件监听器 ch 用于接收事件 hooker 事件处理实例化接口,比如DefaultHooks
func GetFunctionName ¶ added in v0.4.1
GetFunctionName 提取解析函数名
func GetParserByName ¶ added in v0.4.1
func GetParserByName(spider SpiderInterface, name string) reflect.Value
GetParserByName 通过函数名从spider实例中获取解析函数
func Interface2Uint ¶ added in v0.5.0
func Interface2Uint(value interface{}) uint
func NewTestProxyServer ¶ added in v0.5.0
func NewTestServer ¶ added in v0.5.0
func OptimalNumOfBits ¶ added in v0.4.1
OptimalNumOfBits 计算位数组长度
func OptimalNumOfHashFunctions ¶ added in v0.4.1
OptimalNumOfHashFunctions 计算最优的布隆过滤器哈希函数个数
Types ¶
type BaseSpider ¶
BaseSpider base spider
func NewBaseSpider ¶
func NewBaseSpider(name string, feedUrls []string) *BaseSpider
NewBaseSpider 构建公共爬虫对象
type CacheInterface ¶
type CacheInterface interface {
// enqueue ctx写入缓存
Enqueue(ctx *Context) error
// dequeue ctx 从缓存出队列
Dequeue() (interface{}, error)
// isEmpty 缓存是否为空
IsEmpty() bool
// getSize 缓存大小
GetSize() uint64
// close 关闭缓存
Close() error
// SetCurrentSpider 设置当前的spider
SetCurrentSpider(spider SpiderInterface)
}
CacheInterface request缓存组件
type CheckMasterLive ¶ added in v0.4.1
CheckMasterLive 检查所有的master节点是否都在线
type ComponentInterface ¶ added in v0.5.0
type ComponentInterface interface {
// GetDupefilter 获取过滤器组件
GetDupefilter() RFPDupeFilterInterface
// GetQueue 获取请求队列接口
GetQueue() CacheInterface
// GetLimiter 限速器组件
GetLimiter() LimitInterface
// GetStats 指标统计组件
GetStats() StatisticInterface
// GetEventHooks 事件监控组件
GetEventHooks() EventHooksInterface
// CheckWorkersStop 爬虫停止的条件
CheckWorkersStop() bool
// SetCurrentSpider 当前正在运行的爬虫实例
SetCurrentSpider(spider SpiderInterface)
// SpiderBeforeStart 启动StartRequest之前的动作
SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error
}
ComponentInterface 系统组件接口 包含了爬虫系统运行的必要组件
type Configuration ¶
var Config *Configuration = nil
func (*Configuration) GetValue ¶ added in v0.4.1
func (c *Configuration) GetValue(key string) (interface{}, error)
type Context ¶
type Context struct {
// Request 请求对象
Request *Request
// Response 响应对象
Response *Response
// CtxID context 唯一id由uuid生成
CtxID string
// Error 处理过程中的错误信息
Error error
// Cancel context.CancelFunc
Cancel context.CancelFunc
// Items 读写item的管道
Items chan *ItemMeta
// Spider 爬虫实例
Spider SpiderInterface
// contains filtered or unexported fields
}
Context 在引擎中的数据流通载体,负责单个抓取任务的生命周期维护
func NewContext ¶
func NewContext(request *Request, Spider SpiderInterface, opts ...ContextOption) *Context
NewContext 从内存池中构建context对象
func NewTestRequest ¶ added in v0.5.0
func NewTestRequest(spider SpiderInterface, opts ...RequestOption) *Context
type ContextOption ¶
type ContextOption func(c *Context)
ContextOption 上下文选项
func WithContextID ¶ added in v0.5.0
func WithContextID(ctxID string) ContextOption
WithContextID 设置自定义的ctxId
func WithItemChannelSize ¶ added in v0.4.1
func WithItemChannelSize(size int) ContextOption
WithItemChannelSize 设置 items 管道的缓冲大小
type CrawlEngine ¶ added in v0.4.1
type CrawlEngine struct {
// contains filtered or unexported fields
}
CrawlEngine 引擎是整个框架数据流调度核心
func NewTestEngine ¶ added in v0.5.0
func NewTestEngine(spiderName string, opts ...EngineOption) *CrawlEngine
func (*CrawlEngine) EventsWatcherRunner ¶ added in v0.4.1
func (e *CrawlEngine) EventsWatcherRunner() error
EventsWatcherRunner 事件监听器运行组件
func (*CrawlEngine) Execute ¶ added in v0.4.1
func (e *CrawlEngine) Execute(spiderName string) StatisticInterface
func (*CrawlEngine) GetComponents ¶ added in v0.5.0
func (e *CrawlEngine) GetComponents() ComponentInterface
func (*CrawlEngine) GetCurrentSpider ¶ added in v0.5.0
func (e *CrawlEngine) GetCurrentSpider() SpiderInterface
GetCurrentSpider 获取当前正在运行的spider
func (*CrawlEngine) GetRuntimeStatus ¶ added in v0.5.0
func (e *CrawlEngine) GetRuntimeStatus() *RuntimeStatus
func (*CrawlEngine) GetSpiders ¶ added in v0.4.1
func (e *CrawlEngine) GetSpiders() *Spiders
GetSpiders 获取所有的已经注册到引擎的spider实例
func (*CrawlEngine) GetStatic ¶ added in v0.5.0
func (e *CrawlEngine) GetStatic() StatisticInterface
GetStatic 获取StatisticInterface 统计指标
func (*CrawlEngine) RegisterDownloadMiddlewares ¶ added in v0.4.1
func (e *CrawlEngine) RegisterDownloadMiddlewares(middlewares MiddlewaresInterface)
RegisterDownloadMiddlewares 注册下载中间件到引擎
func (*CrawlEngine) RegisterPipelines ¶ added in v0.4.1
func (e *CrawlEngine) RegisterPipelines(pipeline PipelinesInterface)
RegisterPipelines 注册pipelines到引擎
func (*CrawlEngine) RegisterSpiders ¶ added in v0.4.1
func (e *CrawlEngine) RegisterSpiders(spider SpiderInterface)
RegisterSpiders 将spider实例注册到引擎的 spiders
func (*CrawlEngine) Scheduler ¶ added in v0.4.1
func (e *CrawlEngine) Scheduler() error
Scheduler 调度器
type DefaultComponents ¶ added in v0.5.0
type DefaultComponents struct {
// contains filtered or unexported fields
}
DefaultComponents 默认的组件
func NewDefaultComponents ¶ added in v0.5.0
func NewDefaultComponents(opts ...DefaultComponentsOption) *DefaultComponents
func (*DefaultComponents) CheckWorkersStop ¶ added in v0.5.0
func (d *DefaultComponents) CheckWorkersStop() bool
func (*DefaultComponents) GetDupefilter ¶ added in v0.5.0
func (d *DefaultComponents) GetDupefilter() RFPDupeFilterInterface
func (*DefaultComponents) GetEventHooks ¶ added in v0.5.0
func (d *DefaultComponents) GetEventHooks() EventHooksInterface
func (*DefaultComponents) GetLimiter ¶ added in v0.5.0
func (d *DefaultComponents) GetLimiter() LimitInterface
func (*DefaultComponents) GetQueue ¶ added in v0.5.0
func (d *DefaultComponents) GetQueue() CacheInterface
func (*DefaultComponents) GetStats ¶ added in v0.5.0
func (d *DefaultComponents) GetStats() StatisticInterface
func (*DefaultComponents) SetCurrentSpider ¶ added in v0.5.0
func (d *DefaultComponents) SetCurrentSpider(spider SpiderInterface)
func (*DefaultComponents) SpiderBeforeStart ¶ added in v0.5.0
func (d *DefaultComponents) SpiderBeforeStart(engine *CrawlEngine, spider SpiderInterface) error
type DefaultComponentsOption ¶ added in v0.5.0
type DefaultComponentsOption func(d *DefaultComponents)
func DefaultComponentsWithDefaultHooks ¶ added in v0.5.0
func DefaultComponentsWithDefaultHooks(events *DefaultHooks) DefaultComponentsOption
func DefaultComponentsWithDefaultLimiter ¶ added in v0.5.0
func DefaultComponentsWithDefaultLimiter(limiter *DefaultLimiter) DefaultComponentsOption
func DefaultComponentsWithDefaultQueue ¶ added in v0.5.0
func DefaultComponentsWithDefaultQueue(queue *DefaultQueue) DefaultComponentsOption
func DefaultComponentsWithDefaultStatistic ¶ added in v0.5.0
func DefaultComponentsWithDefaultStatistic(statistic *DefaultStatistic) DefaultComponentsOption
func DefaultComponentsWithDupefilter ¶ added in v0.5.0
func DefaultComponentsWithDupefilter(dupefilter *DefaultRFPDupeFilter) DefaultComponentsOption
type DefaultFieldHook ¶
type DefaultFieldHook struct {
}
func (*DefaultFieldHook) Levels ¶
func (hook *DefaultFieldHook) Levels() []logrus.Level
type DefaultHooks ¶ added in v0.4.5
type DefaultHooks struct {
// contains filtered or unexported fields
}
func NewDefaultHooks ¶ added in v0.4.5
func NewDefaultHooks() *DefaultHooks
NewDefaultHooks 构建新的默认事件监听器
func (*DefaultHooks) Error ¶ added in v0.4.5
func (d *DefaultHooks) Error(params ...interface{}) error
Error 处理ERROR事件
func (*DefaultHooks) EventsWatcher ¶ added in v0.4.5
func (d *DefaultHooks) EventsWatcher(ch chan EventType) error
EventsWatcher DefaultHooks 的事件监听器
func (*DefaultHooks) Exit ¶ added in v0.4.5
func (d *DefaultHooks) Exit(params ...interface{}) error
Exit 处理EXIT事件
func (*DefaultHooks) Heartbeat ¶ added in v0.4.5
func (d *DefaultHooks) Heartbeat(params ...interface{}) error
Heartbeat 处理HEARTBEAT事件
func (*DefaultHooks) Pause ¶ added in v0.5.0
func (d *DefaultHooks) Pause(params ...interface{}) error
Pause 处理STOP事件
func (*DefaultHooks) SetCurrentSpider ¶ added in v0.5.0
func (d *DefaultHooks) SetCurrentSpider(spider SpiderInterface)
func (*DefaultHooks) Start ¶ added in v0.4.5
func (d *DefaultHooks) Start(params ...interface{}) error
Start 处理START事件
type DefaultLimiter ¶ added in v0.5.0
type DefaultLimiter struct {
// contains filtered or unexported fields
}
DefaultLimiter 默认的限速器
func NewDefaultLimiter ¶ added in v0.4.1
func NewDefaultLimiter(limitRate int) *DefaultLimiter
NewDefaultLimiter 创建一个新的限速器 limitRate 最大请求速率
func (*DefaultLimiter) CheckAndWaitLimiterPass ¶ added in v0.5.0
func (d *DefaultLimiter) CheckAndWaitLimiterPass() error
CheckAndWaitLimiterPass 检查当前并发量 如果并发量达到上限则等待
func (*DefaultLimiter) SetCurrentSpider ¶ added in v0.5.0
func (d *DefaultLimiter) SetCurrentSpider(spider SpiderInterface)
SetCurrentSpider 设置当前的spider名
type DefaultQueue ¶ added in v0.5.0
type DefaultQueue struct {
// contains filtered or unexported fields
}
DefaultQueue request缓存队列
func NewDefaultQueue ¶ added in v0.5.0
func NewDefaultQueue(size int) *DefaultQueue
NewDefaultQueue get a new DefaultQueue
func (*DefaultQueue) Dequeue ¶ added in v0.5.0
func (c *DefaultQueue) Dequeue() (interface{}, error)
Dequeue 从队列中获取request对象
func (*DefaultQueue) Enqueue ¶ added in v0.5.0
func (c *DefaultQueue) Enqueue(ctx *Context) error
Enqueue request对象入队列
func (*DefaultQueue) SetCurrentSpider ¶ added in v0.5.0
func (c *DefaultQueue) SetCurrentSpider(spider SpiderInterface)
SetCurrentSpider 设置当前的spider
type DefaultRFPDupeFilter ¶ added in v0.5.0
type DefaultRFPDupeFilter struct {
// contains filtered or unexported fields
}
DefaultRFPDupeFilter 去重组件
func NewRFPDupeFilter ¶
func NewRFPDupeFilter(bloomP float64, bloomN int) *DefaultRFPDupeFilter
NewRFPDupeFilter 新建去重组件 bloomP容错率 bloomN数据规模
func (*DefaultRFPDupeFilter) DoDupeFilter ¶ added in v0.5.0
func (f *DefaultRFPDupeFilter) DoDupeFilter(ctx *Context) (bool, error)
DoDupeFilter 通过布隆过滤器对request对象进行去重处理
func (*DefaultRFPDupeFilter) Fingerprint ¶ added in v0.5.0
func (f *DefaultRFPDupeFilter) Fingerprint(ctx *Context) ([]byte, error)
Fingerprint 计算指纹
func (*DefaultRFPDupeFilter) SetCurrentSpider ¶ added in v0.5.0
func (f *DefaultRFPDupeFilter) SetCurrentSpider(spider SpiderInterface)
type DefaultStatistic ¶ added in v0.5.0
type DefaultStatistic struct {
// Metrics 指标-数值缓存
Metrics map[string]*uint64
// contains filtered or unexported fields
}
DefaultStatistic 数据统计指标
func NewDefaultStatistic ¶ added in v0.5.0
func NewDefaultStatistic() *DefaultStatistic
NewDefaultStatistic 默认统计数据组件构造函数
func (*DefaultStatistic) Get ¶ added in v0.5.0
func (s *DefaultStatistic) Get(metric string) uint64
Get 获取某个指标的数值
func (*DefaultStatistic) GetAllStats ¶ added in v0.5.0
func (s *DefaultStatistic) GetAllStats() map[string]uint64
GetAllStats 格式化统计数据
func (*DefaultStatistic) Incr ¶ added in v0.5.0
func (s *DefaultStatistic) Incr(metrics string)
Incr 新增一个指标值
func (*DefaultStatistic) SetCurrentSpider ¶ added in v0.5.0
func (s *DefaultStatistic) SetCurrentSpider(spider SpiderInterface)
SetCurrentSpider 设置当前的spider
type DistributedWorkerInterface ¶ added in v0.4.1
type DistributedWorkerInterface interface {
// AddNode 新增一个节点
AddNode() error
// DelNode 删除当前的节点
DelNode() error
// PauseNode 停止当前的节点
PauseNode() error
// Heartbeat 心跳
Heartbeat() error
// CheckAllNodesStop 检查所有的节点是否都已经停止
CheckAllNodesStop() (bool, error)
// CheckMasterLive 检测主节点是否还在线
CheckMasterLive() (bool, error)
// SetMaster 是否将当前的节点设置为主节点
SetMaster(flag bool)
// SetCurrentSpider 设置当前的spider
SetCurrentSpider(spider SpiderInterface)
// GetWorkerID 当前工作节点的id
GetWorkerID() string
// IsMaster 是否是主节点
IsMaster() bool
}
DistributedWorkerInterface 分布式组件接口
type Downloader ¶
type Downloader interface {
// Download 下载函数
Download(ctx *Context) (*Response, error)
// CheckStatus 检查响应状态码的合法性
CheckStatus(statusCode uint64, allowStatus []uint64) bool
}
Downloader 下载器接口
type DownloaderOption ¶
type DownloaderOption func(d *SpiderDownloader)
DownloaderOption 下载器可选参数函数
func DownloadWithClient ¶
func DownloadWithClient(client http.Client) DownloaderOption
DownloadWithClient 设置下载器的http.Client客户端
func DownloadWithH2 ¶ added in v0.4.1
func DownloadWithH2(h2 bool) DownloaderOption
DownloadWithH2 下载器是否开启http2
func DownloadWithTLSConfig ¶ added in v0.5.0
func DownloadWithTLSConfig(tls *tls.Config) DownloaderOption
DownloadWithTLSConfig 设置下载器的tls
func DownloadWithTimeout ¶
func DownloadWithTimeout(timeout time.Duration) DownloaderOption
DownloadWithTimeout 设置下载器的网络请求超时时间
func DownloaderWithtransport ¶
func DownloaderWithtransport(transport *http.Transport) DownloaderOption
DownloaderWithtransport 为下载器设置 http.Transport
type EngineOption ¶
type EngineOption func(r *CrawlEngine)
EngineOption 引擎构造过程中的可选参数
func EngineWithComponents ¶ added in v0.5.0
func EngineWithComponents(components ComponentInterface) EngineOption
func EngineWithDownloader ¶
func EngineWithDownloader(downloader Downloader) EngineOption
EngineWithDownloader 引擎使用的下载器组件
func EngineWithReqChannelSize ¶ added in v0.5.0
func EngineWithReqChannelSize(size int) EngineOption
EngineWithReqChannelSize
func EngineWithUniqueReq ¶
func EngineWithUniqueReq(uniqueReq bool) EngineOption
EngineWithUniqueReq 是否进行去重处理, true则进行去重处理,默认值为true
type ErrorOption ¶
type ErrorOption func(e *HandleError)
ErrorOption HandleError 可选参数
func ErrorWithExtras ¶ added in v0.4.1
func ErrorWithExtras(extras map[string]interface{}) ErrorOption
ErrorWithExtras HandleError 添加额外的数据
type EventHooksInterface ¶ added in v0.4.1
type EventHooksInterface interface {
// Start 处理引擎启动事件
Start(params ...interface{}) error
// Pause 处理引擎暂停事件
Pause(params ...interface{}) error
// Error处理错误事件
Error(params ...interface{}) error
// Exit 退出引擎事件
Exit(params ...interface{}) error
// Heartbeat 心跳检查事件
Heartbeat(params ...interface{}) error
// EventsWatcher 事件监听器
EventsWatcher(ch chan EventType) error
SetCurrentSpider(spider SpiderInterface)
}
EventHooksInterface 事件处理函数接口
type EventsWatcher ¶ added in v0.4.1
EventsWatcher 事件监听器
type HandleError ¶
type HandleError struct {
// CtxID 上下文id
CtxID string
// Err 处理过程的错误
Err error
// Extras 携带的额外信息
Extras map[string]interface{}
}
HandleError 错误处理接口
func NewError ¶
func NewError(ctx *Context, err error, opts ...ErrorOption) *HandleError
NewError 构建新的HandleError实例
type ItemMeta ¶
type ItemMeta struct {
// CtxID 对应的context id
CtxID string
// Item item对象
Item ItemInterface
}
ItemMeta item元数据结构
type ItemPipelines ¶
type ItemPipelines []PipelinesInterface
func (ItemPipelines) Len ¶
func (p ItemPipelines) Len() int
func (ItemPipelines) Less ¶
func (p ItemPipelines) Less(i, j int) bool
func (ItemPipelines) Swap ¶
func (p ItemPipelines) Swap(i, j int)
type LimitInterface ¶ added in v0.4.1
type LimitInterface interface {
// CheckAndWaitLimiterPass 检查当前并发量
// 如果并发量达到上限则等待
CheckAndWaitLimiterPass() error
// SetCurrentSpider 设置当前正在运行的spider
SetCurrentSpider(spider SpiderInterface)
}
LimitInterface 限速器接口
type Middlewares ¶
type Middlewares []MiddlewaresInterface
Middlewares 下载中间件队列
func (Middlewares) Less ¶
func (p Middlewares) Less(i, j int) bool
func (Middlewares) Swap ¶
func (p Middlewares) Swap(i, j int)
type MiddlewaresBase ¶
type MiddlewaresBase struct {
Priority int
}
type MiddlewaresInterface ¶
type MiddlewaresInterface interface {
// GetPriority 获取优先级,数字越小优先级越高
GetPriority() int
// ProcessRequest 处理request请求对象
// 此处用于增加请求头
// 按优先级执行
ProcessRequest(ctx *Context) error
// ProcessResponse 用于处理请求成功之后的response
// 执行顺序逆优先级,即优先级越高执行顺序越晚
ProcessResponse(ctx *Context, req chan<- *Context) error
// GetName 获取中间件的名称
GetName() string
}
MiddlewaresInterface 下载中间件的接口用于处理进入下载器之前的request对象 和下载之后的response
type PipelinesBase ¶
type PipelinesBase struct {
Priority int
}
type PipelinesInterface ¶
type PipelinesInterface interface {
// GetPriority 获取当前pipeline的优先级
GetPriority() int
// ProcessItem item处理单元
ProcessItem(spider SpiderInterface, item *ItemMeta) error
}
PipelinesInterface pipeline 接口 pipeline 主要用于处理item,例如数据存储、数据清洗 将多个pipeline注册到引擎可以实现责任链模式的数据处理
type ProcessResponse ¶
ProcessResponse 处理下载之后的response函数
type RFPDupeFilterInterface ¶
type RFPDupeFilterInterface interface {
// Fingerprint request指纹计算
Fingerprint(ctx *Context) ([]byte, error)
// DoDupeFilter request去重
DoDupeFilter(ctx *Context) (bool, error)
SetCurrentSpider(spider SpiderInterface)
}
RFPDupeFilterInterface request 对象指纹计算和布隆过滤器去重
type Request ¶
type Request struct {
// Url 请求Url
Url string `json:"url"`
// Headers 请求头
Headers map[string]string `json:"headers"`
// Method 请求方式
Method RequestMethod `json:"method"`
// Params 请求url的参数
Params map[string]string `json:"params"`
// Proxy 代理实例
Proxy *Proxy `json:"-"`
// Cookies 请求携带的cookies
Cookies map[string]string `json:"cookies"`
// Meta 请求携带的额外的信息
Meta map[string]interface{} `json:"meta"`
// AllowRedirects 是否允许跳转默认允许
AllowRedirects bool `json:"allowRedirects"`
// MaxRedirects 最大的跳转次数
MaxRedirects int `json:"maxRedirects"`
// Parser 该请求绑定的响应解析函数名,必须是某个spider实例的解析函数
Parser string `json:"parser"`
// MaxConnsPerHost 单个域名最大的连接数
MaxConnsPerHost int `json:"maxConnsPerHost"`
// AllowStatusCode 允许的状态码
AllowStatusCode []uint64 `json:"allowStatusCode"`
// Timeout 请求超时时间
Timeout time.Duration `json:"timeout"`
// DoNotFilter
DoNotFilter bool
// contains filtered or unexported fields
}
Request 请求对象的结构
func NewRequest ¶
func NewRequest(url string, method RequestMethod, parser Parser, opts ...RequestOption) *Request
请注意parser函数必须是某一个spiderinterface实例的解析函数 否则无法正常调用该解析函数
func RequestFromMap ¶ added in v0.4.1
func RequestFromMap(src map[string]interface{}, opts ...RequestOption) *Request
RequestFromMap 从map创建requests
type RequestMethod ¶ added in v0.4.1
type RequestMethod string
RequestMethod 请求方式
const ( // GET 请求 GET RequestMethod = "GET" // POST 请求 POST RequestMethod = "POST" // PUT 请求 PUT RequestMethod = "PUT" // DELETE 请求 DELETE RequestMethod = "DELETE" // OPTIONS 请求 OPTIONS RequestMethod = "OPTIONS" // HEAD 请求 HEAD RequestMethod = "HEAD" )
type RequestOption ¶ added in v0.4.1
type RequestOption func(r *Request)
RequestOption NewRequest 可选参数
func RequestWithAllowRedirects ¶
func RequestWithAllowRedirects(allowRedirects bool) RequestOption
RequestWithAllowRedirects 设置是否允许跳转 如果不允许则MaxRedirects=0
func RequestWithAllowedStatusCode ¶ added in v0.4.1
func RequestWithAllowedStatusCode(allowStatusCode []uint64) RequestOption
RequestWithAllowedStatusCode 设置AllowStatusCode
func RequestWithBodyReader ¶ added in v0.5.0
func RequestWithBodyReader(body io.Reader) RequestOption
RequestWithBodyReader set request body io.Reader
func RequestWithDoNotFilter ¶ added in v0.4.6
func RequestWithDoNotFilter(doNotFilter bool) RequestOption
RequestWithDoNotFilter 设置当前请求是否进行过滤处理, true则认为该条请求无需进入去重流程,默认值为false
func RequestWithMaxConnsPerHost ¶
func RequestWithMaxConnsPerHost(maxConnsPerHost int) RequestOption
RequestWithMaxConnsPerHost 设置MaxConnsPerHost
func RequestWithMaxRedirects ¶
func RequestWithMaxRedirects(maxRedirects int) RequestOption
RequestWithMaxRedirects 设置最大的跳转次数 若maxRedirects <= 0则认为不允许跳转AllowRedirects = false
func RequestWithParser ¶ added in v0.4.1
func RequestWithParser(parser Parser) RequestOption
RequestWithParser 设置Parser
func RequestWithPostForm ¶ added in v0.5.0
func RequestWithPostForm(payload url.Values) RequestOption
RequestWithPostForm set application/x-www-form-urlencoded request body reader
func RequestWithRequestBody ¶
func RequestWithRequestBody(body map[string]interface{}) RequestOption
RequestWithRequestBody 传入请求体到request
func RequestWithRequestBytesBody ¶ added in v0.4.1
func RequestWithRequestBytesBody(body []byte) RequestOption
RequestWithRequestBytesBody request绑定bytes body
func RequestWithRequestCookies ¶
func RequestWithRequestCookies(cookies map[string]string) RequestOption
RequestWithRequestCookies 设置cookie
func RequestWithRequestHeader ¶
func RequestWithRequestHeader(headers map[string]string) RequestOption
RequestWithRequestHeader 设置请求头
func RequestWithRequestMeta ¶
func RequestWithRequestMeta(meta map[string]interface{}) RequestOption
RequestWithRequestMeta 设置 meta
func RequestWithRequestParams ¶
func RequestWithRequestParams(params map[string]string) RequestOption
RequestWithRequestParams 设置请求的url参数
func RequestWithRequestProxy ¶
func RequestWithRequestProxy(proxy Proxy) RequestOption
RequestWithRequestProxy 设置代理
func RequestWithTimeout ¶ added in v0.4.1
func RequestWithTimeout(timeout time.Duration) RequestOption
RequestWithTimeout 设置请求超时时间 若timeout<=0则认为没有超时时间
type Response ¶
type Response struct {
// Status状态码
Status int
// Headers 响应头
Headers map[string][]string // Header response header
// Delay 请求延迟
Delay float64 // Delay the time of handle download request
// ContentLength 响应体大小
ContentLength uint64 // ContentLength response content length
// URL 请求url
URL string // URL of request url
// Buffer 响应体缓存
Buffer *bytes.Buffer // buffer read response buffer
Body io.ReadCloser
// contains filtered or unexported fields
}
Response 请求响应体的结构
type RuntimeStatus ¶ added in v0.5.0
type RuntimeStatus struct {
StartAt int64
Duration float64
StopAt int64
RestartAt int64
// StatusOn 当前引擎的状态
StatusOn StatusType
}
func NewRuntimeStatus ¶ added in v0.5.0
func NewRuntimeStatus() *RuntimeStatus
func (*RuntimeStatus) GetDuration ¶ added in v0.5.0
func (r *RuntimeStatus) GetDuration() float64
GetDuration 爬虫运行时长
func (*RuntimeStatus) GetRestartAt ¶ added in v0.5.0
func (r *RuntimeStatus) GetRestartAt() int64
GetRestartAt 获取引擎重启的时间戳
func (*RuntimeStatus) GetStartAt ¶ added in v0.5.0
func (r *RuntimeStatus) GetStartAt() int64
GetStartAt 获取引擎启动的时间戳
func (*RuntimeStatus) GetStatusOn ¶ added in v0.5.0
func (r *RuntimeStatus) GetStatusOn() StatusType
GetStatusOn 获取引擎的状态
func (*RuntimeStatus) GetStopAt ¶ added in v0.5.0
func (r *RuntimeStatus) GetStopAt() int64
GetStopAt 爬虫停止的时间戳
func (*RuntimeStatus) SetDuration ¶ added in v0.5.0
func (r *RuntimeStatus) SetDuration(duration float64)
func (*RuntimeStatus) SetRestartAt ¶ added in v0.5.0
func (r *RuntimeStatus) SetRestartAt(startAt int64)
func (*RuntimeStatus) SetStartAt ¶ added in v0.5.0
func (r *RuntimeStatus) SetStartAt(startAt int64)
func (*RuntimeStatus) SetStatus ¶ added in v0.5.0
func (r *RuntimeStatus) SetStatus(status StatusType)
SetStatus 设置引擎状态 用于控制引擎的启停
func (*RuntimeStatus) SetStopAt ¶ added in v0.5.0
func (r *RuntimeStatus) SetStopAt(stopAt int64)
type SpiderDownloader ¶
type SpiderDownloader struct {
// ProxyFunc 对单个请求进行代理设置
ProxyFunc func(req *http.Request) (*url.URL, error)
// contains filtered or unexported fields
}
SpiderDownloader tegenaria 爬虫下载器
func (*SpiderDownloader) CheckStatus ¶
func (d *SpiderDownloader) CheckStatus(statusCode uint64, allowStatus []uint64) bool
CheckStatus 检查状态码是否合法
type SpiderInterface ¶
type SpiderInterface interface {
// StartRequest 通过GetFeedUrls()获取种子
// urls并构建初始请求
StartRequest(req chan<- *Context)
// Parser 默认的请求响应解析函数
// 在解析过程中生成的新的请求可以推送到req channel
Parser(resp *Context, req chan<- *Context) error
// ErrorHandler 错误处理函数,允许在此过程中生成新的请求
// 并推送到req channel
ErrorHandler(err *Context, req chan<- *Context)
// GetName 获取spider名称
GetName() string
// GetFeedUrls 获取种子urls
GetFeedUrls() []string
}
SpiderInterface is the Tegenaria spider interface; developers must implement this interface to build a custom spider.
type Spiders ¶
type Spiders struct {
// SpidersModules spider名称和spider实例的映射
SpidersModules map[string]SpiderInterface
// Parsers parser函数名和函数的映射
// 用于序列化和反序列化
Parsers map[string]Parser
}
Spiders 全局spiders管理器 用于接收注册的SpiderInterface实例
var SpidersList *Spiders
SpidersList 注册到引擎的爬虫列表
func (*Spiders) GetAllSpidersName ¶ added in v0.5.0
func (*Spiders) GetSpider ¶
func (s *Spiders) GetSpider(name string) (SpiderInterface, error)
GetSpider 通过爬虫名获取spider实例
func (*Spiders) Register ¶
func (s *Spiders) Register(spider SpiderInterface) error
Register spider实例注册到Spiders.SpidersModules
type StatisticInterface ¶ added in v0.4.1
type StatisticInterface interface {
// GetAllStats 获取所有的指标数据
GetAllStats() map[string]uint64
// Incr 指定的指标计数器自增1
Incr(metric string)
// Get 获取指标的数值
Get(metric string) uint64
// SetCurrentSpider 设置当前的爬虫实例
SetCurrentSpider(spider SpiderInterface)
}
StatisticInterface 数据统计组件接口
type StatusType ¶ added in v0.5.0
type StatusType uint
StatusType 当前引擎的状态
const ( // ON_START 启动状态 ON_START StatusType = iota // ON_STOP 停止状态 ON_STOP // ON_PAUSE 暂停状态 ON_PAUSE )
func (StatusType) GetTypeName ¶ added in v0.5.0
func (p StatusType) GetTypeName() string
GetTypeName 获取引擎状态的字符串形式
type TestDownloadMiddler ¶ added in v0.5.0
func (TestDownloadMiddler) GetName ¶ added in v0.5.0
func (m TestDownloadMiddler) GetName() string
func (TestDownloadMiddler) GetPriority ¶ added in v0.5.0
func (m TestDownloadMiddler) GetPriority() int
func (TestDownloadMiddler) ProcessRequest ¶ added in v0.5.0
func (m TestDownloadMiddler) ProcessRequest(ctx *Context) error
func (TestDownloadMiddler) ProcessResponse ¶ added in v0.5.0
func (m TestDownloadMiddler) ProcessResponse(ctx *Context, req chan<- *Context) error
type TestDownloadMiddler2 ¶ added in v0.5.0
func (TestDownloadMiddler2) GetName ¶ added in v0.5.0
func (m TestDownloadMiddler2) GetName() string
func (TestDownloadMiddler2) GetPriority ¶ added in v0.5.0
func (m TestDownloadMiddler2) GetPriority() int
func (TestDownloadMiddler2) ProcessRequest ¶ added in v0.5.0
func (m TestDownloadMiddler2) ProcessRequest(ctx *Context) error
func (TestDownloadMiddler2) ProcessResponse ¶ added in v0.5.0
func (m TestDownloadMiddler2) ProcessResponse(ctx *Context, req chan<- *Context) error
type TestItemPipeline ¶ added in v0.5.0
type TestItemPipeline struct {
Priority int
}
func (*TestItemPipeline) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline) GetPriority() int
func (*TestItemPipeline) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestItemPipeline2 ¶ added in v0.5.0
type TestItemPipeline2 struct {
Priority int
}
func (*TestItemPipeline2) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline2) GetPriority() int
func (*TestItemPipeline2) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline2) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestItemPipeline3 ¶ added in v0.5.0
type TestItemPipeline3 struct {
Priority int
}
func (*TestItemPipeline3) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline3) GetPriority() int
func (*TestItemPipeline3) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline3) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestItemPipeline4 ¶ added in v0.5.0
type TestItemPipeline4 struct {
Priority int
}
func (*TestItemPipeline4) GetPriority ¶ added in v0.5.0
func (p *TestItemPipeline4) GetPriority() int
func (*TestItemPipeline4) ProcessItem ¶ added in v0.5.0
func (p *TestItemPipeline4) ProcessItem(spider SpiderInterface, item *ItemMeta) error
type TestSpider ¶ added in v0.5.0
type TestSpider struct {
*BaseSpider
}
func (*TestSpider) ErrorHandler ¶ added in v0.5.0
func (s *TestSpider) ErrorHandler(err *Context, req chan<- *Context)
func (*TestSpider) GetFeedUrls ¶ added in v0.5.0
func (s *TestSpider) GetFeedUrls() []string
func (*TestSpider) GetName ¶ added in v0.5.0
func (s *TestSpider) GetName() string
func (*TestSpider) Parser ¶ added in v0.5.0
func (s *TestSpider) Parser(resp *Context, req chan<- *Context) error
func (*TestSpider) StartRequest ¶ added in v0.5.0
func (s *TestSpider) StartRequest(req chan<- *Context)