cobweb

package
v0.0.0-...-34aaaa0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Sep 13, 2022 License: Apache-2.0 Imports: 29 Imported by: 0

Documentation

Index

Constants

View Source
const (
	DefaultDownloadTimeout       = time.Second * 20
	DefaultCommandFailedCntLimit = 60
)

Variables

This section is empty.

Functions

func NewTestSuits

func NewTestSuits(t *testing.T) *testSuits

Types

type AbsProxyStorage

type AbsProxyStorage interface {
	GetProxy(proxy *Proxy) (*Proxy, error)
	HasProxy(proxy *Proxy) (bool, error)
	CreateProxy(proxy *Proxy) error
	CreateProxyList([]*Proxy) int
	ActivateProxy(proxy *Proxy) error
	DeactivateProxy(proxy *Proxy) error
	GetTopKProxyList(k int) ([]*Proxy, error)
	GetRandTopKProxy(k int) (*Proxy, error)
	GetAllProxy() ([]*Proxy, error)
	GetProxyListWithRefuseList(refuseList []*Proxy, count int) ([]*Proxy, error)
	GetRandProxyWithRefuseList([]*Proxy) (*Proxy, error)
}

func ProxyStorageSingleton

func ProxyStorageSingleton() AbsProxyStorage

type AnonymityLevel

type AnonymityLevel int
const (
	Transparent AnonymityLevel = iota
	Anonymous
	Elite
)

type BaseRule

type BaseRule interface {
	InitLinks() []string
	InitParse(ctx *Context)
}

type CommandDownloadFailLimitRule

type CommandDownloadFailLimitRule interface {
	CommandDownloadFailLimit() int
}

type CommandFailedCntLimitRule

type CommandFailedCntLimitRule interface {
	CommandFailedCntLimit() int
}

type Context

type Context struct {
	// contains filtered or unexported fields
}

func (*Context) Doc

func (c *Context) Doc() *goquery.Document

func (*Context) Follow

func (c *Context) Follow(link string, callback OnParseCallback, data ...H)

func (*Context) FollowWithBuilder

func (c *Context) FollowWithBuilder(link string, callback OnParseCallback, fBuilder *FollowBuilder)

func (*Context) Get

func (c *Context) Get(key string) (interface{}, bool)

func (*Context) HTML

func (c *Context) HTML(selector string, callback func(element *HTMLElement)) int

func (*Context) Item

func (c *Context) Item(item interface{})

Item adds a new item so that it can be pipelined.

func (*Context) MayDoc

func (c *Context) MayDoc() (*goquery.Document, error)

func (*Context) MayHTML

func (c *Context) MayHTML(selector string, callback func(element *HTMLElement)) (int, error)

func (*Context) NewFollowBuilder

func (c *Context) NewFollowBuilder() *FollowBuilder

func (*Context) Retry

func (c *Context) Retry()

func (*Context) SaveResource

func (c *Context) SaveResource(link string, fileName string)

SaveResource saves the resource at link to instance/[taskName].[taskID]/[fileName].

func (*Context) Set

func (c *Context) Set(key string, value interface{})

type DownloadTimeoutRule

type DownloadTimeoutRule interface {
	DownloadTimeout() time.Duration
}

type Executor

type Executor struct {
	// contains filtered or unexported fields
}

Executor is a main part of cobweb: it accepts a rule and creates a task according to that rule.

func NewDefaultExecutor

func NewDefaultExecutor() *Executor

func NewExecutor

func NewExecutor(
	dFactory downloaderFactory,
	downloaderCnt int,
	downloaderConcurrentLimit int,
	downloaderErrCntLimit int,
	downloaderReqHostInterval time.Duration,
) *Executor

func NewExecutorWithSimpleDownloaderManager

func NewExecutorWithSimpleDownloaderManager() *Executor

func NewNoProxyDefaultExecutor

func NewNoProxyDefaultExecutor() *Executor

func (*Executor) AcceptRule

func (e *Executor) AcceptRule(rule BaseRule) *Task

AcceptRule accepts a rule and creates a task for it. If the executor is not running, it returns nil.

func (*Executor) Stop

func (e *Executor) Stop()

type FollowBuilder

type FollowBuilder struct {
	// contains filtered or unexported fields
}

FollowBuilder implements the builder pattern for Context.FollowWithBuilder.

func (FollowBuilder) Callback

func (b FollowBuilder) Callback(callback OnParseCallback) *commandBuilder

func (FollowBuilder) ContextData

func (b FollowBuilder) ContextData(data H) *commandBuilder

func (FollowBuilder) Cookie

func (b FollowBuilder) Cookie(key, val string) *commandBuilder

func (FollowBuilder) DownloadTimeout

func (b FollowBuilder) DownloadTimeout(timeout time.Duration) *commandBuilder

func (FollowBuilder) Link

func (b FollowBuilder) Link(link string) *commandBuilder

func (FollowBuilder) UserAgent

func (b FollowBuilder) UserAgent(agent string) *commandBuilder

type H

type H map[string]interface{}

type HTMLElement

type HTMLElement struct {
	// contains filtered or unexported fields
}

HTMLElement represents a single element in an HTML document.

func (*HTMLElement) Attr

func (e *HTMLElement) Attr(key string) string

func (*HTMLElement) ChildAttr

func (e *HTMLElement) ChildAttr(selector, key string) string

func (*HTMLElement) ChildText

func (e *HTMLElement) ChildText(selector string) string

func (*HTMLElement) ChildrenAttrs

func (e *HTMLElement) ChildrenAttrs(selector, attrname string) []string

func (*HTMLElement) ChildrenTexts

func (e *HTMLElement) ChildrenTexts(selector string) []string

func (*HTMLElement) ForEach

func (e *HTMLElement) ForEach(selector string, callback func(element *HTMLElement))

func (*HTMLElement) MayAttr

func (e *HTMLElement) MayAttr(key string) string

func (*HTMLElement) MayChildAttr

func (e *HTMLElement) MayChildAttr(selector, key string) string

func (*HTMLElement) MayChildText

func (e *HTMLElement) MayChildText(selector string) string

func (*HTMLElement) MayChildrenAttrs

func (e *HTMLElement) MayChildrenAttrs(selector, attrname string) []string

func (*HTMLElement) MayChildrenTexts

func (e *HTMLElement) MayChildrenTexts(selector string) []string

func (*HTMLElement) MayForEach

func (e *HTMLElement) MayForEach(selector string, callback func(element *HTMLElement))

func (*HTMLElement) Text

func (e *HTMLElement) Text() string

type JsonFilePipeline

type JsonFilePipeline struct {
	// contains filtered or unexported fields
}

func (*JsonFilePipeline) Close

func (p *JsonFilePipeline) Close()

func (*JsonFilePipeline) Pipe

func (p *JsonFilePipeline) Pipe(info *itemInfo)

type JsonStdoutPipeline

type JsonStdoutPipeline struct {
}

func (*JsonStdoutPipeline) Close

func (p *JsonStdoutPipeline) Close()

func (*JsonStdoutPipeline) Pipe

func (p *JsonStdoutPipeline) Pipe(info *itemInfo)

type NoProxyFastHTTPDownloaderFactory

type NoProxyFastHTTPDownloaderFactory struct {
}

type OnDownloadFinishCallback

type OnDownloadFinishCallback func(ctx *Context)

type OnDownloadFinishRule

type OnDownloadFinishRule interface {
	OnDownloadFinish(ctx *Context)
}

type OnParseCallback

type OnParseCallback func(ctx *Context)

type OnParseErrorCallback

type OnParseErrorCallback func(info *ParseErrorInfo)

type OnParseErrorRule

type OnParseErrorRule interface {
	OnParseError(info *ParseErrorInfo)
}

type OnPipeErrorCallback

type OnPipeErrorCallback func(info *PipeErrorInfo)

type OnPipeErrorRule

type OnPipeErrorRule interface {
	OnPipeError(info *PipeErrorInfo)
}

type ParseErrorInfo

type ParseErrorInfo struct {
	Ctx        *Context
	ErrKind    ParseErrorKind
	PanicValue interface{}
}

type ParseErrorKind

type ParseErrorKind int
const (
	UnknownParseError ParseErrorKind = iota
	ParseHTMLError
	HTMLNodeNotFoundError
)

func (ParseErrorKind) String

func (kind ParseErrorKind) String() string

type PipeErrorInfo

type PipeErrorInfo struct {
	Ctx       *Context
	ErrKind   PipeErrorKind
	PanicInfo interface{}
}

type PipeErrorKind

type PipeErrorKind int
const (
	UnknownPipeError PipeErrorKind = iota
)

type Pipeline

type Pipeline interface {
	Pipe(info *itemInfo)
	Close()
}

type PipelineRule

type PipelineRule interface {
	Pipelines() []Pipeline
}

type Proxy

type Proxy struct {
	ID int `gorm:"PRIMARY_KEY;AUTO_INCREMENT;"`

	Host      string
	Port      string
	HTTPS     bool
	Anonymity AnonymityLevel

	Score int
}

Proxy is the struct that describes a proxy.

func (*Proxy) Equal

func (this *Proxy) Equal(proxy *Proxy) bool

func (*Proxy) FastHTTPDialHTTPProxy

func (this *Proxy) FastHTTPDialHTTPProxy() fasthttp.DialFunc

func (*Proxy) GetProxyURL

func (this *Proxy) GetProxyURL() string

func (*Proxy) Json

func (this *Proxy) Json() string

type ProxyFastHTTPDownloaderFactory

type ProxyFastHTTPDownloaderFactory struct {
}

type ProxyPool

type ProxyPool struct {
	// contains filtered or unexported fields
}

func NewProxyPool

func NewProxyPool(
	executor *Executor,
	proxyReqTimeout time.Duration,
	checkRoutineMaxCount int,
) *ProxyPool

func (*ProxyPool) Start

func (p *ProxyPool) Start()

func (*ProxyPool) Stop

func (p *ProxyPool) Stop()

type Task

type Task struct {
	// contains filtered or unexported fields
}

func (*Task) ID

func (t *Task) ID() string

func (*Task) Name

func (t *Task) Name() string

func (*Task) Wait

func (t *Task) Wait()

type TaskNameRule

type TaskNameRule interface {
	TaskName() string
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL