Documentation
¶
Overview ¶
Package spider provides spider rule definition, species registration, and parsing.
Index ¶
- Constants
- Variables
- func PutContext(ctx *Context)
- func RegisterDynamicSpiders()
- type Bell
- type Clock
- type Context
- func (ctx *Context) AddQueue(req *request.Request) *Context
- func (ctx *Context) Aid(aid map[string]interface{}, ruleName ...string) interface{}
- func (ctx *Context) CopyRequest() *request.Request
- func (ctx *Context) CopyTemps() request.Temp
- func (ctx *Context) CreateItem(item map[int]interface{}, ruleName ...string) map[string]interface{}
- func (ctx *Context) FileOutput(nameOrExt ...string)
- func (ctx *Context) GetCookie() string
- func (ctx *Context) GetDom() *goquery.Document
- func (ctx *Context) GetError() error
- func (ctx *Context) GetHeader() http.Header
- func (ctx *Context) GetHost() string
- func (ctx *Context) GetItemField(index int, ruleName ...string) (field string)
- func (ctx *Context) GetItemFieldIndex(field string, ruleName ...string) (index int)
- func (ctx *Context) GetItemFields(ruleName ...string) []string
- func (ctx *Context) GetKeyin() string
- func (ctx *Context) GetLimit() int
- func (ctx *Context) GetMethod() string
- func (ctx *Context) GetName() string
- func (ctx *Context) GetReferer() string
- func (ctx *Context) GetRequest() *request.Request
- func (ctx *Context) GetRequestHeader() http.Header
- func (ctx *Context) GetResponse() *http.Response
- func (ctx *Context) GetRule(ruleName string) *Rule
- func (ctx *Context) GetRuleName() string
- func (ctx *Context) GetRules() map[string]*Rule
- func (ctx *Context) GetSpider() *Spider
- func (ctx *Context) GetStatusCode() int
- func (ctx *Context) GetTemp(key string, defaultValue interface{}) interface{}
- func (ctx *Context) GetTemps() request.Temp
- func (ctx *Context) GetText() string
- func (ctx *Context) GetURL() string
- func (ctx *Context) JsAddQueue(jreq map[string]interface{}) *Context
- func (*Context) Log() logs.Logs
- func (ctx *Context) Output(item interface{}, ruleName ...string)
- func (ctx *Context) Parse(ruleName ...string) *Context
- func (ctx *Context) PullFiles() (fs []data.FileCell)
- func (ctx *Context) PullItems() (ds []data.DataCell)
- func (ctx *Context) ResetText(body string) *Context
- func (ctx *Context) RunTimer(id string) bool
- func (ctx *Context) SetError(err error)
- func (ctx *Context) SetKeyin(keyin string) *Context
- func (ctx *Context) SetLimit(max int) *Context
- func (ctx *Context) SetPausetime(pause int64, runtime ...bool) *Context
- func (ctx *Context) SetReferer(referer string) *Context
- func (ctx *Context) SetResponse(resp *http.Response) *Context
- func (ctx *Context) SetTemp(key string, value interface{}) *Context
- func (ctx *Context) SetTimer(id string, tol time.Duration, bell *Bell) bool
- func (ctx *Context) SetURL(url string) *Context
- func (ctx *Context) UpsertItemField(field string, ruleName ...string) (index int)
- type Rule
- type RuleModle
- type RuleTree
- type Spider
- func (sp *Spider) CanStop() bool
- func (sp *Spider) Copy() *Spider
- func (sp *Spider) Defer()
- func (sp *Spider) DoHistory(req *request.Request, ok bool) bool
- func (sp *Spider) GetDescription() string
- func (sp *Spider) GetEnableCookie() bool
- func (sp *Spider) GetID() int
- func (sp *Spider) GetItemField(rule *Rule, index int) (field string)
- func (sp *Spider) GetItemFieldIndex(rule *Rule, field string) (index int)
- func (sp *Spider) GetItemFields(rule *Rule) []string
- func (sp *Spider) GetKeyin() string
- func (sp *Spider) GetLimit() int64
- func (sp *Spider) GetName() string
- func (sp *Spider) GetRule(ruleName string) *Rule
- func (sp *Spider) GetRules() map[string]*Rule
- func (sp *Spider) GetSubName() string
- func (sp *Spider) IsStopping() bool
- func (sp *Spider) MustGetRule(ruleName string) *Rule
- func (sp *Spider) OutDefaultField() bool
- func (sp *Spider) Register() *Spider
- func (sp *Spider) ReqmatrixInit() *Spider
- func (sp *Spider) RequestFree()
- func (sp *Spider) RequestLen() int
- func (sp *Spider) RequestPull() *request.Request
- func (sp *Spider) RequestPush(req *request.Request)
- func (sp *Spider) RequestUse()
- func (sp *Spider) RunTimer(id string) bool
- func (sp *Spider) SetID(id int)
- func (sp *Spider) SetKeyin(keyword string)
- func (sp *Spider) SetLimit(max int64)
- func (sp *Spider) SetPausetime(pause int64, runtime ...bool)
- func (sp *Spider) SetTimer(id string, tol time.Duration, bell *Bell) bool
- func (sp *Spider) Start()
- func (sp *Spider) Stop()
- func (sp *Spider) TryFlushFailure()
- func (sp *Spider) TryFlushSuccess()
- func (sp *Spider) UpsertItemField(rule *Rule, field string) (index int)
- type SpiderModle
- type SpiderSpecies
- type Timer
Constants ¶
const (
	KEYIN       = util.USE_KEYIN // rules that use Spider.Keyin must set its initial value to USE_KEYIN
	LIMIT       = math.MaxInt64  // rules that customize Limit must set its initial value to LIMIT
	FORCED_STOP = "-- Forced stop of Spider --"
)
const (
	A = iota // alarm mode
	T        // countdown mode
)
Variables ¶
var ErrForcedStop = errors.New("forced stop")
var Species = &SpiderSpecies{
	list: []*Spider{},
	hash: map[string]*Spider{},
}
Species is the singleton spider registry.
Functions ¶
func PutContext ¶
func PutContext(ctx *Context)
PutContext resets a Context and returns it to the pool.
func RegisterDynamicSpiders ¶ added in v1.4.0
func RegisterDynamicSpiders()
RegisterDynamicSpiders loads and registers all dynamic (JS-based) spider rules from config.Conf().SpiderDir. Safe to call multiple times; only the first call performs registration.
Types ¶
type Clock ¶
type Clock struct {
// contains filtered or unexported fields
}
Clock represents a single alarm or countdown timer.
type Context ¶
type Context struct {
Request *request.Request
Response *http.Response // HTTP response; its URL is copied from *request.Request
sync.Mutex
// contains filtered or unexported fields
}
Context carries the state for a single crawl request through its lifecycle.
func GetContext ¶
GetContext retrieves a Context from the pool and binds it to the given spider and request.
func (*Context) AddQueue ¶
AddQueue validates and enqueues a new crawl request.
Required fields: Request.URL, Request.Rule. Request.Spider is set automatically; Request.EnableCookie is inherited from Spider.
Fields with defaults (may be omitted):
- Method: GET
- DialTimeout: request.DefaultDialTimeout (negative = unlimited)
- ConnTimeout: request.DefaultConnTimeout (negative = unlimited)
- TryTimes: request.DefaultTryTimes (negative = unlimited retries)
- RedirectTimes: unlimited by default (negative = disable redirects)
- RetryPause: request.DefaultRetryPause
- DownloaderID: 0 = Surf (fast, full-featured), 1 = PhantomJS (slow, JS-capable)
Referer is auto-filled from the current response URL if not set.
func (*Context) Aid ¶
Aid invokes the AidFunc of the specified rule. An empty ruleName defaults to the current rule.
func (*Context) CopyRequest ¶
CopyRequest returns a deep copy of the original request.
func (*Context) CreateItem ¶ added in v1.4.0
CreateItem builds a text result map keyed by field names using the ItemFields of ruleName. An empty ruleName defaults to the current rule.
func (*Context) FileOutput ¶
FileOutput collects a file result from the response body. nameOrExt optionally specifies a file name or extension; empty keeps the original. Errors are logged internally; no return value for JS VM compatibility.
func (*Context) GetDom ¶
GetDom returns the parsed HTML DOM, initializing it lazily from the response body. Errors are stored in ctx.err and can be retrieved via GetError().
func (*Context) GetError ¶
GetError returns the download error, or the spider's stop error if stopping.
func (*Context) GetItemField ¶
GetItemField returns the field name at the given index, or "" if not found. An empty ruleName defaults to the current rule.
func (*Context) GetItemFieldIndex ¶
GetItemFieldIndex returns the index of the given field name, or -1 if not found. An empty ruleName defaults to the current rule.
func (*Context) GetItemFields ¶
GetItemFields returns the result field name list for the given rule.
func (*Context) GetReferer ¶
GetReferer returns the Referer header from the actual HTTP request made.
func (*Context) GetRequest ¶
GetRequest returns the original request.
func (*Context) GetRequestHeader ¶
GetRequestHeader returns the request headers from the actual HTTP request made.
func (*Context) GetResponse ¶
GetResponse returns the HTTP response.
func (*Context) GetRuleName ¶
GetRuleName returns the current rule name from the request.
func (*Context) GetStatusCode ¶
GetStatusCode returns the HTTP response status code, or 0 if no response.
func (*Context) GetTemp ¶
GetTemp retrieves temporary data from the request by key. defaultValue must not be a nil interface{}.
func (*Context) GetText ¶
GetText returns the response body as a UTF-8 string, initializing it lazily. Errors are stored in ctx.err and can be retrieved via GetError().
func (*Context) GetURL ¶ added in v1.4.0
GetURL returns the URL from the original request, preserving the unencoded form.
func (*Context) JsAddQueue ¶
JsAddQueue adds crawl requests from dynamic (JavaScript) rule definitions.
func (*Context) Output ¶
Output collects a text result item.
When item is map[int]interface{}, fields are mapped using the existing ItemFields of ruleName. When item is map[string]interface{}, missing ItemFields are auto-added. An empty ruleName defaults to the current rule.
func (*Context) Parse ¶
Parse dispatches the response to the ParseFunc of the specified rule. An empty ruleName defaults to Root().
func (*Context) PullFiles ¶
PullFiles drains and returns all collected file results, resetting the internal buffer.
func (*Context) PullItems ¶
PullItems drains and returns all collected data items, resetting the internal buffer.
func (*Context) ResetText ¶
ResetText replaces the downloaded text content and invalidates the DOM cache.
func (*Context) RunTimer ¶
RunTimer starts the timer and reports whether it can continue to be used.
func (*Context) SetPausetime ¶
SetPausetime sets a custom pause interval (randomized: pause/2 ~ pause*2). Overrides the externally configured value. Only overwrites an existing value when runtime[0] is true.
func (*Context) SetReferer ¶
func (*Context) SetResponse ¶
SetResponse binds the HTTP response to this context.
func (*Context) SetTimer ¶
SetTimer configures a timer identified by id. When bell is nil, tol is a sleep duration (countdown timer). When bell is non-nil, tol specifies the wake-up point (the tol-th bell occurrence from now).
func (*Context) UpsertItemField ¶
UpsertItemField adds a result field name to the given rule and returns its index. If the field already exists, the existing index is returned. An empty ruleName defaults to the current rule.
type Rule ¶
type Rule struct {
ItemFields []string // result field names (optional; preserves field order)
ParseFunc func(*Context) // content parsing function
AidFunc func(*Context, map[string]interface{}) interface{} // auxiliary helper function
}
Rule defines a single crawl rule node.
type RuleModle ¶
type RuleModle struct {
Name string `xml:"name,attr"`
ParseFunc string `xml:"ParseFunc>Script"`
AidFunc string `xml:"AidFunc>Script"`
}
RuleModle is the XML model for a single dynamic rule node.
type RuleTree ¶
type RuleTree struct {
Root func(*Context) // entry point
Trunk map[string]*Rule // rule map (keyed by rule name)
}
RuleTree defines the crawl rule tree.
type Spider ¶
type Spider struct {
// User-defined fields
Name string // display name (must be unique)
Description string // display description
Pausetime int64 // random pause range (50%~200%); if set in rule, overrides UI parameter
Limit int64 // request limit (0 = unlimited; set to LIMIT for custom limit logic in rules)
Keyin string // custom input config (set to KEYIN in rules to enable)
EnableCookie bool // whether requests carry cookies
NotDefaultField bool // disable default output fields Url/ParentUrl/DownloadTime
Namespace func(sp *Spider) string // namespace for output file/path naming
SubNamespace func(self *Spider, dataCell map[string]interface{}) string // sub-namespace, may depend on specific data content
RuleTree *RuleTree // crawl rule tree
// contains filtered or unexported fields
}
Spider defines a crawl spider with its rules and runtime state.
func (*Spider) Defer ¶
func (sp *Spider) Defer()
Defer performs cleanup before the spider exits: cancels timers, waits for in-flight requests, and flushes failures.
func (*Spider) DoHistory ¶
DoHistory records request history and reports whether a failed request was re-enqueued.
func (*Spider) GetDescription ¶
GetDescription returns the spider description.
func (*Spider) GetEnableCookie ¶
GetEnableCookie reports whether requests carry cookies.
func (*Spider) GetItemField ¶
GetItemField returns the field name at the given index, or "" if out of range.
func (*Spider) GetItemFieldIndex ¶
GetItemFieldIndex returns the index of the given field name, or -1 if not found.
func (*Spider) GetItemFields ¶
GetItemFields returns the result field names for the given rule.
func (*Spider) GetLimit ¶
GetLimit returns the crawl limit. Negative means request-count limiting; positive means custom rule-based limiting.
func (*Spider) GetSubName ¶
GetSubName returns the secondary identifier derived from Keyin (computed once).
func (*Spider) IsStopping ¶
IsStopping reports whether the spider is in the process of stopping.
func (*Spider) MustGetRule ¶
MustGetRule returns the rule with the given name (panics if missing).
func (*Spider) OutDefaultField ¶
OutDefaultField reports whether default fields (Url/ParentUrl/DownloadTime) should be included in output.
func (*Spider) ReqmatrixInit ¶
ReqmatrixInit initializes the request scheduling matrix for this spider.
func (*Spider) RequestFree ¶
func (sp *Spider) RequestFree()
func (*Spider) RequestLen ¶
func (*Spider) RequestPull ¶
RequestPull dequeues the next request from the scheduling matrix.
func (*Spider) RequestPush ¶
RequestPush enqueues a request into the scheduling matrix.
func (*Spider) RequestUse ¶
func (sp *Spider) RequestUse()
func (*Spider) SetPausetime ¶
SetPausetime sets a custom pause interval. Only overwrites an existing value when runtime[0] is true.
func (*Spider) SetTimer ¶
SetTimer configures a timer identified by id. When bell is nil, tol is a countdown sleep duration; otherwise tol specifies the wake-up occurrence.
func (*Spider) Stop ¶
func (sp *Spider) Stop()
Stop gracefully stops the spider and cancels all timers.
func (*Spider) TryFlushFailure ¶
func (sp *Spider) TryFlushFailure()
func (*Spider) TryFlushSuccess ¶
func (sp *Spider) TryFlushSuccess()
type SpiderModle ¶
type SpiderModle struct {
Name string `xml:"Name"`
Description string `xml:"Description"`
Pausetime int64 `xml:"Pausetime"`
EnableLimit bool `xml:"EnableLimit"`
EnableKeyin bool `xml:"EnableKeyin"`
EnableCookie bool `xml:"EnableCookie"`
NotDefaultField bool `xml:"NotDefaultField"`
Namespace string `xml:"Namespace>Script"`
SubNamespace string `xml:"SubNamespace>Script"`
Root string `xml:"Root>Script"`
Trunk []RuleModle `xml:"Rule"`
}
SpiderModle is the XML model for dynamic (JavaScript-based) spider rules.
type SpiderSpecies ¶
type SpiderSpecies struct {
// contains filtered or unexported fields
}
SpiderSpecies is the global registry of available spider types.
func (*SpiderSpecies) Add ¶
func (ss *SpiderSpecies) Add(sp *Spider) *Spider
Add registers a spider. If the name already exists, a numeric suffix is appended.
func (*SpiderSpecies) Get ¶
func (ss *SpiderSpecies) Get() []*Spider
Get returns all registered spiders, sorted by pinyin initials on first call. Dynamic spiders are lazily registered on first access.
func (*SpiderSpecies) GetByNameOpt ¶ added in v1.4.0
func (ss *SpiderSpecies) GetByNameOpt(name string) option.Option[*Spider]
GetByNameOpt returns the spider with the given name, wrapped in an option.Option.