Documentation
¶
Index ¶
- Constants
- Variables
- type FetchError
- type Item
- func (i *Item) Clone() *Item
- func (i *Item) Delete(key string)
- func (i *Item) Get(key string) (any, bool)
- func (i *Item) GetString(key string) string
- func (i *Item) Has(key string) bool
- func (i *Item) Keys() []string
- func (i *Item) Set(key string, value any)
- func (i *Item) ToFlatMap() map[string]string
- func (i *Item) ToJSON() ([]byte, error)
- type ParseError
- type PipelineError
- type Request
- type Response
- type StorageError
Constants ¶
// Priority levels for request scheduling.
//
// Lower numeric values are scheduled ahead of higher ones, matching the
// Request.Priority field ("lower = higher priority").
const (
	PriorityHighest = 0
	PriorityHigh    = 1
	PriorityNormal  = 2
	PriorityLow     = 3
	PriorityLowest  = 4
)
Priority levels for request scheduling. Lower values are scheduled first (see Request.Priority).
Variables ¶
// Sentinel errors for common failure modes.
//
// Prefer matching these with errors.Is rather than ==, since they may reach
// the caller wrapped inside another error (for example one of this package's
// *Error types, which expose Unwrap).
var (
	ErrTimeout        = errors.New("request timed out")
	ErrMaxRetries     = errors.New("max retries exceeded")
	ErrBlocked        = errors.New("blocked by robots.txt")
	ErrMaxDepth       = errors.New("max depth exceeded")
	ErrDuplicate      = errors.New("duplicate URL")
	ErrEmptyResponse  = errors.New("empty response body")
	ErrInvalidURL     = errors.New("invalid URL")
	ErrCrawlStopped   = errors.New("crawl has been stopped")
	ErrNoFetcher      = errors.New("no fetcher available for request")
	ErrProxyExhausted = errors.New("all proxies exhausted")
)
Sentinel errors for common failure modes.
Functions ¶
This section is empty.
Types ¶
type FetchError ¶
// FetchError wraps errors that occur during fetching.
type FetchError struct {
// URL is the URL associated with the failed fetch.
// NOTE(review): presumably the request URL rather than a post-redirect URL — confirm.
URL string
// StatusCode is the HTTP status code tied to the failure.
// NOTE(review): presumably zero when no HTTP response was received — confirm.
StatusCode int
// Err is the underlying error; presumably what Unwrap returns — verify against the implementation.
Err error
// Retryable marks whether the fetch may be attempted again;
// presumably what IsRetryable reports — verify against the implementation.
Retryable bool
RetryAfter time.Duration // populated from Retry-After header on HTTP 429
}
FetchError wraps errors that occur during fetching.
func (*FetchError) Error ¶
func (e *FetchError) Error() string
func (*FetchError) IsRetryable ¶
func (e *FetchError) IsRetryable() bool
func (*FetchError) Unwrap ¶
func (e *FetchError) Unwrap() error
type Item ¶
// Item represents a single scraped data record.
type Item struct {
// Fields stores the extracted key-value data.
// NOTE(review): the Get/Set/Delete/Has/Keys methods presumably operate on
// this map — confirm against their implementations.
Fields map[string]any
// URL is the source page URL this item was extracted from.
URL string
// SpiderName identifies which spider produced this item.
SpiderName string
// Timestamp is when this item was created.
Timestamp time.Time
// Depth is the crawl depth at which this item was found.
Depth int
// Checksum is a hash of the item content for deduplication.
// NOTE(review): the hash algorithm and which fields it covers are not
// visible here — check where Checksum is computed.
Checksum string
}
Item represents a single scraped data record.
type ParseError ¶
ParseError wraps errors that occur during parsing.
func (*ParseError) Error ¶
func (e *ParseError) Error() string
func (*ParseError) Unwrap ¶
func (e *ParseError) Unwrap() error
type PipelineError ¶
PipelineError wraps errors that occur in the processing pipeline.
func (*PipelineError) Error ¶
func (e *PipelineError) Error() string
func (*PipelineError) Unwrap ¶
func (e *PipelineError) Unwrap() error
type Request ¶
// Request represents an HTTP request to be fetched by the crawler.
type Request struct {
// URL is the target URL to fetch.
URL *url.URL
// Method is the HTTP method (GET, POST, etc.). Defaults to GET.
Method string
// Headers are custom HTTP headers to send with the request.
Headers http.Header
// Body is the request body for POST/PUT requests.
Body []byte
// Depth is the crawl depth from the seed URL.
Depth int
// Priority controls scheduling order (lower = higher priority).
// The Priority* constants (PriorityHighest = 0 through PriorityLowest = 4)
// provide named levels for this field.
Priority int
// MaxRetries is the maximum number of retries for this request.
MaxRetries int
// RetryCount tracks the current retry attempt.
RetryCount int
// Timeout overrides the global request timeout for this request.
// NOTE(review): presumably a zero value means "use the global timeout" — confirm.
Timeout time.Duration
// Meta stores arbitrary metadata attached to this request.
Meta map[string]any
// Tag categorizes this request (e.g., "listing", "detail", "pagination").
Tag string
// FetcherType specifies which fetcher to use: "http" or "browser".
FetcherType string
// Callbacks are the names of callback functions to invoke on response.
Callbacks []string
// ParentURL tracks which page this request was discovered on.
ParentURL string
// CreatedAt is when this request was created.
CreatedAt time.Time
// ID is a unique identifier for this request.
ID string
}
Request represents an HTTP request to be fetched by the crawler.
func NewRequest ¶
NewRequest creates a new Request with sensible defaults.
type Response ¶
// Response represents the result of fetching a request.
type Response struct {
// StatusCode is the HTTP status code.
StatusCode int
// Headers are the response HTTP headers.
Headers http.Header
// Body is the raw response body bytes.
Body []byte
// Request is a reference to the original request.
Request *Request
// ContentType is the MIME type of the response.
ContentType string
// ContentLength is the size of the response body in bytes.
ContentLength int64
// FinalURL is the URL after any redirects.
FinalURL string
// Doc is a parsed goquery document (lazily loaded).
// NOTE(review): presumably nil until something parses Body — confirm
// where the lazy load happens before dereferencing.
Doc *goquery.Document
// FetchDuration is how long the fetch took.
FetchDuration time.Duration
// FetchedAt is when this response was received.
FetchedAt time.Time
// Meta stores arbitrary metadata.
Meta map[string]any
}
Response represents the result of fetching a request.
func NewBrowserResponse ¶
func NewBrowserResponse(req *Request, statusCode int, body []byte, finalURL string, duration time.Duration) *Response
NewBrowserResponse creates a Response from headless browser output.
func NewResponse ¶
func NewResponse(req *Request, httpResp *http.Response, body []byte, duration time.Duration) *Response
NewResponse creates a Response from an http.Response.
func (*Response) IsClientError ¶
IsClientError reports whether the response status code is in the 4xx range.
func (*Response) IsRedirect ¶
IsRedirect reports whether the response status code is in the 3xx range.
func (*Response) IsServerError ¶
IsServerError reports whether the response status code is in the 5xx range.
type StorageError ¶
StorageError wraps errors that occur during storage/export.
func (*StorageError) Error ¶
func (e *StorageError) Error() string
func (*StorageError) Unwrap ¶
func (e *StorageError) Unwrap() error