types

package
v0.0.0-...-76f6d82 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 8, 2026 License: GPL-3.0 Imports: 8 Imported by: 0

Documentation

Index

Constants

View Source
// Scheduling priorities, numbered consecutively from 0 (PriorityHighest)
// through 4 (PriorityLowest).
const (
	PriorityHighest = iota // 0
	PriorityHigh           // 1
	PriorityNormal         // 2
	PriorityLow            // 3
	PriorityLowest         // 4
)

Priority levels for request scheduling; a lower value means a higher priority.

Variables

View Source
// Sentinel errors for common failure modes. Each one is a distinct value
// created with errors.New.
var (
	// ErrTimeout indicates that a request timed out.
	ErrTimeout = errors.New("request timed out")

	// ErrMaxRetries indicates that the maximum number of retries was exceeded.
	ErrMaxRetries = errors.New("max retries exceeded")

	// ErrBlocked indicates that the request is blocked by robots.txt.
	ErrBlocked = errors.New("blocked by robots.txt")

	// ErrMaxDepth indicates that the maximum depth was exceeded.
	ErrMaxDepth = errors.New("max depth exceeded")

	// ErrDuplicate indicates a duplicate URL.
	ErrDuplicate = errors.New("duplicate URL")

	// ErrEmptyResponse indicates an empty response body.
	ErrEmptyResponse = errors.New("empty response body")

	// ErrInvalidURL indicates an invalid URL.
	ErrInvalidURL = errors.New("invalid URL")

	// ErrCrawlStopped indicates that the crawl has been stopped.
	ErrCrawlStopped = errors.New("crawl has been stopped")

	// ErrNoFetcher indicates that no fetcher is available for a request.
	ErrNoFetcher = errors.New("no fetcher available for request")

	// ErrProxyExhausted indicates that all proxies are exhausted.
	ErrProxyExhausted = errors.New("all proxies exhausted")
)

Sentinel errors for common failure modes.

Functions

This section is empty.

Types

type FetchError

// FetchError wraps errors that occur during fetching.
type FetchError struct {
	// NOTE(review): presumably the URL whose fetch failed — confirm against callers.
	URL        string
	// NOTE(review): presumably the HTTP status code received, if any — confirm.
	StatusCode int
	// Err is the wrapped error. NOTE(review): presumably what Unwrap returns — confirm.
	Err        error
	// NOTE(review): presumably the value reported by IsRetryable — confirm.
	Retryable  bool
	RetryAfter time.Duration // populated from Retry-After header on HTTP 429
}

FetchError wraps errors that occur during fetching.

func (*FetchError) Error

func (e *FetchError) Error() string

func (*FetchError) IsRetryable

func (e *FetchError) IsRetryable() bool

func (*FetchError) Unwrap

func (e *FetchError) Unwrap() error

type Item

// Item represents a single scraped data record.
type Item struct {
	// Fields stores the extracted key-value data.
	Fields map[string]any

	// URL is the source page URL this item was extracted from.
	URL string

	// SpiderName identifies which spider produced this item.
	SpiderName string

	// Timestamp is when this item was created.
	Timestamp time.Time

	// Depth is the crawl depth at which this item was found.
	Depth int

	// Checksum is a hash of the item content for deduplication.
	// NOTE(review): the hash algorithm and who populates it are not visible here — confirm.
	Checksum string
}

Item represents a single scraped data record.

func NewItem

func NewItem(sourceURL string) *Item

NewItem creates a new empty Item from a source URL.

func (*Item) Clone

func (i *Item) Clone() *Item

Clone creates a deep copy of the item.

func (*Item) Delete

func (i *Item) Delete(key string)

Delete removes a field.

func (*Item) Get

func (i *Item) Get(key string) (any, bool)

Get retrieves a field value.

func (*Item) GetString

func (i *Item) GetString(key string) string

GetString retrieves a field value as a string.

func (*Item) Has

func (i *Item) Has(key string) bool

Has returns true if the field exists.

func (*Item) Keys

func (i *Item) Keys() []string

Keys returns all field names.

func (*Item) Set

func (i *Item) Set(key string, value any)

Set sets a field value.

func (*Item) ToFlatMap

func (i *Item) ToFlatMap() map[string]string

ToFlatMap returns a flat map suitable for CSV export.

func (*Item) ToJSON

func (i *Item) ToJSON() ([]byte, error)

ToJSON serializes the item to JSON bytes.

type ParseError

// ParseError wraps errors that occur during parsing.
type ParseError struct {
	// NOTE(review): presumably the URL of the page being parsed — confirm.
	URL      string
	// NOTE(review): presumably the selector that was being applied — confirm.
	Selector string
	// Err is the wrapped error. NOTE(review): presumably what Unwrap returns — confirm.
	Err      error
}

ParseError wraps errors that occur during parsing.

func (*ParseError) Error

func (e *ParseError) Error() string

func (*ParseError) Unwrap

func (e *ParseError) Unwrap() error

type PipelineError

// PipelineError wraps errors that occur in the processing pipeline.
type PipelineError struct {
	// NOTE(review): presumably the name of the pipeline stage that failed — confirm.
	Stage string
	// NOTE(review): presumably the item being processed when the error occurred; may be nil — confirm.
	Item  *Item
	// Err is the wrapped error. NOTE(review): presumably what Unwrap returns — confirm.
	Err   error
}

PipelineError wraps errors that occur in the processing pipeline.

func (*PipelineError) Error

func (e *PipelineError) Error() string

func (*PipelineError) Unwrap

func (e *PipelineError) Unwrap() error

type Request

// Request represents an HTTP request to be fetched by the crawler.
type Request struct {
	// URL is the target URL to fetch.
	URL *url.URL

	// Method is the HTTP method (GET, POST, etc.). Defaults to GET.
	Method string

	// Headers are custom HTTP headers to send with the request.
	Headers http.Header

	// Body is the request body for POST/PUT requests.
	Body []byte

	// Depth is the crawl depth from the seed URL.
	Depth int

	// Priority controls scheduling order (lower = higher priority).
	// The package's priority constants run from PriorityHighest (0) to
	// PriorityLowest (4).
	Priority int

	// MaxRetries is the maximum number of retries for this request.
	MaxRetries int

	// RetryCount tracks the current retry attempt.
	RetryCount int

	// Timeout overrides the global request timeout for this request.
	Timeout time.Duration

	// Meta stores arbitrary metadata attached to this request.
	Meta map[string]any

	// Tag categorizes this request (e.g., "listing", "detail", "pagination").
	Tag string

	// FetcherType specifies which fetcher to use: "http" or "browser".
	FetcherType string

	// Callbacks are the names of callback functions to invoke on response.
	Callbacks []string

	// ParentURL tracks which page this request was discovered on.
	ParentURL string

	// CreatedAt is when this request was created.
	CreatedAt time.Time

	// ID is a unique identifier for this request.
	ID string
}

Request represents an HTTP request to be fetched by the crawler.

func NewRequest

func NewRequest(rawURL string) (*Request, error)

NewRequest creates a new Request with sensible defaults.

func (*Request) Clone

func (r *Request) Clone() *Request

Clone creates a deep copy of the request.

func (*Request) Domain

func (r *Request) Domain() string

Domain returns the hostname of the request URL.

func (*Request) URLString

func (r *Request) URLString() string

URLString returns the string representation of the request URL.

type Response

// Response represents the result of fetching a request.
type Response struct {
	// StatusCode is the HTTP status code.
	StatusCode int

	// Headers are the response HTTP headers.
	Headers http.Header

	// Body is the raw response body bytes.
	Body []byte

	// Request is a reference to the original request.
	Request *Request

	// ContentType is the MIME type of the response.
	ContentType string

	// ContentLength is the size of the response body in bytes.
	ContentLength int64

	// FinalURL is the URL after any redirects.
	FinalURL string

	// Doc is a parsed goquery document (lazily loaded).
	// The Document method is documented as returning the parsed document and
	// lazily initializing it.
	Doc *goquery.Document

	// FetchDuration is how long the fetch took.
	FetchDuration time.Duration

	// FetchedAt is when this response was received.
	FetchedAt time.Time

	// Meta stores arbitrary metadata.
	Meta map[string]any
}

Response represents the result of fetching a request.

func NewBrowserResponse

func NewBrowserResponse(req *Request, statusCode int, body []byte, finalURL string, duration time.Duration) *Response

NewBrowserResponse creates a Response from headless browser output.

func NewResponse

func NewResponse(req *Request, httpResp *http.Response, body []byte, duration time.Duration) *Response

NewResponse creates a Response from an http.Response.

func (*Response) Document

func (r *Response) Document() (*goquery.Document, error)

Document returns a parsed goquery document, lazily initializing it.

func (*Response) IsClientError

func (r *Response) IsClientError() bool

IsClientError returns true if the response status is 4xx.

func (*Response) IsRedirect

func (r *Response) IsRedirect() bool

IsRedirect returns true if the response status is 3xx.

func (*Response) IsServerError

func (r *Response) IsServerError() bool

IsServerError returns true if the response status is 5xx.

func (*Response) IsSuccess

func (r *Response) IsSuccess() bool

IsSuccess returns true if the response status is 2xx.

type StorageError

// StorageError wraps errors that occur during storage/export.
type StorageError struct {
	// NOTE(review): presumably the name of the storage backend that failed — confirm.
	Backend string
	// Err is the wrapped error. NOTE(review): presumably what Unwrap returns — confirm.
	Err     error
}

StorageError wraps errors that occur during storage/export.

func (*StorageError) Error

func (e *StorageError) Error() string

func (*StorageError) Unwrap

func (e *StorageError) Unwrap() error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL