Documentation
¶
Index ¶
- type CrawlRequest
- type Engine
- func (e *Engine) BufferState() storage.BufferErrorState
- func (e *Engine) MarkSeen(url string)
- func (e *Engine) PagesCrawled() int64
- func (e *Engine) Phase() string
- func (e *Engine) QueueLen() int
- func (e *Engine) ResumeSession(id string, originalSeeds []string)
- func (e *Engine) Run(seeds []string) error
- func (e *Engine) SessionID(seeds []string) string
- func (e *Engine) SetSessionID(id string)
- func (e *Engine) Stop()
- type ExtractorSetLoader
- type HostHealth
- func (hh *HostHealth) GlobalErrorRate() float64
- func (hh *HostHealth) RateLimitRate() float64
- func (hh *HostHealth) RecordFailure(host string)
- func (hh *HostHealth) RecordRateLimit(host string)
- func (hh *HostHealth) RecordSuccess(host string)
- func (hh *HostHealth) ShouldRetry(host string, maxConsecutiveFails int) bool
- type Manager
- func (m *Manager) ActiveSessions() []string
- func (m *Manager) BufferState(sessionID string) storage.BufferErrorState
- func (m *Manager) IsQueued(sessionID string) bool
- func (m *Manager) IsRunning(sessionID string) bool
- func (m *Manager) LastError(sessionID string) string
- func (m *Manager) Phase(sessionID string) string
- func (m *Manager) Progress(sessionID string) (int64, int, bool)
- func (m *Manager) QueuedSessions() []string
- func (m *Manager) RecoverOrphanedSessions(ctx context.Context)
- func (m *Manager) ResumeCrawl(sessionID string, overrides *CrawlRequest) (string, error)
- func (m *Manager) RetryFailed(sessionID string, overrides *CrawlRequest) (int, error)
- func (m *Manager) Shutdown(timeout time.Duration)
- func (m *Manager) StartCrawl(req CrawlRequest) (string, error)
- func (m *Manager) StopCrawl(sessionID string) error
- type RetryItem
- type RetryPolicy
- type RetryQueue
- type Session
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CrawlRequest ¶
type CrawlRequest struct {
Seeds []string `json:"seeds"`
MaxPages int `json:"max_pages"`
MaxDepth int `json:"max_depth"`
Workers int `json:"workers"`
Delay string `json:"delay"`
StoreHTML bool `json:"store_html"`
CrawlScope string `json:"crawl_scope"`
ProjectID *string `json:"project_id"`
CheckExternalLinks *bool `json:"check_external_links"`
ExternalLinkWorkers int `json:"external_link_workers"`
RetryStatusCode int `json:"retry_status_code"`
UserAgent string `json:"user_agent"`
CrawlSitemapOnly bool `json:"crawl_sitemap_only"`
FetchSitemaps *bool `json:"fetch_sitemaps"`
CheckPageResources *bool `json:"check_page_resources"`
ResourceWorkers int `json:"resource_workers"`
TLSProfile string `json:"tls_profile"`
JSRenderMode string `json:"js_render_mode"`
JSRenderMaxPages int `json:"js_render_max_pages"`
JSRenderTimeout string `json:"js_render_timeout"`
FollowJSLinks bool `json:"follow_js_links"`
SourceIP string `json:"source_ip"`
ForceIPv4 bool `json:"force_ipv4"`
ExtractorSetID string `json:"extractor_set_id"`
IgnoreRobots bool `json:"ignore_robots"`
}
CrawlRequest holds parameters for starting a new crawl.
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine orchestrates the crawling pipeline.
func (*Engine) BufferState ¶
func (e *Engine) BufferState() storage.BufferErrorState
BufferState returns the current buffer error state for monitoring.
func (*Engine) MarkSeen ¶ added in v0.9.0
MarkSeen marks a single URL as seen in the frontier dedup database.
func (*Engine) PagesCrawled ¶
PagesCrawled returns the current number of pages crawled.
func (*Engine) Phase ¶ added in v0.7.0
Phase returns the current engine phase (e.g. "fetching_sitemaps", "crawling").
func (*Engine) ResumeSession ¶
ResumeSession prepares the engine to resume an existing session.
func (*Engine) SessionID ¶
SessionID creates a new session for the given seeds and returns its ID without starting the crawl.
func (*Engine) SetSessionID ¶
SetSessionID sets a pre-existing session ID (for resume).
type ExtractorSetLoader ¶ added in v0.3.0
type ExtractorSetLoader interface {
GetExtractorSet(id string) (*extraction.ExtractorSet, error)
}
ExtractorSetLoader loads an extractor set by ID.
type HostHealth ¶
type HostHealth struct {
// contains filtered or unexported fields
}
HostHealth tracks success/failure rates per host and maintains a sliding window for global error rate and rate-limit rate computations.
func NewHostHealth ¶
func NewHostHealth() *HostHealth
NewHostHealth creates a new HostHealth tracker with a default sliding window.
func NewHostHealthWithWindow ¶ added in v0.9.0
func NewHostHealthWithWindow(windowSize int) *HostHealth
NewHostHealthWithWindow creates a HostHealth tracker with a custom window size.
func (*HostHealth) GlobalErrorRate ¶
func (hh *HostHealth) GlobalErrorRate() float64
GlobalErrorRate returns the error rate over the sliding window of recent requests.
func (*HostHealth) RateLimitRate ¶ added in v0.9.0
func (hh *HostHealth) RateLimitRate() float64
RateLimitRate returns the proportion of rate-limited responses (403/429) over the sliding window of recent requests.
func (*HostHealth) RecordFailure ¶
func (hh *HostHealth) RecordFailure(host string)
RecordFailure records a failed fetch for a host.
func (*HostHealth) RecordRateLimit ¶ added in v0.9.0
func (hh *HostHealth) RecordRateLimit(host string)
RecordRateLimit records a rate-limit response (403/429) for a host. Also counts as a failure.
func (*HostHealth) RecordSuccess ¶
func (hh *HostHealth) RecordSuccess(host string)
RecordSuccess records a successful fetch for a host.
func (*HostHealth) ShouldRetry ¶
func (hh *HostHealth) ShouldRetry(host string, maxConsecutiveFails int) bool
ShouldRetry reports whether the host is still eligible for fetch attempts: it returns false once the host has exceeded maxConsecutiveFails consecutive failures, and true otherwise.
type Manager ¶
type Manager struct {
// contains filtered or unexported fields
}
Manager manages running crawl engines.
func NewManager ¶
func NewManager(cfg *config.Config, store *storage.Store, extractorLoader ...ExtractorSetLoader) *Manager
NewManager creates a new crawl manager.
func (*Manager) ActiveSessions ¶
ActiveSessions returns IDs of currently running sessions.
func (*Manager) BufferState ¶
func (m *Manager) BufferState(sessionID string) storage.BufferErrorState
BufferState returns the buffer error state for a running session.
func (*Manager) LastError ¶
LastError returns the error message from the last run of a session, if any.
func (*Manager) Phase ¶ added in v0.7.0
Phase returns the current phase of a running session (e.g. "fetching_sitemaps", "crawling").
func (*Manager) QueuedSessions ¶
QueuedSessions returns the IDs of sessions waiting in the queue.
func (*Manager) RecoverOrphanedSessions ¶
RecoverOrphanedSessions marks any sessions still in "running" status as "crashed". Should be called at startup to clean up after a previous unclean shutdown.
func (*Manager) ResumeCrawl ¶
func (m *Manager) ResumeCrawl(sessionID string, overrides *CrawlRequest) (string, error)
ResumeCrawl resumes a stopped/completed session by re-crawling undiscovered links. If overrides is non-nil, its non-zero fields override the default config.
func (*Manager) RetryFailed ¶
func (m *Manager) RetryFailed(sessionID string, overrides *CrawlRequest) (int, error)
RetryFailed retries pages whose status_code is 0 (fetch errors) or matches a specific status code. It deletes the failed rows, then runs a mini-crawl with those URLs.
func (*Manager) Shutdown ¶
Shutdown gracefully stops all running engines within the given timeout. Engines still running after the timeout are marked as "crashed". Queued sessions are marked as "stopped".
func (*Manager) StartCrawl ¶
func (m *Manager) StartCrawl(req CrawlRequest) (string, error)
StartCrawl launches a new crawl session in background. Returns the session ID. If all semaphore slots are taken, the crawl is queued and starts automatically when a slot becomes available.
type RetryItem ¶
type RetryItem struct {
URL string
Host string
Depth int
FoundOn string
Attempt int
ReadyAt time.Time
LastCode int
LastErr string
// contains filtered or unexported fields
}
RetryItem represents a URL waiting to be retried.
type RetryPolicy ¶
RetryPolicy decides whether a failed request should be retried and computes delays.
func (*RetryPolicy) ComputeDelay ¶
func (p *RetryPolicy) ComputeDelay(attempt int, retryAfterHeader string) time.Duration
ComputeDelay calculates the delay before the next retry attempt. If a Retry-After header is present, it takes priority.
func (*RetryPolicy) ShouldRetry ¶
func (p *RetryPolicy) ShouldRetry(statusCode int, errString string, attempt int) bool
ShouldRetry returns true if the request should be retried based on status code, error string, and current attempt number.
type RetryQueue ¶
type RetryQueue struct {
// contains filtered or unexported fields
}
RetryQueue is a thread-safe min-heap of RetryItems ordered by ReadyAt.
func (*RetryQueue) Len ¶
func (rq *RetryQueue) Len() int
Len returns the number of items in the queue.
func (*RetryQueue) PopReady ¶
func (rq *RetryQueue) PopReady() *RetryItem
PopReady returns the next item whose ReadyAt is in the past, or nil if none are ready.
func (*RetryQueue) Push ¶
func (rq *RetryQueue) Push(item *RetryItem)
Push adds an item to the retry queue.
type Session ¶
type Session struct {
ID string
StartedAt time.Time
SeedURLs []string
Config *config.Config
Status string
Pages uint64
ProjectID *string
}
Session represents a single crawl session lifecycle.
func NewSession ¶
NewSession creates a new crawl session.
func (*Session) ToStorageRow ¶
func (s *Session) ToStorageRow() *storage.CrawlSession
ToStorageRow converts a Session to a storage model.