Documentation
¶
Index ¶
- type CrawlRequest
- type Engine
- func (e *Engine) BufferState() storage.BufferErrorState
- func (e *Engine) MarkSeen(url string)
- func (e *Engine) PagesCrawled() int64
- func (e *Engine) Phase() string
- func (e *Engine) QueueLen() int
- func (e *Engine) ResumeSession(id string, originalSeeds []string)
- func (e *Engine) Run(seeds []string) error
- func (e *Engine) SessionID(seeds []string) string
- func (e *Engine) SetSessionID(id string)
- func (e *Engine) Stop()
- type ExtractorSetLoader
- type HostHealth
- func (hh *HostHealth) GlobalErrorRate() float64
- func (hh *HostHealth) RateLimitRate() float64
- func (hh *HostHealth) RecordFailure(host string)
- func (hh *HostHealth) RecordRateLimit(host string)
- func (hh *HostHealth) RecordSuccess(host string)
- func (hh *HostHealth) ShouldRetry(host string, maxConsecutiveFails int) bool
- type Manager
- func (m *Manager) ActiveSessions() []string
- func (m *Manager) BufferState(sessionID string) storage.BufferErrorState
- func (m *Manager) IsQueued(sessionID string) bool
- func (m *Manager) IsRunning(sessionID string) bool
- func (m *Manager) LastError(sessionID string) string
- func (m *Manager) Phase(sessionID string) string
- func (m *Manager) Progress(sessionID string) (int64, int, bool)
- func (m *Manager) QueuedSessions() []string
- func (m *Manager) RecoverOrphanedSessions(ctx context.Context)
- func (m *Manager) ResumeCrawl(sessionID string, overrides *CrawlRequest) (string, error)
- func (m *Manager) RetryFailed(sessionID string, overrides *CrawlRequest) (int, error)
- func (m *Manager) Shutdown(timeout time.Duration)
- func (m *Manager) StartCrawl(req CrawlRequest) (string, error)
- func (m *Manager) StopCrawl(sessionID string) error
- type RetryItem
- type RetryPolicy
- type RetryQueue
- type Session
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CrawlRequest ¶
type CrawlRequest struct {
Seeds []string `json:"seeds"`
MaxPages int `json:"max_pages"`
MaxDepth int `json:"max_depth"`
Workers int `json:"workers"`
Delay string `json:"delay"`
StoreHTML bool `json:"store_html"`
CrawlScope string `json:"crawl_scope"`
ProjectID *string `json:"project_id"`
CheckExternalLinks *bool `json:"check_external_links"`
ExternalLinkWorkers int `json:"external_link_workers"`
RetryStatusCode int `json:"retry_status_code"`
UserAgent string `json:"user_agent"`
CrawlSitemapOnly bool `json:"crawl_sitemap_only"`
FetchSitemaps *bool `json:"fetch_sitemaps"`
CheckPageResources *bool `json:"check_page_resources"`
ResourceWorkers int `json:"resource_workers"`
TLSProfile string `json:"tls_profile"`
JSRenderMode string `json:"js_render_mode"`
JSRenderMaxPages int `json:"js_render_max_pages"`
JSRenderTimeout string `json:"js_render_timeout"`
FollowJSLinks bool `json:"follow_js_links"`
SourceIP string `json:"source_ip"`
ForceIPv4 bool `json:"force_ipv4"`
ExtractorSetID string `json:"extractor_set_id"`
IgnoreRobots bool `json:"ignore_robots"`
}
CrawlRequest holds parameters for starting a new crawl.
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine orchestrates the crawling pipeline.
func (*Engine) BufferState ¶
func (e *Engine) BufferState() storage.BufferErrorState
BufferState returns the current buffer error state for monitoring.
func (*Engine) MarkSeen ¶ added in v0.9.0
MarkSeen marks a single URL as seen in the frontier dedup database.
func (*Engine) PagesCrawled ¶
PagesCrawled returns the current number of pages crawled.
func (*Engine) Phase ¶ added in v0.7.0
Phase returns the current engine phase (e.g. "fetching_sitemaps", "crawling").
func (*Engine) ResumeSession ¶
ResumeSession prepares the engine to resume an existing session.
func (*Engine) SessionID ¶
SessionID creates a new session for the given seeds and returns its ID without starting the crawl.
func (*Engine) SetSessionID ¶
SetSessionID sets a pre-existing session ID (for resume).
type ExtractorSetLoader ¶ added in v0.3.0
type ExtractorSetLoader interface {
GetExtractorSet(id string) (*extraction.ExtractorSet, error)
}
ExtractorSetLoader loads an extractor set by ID.
type HostHealth ¶
type HostHealth struct {
// contains filtered or unexported fields
}
HostHealth tracks success/failure rates per host and maintains a sliding window for global error rate and rate-limit rate computations.
func NewHostHealth ¶
func NewHostHealth() *HostHealth
NewHostHealth creates a new HostHealth tracker with a default sliding window.
func NewHostHealthWithWindow ¶ added in v0.9.0
func NewHostHealthWithWindow(windowSize int) *HostHealth
NewHostHealthWithWindow creates a HostHealth tracker with a custom window size.
func (*HostHealth) GlobalErrorRate ¶
func (hh *HostHealth) GlobalErrorRate() float64
GlobalErrorRate returns the error rate over the sliding window of recent requests.
func (*HostHealth) RateLimitRate ¶ added in v0.9.0
func (hh *HostHealth) RateLimitRate() float64
RateLimitRate returns the proportion of rate-limited responses (403/429) over the sliding window of recent requests.
func (*HostHealth) RecordFailure ¶
func (hh *HostHealth) RecordFailure(host string)
RecordFailure records a failed fetch for a host.
func (*HostHealth) RecordRateLimit ¶ added in v0.9.0
func (hh *HostHealth) RecordRateLimit(host string)
RecordRateLimit records a rate-limit response (403/429) for a host. Also counts as a failure.
func (*HostHealth) RecordSuccess ¶
func (hh *HostHealth) RecordSuccess(host string)
RecordSuccess records a successful fetch for a host.
func (*HostHealth) ShouldRetry ¶
func (hh *HostHealth) ShouldRetry(host string, maxConsecutiveFails int) bool
ShouldRetry reports whether the host is still eligible for fetch attempts: it returns false once the host has exceeded maxConsecutiveFails consecutive failures, and true otherwise.
type Manager ¶
type Manager struct {
// contains filtered or unexported fields
}
Manager manages running crawl engines.
func NewManager ¶
func NewManager(cfg *config.Config, store *storage.Store, extractorLoader ...ExtractorSetLoader) *Manager
NewManager creates a new crawl manager.
func (*Manager) ActiveSessions ¶
ActiveSessions returns IDs of currently running sessions.
func (*Manager) BufferState ¶
func (m *Manager) BufferState(sessionID string) storage.BufferErrorState
BufferState returns the buffer error state for a running session.
func (*Manager) LastError ¶
LastError returns the error message from the last run of a session, if any.
func (*Manager) Phase ¶ added in v0.7.0
Phase returns the current phase of a running session (e.g. "fetching_sitemaps", "crawling").
func (*Manager) QueuedSessions ¶
QueuedSessions returns the IDs of sessions waiting in the queue.
func (*Manager) RecoverOrphanedSessions ¶
RecoverOrphanedSessions marks any sessions still in "running" status as "crashed". Should be called at startup to clean up after a previous unclean shutdown.
func (*Manager) ResumeCrawl ¶
func (m *Manager) ResumeCrawl(sessionID string, overrides *CrawlRequest) (string, error)
ResumeCrawl resumes a stopped/completed session by re-crawling undiscovered links. If overrides is non-nil, its non-zero fields override the default config.
func (*Manager) RetryFailed ¶
func (m *Manager) RetryFailed(sessionID string, overrides *CrawlRequest) (int, error)
RetryFailed retries pages whose status_code is 0 (fetch errors) or matches a specific status code. It deletes the failed rows, then runs a mini-crawl with those URLs.
func (*Manager) Shutdown ¶
Shutdown gracefully stops all running engines within the given timeout. Engines still running after the timeout are marked as "crashed". Queued sessions are marked as "stopped".
func (*Manager) StartCrawl ¶
func (m *Manager) StartCrawl(req CrawlRequest) (string, error)
StartCrawl launches a new crawl session in background. Returns the session ID. If all semaphore slots are taken, the crawl is queued and starts automatically when a slot becomes available.
type RetryItem ¶
type RetryItem struct {
URL string
Host string
Depth int
FoundOn string
Attempt int
ReadyAt time.Time
LastCode int
LastErr string
// contains filtered or unexported fields
}
RetryItem represents a URL waiting to be retried.
type RetryPolicy ¶
RetryPolicy decides whether a failed request should be retried and computes delays.
func (*RetryPolicy) ComputeDelay ¶
func (p *RetryPolicy) ComputeDelay(attempt int, retryAfterHeader string) time.Duration
ComputeDelay calculates the delay before the next retry attempt. If a Retry-After header is present, it takes priority.
func (*RetryPolicy) ShouldRetry ¶
func (p *RetryPolicy) ShouldRetry(statusCode int, errString string, attempt int) bool
ShouldRetry returns true if the request should be retried based on status code, error string, and current attempt number.
type RetryQueue ¶
type RetryQueue struct {
// contains filtered or unexported fields
}
RetryQueue is a thread-safe min-heap of RetryItems ordered by ReadyAt.
func (*RetryQueue) Len ¶
func (rq *RetryQueue) Len() int
Len returns the number of items in the queue.
func (*RetryQueue) PopReady ¶
func (rq *RetryQueue) PopReady() *RetryItem
PopReady returns the next item whose ReadyAt is in the past, or nil if none are ready.
func (*RetryQueue) Push ¶
func (rq *RetryQueue) Push(item *RetryItem)
Push adds an item to the retry queue.
type Session ¶
type Session struct {
ID string
StartedAt time.Time
SeedURLs []string
Config *config.Config
Status string
Pages uint64
ProjectID *string
}
Session represents a single crawl session lifecycle.
func NewSession ¶
NewSession creates a new crawl session.
func (*Session) ToStorageRow ¶
func (s *Session) ToStorageRow() *storage.CrawlSession
ToStorageRow converts a Session to a storage model.