crawler

package
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 29, 2025 License: MIT Imports: 9 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ConcurrentCrawler

type ConcurrentCrawler struct {
	*Crawler // Embed the original crawler
	// contains filtered or unexported fields
}

ConcurrentCrawler handles concurrent crawling using a worker pool.

func NewConcurrentCrawler

func NewConcurrentCrawler(config *Config) (*ConcurrentCrawler, error)

NewConcurrentCrawler creates a new concurrent crawler with a worker pool.

func (*ConcurrentCrawler) Cancel

func (cc *ConcurrentCrawler) Cancel()

Cancel cancels the crawling operation

func (*ConcurrentCrawler) CrawlConcurrent

func (cc *ConcurrentCrawler) CrawlConcurrent(startURL string) ([]CrawlResult, *CrawlStats, error)

CrawlConcurrent performs concurrent crawling starting from the given URL

func (*ConcurrentCrawler) GetResults

func (cc *ConcurrentCrawler) GetResults() []CrawlResult

GetResults returns the crawling results (thread-safe)

func (*ConcurrentCrawler) GetStats

func (cc *ConcurrentCrawler) GetStats() *CrawlStats

GetStats returns the crawling statistics (thread-safe)

type Config

type Config struct {
	MaxDepth       int              // Maximum depth to crawl (-1 = no limit, 0 = root only)
	SameDomain     bool             // Whether to limit crawling to same domain
	UserAgent      string           // User agent to use for requests
	Timeout        time.Duration    // Request timeout
	Logger         *slog.Logger     // Logger instance
	Workers        int              // Number of concurrent workers
	ShowProgress   bool             // Whether to show progress indicators
	ProgressConfig *progress.Config // Progress reporting configuration
}

Config holds configuration for the crawler

func DefaultConfig

func DefaultConfig() *Config

DefaultConfig returns a default crawler configuration

type CrawlJob

type CrawlJob struct {
	URL   string // URL to crawl
	Depth int    // Depth of this URL in the crawl tree
}

CrawlJob represents a job to be processed by a worker

type CrawlResult

type CrawlResult struct {
	URL          string        // The URL that was crawled
	Depth        int           // The depth at which this URL was found
	Links        []string      // Links found on this page
	Error        error         // Error if crawling failed
	FetchTime    time.Time     // When this URL was crawled
	ResponseTime time.Duration // Time taken to fetch this URL
	StatusCode   int           // HTTP status code
}

CrawlResult represents the result of crawling a single URL

type CrawlStats

type CrawlStats struct {
	TotalURLs       int           // Total URLs discovered
	CrawledURLs     int           // URLs successfully crawled
	FailedURLs      int           // URLs that failed to crawl
	SkippedURLs     int           // URLs skipped (duplicates, depth limit)
	MaxDepthReached int           // Maximum depth reached
	TotalTime       time.Duration // Total crawling time
	StartTime       time.Time     // When crawling started
}

CrawlStats holds statistics about the crawling process

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler represents a web crawler instance with recursive capabilities

func New

func New(config *Config) (*Crawler, error)

New creates a new crawler instance with the given configuration

func (*Crawler) CrawlRecursive

func (c *Crawler) CrawlRecursive(startURL string) ([]CrawlResult, *CrawlStats, error)

CrawlRecursive performs recursive crawling starting from the given URL

func (*Crawler) GetAllURLs

func (c *Crawler) GetAllURLs() []string

GetAllURLs returns all discovered URLs (both crawled and failed)

func (*Crawler) GetResults

func (c *Crawler) GetResults() []CrawlResult

GetResults returns the crawling results

func (*Crawler) GetStats

func (c *Crawler) GetStats() *CrawlStats

GetStats returns the crawling statistics

func (*Crawler) GetSuccessfulURLs

func (c *Crawler) GetSuccessfulURLs() []string

GetSuccessfulURLs returns only successfully crawled URLs

func (*Crawler) Reset

func (c *Crawler) Reset()

Reset clears the crawler state for a new crawling session

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL