Documentation ¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ConcurrentCrawler ¶
type ConcurrentCrawler struct {
*Crawler // Embed the original crawler
// contains filtered or unexported fields
}
ConcurrentCrawler handles concurrent crawling with a worker pool
func NewConcurrentCrawler ¶
func NewConcurrentCrawler(config *Config) (*ConcurrentCrawler, error)
NewConcurrentCrawler creates a new concurrent crawler with a worker pool
func (*ConcurrentCrawler) Cancel ¶
func (cc *ConcurrentCrawler) Cancel()
Cancel cancels the crawling operation
func (*ConcurrentCrawler) CrawlConcurrent ¶
func (cc *ConcurrentCrawler) CrawlConcurrent(startURL string) ([]CrawlResult, *CrawlStats, error)
CrawlConcurrent performs concurrent crawling starting from the given URL
func (*ConcurrentCrawler) GetResults ¶
func (cc *ConcurrentCrawler) GetResults() []CrawlResult
GetResults returns the crawling results (thread-safe)
func (*ConcurrentCrawler) GetStats ¶
func (cc *ConcurrentCrawler) GetStats() *CrawlStats
GetStats returns the crawling statistics (thread-safe)
type Config ¶
type Config struct {
MaxDepth int // Maximum depth to crawl (-1 = no limit, 0 = root only)
SameDomain bool // Whether to limit crawling to same domain
UserAgent string // User agent to use for requests
Timeout time.Duration // Request timeout
Logger *slog.Logger // Logger instance
Workers int // Number of concurrent workers
ShowProgress bool // Whether to show progress indicators
ProgressConfig *progress.Config // Progress reporting configuration
}
Config holds configuration for the crawler
func DefaultConfig ¶
func DefaultConfig() *Config
DefaultConfig returns a default crawler configuration
type CrawlJob ¶
type CrawlJob struct {
URL string // URL to crawl
Depth int // Depth of this URL in the crawl tree
}
CrawlJob represents a job to be processed by a worker
type CrawlResult ¶
type CrawlResult struct {
URL string // The URL that was crawled
Depth int // The depth at which this URL was found
Links []string // Links found on this page
Error error // Error if crawling failed
FetchTime time.Time // When this URL was crawled
ResponseTime time.Duration // Time taken to fetch this URL
StatusCode int // HTTP status code
}
CrawlResult represents the result of crawling a single URL
type CrawlStats ¶
type CrawlStats struct {
TotalURLs int // Total URLs discovered
CrawledURLs int // URLs successfully crawled
FailedURLs int // URLs that failed to crawl
SkippedURLs int // URLs skipped (duplicates, depth limit)
MaxDepthReached int // Maximum depth reached
TotalTime time.Duration // Total crawling time
StartTime time.Time // When crawling started
}
CrawlStats holds statistics about the crawling process
type Crawler ¶
type Crawler struct {
// contains filtered or unexported fields
}
Crawler represents a web crawler instance with recursive capabilities
func (*Crawler) CrawlRecursive ¶
func (c *Crawler) CrawlRecursive(startURL string) ([]CrawlResult, *CrawlStats, error)
CrawlRecursive performs recursive crawling starting from the given URL
func (*Crawler) GetAllURLs ¶
GetAllURLs returns all discovered URLs (both crawled and failed)
func (*Crawler) GetResults ¶
func (c *Crawler) GetResults() []CrawlResult
GetResults returns the crawling results
func (*Crawler) GetStats ¶
func (c *Crawler) GetStats() *CrawlStats
GetStats returns the crawling statistics
func (*Crawler) GetSuccessfulURLs ¶
GetSuccessfulURLs returns only successfully crawled URLs