crawler

package
v0.3.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 29, 2025 License: MIT Imports: 9 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type ConcurrentCrawler

type ConcurrentCrawler struct {
	*Crawler // Embed the original crawler
	// contains filtered or unexported fields
}

ConcurrentCrawler handles concurrent crawling using a worker pool.

func NewConcurrentCrawler

func NewConcurrentCrawler(config *Config) (*ConcurrentCrawler, error)

NewConcurrentCrawler creates a new concurrent crawler with a worker pool.

func (*ConcurrentCrawler) Cancel

func (cc *ConcurrentCrawler) Cancel()

Cancel cancels the crawling operation

func (*ConcurrentCrawler) CrawlConcurrent

func (cc *ConcurrentCrawler) CrawlConcurrent(startURL string) ([]CrawlResult, *CrawlStats, error)

CrawlConcurrent performs concurrent crawling starting from the given URL

func (*ConcurrentCrawler) GetResults

func (cc *ConcurrentCrawler) GetResults() []CrawlResult

GetResults returns the crawling results (thread-safe)

func (*ConcurrentCrawler) GetStats

func (cc *ConcurrentCrawler) GetStats() *CrawlStats

GetStats returns the crawling statistics (thread-safe)

type Config

type Config struct {
	MaxDepth       int              // Maximum depth to crawl (-1 = no limit, 0 = root only)
	SameDomain     bool             // Whether to limit crawling to same domain
	UserAgent      string           // User agent to use for requests
	Timeout        time.Duration    // Request timeout
	Logger         *slog.Logger     // Logger instance
	Workers        int              // Number of concurrent workers
	ShowProgress   bool             // Whether to show progress indicators
	ProgressConfig *progress.Config // Progress reporting configuration
}

Config holds configuration for the crawler

func DefaultConfig

func DefaultConfig() *Config

DefaultConfig returns a default crawler configuration

type CrawlJob

type CrawlJob struct {
	URL   string // URL to crawl
	Depth int    // Depth of this URL in the crawl tree
}

CrawlJob represents a job to be processed by a worker

type CrawlResult

type CrawlResult struct {
	URL          string        // The URL that was crawled
	Depth        int           // The depth at which this URL was found
	Links        []string      // Links found on this page
	Error        error         // Error if crawling failed
	FetchTime    time.Time     // When this URL was crawled
	ResponseTime time.Duration // Time taken to fetch this URL
	StatusCode   int           // HTTP status code
}

CrawlResult represents the result of crawling a single URL

type CrawlStats

type CrawlStats struct {
	TotalURLs       int           // Total URLs discovered
	CrawledURLs     int           // URLs successfully crawled
	FailedURLs      int           // URLs that failed to crawl
	SkippedURLs     int           // URLs skipped (duplicates, depth limit)
	MaxDepthReached int           // Maximum depth reached
	TotalTime       time.Duration // Total crawling time
	StartTime       time.Time     // When crawling started
}

CrawlStats holds statistics about the crawling process

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler represents a web crawler instance with recursive capabilities

func New

func New(config *Config) (*Crawler, error)

New creates a new crawler instance with the given configuration

func (*Crawler) CrawlRecursive

func (c *Crawler) CrawlRecursive(startURL string) ([]CrawlResult, *CrawlStats, error)

CrawlRecursive performs recursive crawling starting from the given URL

func (*Crawler) GetAllURLs

func (c *Crawler) GetAllURLs() []string

GetAllURLs returns all discovered URLs (both crawled and failed)

func (*Crawler) GetResults

func (c *Crawler) GetResults() []CrawlResult

GetResults returns the crawling results

func (*Crawler) GetStats

func (c *Crawler) GetStats() *CrawlStats

GetStats returns the crawling statistics

func (*Crawler) GetSuccessfulURLs

func (c *Crawler) GetSuccessfulURLs() []string

GetSuccessfulURLs returns only successfully crawled URLs

func (*Crawler) Reset

func (c *Crawler) Reset()

Reset clears the crawler state for a new crawling session

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL