crawler

package
v0.0.2
Published: Mar 25, 2026 License: Apache-2.0 Imports: 26 Imported by: 0

Documentation

Overview

Package crawler implements a concurrent web crawler with BFS traversal, robots.txt compliance, sitemap parsing, and HTML link extraction.

Index

Constants

This section is empty.

Variables

var ErrRedirectLoop = errors.New("redirect loop detected")

ErrRedirectLoop is returned when a redirect cycle is detected.

Functions

func DeleteCheckpoint

func DeleteCheckpoint(path string) error

DeleteCheckpoint removes a checkpoint file.

func ParseHTML

func ParseHTML(base *url.URL, body []byte) (links []string, assets []string)

ParseHTML extracts links and assets from HTML content.
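Since ParseHTML takes a base URL, relative references presumably resolve against it. A minimal sketch; the HTML snippet and URLs are illustrative:

func ExampleParseHTML() {
	base, err := url.Parse("https://example.com/docs/")
	if err != nil {
		log.Fatal(err)
	}
	body := []byte(`<a href="/about">About</a> <img src="logo.png">`)
	links, assets := crawler.ParseHTML(base, body)
	fmt.Println(links)  // hyperlinks discovered in the document
	fmt.Println(assets) // asset references such as images and scripts
}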

func ParseSitemap

func ParseSitemap(ctx context.Context, sitemapURL string) []string

ParseSitemap fetches and parses a sitemap.xml, returning discovered URLs. Gzip-compressed sitemaps (.xml.gz) are transparently decompressed.
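A sketch of seeding a crawl from a sitemap; the URL is illustrative, and the context bounds the fetch:

func ExampleParseSitemap() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	// Works for plain sitemap.xml and gzip-compressed .xml.gz alike.
	for _, u := range crawler.ParseSitemap(ctx, "https://example.com/sitemap.xml") {
		fmt.Println(u)
	}
}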

func SaveCheckpoint

func SaveCheckpoint(path string, seedURL string, frontier *Frontier, pageURLs []string) error

SaveCheckpoint writes the current frontier state to a JSON file.
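A hedged sketch of checkpointing mid-crawl; the path, host scope, and empty page list are placeholders:

func ExampleSaveCheckpoint() {
	f := crawler.NewFrontier("example.com", 1000, nil, nil)
	f.Add("https://example.com/", 0)
	// pageURLs would normally hold the URLs crawled so far; nil here for brevity.
	if err := crawler.SaveCheckpoint("crawl.json", "https://example.com/", f, nil); err != nil {
		log.Fatal(err)
	}
}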

Types

type Checkpoint

type Checkpoint struct {
	SeedURL  string          `json:"seed_url"`
	Seen     map[string]bool `json:"seen"`
	Queue    []FrontierTask  `json:"queue"`
	PageURLs []string        `json:"page_urls"`
}

Checkpoint represents a saved crawl state that can be resumed.

func LoadCheckpoint

func LoadCheckpoint(path string) (*Checkpoint, error)

LoadCheckpoint reads a checkpoint file and returns the saved state.
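A sketch of loading saved state; that a missing file yields an error is an assumption here:

func ExampleLoadCheckpoint() {
	cp, err := crawler.LoadCheckpoint("crawl.json")
	if err != nil {
		log.Fatal(err) // e.g. no checkpoint yet: start a fresh crawl instead
	}
	fmt.Println(cp.SeedURL, len(cp.Queue), len(cp.PageURLs))
}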

type CrawlCache

type CrawlCache struct {
	Entries map[string]CrawlCacheEntry `json:"entries"`
}

CrawlCache maps URLs to their cached metadata for incremental crawling.

func LoadCrawlCache

func LoadCrawlCache(path string) (*CrawlCache, error)

LoadCrawlCache reads a cache file from disk.

func NewCrawlCache

func NewCrawlCache() *CrawlCache

NewCrawlCache creates an empty CrawlCache.

func (*CrawlCache) Get

func (cc *CrawlCache) Get(url string) (CrawlCacheEntry, bool)

Get returns the cache entry for a URL, along with a boolean reporting whether the URL was cached; a zero-value entry is returned when it was not.

func (*CrawlCache) Save

func (cc *CrawlCache) Save(path string) error

Save writes the cache to disk.

func (*CrawlCache) Set

func (cc *CrawlCache) Set(url string, entry CrawlCacheEntry)

Set stores a cache entry for a URL.

type CrawlCacheEntry

type CrawlCacheEntry struct {
	LastModified string `json:"last_modified,omitempty"`
	ETag         string `json:"etag,omitempty"`
	ContentHash  string `json:"content_hash,omitempty"`
}

CrawlCacheEntry stores metadata for a previously crawled URL.
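A sketch of an incremental-crawl loop around the cache. The assumption that LoadCrawlCache fails when no file exists (falling back to a fresh cache) is mine, and the header values are illustrative:

func ExampleCrawlCache() {
	cache, err := crawler.LoadCrawlCache("cache.json")
	if err != nil {
		cache = crawler.NewCrawlCache() // no cache on disk yet
	}
	u := "https://example.com/"
	if entry, ok := cache.Get(u); ok {
		// Use entry.ETag / entry.LastModified to send conditional requests.
		_ = entry
	}
	cache.Set(u, crawler.CrawlCacheEntry{
		ETag:         `"abc123"`,
		LastModified: "Mon, 02 Jan 2006 15:04:05 GMT",
	})
	if err := cache.Save("cache.json"); err != nil {
		log.Fatal(err)
	}
}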

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler orchestrates concurrent web crawling.

func NewCrawler

func NewCrawler(cfg *config.Config, fetcher Fetcher, l logger.Logger) *Crawler

NewCrawler creates a new Crawler with the given configuration and fetcher.

func (*Crawler) Run

func (c *Crawler) Run(ctx context.Context, seedURL string) (*model.CrawlResult, error)

Run crawls starting from seedURL and returns the collected results.
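An end-to-end sketch. The config and logger values are placeholders: config.Config's fields and a concrete logger.Logger come from the host application's packages and are not documented on this page.

func main() {
	cfg := &config.Config{} // placeholder: populate from your real configuration
	var l logger.Logger     // placeholder: supply a concrete logger implementation

	fetcher, cleanup, err := crawler.NewFetcher(cfg, l)
	if err != nil {
		log.Fatal(err)
	}
	defer cleanup()

	c := crawler.NewCrawler(cfg, fetcher, l)
	result, err := c.Run(context.Background(), "https://example.com/")
	if err != nil {
		log.Fatal(err)
	}
	_ = result // model.CrawlResult's fields are defined in the model package
}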

type Fetcher

type Fetcher interface {
	Fetch(ctx context.Context, url string) (*model.Page, error)
}

Fetcher fetches a URL and returns a Page.
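Because Fetcher is a one-method interface, a test double is straightforward; model.Page's fields are not shown on this page, so the zero value stands in for a real response:

type stubFetcher struct{}

func (stubFetcher) Fetch(ctx context.Context, url string) (*model.Page, error) {
	return &model.Page{}, nil
}

Anything satisfying the interface can be passed to NewCrawler, which keeps crawl logic testable without network access.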

func NewFetcher

func NewFetcher(cfg *config.Config, l logger.Logger) (Fetcher, func(), error)

NewFetcher creates the appropriate Fetcher based on configuration. It returns the fetcher, a cleanup function that should be called when done, and any error encountered during creation.

type Frontier

type Frontier struct {
	// contains filtered or unexported fields
}

Frontier is a thread-safe BFS queue with deduplication and scope enforcement.

func NewFrontier

func NewFrontier(seedHost string, maxPages int, includePatterns, excludePatterns []string) *Frontier

NewFrontier creates a new Frontier scoped to the given seed host.

func RestoreFrontier

func RestoreFrontier(cp *Checkpoint, seedHost string, maxPages int, include, exclude []string) *Frontier

RestoreFrontier creates a Frontier pre-populated from a Checkpoint.
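A resume sketch; the scope arguments presumably need to match the original crawl so deduplication stays consistent:

func ExampleRestoreFrontier() {
	cp, err := crawler.LoadCheckpoint("crawl.json")
	if err != nil {
		log.Fatal(err)
	}
	f := crawler.RestoreFrontier(cp, "example.com", 1000, nil, nil)
	fmt.Println(f.Len(), f.Seen()) // queued and seen counts carry over
}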

func (*Frontier) Add

func (f *Frontier) Add(rawURL string, depth int) bool

Add enqueues a URL at the given depth if it is in scope, not a duplicate, and under the cap.

func (*Frontier) Dequeue

func (f *Frontier) Dequeue() (FrontierTask, bool)

Dequeue removes and returns the next task from the front of the queue; the boolean is false when the queue is empty.

func (*Frontier) Len

func (f *Frontier) Len() int

Len returns the current number of items in the queue.

func (*Frontier) Seen

func (f *Frontier) Seen() int

Seen returns the total number of unique URLs that have been added.
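A sketch of the Add/Dequeue cycle; scope and dedup behavior follow the descriptions above, and the URLs are illustrative:

func ExampleFrontier() {
	f := crawler.NewFrontier("example.com", 100, nil, nil)
	f.Add("https://example.com/", 0)
	if task, ok := f.Dequeue(); ok {
		// Links found on the fetched page go back in one level deeper.
		f.Add("https://example.com/about", task.Depth+1)
		f.Add("https://other.example/", task.Depth+1)     // out of scope: rejected
		f.Add("https://example.com/about", task.Depth+1)  // duplicate: rejected
	}
	fmt.Println(f.Len(), f.Seen())
}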

type FrontierTask

type FrontierTask struct {
	URL   string `json:"url"`
	Depth int    `json:"depth"`
}

FrontierTask represents a URL to be crawled along with its depth from the seed.

type HTTPFetcher

type HTTPFetcher struct {
	// contains filtered or unexported fields
}

HTTPFetcher implements Fetcher using net/http.

func NewHTTPFetcher

func NewHTTPFetcher(userAgent string, timeout time.Duration, l logger.Logger) *HTTPFetcher

NewHTTPFetcher creates a new HTTPFetcher with the given user agent and per-request timeout.

func (*HTTPFetcher) Fetch

func (f *HTTPFetcher) Fetch(ctx context.Context, rawURL string) (*model.Page, error)

Fetch retrieves the given URL and returns a populated Page.
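A sketch of direct use; the logger is a placeholder, and checking for ErrRedirectLoop assumes Fetch surfaces that sentinel on redirect cycles:

func ExampleHTTPFetcher() {
	var l logger.Logger // placeholder: supply a concrete logger
	f := crawler.NewHTTPFetcher("mybot/1.0 (+https://example.com/bot)", 10*time.Second, l)
	page, err := f.Fetch(context.Background(), "https://example.com/")
	if errors.Is(err, crawler.ErrRedirectLoop) {
		log.Println("skipping: redirect cycle")
		return
	}
	if err != nil {
		log.Fatal(err)
	}
	_ = page
}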

type RobotsChecker

type RobotsChecker struct {
	// contains filtered or unexported fields
}

RobotsChecker fetches, parses, and caches robots.txt files per host.

func NewRobotsChecker

func NewRobotsChecker(userAgent string, l logger.Logger) *RobotsChecker

NewRobotsChecker creates a new RobotsChecker that matches against the given user agent.

func (*RobotsChecker) IsAllowed

func (rc *RobotsChecker) IsAllowed(ctx context.Context, rawURL string) bool

IsAllowed reports whether the given URL is permitted by the host's robots.txt.
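A sketch of gating fetches on robots.txt; since results are cached per host, repeated checks against the same host should not refetch robots.txt. The logger is a placeholder:

func ExampleRobotsChecker() {
	var l logger.Logger // placeholder: supply a concrete logger
	rc := crawler.NewRobotsChecker("mybot/1.0", l)
	allowed := rc.IsAllowed(context.Background(), "https://example.com/some/page")
	fmt.Println(allowed)
}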
