crawler

package
v0.30.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 28, 2026 License: MIT Imports: 23 Imported by: 0

Documentation

Index

Constants

View Source
const MaxBodySampleSize = 50 * 1024

MaxBodySampleSize is the maximum size of body sample stored for tech detection (50KB)

Variables

This section is empty.

Functions

func IsPathAllowed

func IsPathAllowed(rules *RobotsRules, path string) bool

IsPathAllowed checks if a path is allowed by robots.txt rules

Types

type CacheCheckAttempt

type CacheCheckAttempt struct {
	Attempt     int    `json:"attempt"`
	CacheStatus string `json:"cache_status"`
	Delay       int    `json:"delay_ms"`
	// Diagnostics duplicates attempt metadata for backward-compatible probe history.
	Diagnostics *ProbeDiagnostics `json:"diagnostics,omitempty"`
}

CacheCheckAttempt stores the result of a single cache status check.

type CacheMetadata

type CacheMetadata struct {
	HeaderSource     string `json:"header_source,omitempty"`
	RawValue         string `json:"raw_value,omitempty"`
	NormalisedStatus string `json:"normalised_status,omitempty"`
	Age              string `json:"age,omitempty"`
	CacheControl     string `json:"cache_control,omitempty"`
	Vary             string `json:"vary,omitempty"`
	CacheStatus      string `json:"cache_status,omitempty"`
	CFCacheStatus    string `json:"cf_cache_status,omitempty"`
	XCache           string `json:"x_cache,omitempty"`
	XCacheRemote     string `json:"x_cache_remote,omitempty"`
	XVercelCache     string `json:"x_vercel_cache,omitempty"`
	XVarnish         string `json:"x_varnish,omitempty"`
}

CacheMetadata stores cache-related headers and interpretation.

type Config

type Config struct {
	DefaultTimeout time.Duration // Default timeout for requests
	MaxConcurrency int           // Maximum number of concurrent requests
	RateLimit      int           // Determines request delay range: base=1s/RateLimit, range=base to 1s
	UserAgent      string        // User agent string for requests
	RetryAttempts  int           // Number of retry attempts for failed requests
	RetryDelay     time.Duration // Delay between retry attempts
	SkipCachedURLs bool          // Whether to skip URLs that are already cached (HIT)
	Port           string        // Server port
	Env            string        // Environment (development/production)
	LogLevel       string        // Logging level
	DatabaseURL    string        // Database connection URL
	AuthToken      string        // Database authentication token
	SentryDSN      string        // Sentry DSN for error tracking
	FindLinks      bool          // Whether to extract links (e.g. PDFs/docs) from pages
	SkipSSRFCheck  bool          // Skip SSRF protection (for tests only, never enable in production)
}

Config holds the configuration for a crawler instance

func DefaultConfig

func DefaultConfig() *Config

DefaultConfig returns a Config instance with default values

type CrawlOptions

type CrawlOptions struct {
	MaxPages    int  // Maximum pages to crawl
	Concurrency int  // Number of concurrent crawlers
	RateLimit   int  // Maximum requests per second
	Timeout     int  // Request timeout in seconds
	FollowLinks bool // Whether to follow links on crawled pages
}

CrawlOptions defines configuration options for a crawl operation

type CrawlResult

type CrawlResult struct {
	URL                 string              `json:"url"`
	ResponseTime        int64               `json:"response_time"`
	StatusCode          int                 `json:"status_code"`
	Error               string              `json:"error,omitempty"`
	Warning             string              `json:"warning,omitempty"`
	CacheStatus         string              `json:"cache_status"`
	ContentType         string              `json:"content_type"`
	ContentLength       int64               `json:"content_length"`
	Headers             http.Header         `json:"headers"`
	RedirectURL         string              `json:"redirect_url"`
	Performance         PerformanceMetrics  `json:"performance"`
	Timestamp           int64               `json:"timestamp"`
	RetryCount          int                 `json:"retry_count"`
	SkippedCrawl        bool                `json:"skipped_crawl,omitempty"`
	Links               map[string][]string `json:"links,omitempty"`
	SecondResponseTime  int64               `json:"second_response_time,omitempty"`
	SecondCacheStatus   string              `json:"second_cache_status,omitempty"`
	SecondContentLength int64               `json:"second_content_length,omitempty"`
	SecondHeaders       http.Header         `json:"second_headers,omitempty"`
	SecondPerformance   *PerformanceMetrics `json:"second_performance,omitempty"`
	CacheCheckAttempts  []CacheCheckAttempt `json:"cache_check_attempts,omitempty"`
	RequestDiagnostics  *RequestDiagnostics `json:"request_diagnostics,omitempty"`
	BodySample          []byte              `json:"-"` // Truncated body for tech detection (not serialised)
	Body                []byte              `json:"-"` // Full body for storage upload (not serialised)
}

CrawlResult represents the result of a URL crawl operation

type Crawler

type Crawler struct {
	// contains filtered or unexported fields
}

Crawler represents a URL crawler with configuration and metrics

func New

func New(config *Config, id ...string) *Crawler

New creates a new Crawler instance with the given configuration and optional ID. If config is nil, default configuration is used.

func (*Crawler) CheckCacheStatus

func (c *Crawler) CheckCacheStatus(ctx context.Context, targetURL string) (ProbeDiagnostics, error)

func (*Crawler) Config

func (c *Crawler) Config() *Config

Config returns the Crawler's configuration.

func (*Crawler) CreateHTTPClient

func (c *Crawler) CreateHTTPClient(timeout time.Duration) *http.Client

CreateHTTPClient returns a configured HTTP client with SSRF protection

func (*Crawler) DiscoverSitemaps

func (c *Crawler) DiscoverSitemaps(ctx context.Context, domain string) ([]string, error)

DiscoverSitemaps is a backward-compatible wrapper that only returns sitemaps

func (*Crawler) DiscoverSitemapsAndRobots

func (c *Crawler) DiscoverSitemapsAndRobots(ctx context.Context, domain string) (*SitemapDiscoveryResult, error)

DiscoverSitemapsAndRobots attempts to find sitemaps and parse robots.txt rules for a domain

func (*Crawler) FilterURLs

func (c *Crawler) FilterURLs(urls []string, includePaths, excludePaths []string) []string

FilterURLs filters URLs based on include/exclude patterns

func (*Crawler) GetUserAgent

func (c *Crawler) GetUserAgent() string

GetUserAgent returns the user agent string for this crawler

func (*Crawler) ParseSitemap

func (c *Crawler) ParseSitemap(ctx context.Context, sitemapURL string) ([]string, error)

ParseSitemap extracts URLs from a sitemap

func (*Crawler) WarmURL

func (c *Crawler) WarmURL(ctx context.Context, targetURL string, findLinks bool) (*CrawlResult, error)

WarmURL performs a crawl of the specified URL and returns the result. It respects context cancellation, enforces timeout, and treats non-2xx statuses as errors.

type PerformanceMetrics

type PerformanceMetrics struct {
	DNSLookupTime       int64 `json:"dns_lookup_time"`
	TCPConnectionTime   int64 `json:"tcp_connection_time"`
	TLSHandshakeTime    int64 `json:"tls_handshake_time"`
	TTFB                int64 `json:"ttfb"`
	ContentTransferTime int64 `json:"content_transfer_time"`
}

PerformanceMetrics holds detailed timing information for a request.

type ProbeDiagnostics

type ProbeDiagnostics struct {
	Attempt  int               `json:"attempt,omitempty"`
	Request  *RequestMetadata  `json:"request,omitempty"`
	Response *ResponseMetadata `json:"response,omitempty"`
	Cache    *CacheMetadata    `json:"cache,omitempty"`
	DelayMS  int               `json:"delay_ms,omitempty"`
}

ProbeDiagnostics stores diagnostics for a cache probe attempt.

type RequestAttemptDiagnostics

type RequestAttemptDiagnostics struct {
	Request         *RequestMetadata    `json:"request,omitempty"`
	Response        *ResponseMetadata   `json:"response,omitempty"`
	RequestHeaders  http.Header         `json:"request_headers,omitempty"`
	ResponseHeaders http.Header         `json:"response_headers,omitempty"`
	Timing          *PerformanceMetrics `json:"timing,omitempty"`
	Cache           *CacheMetadata      `json:"cache,omitempty"`
}

RequestAttemptDiagnostics stores the diagnostics for a full request attempt.

type RequestDiagnostics

type RequestDiagnostics struct {
	Primary   *RequestAttemptDiagnostics `json:"primary,omitempty"`
	Probes    []ProbeDiagnostics         `json:"probes,omitempty"`
	Secondary *RequestAttemptDiagnostics `json:"secondary,omitempty"`
}

RequestDiagnostics stores per-stage diagnostics for a crawl.

type RequestMetadata

type RequestMetadata struct {
	Method     string `json:"method,omitempty"`
	URL        string `json:"url,omitempty"`
	FinalURL   string `json:"final_url,omitempty"`
	Scheme     string `json:"scheme,omitempty"`
	Host       string `json:"host,omitempty"`
	Path       string `json:"path,omitempty"`
	Query      string `json:"query,omitempty"`
	Timestamp  int64  `json:"timestamp,omitempty"`
	Provenance string `json:"provenance,omitempty"`
}

RequestMetadata stores request details for a crawl attempt.

type ResponseMetadata

type ResponseMetadata struct {
	StatusCode    int    `json:"status_code,omitempty"`
	ContentType   string `json:"content_type,omitempty"`
	ContentLength int64  `json:"content_length,omitempty"`
	RedirectURL   string `json:"redirect_url,omitempty"`
	Warning       string `json:"warning,omitempty"`
	Error         string `json:"error,omitempty"`
}

ResponseMetadata stores response details for a crawl attempt.

type RobotsRules

type RobotsRules struct {
	// CrawlDelay in seconds (0 means no delay specified)
	CrawlDelay int
	// Sitemaps found in robots.txt
	Sitemaps []string
	// DisallowPatterns are URL patterns that should not be crawled
	DisallowPatterns []string
	// AllowPatterns override DisallowPatterns (more specific)
	AllowPatterns []string
}

RobotsRules contains parsed robots.txt rules for a domain

func ParseRobotsTxt

func ParseRobotsTxt(ctx context.Context, domain string, userAgent string) (*RobotsRules, error)

ParseRobotsTxt fetches and parses robots.txt for a domain

The parser follows these rules in order of precedence: 1. If there are specific rules for "HoverBot", use those 2. Otherwise, fall back to wildcard (*) rules

We intentionally don't match SEO crawler rules (AhrefsBot, MJ12bot, etc.) as those often have punitive 10s delays meant for aggressive crawlers. Most sites have no crawl-delay for the default * user-agent.

type Sitemap

type Sitemap struct {
	XMLName xml.Name `xml:"sitemap"`
	Loc     string   `xml:"loc"`
}

type SitemapDiscoveryResult

type SitemapDiscoveryResult struct {
	Sitemaps    []string
	RobotsRules *RobotsRules
}

SitemapDiscoveryResult contains both sitemaps and robots.txt rules

type SitemapIndex

type SitemapIndex struct {
	XMLName  xml.Name  `xml:"sitemapindex"`
	Sitemaps []Sitemap `xml:"sitemap"`
}

SitemapIndex represents a sitemap index file, which lists the locations of multiple child sitemaps.

type URL

type URL struct {
	XMLName xml.Name `xml:"url"`
	Loc     string   `xml:"loc"`
}

type URLSet

type URLSet struct {
	XMLName xml.Name `xml:"urlset"`
	URLs    []URL    `xml:"url"`
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL