Documentation
¶
Index ¶
- Constants
- func IsPathAllowed(rules *RobotsRules, path string) bool
- type CacheCheckAttempt
- type CacheMetadata
- type Config
- type CrawlOptions
- type CrawlResult
- type Crawler
- func (c *Crawler) CheckCacheStatus(ctx context.Context, targetURL string) (ProbeDiagnostics, error)
- func (c *Crawler) Config() *Config
- func (c *Crawler) CreateHTTPClient(timeout time.Duration) *http.Client
- func (c *Crawler) DiscoverSitemaps(ctx context.Context, domain string) ([]string, error)
- func (c *Crawler) DiscoverSitemapsAndRobots(ctx context.Context, domain string) (*SitemapDiscoveryResult, error)
- func (c *Crawler) FilterURLs(urls []string, includePaths, excludePaths []string) []string
- func (c *Crawler) GetUserAgent() string
- func (c *Crawler) ParseSitemap(ctx context.Context, sitemapURL string) ([]string, error)
- func (c *Crawler) WarmURL(ctx context.Context, targetURL string, findLinks bool) (*CrawlResult, error)
- type PerformanceMetrics
- type ProbeDiagnostics
- type RequestAttemptDiagnostics
- type RequestDiagnostics
- type RequestMetadata
- type ResponseMetadata
- type RobotsRules
- type Sitemap
- type SitemapDiscoveryResult
- type SitemapIndex
- type URL
- type URLSet
Constants ¶
const MaxBodySampleSize = 50 * 1024
MaxBodySampleSize is the maximum size of body sample stored for tech detection (50KB)
Variables ¶
This section is empty.
Functions ¶
func IsPathAllowed ¶
func IsPathAllowed(rules *RobotsRules, path string) bool
IsPathAllowed checks if a path is allowed by robots.txt rules
Types ¶
type CacheCheckAttempt ¶
type CacheCheckAttempt struct {
Attempt int `json:"attempt"`
CacheStatus string `json:"cache_status"`
Delay int `json:"delay_ms"`
// Diagnostics duplicates attempt metadata for backward-compatible probe history.
Diagnostics *ProbeDiagnostics `json:"diagnostics,omitempty"`
}
CacheCheckAttempt stores the result of a single cache status check.
type CacheMetadata ¶
type CacheMetadata struct {
HeaderSource string `json:"header_source,omitempty"`
RawValue string `json:"raw_value,omitempty"`
NormalisedStatus string `json:"normalised_status,omitempty"`
Age string `json:"age,omitempty"`
CacheControl string `json:"cache_control,omitempty"`
Vary string `json:"vary,omitempty"`
CacheStatus string `json:"cache_status,omitempty"`
CFCacheStatus string `json:"cf_cache_status,omitempty"`
XCache string `json:"x_cache,omitempty"`
XCacheRemote string `json:"x_cache_remote,omitempty"`
XVercelCache string `json:"x_vercel_cache,omitempty"`
XVarnish string `json:"x_varnish,omitempty"`
}
CacheMetadata stores cache-related headers and interpretation.
type Config ¶
type Config struct {
DefaultTimeout time.Duration // Default timeout for requests
MaxConcurrency int // Maximum number of concurrent requests
RateLimit int // Determines request delay range: base=1s/RateLimit, range=base to 1s
UserAgent string // User agent string for requests
RetryAttempts int // Number of retry attempts for failed requests
RetryDelay time.Duration // Delay between retry attempts
SkipCachedURLs bool // Whether to skip URLs that are already cached (HIT)
Port string // Server port
Env string // Environment (development/production)
LogLevel string // Logging level
DatabaseURL string // Database connection URL
AuthToken string // Database authentication token
SentryDSN string // Sentry DSN for error tracking
FindLinks bool // Whether to extract links (e.g. PDFs/docs) from pages
SkipSSRFCheck bool // Skip SSRF protection (for tests only, never enable in production)
}
Config holds the configuration for a crawler instance
func DefaultConfig ¶
func DefaultConfig() *Config
DefaultConfig returns a Config instance with default values
type CrawlOptions ¶
type CrawlOptions struct {
MaxPages int // Maximum pages to crawl
Concurrency int // Number of concurrent crawlers
RateLimit int // Maximum requests per second
Timeout int // Request timeout in seconds
FollowLinks bool // Whether to follow links on crawled pages
}
CrawlOptions defines configuration options for a crawl operation
type CrawlResult ¶
type CrawlResult struct {
URL string `json:"url"`
ResponseTime int64 `json:"response_time"`
StatusCode int `json:"status_code"`
Error string `json:"error,omitempty"`
Warning string `json:"warning,omitempty"`
CacheStatus string `json:"cache_status"`
ContentType string `json:"content_type"`
ContentLength int64 `json:"content_length"`
Headers http.Header `json:"headers"`
RedirectURL string `json:"redirect_url"`
Performance PerformanceMetrics `json:"performance"`
Timestamp int64 `json:"timestamp"`
RetryCount int `json:"retry_count"`
SkippedCrawl bool `json:"skipped_crawl,omitempty"`
Links map[string][]string `json:"links,omitempty"`
SecondResponseTime int64 `json:"second_response_time,omitempty"`
SecondCacheStatus string `json:"second_cache_status,omitempty"`
SecondContentLength int64 `json:"second_content_length,omitempty"`
SecondHeaders http.Header `json:"second_headers,omitempty"`
SecondPerformance *PerformanceMetrics `json:"second_performance,omitempty"`
CacheCheckAttempts []CacheCheckAttempt `json:"cache_check_attempts,omitempty"`
RequestDiagnostics *RequestDiagnostics `json:"request_diagnostics,omitempty"`
BodySample []byte `json:"-"` // Truncated body for tech detection (not serialised)
Body []byte `json:"-"` // Full body for storage upload (not serialised)
}
CrawlResult represents the result of a URL crawl operation
type Crawler ¶
type Crawler struct {
// contains filtered or unexported fields
}
Crawler represents a URL crawler with configuration and metrics
func New ¶
New creates a new Crawler instance with the given configuration and optional ID. If config is nil, the default configuration is used.
func (*Crawler) CheckCacheStatus ¶
func (*Crawler) CreateHTTPClient ¶
CreateHTTPClient returns a configured HTTP client with SSRF protection
func (*Crawler) DiscoverSitemaps ¶
DiscoverSitemaps is a backward-compatible wrapper that only returns sitemaps
func (*Crawler) DiscoverSitemapsAndRobots ¶
func (c *Crawler) DiscoverSitemapsAndRobots(ctx context.Context, domain string) (*SitemapDiscoveryResult, error)
DiscoverSitemapsAndRobots attempts to find sitemaps and parse robots.txt rules for a domain
func (*Crawler) FilterURLs ¶
FilterURLs filters URLs based on include/exclude patterns
func (*Crawler) GetUserAgent ¶
GetUserAgent returns the user agent string for this crawler
func (*Crawler) ParseSitemap ¶
ParseSitemap extracts URLs from a sitemap
type PerformanceMetrics ¶
type PerformanceMetrics struct {
DNSLookupTime int64 `json:"dns_lookup_time"`
TCPConnectionTime int64 `json:"tcp_connection_time"`
TLSHandshakeTime int64 `json:"tls_handshake_time"`
TTFB int64 `json:"ttfb"`
ContentTransferTime int64 `json:"content_transfer_time"`
}
PerformanceMetrics holds detailed timing information for a request.
type ProbeDiagnostics ¶
type ProbeDiagnostics struct {
Attempt int `json:"attempt,omitempty"`
Request *RequestMetadata `json:"request,omitempty"`
Response *ResponseMetadata `json:"response,omitempty"`
Cache *CacheMetadata `json:"cache,omitempty"`
DelayMS int `json:"delay_ms,omitempty"`
}
ProbeDiagnostics stores diagnostics for a cache probe attempt.
type RequestAttemptDiagnostics ¶
type RequestAttemptDiagnostics struct {
Request *RequestMetadata `json:"request,omitempty"`
Response *ResponseMetadata `json:"response,omitempty"`
RequestHeaders http.Header `json:"request_headers,omitempty"`
ResponseHeaders http.Header `json:"response_headers,omitempty"`
Timing *PerformanceMetrics `json:"timing,omitempty"`
Cache *CacheMetadata `json:"cache,omitempty"`
}
RequestAttemptDiagnostics stores the diagnostics for a full request attempt.
type RequestDiagnostics ¶
type RequestDiagnostics struct {
Primary *RequestAttemptDiagnostics `json:"primary,omitempty"`
Probes []ProbeDiagnostics `json:"probes,omitempty"`
Secondary *RequestAttemptDiagnostics `json:"secondary,omitempty"`
}
RequestDiagnostics stores per-stage diagnostics for a crawl.
type RequestMetadata ¶
type RequestMetadata struct {
Method string `json:"method,omitempty"`
URL string `json:"url,omitempty"`
FinalURL string `json:"final_url,omitempty"`
Scheme string `json:"scheme,omitempty"`
Host string `json:"host,omitempty"`
Path string `json:"path,omitempty"`
Query string `json:"query,omitempty"`
Timestamp int64 `json:"timestamp,omitempty"`
Provenance string `json:"provenance,omitempty"`
}
RequestMetadata stores request details for a crawl attempt.
type ResponseMetadata ¶
type ResponseMetadata struct {
StatusCode int `json:"status_code,omitempty"`
ContentType string `json:"content_type,omitempty"`
ContentLength int64 `json:"content_length,omitempty"`
RedirectURL string `json:"redirect_url,omitempty"`
Warning string `json:"warning,omitempty"`
Error string `json:"error,omitempty"`
}
ResponseMetadata stores response details for a crawl attempt.
type RobotsRules ¶
type RobotsRules struct {
// CrawlDelay in seconds (0 means no delay specified)
CrawlDelay int
// Sitemaps found in robots.txt
Sitemaps []string
// DisallowPatterns are URL patterns that should not be crawled
DisallowPatterns []string
// AllowPatterns override DisallowPatterns (more specific)
AllowPatterns []string
}
RobotsRules contains parsed robots.txt rules for a domain
func ParseRobotsTxt ¶
ParseRobotsTxt fetches and parses robots.txt for a domain
The parser follows these rules in order of precedence: 1. If there are specific rules for "HoverBot", use those. 2. Otherwise, fall back to wildcard (*) rules.
We intentionally don't match SEO crawler rules (AhrefsBot, MJ12bot, etc.) as those often have punitive 10s delays meant for aggressive crawlers. Most sites have no crawl-delay for the default * user-agent.
type SitemapDiscoveryResult ¶
type SitemapDiscoveryResult struct {
Sitemaps []string
RobotsRules *RobotsRules
}
SitemapDiscoveryResult contains both sitemaps and robots.txt rules
type SitemapIndex ¶
type SitemapIndex struct {
XMLName xml.Name `xml:"sitemapindex"`
Sitemaps []Sitemap `xml:"sitemap"`
}
SitemapIndex represents a sitemap index file that references multiple child sitemaps.