Documentation
¶
Index ¶
- Variables
- func CloseGlobalRenderer()
- func ComputeContentHash(content []byte, algorithm string) (string, error)
- func ComputeContentHashWithConfig(html []byte, algorithm string, config *ContentHashConfig) (string, error)
- func FetchSitemapURLs(sitemapURL string) ([]string, error)
- func NormalizeContent(html []byte, config *ContentHashConfig) ([]byte, error)
- func SanitizeFileName(fileName string) string
- func SetFetch(f func(URL string, options interface{}) ([]byte, error))
- func SetInterval(time time.Duration)
- func TryDefaultSitemaps(baseURL string) []string
- func URLHash(urlStr string) uint64
- func UnmarshalHTML(v interface{}, s *goquery.Selection, structMap map[string]string) error
- type AlreadyVisitedError
- type Collector
- func (c *Collector) Appengine(ctx context.Context)
- func (c *Collector) Clone() *Collector
- func (c *Collector) Cookies(URL string) []*http.Cookie
- func (c *Collector) DisableCookies()
- func (c *Collector) FetchSitemapURLs(sitemapURL string) ([]string, error)
- func (c *Collector) FetchURL(u, method string, depth int, requestData io.Reader, ctx *Context, ...) error
- func (c *Collector) GetTransport() http.RoundTripper
- func (c *Collector) Head(URL string) error
- func (c *Collector) Init()
- func (c *Collector) IsCancelled() bool
- func (c *Collector) Limit(rule *LimitRule) error
- func (c *Collector) Limits(rules []*LimitRule) error
- func (c *Collector) OnError(f ErrorCallback)
- func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback)
- func (c *Collector) OnHTMLDetach(goquerySelector string)
- func (c *Collector) OnRedirect(f RedirectCallback)
- func (c *Collector) OnRequest(f RequestCallback)
- func (c *Collector) OnRequestHeaders(f RequestCallback)
- func (c *Collector) OnResponse(f ResponseCallback)
- func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback)
- func (c *Collector) OnScraped(f ScrapedCallback)
- func (c *Collector) OnXML(xpathQuery string, f XMLCallback)
- func (c *Collector) OnXMLDetach(xpathQuery string)
- func (c *Collector) Post(URL string, requestData map[string]string) error
- func (c *Collector) PostMultipart(URL string, requestData map[string][]byte) error
- func (c *Collector) PostRaw(URL string, requestData []byte) error
- func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error
- func (c *Collector) SetClient(client *http.Client)
- func (c *Collector) SetCookieJar(j http.CookieJar)
- func (c *Collector) SetCookies(URL string, cookies []*http.Cookie) error
- func (c *Collector) SetDebugger(d debug.Debugger)
- func (c *Collector) SetRequestTimeout(timeout time.Duration)
- func (c *Collector) SetStorage(s storage.Storage) error
- func (c *Collector) String() string
- func (c *Collector) TryDefaultSitemaps(baseURL string) []string
- func (c *Collector) UnmarshalRequest(r []byte) (*Request, error)
- func (c *Collector) Visit(URL string) error
- func (c *Collector) WithTransport(transport http.RoundTripper)
- type ContentFilter
- type ContentHashConfig
- type Context
- func (c *Context) Clone() *Context
- func (c *Context) ForEach(fn func(k string, v interface{}) interface{}) []interface{}
- func (c *Context) Get(key string) string
- func (c *Context) GetAny(key string) interface{}
- func (c *Context) MarshalBinary() (_ []byte, _ error)
- func (c *Context) Put(key string, value interface{})
- func (c *Context) UnmarshalBinary(_ []byte) error
- type CrawlCompletionReason
- type CrawlResult
- type Crawler
- func (cr *Crawler) SetOnCrawlComplete(f OnCrawlCompleteFunc)
- func (cr *Crawler) SetOnPageCrawled(f OnPageCrawledFunc)
- func (cr *Crawler) SetOnResourceVisit(f OnResourceVisitFunc)
- func (cr *Crawler) SetOnURLDiscovered(f OnURLDiscoveredFunc)
- func (cr *Crawler) Start(url string) error
- func (cr *Crawler) Wait()
- type CrawlerConfig
- type DiscoveryMechanism
- type ErrorCallback
- type FilterChain
- type HTMLCallback
- type HTMLElement
- func (h *HTMLElement) Attr(k string) string
- func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
- func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
- func (h *HTMLElement) ChildText(goquerySelector string) string
- func (h *HTMLElement) ChildTexts(goquerySelector string) []string
- func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))
- func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool)
- func (h *HTMLElement) Unmarshal(v interface{}) error
- func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]string) error
- type HTTPConfig
- type HTTPTrace
- type Index
- type LimitRule
- type Link
- type LinkDensityFilter
- type Links
- type MockResponse
- type MockTransport
- func (m *MockTransport) RegisterError(url string, err error)
- func (m *MockTransport) RegisterHTML(url, html string)
- func (m *MockTransport) RegisterJSON(url, json string)
- func (m *MockTransport) RegisterPattern(pattern string, response *MockResponse) error
- func (m *MockTransport) RegisterRedirect(fromURL, toURL string, statusCode int)
- func (m *MockTransport) RegisterRedirectWithDelay(fromURL, toURL string, statusCode int, delay time.Duration)
- func (m *MockTransport) RegisterResponse(url string, response *MockResponse)
- func (m *MockTransport) Reset()
- func (m *MockTransport) RoundTrip(req *http.Request) (*http.Response, error)
- func (m *MockTransport) SetFallback(fallback http.RoundTripper)
- type NavigationTextFilter
- type NoisePatternFilter
- type OnCrawlCompleteFunc
- type OnPageCrawledFunc
- type OnResourceVisitFunc
- type OnURLDiscoveredFunc
- type PageMetadata
- type PageResult
- type RedirectCallback
- type RedirectResponse
- type RenderingConfig
- type Request
- func (r *Request) Abort()
- func (r *Request) AbsoluteURL(u string) string
- func (r *Request) Do() error
- func (r *Request) IsAbort() bool
- func (r *Request) Marshal() ([]byte, error)
- func (r *Request) New(method, URL string, body io.Reader) (*Request, error)
- func (r *Request) Post(URL string, requestData map[string]string) error
- func (r *Request) PostMultipart(URL string, requestData map[string][]byte) error
- func (r *Request) PostRaw(URL string, requestData []byte) error
- func (r *Request) Retry() error
- func (r *Request) Visit(URL string) error
- type RequestCallback
- type ResourceResult
- type ResourceValidationConfig
- type Response
- type ResponseCallback
- type ResponseHeadersCallback
- type ScrapedCallback
- type Sitemap
- type StopwordsScorer
- type URL
- type URLAction
- type URLDiscoveryRequest
- type WorkerPool
- type XMLCallback
- type XMLElement
Constants ¶
This section is empty.
Variables ¶
var ( // ErrForbiddenDomain is the error thrown if visiting // a domain which is not allowed in AllowedDomains ErrForbiddenDomain = errors.New("forbidden domain") // ErrMissingURL is the error type for missing URL errors ErrMissingURL = errors.New("missing URL") // ErrMaxDepth is the error type for exceeding max depth ErrMaxDepth = errors.New("max depth limit reached") // ErrForbiddenURL is the error thrown if visiting // a URL which is not allowed by URLFilters ErrForbiddenURL = errors.New("forbidden URL") // ErrNoURLFiltersMatch is the error thrown if visiting // a URL which is not allowed by URLFilters ErrNoURLFiltersMatch = errors.New("no URLFilters match") // ErrRobotsTxtBlocked is the error type for robots.txt errors ErrRobotsTxtBlocked = errors.New("URL blocked by robots.txt") // ErrNoCookieJar is the error type for missing cookie jar ErrNoCookieJar = errors.New("cookie jar is not available") // ErrNoPattern is the error type for LimitRules without patterns ErrNoPattern = errors.New("no pattern defined in LimitRule") // ErrAbortedAfterHeaders is the error returned when OnResponseHeaders aborts the transfer. ErrAbortedAfterHeaders = errors.New("aborted after receiving response headers") // ErrAbortedBeforeRequest is the error returned when OnResponseHeaders aborts the transfer. ErrAbortedBeforeRequest = errors.New("aborted before Do Request") // ErrQueueFull is the error returned when the queue is full ErrQueueFull = errors.New("queue MaxSize reached") // ErrMaxRequests is the error returned when exceeding max requests ErrMaxRequests = errors.New("max requests limit reached") // ErrRetryBodyUnseekable is the error when retry with not seekable body ErrRetryBodyUnseekable = errors.New("retry body unseekable") )
var ErrMockNotFound = errors.New("no mock response registered for URL")
ErrMockNotFound is returned when no mock is registered for a URL
Functions ¶
func CloseGlobalRenderer ¶
func CloseGlobalRenderer()
CloseGlobalRenderer closes the global renderer instance. This should be called when the application exits.
func ComputeContentHash ¶
ComputeContentHash computes a hash of the normalized content using the specified algorithm
func ComputeContentHashWithConfig ¶
func ComputeContentHashWithConfig(html []byte, algorithm string, config *ContentHashConfig) (string, error)
ComputeContentHashWithConfig is a convenience function that normalizes content and computes its hash
func FetchSitemapURLs ¶
FetchSitemapURLs fetches a specific sitemap URL and returns all discovered URLs. It handles both regular sitemaps and sitemap indexes automatically. Returns an empty slice if the sitemap cannot be fetched or parsed.
func NormalizeContent ¶
func NormalizeContent(html []byte, config *ContentHashConfig) ([]byte, error)
NormalizeContent normalizes HTML content based on the provided configuration to make content hashing more reliable by removing dynamic elements
func SanitizeFileName ¶
SanitizeFileName replaces dangerous characters in a string so the return value can be used as a safe file name.
func SetInterval ¶
SetInterval changes the time interval to be used in Index.get
func TryDefaultSitemaps ¶
TryDefaultSitemaps tries to fetch sitemaps from common default locations. It tries /sitemap.xml first, then /sitemap_index.xml. Returns all discovered URLs from available sitemaps (empty slice if none found). This function does not return errors - it returns empty slice if no sitemaps found.
func URLHash ¶
URLHash computes the hash for a URL string. This is the same hash used internally for visit tracking. Exported for use by the application layer when persisting/restoring crawl state.
func UnmarshalHTML ¶
UnmarshalHTML declaratively extracts text or attributes to a struct from HTML response using struct tags composed of css selectors. Allowed struct tags:
- "selector" (required): CSS (goquery) selector of the desired data
- "attr" (optional): Selects the matching element's attribute's value. Leave it blank or omit to get the text of the element.
Example struct declaration:
type Nested struct {
String string `selector:"div > p"`
Classes []string `selector:"li" attr:"class"`
Struct *Nested `selector:"div > div"`
}
Supported types: struct, *struct, string, []string
Types ¶
type AlreadyVisitedError ¶
type AlreadyVisitedError struct {
// Destination is the URL that was attempted to be visited.
// It might not match the URL passed to Visit if redirect
// was followed.
Destination *url.URL
}
AlreadyVisitedError is the error type for already visited URLs.
It's returned synchronously by Visit when the URL passed to Visit is already visited.
When already visited URL is encountered after following redirects, this error appears in OnError callback, and if Async mode is not enabled, is also returned by Visit.
func (*AlreadyVisitedError) Error ¶
func (e *AlreadyVisitedError) Error() string
Error implements error interface.
type Collector ¶
type Collector struct {
// UserAgent is the User-Agent string used by HTTP requests
UserAgent string
// Custom headers for the request
Headers *http.Header
// AllowURLRevisit allows multiple downloads of the same URL
AllowURLRevisit bool
// MaxBodySize is the limit of the retrieved response body in bytes.
// 0 means unlimited.
// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
MaxBodySize int
// CacheDir specifies a location where GET requests are cached as files.
// When it's not defined, caching is disabled.
CacheDir string
// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
// the target host's robots.txt file. See http://www.robotstxt.org/ for more
// information.
IgnoreRobotsTxt bool
// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
// By default, BlueSnake parses only successful HTTP responses. Set ParseHTTPErrorResponse
// to true to enable it.
ParseHTTPErrorResponse bool
// ID is the unique identifier of a collector
ID uint32
// DetectCharset can enable character encoding detection for non-utf8 response bodies
// without explicit charset declaration. This feature uses https://github.com/saintfish/chardet
DetectCharset bool
// CheckHead performs a HEAD request before every GET to pre-validate the response
CheckHead bool
// TraceHTTP enables capturing and reporting request performance for crawler tuning.
// When set to true, the Response.Trace will be filled in with an HTTPTrace object.
TraceHTTP bool
// MaxRequests limit the number of requests done by the instance.
// Set it to 0 for infinite requests (default).
MaxRequests uint32
// EnableRendering enables JavaScript rendering using headless Chrome.
// When set to true, pages will be rendered with chromedp before parsing.
EnableRendering bool
// RenderingConfig contains configuration for JavaScript rendering wait times
// Only applies when EnableRendering is true
RenderingConfig *RenderingConfig
// EnableContentHash enables content-based duplicate detection
EnableContentHash bool
// ContentHashAlgorithm specifies the hash algorithm to use ("xxhash", "md5", "sha256")
ContentHashAlgorithm string
// ContentHashConfig contains detailed configuration for content hashing
ContentHashConfig *ContentHashConfig
// CacheExpiration sets the maximum age for cache files.
// If a cached file is older than this duration, it will be ignored and refreshed.
CacheExpiration time.Duration
// contains filtered or unexported fields
}
Collector provides the scraper instance for a scraping job
func NewCollector ¶
func NewCollector(ctx context.Context, config *HTTPConfig) *Collector
NewCollector creates a new Collector instance with the provided context and HTTP configuration. The context is used for request cancellation and lifecycle management. If config is nil, default HTTPConfig from NewDefaultConfig() is used.
func (*Collector) Appengine ¶
Appengine will replace the Collector's backend http.Client with an http.Client that is provided by appengine/urlfetch. This function should be used when the scraper is run on Google App Engine. Example:
func startScraper(w http.ResponseWriter, r *http.Request) {
ctx := appengine.NewContext(r)
c := bluesnake.NewCollector()
c.Appengine(ctx)
...
c.Visit("https://google.ca")
}
func (*Collector) Clone ¶
Clone creates an exact copy of a Collector without callbacks. HTTP backend and cookie jar are shared between collectors.
func (*Collector) DisableCookies ¶
func (c *Collector) DisableCookies()
DisableCookies turns off cookie handling
func (*Collector) FetchSitemapURLs ¶
FetchSitemapURLs fetches URLs from a sitemap using the Collector's HTTP client. This ensures sitemap fetching uses the same transport/configuration as regular crawling. Handles both regular sitemaps and sitemap indexes automatically. Returns all discovered URLs from the sitemap (empty slice if sitemap cannot be fetched). This is the proper way for Crawler to access sitemap data without touching backend.Client directly.
func (*Collector) FetchURL ¶
func (c *Collector) FetchURL(u, method string, depth int, requestData io.Reader, ctx *Context, hdr http.Header) error
FetchURL performs an HTTP request and processes the response synchronously. This method is intentionally exported for use by Crawler. It handles HTTP fetch, HTML parsing, and executes all registered callbacks.
IMPORTANT - Context Parameter: In most cases, pass nil for ctx to create a fresh Context for each request. This ensures proper isolation and prevents race conditions where concurrent requests overwrite shared Context data (contentType, title, etc.).
Only pass a non-nil Context when you explicitly need to preserve data from a previous request, such as:
- Request.Retry() - Preserving state across retry attempts
- Request.Visit() - Manual navigation with session continuity
- Custom request chaining where you need to pass authentication tokens or session data
The Crawler automatically passes nil to ensure each discovered URL gets its own Context.
func (*Collector) GetTransport ¶
func (c *Collector) GetTransport() http.RoundTripper
GetTransport returns the current http.RoundTripper (transport) This is useful for creating additional HTTP clients that should use the same transport (e.g., for robots.txt fetching with redirect support)
func (*Collector) Init ¶
func (c *Collector) Init()
Init initializes the Collector's private variables and sets default configuration for the Collector
func (*Collector) IsCancelled ¶
IsCancelled returns true if the collector's context is cancelled
func (*Collector) OnError ¶
func (c *Collector) OnError(f ErrorCallback)
OnError registers a function. Function will be executed if an error occurs during the HTTP request.
func (*Collector) OnHTML ¶
func (c *Collector) OnHTML(goquerySelector string, f HTMLCallback)
OnHTML registers a function. Function will be executed on every HTML element matched by the GoQuery Selector parameter. GoQuery Selector is a selector used by https://github.com/PuerkitoBio/goquery
func (*Collector) OnHTMLDetach ¶
OnHTMLDetach deregisters a function. The function will not be executed after being detached.
func (*Collector) OnRedirect ¶
func (c *Collector) OnRedirect(f RedirectCallback)
OnRedirect registers a function that will be called when a redirect is encountered. The callback receives the redirect request and the chain of previous requests. Return nil to allow the redirect, or an error to block it. This allows external components (like Crawler) to inject redirect handling logic into the Collector.
func (*Collector) OnRequest ¶
func (c *Collector) OnRequest(f RequestCallback)
OnRequest registers a function. Function will be executed on every request made by the Collector
func (*Collector) OnRequestHeaders ¶
func (c *Collector) OnRequestHeaders(f RequestCallback)
OnRequestHeaders registers a function. Function will be executed on every request made by the Collector before Request Do
func (*Collector) OnResponse ¶
func (c *Collector) OnResponse(f ResponseCallback)
OnResponse registers a function. Function will be executed on every response
func (*Collector) OnResponseHeaders ¶
func (c *Collector) OnResponseHeaders(f ResponseHeadersCallback)
OnResponseHeaders registers a function. Function will be executed on every response when headers and status are already received, but body is not yet read.
Like in OnRequest, you can call Request.Abort to abort the transfer. This might be useful if, for example, you're following all hyperlinks, but want to avoid downloading files.
Be aware that using this will prevent HTTP/1.1 connection reuse, as the only way to abort a download is to immediately close the connection. HTTP/2 doesn't suffer from this problem, as it's possible to close specific stream inside the connection.
func (*Collector) OnScraped ¶
func (c *Collector) OnScraped(f ScrapedCallback)
OnScraped registers a function that will be executed as the final part of the scraping, after OnHTML and OnXML have finished.
func (*Collector) OnXML ¶
func (c *Collector) OnXML(xpathQuery string, f XMLCallback)
OnXML registers a function. Function will be executed on every XML element matched by the xpath Query parameter. xpath Query is used by https://github.com/antchfx/xmlquery
func (*Collector) OnXMLDetach ¶
OnXMLDetach deregisters a function. The function will not be executed after being detached.
func (*Collector) Post ¶
Post starts a collector job by creating a POST request. Post also calls the previously provided callbacks
func (*Collector) PostMultipart ¶
PostMultipart starts a collector job by creating a Multipart POST request with raw binary data. PostMultipart also calls the previously provided callbacks
func (*Collector) PostRaw ¶
PostRaw starts a collector job by creating a POST request with raw binary data. PostRaw also calls the previously provided callbacks
func (*Collector) Request ¶
func (c *Collector) Request(method, URL string, requestData io.Reader, ctx *Context, hdr http.Header) error
Request starts a collector job by creating a custom HTTP request where method, context, headers and request data can be specified. Set requestData, ctx, hdr parameters to nil if you don't want to use them. Valid methods:
- "GET"
- "HEAD"
- "POST"
- "PUT"
- "DELETE"
- "PATCH"
- "OPTIONS"
func (*Collector) SetCookieJar ¶
SetCookieJar overrides the previously set cookie jar
func (*Collector) SetCookies ¶
SetCookies handles the receipt of the cookies in a reply for the given URL
func (*Collector) SetDebugger ¶
SetDebugger attaches a debugger to the collector
func (*Collector) SetRequestTimeout ¶
SetRequestTimeout overrides the default timeout (10 seconds) for this collector
func (*Collector) SetStorage ¶
SetStorage overrides the default in-memory storage. Storage stores scraping related data like cookies and visited urls
func (*Collector) String ¶
String is the text representation of the collector. It contains useful debug information about the collector's internals
func (*Collector) TryDefaultSitemaps ¶
TryDefaultSitemaps tries to fetch sitemaps from common default locations using the Collector's HTTP client. It tries /sitemap.xml first, then /sitemap_index.xml. Returns all discovered URLs from available sitemaps (empty slice if none found). This method does not return errors - it returns an empty slice if no sitemaps are found.
func (*Collector) UnmarshalRequest ¶
UnmarshalRequest creates a Request from serialized data
func (*Collector) Visit ¶
Visit starts Collector's collecting job by creating a request to the URL specified in parameter. Visit also calls the previously provided callbacks
func (*Collector) WithTransport ¶
func (c *Collector) WithTransport(transport http.RoundTripper)
WithTransport allows you to set a custom http.RoundTripper (transport)
type ContentFilter ¶
type ContentFilter interface {
// Filter applies the filter to the document and returns the modified document
Filter(doc *goquery.Document) *goquery.Document
// Name returns the filter name for debugging
Name() string
}
ContentFilter defines the interface for all content filters. Filters modify the document in place and return it for chaining.
type ContentHashConfig ¶
type ContentHashConfig struct {
// ExcludeTags specifies HTML tags to exclude from content hashing
// Default: ["script", "style", "nav", "footer"]
ExcludeTags []string
// IncludeOnlyTags specifies to only include specific tags in content hashing
// If empty, all content (minus ExcludeTags) is included
// Example: ["article", "main"] to focus only on main content
IncludeOnlyTags []string
// StripTimestamps removes timestamp patterns from content before hashing
StripTimestamps bool
// StripAnalytics removes analytics and tracking code from content
StripAnalytics bool
// StripComments removes HTML comments from content
StripComments bool
// CollapseWhitespace normalizes whitespace (multiple spaces/newlines to single)
CollapseWhitespace bool
}
ContentHashConfig contains configuration for content-based duplicate detection
type Context ¶
type Context struct {
// contains filtered or unexported fields
}
Context provides a tiny layer for passing data between callbacks
func (*Context) Get ¶
Get retrieves a string value from Context. Get returns an empty string if key not found
func (*Context) MarshalBinary ¶
MarshalBinary encodes a Context value. This function is used by request caching.
func (*Context) UnmarshalBinary ¶
UnmarshalBinary decodes a serialized Context value. This function is used by request caching.
type CrawlCompletionReason ¶
type CrawlCompletionReason string
CrawlCompletionReason indicates why a crawl completed
const ( // CompletionReasonExhausted means all discoverable URLs have been crawled CompletionReasonExhausted CrawlCompletionReason = "exhausted" // CompletionReasonBudgetReached means the MaxURLsToVisit limit was reached CompletionReasonBudgetReached CrawlCompletionReason = "budget_reached" // CompletionReasonCancelled means the crawl was stopped via context cancellation CompletionReasonCancelled CrawlCompletionReason = "cancelled" )
type CrawlResult ¶
type CrawlResult struct {
// Reason indicates why the crawl completed
Reason CrawlCompletionReason
// TotalPages is the total number of HTML pages successfully crawled
TotalPages int
// TotalDiscovered is the total number of unique URLs discovered
TotalDiscovered int
// URLsVisited is the number of URLs visited in this session
URLsVisited int
// PendingURLs contains URLs that were queued but not visited (for resume)
PendingURLs []URLDiscoveryRequest
}
CrawlResult contains comprehensive information about a completed crawl
type Crawler ¶
type Crawler struct {
// Collector is the underlying low-level collector (exported for advanced configuration)
Collector *Collector
// contains filtered or unexported fields
}
Crawler provides a high-level interface for web crawling with callbacks for page results
func NewCrawler ¶
func NewCrawler(ctx context.Context, config *CrawlerConfig) *Crawler
NewCrawler creates a high-level crawler with the specified context and crawler configuration. The context is used for crawl lifecycle management and cancellation. The returned crawler must have its callbacks set via SetOnPageCrawled and SetOnCrawlComplete before calling Start. If config is nil, default configuration is used.
func (*Crawler) SetOnCrawlComplete ¶
func (cr *Crawler) SetOnCrawlComplete(f OnCrawlCompleteFunc)
SetOnCrawlComplete registers a callback function that will be called when the crawl finishes. This callback receives summary statistics about the completed crawl.
func (*Crawler) SetOnPageCrawled ¶
func (cr *Crawler) SetOnPageCrawled(f OnPageCrawledFunc)
SetOnPageCrawled registers a callback function that will be called after each HTML page is crawled. This callback receives complete page information including discovered URLs. Note: This is only called for HTML pages. For resources, use SetOnResourceVisit.
func (*Crawler) SetOnResourceVisit ¶
func (cr *Crawler) SetOnResourceVisit(f OnResourceVisitFunc)
SetOnResourceVisit registers a callback function that will be called after each resource is visited. This callback receives resource information (URL, status, content type) for non-HTML assets such as images, stylesheets, scripts, etc.
func (*Crawler) SetOnURLDiscovered ¶
func (cr *Crawler) SetOnURLDiscovered(f OnURLDiscoveredFunc)
SetOnURLDiscovered registers a callback function that will be called when a new URL is discovered. This callback is invoked exactly once per unique URL to determine the action to take. The callback should return:
- URLActionCrawl to crawl the URL normally
- URLActionRecordOnly to add the URL to links but not crawl it (e.g., framework-specific paths)
- URLActionSkip to ignore the URL completely (e.g., analytics/tracking URLs)
type CrawlerConfig ¶
type CrawlerConfig struct {
// MaxDepth limits the recursion depth of visited URLs.
// Set it to 0 for infinite recursion (default).
MaxDepth int
// AllowedDomains is a domain whitelist.
// Leave it blank to allow any domains to be visited
AllowedDomains []string
// DisallowedDomains is a domain blacklist.
DisallowedDomains []string
// DisallowedURLFilters is a list of regular expressions which restricts
// visiting URLs. If any of the rules matches to a URL the
// request will be stopped. DisallowedURLFilters will
// be evaluated before URLFilters
DisallowedURLFilters []*regexp.Regexp
// URLFilters is a list of regular expressions which restricts
// visiting URLs. If any of the rules matches to a URL the
// request won't be stopped. DisallowedURLFilters will
// be evaluated before URLFilters
URLFilters []*regexp.Regexp
// DiscoveryMechanisms specifies which mechanisms to use for URL discovery.
// Can be any combination: ["spider"], ["sitemap"], or ["spider", "sitemap"].
// Default is ["spider"].
DiscoveryMechanisms []DiscoveryMechanism
// SitemapURLs specifies custom sitemap URLs to fetch (optional).
// If nil/empty when sitemap discovery is enabled, tries default locations
// (/sitemap.xml, /sitemap_index.xml).
SitemapURLs []string
// ResourceValidation configures checking of non-HTML resources for broken links
ResourceValidation *ResourceValidationConfig
// RobotsTxtMode controls how robots.txt is handled
// Options: "respect", "ignore", or "ignore-report"
// Default: "respect"
RobotsTxtMode string
// FollowInternalNofollow allows following links with rel="nofollow" on same domain
// Default: false
FollowInternalNofollow bool
// FollowExternalNofollow allows following links with rel="nofollow" on external domains
// Default: false
FollowExternalNofollow bool
// RespectMetaRobotsNoindex respects <meta name="robots" content="noindex">
// Default: true
RespectMetaRobotsNoindex bool
// RespectNoindex respects X-Robots-Tag: noindex headers
// Default: true
RespectNoindex bool
// DiscoveryChannelSize is the buffer size for the URL discovery channel
// Larger values reduce blocking but use more memory
// Default: 50000
DiscoveryChannelSize int
// WorkQueueSize is the buffer size for the worker pool work queue
// Should be smaller than DiscoveryChannelSize
// Default: 1000
WorkQueueSize int
// Parallelism is the number of concurrent HTTP requests (worker pool size)
// This replaces/complements the existing async goroutine model
// Default: 10
Parallelism int
// DebugURLs contains exact URLs to enable detailed logging for (scheme and trailing slash ignored)
// Used for debugging race conditions by filtering logs to specific problematic URLs
// Uses exact matching to avoid logging all subpaths
// Example: []string{"handbook.agentberlin.ai/intro", "handbook.agentberlin.ai"}
DebugURLs []string
// HTTP contains HTTP client configuration for the underlying Collector
HTTP *HTTPConfig
// Incremental crawling support
// MaxURLsToVisit limits the number of URLs to visit before pausing.
// Set to 0 for unlimited (default). When the limit is reached, the crawl
// pauses and OnCrawlPaused callback is called with pending URLs.
MaxURLsToVisit int
// PreVisitedHashes contains URL hashes that should be considered already visited.
// Used when resuming a paused crawl to avoid re-crawling URLs.
PreVisitedHashes []uint64
// SeedURLs contains URLs to queue at the start of the crawl (in addition to the initial URL).
// Used when resuming a paused crawl with pending URLs from the previous session.
SeedURLs []URLDiscoveryRequest
}
CrawlerConfig contains all configuration options for a Crawler
func NewDefaultConfig ¶
func NewDefaultConfig() *CrawlerConfig
NewDefaultConfig returns a CrawlerConfig with sensible defaults
type DiscoveryMechanism ¶
type DiscoveryMechanism string
DiscoveryMechanism specifies how URLs are discovered during crawling
const ( // DiscoverySpider discovers URLs by following links in HTML pages DiscoverySpider DiscoveryMechanism = "spider" // DiscoverySitemap discovers URLs from sitemap.xml files DiscoverySitemap DiscoveryMechanism = "sitemap" )
type ErrorCallback ¶
ErrorCallback is a type alias for OnError callback functions
type FilterChain ¶
type FilterChain struct {
// contains filtered or unexported fields
}
FilterChain applies multiple filters in sequence
func NewFilterChain ¶
func NewFilterChain(filters ...ContentFilter) *FilterChain
NewFilterChain creates a new filter chain with the given filters
func (*FilterChain) Add ¶
func (fc *FilterChain) Add(f ContentFilter) *FilterChain
Add adds a filter to the chain
type HTMLCallback ¶
type HTMLCallback func(*HTMLElement)
HTMLCallback is a type alias for OnHTML callback functions
type HTMLElement ¶
type HTMLElement struct {
// Name is the name of the tag
Name string
Text string
// Request is the request object of the element's HTML document
Request *Request
// Response is the Response object of the element's HTML document
Response *Response
// DOM is the goquery parsed DOM object of the page. DOM is relative
// to the current HTMLElement
DOM *goquery.Selection
// Index stores the position of the current element within all the elements matched by an OnHTML callback
Index int
// contains filtered or unexported fields
}
HTMLElement is the representation of an HTML tag.
func NewHTMLElementFromSelectionNode ¶
func NewHTMLElementFromSelectionNode(resp *Response, s *goquery.Selection, n *html.Node, idx int) *HTMLElement
NewHTMLElementFromSelectionNode creates an HTMLElement from a goquery.Selection Node.
func (*HTMLElement) Attr ¶
func (h *HTMLElement) Attr(k string) string
Attr returns the selected attribute of an HTMLElement, or an empty string if no attribute is found
func (*HTMLElement) ChildAttr ¶
func (h *HTMLElement) ChildAttr(goquerySelector, attrName string) string
ChildAttr returns the stripped text content of the first matching element's attribute.
func (*HTMLElement) ChildAttrs ¶
func (h *HTMLElement) ChildAttrs(goquerySelector, attrName string) []string
ChildAttrs returns the stripped text content of all the matching element's attributes.
func (*HTMLElement) ChildText ¶
func (h *HTMLElement) ChildText(goquerySelector string) string
ChildText returns the concatenated and stripped text content of the matching elements.
func (*HTMLElement) ChildTexts ¶
func (h *HTMLElement) ChildTexts(goquerySelector string) []string
ChildTexts returns the stripped text content of all the matching elements.
func (*HTMLElement) ForEach ¶
func (h *HTMLElement) ForEach(goquerySelector string, callback func(int, *HTMLElement))
ForEach iterates over the elements matched by the first argument and calls the callback function on every HTMLElement match.
func (*HTMLElement) ForEachWithBreak ¶
func (h *HTMLElement) ForEachWithBreak(goquerySelector string, callback func(int, *HTMLElement) bool)
ForEachWithBreak iterates over the elements matched by the first argument and calls the callback function on every HTMLElement match. It is identical to ForEach except that it is possible to break out of the loop by returning false in the callback function. It returns the current Selection object.
func (*HTMLElement) Unmarshal ¶
func (h *HTMLElement) Unmarshal(v interface{}) error
Unmarshal is a shorthand for bluesnake.UnmarshalHTML
func (*HTMLElement) UnmarshalWithMap ¶
func (h *HTMLElement) UnmarshalWithMap(v interface{}, structMap map[string]string) error
UnmarshalWithMap is a shorthand for bluesnake.UnmarshalHTML, extended to allow maps to be passed in.
type HTTPConfig ¶
type HTTPConfig struct {
// UserAgent is the User-Agent string used by HTTP requests
UserAgent string
// Headers contains custom headers for HTTP requests
Headers map[string]string
// AllowURLRevisit allows multiple downloads of the same URL
AllowURLRevisit bool
// MaxBodySize is the limit of the retrieved response body in bytes.
// 0 means unlimited.
// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes).
MaxBodySize int
// CacheDir specifies a location where GET requests are cached as files.
// When it's not defined, caching is disabled.
CacheDir string
// CacheExpiration sets the maximum age for cache files.
CacheExpiration time.Duration
// IgnoreRobotsTxt allows the Collector to ignore any restrictions set by
// the target host's robots.txt file.
IgnoreRobotsTxt bool
// ParseHTTPErrorResponse allows parsing HTTP responses with non 2xx status codes.
ParseHTTPErrorResponse bool
// ID is the unique identifier of a collector (auto-assigned if 0)
ID uint32
// DetectCharset can enable character encoding detection for non-utf8 response bodies
DetectCharset bool
// CheckHead performs a HEAD request before every GET to pre-validate the response
CheckHead bool
// TraceHTTP enables capturing and reporting request performance.
TraceHTTP bool
// MaxRequests limits the number of requests done by the instance.
// Set it to 0 for infinite requests (default).
MaxRequests uint32
// EnableRendering enables JavaScript rendering using headless Chrome.
EnableRendering bool
// RenderingConfig contains configuration for JavaScript rendering wait times
// Only applies when EnableRendering is true
RenderingConfig *RenderingConfig
// EnableContentHash enables content-based duplicate detection
// When true, pages with identical content will be detected even if URLs differ
EnableContentHash bool
// ContentHashAlgorithm specifies the hash algorithm to use
// Options: "xxhash" (fastest, default), "md5", "sha256"
ContentHashAlgorithm string
// ContentHashConfig contains detailed configuration for content hashing
ContentHashConfig *ContentHashConfig
// Debugger is the debugger instance to use
Debugger debug.Debugger
}
HTTPConfig contains HTTP client configuration options for the Collector
type HTTPTrace ¶
type HTTPTrace struct {
ConnectDuration time.Duration
FirstByteDuration time.Duration
// contains filtered or unexported fields
}
HTTPTrace provides a datastructure for storing an http trace.
type Index ¶
Index is a structure of <sitemapindex>
func ParseIndex ¶
ParseIndex creates Index data from text
func ReadSitemapIndex ¶
ReadSitemapIndex is a function that reads a file and returns an Index structure.
type LimitRule ¶
type LimitRule struct {
// DomainRegexp is a regular expression to match against domains
DomainRegexp string
// DomainGlob is a glob pattern to match against domains
DomainGlob string
// Delay is the duration to wait before creating a new request to the matching domains
Delay time.Duration
// RandomDelay is the extra randomized duration to wait added to Delay before creating a new request
RandomDelay time.Duration
// Parallelism is the number of the maximum allowed concurrent requests of the matching domains
Parallelism int
// contains filtered or unexported fields
}
LimitRule provides connection restrictions for domains. Both DomainRegexp and DomainGlob can be used to specify the included domains patterns, but at least one is required. There can be two kind of limitations:
- Parallelism: Set limit for the number of concurrent requests to matching domains
- Delay: Wait specified amount of time between requests (parallelism is 1 in this case)
type Link ¶
type Link struct {
// URL is the target URL
URL string `json:"url"`
// Type is the link type: "anchor", "image", "script", "stylesheet", "iframe", "canonical", "video", "audio"
Type string `json:"type"`
// Text is the anchor text, alt text, or empty for other link types
Text string `json:"text"`
// Context is the surrounding text context where the link appears
Context string `json:"context,omitempty"`
// IsInternal indicates if this link points to the same domain/subdomain
IsInternal bool `json:"isInternal"`
// Status is the HTTP status code if this URL has been crawled (200, 404, 301, etc.)
Status *int `json:"status,omitempty"`
// Title is the page title if this URL has been crawled
Title string `json:"title,omitempty"`
// ContentType is the MIME type if this URL has been crawled
ContentType string `json:"contentType,omitempty"`
// Position indicates the semantic location of the link on the page
// Values: "content", "navigation", "header", "footer", "sidebar", "breadcrumbs", "pagination", "unknown"
Position string `json:"position,omitempty"`
// DOMPath is a simplified DOM path showing the link's location in the HTML structure
// Example: "body > main > article > p > a"
DOMPath string `json:"domPath,omitempty"`
// Action indicates how this URL should be handled
// Values: "crawl" (normal), "record" (framework-specific, don't crawl), "skip" (ignored)
Action URLAction `json:"action,omitempty"`
}
Link represents a single outbound link discovered on a page
type LinkDensityFilter ¶
type LinkDensityFilter struct {
// MaxLinkRatio is the maximum ratio of link words to total words (default 0.5)
MaxLinkRatio float64
// MinLinks is the minimum number of links before considering link density (default 3)
MinLinks int
}
LinkDensityFilter removes elements where most text is within links
func NewLinkDensityFilter ¶
func NewLinkDensityFilter() *LinkDensityFilter
NewLinkDensityFilter creates a new LinkDensityFilter with default settings
func (*LinkDensityFilter) Filter ¶
func (f *LinkDensityFilter) Filter(doc *goquery.Document) *goquery.Document
Filter removes elements with high link density
func (*LinkDensityFilter) Name ¶
func (f *LinkDensityFilter) Name() string
Name returns the filter name
type Links ¶
type Links struct {
// Internal links point to same domain/subdomain
Internal []Link `json:"internal"`
// External links point to different domains
External []Link `json:"external"`
}
Links contains outbound links from a page
type MockResponse ¶
type MockResponse struct {
// StatusCode is the HTTP status code to return (default: 200)
StatusCode int
// Body is the response body content (used if BodyFunc is nil)
Body string
// BodyFunc is a function that generates the body dynamically based on the request
// If set, this takes precedence over Body
BodyFunc func(*http.Request) string
// Headers are the HTTP headers to include in the response
Headers http.Header
// Delay simulates network latency before returning the response
Delay time.Duration
// Error simulates a network error
Error error
}
MockResponse represents a mock HTTP response
type MockTransport ¶
type MockTransport struct {
// contains filtered or unexported fields
}
MockTransport implements http.RoundTripper for testing purposes. It allows you to register mock responses for specific URLs or URL patterns without needing to run an actual HTTP server.
func NewMockTransport ¶
func NewMockTransport() *MockTransport
NewMockTransport creates a new MockTransport instance
func (*MockTransport) RegisterError ¶
func (m *MockTransport) RegisterError(url string, err error)
RegisterError registers a mock error for a URL (simulates network failure)
func (*MockTransport) RegisterHTML ¶
func (m *MockTransport) RegisterHTML(url, html string)
RegisterHTML is a convenience method to register an HTML response with status 200
func (*MockTransport) RegisterJSON ¶
func (m *MockTransport) RegisterJSON(url, json string)
RegisterJSON is a convenience method to register a JSON response with status 200
func (*MockTransport) RegisterPattern ¶
func (m *MockTransport) RegisterPattern(pattern string, response *MockResponse) error
RegisterPattern registers a mock response for URLs matching a regex pattern
func (*MockTransport) RegisterRedirect ¶
func (m *MockTransport) RegisterRedirect(fromURL, toURL string, statusCode int)
RegisterRedirect is a convenience method to register a redirect response
func (*MockTransport) RegisterRedirectWithDelay ¶
func (m *MockTransport) RegisterRedirectWithDelay(fromURL, toURL string, statusCode int, delay time.Duration)
RegisterRedirectWithDelay is a convenience method to register a redirect response with a delay
func (*MockTransport) RegisterResponse ¶
func (m *MockTransport) RegisterResponse(url string, response *MockResponse)
RegisterResponse registers a mock response for an exact URL match
func (*MockTransport) Reset ¶
func (m *MockTransport) Reset()
Reset clears all registered responses and patterns
func (*MockTransport) SetFallback ¶
func (m *MockTransport) SetFallback(fallback http.RoundTripper)
SetFallback sets a fallback RoundTripper to use when no mock is registered for a URL. This is useful for testing scenarios where you want to mock some URLs but allow real HTTP requests for others.
type NavigationTextFilter ¶
type NavigationTextFilter struct {
// contains filtered or unexported fields
}
NavigationTextFilter removes elements with navigation-like text content
func NewNavigationTextFilter ¶
func NewNavigationTextFilter() *NavigationTextFilter
NewNavigationTextFilter creates a new NavigationTextFilter
func (*NavigationTextFilter) Filter ¶
func (f *NavigationTextFilter) Filter(doc *goquery.Document) *goquery.Document
Filter removes elements with navigation-like text
func (*NavigationTextFilter) Name ¶
func (f *NavigationTextFilter) Name() string
Name returns the filter name
type NoisePatternFilter ¶
type NoisePatternFilter struct{}
NoisePatternFilter removes elements that match known non-content patterns
func NewNoisePatternFilter ¶
func NewNoisePatternFilter() *NoisePatternFilter
NewNoisePatternFilter creates a new NoisePatternFilter
func (*NoisePatternFilter) Filter ¶
func (f *NoisePatternFilter) Filter(doc *goquery.Document) *goquery.Document
Filter removes elements matching noise patterns
func (*NoisePatternFilter) Name ¶
func (f *NoisePatternFilter) Name() string
Name returns the filter name
type OnCrawlCompleteFunc ¶
type OnCrawlCompleteFunc func(result *CrawlResult)
OnCrawlCompleteFunc is called when the entire crawl finishes. The CrawlResult contains comprehensive information about how and why the crawl completed, including the completion reason (exhausted, budget_reached, or cancelled) and pending URLs for incremental crawling support.
type OnPageCrawledFunc ¶
type OnPageCrawledFunc func(*PageResult)
OnPageCrawledFunc is called after each HTML page is successfully crawled or encounters an error. This callback receives HTML pages only, not resources. For resources (images, CSS, JS), use SetOnResourceVisit instead.
type OnResourceVisitFunc ¶
type OnResourceVisitFunc func(*ResourceResult)
OnResourceVisitFunc is called for each resource (non-HTML asset) visited during crawling. Resources include images, stylesheets, scripts, and other non-HTML content. Use this for resource validation/checking without the overhead of PageResult.
type OnURLDiscoveredFunc ¶
OnURLDiscoveredFunc is called when a new URL is discovered during crawling. This callback is invoked exactly once per unique URL to determine how it should be handled. The return value indicates whether the URL should be crawled, recorded only, or skipped entirely. Use cases:
- Return URLActionCrawl for normal URLs that should be crawled
- Return URLActionRecordOnly for framework-specific paths that should appear in links but not be crawled
- Return URLActionSkip for analytics/tracking URLs that should be ignored completely
type PageMetadata ¶
PageMetadata stores cached metadata for crawled pages
type PageResult ¶
type PageResult struct {
// URL is the URL that was crawled
URL string
// Status is the HTTP status code (e.g., 200, 404, 500)
Status int
// Title is the page title extracted from the <title> tag (for HTML pages)
Title string
// MetaDescription is the content of the <meta name="description"> tag
MetaDescription string
// H1 is the text of the first <h1> tag on the page
H1 string
// H2 is the text of the first <h2> tag on the page
H2 string
// CanonicalURL is the canonical URL specified in <link rel="canonical">
CanonicalURL string
// WordCount is the approximate word count of visible text on the page
WordCount int
// Indexable indicates if search engines can index this page
// Values: "Yes", "No", or "-" for non-HTML resources
Indexable string
// ContentType is the Content-Type header value (e.g., "text/html", "application/json")
ContentType string
// Error contains any error message if the crawl failed, empty otherwise
Error string
// Links contains all outbound links from this page (internal and external)
Links *Links
// ContentHash is the hash of the normalized page content (empty if content hashing is disabled)
ContentHash string
// IsDuplicateContent indicates if this content hash has been seen before on a different URL
IsDuplicateContent bool
// contains filtered or unexported fields
}
PageResult contains all data collected from a single crawled page
func (*PageResult) GetHTML ¶
func (pr *PageResult) GetHTML() string
GetHTML returns the full HTML content of the page. Returns empty string if the response is not available.
func (*PageResult) GetTextContent ¶
func (pr *PageResult) GetTextContent() string
GetTextContent returns text from the main content area only (excluding navigation, headers, footers). Extracts text from semantic HTML5 elements like <article>, <main>, or [role="main"]. Returns empty string if the response is not available or is not HTML.
func (*PageResult) GetTextFull ¶
func (pr *PageResult) GetTextFull() string
GetTextFull returns all visible text from the entire page (including navigation, headers, footers). HTML tags are stripped, leaving only the text content. Returns empty string if the response is not available or is not HTML.
type RedirectCallback ¶
RedirectCallback is a type alias for OnRedirect callback functions. It receives the redirect request and the chain of previous requests. Return nil to allow the redirect, or an error to block it.
type RedirectResponse ¶
type RedirectResponse struct {
// URL is the URL that issued the redirect
URL string
// StatusCode is the HTTP redirect status code (301, 302, 307, 308, etc.)
StatusCode int
// Headers contains the redirect response headers
Headers *http.Header
// Location is the value of the Location header (where the redirect points to)
Location string
}
RedirectResponse represents an intermediate redirect in a redirect chain
type RenderingConfig ¶
type RenderingConfig struct {
// InitialWaitMs is the initial wait time after page load (in milliseconds)
// This allows time for JavaScript frameworks (React/Next.js) to hydrate
// Default: 1500ms (matching ScreamingFrog's 5s AJAX timeout approach)
InitialWaitMs int
// ScrollWaitMs is the wait time after scrolling to bottom (in milliseconds)
// This triggers lazy-loaded images and content
// Default: 2000ms
ScrollWaitMs int
// FinalWaitMs is the final wait time before capturing HTML (in milliseconds)
// This allows remaining network requests and DOM updates to complete
// Default: 1000ms
FinalWaitMs int
}
RenderingConfig controls JavaScript rendering behavior with headless Chrome
type Request ¶
type Request struct {
// URL is the parsed URL of the HTTP request
URL *url.URL
// Headers contains the Request's HTTP headers
Headers *http.Header
// the Host header
Host string
// Ctx is a context between a Request and a Response
Ctx *Context
// Depth is the number of the parents of the request
Depth int
// Method is the HTTP method of the request
Method string
// Body is the request body which is used on POST/PUT requests
Body io.Reader
// ResponseCharacterEncoding is the character encoding of the response body.
// Leave it blank to allow automatic character encoding of the response body.
// It is empty by default and it can be set in OnRequest callback.
ResponseCharacterEncoding string
// ID is the Unique identifier of the request
ID uint32
// contains filtered or unexported fields
}
Request is the representation of an HTTP request made by a Collector
func (*Request) Abort ¶
func (r *Request) Abort()
Abort cancels the HTTP request when called in an OnRequest callback
func (*Request) AbsoluteURL ¶
AbsoluteURL returns with the resolved absolute URL of an URL chunk. AbsoluteURL returns empty string if the URL chunk is a fragment, could not be parsed, or has a non-http(s) scheme (mailto:, tel:, etc.) URL fragments are stripped as they represent anchors within a page, not separate crawlable URLs.
func (*Request) Post ¶
Post continues a collector job by creating a POST request and preserves the Context of the previous request. This allows passing data (like authentication tokens or session state) from the current request to the next one. Post also calls the previously provided callbacks
func (*Request) PostMultipart ¶
PostMultipart starts a collector job by creating a Multipart POST request with raw binary data. PostMultipart also calls the previously provided callbacks.
func (*Request) PostRaw ¶
PostRaw starts a collector job by creating a POST request with raw binary data. PostRaw preserves the Context of the previous request and calls the previously provided callbacks
func (*Request) Retry ¶
Retry submits HTTP request again with the same parameters. The Context is preserved across retries, allowing you to track retry attempts or maintain state between the original request and retries.
func (*Request) Visit ¶
Visit continues Collector's collecting job by creating a request and preserves the Context of the previous request. This allows passing data (like authentication tokens or session state) from the current request to the next one. Visit also calls the previously provided callbacks
type RequestCallback ¶
type RequestCallback func(*Request)
RequestCallback is a type alias for OnRequest callback functions
type ResourceResult ¶
type ResourceResult struct {
// URL is the resource URL that was visited
URL string
// Status is the HTTP status code (e.g., 200, 404, 500)
Status int
// ContentType is the MIME type (e.g., "image/png", "text/css", "application/javascript")
ContentType string
// Error contains any error message if the visit failed, empty otherwise
Error string
}
ResourceResult contains data from a visited resource (non-HTML asset)
type ResourceValidationConfig ¶
type ResourceValidationConfig struct {
// Enabled turns on resource validation
// Default: true
Enabled bool
// ResourceTypes specifies which resource types to check
// Options: "image", "script", "stylesheet", "video", "audio", "iframe"
// Default: ["image", "script", "stylesheet"]
// Empty array = check all types
ResourceTypes []string
// CheckExternal controls whether to check external resources
// Default: true
CheckExternal bool
}
ResourceValidationConfig controls checking of non-HTML resources for broken links
type Response ¶
type Response struct {
// StatusCode is the status code of the Response
StatusCode int
// Body is the content of the Response
Body []byte
// Ctx is a context between a Request and a Response
Ctx *Context
// Request is the Request object of the response
Request *Request
// Headers contains the Response's HTTP headers
Headers *http.Header
// Trace contains the HTTPTrace for the request. Will only be set by the
// collector if Collector.TraceHTTP is set to true.
Trace *HTTPTrace
// RedirectChain contains all intermediate redirect responses for this request.
// For a redirect chain A→B→C, this contains responses for A and B (not C, which is the main Response).
// Each entry has the actual redirect status code (301, 302, 307, 308) and headers.
RedirectChain []*RedirectResponse
}
Response is the representation of an HTTP response made by a Collector
type ResponseCallback ¶
type ResponseCallback func(*Response)
ResponseCallback is a type alias for OnResponse callback functions
type ResponseHeadersCallback ¶
type ResponseHeadersCallback func(*Response)
ResponseHeadersCallback is a type alias for OnResponseHeaders callback functions
type ScrapedCallback ¶
type ScrapedCallback func(*Response)
ScrapedCallback is a type alias for OnScraped callback functions
type Sitemap ¶
Sitemap is a structure of <sitemap>
func ForceGet ¶
ForceGet fetches and parses sitemap.xml/sitemapindex.xml. The difference from the Get function is that it ignores some errors.
Errors to Ignore:
・When sitemapindex.xml contains a sitemap.xml URL that cannot be retrieved. ・When sitemapindex.xml contains a sitemap.xml that is empty. ・When sitemapindex.xml contains a sitemap.xml that has format problems.
Errors not to Ignore:
・When sitemap.xml/sitemapindex.xml could not be retrieved. ・When sitemap.xml/sitemapindex.xml is empty. ・When sitemap.xml/sitemapindex.xml has format problems.
If you want **not** to ignore some errors, use the Get function.
func Get ¶
Get fetches and parses sitemap.xml/sitemapindex.xml
If sitemap.xml or sitemapindex.xml has any problems, this function returns an error.
・When sitemap.xml/sitemapindex.xml could not be retrieved. ・When sitemap.xml/sitemapindex.xml is empty. ・When sitemap.xml/sitemapindex.xml has format problems. ・When sitemapindex.xml contains a sitemap.xml URL that cannot be retrieved. ・When sitemapindex.xml contains a sitemap.xml that is empty. ・When sitemapindex.xml contains a sitemap.xml that has format problems.
If you want to ignore these errors, use the ForceGet function.
func ReadSitemap ¶
ReadSitemap is a function that reads a file and returns a Sitemap structure.
type StopwordsScorer ¶
type StopwordsScorer struct {
// contains filtered or unexported fields
}
StopwordsScorer provides methods for scoring text based on stopwords
func NewStopwordsScorer ¶
func NewStopwordsScorer() *StopwordsScorer
NewStopwordsScorer creates a new StopwordsScorer with English stopwords
func (*StopwordsScorer) CountStopwords ¶
func (s *StopwordsScorer) CountStopwords(text string) int
CountStopwords returns the number of stopwords in the given text Ported from GoOse's stopwords.go stopWordsCount
func (*StopwordsScorer) ScoreText ¶
func (s *StopwordsScorer) ScoreText(text string) int
ScoreText returns a score for the text based on stopwords and length Higher scores indicate more likely content
type URL ¶
type URL struct {
Loc string `xml:"loc"`
LastMod string `xml:"lastmod"`
ChangeFreq string `xml:"changefreq"`
Priority float32 `xml:"priority"`
}
URL is a structure of <url> in <sitemap>
type URLAction ¶
type URLAction string
URLAction represents the action to take when a URL is discovered during crawling
const ( // URLActionCrawl indicates the URL should be added to links and crawled URLActionCrawl URLAction = "crawl" // URLActionRecordOnly indicates the URL should be added to links but NOT crawled (e.g., framework-specific paths) URLActionRecordOnly URLAction = "record" // URLActionSkip indicates the URL should be ignored completely (e.g., analytics/tracking URLs) URLActionSkip URLAction = "skip" )
type URLDiscoveryRequest ¶
type URLDiscoveryRequest struct {
URL string // The discovered URL
Source string // Discovery source: "initial", "sitemap", "spider", "network", "resource"
ParentURL string // URL where this was discovered (for spider/network)
Depth int // Crawl depth
Context *Context // Request context for passing metadata
}
URLDiscoveryRequest represents a URL discovered during crawling
type WorkerPool ¶
type WorkerPool struct {
// contains filtered or unexported fields
}
WorkerPool manages a fixed number of worker goroutines that process work items from a queue. This provides controlled concurrency and prevents unbounded goroutine creation.
func NewWorkerPool ¶
func NewWorkerPool(ctx context.Context, maxWorkers int, queueSize int) *WorkerPool
NewWorkerPool creates a new worker pool with the specified number of workers and queue size. Parameters:
- ctx: Context for cancellation
- maxWorkers: Number of concurrent worker goroutines
- queueSize: Buffer size for the work queue (blocks when full)
func (*WorkerPool) Close ¶
func (wp *WorkerPool) Close()
Close shuts down the worker pool gracefully. It closes the work queue and waits for all workers to finish their current tasks.
func (*WorkerPool) Submit ¶
func (wp *WorkerPool) Submit(work func()) error
Submit submits a work item to the pool. This method BLOCKS if the work queue is full, providing backpressure. Returns an error if the context is cancelled.
type XMLCallback ¶
type XMLCallback func(*XMLElement)
XMLCallback is a type alias for OnXML callback functions
type XMLElement ¶
type XMLElement struct {
// Name is the name of the tag
Name string
Text string
// Request is the request object of the element's HTML document
Request *Request
// Response is the Response object of the element's HTML document
Response *Response
// DOM is the DOM object of the page. DOM is relative
// to the current XMLElement and is either a html.Node or xmlquery.Node
// based on how the XMLElement was created.
DOM interface{}
// Index stores the position of the current element within all the elements matched by an OnXML callback
Index int
// contains filtered or unexported fields
}
XMLElement is the representation of an XML tag.
func NewXMLElementFromHTMLNode ¶
func NewXMLElementFromHTMLNode(resp *Response, s *html.Node) *XMLElement
NewXMLElementFromHTMLNode creates a XMLElement from a html.Node.
func NewXMLElementFromXMLNode ¶
func NewXMLElementFromXMLNode(resp *Response, s *xmlquery.Node) *XMLElement
NewXMLElementFromXMLNode creates a XMLElement from a xmlquery.Node.
func (*XMLElement) Attr ¶
func (h *XMLElement) Attr(k string) string
Attr returns the selected attribute of an XMLElement, or an empty string if no attribute is found
func (*XMLElement) ChildAttr ¶
func (h *XMLElement) ChildAttr(xpathQuery, attrName string) string
ChildAttr returns the stripped text content of the first matching element's attribute.
func (*XMLElement) ChildAttrs ¶
func (h *XMLElement) ChildAttrs(xpathQuery, attrName string) []string
ChildAttrs returns the stripped text content of all the matching element's attributes.
func (*XMLElement) ChildText ¶
func (h *XMLElement) ChildText(xpathQuery string) string
ChildText returns the concatenated and stripped text content of the matching elements.
func (*XMLElement) ChildTexts ¶
func (h *XMLElement) ChildTexts(xpathQuery string) []string
ChildTexts returns an array of strings corresponding to child elements that match the xpath query. Each item in the array is the stripped text content of the corresponding matching child element.