Documentation
¶
Index ¶
- Variables
- func CategorizeError(err error) string
- func IsPrivateIP(ip net.IP) bool
- func SafeDialContext(allowPrivate bool) func(ctx context.Context, network, addr string) (net.Conn, error)
- func SafeDialContextWithOpts(opts DialOptions) func(ctx context.Context, network, addr string) (net.Conn, error)
- type DialOptions
- type FetchResult
- type Fetcher
- type RedirectHop
- type RobotsCache
- type RobotsCacheEntry
- type SitemapEntry
- type SitemapURL
- type TLSProfile
Constants ¶
This section is empty.
Variables ¶
var ErrPrivateIP = errors.New("connection to private/reserved IP address is blocked")
ErrPrivateIP is returned when a dial attempt targets a private or reserved IP address.
Functions ¶
func CategorizeError ¶
CategorizeError classifies a fetch error into a category string.
func IsPrivateIP ¶
IsPrivateIP checks if an IP address belongs to a private or reserved range.
func SafeDialContext ¶
func SafeDialContext(allowPrivate bool) func(ctx context.Context, network, addr string) (net.Conn, error)
SafeDialContext returns a DialContext function that blocks connections to private IPs after DNS resolution, preventing DNS-rebinding attacks. When allowPrivate is true, no filtering is applied.
func SafeDialContextWithOpts ¶
func SafeDialContextWithOpts(opts DialOptions) func(ctx context.Context, network, addr string) (net.Conn, error)
SafeDialContextWithOpts returns a DialContext function with full network options: source IP binding, IPv4-only mode, and private IP filtering.
Types ¶
type DialOptions ¶
DialOptions configures network-level behavior for the safe dialer.
type FetchResult ¶
type FetchResult struct {
URL string
FinalURL string
StatusCode int
ContentType string
Headers map[string]string
Body []byte
BodySize int64
BodyTruncated bool
RedirectChain []RedirectHop
Duration time.Duration
Error string
Depth int
FoundOn string
Attempt int // retry attempt number (0 = first try)
}
FetchResult contains the result of fetching a URL.
func (*FetchResult) IsHTML ¶
func (r *FetchResult) IsHTML() bool
IsHTML checks if the FetchResult contains HTML content.
type Fetcher ¶
type Fetcher struct {
// contains filtered or unexported fields
}
Fetcher performs HTTP requests with redirect chain tracking.
func New ¶
func New(userAgent string, timeout time.Duration, maxBodySize int64, dialOpts DialOptions, tlsProfile TLSProfile) *Fetcher
New creates a new Fetcher. When tlsProfile is non-empty, the transport uses utls to mimic the chosen browser's TLS fingerprint.
func (*Fetcher) Fetch ¶
func (f *Fetcher) Fetch(targetURL string, depth int, foundOn string) *FetchResult
Fetch retrieves a URL and returns the result with redirect chain.
func (*Fetcher) FetchWithContext ¶ added in v0.10.2
func (f *Fetcher) FetchWithContext(ctx context.Context, targetURL string, depth int, foundOn string) *FetchResult
FetchWithContext retrieves a URL using the provided context for cancellation.
type RedirectHop ¶
RedirectHop represents a single hop in a redirect chain.
type RobotsCache ¶
type RobotsCache struct {
// contains filtered or unexported fields
}
RobotsCache caches robots.txt data per host.
func NewRobotsCache ¶
func NewRobotsCache(userAgent string, timeout time.Duration, dialOpts DialOptions, tlsProfile TLSProfile) *RobotsCache
NewRobotsCache creates a new RobotsCache.
func (*RobotsCache) CrawlDelay ¶
func (rc *RobotsCache) CrawlDelay(targetURL string) time.Duration
CrawlDelay returns the crawl-delay specified in robots.txt for the given URL's host. Returns 0 if no crawl-delay is specified.
func (*RobotsCache) Entries ¶
func (rc *RobotsCache) Entries() map[string]*RobotsCacheEntry
Entries returns a copy of all cached robots.txt entries.
func (*RobotsCache) IsAllowed ¶
func (rc *RobotsCache) IsAllowed(targetURL string) bool
IsAllowed checks if the given URL is allowed by robots.txt.
func (*RobotsCache) SitemapURLs ¶
func (rc *RobotsCache) SitemapURLs() []string
SitemapURLs collects sitemap URLs from all cached robots.txt entries, plus common fallback paths (/sitemap.xml, /sitemap_index.xml).
type RobotsCacheEntry ¶
type RobotsCacheEntry struct {
Content string
StatusCode int
FetchedAt time.Time
// contains filtered or unexported fields
}
RobotsCacheEntry holds the raw robots.txt data for a host.
type SitemapEntry ¶
type SitemapEntry struct {
URL string
Type string // "index" or "urlset"
StatusCode int
URLs []SitemapURL
Sitemaps []string // child sitemap URLs if index
}
SitemapEntry represents a fetched sitemap (index or urlset).
func DiscoverSitemaps ¶
func DiscoverSitemaps(ctx context.Context, client *http.Client, userAgent string, sitemapURLs []string) []SitemapEntry
DiscoverSitemaps fetches all given sitemap URLs, recursing into indexes. Returns at most maxTotalSitemaps entries.
func FetchSitemap ¶
func FetchSitemap(ctx context.Context, client *http.Client, sitemapURL, userAgent string) SitemapEntry
FetchSitemap fetches and parses a single sitemap URL.
type SitemapURL ¶
type SitemapURL struct {
Loc string `xml:"loc"`
LastMod string `xml:"lastmod"`
ChangeFreq string `xml:"changefreq"`
Priority string `xml:"priority"`
}
SitemapURL represents a single URL entry in a sitemap.
type TLSProfile ¶
type TLSProfile string
TLSProfile selects which browser TLS fingerprint to mimic. Empty string means standard Go TLS (no mimicry).
const (
	TLSChrome  TLSProfile = "chrome"
	TLSFirefox TLSProfile = "firefox"
	TLSEdge    TLSProfile = "edge"
)