Documentation
¶
Overview ¶
Package fetch fetches HTTP resources.
Index ¶
- Constants
- Variables
- func AddRoundRobinProxy(u string, proxyURLs ...string)
- func CachedResponse(c core.Cache, req *http.Request) (resp *http.Response, err error)
- func Date(respHeaders http.Header) (date time.Time, err error)
- func DefaultRoundTripper() http.RoundTripper
- func DefaultTemplateFuncMap() template.FuncMap
- func DoString(fetch core.Fetch, req *http.Request) (string, error)
- func NewFetcher(opt Options) core.Fetch
- func NewRequest(method, u string, body any, headers map[string]string) (*http.Request, error)
- func NewTemplateRequest(funcs template.FuncMap, tpl string, arg any) (*http.Request, error)
- func RoundRobinProxy(req *http.Request) (*url.URL, error)
- func WithRequestConfig(req *http.Request, c RequestConfig) *http.Request
- type CacheTransport
- func (t *CacheTransport) Client() *http.Client
- func (t *CacheTransport) RoundTrip(req *http.Request) (resp *http.Response, err error)
- func (t *CacheTransport) RoundTripDummy(req *http.Request) (resp *http.Response, err error)
- func (t *CacheTransport) RoundTripRFC2616(req *http.Request) (resp *http.Response, err error)
- func (t *CacheTransport) SetProxy(proxy func(*http.Request) (*url.URL, error))
- type LRUCache
- type Options
- type Policy
- type RequestConfig
Constants ¶
const ( // DefaultMaxBodySize fetch.Response default max body size DefaultMaxBodySize int64 = 1024 * 1024 * 1024 // DefaultRetryTimes fetch.RequestConfig retry times DefaultRetryTimes = 3 // DefaultTimeout fetch.RequestConfig timeout DefaultTimeout = time.Minute )
const (
// DefaultPath is the default cache path.
DefaultPath = "cache"
)
Variables ¶
var ( // DefaultRetryHTTPCodes retry fetch.RequestConfig error status code DefaultRetryHTTPCodes = []int{http.StatusInternalServerError, http.StatusBadGateway, http.StatusServiceUnavailable, http.StatusGatewayTimeout, http.StatusRequestTimeout} // DefaultHeaders defaults fetch.RequestConfig headers DefaultHeaders = map[string]string{ "Accept": "*/*", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;", "User-Agent": "cloudcat", } )
var ErrNoDateHeader = errors.New("no Date header")
ErrNoDateHeader indicates that the HTTP headers contained no Date header.
Functions ¶
func AddRoundRobinProxy ¶
AddRoundRobinProxy adds the proxy URLs for the specified URL. The proxy type is determined by the URL scheme. "http", "https" and "socks5" are supported. If the scheme is empty, "http" is assumed.
func CachedResponse ¶
CachedResponse returns the cached http.Response for req if present, and nil otherwise.
func DefaultRoundTripper ¶
func DefaultRoundTripper() http.RoundTripper
DefaultRoundTripper returns the default RoundTripper used by fetch.
func DefaultTemplateFuncMap ¶
DefaultTemplateFuncMap returns the default template function map.
func NewRequest ¶
NewRequest returns a new *http.Request given a method, URL, optional body, and optional headers.
func NewTemplateRequest ¶
NewTemplateRequest returns a new *http.Request given an HTTP template and its argument.
func RoundRobinProxy ¶
RoundRobinProxy returns a proxy URL for the given request.
func WithRequestConfig ¶
func WithRequestConfig(req *http.Request, c RequestConfig) *http.Request
Types ¶
type CacheTransport ¶
type CacheTransport struct {
Policy Policy
// The RoundTripper interface actually used to make requests
// If nil, http.DefaultTransport is used
Transport http.RoundTripper
Cache core.Cache
// If true, responses returned from the cache will be given an extra header, X-From-Cache
MarkCachedResponses bool
}
CacheTransport is an implementation of http.RoundTripper that will return values from a cache where possible (avoiding a network request) and will additionally add validators (etag/if-modified-since) to repeated requests, allowing servers to return 304 / Not Modified.
func NewTransport ¶
func NewTransport(c core.Cache) *CacheTransport
NewTransport returns a new CacheTransport with the provided Cache implementation and MarkCachedResponses set to true.
func (*CacheTransport) Client ¶
func (t *CacheTransport) Client() *http.Client
Client returns an *http.Client that caches responses.
func (*CacheTransport) RoundTrip ¶
RoundTrip is a wrapper for caching requests. If there is a fresh Response already in cache, then it will be returned without connecting to the server.
func (*CacheTransport) RoundTripDummy ¶
RoundTripDummy has no awareness of any HTTP Cache-Control directives. Every request and its corresponding response are cached. When the same request is seen again, the response is returned without transferring anything from the Internet.
func (*CacheTransport) RoundTripRFC2616 ¶
RoundTripRFC2616 provides a RFC2616 compliant HTTP cache, i.e. with HTTP Cache-Control awareness, aimed at production and used in continuous runs to avoid downloading unmodified data (to save bandwidth and speed up crawls).
If there is a stale Response, then any validators it contains will be set on the new request to give the server a chance to respond with NotModified. If this happens, then the cached Response will be returned.
type LRUCache ¶
type LRUCache[K comparable, V any] struct { sync.RWMutex // MaxEntries is the maximum number of cache entries before // an item is evicted. Zero means no limit. MaxEntries int // OnEvicted optionally specifies a callback function to be // executed when an entry is purged from the cache. OnEvicted func(key K, value V) // contains filtered or unexported fields }
LRUCache is an LRU cache. It is safe for concurrent access.
func NewLRUCache ¶
func NewLRUCache[K comparable, V any](maxEntries int) *LRUCache[K, V]
NewLRUCache creates a new LRUCache. If maxEntries is zero, the cache has no limit, and it's assumed that eviction is done by the caller.
func (*LRUCache[K, V]) Add ¶
func (c *LRUCache[K, V]) Add(key K, value V)
Add adds a value to the cache.
func (*LRUCache[K, V]) Clear ¶
func (c *LRUCache[K, V]) Clear()
Clear purges all stored items from the cache.
func (*LRUCache[K, V]) Remove ¶
func (c *LRUCache[K, V]) Remove(key K)
Remove removes the provided key from the cache.
func (*LRUCache[K, V]) RemoveOldest ¶
func (c *LRUCache[K, V]) RemoveOldest()
RemoveOldest removes the oldest item from the cache.
type Options ¶
type Options struct {
CharsetDetectDisabled bool `yaml:"charset-detect-disabled"`
MaxBodySize int64 `yaml:"max-body-size"`
RetryTimes int `yaml:"retry-times"`
RetryHTTPCodes []int `yaml:"retry-http-codes"`
Timeout time.Duration `yaml:"timeout"`
CachePolicy Policy `yaml:"cache-policy"`
RoundTripper http.RoundTripper `yaml:"-"`
}
Options holds the options for a Fetch instance.
type Policy ¶
type Policy string
Policy is the HTTP cache policy. The Dummy policy has no awareness of any HTTP Cache-Control directives, while the RFC2616 policy is Cache-Control aware.
const ( // Dummy policy is useful for testing spiders faster (without having to wait for downloads every time) // and for trying your spider offline, when an Internet connection is not available. // The goal is to be able to “replay” a spider run exactly as it ran before. Dummy Policy = "dummy" // RFC2616 This policy provides a RFC2616 compliant HTTP cache, i.e. with HTTP Cache-Control awareness, // aimed at production and used in continuous runs to avoid downloading unmodified data // (to save bandwidth and speed up crawls). RFC2616 Policy = "rfc2616" )
type RequestConfig ¶
type RequestConfig struct {
// Proxy on this RequestConfig
Proxy []string
// Optional response body encoding. Leave empty for automatic detection.
// If you're having issues with auto-detection, set this.
Encoding string
// contains filtered or unexported fields
}
RequestConfig is the configuration for an *http.Request.
func GetRequestConfig ¶
func GetRequestConfig(req *http.Request) RequestConfig