fetch

package
v0.5.42 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 24, 2026 License: GPL-3.0 Imports: 27 Imported by: 0

Documentation

Overview

Package fetch provides an http.RoundTripper implementation that works as a mostly RFC-compliant cache for HTTP responses.

It is only suitable for use as a 'private' cache (i.e. for a web-browser or an API-client and not for a shared proxy).

Mostly borrowed from https://github.com/gregjones/httpcache. Customized for different policies.

Index

Constants

View Source
const (
	InteractionTypeClick  = "click"
	InteractionTypeScroll = "scroll"
)

Variables

View Source
var DefaultMaxBody int64 = 1024 * 1024 * 1024 // 1GB
View Source
var DoDebug = false
View Source
var ErrorIfPageNotInCache = false
View Source
var ShowCaching = false
View Source
var Synchronized = true

Functions

func CacheURLFilebase added in v0.5.39

func CacheURLFilebase(dir string, urlStr string) string

func CacheURLFilename added in v0.5.39

func CacheURLFilename(dir string, urlStr string) string

func MakeURLStringSlug

func MakeURLStringSlug(u string) string

func NormalizeHTMLString added in v0.5.39

func NormalizeHTMLString(htmlStr string) (string, error)

NormalizeHTMLString parses and re-serializes HTML to ensure consistent structure. This ensures that HTML auto-corrections (like wrapping <tr> in <tbody>) are applied consistently during both pattern generation and scraping phases.

func SetGQDocument added in v0.5.39

func SetGQDocument(cache Cache, u string, gqdoc *Document)

func TrimURLScheme

func TrimURLScheme(u string) string

Types

type Cache added in v0.5.39

type Cache interface {
	// Get returns the []byte representation of a cached response and a bool
	// set to true if the value isn't empty
	Get(key string) (responseBytes []byte, ok bool)
	// Set stores the []byte representation of a response against a key
	Set(key string, responseBytes []byte)
	// Delete removes the value associated with the key
	Delete(key string)
	// GetResolvedURL returns the final URL after following redirects
	GetResolvedURL(rawURL string) (string, error)
}

A Cache interface is used by the Transport to store and retrieve responses.

type Document added in v0.5.39

type Document struct {
	*goquery.Document
	// contains filtered or unexported fields
}

func GQDocumentFromURLResponse added in v0.5.39

func GQDocumentFromURLResponse(urlResp *URLResponse) (*Document, error)

func GetGQDocument added in v0.5.39

func GetGQDocument(cache Cache, u string) (*Document, bool, error)

func GetGQDocuments added in v0.5.39

func GetGQDocuments(cache Cache, us []string) ([]*Document, []error)

func NewDocument added in v0.5.39

func NewDocument(gqdoc *goquery.Document) *Document

func NewDocumentFromResponse added in v0.5.39

func NewDocumentFromResponse(str string) (*Document, error)

func NewDocumentFromString added in v0.5.39

func NewDocumentFromString(str string) (*Document, error)

func ResponseBytesToGQDocument added in v0.5.39

func ResponseBytesToGQDocument(respBytes []byte) (*Document, error)

func (*Document) Find added in v0.5.39

func (gqdoc *Document) Find(selector string) *Selection

type DynamicFetcher

type DynamicFetcher struct {
	UserAgent        string
	WaitMilliseconds int
	// contains filtered or unexported fields
}

DynamicFetcher is a Fetcher that renders JavaScript.

func NewDynamicFetcher

func NewDynamicFetcher(ua string, ms int) *DynamicFetcher

func (*DynamicFetcher) Cancel

func (d *DynamicFetcher) Cancel()

func (*DynamicFetcher) Fetch

func (d *DynamicFetcher) Fetch(urlStr string, opts *FetchOpts) (*URLResponse, error)

type FetchCache added in v0.5.39

type FetchCache struct {
	// contains filtered or unexported fields
}

FetchCache is an implementation of Cache that fetches webpages from the web.

func NewFetchCache added in v0.5.39

func NewFetchCache(fetcher Fetcher) *FetchCache

NewFetchCache returns a new FetchCache that will fetch webpages.

func (*FetchCache) Delete added in v0.5.39

func (c *FetchCache) Delete(key string)

func (*FetchCache) Get added in v0.5.39

func (c *FetchCache) Get(key string) ([]byte, bool)

Get returns the response corresponding to key, and true, if found on the web. Otherwise it returns nil and false.

func (*FetchCache) GetResolvedURL added in v0.5.39

func (c *FetchCache) GetResolvedURL(rawURL string) (string, error)

GetResolvedURL returns the final URL after following redirects. For FetchCache, we fetch the URL and return the final resolved URL.

func (*FetchCache) Set added in v0.5.39

func (c *FetchCache) Set(key string, resp []byte)

type FetchOpts

type FetchOpts struct {
	Interaction []*Interaction
}

type Fetcher

type Fetcher interface {
	Fetch(url string, opts *FetchOpts) (*URLResponse, error)
}

A Fetcher fetches the content of a web page.

type FileCache added in v0.5.39

type FileCache struct {
	// contains filtered or unexported fields
}

FileCache is an implementation of Geziyor cache.Cache that stores HTML pages on disk.

func NewFileCache added in v0.5.39

func NewFileCache(fallback Cache, parentDir string, writeable bool, filenameFn func(string, string) string) *FileCache

NewFileCache returns a new FileCache that will store files in parentDir.

func NewURLFileCache added in v0.5.39

func NewURLFileCache(fallback Cache, parentDir string, writeable bool) *FileCache

NewURLFileCache returns a new FileCache that will store files in parentDir.

func (*FileCache) Delete added in v0.5.39

func (c *FileCache) Delete(key string)

Delete removes the response with key from the cache

func (*FileCache) Get added in v0.5.39

func (c *FileCache) Get(key string) ([]byte, bool)

Get returns the response corresponding to key, and true, if present in InputDir or OutputDir. Otherwise it returns nil and false.

func (*FileCache) GetResolvedURL added in v0.5.39

func (c *FileCache) GetResolvedURL(rawURL string) (string, error)

GetResolvedURL returns the final URL after following redirects. For FileCache, we delegate to the fallback cache if available.

func (*FileCache) Set added in v0.5.39

func (c *FileCache) Set(key string, resp []byte)

Set saves a response to the cache as key

type FileFetcher

type FileFetcher struct {
}

The FileFetcher fetches static page content

func (*FileFetcher) Fetch

func (s *FileFetcher) Fetch(url string, opts *FetchOpts) (string, error)

type Interaction added in v0.5.39

type Interaction struct {
	Type     string `yaml:"type,omitempty"`
	Selector string `yaml:"selector,omitempty"`
	Count    int    `yaml:"count,omitempty"`
	Delay    int    `yaml:"delay,omitempty"`
}

Interaction represents a simple user interaction with a webpage

type MemoryCache added in v0.5.39

type MemoryCache struct {
	// contains filtered or unexported fields
}

MemoryCache is an implementation of Cache that stores responses in an in-memory map.

func NewMemoryCache added in v0.5.39

func NewMemoryCache(fallback Cache) *MemoryCache

NewMemoryCache returns a new MemoryCache that will store items in an in-memory map.

func (*MemoryCache) Delete added in v0.5.39

func (c *MemoryCache) Delete(key string)

Delete removes key from the cache

func (*MemoryCache) Get added in v0.5.39

func (c *MemoryCache) Get(key string) ([]byte, bool)

Get returns the []byte representation of the response and true if present, false if not

func (*MemoryCache) GetResolvedURL added in v0.5.39

func (c *MemoryCache) GetResolvedURL(rawURL string) (string, error)

GetResolvedURL returns the final URL after following redirects

func (*MemoryCache) Set added in v0.5.39

func (c *MemoryCache) Set(key string, resp []byte)

Set saves response resp to the cache with key

type Selection added in v0.5.39

type Selection struct {
	*goquery.Selection
	// contains filtered or unexported fields
}

func NewSelection added in v0.5.39

func NewSelection(sel *goquery.Selection) *Selection

func (*Selection) Find added in v0.5.39

func (sel *Selection) Find(selector string) *Selection

type StaticFetcher

type StaticFetcher struct {
	UserAgent string
	Jar       *cookiejar.Jar
	// contains filtered or unexported fields
}

The StaticFetcher fetches static page content

func NewStaticFetcher added in v0.5.39

func NewStaticFetcher() *StaticFetcher

func (*StaticFetcher) Fetch

func (s *StaticFetcher) Fetch(u string, opts *FetchOpts) (*URLResponse, error)

func (*StaticFetcher) SetTransport added in v0.5.39

func (s *StaticFetcher) SetTransport(tr http.RoundTripper)

type URLResponse added in v0.5.39

type URLResponse struct {
	RequestedURL string
	ResolvedURL  string
	StatusCode   int
	ContentType  string
	Data         []byte
}

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL