Documentation
¶
Overview ¶
Package scraper implements a Cardigann-compatible scraping engine that executes Prowlarr YAML indexer definitions to scrape tracker websites and return structured search results.
Index ¶
- func ApplyFilters(value string, filters []FilterDef) string
- func EvalTemplate(tmplStr string, ctx *TemplateContext) (string, error)
- func EvalTemplateOr(tmplStr string, ctx *TemplateContext, fallback string) string
- func ExtractFieldHTML(row *goquery.Selection, field FieldBlock, ctx *TemplateContext) string
- func ExtractFieldJSON(jsonStr string, field FieldBlock, ctx *TemplateContext) string
- func FindRowsHTML(doc *goquery.Document, rows RowsBlock) *goquery.Selection
- func FindRowsJSON(body string, rows RowsBlock) []string
- func IsCloudflareError(err error) bool
- type CapsBlock
- type CategoryMapping
- type CloudflareError
- type Definition
- type DownloadBlock
- type Engine
- func (e *Engine) Cache() *ResultCache
- func (e *Engine) DefinitionCount() int
- func (e *Engine) GetRunner(catalogID string, settingsJSON string) (*Runner, error)
- func (e *Engine) HasDefinition(catalogID string) bool
- func (e *Engine) LoadDefinitions(defs map[string][]byte)
- func (e *Engine) ResolveCatalogID(name, url string) string
- type FieldBlock
- type FilterDef
- type FlareSolverr
- type LoginBlock
- type QueryContext
- type RateLimitedClient
- type ResponseConfig
- type ResultCache
- type RowsBlock
- type Runner
- type SearchBlock
- type SearchPath
- type SearchResult
- type SelectorBlock
- type SettingField
- type TemplateContext
- type TestBlock
- type TodayContext
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ApplyFilters ¶
func ApplyFilters(value string, filters []FilterDef) string
ApplyFilters runs a sequence of filters on a value.
func EvalTemplate ¶
func EvalTemplate(tmplStr string, ctx *TemplateContext) (string, error)
EvalTemplate evaluates a Go template string with the given context.
func EvalTemplateOr ¶
func EvalTemplateOr(tmplStr string, ctx *TemplateContext, fallback string) string
EvalTemplateOr evaluates a template, returning the fallback on error.
func ExtractFieldHTML ¶
func ExtractFieldHTML(row *goquery.Selection, field FieldBlock, ctx *TemplateContext) string
ExtractFieldHTML extracts a field value from an HTML selection using the FieldBlock definition.
func ExtractFieldJSON ¶
func ExtractFieldJSON(jsonStr string, field FieldBlock, ctx *TemplateContext) string
ExtractFieldJSON extracts a field value from a JSON result using gjson paths.
func FindRowsHTML ¶
func FindRowsHTML(doc *goquery.Document, rows RowsBlock) *goquery.Selection
FindRowsHTML finds result rows in an HTML document using the rows selector.
func FindRowsJSON ¶
func FindRowsJSON(body string, rows RowsBlock) []string
FindRowsJSON extracts result items from a JSON response.
func IsCloudflareError ¶
func IsCloudflareError(err error) bool
IsCloudflareError checks if an error is a Cloudflare block.
Types ¶
type CapsBlock ¶
type CapsBlock struct {
CategoryMappings []CategoryMapping `yaml:"categorymappings"`
Modes map[string][]string `yaml:"modes"`
AllowRawSearch bool `yaml:"allowrawsearch"`
}
CapsBlock describes what the indexer supports.
type CategoryMapping ¶
type CategoryMapping struct {
ID string `yaml:"id"`
Cat string `yaml:"cat"`
Desc string `yaml:"desc"`
}
CategoryMapping maps a site-specific category ID to a Newznab category.
type CloudflareError ¶
type CloudflareError struct {
StatusCode int
}
CloudflareError indicates the request was blocked by Cloudflare protection.
func (*CloudflareError) Error ¶
func (e *CloudflareError) Error() string
type Definition ¶
type Definition struct {
ID string `yaml:"id"`
Name string `yaml:"name"`
Description string `yaml:"description"`
Language string `yaml:"language"`
Type string `yaml:"type"` // public, semi-private, private
Encoding string `yaml:"encoding"`
RequestDelay int `yaml:"requestDelay"` // milliseconds
Links []string `yaml:"links"`
LegacyLinks []string `yaml:"legacylinks"`
Caps CapsBlock `yaml:"caps"`
Settings []SettingField `yaml:"settings"`
Login *LoginBlock `yaml:"login,omitempty"`
Search SearchBlock `yaml:"search"`
Download DownloadBlock `yaml:"download"`
}
Definition is the full parsed representation of a Prowlarr Cardigann YAML file.
func ParseDefinition ¶
func ParseDefinition(raw []byte) (*Definition, error)
ParseDefinition parses raw YAML bytes into a Definition.
type DownloadBlock ¶
type DownloadBlock struct {
Selectors []SelectorBlock `yaml:"selectors"`
Method string `yaml:"method"`
}
DownloadBlock defines how to extract download links.
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine is the top-level scraping coordinator. It manages definitions, runners, and the result cache.
func NewEngine ¶
func NewEngine(logger *slog.Logger, flaresolverr *FlareSolverr) *Engine
NewEngine creates a new scraping engine. flaresolverr can be nil if not configured.
func (*Engine) DefinitionCount ¶
func (e *Engine) DefinitionCount() int
DefinitionCount returns how many definitions are loaded.
func (*Engine) GetRunner ¶
func (e *Engine) GetRunner(catalogID string, settingsJSON string) (*Runner, error)
GetRunner returns a cached or new Runner for the given catalog ID and settings.
func (*Engine) HasDefinition ¶
func (e *Engine) HasDefinition(catalogID string) bool
HasDefinition checks if a definition exists for the given catalog ID.
func (*Engine) LoadDefinitions ¶
func (e *Engine) LoadDefinitions(defs map[string][]byte)
LoadDefinitions stores raw YAML bytes for all catalog entries. Called after the Prowlarr zip is fetched.
func (*Engine) ResolveCatalogID ¶
func (e *Engine) ResolveCatalogID(name, url string) string
ResolveCatalogID finds the Prowlarr catalog ID for an indexer by trying:
1. Exact catalog ID match
2. Normalized name match (e.g., "The Pirate Bay" → "thepiratebay")
3. URL domain match (e.g., "https://1337x.to" → looks up "1337x.to")
type FieldBlock ¶
type FieldBlock struct {
SelectorBlock `yaml:",inline"`
Optional bool `yaml:"optional"`
}
FieldBlock defines how to extract a single field from a result row.
type FilterDef ¶
type FilterDef struct {
Name string `yaml:"name"`
Args interface{} `yaml:"args"` // string, []string, or []interface{}
}
FilterDef describes a single filter in the pipeline.
type FlareSolverr ¶
type FlareSolverr struct {
// contains filtered or unexported fields
}
FlareSolverr is a client for the FlareSolverr Cloudflare bypass proxy.
func NewFlareSolverr ¶
func NewFlareSolverr(apiURL string, logger *slog.Logger) *FlareSolverr
NewFlareSolverr creates a FlareSolverr client. Returns nil if apiURL is empty.
func (*FlareSolverr) GetSession ¶
func (f *FlareSolverr) GetSession(domain string) (*cfSession, bool)
GetSession returns the cached Cloudflare (CF) session for a domain, if one exists and is still valid.
type LoginBlock ¶
type LoginBlock struct {
Path string `yaml:"path"`
Method string `yaml:"method"` // post, get, form, cookie
Inputs map[string]string `yaml:"inputs"`
Error []SelectorBlock `yaml:"error"`
Test *TestBlock `yaml:"test"`
}
LoginBlock describes how to authenticate with a private tracker.
type QueryContext ¶
type QueryContext struct {
Type string
Q string
IMDBID string
TMDBID string
TVDBID string
Season string
Ep string
Year string
Page int
}
QueryContext holds search query parameters.
type RateLimitedClient ¶
type RateLimitedClient struct {
// contains filtered or unexported fields
}
RateLimitedClient is an HTTP client with per-host rate limiting.
func NewRateLimitedClient ¶
func NewRateLimitedClient(delay time.Duration) *RateLimitedClient
NewRateLimitedClient creates an HTTP client with rate limiting. delay is the minimum time between requests (from the YAML requestDelay field).
func (*RateLimitedClient) ApplyCFSession ¶
func (c *RateLimitedClient) ApplyCFSession(rawURL string, cookies []*http.Cookie, userAgent string)
ApplyCFSession applies the Cloudflare bypass cookies and user agent from a FlareSolverr session to this client; the cookies are set on its cookie jar.
type ResponseConfig ¶
type ResponseConfig struct {
Type string `yaml:"type"` // html (default), json, xml
NoResultsMessage string `yaml:"noResultsMessage"`
}
ResponseConfig specifies the response format.
type ResultCache ¶
type ResultCache struct {
// contains filtered or unexported fields
}
ResultCache is a simple in-memory cache for search results.
func NewResultCache ¶
func NewResultCache(ttl time.Duration) *ResultCache
NewResultCache creates a result cache with the given TTL.
func (*ResultCache) Cleanup ¶
func (c *ResultCache) Cleanup()
Cleanup removes expired entries. Call periodically.
func (*ResultCache) Get ¶
func (c *ResultCache) Get(key string) ([]SearchResult, bool)
Get returns cached results if they exist and are not expired.
func (*ResultCache) Set ¶
func (c *ResultCache) Set(key string, results []SearchResult)
Set stores results in the cache.
type RowsBlock ¶
type RowsBlock struct {
Selector string `yaml:"selector"`
After int `yaml:"after"`
Remove string `yaml:"remove"`
Multiple bool `yaml:"multiple"`
Count *SelectorBlock `yaml:"count,omitempty"`
}
RowsBlock defines how to find result rows in the response.
type Runner ¶
type Runner struct {
// contains filtered or unexported fields
}
Runner executes searches against a single indexer using its YAML definition.
func NewRunner ¶
func NewRunner(def *Definition, settings map[string]string, flaresolverr *FlareSolverr, logger *slog.Logger) *Runner
NewRunner creates a runner for the given definition.
func (*Runner) Definition ¶
func (r *Runner) Definition() *Definition
Definition returns the underlying definition.
func (*Runner) ResolveDownload ¶
ResolveDownload fetches a detail page and extracts the actual download URL (magnet link or .torrent URL) using the definition's download selectors.
type SearchBlock ¶
type SearchBlock struct {
Paths []SearchPath `yaml:"paths"`
Inputs map[string]string `yaml:"inputs"`
KeywordsFilters []FilterDef `yaml:"keywordsfilters"`
Rows RowsBlock `yaml:"rows"`
Fields map[string]FieldBlock `yaml:"fields"`
}
SearchBlock defines how to search the tracker.
type SearchPath ¶
type SearchPath struct {
Path string `yaml:"path"`
Method string `yaml:"method"`
Inputs map[string]string `yaml:"inputs"`
Response *ResponseConfig `yaml:"response,omitempty"`
}
SearchPath is a single search URL pattern.
type SearchResult ¶
type SearchResult struct {
Title string
Details string // URL to the detail page
Download string // .torrent URL or magnet URI
MagnetURI string
InfoHash string
Size int64
Seeders int
Leechers int
Grabs int
Date string // RFC3339 or raw
Category string // Newznab category ID
DownloadVolumeFactor float64
UploadVolumeFactor float64
Description string
IMDBID string
Poster string
}
SearchResult is a single result from a search execution.
type SelectorBlock ¶
type SelectorBlock struct {
Selector string `yaml:"selector"`
Text string `yaml:"text"`
Attribute string `yaml:"attribute"`
Remove string `yaml:"remove"`
Filters []FilterDef `yaml:"filters"`
Case map[string]string `yaml:"case"`
}
SelectorBlock is the core unit for extracting data from HTML/JSON.
type SettingField ¶
type SettingField struct {
Name string `yaml:"name"`
Type string `yaml:"type"`
Label string `yaml:"label"`
Default interface{} `yaml:"default"`
Options interface{} `yaml:"options"` // map[string]string or []map[string]string
}
SettingField is a user-configurable field in the YAML definition. Uses interface{} for Default and Options to handle varied YAML types.
type TemplateContext ¶
type TemplateContext struct {
Config map[string]string // user settings (sort, downloadlink, etc.)
Keywords string // search query (after keywordsfilters)
Query QueryContext
Categories []string // site category IDs
Result map[string]string // for self-referencing fields
True bool
False bool
Today TodayContext
}
TemplateContext holds the variables available in Cardigann templates.
func NewTemplateContext ¶
func NewTemplateContext(config map[string]string, query string) *TemplateContext
NewTemplateContext creates a context with sensible defaults.