scraper

package
v0.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 14, 2026 License: MIT Imports: 19 Imported by: 0

Documentation

Overview

Package scraper implements a Cardigann-compatible scraping engine that executes Prowlarr YAML indexer definitions to scrape tracker websites and return structured search results.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ApplyFilters

func ApplyFilters(value string, filters []FilterDef) string

ApplyFilters runs a sequence of filters on a value.

func EvalTemplate

func EvalTemplate(tmplStr string, ctx *TemplateContext) (string, error)

EvalTemplate evaluates a Go template string with the given context.

func EvalTemplateOr

func EvalTemplateOr(tmplStr string, ctx *TemplateContext, fallback string) string

EvalTemplateOr evaluates a template, returning the fallback on error.

func ExtractFieldHTML

func ExtractFieldHTML(row *goquery.Selection, field FieldBlock, ctx *TemplateContext) string

ExtractFieldHTML extracts a field value from an HTML selection using the FieldBlock definition.

func ExtractFieldJSON

func ExtractFieldJSON(jsonStr string, field FieldBlock, ctx *TemplateContext) string

ExtractFieldJSON extracts a field value from a JSON result using gjson paths.

func FindRowsHTML

func FindRowsHTML(doc *goquery.Document, rows RowsBlock) *goquery.Selection

FindRowsHTML finds result rows in an HTML document using the rows selector.

func FindRowsJSON

func FindRowsJSON(body string, rows RowsBlock) []string

FindRowsJSON extracts result items from a JSON response.

func IsCloudflareError

func IsCloudflareError(err error) bool

IsCloudflareError checks if an error is a Cloudflare block.

Types

type CapsBlock

// CapsBlock describes what the indexer supports. Definition holds it under
// the "caps" YAML key.
//
//   - CategoryMappings: the site-category-to-Newznab mappings.
//   - Modes: lists of strings keyed by name (presumably search mode ->
//     supported parameters; TODO confirm against the definition schema).
//   - AllowRawSearch: the "allowrawsearch" flag, decoded as-is; its effect
//     on searching is not visible from this declaration.
type CapsBlock struct {
	CategoryMappings []CategoryMapping   `yaml:"categorymappings"`
	Modes            map[string][]string `yaml:"modes"`
	AllowRawSearch   bool                `yaml:"allowrawsearch"`
}

CapsBlock describes what the indexer supports.

type CategoryMapping

// CategoryMapping maps a site-specific category ID to a Newznab category.
//
// All three fields are decoded as strings from the "id", "cat" and "desc"
// YAML keys.
// NOTE(review): presumably ID is the site's own category identifier, Cat the
// Newznab category and Desc a human-readable label — confirm.
type CategoryMapping struct {
	ID   string `yaml:"id"`
	Cat  string `yaml:"cat"`
	Desc string `yaml:"desc"`
}

CategoryMapping maps a site-specific category ID to a Newznab category.

type CloudflareError

// CloudflareError indicates the request was blocked by Cloudflare protection.
//
// The error interface is satisfied by *CloudflareError (Error has a pointer
// receiver); IsCloudflareError reports whether an error is such a block.
type CloudflareError struct {
	StatusCode int // presumably the HTTP status of the blocked response — TODO confirm
}

CloudflareError indicates the request was blocked by Cloudflare protection.

func (*CloudflareError) Error

func (e *CloudflareError) Error() string

type Definition

// Definition is the full parsed representation of a Prowlarr Cardigann YAML
// file; ParseDefinition produces one from raw YAML bytes.
//
// Each field is decoded from the YAML key named in its struct tag. Type is
// one of "public", "semi-private" or "private", and RequestDelay is expressed
// in milliseconds. Login is the only pointer-typed section.
// NOTE(review): presumably Login stays nil when the definition has no login
// section — confirm against ParseDefinition.
type Definition struct {
	ID           string         `yaml:"id"`
	Name         string         `yaml:"name"`
	Description  string         `yaml:"description"`
	Language     string         `yaml:"language"`
	Type         string         `yaml:"type"` // public, semi-private, private
	Encoding     string         `yaml:"encoding"`
	RequestDelay int            `yaml:"requestDelay"` // milliseconds
	Links        []string       `yaml:"links"`
	LegacyLinks  []string       `yaml:"legacylinks"`
	Caps         CapsBlock      `yaml:"caps"`
	Settings     []SettingField `yaml:"settings"`
	Login        *LoginBlock    `yaml:"login,omitempty"`
	Search       SearchBlock    `yaml:"search"`
	Download     DownloadBlock  `yaml:"download"`
}

Definition is the full parsed representation of a Prowlarr Cardigann YAML file.

func ParseDefinition

func ParseDefinition(raw []byte) (*Definition, error)

ParseDefinition parses raw YAML bytes into a Definition.

type DownloadBlock

// DownloadBlock defines how to extract download links. Definition holds it
// under the "download" YAML key.
//
// Selectors are the extraction rules and Method is the raw "method" value.
// NOTE(review): how Method is interpreted is not visible from this
// declaration — check Runner.ResolveDownload.
type DownloadBlock struct {
	Selectors []SelectorBlock `yaml:"selectors"`
	Method    string          `yaml:"method"`
}

DownloadBlock defines how to extract download links.

type Engine

type Engine struct {
	// contains filtered or unexported fields
}

Engine is the top-level scraping coordinator. It manages definitions, runners, and the result cache.

func NewEngine

func NewEngine(logger *slog.Logger, flaresolverr *FlareSolverr) *Engine

NewEngine creates a new scraping engine. flaresolverr can be nil if not configured.

func (*Engine) Cache

func (e *Engine) Cache() *ResultCache

Cache returns the result cache.

func (*Engine) DefinitionCount

func (e *Engine) DefinitionCount() int

DefinitionCount returns how many definitions are loaded.

func (*Engine) GetRunner

func (e *Engine) GetRunner(catalogID string, settingsJSON string) (*Runner, error)

GetRunner returns a cached or new Runner for the given catalog ID and settings.

func (*Engine) HasDefinition

func (e *Engine) HasDefinition(catalogID string) bool

HasDefinition checks if a definition exists for the given catalog ID.

func (*Engine) LoadDefinitions

func (e *Engine) LoadDefinitions(defs map[string][]byte)

LoadDefinitions stores raw YAML bytes for all catalog entries. Called after the Prowlarr zip is fetched.

func (*Engine) ResolveCatalogID

func (e *Engine) ResolveCatalogID(name, url string) string

ResolveCatalogID finds the Prowlarr catalog ID for an indexer by trying, in order: (1) an exact catalog ID match; (2) a normalized name match (e.g., "The Pirate Bay" → "thepiratebay"); (3) a URL domain match (e.g., "https://1337x.to" → looks up "1337x.to").

type FieldBlock

// FieldBlock defines how to extract a single field from a result row.
//
// SelectorBlock is embedded with the ",inline" YAML option, so its keys
// (selector, text, attribute, ...) are read from the same mapping level as
// "optional" rather than from a nested key.
// NOTE(review): what happens to a missing Optional field is decided by the
// extraction code, which is not shown here.
type FieldBlock struct {
	SelectorBlock `yaml:",inline"`
	Optional      bool `yaml:"optional"`
}

FieldBlock defines how to extract a single field from a result row.

type FilterDef

// FilterDef describes a single filter in the pipeline.
//
// Name is the filter's "name" YAML value. Args holds the filter arguments in
// whatever shape the YAML provided (see the inline comment), so it is typed
// as any; any is an alias for interface{}, so existing callers are unaffected.
type FilterDef struct {
	Name string `yaml:"name"`
	Args any    `yaml:"args"` // string, []string, or []any
}

FilterDef describes a single filter in the pipeline.

type FlareSolverr

type FlareSolverr struct {
	// contains filtered or unexported fields
}

FlareSolverr is a client for the FlareSolverr Cloudflare bypass proxy.

func NewFlareSolverr

func NewFlareSolverr(apiURL string, logger *slog.Logger) *FlareSolverr

NewFlareSolverr creates a FlareSolverr client. Returns nil if URL is empty.

func (*FlareSolverr) GetSession

func (f *FlareSolverr) GetSession(domain string) (*cfSession, bool)

GetSession returns the cached CF session for a domain, if valid.

func (*FlareSolverr) PreWarm

func (f *FlareSolverr) PreWarm(ctx context.Context, urls []string)

PreWarm proactively solves Cloudflare challenges for a list of URLs. This is called at startup so that user searches hit cached sessions.

func (*FlareSolverr) Solve

func (f *FlareSolverr) Solve(ctx context.Context, targetURL string) (html string, cookies []fsCookie, userAgent string, err error)

Solve sends a URL to FlareSolverr to bypass Cloudflare and returns the page HTML, cookies, and user agent string.

type LoginBlock

// LoginBlock describes how to authenticate with a private tracker.
//
// Method is one of "post", "get", "form" or "cookie". Inputs is a string map
// read from the "inputs" key, Error is a list of selectors read from the
// "error" key, and Test optionally points at a TestBlock, which verifies that
// a login succeeded.
// NOTE(review): presumably Inputs are the submitted form fields and Error
// selectors detect a failed login — confirm against the login code.
type LoginBlock struct {
	Path   string            `yaml:"path"`
	Method string            `yaml:"method"` // post, get, form, cookie
	Inputs map[string]string `yaml:"inputs"`
	Error  []SelectorBlock   `yaml:"error"`
	Test   *TestBlock        `yaml:"test"`
}

LoginBlock describes how to authenticate with a private tracker.

type QueryContext

// QueryContext holds search query parameters. It is exposed to templates as
// TemplateContext.Query.
//
// Every field except Page is a string, including the IMDB/TMDB/TVDB IDs,
// Season, Ep and Year; Page is an int. The struct carries no YAML tags.
// NOTE(review): presumably Q is the free-text query and Type the search
// mode — confirm.
type QueryContext struct {
	Type   string
	Q      string
	IMDBID string
	TMDBID string
	TVDBID string
	Season string
	Ep     string
	Year   string
	Page   int
}

QueryContext holds search query parameters.

type RateLimitedClient

type RateLimitedClient struct {
	// contains filtered or unexported fields
}

RateLimitedClient is an HTTP client with per-host rate limiting.

func NewRateLimitedClient

func NewRateLimitedClient(delay time.Duration) *RateLimitedClient

NewRateLimitedClient creates an HTTP client with rate limiting. delay is the minimum time between requests (from the YAML requestDelay field).

func (*RateLimitedClient) ApplyCFSession

func (c *RateLimitedClient) ApplyCFSession(rawURL string, cookies []*http.Cookie, userAgent string)

ApplyCFSession applies the Cloudflare bypass cookies and user agent from a FlareSolverr session to this client; the cookies are added to its cookie jar.

func (*RateLimitedClient) Do

func (c *RateLimitedClient) Do(req *http.Request) (*http.Response, error)

Do executes an HTTP request, respecting the rate limit.

type ResponseConfig

// ResponseConfig specifies the response format. SearchPath references it
// through its optional Response field.
//
// Type is "html" (the default), "json" or "xml". NoResultsMessage is read
// from the "noResultsMessage" key.
// NOTE(review): presumably NoResultsMessage is the text a site returns for an
// empty result set — confirm.
type ResponseConfig struct {
	Type             string `yaml:"type"` // html (default), json, xml
	NoResultsMessage string `yaml:"noResultsMessage"`
}

ResponseConfig specifies the response format.

type ResultCache

type ResultCache struct {
	// contains filtered or unexported fields
}

ResultCache is a simple in-memory cache for search results.

func NewResultCache

func NewResultCache(ttl time.Duration) *ResultCache

NewResultCache creates a result cache with the given TTL.

func (*ResultCache) Cleanup

func (c *ResultCache) Cleanup()

Cleanup removes expired entries. Call periodically.

func (*ResultCache) Get

func (c *ResultCache) Get(key string) ([]SearchResult, bool)

Get returns cached results if they exist and are not expired.

func (*ResultCache) Set

func (c *ResultCache) Set(key string, results []SearchResult)

Set stores results in the cache.

type RowsBlock

// RowsBlock defines how to find result rows in the response. FindRowsHTML
// and FindRowsJSON both take one.
//
// Selector, After, Remove and Multiple are read from the YAML keys of the
// same names; Count is an optional SelectorBlock.
// NOTE(review): the exact meaning of After, Remove, Multiple and Count is
// not visible from this declaration — consult the Cardigann definition
// schema before relying on them.
type RowsBlock struct {
	Selector string         `yaml:"selector"`
	After    int            `yaml:"after"`
	Remove   string         `yaml:"remove"`
	Multiple bool           `yaml:"multiple"`
	Count    *SelectorBlock `yaml:"count,omitempty"`
}

RowsBlock defines how to find result rows in the response.

type Runner

type Runner struct {
	// contains filtered or unexported fields
}

Runner executes searches against a single indexer using its YAML definition.

func NewRunner

func NewRunner(def *Definition, settings map[string]string, flaresolverr *FlareSolverr, logger *slog.Logger) *Runner

NewRunner creates a runner for the given definition.

func (*Runner) Definition

func (r *Runner) Definition() *Definition

Definition returns the underlying definition.

func (*Runner) ResolveDownload

func (r *Runner) ResolveDownload(ctx context.Context, detailURL string) (string, error)

ResolveDownload fetches a detail page and extracts the actual download URL (magnet link or .torrent URL) using the definition's download selectors.

func (*Runner) Search

func (r *Runner) Search(ctx context.Context, query string, categories []int) ([]SearchResult, error)

Search executes a search and returns structured results.

type SearchBlock

// SearchBlock defines how to search the tracker. Definition holds it under
// the "search" YAML key.
//
//   - Paths: the search URL patterns.
//   - Inputs: a string map read from the "inputs" key.
//   - KeywordsFilters: filters for the search keywords; TemplateContext
//     documents its Keywords as the query after keywordsfilters.
//   - Rows: how to find result rows in the response.
//   - Fields: per-field extraction rules (presumably keyed by result field
//     name; TODO confirm).
type SearchBlock struct {
	Paths           []SearchPath          `yaml:"paths"`
	Inputs          map[string]string     `yaml:"inputs"`
	KeywordsFilters []FilterDef           `yaml:"keywordsfilters"`
	Rows            RowsBlock             `yaml:"rows"`
	Fields          map[string]FieldBlock `yaml:"fields"`
}

SearchBlock defines how to search the tracker.

type SearchPath

// SearchPath is a single search URL pattern, listed under SearchBlock.Paths.
//
// Path and Method are read from the keys of the same names, Inputs is a
// string map, and Response optionally specifies the response format.
// NOTE(review): how path-level Inputs combine with SearchBlock.Inputs is not
// visible from this declaration.
type SearchPath struct {
	Path     string            `yaml:"path"`
	Method   string            `yaml:"method"`
	Inputs   map[string]string `yaml:"inputs"`
	Response *ResponseConfig   `yaml:"response,omitempty"`
}

SearchPath is a single search URL pattern.

type SearchResult

// SearchResult is a single result from a search execution. Runner.Search
// returns a slice of these and ResultCache stores them.
//
// The struct carries no struct tags. Size is an int64; Seeders, Leechers and
// Grabs are ints; the two volume factors are float64; everything else,
// including Date and Category, is a string.
// NOTE(review): presumably Size is in bytes — confirm against the code that
// fills it in.
type SearchResult struct {
	Title                string
	Details              string // URL to the detail page
	Download             string // .torrent URL or magnet URI
	MagnetURI            string
	InfoHash             string
	Size                 int64
	Seeders              int
	Leechers             int
	Grabs                int
	Date                 string // RFC3339 or raw
	Category             string // Newznab category ID
	DownloadVolumeFactor float64
	UploadVolumeFactor   float64
	Description          string
	IMDBID               string
	Poster               string
}

SearchResult is a single result from a search execution.

type SelectorBlock

// SelectorBlock is the core unit for extracting data from HTML/JSON.
//
// It is embedded inline by FieldBlock and also used by RowsBlock.Count,
// DownloadBlock.Selectors and LoginBlock.Error. Filters is a list of
// FilterDef; Case is a string-to-string map.
// NOTE(review): the precedence between Selector, Text and Attribute, and the
// matching rules for Case, are not visible from this declaration — see
// ExtractFieldHTML / ExtractFieldJSON.
type SelectorBlock struct {
	Selector  string            `yaml:"selector"`
	Text      string            `yaml:"text"`
	Attribute string            `yaml:"attribute"`
	Remove    string            `yaml:"remove"`
	Filters   []FilterDef       `yaml:"filters"`
	Case      map[string]string `yaml:"case"`
}

SelectorBlock is the core unit for extracting data from HTML/JSON.

type SettingField

// SettingField is a user-configurable field in the YAML definition.
//
// Default and Options are typed as any because their shape varies between
// definitions (see the inline comment for Options). any is an alias for
// interface{}, so existing callers are unaffected.
type SettingField struct {
	Name    string `yaml:"name"`
	Type    string `yaml:"type"`
	Label   string `yaml:"label"`
	Default any    `yaml:"default"`
	Options any    `yaml:"options"` // map[string]string or []map[string]string
}

SettingField is a user-configurable field in the YAML definition. Uses interface{} for Default and Options to handle varied YAML types.

type TemplateContext

// TemplateContext holds the variables available in Cardigann templates.
// NewTemplateContext builds one, and EvalTemplate and EvalTemplateOr evaluate
// template strings against it.
//
// NOTE(review): True and False are presumably fixed boolean values for
// templates to reference — confirm how NewTemplateContext initialises them.
type TemplateContext struct {
	Config     map[string]string // user settings (sort, downloadlink, etc.)
	Keywords   string            // search query (after keywordsfilters)
	Query      QueryContext
	Categories []string          // site category IDs
	Result     map[string]string // for self-referencing fields
	True       bool
	False      bool
	Today      TodayContext
}

TemplateContext holds the variables available in Cardigann templates.

func NewTemplateContext

func NewTemplateContext(config map[string]string, query string) *TemplateContext

NewTemplateContext creates a context with sensible defaults.

type TestBlock

// TestBlock verifies a login succeeded. LoginBlock references it through its
// optional Test field.
//
// NOTE(review): presumably Path is the page to request and Selector must
// match on it for the login to count as successful — confirm.
type TestBlock struct {
	Path     string `yaml:"path"`
	Selector string `yaml:"selector"`
}

TestBlock verifies a login succeeded.

type TodayContext

// TodayContext provides date helpers. It is exposed to templates as
// TemplateContext.Today and currently carries only an integer Year.
// NOTE(review): presumably Year is the current calendar year at context
// creation — confirm in NewTemplateContext.
type TodayContext struct {
	Year int
}

TodayContext provides date helpers.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL