Documentation
¶
Overview ¶
Package scraper provides interfaces to interact with the scraper subsystem. The Cache type is the main entry point to the scraper subsystem.
Index ¶
- Constants
- Variables
- func CompileExclusionRegexps(patterns []string) []*regexp.Regexp
- func FilterTags(excludeRegexps []*regexp.Regexp, tags []*models.ScrapedTag) (newTags []*models.ScrapedTag, ignoredTags []string)
- func LogIgnoredTags(ignoredTags []string)
- type Cache
- func (c Cache) GetScraper(scraperID string) *Scraper
- func (c Cache) ListScrapers(tys []ScrapeContentType) []*Scraper
- func (c *Cache) ReloadScrapers()
- func (c Cache) ScrapeFragment(ctx context.Context, id string, input Input) (ScrapedContent, error)
- func (c Cache) ScrapeID(ctx context.Context, scraperID string, id int, ty ScrapeContentType) (ScrapedContent, error)
- func (c Cache) ScrapeName(ctx context.Context, id, query string, ty ScrapeContentType) ([]ScrapedContent, error)
- func (c Cache) ScrapeURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error)
- type GalleryFinder
- type GlobalConfig
- type ImageFinder
- type Input
- type PerformerFinder
- type QueryType
- type Repository
- type SceneFinder
- type ScrapeContentType
- type ScrapeType
- type ScrapedContent
- type ScrapedMovieInput
- type ScrapedPerformerInput
- type Scraper
- type ScraperSpec
- type Source
- type StudioFinder
- type TagFinder
Constants ¶
const FreeonesScraperID = "builtin_freeones"
FreeonesScraperID is the scraper ID for the built-in Freeones scraper
Variables ¶
var (
	// ErrMaxRedirects is returned if the max number of HTTP redirects are reached.
	ErrMaxRedirects = errors.New("maximum number of HTTP redirects reached")
	// ErrNotFound is returned when an entity isn't found
	ErrNotFound = errors.New("scraper not found")
	// ErrNotSupported is returned when a given invocation isn't supported, and there
	// is a guard function which should be able to guard against it.
	ErrNotSupported = errors.New("scraper operation not supported")
)
var AllScrapeContentType = []ScrapeContentType{
	ScrapeContentTypeGallery,
	ScrapeContentTypeMovie,
	ScrapeContentTypeGroup,
	ScrapeContentTypePerformer,
	ScrapeContentTypeScene,
	ScrapeContentTypeImage,
}
var AllScrapeType = []ScrapeType{
	ScrapeTypeName,
	ScrapeTypeFragment,
	ScrapeTypeURL,
}
var ErrScraperScript = errors.New("scraper script error")
Functions ¶
func CompileExclusionRegexps ¶ added in v0.28.0
CompileExclusionRegexps compiles a list of tag exclusion patterns into a list of regular expressions
func FilterTags ¶ added in v0.28.0
func FilterTags(excludeRegexps []*regexp.Regexp, tags []*models.ScrapedTag) (newTags []*models.ScrapedTag, ignoredTags []string)
FilterTags removes tags matching excluded tag patterns from the list of scraped tags. It returns the filtered list of tags and a list of the excluded tags.
func LogIgnoredTags ¶ added in v0.28.0
func LogIgnoredTags(ignoredTags []string)
LogIgnoredTags logs the list of ignored tags
Types ¶
type Cache ¶ added in v0.3.0
type Cache struct {
// contains filtered or unexported fields
}
Cache stores the database of scrapers
func NewCache ¶ added in v0.3.0
func NewCache(globalConfig GlobalConfig, repo Repository) *Cache
NewCache returns a new Cache.
Scraper configurations are loaded from yml files in the scrapers directory in the config and any subdirectories.
Does not load scrapers. Scrapers will need to be loaded explicitly using ReloadScrapers.
func (Cache) GetScraper ¶ added in v0.11.0
GetScraper returns the scraper matching the provided id.
func (Cache) ListScrapers ¶ added in v0.12.0
func (c Cache) ListScrapers(tys []ScrapeContentType) []*Scraper
ListScrapers lists scrapers matching one of the given types. Returns a list of scrapers, sorted by their name.
func (*Cache) ReloadScrapers ¶ added in v0.3.0
func (c *Cache) ReloadScrapers()
ReloadScrapers clears the scraper cache and reloads from the scraper path. If a scraper cannot be loaded, an error is logged and the scraper is skipped.
func (Cache) ScrapeFragment ¶ added in v0.12.0
ScrapeFragment uses the given fragment input to scrape
func (Cache) ScrapeID ¶ added in v0.12.0
func (c Cache) ScrapeID(ctx context.Context, scraperID string, id int, ty ScrapeContentType) (ScrapedContent, error)
func (Cache) ScrapeName ¶ added in v0.12.0
func (c Cache) ScrapeName(ctx context.Context, id, query string, ty ScrapeContentType) ([]ScrapedContent, error)
func (Cache) ScrapeURL ¶ added in v0.12.0
func (c Cache) ScrapeURL(ctx context.Context, url string, ty ScrapeContentType) (ScrapedContent, error)
ScrapeURL scrapes a given url for the given content. Searches the scraper cache and picks the first scraper capable of scraping the given url into the desired content. Returns the scraped content or an error if the scrape fails.
type GalleryFinder ¶ added in v0.17.0
type GalleryFinder interface {
models.GalleryGetter
models.FileLoader
models.URLLoader
}
type GlobalConfig ¶ added in v0.3.0
type GlobalConfig interface {
GetScraperUserAgent() string
GetScrapersPath() string
GetScraperCDPPath() string
GetScraperCertCheck() bool
GetPythonPath() string
GetProxy() string
GetScraperExcludeTagPatterns() []string
}
GlobalConfig contains the global scraper options.
type ImageFinder ¶ added in v0.28.0
type ImageFinder interface {
models.ImageGetter
models.FileLoader
models.URLLoader
}
type Input ¶ added in v0.12.0
type Input struct {
Performer *ScrapedPerformerInput
Scene *models.ScrapedSceneInput
Gallery *models.ScrapedGalleryInput
Image *models.ScrapedImageInput
}
Input coalesces inputs of different types into a single structure. The system expects one of these to be set, and the remaining to be set to nil.
type PerformerFinder ¶ added in v0.17.0
type PerformerFinder interface {
models.PerformerAutoTagQueryer
match.PerformerFinder
}
type QueryType ¶ added in v0.12.0
type QueryType int
Simple type definitions that can help customize actions per query.
type Repository ¶ added in v0.17.0
type Repository struct {
TxnManager models.TxnManager
SceneFinder SceneFinder
GalleryFinder GalleryFinder
ImageFinder ImageFinder
TagFinder TagFinder
PerformerFinder PerformerFinder
GroupFinder match.GroupNamesFinder
StudioFinder StudioFinder
}
func NewRepository ¶ added in v0.24.0
func NewRepository(repo models.Repository) Repository
func (*Repository) WithReadTxn ¶ added in v0.24.0
type SceneFinder ¶ added in v0.22.0
type SceneFinder interface {
models.SceneGetter
models.URLLoader
models.VideoFileLoader
}
type ScrapeContentType ¶ added in v0.17.0
type ScrapeContentType string
Type of the content a scraper generates
const (
	ScrapeContentTypeGallery   ScrapeContentType = "GALLERY"
	ScrapeContentTypeMovie     ScrapeContentType = "MOVIE"
	ScrapeContentTypeGroup     ScrapeContentType = "GROUP"
	ScrapeContentTypePerformer ScrapeContentType = "PERFORMER"
	ScrapeContentTypeScene     ScrapeContentType = "SCENE"
	ScrapeContentTypeImage     ScrapeContentType = "IMAGE"
)
func (ScrapeContentType) IsValid ¶ added in v0.17.0
func (e ScrapeContentType) IsValid() bool
func (ScrapeContentType) MarshalGQL ¶ added in v0.17.0
func (e ScrapeContentType) MarshalGQL(w io.Writer)
func (ScrapeContentType) String ¶ added in v0.17.0
func (e ScrapeContentType) String() string
func (*ScrapeContentType) UnmarshalGQL ¶ added in v0.17.0
func (e *ScrapeContentType) UnmarshalGQL(v interface{}) error
type ScrapeType ¶ added in v0.17.0
type ScrapeType string
const (
	// From text query
	ScrapeTypeName ScrapeType = "NAME"
	// From existing object
	ScrapeTypeFragment ScrapeType = "FRAGMENT"
	// From URL
	ScrapeTypeURL ScrapeType = "URL"
)
func (ScrapeType) IsValid ¶ added in v0.17.0
func (e ScrapeType) IsValid() bool
func (ScrapeType) MarshalGQL ¶ added in v0.17.0
func (e ScrapeType) MarshalGQL(w io.Writer)
func (ScrapeType) String ¶ added in v0.17.0
func (e ScrapeType) String() string
func (*ScrapeType) UnmarshalGQL ¶ added in v0.17.0
func (e *ScrapeType) UnmarshalGQL(v interface{}) error
type ScrapedContent ¶ added in v0.17.0
type ScrapedContent interface {
IsScrapedContent()
}
ScrapedContent is the forming union over the different scraper result types.
type ScrapedMovieInput ¶ added in v0.17.0
type ScrapedMovieInput struct {
Name *string `json:"name"`
Aliases *string `json:"aliases"`
Duration *string `json:"duration"`
Date *string `json:"date"`
Rating *string `json:"rating"`
Director *string `json:"director"`
URLs []string `json:"urls"`
Synopsis *string `json:"synopsis"`
// deprecated
URL *string `json:"url"`
}
type ScrapedPerformerInput ¶ added in v0.17.0
type ScrapedPerformerInput struct {
// Set if performer matched
StoredID *string `json:"stored_id"`
Name *string `json:"name"`
Disambiguation *string `json:"disambiguation"`
Gender *string `json:"gender"`
URLs []string `json:"urls"`
URL *string `json:"url"` // deprecated
Twitter *string `json:"twitter"` // deprecated
Instagram *string `json:"instagram"` // deprecated
Birthdate *string `json:"birthdate"`
Ethnicity *string `json:"ethnicity"`
Country *string `json:"country"`
EyeColor *string `json:"eye_color"`
Height *string `json:"height"`
Measurements *string `json:"measurements"`
FakeTits *string `json:"fake_tits"`
PenisLength *string `json:"penis_length"`
Circumcised *string `json:"circumcised"`
CareerLength *string `json:"career_length"`
Tattoos *string `json:"tattoos"`
Piercings *string `json:"piercings"`
Aliases *string `json:"aliases"`
Details *string `json:"details"`
DeathDate *string `json:"death_date"`
HairColor *string `json:"hair_color"`
Weight *string `json:"weight"`
RemoteSiteID *string `json:"remote_site_id"`
}
type Scraper ¶ added in v0.17.0
type Scraper struct {
ID string `json:"id"`
Name string `json:"name"`
// Details for performer scraper
Performer *ScraperSpec `json:"performer"`
// Details for scene scraper
Scene *ScraperSpec `json:"scene"`
// Details for gallery scraper
Gallery *ScraperSpec `json:"gallery"`
// Details for image scraper
Image *ScraperSpec `json:"image"`
	// Details for group scraper
Group *ScraperSpec `json:"group"`
// Details for movie scraper
Movie *ScraperSpec `json:"movie"`
}
type ScraperSpec ¶ added in v0.17.0
type ScraperSpec struct {
	// URLs matching these can be scraped with this scraper
Urls []string `json:"urls"`
SupportedScrapes []ScrapeType `json:"supported_scrapes"`
}
type Source ¶ added in v0.17.0
type Source struct {
// Index of the configured stash-box instance to use. Should be unset if scraper_id is set
StashBoxIndex *int `json:"stash_box_index"`
// Stash-box endpoint
StashBoxEndpoint *string `json:"stash_box_endpoint"`
// Scraper ID to scrape with. Should be unset if stash_box_index is set
ScraperID *string `json:"scraper_id"`
}