Documentation ¶
Index ¶
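- func RegisterAdapterCreator(scraperName string, creator AdapterCreatorFunc)
- func RegisterAllWebAdapters() error
- func RegisterWebAdapter(engine WebScraper, factory AdapterFactory)
- type AdapterCreatorFunc
- type AdapterFactory
- type DefaultWebStrategy
  - func NewDefaultWebStrategy(config *WebStrategyConfig) *DefaultWebStrategy
  - func (s *DefaultWebStrategy) Execute(ctx context.Context, url string, options *ScrapeOptions) (*WebContent, error)
  - func (s *DefaultWebStrategy) ExecuteMultiple(ctx context.Context, urls []string, options *ScrapeOptions) ([]*WebContent, error)
- type Format
- type Image
- type Link
- type ScrapeOptions
- type WebAdapter
  - func CreateWebAdapter(engine WebScraper) (WebAdapter, error)
- type WebContent
- type WebRegistry
  - func NewWebRegistry() *WebRegistry
  - func (r *WebRegistry) Create(scraper WebScraper) (WebAdapter, error)
  - func (r *WebRegistry) Register(scraper WebScraper, factory AdapterFactory)
- type WebScraper
  - func (e WebScraper) String() string
- type WebStrategy
  - func NewDefaultWebStrategyFromConfig() (WebStrategy, error)
- type WebStrategyConfig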
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func RegisterAdapterCreator ¶
func RegisterAdapterCreator(scraperName string, creator AdapterCreatorFunc)
RegisterAdapterCreator registers an adapter creator function for a given scraper name.
func RegisterAllWebAdapters ¶
func RegisterAllWebAdapters() error
RegisterAllWebAdapters registers all web adapters based on the application configuration.
func RegisterWebAdapter ¶
func RegisterWebAdapter(engine WebScraper, factory AdapterFactory)
RegisterWebAdapter registers a web adapter factory for the given scraping engine in the global registry.
Types ¶
type AdapterCreatorFunc ¶
type AdapterCreatorFunc func(scraperConfig types.WebScraperConfig) (WebAdapter, error)
AdapterCreatorFunc defines the function signature for an adapter creator.
type AdapterFactory ¶
type AdapterFactory func() (WebAdapter, error)
AdapterFactory defines the function signature for a web adapter factory.
type DefaultWebStrategy ¶
type DefaultWebStrategy struct {
// contains filtered or unexported fields
}
DefaultWebStrategy is the default implementation of the web scraping strategy.
func NewDefaultWebStrategy ¶
func NewDefaultWebStrategy(config *WebStrategyConfig) *DefaultWebStrategy
NewDefaultWebStrategy creates a new default web scraping strategy.
func (*DefaultWebStrategy) Execute ¶
func (s *DefaultWebStrategy) Execute(ctx context.Context, url string, options *ScrapeOptions) (*WebContent, error)
Execute executes the web scraping strategy for a single URL.
func (*DefaultWebStrategy) ExecuteMultiple ¶
func (s *DefaultWebStrategy) ExecuteMultiple(ctx context.Context, urls []string, options *ScrapeOptions) ([]*WebContent, error)
ExecuteMultiple executes the web scraping strategy for multiple URLs.
type ScrapeOptions ¶
type ScrapeOptions struct {
// Basic options
Format Format `json:"format,omitempty"` // Scraping format (one of the Format enumeration values)
Timeout int `json:"timeout,omitempty"` // Timeout in seconds
Cookies map[string]string `json:"cookies,omitempty"` // Cookies to use for the request
ImagesSummary bool `json:"images_summary,omitempty"` // Whether to include a summary of unique images at the end
LinksSummary bool `json:"links_summary,omitempty"` // Whether to include a summary of unique links at the end
}
ScrapeOptions holds the options for a scraping request.
type WebAdapter ¶
type WebAdapter interface {
// Scrape is the core function for scraping a single page.
Scrape(ctx context.Context, url string, options *ScrapeOptions) (*WebContent, error)
// ScrapeMultiple scrapes multiple pages concurrently.
ScrapeMultiple(ctx context.Context, urls []string, options *ScrapeOptions) ([]*WebContent, error)
}
WebAdapter is the interface for a web adapter.
func CreateWebAdapter ¶
func CreateWebAdapter(engine WebScraper) (WebAdapter, error)
CreateWebAdapter creates a web adapter instance from the global registry.
type WebContent ¶
type WebContent struct {
// Basic information
URL string `json:"url"`
Title string `json:"title"`
Content string `json:"content"` // Main content
// Extracted structured data
Links []Link `json:"links,omitempty"`
Images []Image `json:"images,omitempty"`
}
WebContent represents the content of a web page (simplified response).
type WebRegistry ¶
type WebRegistry struct {
// contains filtered or unexported fields
}
WebRegistry is a registry for web adapters.
func NewWebRegistry ¶
func NewWebRegistry() *WebRegistry
NewWebRegistry creates a new web adapter registry.
func (*WebRegistry) Create ¶
func (r *WebRegistry) Create(scraper WebScraper) (WebAdapter, error)
Create creates a web adapter instance for the given scraper.
func (*WebRegistry) Register ¶
func (r *WebRegistry) Register(scraper WebScraper, factory AdapterFactory)
Register registers a web adapter factory for the given scraper.
type WebScraper ¶
type WebScraper string
WebScraper represents a web scraping engine identifier.
func (WebScraper) String ¶
func (e WebScraper) String() string
String implements the fmt.Stringer interface.
type WebStrategy ¶
type WebStrategy interface {
// Execute executes the web scraping strategy for a single URL.
Execute(ctx context.Context, url string, options *ScrapeOptions) (*WebContent, error)
// ExecuteMultiple executes the web scraping strategy for multiple URLs.
ExecuteMultiple(ctx context.Context, urls []string, options *ScrapeOptions) ([]*WebContent, error)
}
WebStrategy defines the interface for a web scraping strategy.
func NewDefaultWebStrategyFromConfig ¶
func NewDefaultWebStrategyFromConfig() (WebStrategy, error)
NewDefaultWebStrategyFromConfig creates a default web scraping strategy from the global configuration.
type WebStrategyConfig ¶
type WebStrategyConfig struct {
// DefaultScraper is the default scraper to use if no list is specified.
DefaultScraper WebScraper `json:"default_scraper,omitempty"`
// DefaultFallbackOrder is the default order of scrapers to try.
DefaultFallbackOrder []WebScraper `json:"default_fallback_order,omitempty"`
// EnableFallback determines whether to try the next scraper on failure.
EnableFallback bool `json:"enable_fallback"`
// FailFast determines whether to stop immediately after the first failure.
FailFast bool `json:"fail_fast"`
}
WebStrategyConfig holds the configuration for a web scraping strategy.