web

package
v0.1.2 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 24, 2025 | License: MIT | Imports: 5 | Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func RegisterAdapterCreator

func RegisterAdapterCreator(scraperName string, creator AdapterCreatorFunc)

RegisterAdapterCreator registers an adapter creator function for a given scraper name.

func RegisterAllWebAdapters

func RegisterAllWebAdapters() error

RegisterAllWebAdapters registers all web adapters based on the application configuration.

func RegisterWebAdapter

func RegisterWebAdapter(engine WebScraper, factory AdapterFactory)

RegisterWebAdapter registers an adapter factory for the given scraping engine in the global registry.

Types

type AdapterCreatorFunc

// AdapterCreatorFunc is a function type that takes a types.WebScraperConfig
// and returns a WebAdapter, or an error.
type AdapterCreatorFunc func(scraperConfig types.WebScraperConfig) (WebAdapter, error)

AdapterCreatorFunc defines the function signature for an adapter creator.

type AdapterFactory

// AdapterFactory is a function type that takes no arguments and returns a
// WebAdapter, or an error.
type AdapterFactory func() (WebAdapter, error)

AdapterFactory defines the function signature for a web adapter factory.

type DefaultWebStrategy

type DefaultWebStrategy struct {
	// contains filtered or unexported fields
}

DefaultWebStrategy is the default implementation of the web scraping strategy.

func NewDefaultWebStrategy

func NewDefaultWebStrategy(config *WebStrategyConfig) *DefaultWebStrategy

NewDefaultWebStrategy creates a new default web scraping strategy.

func (*DefaultWebStrategy) Execute

func (s *DefaultWebStrategy) Execute(ctx context.Context, url string, options *ScrapeOptions) (*WebContent, error)

Execute executes the web scraping strategy for a single URL.

func (*DefaultWebStrategy) ExecuteMultiple

func (s *DefaultWebStrategy) ExecuteMultiple(ctx context.Context, urls []string, options *ScrapeOptions) ([]*WebContent, error)

ExecuteMultiple executes the web scraping strategy for multiple URLs.

type Format

// Format is a named string type; its declared values are the Format*
// constants (FormatText, FormatHTML, FormatMarkdown).
type Format string

Format is a string-based enumeration of scraping formats; the defined values are FormatText, FormatHTML, and FormatMarkdown.

// Declared values of the Format type.
const (
	FormatText     Format = "text"     // underlying string "text"
	FormatHTML     Format = "html"     // underlying string "html"
	FormatMarkdown Format = "markdown" // underlying string "markdown"
)

func (Format) IsValid

func (f Format) IsValid() bool

IsValid checks if the format is valid.

func (Format) String

func (f Format) String() string

String implements the fmt.Stringer interface.

type Image

// Image holds an image's URL and an optional title.
type Image struct {
	URL   string `json:"url"`             // Encoded under the JSON key "url"
	Title string `json:"title,omitempty"` // Encoded under "title"; omitted from JSON when empty
}

Image represents an image, identified by its URL and an optional title.

type Link

// Link holds a hyperlink's URL and its text.
type Link struct {
	URL  string `json:"url"`  // Encoded under the JSON key "url"
	Text string `json:"text"` // Encoded under the JSON key "text"; no omitempty, so always present
}

Link represents a hyperlink, carrying its URL and its text.

type ScrapeOptions

// ScrapeOptions holds the per-request scraping options.
// Every field is tagged omitempty, so zero-valued fields are left out of the
// JSON encoding.
type ScrapeOptions struct {
	// Basic options
	Format        Format            `json:"format,omitempty"`         // Scraping format (one of the Format values)
	Timeout       int               `json:"timeout,omitempty"`        // Timeout in seconds
	Cookies       map[string]string `json:"cookies,omitempty"`        // Cookies to use for the request
	ImagesSummary bool              `json:"images_summary,omitempty"` // Whether to include a summary of unique images at the end
	LinksSummary  bool              `json:"links_summary,omitempty"`  // Whether to include a summary of unique links at the end
}

ScrapeOptions holds the options for a scraping request.

type WebAdapter

// WebAdapter declares the two scraping operations an adapter must provide:
// one for a single URL and one for a list of URLs.
type WebAdapter interface {
	// Scrape is the core function for scraping a single page.
	// It returns the page's WebContent, or an error.
	Scrape(ctx context.Context, url string, options *ScrapeOptions) (*WebContent, error)

	// ScrapeMultiple scrapes multiple pages concurrently.
	// It returns a slice of WebContent pointers, or an error.
	ScrapeMultiple(ctx context.Context, urls []string, options *ScrapeOptions) ([]*WebContent, error)
}

WebAdapter is the interface for a web adapter.

func CreateWebAdapter

func CreateWebAdapter(engine WebScraper) (WebAdapter, error)

CreateWebAdapter creates a web adapter instance from the global registry.

type WebContent

// WebContent is the result type returned by the scraping methods: a page's
// URL, title, and main content, plus optional extracted links and images.
type WebContent struct {
	// Basic information
	URL     string `json:"url"`
	Title   string `json:"title"`
	Content string `json:"content"` // Main content

	// Extracted structured data
	Links  []Link  `json:"links,omitempty"`  // Omitted from JSON when empty
	Images []Image `json:"images,omitempty"` // Omitted from JSON when empty
}

WebContent is a simplified representation of a scraped web page: its URL, title, main content, and any extracted links and images.

type WebRegistry

type WebRegistry struct {
	// contains filtered or unexported fields
}

WebRegistry is a registry for web adapters.

func NewWebRegistry

func NewWebRegistry() *WebRegistry

NewWebRegistry creates a new web adapter registry.

func (*WebRegistry) Create

func (r *WebRegistry) Create(scraper WebScraper) (WebAdapter, error)

Create creates a web adapter instance.

func (*WebRegistry) Register

func (r *WebRegistry) Register(scraper WebScraper, factory AdapterFactory)

Register registers a web adapter factory.

type WebScraper

// WebScraper is a named string type. It is the value passed to
// RegisterWebAdapter, CreateWebAdapter, and the WebRegistry methods to say
// which scraper an adapter factory is registered or created for.
type WebScraper string

WebScraper represents a web scraping engine identifier.

func (WebScraper) String

func (e WebScraper) String() string

String implements the fmt.Stringer interface.

type WebStrategy

// WebStrategy declares the two operations a scraping strategy must provide:
// one for a single URL and one for a list of URLs.
type WebStrategy interface {
	// Execute executes the web scraping strategy for a single URL.
	// It returns the resulting WebContent, or an error.
	Execute(ctx context.Context, url string, options *ScrapeOptions) (*WebContent, error)

	// ExecuteMultiple executes the web scraping strategy for multiple URLs.
	// It returns a slice of WebContent pointers, or an error.
	ExecuteMultiple(ctx context.Context, urls []string, options *ScrapeOptions) ([]*WebContent, error)
}

WebStrategy defines the interface for a web scraping strategy.

func NewDefaultWebStrategyFromConfig

func NewDefaultWebStrategyFromConfig() (WebStrategy, error)

NewDefaultWebStrategyFromConfig creates a default web scraping strategy from the global configuration.

type WebStrategyConfig

// WebStrategyConfig holds the settings passed to NewDefaultWebStrategy.
type WebStrategyConfig struct {
	// DefaultScraper is the default scraper to use if no list is specified.
	// Omitted from JSON when empty.
	DefaultScraper WebScraper `json:"default_scraper,omitempty"`

	// DefaultFallbackOrder is the default order of scrapers to try.
	// Omitted from JSON when empty.
	DefaultFallbackOrder []WebScraper `json:"default_fallback_order,omitempty"`

	// EnableFallback determines whether to try the next scraper on failure.
	// The tag has no omitempty, so the field is always present in JSON.
	EnableFallback bool `json:"enable_fallback"`

	// FailFast determines whether to stop immediately after the first failure.
	// The tag has no omitempty, so the field is always present in JSON.
	// NOTE(review): how FailFast interacts with EnableFallback when both are
	// true is not visible here — confirm against the strategy implementation.
	FailFast bool `json:"fail_fast"`
}

WebStrategyConfig holds the configuration for a web scraping strategy.

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL