Documentation
¶
Overview ¶
Package scraper implements a library for scraping web pages.
Index ¶
- Constants
- func ChromeUnmarshal(ctx context.Context, v interface{}, cssSelector string, opt UnmarshalOption) error
- func ExampleDataExtraction(scraper UnifiedScraper, url string) error
- func ExampleFileDownload(scraper UnifiedScraper) error
- func ExampleFormAutomation(scraper UnifiedScraper) error
- func ExampleSwitchableScraper(useChrome bool)
- func ExampleUnifiedScraping()
- func ExtractData(scraper UnifiedScraper, url string, selector string, v interface{}, ...) error
- func ExtractNumber(in string) (float64, error)
- func UnifiedUnmarshal(scraper UnifiedScraper, v interface{}, selector string, opt UnmarshalOption) error
- func UnifiedUnmarshalFromSelection(selection *goquery.Selection, v interface{}, opt UnmarshalOption) error
- func UnifiedUnmarshalWithContext(ctx context.Context, scraper UnifiedScraper, v interface{}, selector string, ...) error
- func Unmarshal(v interface{}, selection *goquery.Selection, opt UnmarshalOption) error
- type AvailableValue
- type BufferedLogger
- type ChromeSession
- func (session *ChromeSession) ActionCtx() (context.Context, context.CancelFunc)
- func (chromeSession *ChromeSession) ClearDebugStep()
- func (chromeSession *ChromeSession) Click(selector string) chromedp.ActionFunc
- func (chromeSession *ChromeSession) DoClick(selector string) error
- func (chromeSession *ChromeSession) DoNavigate(url string) error
- func (chromeSession *ChromeSession) DoSendKeys(selector, value string) error
- func (chromeSession *ChromeSession) DoSleep(duration time.Duration) error
- func (chromeSession *ChromeSession) DoWaitVisible(selector string) error
- func (session *ChromeSession) DownloadFile(filename *string, options DownloadFileOptions, actions ...chromedp.Action) chromedp.ActionFunc
- func (chromeSession *ChromeSession) DownloadResource(options UnifiedDownloadOptions) (string, error)
- func (chromeSession *ChromeSession) ExtractData(v interface{}, selector string, opt UnmarshalOption) error
- func (chromeSession *ChromeSession) FollowAnchor(text string) error
- func (session *ChromeSession) GetCurrentURL() (string, error)
- func (chromeSession *ChromeSession) GetDebugStep() string
- func (chromeSession *ChromeSession) Navigate(url string) chromedp.ActionFunc
- func (chromeSession *ChromeSession) Printf(format string, a ...interface{})
- func (session *ChromeSession) RunNavigate(URL string) (*network.Response, error)
- func (session *ChromeSession) SaveFile(filename *string) chromedp.ActionFunc
- func (session *ChromeSession) SaveHtml(filename *string) chromedp.Action
- func (session *ChromeSession) SaveLastHtmlSnapshot() error
- func (chromeSession *ChromeSession) SavePage() (string, error)
- func (chromeSession *ChromeSession) SendKeys(selector, value string) chromedp.ActionFunc
- func (chromeSession *ChromeSession) SetDebugStep(step string)
- func (chromeSession *ChromeSession) Sleep(duration time.Duration) chromedp.ActionFunc
- func (chromeSession *ChromeSession) SubmitForm(formSelector string, params map[string]string) error
- func (session *ChromeSession) Unmarshal(v interface{}, cssSelector string, opt UnmarshalOption) error
- func (chromeSession *ChromeSession) WaitVisible(selector string) chromedp.ActionFunc
- type ChromeTimeoutError
- type ConsoleLogger
- type DownloadFileOptions
- type DownloadedFileNameNotSatisfiedError
- type FollowAnchorTextOption
- type Form
- func (form *Form) Check(name string) error
- func (form *Form) NumSelect(name string) (int, error)
- func (form *Form) PrintSelection(name string) error
- func (form *Form) Select(name string, index int) error
- func (form *Form) Set(name string, value string) error
- func (form *Form) SetByLabel(name string, label string) error
- func (form *Form) SetForce(name string, value string) error
- func (form *Form) Uncheck(name string) error
- func (form *Form) Unset(name string) error
- func (form *Form) ValueByLabel(name string, label string) (string, error)
- type FormElement
- type FormElementNotFoundError
- type Logger
- type LoginError
- type MaintenanceError
- type NewChromeOptions
- type Page
- type PageMetadata
- type PageOption
- type RequestError
- type Response
- type ResponseError
- type RetryAndRecordError
- type ScraperType
- type Session
- func (session *Session) ApplyRefresh(page *Page, maxRedirect int) (*Page, error)
- func (session *Session) ClearDebugStep()
- func (session *Session) Click(selector string) error
- func (session *Session) Cookies(u *url.URL) []*http.Cookie
- func (session *Session) DoClick(selector string) error
- func (session *Session) DoNavigate(url string) error
- func (session *Session) DoSendKeys(selector, value string) error
- func (session *Session) DoWaitVisible(selector string) error
- func (session *Session) DownloadResource(options UnifiedDownloadOptions) (string, error)
- func (session *Session) ExtractData(v interface{}, selector string, opt UnmarshalOption) error
- func (session *Session) FollowAnchor(text string) error
- func (session *Session) FollowAnchorText(page *Page, text string) (*Response, error)
- func (session *Session) FollowAnchorTextOpt(page *Page, text string, opt FollowAnchorTextOption) (*Response, error)
- func (session *Session) FollowLink(page *Page, linkSelector string, attr string) (*Response, error)
- func (session *Session) FollowSelectionLink(page *Page, selection *goquery.Selection, attr string) (*Response, error)
- func (session *Session) FormAction(page *Page, formSelector string, params map[string]string) (*Response, error)
- func (session *Session) Frame(page *Page, frameSelector string) (*Page, error)
- func (session *Session) Get(getUrl string) (*Response, error)
- func (session *Session) GetCurrentURL() (string, error)
- func (session *Session) GetDebugStep() string
- func (session *Session) GetDirectory() string
- func (session *Session) GetPage(getUrl string) (*Page, error)
- func (session *Session) GetPageMaxRedirect(getUrl string, maxRedirect int) (*Page, error)
- func (session *Session) LoadCookie() error
- func (session *Session) Navigate(url string) error
- func (session *Session) NewChrome() (*ChromeSession, context.CancelFunc, error)
- func (session *Session) NewChromeOpt(options NewChromeOptions) (chromeSession *ChromeSession, cancelFunc context.CancelFunc, err error)
- func (session *Session) OpenURL(page *Page, url string) (*Response, error)
- func (session *Session) Printf(format string, a ...interface{})
- func (session *Session) SaveCookie() error
- func (session *Session) SavePage() (string, error)
- func (session *Session) SendKeys(selector, value string) error
- func (session *Session) SetCookies(u *url.URL, cookies []*http.Cookie)
- func (session *Session) SetDebugStep(step string)
- func (session *Session) Submit(form *Form) (*Response, error)
- func (session *Session) SubmitForm(formSelector string, params map[string]string) error
- func (session *Session) SubmitOpt(form *Form, imageId string) (*Response, error)
- func (session *Session) WaitVisible(selector string) error
- type UnexpectedContentTypeError
- type UnifiedDownloadOptions
- type UnifiedScraper
- type UnmarshalFieldError
- type UnmarshalMustBePointerError
- type UnmarshalOption
- type UnmarshalParseNumberError
- type UnmarshalUnexportedFieldError
- type Unmarshaller
Constants ¶
const (
DefaultFilePermission = 0644 // Read/write for owner, read for group and others
DefaultDirPermission = 0755 // Read/write/execute for owner, read/execute for group and others
)
File permission constants
const (
// DefaultTimeout is the default timeout for navigation and element waiting
DefaultTimeout = 30 * time.Second
// DefaultDownloadTimeout is the default timeout for file downloads
DefaultDownloadTimeout = 60 * time.Second
// DefaultFormTimeout is the default timeout for form operations
DefaultFormTimeout = 15 * time.Second
)
const (
//UserAgent_Chrome39 = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.99 Safari/537.36"
//UserAgent_iOS8 = "Mozilla/5.0 (iPhone; CPU iPhone OS 8_1_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Mobile/12B466"
UserAgent_firefox86 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0"
UserAgent_default = UserAgent_firefox86
)
const (
DefaultBlankURL = "about:blank"
)
URL constants for replay mode
const MetadataFileExtension = ".meta"
Variables ¶
This section is empty.
Functions ¶
func ChromeUnmarshal ¶ added in v0.0.16
func ChromeUnmarshal(ctx context.Context, v interface{}, cssSelector string, opt UnmarshalOption) error
func ExampleDataExtraction ¶ added in v0.0.42
func ExampleDataExtraction(scraper UnifiedScraper, url string) error
ExampleDataExtraction demonstrates advanced data extraction patterns
func ExampleFileDownload ¶ added in v0.0.42
func ExampleFileDownload(scraper UnifiedScraper) error
ExampleFileDownload shows how to handle downloads with the unified interface
func ExampleFormAutomation ¶ added in v0.0.42
func ExampleFormAutomation(scraper UnifiedScraper) error
ExampleFormAutomation shows unified form handling
func ExampleSwitchableScraper ¶ added in v0.0.42
func ExampleSwitchableScraper(useChrome bool)
ExampleSwitchableScraper demonstrates how to easily switch between scraper types based on configuration or runtime conditions.
func ExampleUnifiedScraping ¶ added in v0.0.42
func ExampleUnifiedScraping()
ExampleUnifiedScraping demonstrates how to use the unified scraper interface to write code that works with both HTTP-based and Chrome-based scraping.
func ExtractData ¶ added in v0.0.42
func ExtractData(scraper UnifiedScraper, url string, selector string, v interface{}, opt UnmarshalOption) error
ExtractData is a convenience function that combines navigation and data extraction in a single call, working with both scraper types.
func ExtractNumber ¶
func UnifiedUnmarshal ¶ added in v0.0.42
func UnifiedUnmarshal(scraper UnifiedScraper, v interface{}, selector string, opt UnmarshalOption) error
UnifiedUnmarshal provides a unified interface for data extraction that works with both traditional HTTP-based scraping and Chrome-based browser automation. It automatically detects the scraper type and uses the appropriate unmarshal function.
func UnifiedUnmarshalFromSelection ¶ added in v0.0.42
func UnifiedUnmarshalFromSelection(selection *goquery.Selection, v interface{}, opt UnmarshalOption) error
UnifiedUnmarshalFromSelection provides a unified interface for unmarshaling from a goquery.Selection. This is primarily for HTTP-based scraping but can be useful when you already have HTML content extracted from Chrome.
func UnifiedUnmarshalWithContext ¶ added in v0.0.42
func UnifiedUnmarshalWithContext(ctx context.Context, scraper UnifiedScraper, v interface{}, selector string, opt UnmarshalOption) error
UnifiedUnmarshalWithContext allows specifying a context for Chrome-based unmarshal operations. For HTTP-based scraping, the context is ignored.
func Unmarshal ¶
func Unmarshal(v interface{}, selection *goquery.Selection, opt UnmarshalOption) error
Unmarshal parses the selection and stores the result in v. If v is a struct, each field may specify the following tags.
- `find` tag with CSS selector to specify sub element.
- `html` if exists, gets HTML of the child elements as text. ignores `attr`.
- `attr` tag with an attribute name whose value is used as the text. If neither `html` nor `attr` is present, the text is taken from the element's text content.
- `re` tag with regular expression, use only matched substring from a text.
- `time` tag with time format to parse for time.Time.
Types ¶
type AvailableValue ¶
AvailableValue holds an available value and corresponding label to display.
type BufferedLogger ¶
type BufferedLogger struct {
// contains filtered or unexported fields
}
func (*BufferedLogger) Flush ¶
func (buflog *BufferedLogger) Flush(logger Logger)
func (*BufferedLogger) Printf ¶
func (buflog *BufferedLogger) Printf(format string, a ...interface{})
type ChromeSession ¶ added in v0.0.19
type ChromeSession struct {
*Session
Ctx context.Context
DownloadPath string
ActionTimeout time.Duration // Timeout for individual actions (0 = no timeout)
// contains filtered or unexported fields
}
func NewChromeWithRetry ¶ added in v0.0.48
func NewChromeWithRetry(session *Session, options NewChromeOptions, maxRetries int) (*ChromeSession, func(), error)
NewChromeWithRetry creates a new Chrome session with retry logic for startup failures. This helper is specifically designed to handle flaky Chrome startup issues in CI environments.
func (*ChromeSession) ActionCtx ¶ added in v0.0.55
func (session *ChromeSession) ActionCtx() (context.Context, context.CancelFunc)
ActionCtx returns a context with ActionTimeout applied if set. Use this for individual chromedp actions that should respect the action timeout. Caller must call the returned cancel function when done.
Example:
actionCtx, cancel := chromeSession.ActionCtx()
defer cancel()
err := chromedp.Run(actionCtx, chromedp.WaitVisible(selector, chromedp.ByQuery))
func (*ChromeSession) ClearDebugStep ¶ added in v0.0.42
func (chromeSession *ChromeSession) ClearDebugStep()
ClearDebugStep implements UnifiedScraper.ClearDebugStep
func (*ChromeSession) Click ¶ added in v0.0.42
func (chromeSession *ChromeSession) Click(selector string) chromedp.ActionFunc
Click returns an ActionFunc that handles clicking with replay support
func (*ChromeSession) DoClick ¶ added in v0.0.49
func (chromeSession *ChromeSession) DoClick(selector string) error
DoClick implements UnifiedScraper.DoClick
func (*ChromeSession) DoNavigate ¶ added in v0.0.49
func (chromeSession *ChromeSession) DoNavigate(url string) error
DoNavigate implements UnifiedScraper.DoNavigate
func (*ChromeSession) DoSendKeys ¶ added in v0.0.49
func (chromeSession *ChromeSession) DoSendKeys(selector, value string) error
DoSendKeys implements UnifiedScraper.DoSendKeys
func (*ChromeSession) DoSleep ¶ added in v0.0.49
func (chromeSession *ChromeSession) DoSleep(duration time.Duration) error
DoSleep is a convenience method for executing Sleep action
func (*ChromeSession) DoWaitVisible ¶ added in v0.0.49
func (chromeSession *ChromeSession) DoWaitVisible(selector string) error
DoWaitVisible implements UnifiedScraper.DoWaitVisible
func (*ChromeSession) DownloadFile ¶ added in v0.0.37
func (session *ChromeSession) DownloadFile(filename *string, options DownloadFileOptions, actions ...chromedp.Action) chromedp.ActionFunc
func (*ChromeSession) DownloadResource ¶ added in v0.0.42
func (chromeSession *ChromeSession) DownloadResource(options UnifiedDownloadOptions) (string, error)
DownloadResource implements UnifiedScraper.DownloadResource
func (*ChromeSession) ExtractData ¶ added in v0.0.42
func (chromeSession *ChromeSession) ExtractData(v interface{}, selector string, opt UnmarshalOption) error
ExtractData implements UnifiedScraper.ExtractData
func (*ChromeSession) FollowAnchor ¶ added in v0.0.42
func (chromeSession *ChromeSession) FollowAnchor(text string) error
FollowAnchor implements UnifiedScraper.FollowAnchor
func (*ChromeSession) GetCurrentURL ¶ added in v0.0.46
func (session *ChromeSession) GetCurrentURL() (string, error)
GetCurrentURL returns the current page URL
func (*ChromeSession) GetDebugStep ¶ added in v0.0.42
func (chromeSession *ChromeSession) GetDebugStep() string
GetDebugStep implements UnifiedScraper.GetDebugStep
func (*ChromeSession) Navigate ¶ added in v0.0.42
func (chromeSession *ChromeSession) Navigate(url string) chromedp.ActionFunc
Navigate returns an ActionFunc that handles navigation with replay support
func (*ChromeSession) Printf ¶ added in v0.0.42
func (chromeSession *ChromeSession) Printf(format string, a ...interface{})
Printf implements UnifiedScraper.Printf
func (*ChromeSession) RunNavigate ¶ added in v0.0.19
func (session *ChromeSession) RunNavigate(URL string) (*network.Response, error)
RunNavigate navigates to the page URL and downloads the HTML, like Session.invoke.
func (*ChromeSession) SaveFile ¶ added in v0.0.37
func (session *ChromeSession) SaveFile(filename *string) chromedp.ActionFunc
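SaveFile saves the file to filename.
- filename: passed as a pointer because, when the result of DownloadFile is consumed by a later action in the same chromedp.Run call, a non-pointer argument would carry the value from before execution.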
func (*ChromeSession) SaveHtml ¶ added in v0.0.19
func (session *ChromeSession) SaveHtml(filename *string) chromedp.Action
func (*ChromeSession) SaveLastHtmlSnapshot ¶ added in v0.0.51
func (session *ChromeSession) SaveLastHtmlSnapshot() error
SaveLastHtmlSnapshot saves the last HTML content to a timestamped snapshot file. This is useful for debugging timeouts: call this when an error occurs. Multiple snapshots are preserved to track timeout sequences.
func (*ChromeSession) SavePage ¶ added in v0.0.42
func (chromeSession *ChromeSession) SavePage() (string, error)
SavePage implements UnifiedScraper.SavePage (calls existing SaveHtml action)
func (*ChromeSession) SendKeys ¶ added in v0.0.42
func (chromeSession *ChromeSession) SendKeys(selector, value string) chromedp.ActionFunc
SendKeys returns an ActionFunc that handles sending keys with replay support
func (*ChromeSession) SetDebugStep ¶ added in v0.0.42
func (chromeSession *ChromeSession) SetDebugStep(step string)
SetDebugStep implements UnifiedScraper.SetDebugStep
func (*ChromeSession) Sleep ¶ added in v0.0.45
func (chromeSession *ChromeSession) Sleep(duration time.Duration) chromedp.ActionFunc
Sleep returns an ActionFunc that handles sleeping with replay support. In replay mode, this method returns immediately without actual waiting, which speeds up test execution. In record mode, this method performs the actual sleep operation using chromedp.Sleep.
func (*ChromeSession) SubmitForm ¶ added in v0.0.42
func (chromeSession *ChromeSession) SubmitForm(formSelector string, params map[string]string) error
SubmitForm implements UnifiedScraper.SubmitForm
func (*ChromeSession) Unmarshal ¶ added in v0.0.20
func (session *ChromeSession) Unmarshal(v interface{}, cssSelector string, opt UnmarshalOption) error
func (*ChromeSession) WaitVisible ¶ added in v0.0.42
func (chromeSession *ChromeSession) WaitVisible(selector string) chromedp.ActionFunc
WaitVisible returns an ActionFunc that handles wait visible with replay support
type ChromeTimeoutError ¶ added in v0.0.50
func (ChromeTimeoutError) Error ¶ added in v0.0.50
func (error ChromeTimeoutError) Error() string
func (ChromeTimeoutError) Unwrap ¶ added in v0.0.50
func (error ChromeTimeoutError) Unwrap() error
type ConsoleLogger ¶
type ConsoleLogger struct{}
func (ConsoleLogger) Printf ¶
func (logger ConsoleLogger) Printf(format string, a ...interface{})
type DownloadFileOptions ¶ added in v0.0.39
type DownloadedFileNameNotSatisfiedError ¶ added in v0.0.39
func (*DownloadedFileNameNotSatisfiedError) Error ¶ added in v0.0.39
func (e *DownloadedFileNameNotSatisfiedError) Error() string
type FollowAnchorTextOption ¶
type Form ¶
type Form struct {
Action string
Method string
Elements map[string]*FormElement
Logger Logger
// contains filtered or unexported fields
}
Form holds form data and submit information
func (*Form) NumSelect ¶
NumSelect returns number of available values of the select element specified by name.
func (*Form) PrintSelection ¶
PrintSelection shows available values of the element specified by name.
func (*Form) Set ¶
Set sets a value to the element specified by name. If the element has AvailableValues (e.g. checkbox, radio or select elements), value must equal one of them.
type FormElement ¶
type FormElement struct {
Type string // "select", "hidden", "submit", "text", "email", "password", "button", "checkbox", "radio", "image"
Name string
Value *AvailableValue
AvailableValues []*AvailableValue
}
FormElement holds a form element.
func (*FormElement) AddAvailableValue ¶
func (element *FormElement) AddAvailableValue(val *AvailableValue)
func (*FormElement) GoString ¶
func (element *FormElement) GoString() string
type FormElementNotFoundError ¶
type FormElementNotFoundError struct {
Name string
}
func (FormElementNotFoundError) Error ¶
func (error FormElementNotFoundError) Error() string
type LoginError ¶
type LoginError struct {
Message string
}
func (LoginError) Error ¶
func (error LoginError) Error() string
type MaintenanceError ¶
type MaintenanceError struct {
Message string
}
func (MaintenanceError) Error ¶
func (error MaintenanceError) Error() string
type NewChromeOptions ¶ added in v0.0.14
type NewChromeOptions struct {
Headless bool
Timeout time.Duration
ActionTimeout time.Duration // Timeout for individual actions (0 = no timeout)
ExtraAllocOptions []chromedp.ExecAllocatorOption
}
func NewTestChromeOptions ¶ added in v0.0.41
func NewTestChromeOptions(headless bool) NewChromeOptions
NewTestChromeOptions creates Chrome options with CI compatibility built-in. This helper function simplifies test code by automatically including CI-compatible options when needed.
func NewTestChromeOptionsWithTimeout ¶ added in v0.0.41
func NewTestChromeOptionsWithTimeout(headless bool, timeout time.Duration) NewChromeOptions
NewTestChromeOptionsWithTimeout creates Chrome options with CI compatibility and custom timeout. This helper function simplifies test code by automatically including CI-compatible options when needed, with a custom timeout setting.
type Page ¶
Page holds the DOM structure of the page, its URL, and logging information.
func (*Page) Form ¶
Form generates a Form object from a form object identified by selector in the Page
func (*Page) MetaRefresh ¶
MetaRefresh returns the URL from a "meta http-equiv=refresh" tag if one exists. Otherwise, it returns nil.
type PageMetadata ¶ added in v0.0.46
type PageMetadata struct {
URL string `json:"url"`
ContentType string `json:"content_type"`
Title string `json:"title,omitempty"`
}
PageMetadata holds metadata for saved pages
type PageOption ¶ added in v0.0.40
type RequestError ¶ added in v0.0.13
func (RequestError) Error ¶ added in v0.0.13
func (err RequestError) Error() string
type Response ¶
type Response struct {
Request *http.Request
ContentType string
RawBody []byte
Encoding encoding.Encoding
Logger Logger
}
Response holds a raw response and its request information.
type ResponseError ¶ added in v0.0.13
func (ResponseError) Error ¶ added in v0.0.13
func (err ResponseError) Error() string
type RetryAndRecordError ¶
type RetryAndRecordError struct {
Filename string
}
func (RetryAndRecordError) Error ¶
func (error RetryAndRecordError) Error() string
type ScraperType ¶ added in v0.0.42
type ScraperType int
ScraperType indicates which underlying scraping mechanism is being used
const (
HTTPScraper ScraperType = iota
ChromeScraper
)
func GetScraperType ¶ added in v0.0.42
func GetScraperType(scraper UnifiedScraper) ScraperType
GetScraperType returns the type of scraper being used
type Session ¶
type Session struct {
Name string // directory name to store session files(downloaded files and cookies)
Encoding encoding.Encoding // force charset over Content-Type response header
UserAgent string // specify User-Agent
FilePrefix string // prefix to directory of session files
NotUseNetwork bool // load from previously downloaded session files rather than network access
SaveToFile bool // save downloaded pages to session directory
ShowRequestHeader bool // print request headers with Logger
ShowResponseHeader bool // print response headers with Logger
ShowFormPosting bool // print posting form data, with Logger
Log Logger
BodyFilter func(resp *Response, body []byte) ([]byte, error)
// contains filtered or unexported fields
}
Session holds communication and logging options
func NewSession ¶
func (*Session) ApplyRefresh ¶
ApplyRefresh mimics HTML Meta Refresh.
func (*Session) ClearDebugStep ¶ added in v0.0.41
func (session *Session) ClearDebugStep()
ClearDebugStep clears the debug step label
func (*Session) DoNavigate ¶ added in v0.0.49
DoNavigate implements UnifiedScraper.DoNavigate
func (*Session) DoSendKeys ¶ added in v0.0.49
DoSendKeys implements UnifiedScraper.DoSendKeys
func (*Session) DoWaitVisible ¶ added in v0.0.49
DoWaitVisible implements UnifiedScraper.DoWaitVisible
func (*Session) DownloadResource ¶ added in v0.0.42
func (session *Session) DownloadResource(options UnifiedDownloadOptions) (string, error)
DownloadResource implements UnifiedScraper.DownloadResource
func (*Session) ExtractData ¶ added in v0.0.42
func (session *Session) ExtractData(v interface{}, selector string, opt UnmarshalOption) error
ExtractData implements UnifiedScraper.ExtractData
func (*Session) FollowAnchor ¶ added in v0.0.42
FollowAnchor implements UnifiedScraper.FollowAnchor
func (*Session) FollowAnchorText ¶
func (*Session) FollowAnchorTextOpt ¶
func (*Session) FollowLink ¶
func (*Session) FollowSelectionLink ¶
func (session *Session) FollowSelectionLink(page *Page, selection *goquery.Selection, attr string) (*Response, error)
FollowSelectionLink opens a link specified by attr of the selection and returns a Response.
func (*Session) FormAction ¶
func (session *Session) FormAction(page *Page, formSelector string, params map[string]string) (*Response, error)
FormAction submits a form (easy version)
func (*Session) GetCurrentURL ¶ added in v0.0.46
GetCurrentURL returns the current page URL
func (*Session) GetDebugStep ¶ added in v0.0.42
GetDebugStep implements UnifiedScraper.GetDebugStep
func (*Session) GetDirectory ¶ added in v0.0.51
GetDirectory returns the directory path for this session's files
func (*Session) GetPageMaxRedirect ¶
GetPageMaxRedirect gets the URL and follows an HTML meta refresh if the response page contains one.
func (*Session) LoadCookie ¶
func (*Session) NewChrome ¶
func (session *Session) NewChrome() (*ChromeSession, context.CancelFunc, error)
func (*Session) NewChromeOpt ¶ added in v0.0.14
func (session *Session) NewChromeOpt(options NewChromeOptions) (chromeSession *ChromeSession, cancelFunc context.CancelFunc, err error)
func (*Session) SaveCookie ¶
SaveCookie stores cookies to a file. LoadCookie() must be called before calling SaveCookie().
func (*Session) SendKeys ¶ added in v0.0.42
SendKeys implements UnifiedScraper.SendKeys. For HTTP scraping, this stores form data to be submitted later.
func (*Session) SetDebugStep ¶ added in v0.0.41
SetDebugStep sets the debug step label for logging
func (*Session) SubmitForm ¶ added in v0.0.42
SubmitForm implements UnifiedScraper.SubmitForm
func (*Session) SubmitOpt ¶
SubmitOpt submits a form. If imageId is non-empty, it specifies the "image" element whose click is imitated.
func (*Session) WaitVisible ¶ added in v0.0.42
WaitVisible implements UnifiedScraper.WaitVisible. For HTTP-based scraping, this is a no-op since elements are immediately available.
type UnexpectedContentTypeError ¶
func (UnexpectedContentTypeError) Error ¶
func (error UnexpectedContentTypeError) Error() string
type UnifiedDownloadOptions ¶ added in v0.0.42
type UnifiedDownloadOptions struct {
Timeout time.Duration // Maximum time to wait for download (defaults to DefaultDownloadTimeout if zero)
Glob string // File name pattern to match (for Chrome downloads)
SaveAs string // Target filename (optional)
}
UnifiedDownloadOptions provides options for file downloads that work with both HTTP-based and browser-based download mechanisms.
type UnifiedScraper ¶ added in v0.0.42
type UnifiedScraper interface {
// Navigation methods
DoWaitVisible(selector string) error
// Form interaction methods
DoSendKeys(selector, value string) error
DoClick(selector string) error
SubmitForm(formSelector string, params map[string]string) error
FollowAnchor(text string) error
// Data extraction methods
SavePage() (string, error)
ExtractData(v interface{}, selector string, opt UnmarshalOption) error
// Download methods
DownloadResource(options UnifiedDownloadOptions) (string, error)
// Debug methods
GetDebugStep() string
SetDebugStep(step string)
ClearDebugStep()
// Utility methods
Printf(format string, a ...interface{})
}
UnifiedScraper provides a common interface for both traditional HTTP-based scraping and Chrome-based browser automation, allowing code to work seamlessly with either backend.
Example usage:
var scraper UnifiedScraper = session // or chromeSession
err := scraper.DoNavigate("https://example.com")
err = scraper.DoWaitVisible(".content")
data, err := scraper.SavePage()
The interface abstracts the differences between HTTP and browser-based scraping:
- HTTP scraping: DoWaitVisible is a no-op, form operations are simulated
- Chrome scraping: Full browser automation with real user interactions
type UnmarshalFieldError ¶
func (UnmarshalFieldError) Error ¶
func (err UnmarshalFieldError) Error() string
type UnmarshalMustBePointerError ¶
type UnmarshalMustBePointerError struct{}
func (UnmarshalMustBePointerError) Error ¶
func (err UnmarshalMustBePointerError) Error() string
type UnmarshalOption ¶
type UnmarshalOption struct {
Attr string // if nonempty, get attribute text of the element. get Text() otherwise.
Re string // Regular Expression to match the text. must contain one capture.
Time string // for time.Time only. parse with this format.
Loc *time.Location // time zone for parsing time.Time.
Html bool // get Html() rather than Text(). ignores Attr.
Ignore string // if the string matches, results in the zero value.
}
type UnmarshalParseNumberError ¶ added in v0.0.11
func (UnmarshalParseNumberError) Error ¶ added in v0.0.11
func (err UnmarshalParseNumberError) Error() string
type UnmarshalUnexportedFieldError ¶
type UnmarshalUnexportedFieldError struct{}
func (UnmarshalUnexportedFieldError) Error ¶
func (err UnmarshalUnexportedFieldError) Error() string