Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CrawlerConfig ¶
// CrawlerConfig holds two integer crawler settings. Each field carries a
// `config` struct tag naming the configuration key it is bound to.
type CrawlerConfig struct {
// MaxGoRoutine is bound to the "max_go_routine" key.
// NOTE(review): presumably an upper bound on the number of goroutines the
// crawler runs — confirm against the code that reads it.
MaxGoRoutine int `config:"max_go_routine"`
// Fetch speed control: FetchThresholdInMs is bound to the
// "fetch_threshold_ms" key.
// NOTE(review): presumably a threshold in milliseconds (per the name) —
// confirm how the fetcher applies it.
FetchThresholdInMs int `config:"fetch_threshold_ms"`
}
func GetDefaultCrawlerConfig ¶
func GetDefaultCrawlerConfig() CrawlerConfig
type RoutingParameter ¶
// RoutingParameter carries a single integer Shard value.
// NOTE(review): presumably used to route work to a particular shard — confirm
// against the callers, which are not visible here.
type RoutingParameter struct {
// Shard is the shard number; its valid range is not visible from this type.
Shard int
}
type TaskConfig ¶
// TaskConfig groups the URL patterns and per-task settings of a crawl task.
//
// Tagged fields use the `config:"name"` struct-tag form, the same key that
// CrawlerConfig uses, so that reflect.StructTag.Get and Lookup can read them.
// The previous bare tags (for example `link_extract_pattern`) were not valid
// key:"value" pairs: tag lookups returned nothing for them and go vet's
// structtag check rejected them.
//
// NOTE(review): each ...RegexStr / ...PatternStr field sits next to a
// *regexp.Regexp field with the same stem; presumably the string is the
// pattern source and the Regexp its compiled form, filled in by code outside
// this type — confirm against the code that builds a TaskConfig.
type TaskConfig struct {
	// Walking-around pattern: link extraction settings.
	LinkUrlExtractRegexStr        string `config:"link_extract_pattern"`
	LinkUrlExtractRegex           *regexp.Regexp
	LinkUrlExtractRegexGroupIndex int `config:"link_extract_group"`
	LinkUrlMustContain            string
	LinkUrlMustNotContain         string

	// Page-parse skip pattern: when a URL matches this pattern, gopa will not
	// parse URLs out of that URL's response.
	SkipPageParsePatternStr string `config:"skip_page_parse_pattern"`
	SkipPageParsePattern    *regexp.Regexp

	// Fetch URL pattern.
	FetchUrlPatternStr     string `config:"fetch_url_pattern"`
	FetchUrlPattern        *regexp.Regexp
	FetchUrlMustContain    string
	FetchUrlMustNotContain string

	// Saving pattern.
	SavingUrlPatternStr     string `config:"save_url_pattern"`
	SavingUrlPattern        *regexp.Regexp
	SavingUrlMustContain    string
	SavingUrlMustNotContain string

	// Crawling within domain.
	FollowSameDomain bool `config:"follow_same_domain"`
	FollowSubDomain  bool `config:"follow_sub_domain"`

	// NOTE(review): presumably the directory holding this task's data —
	// confirm against callers.
	TaskDataPath string

	// User cookie.
	Cookie string

	// Fetch speed control.
	FetchDelayThreshold int

	// NOTE(review): presumably the file name of the task database — confirm
	// against callers.
	TaskDBFilename string `config:"task_db_filename"`
}
Click to show internal directories.
Click to hide internal directories.