Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var GoogleSearch = &spider.Spider{ Name: "Google search", Description: "Crawls pages from [www.google.com]", Keyin: spider.KEYIN, Limit: spider.LIMIT, EnableCookie: false, RuleTree: &spider.RuleTree{ Root: func(ctx *spider.Context) { var url string var success bool logs.Log().Informational("Running google spider,this may take some time...") for _, ip := range googleIp { url = "http://" + ip + "/?gws_rd=ssl#q=" + ctx.GetKeyin() logs.Log().Informational("测试 " + ip) if goquery.NewDocument(url).IsOk() { success = true break } } if !success { logs.Log().Critical("Could not reach any of the Google mirrors") return } logs.Log().Critical("Starting Google search ...") ctx.AddQueue(&request.Request{ URL: url, Rule: "total_pages", Temp: map[string]interface{}{ "baseUrl": url, }, }) }, Trunk: map[string]*spider.Rule{ "total_pages": { AidFunc: func(ctx *spider.Context, aid map[string]interface{}) interface{} { for loop := aid["loop"].([2]int); loop[0] < loop[1]; loop[0]++ { ctx.AddQueue(&request.Request{ URL: aid["urlBase"].(string) + "&start=" + strconv.Itoa(10*loop[0]), Rule: aid["Rule"].(string), }) } return nil }, ParseFunc: func(ctx *spider.Context) { query := ctx.GetDom() txt := query.Find("#resultStats").Text() re := regexp.MustCompile(`,+`) txt = re.ReplaceAllString(txt, "") re = regexp.MustCompile(`[\d]+`) txt = re.FindString(txt) num, _ := strconv.Atoi(txt) total := int(math.Ceil(float64(num) / 10)) if total > ctx.GetLimit() { total = ctx.GetLimit() } else if total == 0 { logs.Log().Critical("[ERROR:| Spider:%v | KEYIN:%v | Rule:%v] Did not fetch any data!!!\n", ctx.GetName(), ctx.GetKeyin(), ctx.GetRuleName()) return } ctx.Aid(map[string]interface{}{ "loop": [2]int{1, total}, "urlBase": ctx.GetTemp("baseUrl", ""), "Rule": "search_results", }) ctx.Parse("search_results") }, }, "search_results": { ItemFields: []string{ "title", "content", "href", }, ParseFunc: func(ctx *spider.Context) { query := ctx.GetDom() query.Find("#ires .g").Each(func(i int, s *goquery.Selection) { t := s.Find(".r > a") href := t.Attr("href").UnwrapOr("") href = strings.TrimLeft(href, "/url?q=") logs.Log().Informational(href) title := t.Text() content := s.Find(".st").Text() ctx.Output(map[int]interface{}{ 0: title, 1: content, 2: href, }) }) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.