Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
View Source
var Wangyi = &spider.Spider{ Name: "网易新闻", Description: "网易排行榜新闻,含点击/跟帖排名 [Auto Page] [news.163.com/rank]", EnableCookie: false, RuleTree: &spider.RuleTree{ Root: func(ctx *spider.Context) { ctx.AddQueue(&request.Request{URL: "http://news.163.com/rank/", Rule: "排行榜主页"}) }, Trunk: map[string]*spider.Rule{ "排行榜主页": { ParseFunc: func(ctx *spider.Context) { query := ctx.GetDom() query.Find(".subNav a").Each(func(i int, s *goquery.Selection) { if url := s.Attr("href"); url.IsSome() { ctx.AddQueue(&request.Request{URL: url.Unwrap(), Rule: "新闻排行榜"}) } }) }, }, "新闻排行榜": { ParseFunc: func(ctx *spider.Context) { topTit := []string{ "1小时前点击排行", "24小时点击排行", "本周点击排行", "今日跟帖排行", "本周跟帖排行", "本月跟贴排行", } query := ctx.GetDom() newsType := query.Find(".titleBar h2").Text() urls_top := map[string]string{} query.Find(".tabContents").Each(func(n int, t *goquery.Selection) { t.Find("tr").Each(func(i int, s *goquery.Selection) { if i == 0 { return } url := s.Find("a").Attr("href") top := s.Find(".cBlue").Text() if url.IsSome() { urls_top[url.Unwrap()] += topTit[n] + ":" + top + "," } }) }) for k, v := range urls_top { ctx.AddQueue(&request.Request{ URL: k, Rule: "热点新闻", Temp: map[string]interface{}{ "newsType": newsType, "top": v, }, }) } }, }, "热点新闻": { ItemFields: []string{ "标题", "内容", "排名", "类别", "ReleaseTime", }, ParseFunc: func(ctx *spider.Context) { query := ctx.GetDom() if pageAll := query.Find(".ep-pages-all"); len(pageAll.Nodes) != 0 { if pageAllUrl := pageAll.Attr("href"); pageAllUrl.IsSome() { ctx.AddQueue(&request.Request{ URL: pageAllUrl.Unwrap(), Rule: "热点新闻", Temp: ctx.CopyTemps(), }) } return } title := query.Find("#h1title").Text() content := query.Find("#endText").Text() re := regexp.MustCompile("\\<[\\S\\s]+?\\>") content = re.ReplaceAllString(content, "") release := query.Find(".ep-time-soure").Text() release = strings.Split(release, "来源:")[0] release = strings.Trim(release, " \t\n") ctx.Output(map[int]interface{}{ 0: title, 1: content, 2: ctx.GetTemp("top", ""), 3: ctx.GetTemp("newsType", ""), 4: release, }) }, }, }, }, }
Functions ¶
This section is empty.
Types ¶
This section is empty.
Click to show internal directories.
Click to hide internal directories.