rules

package
v1.4.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 3, 2026 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

View Source
// BaiduSearch is a spider that queries Baidu web search for the configured
// keyword and outputs one record per result with three fields: title ("标题"),
// link ("链接") and abstract ("摘要"). Every page is fetched through the Chrome
// downloader, and result pages are followed via the pager's next-page link.
var BaiduSearch = &spider.Spider{
	Name:            "百度搜索",
	Description:     "百度搜索结果 [www.baidu.com]",
	Keyin:           spider.KEYIN,
	Limit:           spider.LIMIT,
	EnableCookie:    true,
	NotDefaultField: true,
	RuleTree: &spider.RuleTree{
		// Root seeds the crawl with the first result page (pn=0) for the
		// URL-escaped keyword.
		Root: func(ctx *spider.Context) {
			ctx.AddQueue(&request.Request{
				URL:          "https://www.baidu.com/s?wd=" + url.QueryEscape(ctx.GetKeyin()) + "&pn=0",
				Rule:         "搜索结果",
				DownloaderID: request.ChromeID,
			})
		},

		Trunk: map[string]*spider.Rule{
			"搜索结果": {
				ItemFields: []string{
					"标题",
					"链接",
					"摘要",
				},
				// ParseFunc extracts every result on the current page, then
				// enqueues the next result page if the pager offers one.
				ParseFunc: func(ctx *spider.Context) {
					query := ctx.GetDom()
					query.Find("div.result,div.result-op").Each(func(i int, s *goquery.Selection) {
						// Look the heading anchor up once; it supplies both title and link.
						anchor := s.Find("h3.t a")
						title := strings.TrimSpace(anchor.Text())
						href := anchor.AttrOr("href", "")
						summary := strings.TrimSpace(s.Find("[data-module=abstract]").Text())

						// Entries without a title or a link are not usable results.
						if title == "" || href == "" {
							return
						}

						// Keys are positions into ItemFields above.
						ctx.Output(map[int]interface{}{
							0: title,
							1: href,
							2: summary,
						})
					})

					// Only follow a link that is labelled as the next page.
					// NOTE(review): Baidu's pager is understood to give both the
					// previous-page and next-page anchors class "n"; on the final
					// page a bare Last() would then select the previous-page link.
					// Confirm against live markup.
					next := query.Find("a.n").FilterFunction(func(_ int, a *goquery.Selection) bool {
						return strings.Contains(a.Text(), "下一页")
					}).Last()

					nextHref := strings.TrimSpace(next.AttrOr("href", ""))
					if nextHref == "" {
						return
					}

					// Resolve the href against the site root rather than concatenating
					// strings, so absolute, protocol-relative and slash-less hrefs all
					// produce a well-formed URL.
					ref, err := url.Parse(nextHref)
					if err != nil {
						// Best effort: a malformed pager link simply ends pagination.
						return
					}
					base := &url.URL{Scheme: "https", Host: "www.baidu.com", Path: "/"}
					target := base.ResolveReference(ref)
					if target.Scheme != "http" && target.Scheme != "https" {
						// Skip non-navigable hrefs such as "javascript:".
						return
					}

					ctx.AddQueue(&request.Request{
						URL:          target.String(),
						Rule:         "搜索结果",
						DownloaderID: request.ChromeID,
					})
				},
			},
		},
	},
}

Functions

This section is empty.

Types

This section is empty.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL