crawl

package
v1.6.16 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 17, 2026 License: MIT Imports: 28 Imported by: 0

Documentation

Index

Constants

View Source
// JSExecuteTemplate is a package-level string constant. Its 813-byte value is
// elided in this listing.
// NOTE(review): judging by the name, this is presumably a template used when
// executing JavaScript — confirm against the package source.
const JSExecuteTemplate = `` /* 813-byte string literal not displayed */

Variables

View Source
// DefaultStackDepthLimit is the default stack depth limit for JavaScript
// execution. It is a mutable package-level variable initialised to 300.
var DefaultStackDepthLimit = 300

默认JavaScript执行的堆栈深度限制

Functions

func ExtractCharset

func ExtractCharset(htmlContent string) string

func ExtractContentTypeCharset

func ExtractContentTypeCharset(contentType string) (charset string)

func InsertInto

func InsertInto(s string, interval int, sep rune) string

func ResponseDecoding

func ResponseDecoding(body []byte, label string) string

Types

// Banner represents the information collected for a crawled website.
//
// JSON encoding: fields tagged `json:"-"` (Headers, Charset, Cert, IconBytes,
// Compliance) are skipped by encoding/json. All other fields are emitted
// under the key named in their tag.
//
// NOTE(review): Response is tagged `json:"_"`. encoding/json treats "_" as an
// ordinary key name, so Response IS serialised (under the key "_"), unlike
// the fields tagged "-". This looks like a typo for `json:"-"` — confirm the
// intent before relying on Response being absent from (or present in) the
// encoded output.
type Banner struct {
	Uri         string               `json:"uri"`
	BodyHash    int32                `json:"body_hash"`
	Body        string               `json:"body"`
	Header      string               `json:"header"`
	Headers     map[string]string    `json:"-"`
	Title       string               `json:"title"`
	StatusCode  int                  `json:"status_code"`
	Response    string               `json:"_"`
	SSL         bool                 `json:"ssl"`
	Certificate string               `json:"certificate"`
	IconHash    int32                `json:"icon_hash"`
	IconType    string               `json:"icon_type"`
	Charset     string               `json:"-"`
	Cert        *tls.ConnectionState `json:"-"`
	IconURI     string               `json:"icon_uri"`
	IconBytes   []byte               `json:"-"`
	Compliance  map[string]string    `json:"-"`
}

Banner 表示爬取的网站信息

func RequestOnce

func RequestOnce(client *retryablehttp.Client, uri string) (banner *Banner, redirectURL string, err error)

func (*Banner) CacheLower

func (b *Banner) CacheLower()

缓存小写内容，避免匹配时进行大小写转换带来的性能损耗；虽然会增加内存开销，但可以显著提高匹配速度。

type Crawler

// Crawler is the core structure of the crawler. All of its fields are
// unexported and are not shown in this listing; instances are obtained from
// NewCrawler.
type Crawler struct {
	// contains filtered or unexported fields
}

Crawler 定义爬虫的核心结构

func NewCrawler

func NewCrawler(options *Options) *Crawler

NewCrawler 创建新的爬虫实例

func (*Crawler) GetBanner

func (c *Crawler) GetBanner(ctx context.Context, uri string) (*Banner, error)

func (*Crawler) GetBanners

func (c *Crawler) GetBanners(ctx context.Context, uri string) ([]*Banner, error)

GetBanners 实现BannerProvider接口

func (*Crawler) GetClient

func (c *Crawler) GetClient() *retryablehttp.Client

GetClient 获取HTTP客户端

type Options

// Options is the configuration accepted by NewCrawler; DefaultOption returns
// a pointer to an Options value.
// NOTE(review): the meaning of the individual fields below is inferred from
// their names only (icon fetching toggle, proxy address, request/response
// debug output, timeout) — confirm against the crawler implementation.
type Options struct {
	DisableIcon bool
	Proxy       string
	DebugReq    bool
	DebugResp   bool
	Timeout     time.Duration
	RetryMax    int // number of retries
}

func DefaultOption

func DefaultOption() *Options

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL