wbot

package module
v0.1.3 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 20, 2022 License: MIT Imports: 16 Imported by: 1

README

WBot - a web crawler

A configurable, thread-safe web crawler, provides a minimal interface for crawling and downloading web pages.

Features:

  • Clean minimal API.
  • Configurable: MaxDepth, MaxBodySize, Rate Limit, Parallelism, User Agent & Proxy rotation.
  • Memory-efficient, thread-safe.
  • Provides built-in interface: Fetcher, Store, Queue & a Logger.

WBot Specifications:

Interfaces
// Fetcher
type Fetcher interface {
	Fetch(req *Request) (*Response, error)
	Close() error
}

// Store
type Store interface {
	Visited(link string) bool
	Close() error
}

// Queue
type Queue interface {
	Add(req *Request)
	Pop() *Request
	Next() bool
	Close() error
}

// Logger
type Logger interface {
	Send(rep *Report)
	Close() error
}
API
// NewWBot
func NewWBot(opts ...Option) *WBot

// Crawl
func (wb *WBot) Crawl(link string) error

// SetOptions
func (wb *WBot) SetOptions(opts ...Option)

// Stream
func (wb *WBot) Stream() <-chan *Response

// Close
func (wb *WBot) Close() 

Installation

Requires Go 1.18 or later.

go get github.com/twiny/wbot

Example

package main

import (
	"fmt"
	"time"

	"github.com/twiny/wbot"
)

//
func main() {
	// options
	opts := []wbot.Option{
		wbot.SetMaxDepth(5),
		wbot.SetParallel(10),
		wbot.SetRateLimit(1, 1*time.Second),
		wbot.SetMaxBodySize(1024 * 1024),
		wbot.SetUserAgents([]string{"Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"}),
	}

	// wbot
	bot := wbot.NewWBot(opts...)
	defer bot.Close()

	// crawl
	site := `https://www.github.com`

	// stream
	go func() {
		count := 0
		for resp := range bot.Stream() {
			count++
			fmt.Printf("num: %d - depth: %d - visited url:%s - status:%d - body len: %d\n", count, resp.Depth, resp.URL.String(), resp.Status, len(resp.Body))
		}
	}()

	if err := bot.Crawl(site); err != nil {
		panic(err)
	}

	fmt.Println("done")
}

TODO

  • Add support for robots.txt.
  • Add test cases.
  • Implement Fetch using Chromedp.
  • Add more examples.
  • Add documentation.

Bugs

Bugs or suggestions? Please visit the issue tracker.

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Fetcher

type Fetcher interface {
	Fetch(req *Request) (*Response, error)
	Close() error
}

Fetcher

type Logger

type Logger interface {
	Send(rep *Report)
	Close() error
}

Logger

type Option

type Option func(*WBot)

Option

func SetFetcher

func SetFetcher(f Fetcher) Option

SetFetcher

func SetFilter

func SetFilter(allowed, disallowed []string) Option

SetFilter

func SetLogger

func SetLogger(l Logger) Option

SetLogger

func SetMaxBodySize

func SetMaxBodySize(size int64) Option

SetMaxBodySize

func SetMaxDepth

func SetMaxDepth(depth int32) Option

SetMaxDepth

func SetParallel

func SetParallel(parallel int) Option

SetParallel

func SetProxies

func SetProxies(proxies []string) Option

SetProxies

func SetQueue

func SetQueue(q Queue) Option

SetQueue

func SetRateLimit

func SetRateLimit(rate int, interval time.Duration) Option

SetRateLimit

func SetStore

func SetStore(s Store) Option

SetStore

func SetUserAgents

func SetUserAgents(agents []string) Option

SetUserAgents

type Queue

type Queue interface {
	Add(req *Request)
	Pop() *Request
	Next() bool
	Close() error
}

Queue

type Report

type Report struct {
	RequestURL string
	Status     int
	Depth      int32
	Err        error
}

Report

type Request

type Request struct {
	BaseDomain string
	URL        *url.URL
	Depth      int32
	// contains filtered or unexported fields
}

Request

func (*Request) AbsURL

func (r *Request) AbsURL(u string) (*url.URL, error)

AbsURL

type Response

type Response struct {
	URL      *url.URL
	Status   int
	Body     []byte
	NextURLs []string
	Depth    int32
}

Response

type Store

type Store interface {
	Visited(link string) bool
	Close() error
}

Store

type WBot

type WBot struct {
	// contains filtered or unexported fields
}

WBot

func NewWBot

func NewWBot(opts ...Option) *WBot

NewWBot

func (*WBot) Close

func (wb *WBot) Close()

Close

func (*WBot) Crawl

func (wb *WBot) Crawl(link string) error

Crawl

func (*WBot) SetOptions

func (wb *WBot) SetOptions(opts ...Option)

SetOptions

func (*WBot) Stream

func (wb *WBot) Stream() <-chan *Response

Stream

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL