Documentation
¶
Index ¶
Constants ¶
View Source
// UserAgent is the default User-Agent string for crawler requests; it
// impersonates Googlebot. Presumably used when Client.UserAgent is empty —
// TODO confirm against the request-building code (not visible here).
const UserAgent = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
Default user agent.
Variables ¶
View Source
// Errors returned by the crawler.
var (
	// ErrMimeType indicates the fetched page's MIME type is not accepted
	// (presumably checked against AllowedMimeTypes — confirm at call site).
	ErrMimeType = errors.New("MIME type not supported")
	// ErrInvalidChar indicates a crawl result contained invalid characters.
	// Message lowercased per Go convention (error strings are not capitalized);
	// "MIME" above stays capitalized as an initialism.
	ErrInvalidChar = errors.New("crawler result contains invalid characters")
)
Errors
View Source
var (
	// ValidUrl matches an http or https URL composed of the listed characters.
	// NOTE(review): the pattern is unanchored, so it also matches a URL
	// embedded anywhere inside a larger string — confirm that is intended.
	ValidUrl = regexp.MustCompile(`https?://[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*`)
	// ValidPageTitle matches any non-empty string (at least one character;
	// `.` does not match newlines, so the title must be a single line).
	ValidPageTitle = regexp.MustCompile(`^(.)+$`)
)
Regular expressions used to validate URLs and page titles.
View Source
// AllowedMimeTypes is the set of Content-Type values the crawler accepts.
// Keys include the charset parameter, so only the exact string
// "text/html; charset=utf-8" is accepted — a bare "text/html" or a
// different charset would not match. TODO confirm that is intended.
var AllowedMimeTypes = map[string]bool{
	"text/html; charset=utf-8": true,
}
Map of allowed MIME types.
Functions ¶
func ExtractUrl ¶
Types ¶
type Client ¶
// Client holds the configuration a crawler uses to perform requests.
type Client struct {
	// Dial is the dialer used for requests; it has the signature of net.Dial.
	// NOTE(review): behavior when nil is not visible here — confirm whether a
	// default dialer is substituted.
	Dial func(network, addr string) (net.Conn, error)
	// UserAgent is the User-Agent header value sent in requests.
	UserAgent string
	// UserName and PassWord are credentials for authentication with the
	// web server (presumably HTTP Basic auth — confirm against usage).
	UserName string
	PassWord string
}
Client structure
type CrawlResult ¶
// CrawlResult stores information extracted from a crawled page. The xml
// struct tag suggests it is populated by decoding the page document with
// encoding/xml — TODO confirm against the crawl code (not visible here).
type CrawlResult struct {
	Title string `xml:"head>title"` // Contents of the page's <head><title> element
	//Desc String `xml:"head>meta"` // Description of the page (disabled field)
	Size int // Size of the web page — presumably bytes; confirm against the code that sets it
}
Store some information about the page
Click to show internal directories.
Click to hide internal directories.