Documentation
¶
Overview ¶
Package pagser is a simple, easy, extensible, and configurable parser that maps HTML to structs, based on goquery and struct tags. It is the parser library from scrago.
The project source code: https://github.com/foolin/pagser
Features ¶
* Simple - Use golang struct tag syntax.
* Easy - Easy to use in your spider/crawler/colly application.
* Extensible - Support for extension functions.
* Struct tag grammar - Grammar is simple, like `pagser:"a->attr(href)"`.
* Nested Structure - Support Nested Structure for node.
* Configurable - Support configuration.
* GoQuery/Colly - Supports all goquery-based projects, such as go-colly.
More info: https://github.com/foolin/pagser
Index ¶
- type BuiltinFunctions
- func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrInt(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Eq(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Split(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Value(node *goquery.Selection, args ...string) (out interface{}, err error)
- type CallFunc
- type Config
- type Pagser
- func (p *Pagser) Parse(v interface{}, document string) (err error)
- func (p *Pagser) ParseDocument(v interface{}, document *goquery.Document) (err error)
- func (p *Pagser) ParseReader(v interface{}, reader io.Reader) (err error)
- func (p *Pagser) ParseSelection(v interface{}, selection *goquery.Selection) (err error)
- func (p *Pagser) RegisterFunc(name string, fn CallFunc) error
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BuiltinFunctions ¶ added in v0.0.7
type BuiltinFunctions struct {
}
Builtin functions are registered with a lowercase initial, eg: Text -> text()
func (BuiltinFunctions) Attr ¶ added in v0.0.7
func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
attr(name) get element attribute value, return string.
func (BuiltinFunctions) AttrInt ¶ added in v0.0.7
func (builtin BuiltinFunctions) AttrInt(node *goquery.Selection, args ...string) (out interface{}, err error)
attrInt(name, defaultValue) gets the element attribute value and converts it to int, returns int.
func (BuiltinFunctions) AttrSplit ¶ added in v0.0.7
func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
attrSplit(name, sep) gets the attribute value and splits it by the separator into a string array, returns []string.
func (BuiltinFunctions) EachAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
eachAttr() get each element attribute value, return []string.
func (BuiltinFunctions) EachHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eachHtml() get each element inner html, return []string.
func (BuiltinFunctions) EachJoin ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
eachJoin(sep) get each element text and join to string, return string.
func (BuiltinFunctions) EachOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eachOutHtml() get each element outer html, return []string.
func (BuiltinFunctions) EachText ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
eachText() get each element text, return []string.
func (BuiltinFunctions) Eq ¶ added in v0.0.7
func (builtin BuiltinFunctions) Eq(node *goquery.Selection, args ...string) (out interface{}, err error)
eq(index) reduces the set of matched elements to the one at the specified index, return string.
func (BuiltinFunctions) EqAndAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndAttr(index, name) reduces the set of matched elements to the one at the specified index, and attr() return string.
func (BuiltinFunctions) EqAndHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndHtml(index) reduces the set of matched elements to the one at the specified index, and html() return string.
func (BuiltinFunctions) EqAndOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndOutHtml(index) reduces the set of matched elements to the one at the specified index, and outHtml() return string.
func (BuiltinFunctions) Html ¶ added in v0.0.7
func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
html() get element inner html, return string.
func (BuiltinFunctions) OutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
outHtml() get element outer html, return string.
func (BuiltinFunctions) Split ¶ added in v0.0.7
func (builtin BuiltinFunctions) Split(node *goquery.Selection, args ...string) (out interface{}, err error)
split(sep) get element text and split by separator to array string, return []string.
type CallFunc ¶
Define Global Function ¶
func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
}
//Register function
pagser.RegisterFunc("MyFunc", MyFunc)
//Use function
type PageData struct{
Text string `pagser:"h1->MyFunc()"`
}
Define Struct Function ¶
//Use function
type PageData struct{
Text string `pagser:"h1->MyFunc()"`
}
func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
}
Define your own function interface
type Config ¶
type Config struct {
TagerName string //struct tag name, default is `pagser`
FuncSymbol string //Function symbol, default is `->`
IgnoreSymbol string //Ignore symbol, default is `-`
Debug bool //Debug mode, debug will print some log, default is `false`
}
Config holds the parser configuration.
type Pagser ¶
type Pagser struct {
// contains filtered or unexported fields
}
Pagser is the page parser.
func NewWithConfig ¶
NewWithConfig creates a client with the given config and returns it together with an error.
Example ¶
cfg := Config{
TagerName: "pagser",
FuncSymbol: "->",
IgnoreSymbol: "-",
Debug: false,
}
p, err := NewWithConfig(cfg)
if err != nil {
log.Fatal(err)
}
//data parser model
var page ExampPage
//parse html data
err = p.Parse(&page, rawPageHtml)
//check error
if err != nil {
log.Fatal(err)
}
func (*Pagser) Parse ¶
Parse parses an HTML string into a struct.
Example ¶
//New default config
p := New()
//data parser model
var page ExampPage
//parse html data
err := p.Parse(&page, rawPageHtml)
//check error
if err != nil {
log.Fatal(err)
}
log.Printf("%v", page)
func (*Pagser) ParseDocument ¶
ParseDocument parses a goquery document into a struct.
func (*Pagser) ParseReader ¶ added in v0.0.3
ParseReader parses HTML read from an io.Reader into a struct.
Example ¶
resp, err := http.Get("https://raw.githubusercontent.com/foolin/pagser/master/_examples/pages/demo.html")
if err != nil {
log.Fatal(err)
}
defer resp.Body.Close()
//New default config
p := New()
//data parser model
var page ExampPage
//parse html data
err = p.ParseReader(&page, resp.Body)
//check error
if err != nil {
panic(err)
}
log.Printf("%v", page)
