Documentation
¶
Overview ¶
Package pagser is a simple, easy-to-use, extensible and configurable parser that maps HTML to structs, based on goquery and struct tags. It is the parser library from scrago.
The project source code: https://github.com/foolin/pagser
Features ¶
* Simple - Use golang struct tag syntax.
* Easy - Easy to use in your spider/crawler/colly application.
* Extensible - Support for extension functions.
* Struct tag grammar - Grammar is simple, like `pagser:"a->attr(href)"`.
* Nested Structure - Support Nested Structure for node.
* Configurable - Support configuration.
* GoQuery/Colly - Supports all goquery-based projects, such as go-colly.
More info: https://github.com/foolin/pagser
Index ¶
- type BuiltinFunctions
- func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Concat(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) ConcatAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Eq(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Split(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Value(node *goquery.Selection, args ...string) (out interface{}, err error)
- type CallFunc
- type Config
- type Pagser
- func (p *Pagser) Parse(v interface{}, document string) (err error)
- func (p *Pagser) ParseDocument(v interface{}, document *goquery.Document) (err error)
- func (p *Pagser) ParseReader(v interface{}, reader io.Reader) (err error)
- func (p *Pagser) ParseSelection(v interface{}, selection *goquery.Selection) (err error)
- func (p *Pagser) RegisterFunc(name string, fn CallFunc)
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BuiltinFunctions ¶ added in v0.0.7
type BuiltinFunctions struct {
}
Builtin functions are registered with a lowercase initial, eg: Text -> text()
func (BuiltinFunctions) Attr ¶ added in v0.0.7
func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
attr(name, defaultValue='') get element attribute value, return string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Example string `pagser:".selector->attr(href)"`
}
func (BuiltinFunctions) AttrEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
attrEmpty(name, defaultValue) get element attribute value, return string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Example string `pagser:".selector->attrEmpty(href, '#')"`
}
func (BuiltinFunctions) AttrSplit ¶ added in v0.0.7
func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
attrSplit(name, sep=',', trim='true') get attribute value and split by separator to array string, return []string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Examples []string `pagser:".selector->attrSplit('keywords', ',')"`
}
func (BuiltinFunctions) Concat ¶ added in v0.1.0
func (builtin BuiltinFunctions) Concat(node *goquery.Selection, args ...string) (out interface{}, err error)
concat(text1, $value, [ text2, ... text_n ]) The `text1, text2, ... text_n` strings that you wish to join together, `$value` is placeholder for get element text, return string.
struct {
Example string `pagser:".selector->concat('Result:', '<', $value, '>')"`
}
func (BuiltinFunctions) ConcatAttr ¶ added in v0.1.0
func (builtin BuiltinFunctions) ConcatAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
concatAttr(name, text1, $value, [ text2, ... text_n ]) `name` is the name of the element attribute whose value is read, `text1, text2, ... text_n` are the strings that you wish to join together, `$value` is the placeholder for that attribute value, return string.
struct {
Example string `pagser:".selector->concatAttr('href', 'Result:', '<', $value, '>')"`
}
func (BuiltinFunctions) EachAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
eachAttr(name) get each element attribute value, return []string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Examples []string `pagser:".selector->eachAttr(href)"`
}
func (BuiltinFunctions) EachAttrEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
eachAttrEmpty(name, defaultValue) get each element attribute value, return []string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Examples []string `pagser:".selector->eachAttrEmpty(href, '#')"`
}
func (BuiltinFunctions) EachHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eachHtml() get each element inner html, return []string.
struct {
Examples []string `pagser:".selector->eachHtml()"`
}
func (BuiltinFunctions) EachJoin ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
eachJoin(sep) get each element text and join to string, return string.
struct {
Example string `pagser:".selector->eachJoin(',')"`
}
func (BuiltinFunctions) EachOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eachOutHtml() get each element outer html, return []string.
struct {
Examples []string `pagser:".selector->eachOutHtml()"`
}
func (BuiltinFunctions) EachText ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
eachText() get each element text, return []string.
struct {
Examples []string `pagser:".selector->eachText('')"`
}
func (BuiltinFunctions) EachTextEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
eachTextEmpty(defaultValue) get each element text, return []string.
struct {
Examples []string `pagser:".selector->eachTextEmpty('')"`
}
func (BuiltinFunctions) Eq ¶ added in v0.0.7
func (builtin BuiltinFunctions) Eq(node *goquery.Selection, args ...string) (out interface{}, err error)
eq(index) reduces the set of matched elements to the one at the specified index, return string.
struct {
Example string `pagser:".selector->eq(0)"`
}
func (BuiltinFunctions) EqAndAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndAttr(index, name) reduces the set of matched elements to the one at the specified index, and attr() return string.
struct {
Example string `pagser:".selector->eqAndAttr(0, href)"`
}
func (BuiltinFunctions) EqAndHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndHtml(index) reduces the set of matched elements to the one at the specified index, and html() return string.
struct {
Example string `pagser:".selector->eqAndHtml(0)"`
}
func (BuiltinFunctions) EqAndOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndOutHtml(index) reduces the set of matched elements to the one at the specified index, and outHtml() return string.
struct {
Example string `pagser:".selector->eqAndOutHtml(0)"`
}
func (BuiltinFunctions) Html ¶ added in v0.0.7
func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
html() get element inner html, return string.
struct {
Example string `pagser:".selector->html()"`
}
func (BuiltinFunctions) OutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
outHtml() get element outer html, return string.
struct {
Example string `pagser:".selector->outHtml()"`
}
func (BuiltinFunctions) Split ¶ added in v0.0.7
func (builtin BuiltinFunctions) Split(node *goquery.Selection, args ...string) (out interface{}, err error)
split(sep=',', trim='true') get element text and split by separator to array string, return []string.
struct {
Examples []string `pagser:".selector->split('|')"`
}
func (BuiltinFunctions) Text ¶ added in v0.0.7
func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)
text() get element text, return string. This is the default function used when no function is defined in the struct tag.
struct {
Example string `pagser:".selector->text()"`
}
func (BuiltinFunctions) TextEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
textEmpty(defaultValue) get element text, if empty will return defaultValue, return string.
struct {
Example string `pagser:".selector->textEmpty('')"`
}
func (BuiltinFunctions) Value ¶ added in v0.0.7
func (builtin BuiltinFunctions) Value(node *goquery.Selection, args ...string) (out interface{}, err error)
value() get the value of the element's `value` attribute, return string.
//<input name="pagser" value="xxx" />
struct {
Example string `pagser:".selector->value()"`
}
Output: xxx
type CallFunc ¶
Define Global Function ¶
func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
}
//Register function
pagser.RegisterFunc("MyFunc", MyFunc)
//Use function
type PageData struct{
Text string `pagser:"h1->MyFunc()"`
}
Define Struct Function ¶
//Use function
type PageData struct{
Text string `pagser:"h1->MyFunc()"`
}
func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
}
Lookup function priority order ¶
struct method -> parent method -> ... -> global
Implicit convert type ¶
Automatic type conversion: the string output is converted to the type of the target field (int, int64, float64, ...).
CallFunc is the function type that custom functions must implement in order to be registered and called from a struct tag.
type Config ¶
type Config struct {
TagName string //struct tag name, default is `pagser`
FuncSymbol string //Function symbol, default is `->`
CastError bool //Returns an error when the type cannot be converted, default is `false`
Debug bool //Debug mode, debug will print some log, default is `false`
}
Config configuration
func DefaultConfig ¶
func DefaultConfig() Config
DefaultConfig the default Config
Config{
TagName: "pagser",
FuncSymbol: "->",
CastError: false,
Debug: false,
}
type Pagser ¶
type Pagser struct {
Config Config
// contains filtered or unexported fields
}
Pagser the page parser
func NewWithConfig ¶
NewWithConfig creates a pagser client with the given Config; it returns the client and an error.
Example ¶
cfg := Config{
TagName: "pagser",
FuncSymbol: "->",
CastError: false,
Debug: false,
}
p, err := NewWithConfig(cfg)
if err != nil {
log.Fatal(err)
}
//data parser model
var page ExamplePage
//parse html data
err = p.Parse(&page, rawExampleHtml)
//check error
if err != nil {
log.Fatal(err)
}
func (*Pagser) Parse ¶
Parse parses an HTML document string into a struct.
Example ¶
//New default Config
p := New()
//data parser model
var page ExamplePage
//parse html data
err := p.Parse(&page, rawExampleHtml)
//check error
if err != nil {
log.Fatal(err)
}
//print result
log.Printf("%v", page)
func (*Pagser) ParseDocument ¶
ParseDocument parses a goquery document into a struct.
Example ¶
//New default Config
p := New()
//data parser model
var data ExamplePage
doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml))
if err != nil {
log.Fatal(err)
}
//parse document
err = p.ParseDocument(&data, doc)
//check error
if err != nil {
log.Fatal(err)
}
//print result
log.Printf("%v", data)
func (*Pagser) ParseReader ¶ added in v0.0.3
ParseReader parses HTML read from an io.Reader into a struct.
Example ¶
resp, err := http.Get("https://raw.githubusercontent.com/foolin/pagser/master/_examples/pages/demo.html")
if err != nil {
log.Fatal(err)
}
defer resp.Body.Close()
//New default Config
p := New()
//data parser model
var page ExamplePage
//parse html data
err = p.ParseReader(&page, resp.Body)
//check error
if err != nil {
panic(err)
}
log.Printf("%v", page)
func (*Pagser) ParseSelection ¶
Example ¶
//New default Config
p := New()
//data parser model
var data ExamplePage
doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml))
if err != nil {
log.Fatal(err)
}
//parse document
err = p.ParseSelection(&data, doc.Selection)
//check error
if err != nil {
log.Fatal(err)
}
//print result
log.Printf("%v", data)
func (*Pagser) RegisterFunc ¶
RegisterFunc register function for parse result
pagser.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
})
Example ¶
p := New()
p.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
})
