Documentation
¶
Overview ¶
Package pagser is a simple, easy, extensible and configurable library that parses HTML into structs, based on goquery and struct tags. It is the parser library from scrago.
The project source code: https://github.com/foolin/pagser
Features ¶
* Simple - Use golang struct tag syntax.
* Easy - Easy to use in your spider/crawler/colly application.
* Extensible - Supports extension functions.
* Struct tag grammar - The grammar is simple, like `pagser:"a->attr(href)"`.
* Nested Structure - Supports nested structures for nodes.
* Configurable - Supports configuration.
* GoQuery/Colly - Supports all goquery-based projects, such as go-colly.
More info: https://github.com/foolin/pagser
Index ¶
- type BuiltinFunctions
- func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EachTextJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) EqAndText(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) NodeChild(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) NodeEq(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) NodeNext(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) NodeParent(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) NodePrev(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) NodeSiblings(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) TextConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
- func (builtin BuiltinFunctions) TextSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
- type CallFunc
- type Config
- type Pagser
- func (p *Pagser) Parse(v interface{}, document string) (err error)
- func (p *Pagser) ParseDocument(v interface{}, document *goquery.Document) (err error)
- func (p *Pagser) ParseReader(v interface{}, reader io.Reader) (err error)
- func (p *Pagser) ParseSelection(v interface{}, selection *goquery.Selection) (err error)
- func (p *Pagser) RegisterFunc(name string, fn CallFunc)
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BuiltinFunctions ¶ added in v0.0.7
type BuiltinFunctions struct {
}
Builtin functions are registered with a lowercase initial, eg: Text -> text()
func (BuiltinFunctions) Attr ¶ added in v0.0.7
func (builtin BuiltinFunctions) Attr(node *goquery.Selection, args ...string) (out interface{}, err error)
attr(name, defaultValue='') get element attribute value, return string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Example string `pagser:".selector->attr(href)"`
}
func (BuiltinFunctions) AttrConcat ¶ added in v0.1.1
func (builtin BuiltinFunctions) AttrConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
attrConcat(name, text1, $value, [ text2, ... text_n ]) `name` is the name of the element attribute to read, `text1, text2, ... text_n` are the strings that you wish to join together, `$value` is the placeholder for the attribute value, return string.
struct {
Example string `pagser:".selector->attrConcat('href', 'Result:', '<', $value, '>')"`
}
func (BuiltinFunctions) AttrEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) AttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
attrEmpty(name, defaultValue) get element attribute value, if empty will return defaultValue, return string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Example string `pagser:".selector->attrEmpty(href, '#')"`
}
func (BuiltinFunctions) AttrSplit ¶ added in v0.0.7
func (builtin BuiltinFunctions) AttrSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
attrSplit(name, sep=',', trim='true') get attribute value and split by separator to array string, return []string.
struct {
Examples []string `pagser:".selector->attrSplit('keywords', ',')"`
}
func (BuiltinFunctions) EachAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
eachAttr(name) get each element attribute value, return []string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Examples []string `pagser:".selector->eachAttr(href)"`
}
func (BuiltinFunctions) EachAttrEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) EachAttrEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
eachAttrEmpty(name, defaultValue) get each element attribute value, return []string.
//<a href="https://github.com/foolin/pagser">Pagser</a>
struct {
Examples []string `pagser:".selector->eachAttrEmpty(href, '#')"`
}
func (BuiltinFunctions) EachHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eachHtml() get each element inner html, return []string.
struct {
Examples []string `pagser:".selector->eachHtml()"`
}
func (BuiltinFunctions) EachOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eachOutHtml() get each element outer html, return []string.
struct {
Examples []string `pagser:".selector->eachOutHtml()"`
}
func (BuiltinFunctions) EachText ¶ added in v0.0.7
func (builtin BuiltinFunctions) EachText(node *goquery.Selection, args ...string) (out interface{}, err error)
eachText() get each element text, return []string.
struct {
Examples []string `pagser:".selector->eachText('')"`
}
func (BuiltinFunctions) EachTextEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) EachTextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
eachTextEmpty(defaultValue) get each element text, return []string.
struct {
Examples []string `pagser:".selector->eachTextEmpty('')"`
}
func (BuiltinFunctions) EachTextJoin ¶ added in v0.1.1
func (builtin BuiltinFunctions) EachTextJoin(node *goquery.Selection, args ...string) (out interface{}, err error)
eachTextJoin(sep) get each element text and join to string, return string.
struct {
Example string `pagser:".selector->eachTextJoin(',')"`
}
func (BuiltinFunctions) EqAndAttr ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndAttr(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndAttr(index, name) reduces the set of matched elements to the one at the specified index, and attr() return string.
struct {
Example string `pagser:".selector->eqAndAttr(0, href)"`
}
func (BuiltinFunctions) EqAndHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndHtml(index) reduces the set of matched elements to the one at the specified index, and html() return string.
struct {
Example string `pagser:".selector->eqAndHtml(0)"`
}
func (BuiltinFunctions) EqAndOutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) EqAndOutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndOutHtml(index) reduces the set of matched elements to the one at the specified index, and outHtml() return string.
struct {
Example string `pagser:".selector->eqAndOutHtml(0)"`
}
func (BuiltinFunctions) EqAndText ¶ added in v0.1.1
func (builtin BuiltinFunctions) EqAndText(node *goquery.Selection, args ...string) (out interface{}, err error)
eqAndText(index) reduces the set of matched elements to the one at the specified index, and text() return string.
struct {
Example string `pagser:".selector->eqAndText(0)"`
}
func (BuiltinFunctions) Html ¶ added in v0.0.7
func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (out interface{}, err error)
html() get element inner html, return string.
struct {
Example string `pagser:".selector->html()"`
}
func (BuiltinFunctions) NodeChild ¶ added in v0.1.1
func (builtin BuiltinFunctions) NodeChild(node *goquery.Selection, args ...string) (out interface{}, err error)
nodeChild(selector = '') gets the child elements of each element in the Selection, filtered by the specified selector if the selector is not empty. It returns a Selection object containing these elements.
struct {
SubStruct struct {
Example string `pagser:".selector->text()"`
} `pagser:".selector->nodeChild()"`
}
func (BuiltinFunctions) NodeEq ¶ added in v0.1.1
func (builtin BuiltinFunctions) NodeEq(node *goquery.Selection, args ...string) (out interface{}, err error)
nodeEq(index) reduces the set of matched elements to the one at the specified index. If a negative index is given, it counts backwards starting at the end of the set. It returns a Selection object, and an empty Selection object if the index is invalid.
struct {
SubStruct struct {
Example string `pagser:".selector->text()"`
} `pagser:".selector->nodeEq(0)"`
}
func (BuiltinFunctions) NodeNext ¶ added in v0.1.1
func (builtin BuiltinFunctions) NodeNext(node *goquery.Selection, args ...string) (out interface{}, err error)
nodeNext() gets the immediately following sibling of each element in the Selection, filtered by the specified selector if the selector is not empty. It returns a Selection object containing these elements.
struct {
SubStruct struct {
Example string `pagser:".selector->text()"`
} `pagser:".selector->nodeNext()"`
}
func (BuiltinFunctions) NodeParent ¶ added in v0.1.1
func (builtin BuiltinFunctions) NodeParent(node *goquery.Selection, args ...string) (out interface{}, err error)
nodeParent() gets the parent elements of each element in the Selection, filtered by the specified selector if the selector is not empty. It returns a Selection object containing these elements.
struct {
SubStruct struct {
Example string `pagser:".selector->text()"`
} `pagser:".selector->nodeParent()"`
}
func (BuiltinFunctions) NodePrev ¶ added in v0.1.1
func (builtin BuiltinFunctions) NodePrev(node *goquery.Selection, args ...string) (out interface{}, err error)
nodePrev() gets the immediately preceding sibling of each element in the Selection, filtered by the specified selector if the selector is not empty. It returns a Selection object containing these elements.
struct {
SubStruct struct {
Example string `pagser:".selector->text()"`
} `pagser:".selector->nodePrev()"`
}
func (BuiltinFunctions) NodeSiblings ¶ added in v0.1.1
func (builtin BuiltinFunctions) NodeSiblings(node *goquery.Selection, args ...string) (out interface{}, err error)
nodeSiblings() gets the siblings of each element in the Selection, filtered by the specified selector if the selector is not empty. It returns a Selection object containing these elements.
struct {
SubStruct struct {
Example string `pagser:".selector->text()"`
} `pagser:".selector->nodeSiblings()"`
}
func (BuiltinFunctions) OutHtml ¶ added in v0.0.7
func (builtin BuiltinFunctions) OutHtml(node *goquery.Selection, args ...string) (out interface{}, err error)
outerHtml() get element outer html, return string.
struct {
Example string `pagser:".selector->outerHtml()"`
}
func (BuiltinFunctions) Text ¶ added in v0.0.7
func (builtin BuiltinFunctions) Text(node *goquery.Selection, args ...string) (out interface{}, err error)
text() get element text, return string. This is the default function, used when no function is defined in the struct tag.
struct {
Example string `pagser:".selector->text()"`
}
func (BuiltinFunctions) TextConcat ¶ added in v0.1.1
func (builtin BuiltinFunctions) TextConcat(node *goquery.Selection, args ...string) (out interface{}, err error)
textConcat(text1, $value, [ text2, ... text_n ]) The `text1, text2, ... text_n` strings that you wish to join together, `$value` is placeholder for get element text, return string.
struct {
Example string `pagser:".selector->textConcat('Result:', '<', $value, '>')"`
}
func (BuiltinFunctions) TextEmpty ¶ added in v0.1.0
func (builtin BuiltinFunctions) TextEmpty(node *goquery.Selection, args ...string) (out interface{}, err error)
textEmpty(defaultValue) get element text, if empty will return defaultValue, return string.
struct {
Example string `pagser:".selector->textEmpty('')"`
}
func (BuiltinFunctions) TextSplit ¶ added in v0.1.1
func (builtin BuiltinFunctions) TextSplit(node *goquery.Selection, args ...string) (out interface{}, err error)
textSplit(sep=',', trim='true') get element text and split by separator to array string, return []string.
struct {
Examples []string `pagser:".selector->textSplit('|')"`
}
type CallFunc ¶
Define Global Function ¶
func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
}
//Register function
pagser.RegisterFunc("MyFunc", MyFunc)
//Use function
type PageData struct{
Text string `pagser:"h1->MyFunc()"`
}
Define Struct Function ¶
//Use function
type PageData struct{
Text string `pagser:"h1->MyFunc()"`
}
func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
}
Lookup function priority order ¶
struct method -> parent method -> ... -> global
Implicit type conversion ¶
Automatic type conversion: the output result string is converted to the field type, such as int, int64, float64...
CallFunc is the function signature that custom parse functions must implement.
type Config ¶
type Config struct {
TagName string //struct tag name, default is `pagser`
FuncSymbol string //Function symbol, default is `->`
CastError bool //Returns an error when the type cannot be converted, default is `false`
Debug bool //Debug mode, debug will print some log, default is `false`
}
Config configuration
func DefaultConfig ¶
func DefaultConfig() Config
DefaultConfig the default Config
Config{
TagName: "pagser",
FuncSymbol: "->",
CastError: false,
Debug: false,
}
type Pagser ¶
type Pagser struct {
Config Config
// contains filtered or unexported fields
}
Pagser the page parser
func NewWithConfig ¶
NewWithConfig creates a pagser client with the given Config; it returns the client and an error.
Example ¶
cfg := Config{
TagName: "pagser",
FuncSymbol: "->",
CastError: false,
Debug: false,
}
p, err := NewWithConfig(cfg)
if err != nil {
log.Fatal(err)
}
//data parser model
var page ExamplePage
//parse html data
err = p.Parse(&page, rawExampleHtml)
//check error
if err != nil {
log.Fatal(err)
}
func (*Pagser) Parse ¶
Parse parses an HTML string into a struct.
Example ¶
//New default Config
p := New()
//data parser model
var page ExamplePage
//parse html data
err := p.Parse(&page, rawExampleHtml)
//check error
if err != nil {
log.Fatal(err)
}
//print result
log.Printf("%v", page)
func (*Pagser) ParseDocument ¶
ParseDocument parses a goquery document into a struct.
Example ¶
//New default Config
p := New()
//data parser model
var data ExamplePage
doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml))
if err != nil {
log.Fatal(err)
}
//parse document
err = p.ParseDocument(&data, doc)
//check error
if err != nil {
log.Fatal(err)
}
//print result
log.Printf("%v", data)
func (*Pagser) ParseReader ¶ added in v0.0.3
ParseReader parses HTML read from an io.Reader into a struct.
Example ¶
resp, err := http.Get("https://raw.githubusercontent.com/foolin/pagser/master/_examples/pages/demo.html")
if err != nil {
log.Fatal(err)
}
defer resp.Body.Close()
//New default Config
p := New()
//data parser model
var page ExamplePage
//parse html data
err = p.ParseReader(&page, resp.Body)
//check error
if err != nil {
panic(err)
}
log.Printf("%v", page)
func (*Pagser) ParseSelection ¶
Example ¶
//New default Config
p := New()
//data parser model
var data ExamplePage
doc, err := goquery.NewDocumentFromReader(strings.NewReader(rawExampleHtml))
if err != nil {
log.Fatal(err)
}
//parse document
err = p.ParseSelection(&data, doc.Selection)
//check error
if err != nil {
log.Fatal(err)
}
//print result
log.Printf("%v", data)
func (*Pagser) RegisterFunc ¶
RegisterFunc registers a custom function that can be used in struct tags to produce the parse result.
pagser.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
})
Example ¶
p := New()
p.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
//Todo
return "Hello", nil
})
