Documentation
      ¶
    
    
  
    
  
    Overview ¶
Package html2data - extract data from HTML via CSS selectors
Install package and command line utility:
go get -u github.com/msoap/html2data/cmd/html2data
Install package only:
go get -u github.com/msoap/html2data
Allowed pseudo-selectors:
:attr(attr_name) - for getting attributes instead of text
:html - for getting HTML instead of text
:get(N) - get n-th element from list
Command line utility:
html2data URL "css selector"
html2data file.html "css selector"
cat file.html | html2data "css selector"
Example ¶
package main
import (
	"fmt"
	"log"
	"github.com/msoap/html2data"
)
// main fetches http://example.com and prints the page title (trimmed and
// as is), every H1 header, and the href of every link. Any error reported
// by the document or by a GetData* call aborts the program.
func main() {
	doc := html2data.FromURL("http://example.com")
	// or with config
	// doc := html2data.FromURL("http://example.com", html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: true})
	if doc.Err != nil {
		log.Fatal(doc.Err)
	}

	// get title
	title, err := doc.GetDataSingle("title")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("Title is:", title)

	// get title again, this time as is (spaces are not trimmed)
	title, err = doc.GetDataSingle("title", html2data.Cfg{DontTrimSpaces: true})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("Title as is, with spaces:", title)

	texts, err := doc.GetData(map[string]string{"h1": "h1", "links": "a:attr(href)"})
	if err != nil {
		log.Fatal(err)
	}

	// get all H1 headers (ranging over a missing key is a no-op):
	for _, text := range texts["h1"] {
		fmt.Println(text)
	}

	// get all urls from links
	for _, text := range texts["links"] {
		fmt.Println(text)
	}
}
Index ¶
- type CSSSelector
 - type Cfg
 - type Doc
 - func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)
 - func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)
 - func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)
 - func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)
 - func (doc Doc) GetDataSingle(selector string, configs ...Cfg) (result string, err error)
 
- type URLCfg
 
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type CSSSelector ¶
// CSSSelector is a selector together with its settings.
type CSSSelector struct {
	// contains filtered or unexported fields
}
    CSSSelector - selector with settings
type Cfg ¶
// Cfg is the optional config accepted by the GetData* methods.
type Cfg struct {
	// DontTrimSpaces makes the methods return text as is; by default spaces are trimmed.
	DontTrimSpaces bool
}
    Cfg - config for GetData* methods
type Doc ¶
// Doc is an HTML document to be parsed.
type Doc struct {
	// Err is checked by the examples right after FromFile, FromReader and
	// FromURL; presumably it holds the error from loading the document,
	// if any (confirm against the constructors).
	Err error
	// contains filtered or unexported fields
}
    Doc - HTML document for parsing
func FromFile ¶
FromFile - get doc from file
Example ¶
package main
import (
	"log"
	"github.com/msoap/html2data"
)
// main builds a Doc from file_name.html and aborts when that Doc carries an error.
func main() {
	if err := html2data.FromFile("file_name.html").Err; err != nil {
		log.Fatal(err)
	}
}
func FromReader ¶
FromReader - get doc from io.Reader
Example ¶
package main
import (
	"bufio"
	"log"
	"os"
	"github.com/msoap/html2data"
)
// main builds a Doc from buffered standard input and aborts when that Doc
// carries an error.
func main() {
	stdin := bufio.NewReader(os.Stdin)
	if err := html2data.FromReader(stdin).Err; err != nil {
		log.Fatal(err)
	}
}
func FromURL ¶
FromURL - get doc from URL
FromURL("https://url")
FromURL("https://url", URLCfg{UA: "Custom UA 1.0", TimeOut: 10})
Example ¶
package main
import (
	"log"
	"github.com/msoap/html2data"
)
// main requests http://example.com twice: first with the default settings,
// then with an explicit URLCfg. Either request aborts the program when the
// resulting Doc carries an error.
func main() {
	plain := html2data.FromURL("http://example.com")
	if err := plain.Err; err != nil {
		log.Fatal(err)
	}

	// or with config
	cfg := html2data.URLCfg{UA: "userAgent", TimeOut: 10, DontDetectCharset: false}
	configured := html2data.FromURL("http://example.com", cfg)
	if err := configured.Err; err != nil {
		log.Fatal(err)
	}
}
func (Doc) GetData ¶
func (doc Doc) GetData(selectors map[string]string, configs ...Cfg) (result map[string][]string, err error)
GetData - extract data by CSS-selectors
texts, err := doc.GetData(map[string]string{"h1": "h1"})
Example ¶
package main
import (
	"fmt"
	"github.com/msoap/html2data"
)
// main extracts the H1 headers and link URLs of http://example.com in a
// single GetData call and prints them, headers first.
func main() {
	selectors := map[string]string{"headers": "h1", "links": "a:attr(href)"}
	texts, _ := html2data.FromURL("http://example.com").GetData(selectors)

	// first all H1 headers, then all urls from links
	for _, name := range []string{"headers", "links"} {
		values, found := texts[name]
		if !found {
			continue
		}
		for _, value := range values {
			fmt.Println(value)
		}
	}
}
func (Doc) GetDataFirst ¶
func (doc Doc) GetDataFirst(selectors map[string]string, configs ...Cfg) (result map[string]string, err error)
GetDataFirst - extract data by CSS-selectors, get first entry for each selector or ""
texts, err := doc.GetDataFirst(map[string]string{"h1": "h1"})
Example ¶
package main
import (
	"fmt"
	"log"
	"github.com/msoap/html2data"
)
// main prints the first H1 header and the first link URL found on
// http://example.com, aborting on any extraction error.
func main() {
	selectors := map[string]string{"header": "h1", "first_link": "a:attr(href)"}
	doc := html2data.FromURL("http://example.com")

	texts, err := doc.GetDataFirst(selectors)
	if err != nil {
		log.Fatal(err)
	}

	header := texts["header"]        // H1 header
	firstLink := texts["first_link"] // URL in first link
	fmt.Println("header: ", header)
	fmt.Println("first link: ", firstLink)
}
func (Doc) GetDataNested ¶
func (doc Doc) GetDataNested(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string][]string, err error)
GetDataNested - extract nested data by CSS-selectors from another CSS-selector
texts, err := doc.GetDataNested("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example ¶
package main
import (
	"fmt"
	"github.com/msoap/html2data"
)
// main walks every <div class="article"> in test.html and prints, per
// article, its H1 headers followed by the URLs of its links.
func main() {
	nested := map[string]string{"headers": "h1", "links": "a:attr(href)"}
	articles, _ := html2data.FromFile("test.html").GetDataNested("div.article", nested)

	for _, article := range articles {
		// inside each <div class="article">: all H1 headers, then all urls from links
		for _, name := range []string{"headers", "links"} {
			values, found := article[name]
			if !found {
				continue
			}
			for _, value := range values {
				fmt.Println(value)
			}
		}
	}
}
func (Doc) GetDataNestedFirst ¶
func (doc Doc) GetDataNestedFirst(selectorRaw string, nestedSelectors map[string]string, configs ...Cfg) (result []map[string]string, err error)
GetDataNestedFirst - extract nested data by CSS-selectors from another CSS-selector, get first entry for each selector or ""
texts, err := doc.GetDataNestedFirst("CSS.selector", map[string]string{"h1": "h1"}) - get h1 from CSS.selector
Example ¶
package main
import (
	"fmt"
	"log"
	"github.com/msoap/html2data"
)
// main prints, for every <div class="block"> in cmd/html2data/test.html,
// the first H1 header, the first link URL and the first span text.
func main() {
	texts, err := html2data.FromFile("cmd/html2data/test.html").GetDataNestedFirst("div.block", map[string]string{"header": "h1", "link": "a:attr(href)", "sp": "span"})
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("")
	for _, block := range texts {
		// get first H1 header
		fmt.Printf("header - %s\n", block["header"])
		// get first link
		fmt.Printf("first URL - %s\n", block["link"])
		// The span selector is registered under the key "sp", so it must be
		// read back with that key. The blocks are expected to contain no
		// <span>, in which case the value is "".
		fmt.Printf("span - '%s'\n", block["sp"])
	}
}
Output:
header - Head1.1
first URL - http://url1
span - ''
header - Head2.1
first URL - http://url2
span - ''
func (Doc) GetDataSingle ¶
GetDataSingle - extract data by one CSS-selector
title, err := doc.GetDataSingle("title")
Example ¶
package main
import (
	"fmt"
	"log"
	"github.com/msoap/html2data"
)
// main prints the title of cmd/html2data/test.html, aborting on an
// extraction error.
func main() {
	doc := html2data.FromFile("cmd/html2data/test.html")

	// get title
	title, err := doc.GetDataSingle("title")
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println("Title is:", title)
}
Output: Title is: Title