htmlselector

package module
v1.3.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jul 30, 2020 License: MIT Imports: 3 Imported by: 2

README

go-html-selector

GoDoc Go Report Card Build Status codecov

A library for collecting specified HTML tags and their attributes from an HTML document.

Features

  • collecting from an HTML document:
    • HTML tags;
    • HTML attributes;
  • options:
    • filters:
      • filtering a result:
        • by specified HTML tags;
        • by specified HTML attributes;
      • friendly representation of filters:
        • for parsing from JSON;
        • for definition as a code literal;
    • skipping empty tags (i.e. without attributes; optional);
  • representing a result:
    • using the builder interface for building a result;
    • built-in builders:
      • with grouping HTML attributes by their tags;
  • optimizations:
    • of looking up the matching filter;
    • of conversion from a byte slice to a string;
    • of the number:
      • of memory allocations;
      • of string copies.

Installation

Prepare the directory:

$ mkdir --parents "$(go env GOPATH)/src/github.com/thewizardplusplus/"
$ cd "$(go env GOPATH)/src/github.com/thewizardplusplus/"

Clone this repository:

$ git clone https://github.com/thewizardplusplus/go-html-selector.git
$ cd go-html-selector

Install dependencies with the dep tool:

$ dep ensure -vendor-only

Examples

htmlselector.SelectTags():

package main

import (
	"fmt"
	"log"
	"strings"

	htmlselector "github.com/thewizardplusplus/go-html-selector"
	"github.com/thewizardplusplus/go-html-selector/builders"
)

// main demonstrates htmlselector.SelectTags(): it collects the "href"
// attribute of <a> tags and the "src" and "poster" attributes of <video> tags
// from an inline HTML snippet, then prints every collected tag.
func main() {
	markup := strings.NewReader(`
		<ul>
			<li>
				<a href="http://example.com/1">1</a>
				<video
					src="http://example.com/1.1"
					poster="http://example.com/1.2">
				</video>
			</li>
			<li>
				<a href="http://example.com/2">2</a>
				<video
					src="http://example.com/2.1"
					poster="http://example.com/2.2">
				</video>
			</li>
			<li>
				<a>3</a>
				<video></video>
			</li>
		</ul>
	`)

	// Only the listed attributes of the listed tags are collected.
	optimizedFilters := htmlselector.OptimizeFilters(
		htmlselector.FilterGroup{
			"a":     {"href"},
			"video": {"src", "poster"},
		},
	)

	// SkipEmptyTags() is why the attribute-less tags of the third <li>
	// do not appear in the output below.
	var tagBuilder builders.StructuralBuilder
	if err := htmlselector.SelectTags(
		markup,
		optimizedFilters,
		&tagBuilder,
		htmlselector.SkipEmptyTags(),
	); err != nil {
		log.Fatal(err)
	}

	for _, tag := range tagBuilder.Tags() {
		fmt.Printf("<%s>:\n", tag.Name)
		for _, attr := range tag.Attributes {
			fmt.Printf("  %s=%q\n", attr.Name, attr.Value)
		}
	}

	// Output:
	// <a>:
	//   href="http://example.com/1"
	// <video>:
	//   src="http://example.com/1.1"
	//   poster="http://example.com/1.2"
	// <a>:
	//   href="http://example.com/2"
	// <video>:
	//   src="http://example.com/2.1"
	//   poster="http://example.com/2.2"
}

Benchmarks

htmlselector.SelectTags() with a simple markup:

BenchmarkSelectTags/simple_markup/10_tags/430B-8         	  200000	      9474 ns/op	     6784 B/op	      51 allocs/op
BenchmarkSelectTags/simple_markup/100_tags/4.4K-8        	   20000	     72306 ns/op	    25456 B/op	     414 allocs/op
BenchmarkSelectTags/simple_markup/1000_tags/45.7K-8      	    2000	    621791 ns/op	   190672 B/op	    4017 allocs/op
BenchmarkSelectTags/simple_markup/10000_tags/476.3K-8    	     200	   7247563 ns/op	  3448480 B/op	   40027 allocs/op
BenchmarkSelectTags/simple_markup/100000_tags/4.8M-8     	      20	  80736482 ns/op	 35420205 B/op	  400037 allocs/op
BenchmarkSelectTags/simple_markup/1000000_tags/50.3M-8   	       2	 802693264 ns/op	339752800 B/op	 4000047 allocs/op

htmlselector.SelectTags() with a complex markup:

BenchmarkSelectTags/complex_markup/10_tags/1020B-8       	  100000	     21746 ns/op	    11264 B/op	     153 allocs/op
BenchmarkSelectTags/complex_markup/100_tags/10.4K-8      	   10000	    187676 ns/op	    67328 B/op	    1416 allocs/op
BenchmarkSelectTags/complex_markup/1000_tags/108.8K-8    	    1000	   1823346 ns/op	   740608 B/op	   14021 allocs/op
BenchmarkSelectTags/complex_markup/10000_tags/1.1M-8     	     100	  21162013 ns/op	  9136409 B/op	  140031 allocs/op
BenchmarkSelectTags/complex_markup/100000_tags/11.6M-8   	       5	 227873490 ns/op	 90800432 B/op	 1400041 allocs/op
BenchmarkSelectTags/complex_markup/1000000_tags/120.6M-8 	       1	2402936519 ns/op	881045280 B/op	14000051 allocs/op

License

The MIT License (MIT)

Copyright © 2020 thewizardplusplus

Documentation

Index

Examples

Constants

This section is empty.

Variables

This section is empty.

Functions

func SelectTags

func SelectTags(
	reader io.Reader,
	filters OptimizedFilterGroup,
	builder Builder,
	options ...Option,
) error

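SelectTags reads an HTML document from reader, collects the tags and attributes that match filters, and passes them to builder.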

Example
package main

import (
	"fmt"
	"log"
	"strings"

	htmlselector "github.com/thewizardplusplus/go-html-selector"
	"github.com/thewizardplusplus/go-html-selector/builders"
)

// main demonstrates htmlselector.SelectTags(): it collects the "href"
// attribute of <a> tags and the "src" and "poster" attributes of <video> tags
// from an inline HTML snippet, then prints every collected tag.
func main() {
	markup := strings.NewReader(`
		<ul>
			<li>
				<a href="http://example.com/1">1</a>
				<video
					src="http://example.com/1.1"
					poster="http://example.com/1.2">
				</video>
			</li>
			<li>
				<a href="http://example.com/2">2</a>
				<video
					src="http://example.com/2.1"
					poster="http://example.com/2.2">
				</video>
			</li>
			<li>
				<a>3</a>
				<video></video>
			</li>
		</ul>
	`)

	// Only the listed attributes of the listed tags are collected.
	optimizedFilters := htmlselector.OptimizeFilters(
		htmlselector.FilterGroup{
			"a":     {"href"},
			"video": {"src", "poster"},
		},
	)

	// SkipEmptyTags() makes tags without attributes be skipped, so the
	// attribute-less tags of the third <li> are not reported.
	var tagBuilder builders.StructuralBuilder
	if err := htmlselector.SelectTags(
		markup,
		optimizedFilters,
		&tagBuilder,
		htmlselector.SkipEmptyTags(),
	); err != nil {
		log.Fatal(err)
	}

	for _, tag := range tagBuilder.Tags() {
		fmt.Printf("<%s>:\n", tag.Name)
		for _, attr := range tag.Attributes {
			fmt.Printf("  %s=%q\n", attr.Name, attr.Value)
		}
	}

}
Output:

<a>:
  href="http://example.com/1"
<video>:
  src="http://example.com/1.1"
  poster="http://example.com/1.2"
<a>:
  href="http://example.com/2"
<video>:
  src="http://example.com/2.1"
  poster="http://example.com/2.2"

Types

type AttributeName

type AttributeName string

AttributeName ...

type Builder

// Builder is the sink that SelectTags is given for assembling a result.
// NOTE(review): the exact call order (presumably AddTag for a tag, then
// AddAttribute for each of its selected attributes) is not visible here;
// confirm against the implementation.
type Builder interface {
	// AddTag takes a tag name as a byte slice.
	AddTag(name []byte)
	// AddAttribute takes an attribute name and its value as byte slices.
	AddAttribute(name []byte, value []byte)
}

Builder ...

type FilterGroup

type FilterGroup map[TagName][]AttributeName

FilterGroup maps a tag name to the names of the attributes that should be collected from that tag.

type OptimizedAttributeFilterGroup

type OptimizedAttributeFilterGroup map[AttributeName]struct{}

OptimizedAttributeFilterGroup ...

type OptimizedFilterGroup

type OptimizedFilterGroup map[TagName]OptimizedAttributeFilterGroup

OptimizedFilterGroup ...

func OptimizeFilters

func OptimizeFilters(
	filters FilterGroup,
	options ...Option,
) OptimizedFilterGroup

OptimizeFilters ...

type Option

type Option func(config *OptionConfig)

Option ...

func SkipEmptyTags

func SkipEmptyTags() Option

SkipEmptyTags returns an option that makes empty tags (i.e. tags without attributes) be skipped.

type OptionConfig

type OptionConfig struct {
	// contains filtered or unexported fields
}

OptionConfig ...

type TagName

type TagName string

TagName ...

Directories

Path Synopsis

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL