flyscrape

v0.6.0 · Published: Dec 4, 2023 · License: MPL-2.0

README



flyscrape is a standalone and scriptable web scraper that combines the speed of Go with the flexibility of JavaScript, so you can focus on data extraction rather than request juggling.


Installation · Documentation · Releases

Features

  • Highly Configurable: 10 options to fine-tune your scraper.
  • Standalone: flyscrape comes as a single binary executable.
  • Scriptable: Use JavaScript to write your data extraction logic.
  • Simple API: Extract data from HTML pages with a familiar API.
  • Fast Iteration: Use the development mode to get quick feedback.
  • Request Caching: Re-run scripts on websites you already scraped.
  • Zero Dependencies: No need to fill up your disk with npm packages.

Example script

export const config = {
    url: "https://news.ycombinator.com/",
    // urls: [],             // Specify additional URLs to start from.      (default = none)
    // depth: 0,              // Specify how deep links should be followed.  (default = 0, no follow)
    // follow: [],           // Specify the CSS selectors to follow.        (default = ["a[href]"])
    // allowedDomains: [],    // Specify the allowed domains. ['*'] for all. (default = domain from url)
    // blockedDomains: [],    // Specify the blocked domains.                (default = none)
    // allowedURLs: [],       // Specify the allowed URLs as regex.          (default = all allowed)
    // blockedURLs: [],       // Specify the blocked URLs as regex.          (default = none)
    // rate: 100,             // Specify the rate in requests per second.    (default = no rate limit)
    // proxies: [],           // Specify the HTTP(S) proxy URLs.             (default = no proxy)
    // cache: "file",         // Enable file-based request caching.          (default = no cache)
}

export default function ({ doc, absoluteURL }) {
    const title = doc.find("title");
    const posts = doc.find(".athing");

    return {
        title: title.text(),
        posts: posts.map((post) => {
            const link = post.find(".titleline > a");

            return {
                title: link.text(),
                url: link.attr("href"),
            };
        }),
    }
}

Running the script prints the scraped data as JSON:

$ flyscrape run hackernews.js
[
  {
    "url": "https://news.ycombinator.com/",
    "data": {
      "title": "Hacker News",
      "posts": [
        {
          "title": "Show HN: flyscrape - An standalone and scriptable web scraper",
          "url": "https://flyscrape.com/"
        },
        ...
      ]
    }
  }
]

Check out the examples folder for more detailed examples.

Installation

Pre-compiled binary

flyscrape is available for macOS, Linux and Windows as a downloadable binary from the releases page.

Compile from source

To compile flyscrape from source, follow these steps:

  1. Install Go: Make sure you have Go installed on your system. If not, you can download it from https://golang.org/.

  2. Install flyscrape: Open a terminal and run the following command:

    go install github.com/philippta/flyscrape/cmd/flyscrape@latest
    

Usage

Usage:

    flyscrape run SCRIPT [config flags]

Examples:

    # Run the script.
    $ flyscrape run example.js

    # Set the URL as argument.
    $ flyscrape run example.js --url "http://other.com"

    # Enable proxy support.
    $ flyscrape run example.js --proxies "http://someproxy:8043"

    # Follow paginated links.
    $ flyscrape run example.js --depth 5 --follow ".next-button > a"

Configuration

Below is an example scraping script that showcases the capabilities of flyscrape. For full documentation of all configuration options, visit the documentation page.

export const config = {
    url: "https://example.com/",                        // Specify the URL to start scraping from.
    urls: [                                             // Specify the URL(s) to start scraping from. If both `url` and `urls`
        "https://example.com/foo",                      // are provided, all of the specified URLs will be scraped.
        "https://example.com/bar",
    ],
    depth: 0,                                           // Specify how deep links should be followed.  (default = 0, no follow)
    follow: [],                                         // Specify the CSS selectors to follow.        (default = ["a[href]"])
    allowedDomains: [],                                 // Specify the allowed domains. ['*'] for all. (default = domain from url)
    blockedDomains: [],                                 // Specify the blocked domains.                (default = none)
    allowedURLs: [],                                    // Specify the allowed URLs as regex.          (default = all allowed)
    blockedURLs: [],                                    // Specify the blocked URLs as regex.          (default = none)
    rate: 100,                                          // Specify the rate in requests per second.    (default = no rate limit)
    proxies: [],                                        // Specify the HTTP(S) proxy URLs.             (default = no proxy)
    cache: "file",                                      // Enable file-based request caching.          (default = no cache)
    headers: {                                          // Specify the HTTP request header.            (default = none)
        "Authorization": "Basic ZGVtbzpwQDU1dzByZA==",
        "User-Agent": "Gecko/1.0",
    },
};

export function setup() {
    // Optional setup function, called once before scraping starts.
    // Can be used for authentication.
}

export default function ({ doc, url, absoluteURL }) {
    // doc              - Contains the parsed HTML document
    // url              - Contains the scraped URL
    // absoluteURL(...) - Transforms relative URLs into absolute URLs
}

Query API

// <div class="element" foo="bar">Hey</div>
const el = doc.find(".element")
el.text()                                 // "Hey"
el.html()                                 // `<div class="element">Hey</div>`
el.attr("foo")                            // "bar"
el.hasAttr("foo")                         // true
el.hasClass("element")                    // true

// <ul>
//   <li class="a">Item 1</li>
//   <li>Item 2</li>
//   <li>Item 3</li>
// </ul>
const list = doc.find("ul")
list.children()                           // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]

const items = list.find("li")
items.length()                            // 3
items.first()                             // <li>Item 1</li>
items.last()                              // <li>Item 3</li>
items.get(1)                              // <li>Item 2</li>
items.get(1).prev()                       // <li>Item 1</li>
items.get(1).next()                       // <li>Item 3</li>
items.get(1).parent()                     // <ul>...</ul>
items.get(1).siblings()                   // [<li class="a">Item 1</li>, <li>Item 2</li>, <li>Item 3</li>]
items.map(item => item.text())            // ["Item 1", "Item 2", "Item 3"]
items.filter(item => item.hasClass("a"))  // [<li class="a">Item 1</li>]

Flyscrape API

Document Parsing

import { parse } from "flyscrape";

const doc = parse(`<div class="foo">bar</div>`);
const text = doc.find(".foo").text();

Basic HTTP Requests

import http from "flyscrape/http";

const response = http.get("https://example.com")

const response = http.postForm("https://example.com", {
    "username": "foo",
    "password": "bar",
})

const response = http.postJSON("https://example.com", {
    "username": "foo",
    "password": "bar",
})

// Contents of response
{
    body: "<html>...</html>",
    status: 200,
    headers: {
        "Content-Type": "text/html",
        // ...
    },
    error": "",
}

File Downloads

import { download } from "flyscrape/http";

download("http://example.com/image.jpg")              // downloads as "image.jpg"
download("http://example.com/image.jpg", "other.jpg") // downloads as "other.jpg"
download("http://example.com/image.jpg", "dir/")      // downloads as "dir/image.jpg"

// If the server offers a filename via the Content-Disposition header and no
// destination filename is provided, Flyscrape will honor the suggested filename.
// E.g. `Content-Disposition: attachment; filename="archive.zip"`
download("http://example.com/generate_archive.php", "dir/") // downloads as "dir/archive.zip"

Issues and Suggestions

If you encounter any issues or have suggestions for improvement, please submit an issue.

Documentation

Index

Constants

const HeaderBypassCache = "X-Flyscrape-Bypass-Cache"

Variables

var ScriptTemplate []byte
var StopWatch = errors.New("stop watch")

Functions

func Dev added in v0.4.0

func Dev(file string, overrides map[string]any) error

func Document added in v0.4.0

func Document(sel *goquery.Selection) map[string]any

func DocumentFromString added in v0.4.0

func DocumentFromString(s string) (map[string]any, error)
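
A minimal sketch of calling this from Go. The assumption that the returned map is the same structure scraping scripts receive as doc is mine, not documented:

package main

import (
	"fmt"
	"log"

	"github.com/philippta/flyscrape"
)

func main() {
	doc, err := flyscrape.DocumentFromString(`<div class="foo">bar</div>`)
	if err != nil {
		log.Fatal(err)
	}
	// Assumption: this map is the structure scripts receive as `doc`.
	fmt.Printf("parsed document: %T with %d entries\n", doc, len(doc))
}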

func MockResponse added in v0.2.0

func MockResponse(statusCode int, html string) (*http.Response, error)

func RegisterModule added in v0.2.0

func RegisterModule(mod Module)
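
As a sketch, a custom module ties these pieces together: it satisfies Module via ModuleInfo (see Types below) and opts into hooks such as ResponseReceiver by implementing them. The "logger" module here is hypothetical, and registering from init() mirrors how the bundled modules presumably register themselves:

package logger

import (
	"log"

	"github.com/philippta/flyscrape"
)

// logger is a hypothetical module that logs every response received.
type logger struct{}

// ModuleInfo satisfies the Module interface (see Types below).
func (logger) ModuleInfo() flyscrape.ModuleInfo {
	return flyscrape.ModuleInfo{
		ID:  "logger",
		New: func() flyscrape.Module { return logger{} },
	}
}

// ReceiveResponse opts into the ResponseReceiver hook (see Types below).
func (logger) ReceiveResponse(res *flyscrape.Response) {
	log.Println(res.Request.URL, res.StatusCode)
}

func init() {
	flyscrape.RegisterModule(logger{})
}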

func Run added in v0.4.0

func Run(file string, overrides map[string]any) error

func Watch

func Watch(path string, fn func(string) error) error
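
A sketch of using Watch, assuming (from the name of the StopWatch sentinel above, not from documented behavior) that returning StopWatch from the callback ends the watch:

package main

import (
	"log"

	"github.com/philippta/flyscrape"
)

func main() {
	runs := 0
	err := flyscrape.Watch("script.js", func(path string) error {
		runs++
		log.Println("changed:", path)
		if runs >= 3 {
			// Assumption: returning the StopWatch sentinel ends the watch.
			return flyscrape.StopWatch
		}
		return nil
	})
	if err != nil && err != flyscrape.StopWatch {
		log.Fatal(err)
	}
}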

Types

type Config added in v0.2.0

type Config []byte

type Context added in v0.2.0

type Context interface {
	ScriptName() string
	Visit(url string)
	MarkVisited(url string)
	MarkUnvisited(url string)
}

type Exports added in v0.4.0

type Exports map[string]any

func Compile

func Compile(src string, imports Imports) (Exports, error)

func (Exports) Config added in v0.4.0

func (e Exports) Config() []byte

func (Exports) Scrape added in v0.4.0

func (e Exports) Scrape(p ScrapeParams) (any, error)

func (Exports) Setup added in v0.4.0

func (e Exports) Setup()
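
Taken together, a sketch of compiling and running a script by hand. It assumes these calls compose as their signatures suggest; NewJSLibrary and ScrapeParams are documented further below:

package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/philippta/flyscrape"
)

func main() {
	src := `
export const config = { url: "https://example.com/" }
export default function ({ doc }) {
    return { title: doc.find("title").text() }
}`

	// Provides the JS modules scripts import, e.g. "flyscrape/http".
	// Assumption: wait() flushes pending asynchronous work.
	imports, wait := flyscrape.NewJSLibrary(http.DefaultClient)
	defer wait()

	exports, err := flyscrape.Compile(src, imports)
	if err != nil {
		log.Fatal(err)
	}

	fmt.Println(string(exports.Config())) // the raw config bytes
	exports.Setup()                       // runs the optional setup() export

	data, err := exports.Scrape(flyscrape.ScrapeParams{
		HTML: `<html><title>Example</title></html>`,
		URL:  "https://example.com/",
	})
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(data)
}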

type Finalizer added in v0.2.0

type Finalizer interface {
	Finalize()
}

type Imports added in v0.4.0

type Imports map[string]map[string]any

func NewJSLibrary added in v0.4.0

func NewJSLibrary(client *http.Client) (imports Imports, wait func())

type Module added in v0.2.0

type Module interface {
	ModuleInfo() ModuleInfo
}

func LoadModules added in v0.2.0

func LoadModules(cfg Config) []Module

type ModuleInfo added in v0.2.0

type ModuleInfo struct {
	ID  string
	New func() Module
}

type Provisioner added in v0.2.0

type Provisioner interface {
	Provision(Context)
}

type Request added in v0.2.0

type Request struct {
	Method  string
	URL     string
	Headers http.Header
	Cookies http.CookieJar
	Depth   int
}

type RequestBuilder added in v0.2.0

type RequestBuilder interface {
	BuildRequest(*Request)
}

type RequestValidator added in v0.2.0

type RequestValidator interface {
	ValidateRequest(*Request) bool
}

type Response added in v0.2.0

type Response struct {
	StatusCode int
	Headers    http.Header
	Body       []byte
	Data       any
	Error      error
	Request    *Request

	Visit func(url string)
}

type ResponseReceiver added in v0.2.0

type ResponseReceiver interface {
	ReceiveResponse(*Response)
}

type RoundTripFunc added in v0.2.0

type RoundTripFunc func(*http.Request) (*http.Response, error)

func MockTransport added in v0.2.0

func MockTransport(statusCode int, html string) RoundTripFunc

func (RoundTripFunc) RoundTrip added in v0.2.0

func (f RoundTripFunc) RoundTrip(r *http.Request) (*http.Response, error)
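
Because RoundTripFunc implements http.RoundTripper, MockTransport can stub out the network in tests. A sketch, assuming the mock simply replays the given status code and HTML body:

package main

import (
	"fmt"
	"io"
	"net/http"

	"github.com/philippta/flyscrape"
)

func main() {
	// Every request through this client gets the canned response.
	client := &http.Client{
		Transport: flyscrape.MockTransport(200, `<title>stub</title>`),
	}

	res, err := client.Get("https://example.com/")
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	body, _ := io.ReadAll(res.Body)
	fmt.Println(res.StatusCode, string(body)) // expect: 200 and the stubbed HTML
}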

type ScrapeFunc

type ScrapeFunc func(ScrapeParams) (any, error)

type ScrapeParams

type ScrapeParams struct {
	HTML string
	URL  string
}

type Scraper

type Scraper struct {
	ScrapeFunc ScrapeFunc
	SetupFunc  func()
	Script     string
	Modules    []Module
	Client     *http.Client
	// contains filtered or unexported fields
}

func NewScraper added in v0.2.0

func NewScraper() *Scraper

func (*Scraper) MarkUnvisited added in v0.2.0

func (s *Scraper) MarkUnvisited(url string)

func (*Scraper) MarkVisited added in v0.2.0

func (s *Scraper) MarkVisited(url string)

func (*Scraper) Run added in v0.2.0

func (s *Scraper) Run()

func (*Scraper) ScriptName added in v0.2.0

func (s *Scraper) ScriptName() string

func (*Scraper) Visit added in v0.2.0

func (s *Scraper) Visit(url string)
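
A sketch of driving a Scraper directly with a Go ScrapeFunc instead of a compiled script. Field and method names come from the declarations above; how much default wiring NewScraper performs is an assumption:

package main

import (
	"fmt"

	"github.com/philippta/flyscrape"
)

func main() {
	s := flyscrape.NewScraper()

	// A Go ScrapeFunc in place of a compiled JavaScript script.
	s.ScrapeFunc = func(p flyscrape.ScrapeParams) (any, error) {
		return map[string]any{"url": p.URL, "bytes": len(p.HTML)}, nil
	}

	s.Visit("https://example.com/") // seed the visit queue
	s.Run()                         // assumption: blocks until the queue drains

	fmt.Println("done")
}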

type TransformError

type TransformError struct {
	Line   int
	Column int
	Text   string
}

func (TransformError) Error

func (err TransformError) Error() string

type TransportAdapter added in v0.2.0

type TransportAdapter interface {
	AdaptTransport(http.RoundTripper) http.RoundTripper
}

Directories

Path            Synopsis
cmd/flyscrape   command
modules
