resource

package
v1.0.6 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Aug 31, 2025 License: MIT Imports: 25 Imported by: 0

Documentation

Index

Constants

View Source
const DEFAULT_ENCODING = "utf-8"

Default encoding constants

View Source
const FETCH_TIMEOUT = 10 * time.Second

The maximum time to spend attempting to fetch a resource before timing out (10 seconds).

View Source
const MAX_CONTENT_LENGTH = 5242880

Use this setting as the maximum size an article can be for us to attempt parsing. Defaults to 5 MB.

View Source
const MAX_DOCUMENT_SIZE = 10485760

Maximum document size for processing (10 MB)

View Source
const MAX_DOCUMENT_SIZE_STREAMING = 52428800

Maximum document size for streaming processing (50 MB). Streaming can handle larger documents efficiently.

View Source
const MAX_DOM_ELEMENTS = 50000

Maximum number of DOM elements to process

View Source
const MAX_PROCESSING_TIME = 30 * time.Second

Maximum processing time for extraction (30 seconds)

View Source
const TAGS_TO_REMOVE = "script,style,form"

Tags to remove during initial DOM cleanup

Variables

View Source
var (
	IS_LINK_RE   = regexp.MustCompile(`https?://`)
	IS_IMAGE_RE  = regexp.MustCompile(`\.(png|gif|jpe?g)$`)
	IS_SRCSET_RE = regexp.MustCompile(`\.(png|gif|jpe?g)(\?\S+)?(\s*[\d.]+[wx])`)
)

Regular expressions for image and link detection

View Source
var BAD_CONTENT_TYPES = []string{
	"audio/mpeg",
	"image/gif",
	"image/jpeg",
	"image/jpg",
}

Content types that we do not extract content from

View Source
var BAD_CONTENT_TYPES_RE = regexp.MustCompile(`^(` + joinContentTypes() + `)$`)

Regular expression to match bad content types

View Source
var ENCODING_RE = regexp.MustCompile(`charset=([\w-]+)\b`)
View Source
var REQUEST_HEADERS = map[string]string{
	"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
}

Request headers that match the JavaScript version

View Source
var STANDARD_HEADERS = map[string]string{
	"Accept":                    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	"Accept-Language":           "en-US,en;q=0.5",
	"DNT":                       "1",
	"Connection":                "keep-alive",
	"Upgrade-Insecure-Requests": "1",
}

Standard HTTP headers for web content fetching

Functions

func BaseDomain

func BaseDomain(host string) string

BaseDomain extracts the base domain from a host. It takes the last two pieces of the host name and joins them back together, e.g. to get 'livejournal.com' from 'erotictrains.livejournal.com'.

func Clean

func Clean(doc *goquery.Document) *goquery.Document

Clean removes unwanted elements from the DOM: scripts, styles, forms, and comments.

func ConvertLazyLoadedImages

func ConvertLazyLoadedImages(doc *goquery.Document) *goquery.Document

ConvertLazyLoadedImages converts lazy-loaded images into normal images. Many sites have img tags with no source, or placeholders in the src attribute. We need to properly fill in the src attribute from data-* attributes.

func DetectAndDecodeText

func DetectAndDecodeText(data []byte, contentType string) (string, error)

DetectAndDecodeText detects encoding and converts to UTF-8

func GetEncodingByCharset

func GetEncodingByCharset(charset string) encoding.Encoding

GetEncodingByCharset returns encoding by charset name (public wrapper)

func GetEncodingFromMeta

func GetEncodingFromMeta(htmlContent string) encoding.Encoding

GetEncodingFromMeta extracts the encoding from HTML meta tags. This matches the behavior of the JavaScript getEncoding function.

func IsLargeDocument

func IsLargeDocument(size int64) bool

IsLargeDocument determines if a document should use streaming

func IsTextContent

func IsTextContent(contentType string) bool

IsTextContent checks if content type indicates text content

func MergeHeaders

func MergeHeaders(customHeaders map[string]string) map[string]string

MergeHeaders creates a complete header map by merging default and custom headers

func NormalizeHTML

func NormalizeHTML(html string) string

NormalizeHTML performs basic HTML normalization

func NormalizeMetaTags

func NormalizeMetaTags(doc *goquery.Document) *goquery.Document

NormalizeMetaTags normalizes meta tags for easier extraction: it converts the 'content' attribute to 'value' and the 'property' attribute to 'name'. This matches the JavaScript normalizeMetaTags function.

func ValidateResponse

func ValidateResponse(response *Response, parseNon200 bool) error

ValidateResponse validates that the response is suitable for parsing

Types

type FetchResult

type FetchResult struct {
	Response       *Response
	Error          bool
	Message        string
	AlreadyDecoded bool
}

FetchResult represents the result of fetching a resource

func FetchResource

func FetchResource(ctx context.Context, rawURL string, parsedURL *url.URL, headers map[string]string) (*FetchResult, error)

FetchResource fetches a resource from the given URL with retry logic. Deprecated: Use FetchResourceWithClient instead.

func FetchResourceWithClient

func FetchResourceWithClient(ctx context.Context, rawURL string, parsedURL *url.URL, headers map[string]string, httpClient *HTTPClient) (*FetchResult, error)

FetchResourceWithClient fetches a resource using the provided HTTP client

func (*FetchResult) IsError

func (fr *FetchResult) IsError() bool

IsError returns true if the fetch result contains an error

type HTTPClient

type HTTPClient struct {
	Client  *http.Client      // Exported for external use
	Headers map[string]string // Exported for external use
}

HTTPClient provides a configured HTTP client for fetching resources

func CreateDefaultHTTPClient

func CreateDefaultHTTPClient() *HTTPClient

CreateDefaultHTTPClient creates a new HTTP client with default settings. This is used when no custom client is provided.

func NewHTTPClient

func NewHTTPClient(headers map[string]string) *HTTPClient

NewHTTPClient creates a new HTTP client with sensible defaults

func (*HTTPClient) Get

func (c *HTTPClient) Get(ctx context.Context, url string) (*Response, error)

Get performs a GET request with optional retries

func (*HTTPClient) GetWithRetry

func (c *HTTPClient) GetWithRetry(ctx context.Context, url string, maxRetries int) (*Response, error)

GetWithRetry performs a GET request with specified number of retries

type Resource

type Resource struct{}

Resource provides functionality for fetching and preparing HTML documents

func NewResource

func NewResource() *Resource

NewResource creates a new Resource instance

func (*Resource) Create

func (r *Resource) Create(ctx context.Context, rawURL string, preparedResponse string, parsedURL *url.URL, headers map[string]string) (*goquery.Document, error)

Create creates a Resource by fetching from the URL or using the provided HTML. This is the main entry point that orchestrates fetch -> decode -> DOM preparation. It automatically detects large documents and uses streaming when beneficial.

Parameters:
- ctx: Context for cancellation and timeout.
- rawURL: The URL for the document we should retrieve.
- preparedResponse: If set, use as the response rather than fetching. Expects an HTML string.
- parsedURL: Pre-parsed URL object (optional).
- headers: Custom headers to include in the request.

func (*Resource) CreateWithClient

func (r *Resource) CreateWithClient(ctx context.Context, rawURL string, preparedResponse string, parsedURL *url.URL, headers map[string]string, httpClient *HTTPClient) (*goquery.Document, error)

CreateWithClient creates a Resource using the provided HTTP client

func (*Resource) EncodeDoc

func (r *Resource) EncodeDoc(content []byte, contentType string, alreadyDecoded bool) (*goquery.Document, error)

EncodeDoc handles character encoding detection and document creation

func (*Resource) GenerateDoc

func (r *Resource) GenerateDoc(result *FetchResult) (*goquery.Document, error)

GenerateDoc creates a goquery Document from a fetch result. It handles encoding detection and applies the DOM preparation pipeline with resource limits. Deprecated: This method uses context.Background(), which prevents proper timeout control. Use Create or GenerateDocWithContext instead.

func (*Resource) GenerateDocStreaming

func (r *Resource) GenerateDocStreaming(result *FetchResult) (*goquery.Document, error)

GenerateDocStreaming creates a goquery Document using streaming for large documents. It provides memory optimization for documents over 1MB by processing HTML in chunks.

func (*Resource) GenerateDocWithContext

func (r *Resource) GenerateDocWithContext(ctx context.Context, result *FetchResult) (*goquery.Document, error)

GenerateDocWithContext creates a document with context for timeout control

func (*Resource) ValidateDOMComplexity

func (r *Resource) ValidateDOMComplexity(doc *goquery.Document) error

ValidateDOMComplexity checks if the DOM has too many elements

func (*Resource) ValidateResourceLimits

func (r *Resource) ValidateResourceLimits(body []byte) error

ValidateResourceLimits checks if the resource is within safe processing limits

type Response

type Response struct {
	StatusCode int
	Status     string
	Headers    http.Header
	Body       []byte
}

Response represents an HTTP response

func (*Response) GetContentType

func (r *Response) GetContentType() string

GetContentType returns the content type header

func (*Response) GetHeader

func (r *Response) GetHeader(key string) string

GetHeader returns a header value

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL