Documentation
¶
Index ¶
- Constants
- Variables
- func BaseDomain(host string) string
- func Clean(doc *goquery.Document) *goquery.Document
- func ConvertLazyLoadedImages(doc *goquery.Document) *goquery.Document
- func DetectAndDecodeText(data []byte, contentType string) (string, error)
- func GetEncodingByCharset(charset string) encoding.Encoding
- func GetEncodingFromMeta(htmlContent string) encoding.Encoding
- func IsLargeDocument(size int64) bool
- func IsTextContent(contentType string) bool
- func MergeHeaders(customHeaders map[string]string) map[string]string
- func NormalizeHTML(html string) string
- func NormalizeMetaTags(doc *goquery.Document) *goquery.Document
- func ValidateResponse(response *Response, parseNon200 bool) error
- type FetchResult
- type HTTPClient
- type Resource
- func (r *Resource) Create(ctx context.Context, rawURL string, preparedResponse string, ...) (*goquery.Document, error)
- func (r *Resource) CreateWithClient(ctx context.Context, rawURL string, preparedResponse string, ...) (*goquery.Document, error)
- func (r *Resource) EncodeDoc(content []byte, contentType string, alreadyDecoded bool) (*goquery.Document, error)
- func (r *Resource) GenerateDoc(result *FetchResult) (*goquery.Document, error)
- func (r *Resource) GenerateDocStreaming(result *FetchResult) (*goquery.Document, error)
- func (r *Resource) GenerateDocWithContext(ctx context.Context, result *FetchResult) (*goquery.Document, error)
- func (r *Resource) ValidateDOMComplexity(doc *goquery.Document) error
- func (r *Resource) ValidateResourceLimits(body []byte) error
- type Response
Constants ¶
const DEFAULT_ENCODING = "utf-8"
Default encoding constants
const FETCH_TIMEOUT = 10 * time.Second
The maximum duration to attempt to fetch a resource before timing out (10 seconds).
const MAX_CONTENT_LENGTH = 5242880
Use this setting as the maximum size an article can be for us to attempt parsing. Defaults to 5 MB.
const MAX_DOCUMENT_SIZE = 10485760
Maximum document size for processing (10 MB)
const MAX_DOCUMENT_SIZE_STREAMING = 52428800
Maximum document size for streaming processing (50 MB). Streaming can handle larger documents efficiently.
const MAX_DOM_ELEMENTS = 50000
Maximum number of DOM elements to process
const MAX_PROCESSING_TIME = 30 * time.Second
Maximum processing time for extraction (30 seconds)
const TAGS_TO_REMOVE = "script,style,form"
Tags to remove during initial DOM cleanup
Variables ¶
var ( IS_LINK_RE = regexp.MustCompile(`https?://`) IS_IMAGE_RE = regexp.MustCompile(`\.(png|gif|jpe?g)$`) IS_SRCSET_RE = regexp.MustCompile(`\.(png|gif|jpe?g)(\?\S+)?(\s*[\d.]+[wx])`) )
Regular expressions for image and link detection
var BAD_CONTENT_TYPES = []string{
"audio/mpeg",
"image/gif",
"image/jpeg",
"image/jpg",
}
Content types that we do not extract content from
var BAD_CONTENT_TYPES_RE = regexp.MustCompile(`^(` + joinContentTypes() + `)$`)
Regular expression to match bad content types
var ENCODING_RE = regexp.MustCompile(`charset=([\w-]+)\b`)
var REQUEST_HEADERS = map[string]string{
"User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
}
Request headers that match the JavaScript version
var STANDARD_HEADERS = map[string]string{
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language": "en-US,en;q=0.5",
"DNT": "1",
"Connection": "keep-alive",
"Upgrade-Insecure-Requests": "1",
}
Standard HTTP headers for web content fetching
Functions ¶
func BaseDomain ¶
BaseDomain extracts the base domain from a host. It takes the last two pieces of the host name and joins them back together, e.g. to get 'livejournal.com' from 'erotictrains.livejournal.com'.
func Clean ¶
Clean removes unwanted elements from the DOM: scripts, styles, forms, and comments.
func ConvertLazyLoadedImages ¶
ConvertLazyLoadedImages converts lazy-loaded images into normal images. Many sites have img tags with no source, or with placeholders in the src attribute, so the src attribute needs to be filled in properly from the data-* attributes.
func DetectAndDecodeText ¶
DetectAndDecodeText detects encoding and converts to UTF-8
func GetEncodingByCharset ¶
GetEncodingByCharset returns encoding by charset name (public wrapper)
func GetEncodingFromMeta ¶
GetEncodingFromMeta extracts the encoding from HTML meta tags. This matches the behavior of the JavaScript getEncoding function.
func IsLargeDocument ¶
IsLargeDocument determines if a document should use streaming
func IsTextContent ¶
IsTextContent checks if content type indicates text content
func MergeHeaders ¶
MergeHeaders creates a complete header map by merging default and custom headers
func NormalizeHTML ¶
NormalizeHTML performs basic HTML normalization
func NormalizeMetaTags ¶
NormalizeMetaTags normalizes meta tags for easier extraction: it converts the 'content' attribute to 'value' and the 'property' attribute to 'name'. This matches the JavaScript normalizeMetaTags function.
func ValidateResponse ¶
ValidateResponse validates that the response is suitable for parsing
Types ¶
type FetchResult ¶
FetchResult represents the result of fetching a resource
func FetchResource ¶
func FetchResource(ctx context.Context, rawURL string, parsedURL *url.URL, headers map[string]string) (*FetchResult, error)
FetchResource fetches a resource from the given URL with retry logic. Deprecated: Use FetchResourceWithClient instead.
func FetchResourceWithClient ¶
func FetchResourceWithClient(ctx context.Context, rawURL string, parsedURL *url.URL, headers map[string]string, httpClient *HTTPClient) (*FetchResult, error)
FetchResourceWithClient fetches a resource using the provided HTTP client
func (*FetchResult) IsError ¶
func (fr *FetchResult) IsError() bool
IsError returns true if the fetch result contains an error
type HTTPClient ¶
type HTTPClient struct {
Client *http.Client // Exported for external use
Headers map[string]string // Exported for external use
}
HTTPClient provides a configured HTTP client for fetching resources
func CreateDefaultHTTPClient ¶
func CreateDefaultHTTPClient() *HTTPClient
CreateDefaultHTTPClient creates a new HTTP client with default settings. It is used when no custom client is provided.
func NewHTTPClient ¶
func NewHTTPClient(headers map[string]string) *HTTPClient
NewHTTPClient creates a new HTTP client with sensible defaults
func (*HTTPClient) GetWithRetry ¶
func (c *HTTPClient) GetWithRetry(ctx context.Context, url string, maxRetries int) (*Response, error)
GetWithRetry performs a GET request with specified number of retries
type Resource ¶
type Resource struct{}
Resource provides functionality for fetching and preparing HTML documents
func (*Resource) Create ¶
func (r *Resource) Create(ctx context.Context, rawURL string, preparedResponse string, parsedURL *url.URL, headers map[string]string) (*goquery.Document, error)
Create creates a Resource by fetching from the URL or by using the provided HTML. This is the main entry point that orchestrates fetch -> decode -> DOM preparation. It automatically detects large documents and uses streaming when beneficial.
Parameters:
- ctx: Context for cancellation and timeout.
- rawURL: The URL for the document we should retrieve.
- preparedResponse: If set, used as the response rather than fetching. Expects an HTML string.
- parsedURL: Pre-parsed URL object (optional).
- headers: Custom headers to include in the request.
func (*Resource) CreateWithClient ¶
func (r *Resource) CreateWithClient(ctx context.Context, rawURL string, preparedResponse string, parsedURL *url.URL, headers map[string]string, httpClient *HTTPClient) (*goquery.Document, error)
CreateWithClient creates a Resource using the provided HTTP client
func (*Resource) EncodeDoc ¶
func (r *Resource) EncodeDoc(content []byte, contentType string, alreadyDecoded bool) (*goquery.Document, error)
EncodeDoc handles character encoding detection and document creation
func (*Resource) GenerateDoc ¶
func (r *Resource) GenerateDoc(result *FetchResult) (*goquery.Document, error)
GenerateDoc creates a goquery Document from a fetch result. It handles encoding detection and applies the DOM preparation pipeline with resource limits. Deprecated: This method uses context.Background(), which prevents proper timeout control. Use Create or GenerateDocWithContext instead.
func (*Resource) GenerateDocStreaming ¶
func (r *Resource) GenerateDocStreaming(result *FetchResult) (*goquery.Document, error)
GenerateDocStreaming creates a goquery Document using streaming for large documents. It provides memory optimization for documents over 1 MB by processing HTML in chunks.
func (*Resource) GenerateDocWithContext ¶
func (r *Resource) GenerateDocWithContext(ctx context.Context, result *FetchResult) (*goquery.Document, error)
GenerateDocWithContext creates a document with context for timeout control
func (*Resource) ValidateDOMComplexity ¶
ValidateDOMComplexity checks if the DOM has too many elements
func (*Resource) ValidateResourceLimits ¶
ValidateResourceLimits checks if the resource is within safe processing limits
type Response ¶
Response represents an HTTP response
func (*Response) GetContentType ¶
GetContentType returns the content type header