config

package
v1.3.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 21, 2026 License: Apache-2.0 Imports: 3 Imported by: 0

Documentation

Index

Constants

View Source
// DefaultMaxPageSizeBytes is the default maximum page body size:
// 50 MiB, i.e. 52,428,800 bytes.
const DefaultMaxPageSizeBytes int64 = 50 << 20

Variables

This section is empty.

Functions

func GetEffectiveChunkingEnabled

func GetEffectiveChunkingEnabled(siteCfg *SiteConfig, appCfg *AppConfig) bool

GetEffectiveChunkingEnabled determines if chunking should be enabled.

func GetEffectiveChunkingMaxSize

func GetEffectiveChunkingMaxSize(siteCfg *SiteConfig, appCfg *AppConfig) int

GetEffectiveChunkingMaxSize returns the effective max chunk size in tokens.

func GetEffectiveChunkingOutputFilename

func GetEffectiveChunkingOutputFilename(siteCfg *SiteConfig, appCfg *AppConfig) string

GetEffectiveChunkingOutputFilename returns the effective chunks output filename.

func GetEffectiveChunkingOverlap

func GetEffectiveChunkingOverlap(siteCfg *SiteConfig, appCfg *AppConfig) int

GetEffectiveChunkingOverlap returns the effective chunk overlap in tokens.

func GetEffectiveEnableJSONLOutput

func GetEffectiveEnableJSONLOutput(siteCfg *SiteConfig, appCfg *AppConfig) bool

GetEffectiveEnableJSONLOutput determines if JSONL output should be generated.

func GetEffectiveEnableMetadataYAML

func GetEffectiveEnableMetadataYAML(siteCfg *SiteConfig, appCfg *AppConfig) bool

GetEffectiveEnableMetadataYAML determines if YAML metadata should be generated.

func GetEffectiveEnableOutputMapping

func GetEffectiveEnableOutputMapping(siteCfg *SiteConfig, appCfg *AppConfig) bool

GetEffectiveEnableOutputMapping determines the effective setting for enabling the mapping file.

func GetEffectiveJSONLOutputFilename

func GetEffectiveJSONLOutputFilename(siteCfg *SiteConfig, appCfg *AppConfig) string

GetEffectiveJSONLOutputFilename determines the filename for the JSONL output.

func GetEffectiveMaxImageSize

func GetEffectiveMaxImageSize(siteCfg *SiteConfig, appCfg *AppConfig) int64

GetEffectiveMaxImageSize determines the effective max image size.

func GetEffectiveMaxPageSize

func GetEffectiveMaxPageSize(appCfg *AppConfig) int64

GetEffectiveMaxPageSize returns the configured max page size, or the default if unset.

func GetEffectiveMetadataYAMLFilename

func GetEffectiveMetadataYAMLFilename(siteCfg *SiteConfig, appCfg *AppConfig) string

GetEffectiveMetadataYAMLFilename determines the filename for the YAML metadata.

func GetEffectiveOutputMappingFilename

func GetEffectiveOutputMappingFilename(siteCfg *SiteConfig, appCfg *AppConfig) string

GetEffectiveOutputMappingFilename determines the effective filename for the mapping file. The site config value (if non-empty) overrides the global value. If both site and global values are empty, a hardcoded default is returned.

func GetEffectiveSkipImages

func GetEffectiveSkipImages(siteCfg *SiteConfig, appCfg *AppConfig) bool

GetEffectiveSkipImages determines the effective skip-images setting.

func GetEffectiveUserAgent

func GetEffectiveUserAgent(siteCfg *SiteConfig, appCfg *AppConfig) string

GetEffectiveUserAgent returns the site-specific user agent if set, otherwise the global default.

Types

type AppConfig

// AppConfig holds the global application configuration.
//
// Each field maps to the YAML key named in its struct tag. Several fields
// (user agent, per-host delay, image handling, output mapping, metadata YAML,
// JSONL output, chunking) have counterparts in SiteConfig; the package's
// GetEffective* helpers resolve which value applies for a given site.
type AppConfig struct {
	DefaultUserAgent        string                 `yaml:"default_user_agent"`
	DefaultDelayPerHost     time.Duration          `yaml:"default_delay_per_host"`
	NumWorkers              int                    `yaml:"num_workers"`
	NumImageWorkers         int                    `yaml:"num_image_workers,omitempty"`
	MaxRequests             int                    `yaml:"max_requests"`
	MaxRequestsPerHost      int                    `yaml:"max_requests_per_host"`
	OutputBaseDir           string                 `yaml:"output_base_dir"`
	StateDir                string                 `yaml:"state_dir"`
	MaxRetries              int                    `yaml:"max_retries,omitempty"`
	InitialRetryDelay       time.Duration          `yaml:"initial_retry_delay,omitempty"`
	MaxRetryDelay           time.Duration          `yaml:"max_retry_delay,omitempty"`
	SemaphoreAcquireTimeout time.Duration          `yaml:"semaphore_acquire_timeout,omitempty"`
	GlobalCrawlTimeout      time.Duration          `yaml:"global_crawl_timeout,omitempty"`
	PerPageTimeout          time.Duration          `yaml:"per_page_timeout,omitempty"` // Timeout for processing a single page (0 = no timeout)
	SkipImages              bool                   `yaml:"skip_images,omitempty"`
	MaxImageSizeBytes       int64                  `yaml:"max_image_size_bytes,omitempty"`
	MaxPageSizeBytes        int64                  `yaml:"max_page_size_bytes,omitempty"` // Max HTML page body size in bytes (0 = 50MB default)
	HTTPClientSettings      HTTPClientConfig       `yaml:"http_client_settings,omitempty"`
	// Sites holds the per-site configurations.
	// NOTE(review): the map key is presumably a site name/identifier — confirm against the loader.
	Sites                   map[string]*SiteConfig `yaml:"sites"`
	EnableOutputMapping     bool                   `yaml:"enable_output_mapping,omitempty"`
	OutputMappingFilename   string                 `yaml:"output_mapping_filename,omitempty"`
	EnableMetadataYAML      bool                   `yaml:"enable_metadata_yaml,omitempty"`
	MetadataYAMLFilename    string                 `yaml:"metadata_yaml_filename,omitempty"`
	EnableJSONLOutput       bool                   `yaml:"enable_jsonl_output,omitempty"`
	JSONLOutputFilename     string                 `yaml:"jsonl_output_filename,omitempty"`
	EnableTokenCounting     bool                   `yaml:"enable_token_counting,omitempty"`
	TokenizerEncoding       string                 `yaml:"tokenizer_encoding,omitempty"` // e.g., "cl100k_base" (GPT-4 default, approximate for Claude)
	DBGCInterval            time.Duration          `yaml:"db_gc_interval,omitempty"`     // Interval for BadgerDB value log GC (default: 10m)
	EnableIncremental       bool                   `yaml:"enable_incremental,omitempty"` // Enable incremental crawling (skip unchanged pages)
	Chunking                ChunkingConfig         `yaml:"chunking,omitempty"`
}

AppConfig holds the global application configuration.

func (*AppConfig) Validate

func (c *AppConfig) Validate() (warnings []string, err error)

Validate checks AppConfig fields and applies sensible defaults. Returns collected warnings and any fatal error. Modifies receiver in place to apply defaults.

type ChunkingConfig

// ChunkingConfig holds the app-level configuration for content chunking.
//
// Sizes are expressed in tokens. The defaults quoted on each field are the
// documented ones; they are not applied by this declaration itself.
type ChunkingConfig struct {
	Enabled        bool   `yaml:"enabled,omitempty"`         // Enable chunking output
	MaxChunkSize   int    `yaml:"max_chunk_size,omitempty"`  // Max chunk size in tokens (default: 512)
	ChunkOverlap   int    `yaml:"chunk_overlap,omitempty"`   // Overlap between chunks in tokens (default: 50)
	OutputFilename string `yaml:"output_filename,omitempty"` // Output filename (default: chunks.jsonl)
}

ChunkingConfig holds configuration for content chunking.

type HTTPClientConfig

// HTTPClientConfig holds settings for the shared HTTP client.
//
// All fields are optional in YAML (omitempty). ForceAttemptHTTP2 is a pointer
// so that "not set" can be told apart from an explicit false.
type HTTPClientConfig struct {
	Timeout               time.Duration `yaml:"timeout,omitempty"`                 // Overall request timeout
	MaxIdleConns          int           `yaml:"max_idle_conns,omitempty"`          // Max total idle connections
	MaxIdleConnsPerHost   int           `yaml:"max_idle_conns_per_host,omitempty"` // Max idle connections per host
	IdleConnTimeout       time.Duration `yaml:"idle_conn_timeout,omitempty"`       // Timeout for idle connections
	TLSHandshakeTimeout   time.Duration `yaml:"tls_handshake_timeout,omitempty"`   // Timeout for TLS handshake
	ExpectContinueTimeout time.Duration `yaml:"expect_continue_timeout,omitempty"` // Timeout for 100-continue
	ForceAttemptHTTP2     *bool         `yaml:"force_attempt_http2,omitempty"`     // Explicitly enable/disable HTTP/2 attempt (use pointer for tri-state: nil=default, true=force, false=disable)
	DialerTimeout         time.Duration `yaml:"dialer_timeout,omitempty"`          // Connection dial timeout
	DialerKeepAlive       time.Duration `yaml:"dialer_keep_alive,omitempty"`       // TCP keep-alive interval
}

HTTPClientConfig holds settings for the shared HTTP client.

type ResolvedSiteConfig added in v1.3.0

// ResolvedSiteConfig is the flattened, effective configuration for one site:
// every value has already been resolved from the site-specific overrides and
// the app-level defaults, so consumers read plain values.
//
// Fields are grouped by type; declaration order is significant and matches
// the original layout.
type ResolvedSiteConfig struct {
	// User agent and output filenames.
	UserAgent, OutputMappingFilename, MetadataYAMLFilename string
	JSONLOutputFilename, ChunkingOutputFilename            string

	// Per-host delay.
	DelayPerHost time.Duration

	// Size limits in bytes.
	MaxPageSizeBytes, MaxImageSizeBytes int64

	// Feature switches.
	SkipImages, EnableOutputMapping, EnableMetadataYAML bool
	EnableJSONLOutput, ChunkingEnabled                  bool

	// Chunking sizes.
	ChunkingMaxSize, ChunkingOverlap int
}

ResolvedSiteConfig holds all effective configuration values for a site, resolved once from site-specific overrides and app-level defaults.

func NewResolvedSiteConfig added in v1.3.0

func NewResolvedSiteConfig(siteCfg *SiteConfig, appCfg *AppConfig) *ResolvedSiteConfig

NewResolvedSiteConfig resolves all effective configuration values for a site.

type SiteChunkingConfig

// SiteChunkingConfig holds site-specific chunking overrides.
//
// Enabled, MaxChunkSize and ChunkOverlap are pointers so that an unset value
// (nil) is distinguishable from an explicit false/0 (tri-state).
// NOTE(review): nil presumably means "inherit the app-level ChunkingConfig
// value", and an empty OutputFilename likewise — confirm against the
// GetEffectiveChunking* helpers.
type SiteChunkingConfig struct {
	Enabled        *bool  `yaml:"enabled,omitempty"`
	MaxChunkSize   *int   `yaml:"max_chunk_size,omitempty"`
	ChunkOverlap   *int   `yaml:"chunk_overlap,omitempty"`
	OutputFilename string `yaml:"output_filename,omitempty"`
}

SiteChunkingConfig holds site-specific chunking overrides (uses pointers for tri-state).

type SiteConfig

// SiteConfig holds configuration specific to a single website crawl.
//
// Each field maps to the YAML key named in its struct tag. Pointer-typed
// fields (*bool, *int64) distinguish "not set" (nil) from an explicit value.
// NOTE(review): unset pointers and empty strings presumably fall back to the
// matching AppConfig value via the GetEffective* helpers — confirm there.
type SiteConfig struct {
	StartURLs               []string           `yaml:"start_urls"`
	AllowedDomain           string             `yaml:"allowed_domain"`
	AllowedPathPrefix       string             `yaml:"allowed_path_prefix"`
	ContentSelector         string             `yaml:"content_selector"`
	LinkExtractionSelectors []string           `yaml:"link_extraction_selectors,omitempty"`
	DisallowedPathPatterns  []string           `yaml:"disallowed_path_patterns,omitempty"` // Regex patterns for paths to exclude
	RespectNofollow         bool               `yaml:"respect_nofollow,omitempty"`
	UserAgent               string             `yaml:"user_agent,omitempty"`
	DelayPerHost            time.Duration      `yaml:"delay_per_host,omitempty"`
	MaxDepth                int                `yaml:"max_depth"`
	SkipImages              *bool              `yaml:"skip_images,omitempty"`
	MaxImageSizeBytes       *int64             `yaml:"max_image_size_bytes,omitempty"`
	AllowedImageDomains     []string           `yaml:"allowed_image_domains,omitempty"`
	DisallowedImageDomains  []string           `yaml:"disallowed_image_domains,omitempty"`
	EnableOutputMapping     *bool              `yaml:"enable_output_mapping,omitempty"`
	OutputMappingFilename   string             `yaml:"output_mapping_filename,omitempty"`
	EnableMetadataYAML      *bool              `yaml:"enable_metadata_yaml,omitempty"`
	MetadataYAMLFilename    string             `yaml:"metadata_yaml_filename,omitempty"`
	EnableJSONLOutput       *bool              `yaml:"enable_jsonl_output,omitempty"`
	JSONLOutputFilename     string             `yaml:"jsonl_output_filename,omitempty"`
	Chunking                SiteChunkingConfig `yaml:"chunking,omitempty"`
}

SiteConfig holds configuration specific to a single website crawl.

func (*SiteConfig) Validate

func (c *SiteConfig) Validate() (warnings []string, err error)

Validate checks SiteConfig fields and applies defaults. Returns collected warnings and any fatal error. Modifies receiver in place (e.g., path prefix normalization).

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL