config

package
v1.1.1 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 7, 2026 License: Apache-2.0 Imports: 3 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func GetEffectiveChunkingEnabled

func GetEffectiveChunkingEnabled(siteCfg SiteConfig, appCfg AppConfig) bool

GetEffectiveChunkingEnabled determines if chunking should be enabled.

func GetEffectiveChunkingMaxSize

func GetEffectiveChunkingMaxSize(siteCfg SiteConfig, appCfg AppConfig) int

GetEffectiveChunkingMaxSize returns the effective max chunk size in tokens.

func GetEffectiveChunkingOutputFilename

func GetEffectiveChunkingOutputFilename(siteCfg SiteConfig, appCfg AppConfig) string

GetEffectiveChunkingOutputFilename returns the effective chunks output filename.

func GetEffectiveChunkingOverlap

func GetEffectiveChunkingOverlap(siteCfg SiteConfig, appCfg AppConfig) int

GetEffectiveChunkingOverlap returns the effective chunk overlap in tokens.

func GetEffectiveEnableJSONLOutput

func GetEffectiveEnableJSONLOutput(siteCfg SiteConfig, appCfg AppConfig) bool

GetEffectiveEnableJSONLOutput determines if JSONL output should be generated.

func GetEffectiveEnableMetadataYAML

func GetEffectiveEnableMetadataYAML(siteCfg SiteConfig, appCfg AppConfig) bool

GetEffectiveEnableMetadataYAML determines if YAML metadata should be generated.

func GetEffectiveEnableOutputMapping

func GetEffectiveEnableOutputMapping(siteCfg SiteConfig, appCfg AppConfig) bool

GetEffectiveEnableOutputMapping determines whether the output mapping file should be generated.

func GetEffectiveJSONLOutputFilename

func GetEffectiveJSONLOutputFilename(siteCfg SiteConfig, appCfg AppConfig) string

GetEffectiveJSONLOutputFilename determines the filename for the JSONL output.

func GetEffectiveMaxImageSize

func GetEffectiveMaxImageSize(siteCfg SiteConfig, appCfg AppConfig) int64

GetEffectiveMaxImageSize determines the effective maximum image size in bytes.

func GetEffectiveMetadataYAMLFilename

func GetEffectiveMetadataYAMLFilename(siteCfg SiteConfig, appCfg AppConfig) string

GetEffectiveMetadataYAMLFilename determines the filename for the YAML metadata.

func GetEffectiveOutputMappingFilename

func GetEffectiveOutputMappingFilename(siteCfg SiteConfig, appCfg AppConfig) string

GetEffectiveOutputMappingFilename determines the effective filename for the mapping file. The site config value (if non-empty) overrides the global value. If both the site and global values are empty, a hardcoded default is returned.

func GetEffectiveSkipImages

func GetEffectiveSkipImages(siteCfg SiteConfig, appCfg AppConfig) bool

GetEffectiveSkipImages determines the effective skip-images setting.

Types

type AppConfig

// AppConfig holds the global application configuration. Each field maps to a
// YAML key through its struct tag.
//
// Several fields (SkipImages, MaxImageSizeBytes, EnableOutputMapping,
// OutputMappingFilename, EnableMetadataYAML, MetadataYAMLFilename,
// EnableJSONLOutput, JSONLOutputFilename, Chunking) have same-named
// counterparts on SiteConfig; the package's GetEffective* functions take both
// a SiteConfig and an AppConfig and return the value to use.
//
// Validate may modify the struct in place to apply defaults.
//
// NOTE(review): Sites is keyed by a string, presumably a site name or
// identifier chosen in the config file; confirm against the loader.
type AppConfig struct {
	DefaultUserAgent        string                `yaml:"default_user_agent"`
	DefaultDelayPerHost     time.Duration         `yaml:"default_delay_per_host"`
	NumWorkers              int                   `yaml:"num_workers"`
	NumImageWorkers         int                   `yaml:"num_image_workers,omitempty"`
	MaxRequests             int                   `yaml:"max_requests"`
	MaxRequestsPerHost      int                   `yaml:"max_requests_per_host"`
	OutputBaseDir           string                `yaml:"output_base_dir"`
	StateDir                string                `yaml:"state_dir"`
	MaxRetries              int                   `yaml:"max_retries,omitempty"`
	InitialRetryDelay       time.Duration         `yaml:"initial_retry_delay,omitempty"`
	MaxRetryDelay           time.Duration         `yaml:"max_retry_delay,omitempty"`
	SemaphoreAcquireTimeout time.Duration         `yaml:"semaphore_acquire_timeout,omitempty"`
	GlobalCrawlTimeout      time.Duration         `yaml:"global_crawl_timeout,omitempty"`
	PerPageTimeout          time.Duration         `yaml:"per_page_timeout,omitempty"` // Timeout for processing a single page (0 = no timeout)
	SkipImages              bool                  `yaml:"skip_images,omitempty"`
	MaxImageSizeBytes       int64                 `yaml:"max_image_size_bytes,omitempty"`
	HTTPClientSettings      HTTPClientConfig      `yaml:"http_client_settings,omitempty"`
	Sites                   map[string]SiteConfig `yaml:"sites"`
	EnableOutputMapping     bool                  `yaml:"enable_output_mapping,omitempty"`
	OutputMappingFilename   string                `yaml:"output_mapping_filename,omitempty"`
	EnableMetadataYAML      bool                  `yaml:"enable_metadata_yaml,omitempty"`
	MetadataYAMLFilename    string                `yaml:"metadata_yaml_filename,omitempty"`
	EnableJSONLOutput       bool                  `yaml:"enable_jsonl_output,omitempty"`
	JSONLOutputFilename     string                `yaml:"jsonl_output_filename,omitempty"`
	EnableTokenCounting     bool                  `yaml:"enable_token_counting,omitempty"`
	TokenizerEncoding       string                `yaml:"tokenizer_encoding,omitempty"` // e.g., "cl100k_base" (GPT-4, Claude default)
	EnableIncremental       bool                  `yaml:"enable_incremental,omitempty"` // Enable incremental crawling (skip unchanged pages)
	Chunking                ChunkingConfig        `yaml:"chunking,omitempty"`
}

AppConfig holds the global application configuration.

func (*AppConfig) Validate

func (c *AppConfig) Validate() (warnings []string, err error)

Validate checks AppConfig fields and applies sensible defaults. It returns the collected warnings and any fatal error, and modifies the receiver in place to apply defaults.

type ChunkingConfig

// ChunkingConfig holds the global configuration for content chunking.
// MaxChunkSize and ChunkOverlap are measured in tokens.
//
// NOTE(review): the defaults quoted in the field comments are not set by this
// declaration; presumably they are applied elsewhere (validation or the
// GetEffectiveChunking* functions) when a field is left at its zero value.
// Confirm before relying on them.
type ChunkingConfig struct {
	Enabled        bool   `yaml:"enabled,omitempty"`         // Enable chunking output
	MaxChunkSize   int    `yaml:"max_chunk_size,omitempty"`  // Max chunk size in tokens (default: 512)
	ChunkOverlap   int    `yaml:"chunk_overlap,omitempty"`   // Overlap between chunks in tokens (default: 50)
	OutputFilename string `yaml:"output_filename,omitempty"` // Output filename (default: chunks.jsonl)
}

ChunkingConfig holds configuration for content chunking.

type HTTPClientConfig

// HTTPClientConfig holds settings for the shared HTTP client. Each field maps
// to a YAML key through its struct tag, and all fields are optional
// (omitempty).
//
// ForceAttemptHTTP2 is a pointer so that "not set" (nil) can be told apart
// from an explicit true or false.
//
// NOTE(review): the field names mirror settings on net/http.Client,
// net/http.Transport and net.Dialer, so presumably they are copied onto those
// types when the client is built. Confirm at the client construction site,
// including what a zero duration or count means there.
type HTTPClientConfig struct {
	Timeout               time.Duration `yaml:"timeout,omitempty"`                 // Overall request timeout
	MaxIdleConns          int           `yaml:"max_idle_conns,omitempty"`          // Max total idle connections
	MaxIdleConnsPerHost   int           `yaml:"max_idle_conns_per_host,omitempty"` // Max idle connections per host
	IdleConnTimeout       time.Duration `yaml:"idle_conn_timeout,omitempty"`       // Timeout for idle connections
	TLSHandshakeTimeout   time.Duration `yaml:"tls_handshake_timeout,omitempty"`   // Timeout for TLS handshake
	ExpectContinueTimeout time.Duration `yaml:"expect_continue_timeout,omitempty"` // Timeout for 100-continue
	ForceAttemptHTTP2     *bool         `yaml:"force_attempt_http2,omitempty"`     // Explicitly enable/disable HTTP/2 attempt (use pointer for tri-state: nil=default, true=force, false=disable)
	DialerTimeout         time.Duration `yaml:"dialer_timeout,omitempty"`          // Connection dial timeout
	DialerKeepAlive       time.Duration `yaml:"dialer_keep_alive,omitempty"`       // TCP keep-alive interval
}

HTTPClientConfig holds settings for the shared HTTP client.

type SiteChunkingConfig

// SiteChunkingConfig holds site-specific chunking overrides. Its fields
// parallel those of ChunkingConfig.
//
// Enabled, MaxChunkSize and ChunkOverlap are pointers so that "not set" (nil)
// can be told apart from an explicit false or 0.
//
// NOTE(review): OutputFilename is a plain string; presumably an empty value
// means "no site override", as with the other filename settings. Confirm
// against GetEffectiveChunkingOutputFilename.
type SiteChunkingConfig struct {
	Enabled        *bool  `yaml:"enabled,omitempty"`
	MaxChunkSize   *int   `yaml:"max_chunk_size,omitempty"`
	ChunkOverlap   *int   `yaml:"chunk_overlap,omitempty"`
	OutputFilename string `yaml:"output_filename,omitempty"`
}

SiteChunkingConfig holds site-specific chunking overrides (uses pointers for tri-state).

type SiteConfig

// SiteConfig holds configuration specific to a single website crawl. Each
// field maps to a YAML key through its struct tag.
//
// SkipImages, MaxImageSizeBytes, EnableOutputMapping, EnableMetadataYAML and
// EnableJSONLOutput are pointers, while their AppConfig counterparts are plain
// values. The package's GetEffective* functions take both configs and return
// the value to use.
//
// Validate may modify the struct in place (e.g., path prefix normalization).
//
// NOTE(review): presumably a nil pointer or empty filename means "fall back
// to the AppConfig value"; confirm against the GetEffective* implementations.
type SiteConfig struct {
	StartURLs               []string           `yaml:"start_urls"`
	AllowedDomain           string             `yaml:"allowed_domain"`
	AllowedPathPrefix       string             `yaml:"allowed_path_prefix"`
	ContentSelector         string             `yaml:"content_selector"`
	LinkExtractionSelectors []string           `yaml:"link_extraction_selectors,omitempty"`
	DisallowedPathPatterns  []string           `yaml:"disallowed_path_patterns,omitempty"` // Regex patterns for paths to exclude
	RespectNofollow         bool               `yaml:"respect_nofollow,omitempty"`
	UserAgent               string             `yaml:"user_agent,omitempty"`
	DelayPerHost            time.Duration      `yaml:"delay_per_host,omitempty"`
	MaxDepth                int                `yaml:"max_depth"`
	SkipImages              *bool              `yaml:"skip_images,omitempty"`
	MaxImageSizeBytes       *int64             `yaml:"max_image_size_bytes,omitempty"`
	AllowedImageDomains     []string           `yaml:"allowed_image_domains,omitempty"`
	DisallowedImageDomains  []string           `yaml:"disallowed_image_domains,omitempty"`
	EnableOutputMapping     *bool              `yaml:"enable_output_mapping,omitempty"`
	OutputMappingFilename   string             `yaml:"output_mapping_filename,omitempty"`
	EnableMetadataYAML      *bool              `yaml:"enable_metadata_yaml,omitempty"`
	MetadataYAMLFilename    string             `yaml:"metadata_yaml_filename,omitempty"`
	EnableJSONLOutput       *bool              `yaml:"enable_jsonl_output,omitempty"`
	JSONLOutputFilename     string             `yaml:"jsonl_output_filename,omitempty"`
	Chunking                SiteChunkingConfig `yaml:"chunking,omitempty"`
}

SiteConfig holds configuration specific to a single website crawl.

func (*SiteConfig) Validate

func (c *SiteConfig) Validate() (warnings []string, err error)

Validate checks SiteConfig fields and applies defaults. It returns the collected warnings and any fatal error, and modifies the receiver in place (e.g., path prefix normalization).

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL