Documentation
¶
Index ¶
- Constants
- func GetEffectiveChunkingEnabled(siteCfg *SiteConfig, appCfg *AppConfig) bool
- func GetEffectiveChunkingMaxSize(siteCfg *SiteConfig, appCfg *AppConfig) int
- func GetEffectiveChunkingOutputFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
- func GetEffectiveChunkingOverlap(siteCfg *SiteConfig, appCfg *AppConfig) int
- func GetEffectiveEnableJSONLOutput(siteCfg *SiteConfig, appCfg *AppConfig) bool
- func GetEffectiveEnableMetadataYAML(siteCfg *SiteConfig, appCfg *AppConfig) bool
- func GetEffectiveEnableOutputMapping(siteCfg *SiteConfig, appCfg *AppConfig) bool
- func GetEffectiveJSONLOutputFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
- func GetEffectiveMaxImageSize(siteCfg *SiteConfig, appCfg *AppConfig) int64
- func GetEffectiveMaxPageSize(appCfg *AppConfig) int64
- func GetEffectiveMetadataYAMLFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
- func GetEffectiveOutputMappingFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
- func GetEffectiveSkipImages(siteCfg *SiteConfig, appCfg *AppConfig) bool
- func GetEffectiveUserAgent(siteCfg *SiteConfig, appCfg *AppConfig) string
- type AppConfig
- type ChunkingConfig
- type HTTPClientConfig
- type ResolvedSiteConfig
  - func NewResolvedSiteConfig(siteCfg *SiteConfig, appCfg *AppConfig) *ResolvedSiteConfig
- type SiteChunkingConfig
- type SiteConfig
  - func (c *SiteConfig) Validate() (warnings []string, err error)
Constants ¶
const (
// DefaultMaxPageSizeBytes is the default maximum page body size (50 MB).
DefaultMaxPageSizeBytes int64 = 50 * 1024 * 1024
)
Variables ¶
This section is empty.
Functions ¶
func GetEffectiveChunkingEnabled ¶
func GetEffectiveChunkingEnabled(siteCfg *SiteConfig, appCfg *AppConfig) bool
GetEffectiveChunkingEnabled determines if chunking should be enabled.
func GetEffectiveChunkingMaxSize ¶
func GetEffectiveChunkingMaxSize(siteCfg *SiteConfig, appCfg *AppConfig) int
GetEffectiveChunkingMaxSize returns the effective max chunk size in tokens.
func GetEffectiveChunkingOutputFilename ¶
func GetEffectiveChunkingOutputFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
GetEffectiveChunkingOutputFilename returns the effective chunks output filename.
func GetEffectiveChunkingOverlap ¶
func GetEffectiveChunkingOverlap(siteCfg *SiteConfig, appCfg *AppConfig) int
GetEffectiveChunkingOverlap returns the effective chunk overlap in tokens.
func GetEffectiveEnableJSONLOutput ¶
func GetEffectiveEnableJSONLOutput(siteCfg *SiteConfig, appCfg *AppConfig) bool
GetEffectiveEnableJSONLOutput determines if JSONL output should be generated.
func GetEffectiveEnableMetadataYAML ¶
func GetEffectiveEnableMetadataYAML(siteCfg *SiteConfig, appCfg *AppConfig) bool
GetEffectiveEnableMetadataYAML determines if YAML metadata should be generated.
func GetEffectiveEnableOutputMapping ¶
func GetEffectiveEnableOutputMapping(siteCfg *SiteConfig, appCfg *AppConfig) bool
GetEffectiveEnableOutputMapping determines the effective setting for enabling the mapping file.
func GetEffectiveJSONLOutputFilename ¶
func GetEffectiveJSONLOutputFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
GetEffectiveJSONLOutputFilename determines the filename for the JSONL output.
func GetEffectiveMaxImageSize ¶
func GetEffectiveMaxImageSize(siteCfg *SiteConfig, appCfg *AppConfig) int64
GetEffectiveMaxImageSize determines the effective max image size.
func GetEffectiveMaxPageSize ¶
func GetEffectiveMaxPageSize(appCfg *AppConfig) int64
GetEffectiveMaxPageSize returns the configured max page size, or the default if unset.
func GetEffectiveMetadataYAMLFilename ¶
func GetEffectiveMetadataYAMLFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
GetEffectiveMetadataYAMLFilename determines the filename for the YAML metadata.
func GetEffectiveOutputMappingFilename ¶
func GetEffectiveOutputMappingFilename(siteCfg *SiteConfig, appCfg *AppConfig) string
GetEffectiveOutputMappingFilename determines the effective filename for the mapping file. Site config (if non-empty) overrides global. If both site and global are empty, a hardcoded default is returned.
func GetEffectiveSkipImages ¶
func GetEffectiveSkipImages(siteCfg *SiteConfig, appCfg *AppConfig) bool
GetEffectiveSkipImages determines the effective skip_images setting.
func GetEffectiveUserAgent ¶
func GetEffectiveUserAgent(siteCfg *SiteConfig, appCfg *AppConfig) string
GetEffectiveUserAgent returns the site-specific user agent if set, otherwise the global default.
Types ¶
type AppConfig ¶
type AppConfig struct {
DefaultUserAgent string `yaml:"default_user_agent"`
DefaultDelayPerHost time.Duration `yaml:"default_delay_per_host"`
NumWorkers int `yaml:"num_workers"`
NumImageWorkers int `yaml:"num_image_workers,omitempty"`
MaxRequests int `yaml:"max_requests"`
MaxRequestsPerHost int `yaml:"max_requests_per_host"`
OutputBaseDir string `yaml:"output_base_dir"`
StateDir string `yaml:"state_dir"`
MaxRetries int `yaml:"max_retries,omitempty"`
InitialRetryDelay time.Duration `yaml:"initial_retry_delay,omitempty"`
MaxRetryDelay time.Duration `yaml:"max_retry_delay,omitempty"`
SemaphoreAcquireTimeout time.Duration `yaml:"semaphore_acquire_timeout,omitempty"`
GlobalCrawlTimeout time.Duration `yaml:"global_crawl_timeout,omitempty"`
PerPageTimeout time.Duration `yaml:"per_page_timeout,omitempty"` // Timeout for processing a single page (0 = no timeout)
SkipImages bool `yaml:"skip_images,omitempty"`
MaxImageSizeBytes int64 `yaml:"max_image_size_bytes,omitempty"`
MaxPageSizeBytes int64 `yaml:"max_page_size_bytes,omitempty"` // Max HTML page body size in bytes (0 = 50MB default)
HTTPClientSettings HTTPClientConfig `yaml:"http_client_settings,omitempty"`
Sites map[string]*SiteConfig `yaml:"sites"`
EnableOutputMapping bool `yaml:"enable_output_mapping,omitempty"`
OutputMappingFilename string `yaml:"output_mapping_filename,omitempty"`
EnableMetadataYAML bool `yaml:"enable_metadata_yaml,omitempty"`
MetadataYAMLFilename string `yaml:"metadata_yaml_filename,omitempty"`
EnableJSONLOutput bool `yaml:"enable_jsonl_output,omitempty"`
JSONLOutputFilename string `yaml:"jsonl_output_filename,omitempty"`
EnableTokenCounting bool `yaml:"enable_token_counting,omitempty"`
TokenizerEncoding string `yaml:"tokenizer_encoding,omitempty"` // e.g., "cl100k_base" (GPT-4 default, approximate for Claude)
DBGCInterval time.Duration `yaml:"db_gc_interval,omitempty"` // Interval for BadgerDB value log GC (default: 10m)
EnableIncremental bool `yaml:"enable_incremental,omitempty"` // Enable incremental crawling (skip unchanged pages)
Chunking ChunkingConfig `yaml:"chunking,omitempty"`
}
AppConfig holds the global application configuration.
type ChunkingConfig ¶
type ChunkingConfig struct {
Enabled bool `yaml:"enabled,omitempty"` // Enable chunking output
MaxChunkSize int `yaml:"max_chunk_size,omitempty"` // Max chunk size in tokens (default: 512)
ChunkOverlap int `yaml:"chunk_overlap,omitempty"` // Overlap between chunks in tokens (default: 50)
OutputFilename string `yaml:"output_filename,omitempty"` // Output filename (default: chunks.jsonl)
}
ChunkingConfig holds configuration for content chunking.
type HTTPClientConfig ¶
type HTTPClientConfig struct {
Timeout time.Duration `yaml:"timeout,omitempty"` // Overall request timeout
MaxIdleConns int `yaml:"max_idle_conns,omitempty"` // Max total idle connections
MaxIdleConnsPerHost int `yaml:"max_idle_conns_per_host,omitempty"` // Max idle connections per host
IdleConnTimeout time.Duration `yaml:"idle_conn_timeout,omitempty"` // Timeout for idle connections
TLSHandshakeTimeout time.Duration `yaml:"tls_handshake_timeout,omitempty"` // Timeout for TLS handshake
ExpectContinueTimeout time.Duration `yaml:"expect_continue_timeout,omitempty"` // Timeout for 100-continue
ForceAttemptHTTP2 *bool `yaml:"force_attempt_http2,omitempty"` // Explicitly enable/disable HTTP/2 attempt (use pointer for tri-state: nil=default, true=force, false=disable)
DialerTimeout time.Duration `yaml:"dialer_timeout,omitempty"` // Connection dial timeout
DialerKeepAlive time.Duration `yaml:"dialer_keep_alive,omitempty"` // TCP keep-alive interval
}
HTTPClientConfig holds settings for the shared HTTP client.
type ResolvedSiteConfig ¶ added in v1.3.0
type ResolvedSiteConfig struct {
UserAgent string
OutputMappingFilename string
MetadataYAMLFilename string
JSONLOutputFilename string
ChunkingOutputFilename string
DelayPerHost time.Duration
MaxPageSizeBytes int64
MaxImageSizeBytes int64
SkipImages bool
EnableOutputMapping bool
EnableMetadataYAML bool
EnableJSONLOutput bool
ChunkingEnabled bool
ChunkingMaxSize int
ChunkingOverlap int
}
ResolvedSiteConfig holds all effective configuration values for a site, resolved once from site-specific overrides and app-level defaults.
func NewResolvedSiteConfig ¶ added in v1.3.0
func NewResolvedSiteConfig(siteCfg *SiteConfig, appCfg *AppConfig) *ResolvedSiteConfig
NewResolvedSiteConfig resolves all effective configuration values for a site.
type SiteChunkingConfig ¶
type SiteChunkingConfig struct {
Enabled *bool `yaml:"enabled,omitempty"`
MaxChunkSize *int `yaml:"max_chunk_size,omitempty"`
ChunkOverlap *int `yaml:"chunk_overlap,omitempty"`
OutputFilename string `yaml:"output_filename,omitempty"`
}
SiteChunkingConfig holds site-specific chunking overrides (uses pointers for tri-state).
type SiteConfig ¶
type SiteConfig struct {
StartURLs []string `yaml:"start_urls"`
AllowedDomain string `yaml:"allowed_domain"`
AllowedPathPrefix string `yaml:"allowed_path_prefix"`
ContentSelector string `yaml:"content_selector"`
LinkExtractionSelectors []string `yaml:"link_extraction_selectors,omitempty"`
DisallowedPathPatterns []string `yaml:"disallowed_path_patterns,omitempty"` // Regex patterns for paths to exclude
RespectNofollow bool `yaml:"respect_nofollow,omitempty"`
UserAgent string `yaml:"user_agent,omitempty"`
DelayPerHost time.Duration `yaml:"delay_per_host,omitempty"`
MaxDepth int `yaml:"max_depth"`
SkipImages *bool `yaml:"skip_images,omitempty"`
MaxImageSizeBytes *int64 `yaml:"max_image_size_bytes,omitempty"`
AllowedImageDomains []string `yaml:"allowed_image_domains,omitempty"`
DisallowedImageDomains []string `yaml:"disallowed_image_domains,omitempty"`
EnableOutputMapping *bool `yaml:"enable_output_mapping,omitempty"`
OutputMappingFilename string `yaml:"output_mapping_filename,omitempty"`
EnableMetadataYAML *bool `yaml:"enable_metadata_yaml,omitempty"`
MetadataYAMLFilename string `yaml:"metadata_yaml_filename,omitempty"`
EnableJSONLOutput *bool `yaml:"enable_jsonl_output,omitempty"`
JSONLOutputFilename string `yaml:"jsonl_output_filename,omitempty"`
Chunking SiteChunkingConfig `yaml:"chunking,omitempty"`
}
SiteConfig holds configuration specific to a single website crawl.
func (*SiteConfig) Validate ¶
func (c *SiteConfig) Validate() (warnings []string, err error)
Validate checks SiteConfig fields and applies defaults. It returns the collected warnings and any fatal error. It modifies the receiver in place (e.g., path prefix normalization).