Documentation
¶
Index ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ChunkJSONL ¶
// ChunkJSONL is one chunk record for JSONL output (RAG vector ingestion).
// Each field is written under the snake_case key given in its json tag.
type ChunkJSONL struct {
	// URL is the source page URL.
	URL string `json:"url"`
	// ChunkIndex is the index of this chunk within the page.
	ChunkIndex int `json:"chunk_index"`
	// Content is the chunk content; it includes heading context.
	Content string `json:"content"`
	// HeadingHierarchy is the extracted heading hierarchy.
	// There is no omitempty, so a nil slice is encoded as JSON null.
	HeadingHierarchy []string `json:"heading_hierarchy"`
	// TokenCount is the token count for this chunk.
	TokenCount int `json:"token_count"`
	// PageTitle is the title of the source page.
	PageTitle string `json:"page_title"`
	// CrawledAt is the crawl timestamp, carried as a string.
	CrawledAt string `json:"crawled_at"`
}
ChunkJSONL represents a single chunk for JSONL output (RAG vector ingestion).
type CrawlMetadata ¶
// CrawlMetadata collects all metadata for a single crawl session of a site.
// Fields carry yaml tags with snake_case keys.
type CrawlMetadata struct {
	// SiteKey is written as site_key.
	// NOTE(review): presumably the key of the site's configuration entry — confirm.
	SiteKey string `yaml:"site_key"`
	// AllowedDomain is written as allowed_domain.
	AllowedDomain string `yaml:"allowed_domain"`
	// CrawlStartTime is the start time of the crawl session.
	CrawlStartTime time.Time `yaml:"crawl_start_time"`
	// CrawlEndTime is the end time of the crawl session.
	CrawlEndTime time.Time `yaml:"crawl_end_time"`
	// TotalPagesSaved is the total number of pages saved.
	TotalPagesSaved int `yaml:"total_pages_saved"`
	// SiteConfiguration is a flexible dump of SiteConfig; tagged omitempty.
	SiteConfiguration map[string]interface{} `yaml:"site_configuration,omitempty"`
	// Pages holds the per-page metadata entries.
	Pages []PageMetadata `yaml:"pages"`
}
CrawlMetadata holds all metadata for a single crawl session of a site.
type ImageDBEntry ¶
// ImageDBEntry is the stored result of processing an image URL in the
// database. The optional string fields are tagged omitempty and are left out
// of the JSON when empty.
type ImageDBEntry struct {
	// Status is the processing status (success, failure, skipped).
	Status ImageStatus `json:"status"`
	// LocalPath is the path relative to the site output dir (on success).
	LocalPath string `json:"local_path,omitempty"`
	// Caption is the captured caption/alt text (on success).
	Caption string `json:"caption,omitempty"`
	// ErrorType is the error category (on failure).
	ErrorType string `json:"error_type,omitempty"`
	// LastAttempt is the timestamp of the last processing attempt.
	LastAttempt time.Time `json:"last_attempt"`
}
ImageDBEntry stores the result of processing an image URL in the database.
type ImageData ¶
// ImageData describes a successfully downloaded image.
type ImageData struct {
	// OriginalURL is the URL stored for the image.
	// NOTE(review): presumably the URL the image was fetched from — confirm.
	OriginalURL string
	// LocalPath is the path relative to the site output dir.
	LocalPath string
	// Caption is the image caption/alt text.
	Caption string
}
ImageData stores information about a successfully downloaded image.
type ImageStatus ¶
// ImageStatus is a string-backed processing status of an image in the database.
type ImageStatus string
ImageStatus represents the processing status of an image in the database.
const ( ImageStatusUnset ImageStatus = "" // Zero value = unset/unknown ImageStatusPending ImageStatus = "pending" // Image queued for download ImageStatusSuccess ImageStatus = "success" // Image downloaded successfully ImageStatusFailure ImageStatus = "failure" // Image download failed ImageStatusSkipped ImageStatus = "skipped" // Image skipped (size/domain filter) ImageStatusNotFound ImageStatus = "not_found" // Image not in database ImageStatusDBError ImageStatus = "db_error" // Database error occurred )
func (ImageStatus) IsValid ¶
func (s ImageStatus) IsValid() bool
IsValid reports whether the status is a known operational value.
func (ImageStatus) String ¶
func (s ImageStatus) String() string
String implements fmt.Stringer for logging.
type PageDBEntry ¶
// PageDBEntry is the stored result of processing a page URL in the database.
type PageDBEntry struct {
	// Status is the processing status (success, failure, pending).
	Status PageStatus `json:"status"`
	// ErrorType is the error category (on failure); omitted when empty.
	ErrorType string `json:"error_type,omitempty"`
	// ProcessedAt is the timestamp of successful processing.
	//
	// encoding/json never considers a struct value such as time.Time "empty",
	// so the previous omitempty option had no effect and a zero time was
	// always written. omitzero (Go 1.24+) omits the zero time; older
	// toolchains ignore the unknown option and keep the previous output.
	ProcessedAt time.Time `json:"processed_at,omitzero"`
	// LastAttempt is the timestamp of the last processing attempt.
	LastAttempt time.Time `json:"last_attempt"`
	// Depth is the depth at which this page was processed/attempted.
	Depth int `json:"depth"`
	// ContentHash is the content hash used for incremental crawling; omitted
	// when empty.
	ContentHash string `json:"content_hash,omitempty"`
}
PageDBEntry stores the result of processing a page URL in the database.
type PageJSONL ¶
// PageJSONL is one page record for JSONL output (RAG pipeline ingestion).
// Each field is written under the snake_case key given in its json tag.
type PageJSONL struct {
	// URL is written as url.
	URL string `json:"url"`
	// Title is written as title.
	Title string `json:"title"`
	// Content is written as content.
	Content string `json:"content"`

	// Headings, Links and Images carry no omitempty, so a nil slice is
	// encoded as JSON null rather than [].
	Headings []string `json:"headings"`
	Links    []string `json:"links"`
	Images   []string `json:"images"`

	// ContentHash is written as content_hash.
	ContentHash string `json:"content_hash"`
	// CrawledAt is the crawl timestamp, carried as a string.
	CrawledAt string `json:"crawled_at"`
	// Depth is written as depth.
	Depth int `json:"depth"`
	// TokenCount is omitted from the output when it is zero.
	TokenCount int `json:"token_count,omitempty"`
}
PageJSONL represents a single page for JSONL output (RAG pipeline ingestion).
type PageMetadata ¶
// PageMetadata is the metadata recorded for a single scraped page.
// Fields carry yaml tags with snake_case keys.
type PageMetadata struct {
	// OriginalURL is written as original_url.
	OriginalURL string `yaml:"original_url"`
	// NormalizedURL is written as normalized_url.
	NormalizedURL string `yaml:"normalized_url"`
	// LocalFilePath is relative to site_output_dir.
	LocalFilePath string `yaml:"local_file_path"`
	// Title is tagged omitempty.
	Title string `yaml:"title,omitempty"`
	// Depth is written as depth.
	Depth int `yaml:"depth"`
	// ProcessedAt is written as processed_at.
	ProcessedAt time.Time `yaml:"processed_at"`
	// ContentHash is a SHA-256 hex string; tagged omitempty.
	ContentHash string `yaml:"content_hash,omitempty"`
	// ImageCount is the count of images processed for this page; tagged
	// omitempty.
	ImageCount int `yaml:"image_count,omitempty"`
	// TokenCount is the token count for LLM context planning; tagged
	// omitempty.
	TokenCount int `yaml:"token_count,omitempty"`
}
PageMetadata holds metadata for a single scraped page.
type PageStatus ¶
// PageStatus is a string-backed processing status of a page in the database.
type PageStatus string
PageStatus represents the processing status of a page in the database.
const ( PageStatusUnset PageStatus = "" // Zero value = unset/unknown PageStatusPending PageStatus = "pending" // Page queued but not processed PageStatusSuccess PageStatus = "success" // Page processed successfully PageStatusFailure PageStatus = "failure" // Page processing failed PageStatusNotFound PageStatus = "not_found" // Page not in database PageStatusDBError PageStatus = "db_error" // Database error occurred )
func (PageStatus) IsValid ¶
func (s PageStatus) IsValid() bool
IsValid reports whether the status is a known operational value.
func (PageStatus) String ¶
func (s PageStatus) String() string
String implements fmt.Stringer for logging.