Documentation
¶
Index ¶
Constants ¶
const (
ImageDir = "images" // Subdirectory name within siteOutputDir for images
)
Variables ¶
This section is empty.
Functions ¶
func CountTokens ¶
CountTokens returns the token count for the given text. Returns -1 if the tokenizer is not initialized or encoding fails, so callers can distinguish "not available" from a real zero count.
func ExtractHeadings ¶
ExtractHeadings parses markdown content and extracts all heading texts. Returns a slice of heading strings in document order.
func InitTokenizer ¶
InitTokenizer initializes the tokenizer with the specified encoding. Common encodings: "cl100k_base" (GPT-4), "o200k_base" (GPT-4o), "p50k_base" (GPT-3). Note: Claude uses its own tokenizer (not publicly available); cl100k_base is a reasonable approximation. If encoding is empty, defaults to "cl100k_base".
func IsInitialized ¶
func IsInitialized() bool
IsInitialized returns whether the tokenizer has been initialized.
Types ¶
type Chunk ¶
type Chunk struct {
Content string // The chunk content (includes heading context when HeadingHierarchy is enabled)
HeadingHierarchy []string // Extracted heading hierarchy from the chunk
TokenCount int // Token count for this chunk
}
Chunk represents a single chunk of content with its metadata.
func ChunkMarkdown ¶
func ChunkMarkdown(markdown string, cfg ChunkerConfig) ([]Chunk, error)
ChunkMarkdown splits markdown content into chunks using a hybrid strategy: 1. Primary: Split by markdown headers, preserving heading hierarchy 2. Fallback: If any chunk exceeds maxChunkSize, apply recursive character splitting
Each chunk includes parent heading context prepended for RAG retrieval.
type ChunkerConfig ¶
type ChunkerConfig struct {
MaxChunkSize int // Maximum chunk size in tokens (triggers recursive split if exceeded)
ChunkOverlap int // Overlap between chunks in tokens (for recursive fallback)
}
ChunkerConfig holds configuration for the chunker.
func DefaultChunkerConfig ¶
func DefaultChunkerConfig() ChunkerConfig
DefaultChunkerConfig returns sensible defaults for RAG chunking.
type ContentProcessor ¶
type ContentProcessor struct {
// contains filtered or unexported fields
}
ContentProcessor handles extracting, cleaning, processing (images, links), converting to Markdown, and saving page content.
func NewContentProcessor ¶
func NewContentProcessor(imgProcessor *ImageProcessor, appCfg *config.AppConfig, log *logrus.Entry) *ContentProcessor
NewContentProcessor creates a ContentProcessor.
func (*ContentProcessor) ExtractProcessAndSaveContent ¶
func (cp *ContentProcessor) ExtractProcessAndSaveContent( doc *goquery.Document, finalURL *url.URL, siteCfg *config.SiteConfig, siteOutputDir string, taskLog *logrus.Entry, ctx context.Context, ) (pageTitle string, savedFilePath string, markdownBytes []byte, imageCount int, err error)
ExtractProcessAndSaveContent extracts content using the configured selector, processes images and internal links, converts to Markdown, and saves to a path derived from finalURL and siteOutputDir. Returns the extracted page title and any critical error encountered during processing or saving.
type ImageDownloadTask ¶
type ImageDownloadTask struct {
AbsImgURL string
NormImgURL string
BaseImgURL *url.URL // Parsed absolute URL
ImgHost string
ExtractedCaption string
ImgLogEntry *logrus.Entry // Logger with image-specific context
Ctx context.Context // Context for this specific task
}
ImageDownloadTask holds the information needed for an image worker to process one image.
type ImageProcessor ¶
type ImageProcessor struct {
// contains filtered or unexported fields
}
ImageProcessor handles the orchestration of image downloading and processing.
func NewImageProcessor ¶
func NewImageProcessor( store storage.ImageStore, fetcher fetch.HTTPFetcher, robotsHandler *fetch.RobotsHandler, rateLimiter *fetch.RateLimiter, globalSemaphore *semaphore.Weighted, hostSemPool *fetch.HostSemaphorePool, resolved *config.ResolvedSiteConfig, appCfg *config.AppConfig, log *logrus.Entry, ) *ImageProcessor
NewImageProcessor creates a new ImageProcessor.
func (*ImageProcessor) ProcessImages ¶
func (ip *ImageProcessor) ProcessImages( mainContent *goquery.Selection, finalURL *url.URL, siteCfg *config.SiteConfig, siteOutputDir string, taskLog *logrus.Entry, ctx context.Context, ) (imageMap map[string]models.ImageData, imageErrs []error)
ProcessImages finds images within the main content, checks status, dispatches downloads to a worker pool, and returns a map of successfully processed images and any errors. It modifies the 'data-crawl-status' attribute on img tags in the selection.
type LinkProcessor ¶
type LinkProcessor struct {
// contains filtered or unexported fields
}
LinkProcessor handles extracting and queueing links found on a page.
func NewLinkProcessor ¶
func NewLinkProcessor( store storage.PageStore, pq *queue.ThreadSafePriorityQueue, compiledDisallowedPatterns []*regexp.Regexp, log *logrus.Entry, ) *LinkProcessor
NewLinkProcessor creates a LinkProcessor.
func (*LinkProcessor) ExtractAndQueueLinks ¶
func (lp *LinkProcessor) ExtractAndQueueLinks( originalDoc *goquery.Document, finalURL *url.URL, currentDepth int, siteCfg *config.SiteConfig, wg *sync.WaitGroup, taskLog *logrus.Entry, ) (queuedCount int, err error)
ExtractAndQueueLinks finds crawlable links within the specified selectors of a document, filters them based on scope and rules, and adds new ones to the priority queue. It takes the *original* document to ensure all potential links are considered before the content might be modified by Markdown conversion, etc.