Documentation ¶
Index ¶
- Variables
- func CleanBoxDrawingChars(text string) string
- func CleanZeroWidthChars(text string) string
- func CountPUAChars(text string) (count int, ratio float64)
- func DecodePUAWithShift(text string, shift int) string
- func DetectAndFormatLists(text string) string
- func DetectContentType(path string, content []byte) string
- func DetectFootnoteReferences(text string) []int
- func DetectParagraphBreaks(text string) string
- func DetectSymbolFont(text string) string
- func DetectSymbolGreekText(text string) float64
- func EnhancedParagraphDetection(text string, config ParagraphConfig) string
- func ExpandFootnoteReferences(text string) string
- func HasPUAChars(text string) bool
- func IsPUAChar(r rune) bool
- func JoinHyphenatedWords(text string) string
- func MapGlyphName(name string) (rune, bool)
- func NormalizeSubSuperscripts(text string) string
- func NormalizeSubscripts(text string) string
- func NormalizeSuperscripts(text string) string
- func NormalizeUnicode(text string) string
- func RepairSymbolGreekText(text string) string
- type CacheConfig
- type CacheEntry
- type CacheStats
- type CachingTransport
- type Column
- type ContentCache
- func (c *ContentCache) AddConditionalHeaders(req *http.Request) bool
- func (c *ContentCache) Clear() error
- func (c *ContentCache) ContentHash(body []byte) string
- func (c *ContentCache) Get(url string) *CacheEntry
- func (c *ContentCache) GetWithContentHash(hash string) []byte
- func (c *ContentCache) HandleNotModified(url string, headers http.Header) *CacheEntry
- func (c *ContentCache) IsDuplicate(body []byte) (bool, string)
- func (c *ContentCache) Set(url string, body []byte, headers http.Header, statusCode int)
- func (c *ContentCache) SetFromResponse(resp *http.Response, body []byte)
- func (c *ContentCache) Stats() CacheStats
- type ContentItem
- type ContentProcessor
- type ContentSource
- type DocumentSection
- type EncodingFallbackDecoder
- type EnhancedTextCleaner
- func (etc *EnhancedTextCleaner) Clean(text string) string
- func (etc *EnhancedTextCleaner) CleanForSearch(text string) string
- func (etc *EnhancedTextCleaner) CleanFull(text string) string
- func (etc *EnhancedTextCleaner) CleanWithEnhancedParagraphs(text string) string
- func (etc *EnhancedTextCleaner) CleanWithEnhancedParagraphsConfig(text string, config ParagraphConfig) string
- func (etc *EnhancedTextCleaner) CleanWithFootnotes(text string) string
- func (etc *EnhancedTextCleaner) CleanWithLists(text string) string
- func (etc *EnhancedTextCleaner) CleanWithParagraphs(text string) string
- type FilesystemSource
- type FilesystemSourceConfig
- type FontDecoder
- type GitAuth
- type GitSource
- type GitSourceConfig
- type GlyphMapper
- type GoogleDriveSource
- type GoogleDriveSourceConfig
- type HTMLProcessor
- type LayoutAnalyzer
- type LayoutConfig
- type LineInfo
- type MarkdownProcessor
- func (mp *MarkdownProcessor) CanProcess(contentType, path string) bool
- func (mp *MarkdownProcessor) ExtractQuestions(path, sourceURL string, content []byte) []Question
- func (mp *MarkdownProcessor) ExtractQuestionsWithSections(path, sourceURL string, content []byte) []Question
- func (mp *MarkdownProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
- type OpenAPIProcessor
- type PDFProcessor
- type PDFProgress
- type PDFProgressFunc
- type ParagraphConfig
- type Processor
- type ProcessorRegistry
- type Question
- type QuestionsExtractor
- type S3Source
- type S3SourceConfig
- type Table
- type TableCell
- type TextBlock
- type TextRepair
- func (tr *TextRepair) AutoDecodePUA(text string) (decoded string, description string)
- func (tr *TextRepair) AutoDecodeText(text string) (decoded string, fixed string)
- func (tr *TextRepair) AutoRepairMirroredText(text string) (string, bool)
- func (tr *TextRepair) CalculateLineEntropy(line string) float64
- func (tr *TextRepair) DecodeShiftedText(text string, shift int) string
- func (tr *TextRepair) DecodeSymbolSubstitution(text string, substMap map[rune]rune) string
- func (tr *TextRepair) DetectDepositionLayout(texts []pdf.Text) bool
- func (tr *TextRepair) DetectEncodedPattern(text string) (patternType string, description string)
- func (tr *TextRepair) DetectEncodingShift(text string) (shift int, confidence float64)
- func (tr *TextRepair) DetectFontEncodingCorruption(text string) float64
- func (tr *TextRepair) DetectMirroredText(text string) float64
- func (tr *TextRepair) DetectPUAShift(text string) (shift int, confidence float64)
- func (tr *TextRepair) DetectSymbolSubstitution(text string) (map[rune]rune, float64)
- func (tr *TextRepair) FilterFontEncodingCorruption(text string) string
- func (tr *TextRepair) FilterLineNumberColumn(texts []pdf.Text) []pdf.Text
- func (tr *TextRepair) FilterNoiseLines(text string) string
- func (tr *TextRepair) GetDetectedFooters() []string
- func (tr *TextRepair) GetDetectedHeaders() []string
- func (tr *TextRepair) IsFontEncodingCorrupted(text string) bool
- func (tr *TextRepair) IsNoiseLine(line string) bool
- func (tr *TextRepair) MarkFontEncodingIssues(text string) string
- func (tr *TextRepair) RecordPageContent(pageText string)
- func (tr *TextRepair) RemoveHeadersFooters(pageText string, headers, footers []string) string
- func (tr *TextRepair) RemoveInterleavedReplacements(text string) string
- func (tr *TextRepair) RepairMirroredText(text string) string
- func (tr *TextRepair) RepairMisspelledWords(text string) string
- func (tr *TextRepair) SegmentWords(text string) string
- type WebSource
- type WebSourceConfig
- type WholeFileProcessor
Constants ¶
This section is empty.
Variables ¶
var CommonEnglishWords = map[string]bool{}/* 311 elements not displayed */
CommonEnglishWords contains common English words used for word segmentation. This is a subset of a top-10,000 word list; in production, you'd want the full list.
var CommonReversedWords = map[string]string{
"eht": "the", "dna": "and", "rof": "for", "era": "are", "tub": "but",
"ton": "not", "uoy": "you", "lla": "all", "nac": "can", "dah": "had",
"reh": "her", "saw": "was", "eno": "one", "ruo": "our", "tuo": "out",
"yad": "day", "teg": "get", "sah": "has", "mih": "him", "sih": "his",
"woh": "how", "nam": "man", "wen": "new", "won": "now", "dlo": "old",
"ees": "see", "emit": "time", "yrev": "very", "nehw": "when", "ohw": "who",
"yob": "boy", "did": "did", "sti": "its", "tel": "let", "tup": "put",
"yas": "say", "ehs": "she", "owt": "two", "yaw": "way", "taht": "that",
"siht": "this", "htiw": "with", "evah": "have", "morf": "from", "yeht": "they",
"neeb": "been", "evol": "love", "ekam": "make", "erom": "more", "ylno": "only",
"revo": "over", "hcus": "such", "ekat": "take", "naht": "than", "meht": "them",
"neht": "then", "eseht": "these", "gniht": "thing", "kniht": "think",
"erehw": "where", "hcihw": "which", "dlrow": "world", "dluow": "would",
"tuoba": "about", "retfa": "after", "niaga": "again", "tsniaga": "against",
}
CommonReversedWords contains common English words and their reversed forms. Used as a secondary check for mirrored text.
var CommonSymbolSubstitutions = map[rune]rune{
'$': 'A', '%': 'B', '&': 'C', '\'': 'D', '(': 'E', ')': 'F',
'*': 'G', '+': 'H', ',': 'I', '-': 'J', '.': 'K', '/': 'L',
'0': 'M', '1': 'N', '2': 'O', '3': 'P', '4': 'Q', '5': 'R',
'6': 'S', '7': 'T', '8': 'U', '9': 'V', ':': 'W', ';': 'X',
'<': 'Y', '=': 'Z',
}
CommonSymbolSubstitutions maps common symbol substitutions in encoded PDFs. Key: encoded symbol, Value: decoded character
var DepositionLayoutConfig = LayoutConfig{
ColumnGapThreshold: 12.0,
RowTolerance: 2.0,
MinRowsForColumnPct: 75,
FilterLineNumbers: true,
}
DepositionLayoutConfig provides optimized settings for deposition transcripts.
var EnglishBigramFrequency = map[string]float64{
"th": 0.0356, "he": 0.0307, "in": 0.0243, "er": 0.0205, "an": 0.0199,
"re": 0.0185, "on": 0.0176, "at": 0.0149, "en": 0.0145, "nd": 0.0135,
"ti": 0.0134, "es": 0.0134, "or": 0.0128, "te": 0.0120, "of": 0.0117,
"ed": 0.0117, "is": 0.0113, "it": 0.0112, "al": 0.0109, "ar": 0.0107,
"st": 0.0105, "to": 0.0104, "nt": 0.0104, "ng": 0.0095, "se": 0.0093,
"ha": 0.0093, "as": 0.0087, "ou": 0.0087, "io": 0.0083, "le": 0.0083,
"ve": 0.0083, "co": 0.0079, "me": 0.0079, "de": 0.0076, "hi": 0.0076,
"ri": 0.0073, "ro": 0.0073, "ic": 0.0070, "ne": 0.0069, "ea": 0.0069,
"ra": 0.0069, "ce": 0.0065, "li": 0.0062, "ch": 0.0060, "ll": 0.0058,
"be": 0.0058, "ma": 0.0057, "si": 0.0055, "om": 0.0055, "ur": 0.0054,
}
EnglishBigramFrequency contains the top English bigram frequencies. These are used to detect reversed text by comparing bigram distributions.
var EnglishLetterFrequency = map[rune]float64{
'e': 0.127, 't': 0.091, 'a': 0.082, 'o': 0.075, 'i': 0.070,
'n': 0.067, 's': 0.063, 'h': 0.061, 'r': 0.060, 'd': 0.043,
'l': 0.040, 'c': 0.028, 'u': 0.028, 'm': 0.024, 'w': 0.024,
'f': 0.022, 'g': 0.020, 'y': 0.020, 'p': 0.019, 'b': 0.015,
'v': 0.010, 'k': 0.008, 'j': 0.002, 'x': 0.002, 'q': 0.001,
'z': 0.001,
}
EnglishLetterFrequency contains standard English letter frequencies.
var ExtendedGlyphNames = map[string]rune{}/* 114 elements not displayed */
ExtendedGlyphNames provides additional glyph name mappings beyond what the ledongthuc/pdf library includes.
var StandardEncoding = [256]rune{}/* 256 elements not displayed */
StandardEncoding is the PostScript standard encoding. Used by Type 1 fonts when no other encoding is specified. See PDF Reference Table D.1
var SubscriptMap = map[rune]rune{
'0': '₀', '1': '₁', '2': '₂', '3': '₃', '4': '₄',
'5': '₅', '6': '₆', '7': '₇', '8': '₈', '9': '₉',
'+': '₊', '-': '₋', '=': '₌', '(': '₍', ')': '₎',
'a': 'ₐ', 'e': 'ₑ', 'h': 'ₕ', 'i': 'ᵢ', 'j': 'ⱼ',
'k': 'ₖ', 'l': 'ₗ', 'm': 'ₘ', 'n': 'ₙ', 'o': 'ₒ',
'p': 'ₚ', 'r': 'ᵣ', 's': 'ₛ', 't': 'ₜ', 'u': 'ᵤ',
'v': 'ᵥ', 'x': 'ₓ',
}
SubscriptMap maps regular digits and letters to their Unicode subscript equivalents.
var SubscriptToNormal = map[rune]rune{
'₀': '0', '₁': '1', '₂': '2', '₃': '3', '₄': '4',
'₅': '5', '₆': '6', '₇': '7', '₈': '8', '₉': '9',
'₊': '+', '₋': '-', '₌': '=', '₍': '(', '₎': ')',
'ₐ': 'a', 'ₑ': 'e', 'ₕ': 'h', 'ᵢ': 'i', 'ⱼ': 'j',
'ₖ': 'k', 'ₗ': 'l', 'ₘ': 'm', 'ₙ': 'n', 'ₒ': 'o',
'ₚ': 'p', 'ᵣ': 'r', 'ₛ': 's', 'ₜ': 't', 'ᵤ': 'u',
'ᵥ': 'v', 'ₓ': 'x',
}
SubscriptToNormal maps Unicode subscript characters back to normal characters.
var SuperscriptMap = map[rune]rune{
'0': '⁰', '1': '¹', '2': '²', '3': '³', '4': '⁴',
'5': '⁵', '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹',
'+': '⁺', '-': '⁻', '=': '⁼', '(': '⁽', ')': '⁾',
'a': 'ᵃ', 'b': 'ᵇ', 'c': 'ᶜ', 'd': 'ᵈ', 'e': 'ᵉ',
'f': 'ᶠ', 'g': 'ᵍ', 'h': 'ʰ', 'i': 'ⁱ', 'j': 'ʲ',
'k': 'ᵏ', 'l': 'ˡ', 'm': 'ᵐ', 'n': 'ⁿ', 'o': 'ᵒ',
'p': 'ᵖ', 'r': 'ʳ', 's': 'ˢ', 't': 'ᵗ', 'u': 'ᵘ',
'v': 'ᵛ', 'w': 'ʷ', 'x': 'ˣ', 'y': 'ʸ', 'z': 'ᶻ',
}
SuperscriptMap maps regular digits and letters to their Unicode superscript equivalents.
var SuperscriptToNormal = map[rune]rune{
'⁰': '0', '¹': '1', '²': '2', '³': '3', '⁴': '4',
'⁵': '5', '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9',
'⁺': '+', '⁻': '-', '⁼': '=', '⁽': '(', '⁾': ')',
'ᵃ': 'a', 'ᵇ': 'b', 'ᶜ': 'c', 'ᵈ': 'd', 'ᵉ': 'e',
'ᶠ': 'f', 'ᵍ': 'g', 'ʰ': 'h', 'ⁱ': 'i', 'ʲ': 'j',
'ᵏ': 'k', 'ˡ': 'l', 'ᵐ': 'm', 'ⁿ': 'n', 'ᵒ': 'o',
'ᵖ': 'p', 'ʳ': 'r', 'ˢ': 's', 'ᵗ': 't', 'ᵘ': 'u',
'ᵛ': 'v', 'ʷ': 'w', 'ˣ': 'x', 'ʸ': 'y', 'ᶻ': 'z',
}
SuperscriptToNormal maps Unicode superscript characters back to normal characters.
var SymbolEncoding = [256]rune{}/* 256 elements not displayed */
SymbolEncoding is for the Symbol font (Greek letters, math symbols). See PDF Reference Table D.5
var SymbolToLatinMap = map[rune]rune{
'Α': 'A', 'Β': 'B', 'Χ': 'C', 'Δ': 'D', 'Ε': 'E', 'Φ': 'F',
'Γ': 'G', 'Η': 'H', 'Ι': 'I', 'ϑ': 'J', 'Κ': 'K', 'Λ': 'L',
'Μ': 'M', 'Ν': 'N', 'Ο': 'O', 'Π': 'P', 'Θ': 'Q', 'Ρ': 'R',
'Σ': 'S', 'Τ': 'T', 'Υ': 'U', 'ς': 'V', 'Ω': 'W', 'Ξ': 'X',
'Ψ': 'Y', 'Ζ': 'Z',
'α': 'a', 'β': 'b', 'χ': 'c', 'δ': 'd', 'ε': 'e', 'φ': 'f',
'γ': 'g', 'η': 'h', 'ι': 'i', 'ϕ': 'j', 'κ': 'k', 'λ': 'l',
'μ': 'm', 'ν': 'n', 'ο': 'o', 'π': 'p', 'θ': 'q', 'ρ': 'r',
'σ': 's', 'τ': 't', 'υ': 'u', 'ϖ': 'v', 'ω': 'w', 'ξ': 'x',
'ψ': 'y', 'ζ': 'z',
'∃': 'A', '∋': 'D', '∴': 'Y', '∀': '"', '∗': '*', '−': '-',
'≅': '@', '⊥': '_', '∼': '~',
}
SymbolToLatinMap provides reverse mapping from Symbol font Greek letters back to their ASCII Latin equivalents. This handles PDFs where Symbol font was used to represent English text (common in legal documents).
var ZapfDingbatsEncoding = [256]rune{}/* 256 elements not displayed */
ZapfDingbatsEncoding is for the ZapfDingbats font (decorative symbols). See PDF Reference Table D.6
Functions ¶
func CleanBoxDrawingChars ¶
func CleanBoxDrawingChars(text string) string
CleanBoxDrawingChars removes box drawing characters that appear as artifacts from PDF text extraction (common in forms and legal documents).
func CleanZeroWidthChars ¶
func CleanZeroWidthChars(text string) string
CleanZeroWidthChars removes invisible zero-width and control characters that can break word matching and text processing.
func CountPUAChars ¶
func CountPUAChars(text string) (count int, ratio float64)
CountPUAChars returns the count and ratio of PUA characters in text.
func DecodePUAWithShift ¶
func DecodePUAWithShift(text string, shift int) string
DecodePUAWithShift decodes PUA-preserved bytes by applying a character shift. This handles custom font encodings where characters are shifted by a fixed offset. For example, a shift of 29 would decode PUA byte 65 as 65+29=94 ('n' in some encodings).
func DetectAndFormatLists ¶
func DetectAndFormatLists(text string) string
DetectAndFormatLists identifies list structures and formats them consistently. Normalizes bullet styles and adds consistent indentation.
func DetectContentType ¶
func DetectContentType(path string, content []byte) string
DetectContentType detects the MIME type from a file path and its content bytes.
func DetectFootnoteReferences ¶
func DetectFootnoteReferences(text string) []int
DetectFootnoteReferences finds patterns like "text¹" or "word²³" that indicate footnote references. Returns a list of positions where footnote superscripts were detected.
func DetectParagraphBreaks ¶
func DetectParagraphBreaks(text string) string
DetectParagraphBreaks analyzes lines and marks paragraph boundaries. Returns text with paragraph breaks as double newlines. Uses heuristics: spacing gaps, indentation, short lines, sentence endings.
func DetectSymbolFont ¶
func DetectSymbolFont(text string) string
DetectSymbolFont checks if text appears to be from a Symbol or Dingbats font based on the character patterns present.
func DetectSymbolGreekText ¶
func DetectSymbolGreekText(text string) float64
DetectSymbolGreekText checks if text appears to use Symbol font Greek letters in place of Latin letters. Returns the ratio of Greek letters to total letters.
func EnhancedParagraphDetection ¶
func EnhancedParagraphDetection(text string, config ParagraphConfig) string
EnhancedParagraphDetection applies sophisticated paragraph detection. Returns text with paragraph breaks as double newlines.
func ExpandFootnoteReferences ¶
func ExpandFootnoteReferences(text string) string
ExpandFootnoteReferences converts superscript footnote markers to bracketed format. Example: "statement¹" → "statement[1]"
func HasPUAChars ¶
func HasPUAChars(text string) bool
HasPUAChars returns true if the text contains any PUA characters.
func IsPUAChar ¶
func IsPUAChar(r rune) bool
IsPUAChar returns true if the rune is in the Private Use Area range used to preserve unmapped font bytes (U+E000-U+E0FF).
func JoinHyphenatedWords ¶
func JoinHyphenatedWords(text string) string
JoinHyphenatedWords joins words that were split across lines with hyphens. Handles both hard hyphens (deliberate) and soft hyphens (formatting). Example: "state-\nment" → "statement"
func MapGlyphName ¶
func MapGlyphName(name string) (rune, bool)
MapGlyphName tries to map a glyph name to Unicode using extended mappings. Returns the rune and true if found, or 0 and false if not.
func NormalizeSubSuperscripts ¶
func NormalizeSubSuperscripts(text string) string
NormalizeSubSuperscripts normalizes both subscripts and superscripts to regular characters.
func NormalizeSubscripts ¶
func NormalizeSubscripts(text string) string
NormalizeSubscripts converts Unicode subscript characters to regular characters. Useful for text search and indexing where H₂O should match H2O.
func NormalizeSuperscripts ¶
func NormalizeSuperscripts(text string) string
NormalizeSuperscripts converts Unicode superscript characters to regular characters. Useful for text search and indexing where H²O should match H2O.
func NormalizeUnicode ¶
func NormalizeUnicode(text string) string
NormalizeUnicode applies NFC normalization to text. This converts combining character sequences to their composed forms, making text more consistent and searchable. Example: "é" (e + combining accent) → "é" (single character)
func RepairSymbolGreekText ¶
func RepairSymbolGreekText(text string) string
RepairSymbolGreekText converts Symbol font Greek letters back to Latin. Only applies if a significant portion of the text appears to be Greek.
Types ¶
type CacheConfig ¶
type CacheConfig struct {
// Enabled enables caching (default: false)
Enabled bool
// Dir is the directory for persistent cache storage.
// If empty, only in-memory caching is used.
Dir string
// TTL is the default time-to-live for cached entries.
// HTTP Cache-Control headers take precedence when present.
TTL time.Duration
// MaxMemoryItems is the maximum number of items to keep in memory (default: 1000).
MaxMemoryItems int
// MaxDiskSize is the maximum disk cache size in bytes (default: 100MB).
// Set to 0 for unlimited.
MaxDiskSize int64
// RespectCacheHeaders enables HTTP cache header parsing (default: true).
// When enabled, Cache-Control, ETag, and Last-Modified are respected.
RespectCacheHeaders bool
// EnableDeduplication enables content hash deduplication (default: true).
// Identical content from different URLs will be stored only once.
EnableDeduplication bool
}
CacheConfig configures the content cache behavior.
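A minimal sketch of constructing a cache from this configuration; the package's import path is not shown on this page, so the docsaf import name below is an assumption:

package main

import (
    "log"
    "time"

    docsaf "example.com/docsaf" // import path assumed; not shown on this page
)

func main() {
    cache, err := docsaf.NewContentCache(docsaf.CacheConfig{
        Enabled:             true,
        Dir:                 "/tmp/docsaf-cache", // persist to disk as well as memory
        TTL:                 15 * time.Minute,    // used when no Cache-Control header is present
        MaxMemoryItems:      500,
        MaxDiskSize:         50 << 20, // 50MB
        RespectCacheHeaders: true,
        EnableDeduplication: true,
    })
    if err != nil {
        log.Fatal(err)
    }
    stats := cache.Stats()
    log.Printf("memory entries=%d disk=%dB", stats.MemoryEntries, stats.DiskSizeBytes)
}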
type CacheEntry ¶
type CacheEntry struct {
// URL is the original request URL.
URL string `json:"url"`
// Body is the response body content.
Body []byte `json:"body,omitempty"`
// ContentType is the Content-Type header value.
ContentType string `json:"content_type"`
// StatusCode is the HTTP status code.
StatusCode int `json:"status_code"`
// ETag is the ETag header for conditional requests.
ETag string `json:"etag,omitempty"`
// LastModified is the Last-Modified header for conditional requests.
LastModified string `json:"last_modified,omitempty"`
// Expires is when this entry expires.
Expires time.Time `json:"expires"`
// CachedAt is when this entry was cached.
CachedAt time.Time `json:"cached_at"`
// ContentHash is the SHA-256 hash of the body for deduplication.
ContentHash string `json:"content_hash,omitempty"`
// BodyFile is the filename for disk-cached body (when using deduplication).
BodyFile string `json:"body_file,omitempty"`
}
CacheEntry represents a cached HTTP response.
func (*CacheEntry) CanRevalidate ¶
func (e *CacheEntry) CanRevalidate() bool
CanRevalidate returns true if the entry has validators for conditional requests.
func (*CacheEntry) IsExpired ¶
func (e *CacheEntry) IsExpired() bool
IsExpired returns true if the cache entry has expired.
func (*CacheEntry) IsStale ¶
func (e *CacheEntry) IsStale() bool
IsStale returns true if the entry is expired but may be revalidated.
type CacheStats ¶
type CacheStats struct {
MemoryEntries int
UniqueContents int
TotalSizeBytes int64
DiskSizeBytes int64
}
CacheStats contains cache statistics.
type CachingTransport ¶
type CachingTransport struct {
Transport http.RoundTripper
Cache *ContentCache
}
CachingTransport wraps http.RoundTripper with caching support.
func NewCachingTransport ¶
func NewCachingTransport(transport http.RoundTripper, cache *ContentCache) *CachingTransport
NewCachingTransport creates a new caching transport.
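A sketch of plugging the cache into an http.Client; it assumes *CachingTransport implements http.RoundTripper (implied by its role) and that the package is imported as docsaf:

import (
    "net/http"

    docsaf "example.com/docsaf" // import path assumed
)

// newCachedClient is a hypothetical helper: every request made through the
// returned client goes through the caching transport.
func newCachedClient(cache *docsaf.ContentCache) *http.Client {
    return &http.Client{
        Transport: docsaf.NewCachingTransport(http.DefaultTransport, cache),
    }
}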
type ContentCache ¶
type ContentCache struct {
// contains filtered or unexported fields
}
ContentCache provides HTTP-aware caching with optional disk persistence.
func NewContentCache ¶
func NewContentCache(config CacheConfig) (*ContentCache, error)
NewContentCache creates a new content cache with the given configuration.
func (*ContentCache) AddConditionalHeaders ¶
func (c *ContentCache) AddConditionalHeaders(req *http.Request) bool
AddConditionalHeaders adds If-None-Match and If-Modified-Since headers to a request if we have cached validators.
func (*ContentCache) Clear ¶
func (c *ContentCache) Clear() error
Clear removes all entries from the cache.
func (*ContentCache) ContentHash ¶
func (c *ContentCache) ContentHash(body []byte) string
ContentHash returns the hash of the given content.
func (*ContentCache) Get ¶
func (c *ContentCache) Get(url string) *CacheEntry
Get retrieves a cached entry for the given URL. Returns nil if not found or expired (and not revalidatable).
func (*ContentCache) GetWithContentHash ¶
func (c *ContentCache) GetWithContentHash(hash string) []byte
GetWithContentHash retrieves cached content by its content hash (for deduplication).
func (*ContentCache) HandleNotModified ¶
func (c *ContentCache) HandleNotModified(url string, headers http.Header) *CacheEntry
HandleNotModified updates an existing cache entry when a 304 is received.
func (*ContentCache) IsDuplicate ¶
func (c *ContentCache) IsDuplicate(body []byte) (bool, string)
IsDuplicate checks whether content with the same hash already exists.
func (*ContentCache) Set ¶
func (c *ContentCache) Set(url string, body []byte, headers http.Header, statusCode int)
Set stores a response body, headers, and status code in the cache for the given URL.
func (*ContentCache) SetFromResponse ¶
func (c *ContentCache) SetFromResponse(resp *http.Response, body []byte)
SetFromResponse stores an HTTP response in the cache.
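A hedged sketch of driving the revalidation flow by hand with Get, AddConditionalHeaders, HandleNotModified, and SetFromResponse; the fetch helper name and docsaf import name are hypothetical:

import (
    "fmt"
    "io"
    "net/http"

    docsaf "example.com/docsaf" // import path assumed
)

func fetch(cache *docsaf.ContentCache, url string) ([]byte, error) {
    if e := cache.Get(url); e != nil && !e.IsExpired() {
        return e.Body, nil // fresh hit
    }
    req, err := http.NewRequest(http.MethodGet, url, nil)
    if err != nil {
        return nil, err
    }
    cache.AddConditionalHeaders(req) // attaches If-None-Match / If-Modified-Since when possible
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()
    if resp.StatusCode == http.StatusNotModified {
        if e := cache.HandleNotModified(url, resp.Header); e != nil {
            return e.Body, nil // 304: serve the refreshed cache entry
        }
        return nil, fmt.Errorf("304 with no cached entry for %s", url)
    }
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, err
    }
    cache.SetFromResponse(resp, body)
    return body, nil
}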
func (*ContentCache) Stats ¶
func (c *ContentCache) Stats() CacheStats
Stats returns cache statistics.
type ContentItem ¶
type ContentItem struct {
// Path is the relative path or URL path for the content
Path string
// SourceURL is the full URL for web sources (empty for filesystem sources)
SourceURL string
// Content is the raw content bytes
Content []byte
// ContentType is the MIME type (e.g., "text/html", "application/pdf")
ContentType string
// Metadata contains source-specific metadata (HTTP headers, file info, etc.)
Metadata map[string]any
}
ContentItem represents a single piece of content from any source (filesystem, web, etc.)
type ContentProcessor ¶
type ContentProcessor interface {
// CanProcess returns true if this processor can handle the given content.
// contentType is the MIME type (may be empty)
// path is the file path or URL path
CanProcess(contentType, path string) bool
// Process processes content bytes and returns document sections.
// path: relative path or URL path for the content
// sourceURL: the original URL (for web) or empty (for filesystem)
// baseURL: the base URL for generating links
// content: raw bytes to process
Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
}
ContentProcessor processes content bytes into document sections. It works with raw bytes, making it suitable for both filesystem and web sources.
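A minimal sketch of a custom implementation; PlainTextProcessor and the docsaf import name are hypothetical, and the registry it extends is DefaultRegistry (documented below):

import (
    "path/filepath"
    "strings"

    docsaf "example.com/docsaf" // import path assumed
)

// PlainTextProcessor is a hypothetical ContentProcessor that indexes .txt files whole.
type PlainTextProcessor struct{}

func (p *PlainTextProcessor) CanProcess(contentType, path string) bool {
    return contentType == "text/plain" || strings.HasSuffix(path, ".txt")
}

func (p *PlainTextProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]docsaf.DocumentSection, error) {
    return []docsaf.DocumentSection{{
        ID:       path, // a real implementation should derive a stable unique ID
        FilePath: path,
        Title:    filepath.Base(path),
        Content:  string(content),
        Type:     "plain_text",
        URL:      baseURL + "/" + path,
    }}, nil
}

func newRegistryWithPlainText() docsaf.ProcessorRegistry {
    reg := docsaf.DefaultRegistry()
    reg.Register(&PlainTextProcessor{})
    return reg
}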
type ContentSource ¶
type ContentSource interface {
// Type returns the source type identifier (e.g., "filesystem", "web")
Type() string
// BaseURL returns the base URL for generating document links
BaseURL() string
// Traverse iterates over all content items from the source.
// It returns a channel of ContentItems and a channel for errors.
// The implementation should close both channels when done.
Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)
}
ContentSource represents a source of documents that can be traversed. Implementations include filesystem directories and web crawlers.
type DocumentSection ¶
type DocumentSection struct {
ID string // Unique ID for the section (generated from path + identifier)
FilePath string // Source path (relative path or URL path)
Title string // Section title (from heading or frontmatter)
Content string // Section content (markdown/text)
Type string // Document type (markdown_section, mdx_section, openapi_path, etc.)
URL string // URL to the document section (base URL + path + anchor)
SectionPath []string // Heading hierarchy path (e.g., ["Getting Started", "Installation", "Prerequisites"])
Questions []string // Questions associated with this section (just the question text)
Metadata map[string]any // Additional type-specific metadata
}
DocumentSection represents a generic document section extracted from content. It contains the content, metadata, and type information needed to index the section in Antfly.
func (*DocumentSection) ToDocument ¶
func (ds *DocumentSection) ToDocument() map[string]any
ToDocument converts a DocumentSection to a document map suitable for storage in Antfly.
type EncodingFallbackDecoder ¶
type EncodingFallbackDecoder struct {
// contains filtered or unexported fields
}
EncodingFallbackDecoder tries multiple encodings when text contains undecoded characters. This helps recover text from PDFs where the library failed to decode properly.
func NewEncodingFallbackDecoder ¶
func NewEncodingFallbackDecoder() *EncodingFallbackDecoder
NewEncodingFallbackDecoder creates a new decoder.
func (*EncodingFallbackDecoder) DecodeWithFallback ¶
func (d *EncodingFallbackDecoder) DecodeWithFallback(text string) (string, string)
DecodeWithFallback attempts to decode text using multiple encodings if needed. Returns the decoded text and the encoding used.
type EnhancedTextCleaner ¶
type EnhancedTextCleaner struct {
// contains filtered or unexported fields
}
EnhancedTextCleaner provides more aggressive text cleanup.
func NewEnhancedTextCleaner ¶
func NewEnhancedTextCleaner() *EnhancedTextCleaner
NewEnhancedTextCleaner creates a cleaner with all components initialized.
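A short usage sketch, assuming the package is imported as docsaf:

import docsaf "example.com/docsaf" // import path assumed

func cleanExtracted(raw string) string {
    etc := docsaf.NewEnhancedTextCleaner()
    // CleanForSearch additionally folds sub/superscripts so H₂O matches H2O.
    _ = etc.CleanForSearch(raw)
    // CleanFull applies the full Phase 1 + Phase 2 pipeline.
    return etc.CleanFull(raw)
}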
func (*EnhancedTextCleaner) Clean ¶
func (etc *EnhancedTextCleaner) Clean(text string) string
Clean applies all cleaning steps to extracted text.
func (*EnhancedTextCleaner) CleanForSearch ¶
func (etc *EnhancedTextCleaner) CleanForSearch(text string) string
CleanForSearch applies all cleaning steps plus normalization for text search. This normalizes subscripts/superscripts so that H₂O matches H2O.
func (*EnhancedTextCleaner) CleanFull ¶
func (etc *EnhancedTextCleaner) CleanFull(text string) string
CleanFull applies all Phase 1 and Phase 2 cleaning for maximum text quality. Includes: basic cleaning, enhanced paragraphs, list formatting, and footnote expansion.
func (*EnhancedTextCleaner) CleanWithEnhancedParagraphs ¶
func (etc *EnhancedTextCleaner) CleanWithEnhancedParagraphs(text string) string
CleanWithEnhancedParagraphs applies cleaning with sophisticated paragraph detection. Uses ML-like heuristics to detect headers, lists, indentation, and spacing patterns.
func (*EnhancedTextCleaner) CleanWithEnhancedParagraphsConfig ¶
func (etc *EnhancedTextCleaner) CleanWithEnhancedParagraphsConfig(text string, config ParagraphConfig) string
CleanWithEnhancedParagraphsConfig applies cleaning with configurable paragraph detection.
func (*EnhancedTextCleaner) CleanWithFootnotes ¶
func (etc *EnhancedTextCleaner) CleanWithFootnotes(text string) string
CleanWithFootnotes applies all cleaning and expands footnote references. Example: "statement¹" → "statement[1]"
func (*EnhancedTextCleaner) CleanWithLists ¶
func (etc *EnhancedTextCleaner) CleanWithLists(text string) string
CleanWithLists applies cleaning and normalizes list formatting. Ensures consistent bullet styles and indentation.
func (*EnhancedTextCleaner) CleanWithParagraphs ¶
func (etc *EnhancedTextCleaner) CleanWithParagraphs(text string) string
CleanWithParagraphs applies all cleaning steps plus paragraph detection. Use this when semantic paragraph structure is desired.
type FilesystemSource ¶
type FilesystemSource struct {
// contains filtered or unexported fields
}
FilesystemSource traverses a local filesystem directory and yields content items.
func NewFilesystemSource ¶
func NewFilesystemSource(config FilesystemSourceConfig) *FilesystemSource
NewFilesystemSource creates a new filesystem content source.
func (*FilesystemSource) BaseURL ¶
func (fs *FilesystemSource) BaseURL() string
BaseURL returns the base URL for this source.
func (*FilesystemSource) Traverse ¶
func (fs *FilesystemSource) Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)
Traverse walks the directory tree and yields content items for all matching files. It returns a channel of ContentItems and a channel for errors.
func (*FilesystemSource) Type ¶
func (fs *FilesystemSource) Type() string
Type returns "filesystem" as the source type.
type FilesystemSourceConfig ¶
type FilesystemSourceConfig struct {
// BaseDir is the base directory to traverse
BaseDir string
// BaseURL is the base URL for generating document links (optional).
BaseURL string
// IncludePatterns is a list of glob patterns to include.
// Files matching any include pattern will be processed.
// If empty, all files are included (subject to exclude patterns).
// Supports ** wildcards for recursive matching.
IncludePatterns []string
// ExcludePatterns is a list of glob patterns to exclude.
// Files matching any exclude pattern will be skipped.
// Default excludes are: .git/**
// Supports ** wildcards for recursive matching.
ExcludePatterns []string
}
FilesystemSourceConfig holds configuration for a FilesystemSource.
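A sketch of the full filesystem pipeline using this configuration with NewProcessor and DefaultRegistry (both documented below); the paths, URLs, and docsaf import name are illustrative:

package main

import (
    "context"
    "fmt"
    "log"

    docsaf "example.com/docsaf" // import path assumed
)

func main() {
    src := docsaf.NewFilesystemSource(docsaf.FilesystemSourceConfig{
        BaseDir:         "./docs",
        BaseURL:         "https://docs.example.com",
        IncludePatterns: []string{"**/*.md", "**/*.mdx"},
        ExcludePatterns: []string{"**/drafts/**"},
    })

    p := docsaf.NewProcessor(src, docsaf.DefaultRegistry())
    sections, err := p.Process(context.Background())
    if err != nil {
        log.Fatal(err)
    }
    for _, s := range sections {
        fmt.Println(s.Title, s.URL)
    }
}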
type FontDecoder ¶
type FontDecoder struct {
// contains filtered or unexported fields
}
FontDecoder handles font encoding issues common in PDFs.
func NewFontDecoder ¶
func NewFontDecoder() *FontDecoder
NewFontDecoder creates a FontDecoder with common substitutions.
func (*FontDecoder) Decode ¶
func (fd *FontDecoder) Decode(text string) string
Decode normalizes text by expanding ligatures and fixing encoding issues.
func (*FontDecoder) DecodeROT3 ¶
func (fd *FontDecoder) DecodeROT3(text string) string
DecodeROT3 attempts to decode ROT3-encoded text (common in some PDFs). ROT3 shifts each letter by 3 positions in the alphabet.
func (*FontDecoder) IsLikelyROT3 ¶
func (fd *FontDecoder) IsLikelyROT3(text string) bool
IsLikelyROT3 checks if text appears to be ROT3 encoded.
type GitAuth ¶
type GitAuth struct {
// Username for HTTPS authentication.
Username string
// Password or personal access token for HTTPS authentication.
Password string
// SSHKeyPath is the path to an SSH private key file.
SSHKeyPath string
}
GitAuth holds authentication credentials for private repositories.
type GitSource ¶
type GitSource struct {
// contains filtered or unexported fields
}
GitSource clones a Git repository and traverses its contents.
func NewGitSource ¶
func NewGitSource(config GitSourceConfig) (*GitSource, error)
NewGitSource creates a new Git content source.
func (*GitSource) Cleanup ¶
func (gs *GitSource) Cleanup()
Cleanup removes the cloned directory if it was a temporary directory.
func (*GitSource) Clone ¶
Clone clones the repository. Called automatically by Traverse if not already cloned.
func (*GitSource) CloneDir ¶
CloneDir returns the path to the cloned repository. Returns empty string if not yet cloned.
type GitSourceConfig ¶
type GitSourceConfig struct {
// URL is the Git repository URL (required).
// Supports:
// - Full URLs: https://github.com/owner/repo.git
// - GitHub shorthand: owner/repo (automatically expanded to https://github.com/owner/repo.git)
// - SSH URLs: git@github.com:owner/repo.git
URL string
// Ref is the branch, tag, or commit to checkout (default: default branch).
Ref string
// BaseURL is the base URL for generating document links.
// If empty, it will be derived from the repository URL.
BaseURL string
// SubPath is an optional subdirectory within the repo to traverse.
// Useful for monorepos or repos where docs are in a specific folder.
SubPath string
// IncludePatterns is a list of glob patterns for files to include.
// If empty, all files are included (subject to exclude patterns).
IncludePatterns []string
// ExcludePatterns is a list of glob patterns for files to exclude.
// Default excludes common non-content paths (.git, node_modules, etc.).
ExcludePatterns []string
// ShallowClone enables shallow cloning with depth 1 (default: true).
// Set to false for full history (needed for some operations).
ShallowClone bool
// CloneDir is an optional directory to clone into.
// If empty, a temporary directory is created and cleaned up after traversal.
CloneDir string
// KeepClone prevents cleanup of the cloned directory after traversal.
// Only applies when CloneDir is empty (temp directories).
KeepClone bool
// Auth holds optional authentication credentials.
Auth *GitAuth
}
GitSourceConfig holds configuration for a GitSource.
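A configuration sketch for a private repository; the repository, token variable, and docsaf import name are illustrative:

import (
    "os"

    docsaf "example.com/docsaf" // import path assumed
)

// newRepoSource is a hypothetical helper; the caller should defer src.Cleanup()
// to remove the temporary clone when done.
func newRepoSource() (*docsaf.GitSource, error) {
    return docsaf.NewGitSource(docsaf.GitSourceConfig{
        URL:             "owner/repo", // GitHub shorthand, expanded per the docs above
        Ref:             "main",
        SubPath:         "docs",
        IncludePatterns: []string{"**/*.md"},
        ShallowClone:    true,
        Auth: &docsaf.GitAuth{
            Username: "git",
            Password: os.Getenv("GITHUB_TOKEN"), // personal access token
        },
    })
}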
type GlyphMapper ¶
type GlyphMapper struct {
// contains filtered or unexported fields
}
GlyphMapper handles Private Use Area (PUA) character mapping. Many PDFs map custom fonts to PUA characters (U+E000-U+F8FF).
func NewGlyphMapper ¶
func NewGlyphMapper() *GlyphMapper
NewGlyphMapper creates a GlyphMapper with common PUA mappings.
func (*GlyphMapper) LearnFromContext ¶
func (gm *GlyphMapper) LearnFromContext(texts []pdf.Text)
LearnFromContext tries to learn PUA mappings from surrounding context. This is a heuristic approach that looks for patterns.
func (*GlyphMapper) Map ¶
func (gm *GlyphMapper) Map(text string) string
Map converts PUA characters to their ASCII equivalents if known.
type GoogleDriveSource ¶
type GoogleDriveSource struct {
// contains filtered or unexported fields
}
GoogleDriveSource traverses files in a Google Drive folder and yields content items.
func NewGoogleDriveSource ¶
func NewGoogleDriveSource(ctx context.Context, config GoogleDriveSourceConfig) (*GoogleDriveSource, error)
NewGoogleDriveSource creates a new Google Drive content source.
func (*GoogleDriveSource) BaseURL ¶
func (s *GoogleDriveSource) BaseURL() string
BaseURL returns the base URL for this source.
func (*GoogleDriveSource) Traverse ¶
func (s *GoogleDriveSource) Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)
Traverse lists files in the Google Drive folder and yields content items.
func (*GoogleDriveSource) Type ¶
func (s *GoogleDriveSource) Type() string
Type returns "google_drive" as the source type.
type GoogleDriveSourceConfig ¶
type GoogleDriveSourceConfig struct {
// CredentialsJSON is a service account key JSON string or file path.
// Either CredentialsJSON or AccessToken must be provided.
CredentialsJSON string
// AccessToken is a pre-obtained OAuth2 access token.
// Either CredentialsJSON or AccessToken must be provided.
AccessToken string
// FolderID is the Google Drive folder ID or full folder URL (required).
// Supports URLs like https://drive.google.com/drive/folders/<ID> or
// https://drive.google.com/drive/u/0/folders/<ID>.
FolderID string
// BaseURL is the base URL for generating document links (optional).
// If empty, defaults to the Google Drive folder URL.
BaseURL string
// IncludePatterns is a list of glob patterns to include.
// If empty, all files are included (subject to exclude patterns).
// Supports ** wildcards for recursive matching.
IncludePatterns []string
// ExcludePatterns is a list of glob patterns to exclude.
// Supports ** wildcards for recursive matching.
ExcludePatterns []string
// Concurrency controls how many parallel downloads run at once.
// Default: 5
Concurrency int
// Recursive controls whether subfolders are traversed.
// Default: true
Recursive bool
// IncludeSharedDrives includes files from shared drives when listing (inferred from the field name).
IncludeSharedDrives bool
// ExportFormats overrides the default export MIME type for Google Workspace files.
// Keys are Google Workspace MIME types, values are the export MIME types.
ExportFormats map[string]string
}
GoogleDriveSourceConfig holds configuration for a GoogleDriveSource.
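A configuration sketch; the folder URL, key path, and export override are illustrative (the default export formats are not listed on this page), and the docsaf import name is assumed:

import (
    "context"

    docsaf "example.com/docsaf" // import path assumed
)

func newDriveSource(ctx context.Context) (*docsaf.GoogleDriveSource, error) {
    return docsaf.NewGoogleDriveSource(ctx, docsaf.GoogleDriveSourceConfig{
        CredentialsJSON: "/etc/secrets/drive-sa.json", // service account key file path
        FolderID:        "https://drive.google.com/drive/folders/FOLDER_ID",
        Recursive:       true,
        Concurrency:     5,
        ExportFormats: map[string]string{
            // export Google Docs as plain text; the override value is illustrative
            "application/vnd.google-apps.document": "text/plain",
        },
    })
}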
type HTMLProcessor ¶
type HTMLProcessor struct{}
HTMLProcessor processes HTML (.html, .htm) content using goquery. It chunks content into sections by headings and extracts metadata from the document head.
func (*HTMLProcessor) CanProcess ¶
func (hp *HTMLProcessor) CanProcess(contentType, path string) bool
CanProcess returns true for HTML content types or .html/.htm extensions.
func (*HTMLProcessor) ExtractQuestions ¶
func (hp *HTMLProcessor) ExtractQuestions(path, sourceURL string, content []byte) []Question
ExtractQuestions extracts questions from HTML content. It looks for questions in:
1. data-docsaf-questions attributes (JSON array of strings or objects)
2. Elements with class "docsaf-questions" (extracts li text content)
Questions are associated with the section they appear in based on preceding headings.
func (*HTMLProcessor) Process ¶
func (hp *HTMLProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
Process processes HTML content and returns document sections. Questions found in the HTML are associated with their containing sections.
type LayoutAnalyzer ¶
type LayoutAnalyzer struct {
// Configuration
ColumnGapThreshold float64 // Minimum gap width to consider as column separator (in points)
RowTolerance float64 // Y-coordinate tolerance for grouping into rows
TableCellMinWidth float64 // Minimum cell width to consider for table detection
WordSpaceMultiplier float64 // Multiplier of font size to detect word boundaries
// Extended options
MinRowsForColumnPct int // Minimum percentage of rows that must have gap for column detection (default 25)
FilterLineNumbers bool // Whether to filter out line number columns (for depositions)
AutoDetectLayout bool // Automatically detect and use optimal layout settings
UseAdaptiveSpacing bool // Use dynamic spacing threshold based on actual character spacing (default true)
}
LayoutAnalyzer provides advanced PDF text extraction with column detection, table recognition, and improved reading order reconstruction.
func NewLayoutAnalyzer ¶
func NewLayoutAnalyzer() *LayoutAnalyzer
NewLayoutAnalyzer creates a LayoutAnalyzer with sensible defaults.
func NewLayoutAnalyzerWithConfig ¶
func NewLayoutAnalyzerWithConfig(cfg LayoutConfig) *LayoutAnalyzer
NewLayoutAnalyzerWithConfig creates a LayoutAnalyzer with custom configuration. Note: UseAdaptiveSpacing defaults to false when using explicit configuration. Set it to true manually if you want adaptive spacing with custom settings.
func (*LayoutAnalyzer) WithDepositionMode ¶
func (la *LayoutAnalyzer) WithDepositionMode() *LayoutAnalyzer
WithDepositionMode configures the analyzer for deposition transcript extraction. This uses tighter column detection and filters out line number columns.
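A sketch of both ways to get a transcript-tuned analyzer, assuming the package is imported as docsaf:

import docsaf "example.com/docsaf" // import path assumed

func depositionAnalyzers() (*docsaf.LayoutAnalyzer, *docsaf.LayoutAnalyzer) {
    // Fluent form, with the analyzer's own transcript tuning.
    la := docsaf.NewLayoutAnalyzer().WithDepositionMode()

    // Explicit form, built from the exported preset. Adaptive spacing defaults
    // to false with explicit configuration (see NewLayoutAnalyzerWithConfig above).
    la2 := docsaf.NewLayoutAnalyzerWithConfig(docsaf.DepositionLayoutConfig)
    la2.UseAdaptiveSpacing = true
    return la, la2
}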
type LayoutConfig ¶
type LayoutConfig struct {
ColumnGapThreshold float64 // Minimum gap for column detection
RowTolerance float64 // Y tolerance for row grouping
MinRowsForColumnPct int // % of rows that must have gap for column detection
FilterLineNumbers bool // Whether to filter out line number columns
}
LayoutConfig allows customization of layout analysis parameters.
func DefaultLayoutConfig ¶
func DefaultLayoutConfig() LayoutConfig
DefaultLayoutConfig returns standard layout configuration.
type LineInfo ¶
type LineInfo struct {
Text string
TrimmedText string
Indent int // Leading whitespace count
Length int // Trimmed length
EndsWithPeriod bool // Sentence ending
IsBullet bool // Starts with bullet marker
IsNumbered bool // Starts with number marker (1. or 1))
IsShort bool // Significantly shorter than median
IsEmpty bool // Whitespace only
IsHeader bool // Appears to be a header
FontSizeHint float64 // Relative font size (1.0 = normal)
}
LineInfo holds analyzed information about a single line of text.
func AnalyzeLine ¶
AnalyzeLine extracts information about a line for paragraph detection.
type MarkdownProcessor ¶
type MarkdownProcessor struct {
// MinTokensPerSection is the minimum token count before splitting into a new section.
// If 0, defaults to 500 tokens. Set to 1 to split on every heading (original behavior).
MinTokensPerSection int
}
MarkdownProcessor processes Markdown (.md) and MDX (.mdx) content using goldmark. It chunks content into sections by headings and extracts YAML frontmatter. Sections are merged if they would be too small (under MinTokensPerSection tokens).
func (*MarkdownProcessor) CanProcess ¶
func (mp *MarkdownProcessor) CanProcess(contentType, path string) bool
CanProcess returns true for markdown content types or .md/.mdx extensions.
func (*MarkdownProcessor) ExtractQuestions ¶
func (mp *MarkdownProcessor) ExtractQuestions(path, sourceURL string, content []byte) []Question
ExtractQuestions extracts questions from markdown/MDX content. It looks for questions in:
1. Frontmatter "questions" field
2. <Questions> MDX components inline in the content
func (*MarkdownProcessor) ExtractQuestionsWithSections ¶
func (mp *MarkdownProcessor) ExtractQuestionsWithSections(path, sourceURL string, content []byte) []Question
ExtractQuestionsWithSections extracts questions with section path information.
func (*MarkdownProcessor) Process ¶
func (mp *MarkdownProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
Process processes markdown content and returns document sections.
type OpenAPIProcessor ¶
type OpenAPIProcessor struct{}
OpenAPIProcessor processes OpenAPI specification content using libopenapi. It extracts API info, paths, and schemas as separate document sections.
func (*OpenAPIProcessor) CanProcess ¶
func (op *OpenAPIProcessor) CanProcess(contentType, path string) bool
CanProcess returns true for .yaml, .yml, and .json files. Note: The content will only be processed if it's a valid OpenAPI v3 specification.
func (*OpenAPIProcessor) ExtractQuestions ¶
func (op *OpenAPIProcessor) ExtractQuestions(path, sourceURL string, content []byte) ([]Question, error)
ExtractQuestions extracts x-docsaf-questions from OpenAPI extensions. It looks for questions at:
1. Top-level document info
2. Individual paths/operations
3. Component schemas
func (*OpenAPIProcessor) Process ¶
func (op *OpenAPIProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
Process processes OpenAPI specification content and returns document sections. Returns an error if the content is not a valid OpenAPI v3 specification. Questions from x-docsaf-questions extensions are associated with their sections.
type PDFProcessor ¶
type PDFProcessor struct {
// EnableHeaderFooterDetection enables detection and removal of repeating page headers and footers.
// When enabled, the processor makes two passes: first to detect patterns,
// then to extract text with headers/footers removed.
EnableHeaderFooterDetection bool
// EnableMirroredTextRepair enables automatic detection and repair of mirrored/reversed text.
// Uses bigram frequency analysis to detect text that has been horizontally flipped.
EnableMirroredTextRepair bool
// ProgressFunc is called to report processing progress.
// If nil, no progress is reported.
ProgressFunc PDFProgressFunc
// ProgressInterval controls how often ProgressFunc is called.
// If 0, defaults to every 100 pages.
ProgressInterval int
// contains filtered or unexported fields
}
PDFProcessor processes PDF (.pdf) content using the ledongthuc/pdf library. It chunks content into sections by pages and extracts metadata from the PDF Info dictionary.
func (*PDFProcessor) CanProcess ¶
func (pp *PDFProcessor) CanProcess(contentType, path string) bool
CanProcess returns true for PDF content types or .pdf extensions.
func (*PDFProcessor) Process ¶
func (pp *PDFProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
Process processes PDF content and returns document sections. Each page becomes a separate section, with text extracted via GetTextByRow() for better handling of tables and complex layouts.
type PDFProgress ¶
type PDFProgress struct {
Phase string // "header_detection", "extraction"
Page int // Current page number (1-based)
TotalPages int // Total pages in document
FilePath string // Path to the PDF file
}
PDFProgress reports progress during PDF processing.
type PDFProgressFunc ¶
type PDFProgressFunc func(progress PDFProgress) error
PDFProgressFunc is called to report PDF processing progress. Return an error to abort processing.
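A sketch wiring a progress callback into PDFProcessor; no constructor for PDFProcessor is shown on this page, so constructing it as a literal with zeroed unexported fields is an assumption:

import (
    "log"

    docsaf "example.com/docsaf" // import path assumed
)

func processBrief(pdfBytes []byte) ([]docsaf.DocumentSection, error) {
    pp := &docsaf.PDFProcessor{
        EnableHeaderFooterDetection: true,
        EnableMirroredTextRepair:    true,
        ProgressInterval:            50, // report every 50 pages instead of the default 100
        ProgressFunc: func(p docsaf.PDFProgress) error {
            log.Printf("%s: %s page %d/%d", p.FilePath, p.Phase, p.Page, p.TotalPages)
            return nil // returning an error aborts processing
        },
    }
    return pp.Process("brief.pdf", "", "https://docs.example.com", pdfBytes)
}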
type ParagraphConfig ¶
type ParagraphConfig struct {
// MinLineSpacingRatio: ratio of line spacing to median that indicates paragraph break
// Default 1.5 means 50% more spacing than median triggers break
MinLineSpacingRatio float64
// MinIndentChars: minimum indentation (in characters) for first-line indent detection
MinIndentChars int
// DetectLists: whether to detect and preserve bullet/numbered lists
DetectLists bool
// DetectHeaders: whether to detect headers based on font size patterns
DetectHeaders bool
// PreserveBlankLines: keep existing blank lines as paragraph breaks
PreserveBlankLines bool
}
ParagraphConfig configures enhanced paragraph detection behavior.
func DefaultParagraphConfig ¶
func DefaultParagraphConfig() ParagraphConfig
DefaultParagraphConfig returns sensible defaults for paragraph detection.
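A sketch of tweaking the defaults before handing them to CleanWithEnhancedParagraphsConfig; the chosen values are illustrative:

import docsaf "example.com/docsaf" // import path assumed

func cleanTranscript(raw string) string {
    cfg := docsaf.DefaultParagraphConfig()
    cfg.MinLineSpacingRatio = 1.8 // demand a wider gap before breaking paragraphs
    cfg.DetectHeaders = false     // transcript pages rarely carry real headings
    return docsaf.NewEnhancedTextCleaner().CleanWithEnhancedParagraphsConfig(raw, cfg)
}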
type Processor ¶
type Processor struct {
// contains filtered or unexported fields
}
Processor processes content from any source using registered processors. It abstracts the traversal mechanism, allowing the same processing logic to work with filesystem, web, and other content sources.
func NewProcessor ¶
func NewProcessor(source ContentSource, registry ProcessorRegistry) *Processor
NewProcessor creates a new processor. The source provides content items, and the registry provides processors to handle them.
func (*Processor) Process ¶
func (p *Processor) Process(ctx context.Context) ([]DocumentSection, error)
Process traverses the source and processes all content items. Returns a slice of all extracted DocumentSections.
func (*Processor) ProcessWithCallback ¶
func (p *Processor) ProcessWithCallback(ctx context.Context, callback func([]DocumentSection) error) error
ProcessWithCallback traverses the source and calls the callback for each batch of sections. This is useful for streaming large amounts of content without holding everything in memory.
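A streaming sketch, assuming the package is imported as docsaf:

import (
    "context"
    "fmt"

    docsaf "example.com/docsaf" // import path assumed
)

func streamSections(ctx context.Context, p *docsaf.Processor) error {
    return p.ProcessWithCallback(ctx, func(batch []docsaf.DocumentSection) error {
        for _, s := range batch {
            fmt.Println(s.ID, len(s.Content)) // hand each batch to an indexer here
        }
        return nil // return an error to stop processing early
    })
}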
func (*Processor) SetBaseURL ¶
SetBaseURL sets the base URL for generated links. This overrides the base URL from the source.
func (*Processor) SourceType ¶
SourceType returns the type of the underlying content source.
type ProcessorRegistry ¶
type ProcessorRegistry interface {
// Register adds a processor to the registry.
Register(processor ContentProcessor)
// GetProcessor returns the first processor that can handle the given content.
// Returns nil if no processor can handle the content.
GetProcessor(contentType, path string) ContentProcessor
// Processors returns all registered processors.
Processors() []ContentProcessor
}
ProcessorRegistry manages a collection of ContentProcessors.
func DefaultRegistry ¶
func DefaultRegistry() ProcessorRegistry
DefaultRegistry creates a registry with all built-in processors registered. This includes MarkdownProcessor, OpenAPIProcessor, HTMLProcessor, and PDFProcessor.
func NewRegistry ¶
func NewRegistry() ProcessorRegistry
NewRegistry creates a new empty processor registry. Use this to build a custom registry with only the processors you need.
func NewWholeFileRegistry ¶
func NewWholeFileRegistry() ProcessorRegistry
NewWholeFileRegistry creates a registry with only the WholeFileProcessor. This processor returns entire content without any chunking, allowing Antfly's internal chunking (e.g., Termite) to handle document segmentation during the embedding process.
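A sketch contrasting the two registries, assuming the package is imported as docsaf:

import docsaf "example.com/docsaf" // import path assumed

func newProcessors(src docsaf.ContentSource) (*docsaf.Processor, *docsaf.Processor) {
    // Chunked, processor-driven sections.
    chunked := docsaf.NewProcessor(src, docsaf.DefaultRegistry())
    // Whole files, deferring segmentation to Antfly's internal chunking (e.g. Termite).
    whole := docsaf.NewProcessor(src, docsaf.NewWholeFileRegistry())
    return chunked, whole
}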
type Question ¶
type Question struct {
// ID is a unique identifier for the question (generated from source + question text)
ID string
// Text is the question text itself
Text string
// SourcePath is the file path where the question was found
SourcePath string
// SourceURL is the URL to the source document (if available)
SourceURL string
// SourceType indicates where the question came from:
// "frontmatter", "mdx_component", "openapi_info", "openapi_path", "openapi_schema"
SourceType string
// Context provides additional context about where the question appears
// For MDX: section title or document title
// For OpenAPI: operation ID, path, or schema name
Context string
// SectionPath is the heading hierarchy path where the question appears
// (e.g., ["Getting Started", "Installation", "Prerequisites"])
SectionPath []string
// Metadata contains source-specific metadata
Metadata map[string]any
}
Question represents a question extracted from documentation. Questions can come from MDX frontmatter, <Questions> MDX components, or x-docsaf-questions OpenAPI extensions.
func (*Question) ToDocument ¶
ToDocument converts a Question to a document map suitable for storage.
type QuestionsExtractor ¶
type QuestionsExtractor struct{}
QuestionsExtractor extracts questions from various content sources.
func (*QuestionsExtractor) ExtractFromMDXContent ¶
func (qe *QuestionsExtractor) ExtractFromMDXContent(path, sourceURL string, content []byte, frontmatter map[string]any) []Question
ExtractFromMDXContent extracts questions from MDX/Markdown content. It looks for:
1. Questions in frontmatter (questions: [...])
2. <Questions> MDX components in the content
func (*QuestionsExtractor) ExtractFromOpenAPI ¶
func (qe *QuestionsExtractor) ExtractFromOpenAPI(path, sourceURL, sourceType, context string, extensions map[string]any) []Question
ExtractFromOpenAPI extracts x-docsaf-questions from OpenAPI extensions. The extensions map should contain the questions as a string array.
type S3Source ¶
type S3Source struct {
// contains filtered or unexported fields
}
S3Source traverses objects in an S3-compatible bucket and yields content items.
func NewS3Source ¶
func NewS3Source(config S3SourceConfig) (*S3Source, error)
NewS3Source creates a new S3 content source.
func (*S3Source) BaseURL ¶
BaseURL returns the base URL for this source. If not configured, returns an s3:// URL.
type S3SourceConfig ¶
type S3SourceConfig struct {
// Credentials holds S3/MinIO connection credentials.
// Supports keystore syntax and environment variable fallbacks.
Credentials s3.Credentials
// Bucket is the S3 bucket name (required).
Bucket string
// Prefix is an optional key prefix to filter objects.
// Only objects with this prefix will be listed.
// Example: "docs/" to only process objects in the docs/ folder.
Prefix string
// BaseURL is the base URL for generating document links (optional).
// If empty, URLs will use the s3:// scheme.
BaseURL string
// IncludePatterns is a list of glob patterns to include.
// Objects matching any include pattern will be processed.
// If empty, all objects are included (subject to exclude patterns).
// Supports ** wildcards for recursive matching.
// Patterns are matched against the object key (with prefix stripped if configured).
IncludePatterns []string
// ExcludePatterns is a list of glob patterns to exclude.
// Objects matching any exclude pattern will be skipped.
// Supports ** wildcards for recursive matching.
// Patterns are matched against the object key (with prefix stripped if configured).
ExcludePatterns []string
// Concurrency controls how many S3 GetObject requests run in parallel.
// Default: 5
Concurrency int
}
S3SourceConfig holds configuration for an S3Source.
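A configuration sketch; Credentials is left at its zero value on the assumption that the documented environment-variable fallbacks supply the connection details, and the bucket name and docsaf import name are illustrative:

import docsaf "example.com/docsaf" // import path assumed

func newBucketSource() (*docsaf.S3Source, error) {
    return docsaf.NewS3Source(docsaf.S3SourceConfig{
        // Credentials omitted: the zero value relies on the env-var fallbacks noted above.
        Bucket:          "company-docs",
        Prefix:          "docs/",
        IncludePatterns: []string{"**/*.pdf", "**/*.md"},
        Concurrency:     5,
    })
}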
type TextBlock ¶
type TextBlock struct {
X, Y float64
Width, Height float64
Text string
FontSize float64
Chars []pdf.Text // Original characters
}
TextBlock represents a block of text with position and content.
type TextRepair ¶
type TextRepair struct {
// Configuration
MinPagesSeen int // Minimum pages to analyze before detecting headers/footers
// contains filtered or unexported fields
}
TextRepair provides utilities for detecting and fixing common PDF text extraction issues.
func NewTextRepair ¶
func NewTextRepair() *TextRepair
NewTextRepair creates a new TextRepair with sensible defaults.
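A sketch chaining several repairs on one page of extracted text, assuming the package is imported as docsaf:

import docsaf "example.com/docsaf" // import path assumed

func repairPage(page string) string {
    tr := docsaf.NewTextRepair()
    if fixed, ok := tr.AutoRepairMirroredText(page); ok {
        page = fixed // text was confidently detected as mirrored
    }
    page = tr.SegmentWords(page)     // split zero-gap runs like "UNITEDSTATESDISTRICTCOURT"
    return tr.FilterNoiseLines(page) // drop high-entropy garbled lines
}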
func (*TextRepair) AutoDecodePUA ¶
func (tr *TextRepair) AutoDecodePUA(text string) (decoded string, description string)
AutoDecodePUA attempts to automatically decode PUA-preserved bytes. Returns the decoded text and a description of the decoding applied.
func (*TextRepair) AutoDecodeText ¶
func (tr *TextRepair) AutoDecodeText(text string) (decoded string, fixed string)
AutoDecodeText automatically detects and decodes text with encoding issues. Returns the decoded text and a description of what was fixed.
func (*TextRepair) AutoRepairMirroredText ¶
func (tr *TextRepair) AutoRepairMirroredText(text string) (string, bool)
AutoRepairMirroredText detects and repairs mirrored text if confidence is high enough. Returns the repaired text and whether repair was applied.
func (*TextRepair) CalculateLineEntropy ¶
func (tr *TextRepair) CalculateLineEntropy(line string) float64
CalculateLineEntropy calculates Shannon entropy of a line of text. Higher entropy indicates more randomness (potential garbled content).
func (*TextRepair) DecodeShiftedText ¶
func (tr *TextRepair) DecodeShiftedText(text string, shift int) string
DecodeShiftedText decodes text that has been shifted by the specified amount.
func (*TextRepair) DecodeSymbolSubstitution ¶
func (tr *TextRepair) DecodeSymbolSubstitution(text string, substMap map[rune]rune) string
DecodeSymbolSubstitution applies symbol substitution decoding to text.
func (*TextRepair) DetectDepositionLayout ¶
func (tr *TextRepair) DetectDepositionLayout(texts []pdf.Text) bool
DetectDepositionLayout checks if a page appears to be a deposition transcript. Deposition transcripts have:
- Line numbers 1-25 (or similar) in a narrow left column
- Consistent line spacing
- Q: and A: question/answer format
func (*TextRepair) DetectEncodedPattern ¶
func (tr *TextRepair) DetectEncodedPattern(text string) (patternType string, description string)
DetectEncodedPattern checks if a string matches a pattern that suggests it's an encoded version of a known format (like case numbers). Returns the detected pattern type and a description.
func (*TextRepair) DetectEncodingShift ¶
func (tr *TextRepair) DetectEncodingShift(text string) (shift int, confidence float64)
DetectEncodingShift analyzes text to detect if it uses a shifted alphabet encoding. Returns the detected shift (0-25) and a confidence score (0.0-1.0). A shift of 0 means no encoding detected or text is normal.
func (*TextRepair) DetectFontEncodingCorruption ¶
func (tr *TextRepair) DetectFontEncodingCorruption(text string) float64
DetectFontEncodingCorruption checks if text appears to be using a corrupted or non-standard font encoding. This happens when PDF fonts have custom glyph mappings that don't match standard character codes.
Characteristics of font-encoding corruption:
- Text looks like random letters but has structure (same length as expected)
- Unusual mix of uppercase/lowercase in patterns that don't match English
- High proportion of consonant clusters that are phonetically impossible
- No recognizable words despite looking like text
func (*TextRepair) DetectMirroredText ¶
func (tr *TextRepair) DetectMirroredText(text string) float64
DetectMirroredText analyzes text to detect if it appears to be reversed/mirrored. Returns a confidence score (0.0-1.0) where higher values indicate more likely mirroring.
func (*TextRepair) DetectPUAShift ¶
func (tr *TextRepair) DetectPUAShift(text string) (shift int, confidence float64)
DetectPUAShift analyzes text with PUA characters to detect the encoding shift. Returns the best shift value and confidence score based on resulting English frequency.
func (*TextRepair) DetectSymbolSubstitution ¶
func (tr *TextRepair) DetectSymbolSubstitution(text string) (map[rune]rune, float64)
DetectSymbolSubstitution checks if text uses symbol-to-letter substitution. Returns the substitution map and confidence score.
func (*TextRepair) FilterFontEncodingCorruption ¶
func (tr *TextRepair) FilterFontEncodingCorruption(text string) string
FilterFontEncodingCorruption removes or replaces lines with severe font encoding issues. Less severe issues are left in place; they can instead be flagged with MarkFontEncodingIssues.
func (*TextRepair) FilterLineNumberColumn ¶
func (tr *TextRepair) FilterLineNumberColumn(texts []pdf.Text) []pdf.Text
FilterLineNumberColumn removes the leftmost column if it contains only line numbers. It identifies line numbers (1-25) in the left margin and filters them out along with any other content in that narrow column, preserving the main text content.
func (*TextRepair) FilterNoiseLines ¶
func (tr *TextRepair) FilterNoiseLines(text string) string
FilterNoiseLines removes lines detected as garbled/corrupted content.
func (*TextRepair) GetDetectedFooters ¶
func (tr *TextRepair) GetDetectedFooters() []string
GetDetectedFooters returns footers detected across multiple pages.
func (*TextRepair) GetDetectedHeaders ¶
func (tr *TextRepair) GetDetectedHeaders() []string
GetDetectedHeaders returns headers detected across multiple pages. A line is considered a header if it appears (with edit distance tolerance) on most pages.
func (*TextRepair) IsFontEncodingCorrupted ¶
func (tr *TextRepair) IsFontEncodingCorrupted(text string) bool
IsFontEncodingCorrupted returns true if text appears to have font encoding issues.
func (*TextRepair) IsNoiseLine ¶
func (tr *TextRepair) IsNoiseLine(line string) bool
IsNoiseLine detects if a line is likely garbled/corrupted content. Uses entropy and character pattern analysis.
func (*TextRepair) MarkFontEncodingIssues ¶
func (tr *TextRepair) MarkFontEncodingIssues(text string) string
MarkFontEncodingIssues wraps text that appears to have font encoding issues with markers for downstream processing or flagging.
func (*TextRepair) RecordPageContent ¶
func (tr *TextRepair) RecordPageContent(pageText string)
RecordPageContent records the first and last lines of a page for pattern detection.
func (*TextRepair) RemoveHeadersFooters ¶
func (tr *TextRepair) RemoveHeadersFooters(pageText string, headers, footers []string) string
RemoveHeadersFooters removes detected headers and footers from page text.
func (*TextRepair) RemoveInterleavedReplacements ¶
func (tr *TextRepair) RemoveInterleavedReplacements(text string) string
RemoveInterleavedReplacements removes U+FFFD replacement characters that appear to be interleaved with real text (pattern: char, FFFD, char, FFFD, ...). This fixes PDFs where font encoding issues produce "C·O·N·F·I·D·E·N·T·I·A·L" patterns where · is U+FFFD.
func (*TextRepair) RepairMirroredText ¶
func (tr *TextRepair) RepairMirroredText(text string) string
RepairMirroredText reverses text that has been detected as mirrored. It can operate at word level or full text level.
func (*TextRepair) RepairMisspelledWords ¶
func (tr *TextRepair) RepairMisspelledWords(text string) string
RepairMisspelledWords uses edit distance to correct likely OCR errors. Conservative approach: only fixes words with small edit distance to known words.
func (*TextRepair) SegmentWords ¶
func (tr *TextRepair) SegmentWords(text string) string
SegmentWords uses dynamic programming to find optimal word boundaries in merged text. This handles PDFs whose zero-gap character positioning results in merged words, e.g. "UNITEDSTATESDISTRICTCOURT" → "UNITED STATES DISTRICT COURT"
type WebSource ¶
type WebSource struct {
// contains filtered or unexported fields
}
WebSource crawls websites and yields content items.
func NewWebSource ¶
func NewWebSource(config WebSourceConfig) (*WebSource, error)
NewWebSource creates a new web content source.
func (*WebSource) CacheStats ¶
func (ws *WebSource) CacheStats() *CacheStats
CacheStats returns statistics about the cache. Returns nil if caching is not enabled.
func (*WebSource) ClearCache ¶
ClearCache removes all entries from the cache.
type WebSourceConfig ¶
type WebSourceConfig struct {
// StartURL is the starting URL to crawl (required)
StartURL string
// BaseURL is the base URL for generating document links.
// If empty, it will be derived from StartURL.
BaseURL string
// AllowedDomains restricts crawling to these domains.
// If empty, only the domain from StartURL is allowed.
AllowedDomains []string
// IncludePatterns is a list of glob patterns for URL paths to include.
// If empty, all paths are included (subject to exclude patterns).
// Patterns match against the URL path (e.g., "/docs/**", "/guides/*")
IncludePatterns []string
// ExcludePatterns is a list of glob patterns for URL paths to exclude.
// Default excludes common non-content paths.
ExcludePatterns []string
// MaxDepth is the maximum crawl depth (0 = unlimited).
MaxDepth int
// MaxPages is the maximum number of pages to crawl (0 = unlimited).
MaxPages int
// Concurrency is the number of concurrent requests (default: 2).
Concurrency int
// RequestDelay is the delay between requests (default: 100ms).
RequestDelay time.Duration
// UserAgent is the User-Agent string to use for requests.
UserAgent string
// UseSitemap enables sitemap-based crawling.
// When enabled, the crawler will first fetch and parse the sitemap
// to discover URLs before following links.
UseSitemap bool
// SitemapURL is the URL of the sitemap (optional).
// If empty and UseSitemap is true, it will try /sitemap.xml
SitemapURL string
// SitemapOnly restricts crawling to URLs found in the sitemap only.
// When true, link discovery is disabled.
SitemapOnly bool
// RespectRobotsTxt enables robots.txt parsing (default: true).
RespectRobotsTxt bool
// MaxRetries is the number of retry attempts for failed requests (default: 3).
MaxRetries int
// RetryDelay is the base delay for exponential backoff retry (default: 1s).
// The actual delay doubles with each retry: 1s, 2s, 4s, etc.
RetryDelay time.Duration
// CacheTTL is how long to cache responses (default: 0 = disabled).
// Set to a positive duration to enable caching.
CacheTTL time.Duration
// CacheMaxItems is the maximum number of items to cache (default: 1000).
CacheMaxItems int
// CacheDir is the directory for persistent cache storage.
// If empty, only in-memory caching is used.
CacheDir string
// CacheRespectHeaders enables HTTP cache header parsing (default: true when caching enabled).
// When enabled, Cache-Control, ETag, and Last-Modified are respected.
CacheRespectHeaders bool
// CacheDeduplication enables content hash deduplication (default: true when caching enabled).
// Identical content from different URLs will be stored only once.
CacheDeduplication bool
// NormalizeURLs enables URL normalization for deduplication (default: true).
// Includes lowercasing host, removing default ports, removing trailing slashes.
NormalizeURLs bool
}
WebSourceConfig holds configuration for a WebSource.
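A crawler configuration sketch with sitemap seeding and caching enabled; the URLs and docsaf import name are illustrative:

import (
    "time"

    docsaf "example.com/docsaf" // import path assumed
)

func newDocsCrawler() (*docsaf.WebSource, error) {
    return docsaf.NewWebSource(docsaf.WebSourceConfig{
        StartURL:        "https://docs.example.com",
        IncludePatterns: []string{"/docs/**"},
        MaxDepth:        5,
        RequestDelay:    200 * time.Millisecond,
        UseSitemap:      true, // seed the crawl from /sitemap.xml when present
        CacheTTL:        time.Hour,
        CacheDir:        "/tmp/docsaf-web-cache",
    })
}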
type WholeFileProcessor ¶
type WholeFileProcessor struct{}
WholeFileProcessor processes content by returning it as a single section without any chunking. This is useful when you want Antfly's internal chunking (e.g., Termite) to handle document segmentation.
func (*WholeFileProcessor) CanProcess ¶
func (wfp *WholeFileProcessor) CanProcess(contentType, path string) bool
CanProcess returns true for common text-based file types.
func (*WholeFileProcessor) Process ¶
func (wfp *WholeFileProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
Process returns the entire content as a single DocumentSection.