docsaf

A generic content traversal and processing library for building documentation from various sources, including local filesystems, websites, Git repositories, and S3-compatible storage.

Features

  • Multiple Content Sources: Traverse local directories, crawl websites, clone Git repositories, or fetch from S3-compatible storage
  • Pluggable Processors: Markdown, HTML, PDF, OpenAPI, and custom processors
  • Web Crawling: Full-featured web crawler with sitemap support via go-colly
  • Git Integration: Clone and traverse Git repositories with branch/tag support
  • S3 Integration: Fetch and process documentation from S3, MinIO, R2, and other S3-compatible storage
  • URL Normalization: Consistent URL deduplication across crawls
  • Retry Logic: Exponential backoff for transient failures
  • Advanced Caching: HTTP-aware caching with disk persistence, ETag/Last-Modified support, and content deduplication
  • robots.txt Support: Respect crawling directives

Installation

go get github.com/antflydb/antfly-go/docsaf

Quick Start

Processing Local Files
package main

import (
    "context"
    "fmt"
    "log"

    "github.com/antflydb/antfly-go/docsaf"
)

func main() {
    // Create a filesystem source
    source := docsaf.NewFilesystemSource(docsaf.FilesystemSourceConfig{
        BaseDir:         "./docs",
        BaseURL:         "https://example.com/docs",
        IncludePatterns: []string{"**/*.md", "**/*.html"},
    })

    // Create processor with default registry (Markdown, HTML, PDF, OpenAPI)
    processor := docsaf.NewProcessor(source, docsaf.DefaultRegistry())

    // Process all content
    sections, err := processor.Process(context.Background())
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("Processed %d sections\n", len(sections))
}
Crawling a Website
package main

import (
    "context"
    "fmt"
    "log"
    "time"

    "github.com/antflydb/antfly-go/docsaf"
)

func main() {
    // Create a web source
    source, err := docsaf.NewWebSource(docsaf.WebSourceConfig{
        StartURL:         "https://docs.example.com",
        IncludePatterns:  []string{"/docs/**", "/guides/**"},
        UseSitemap:       true,
        MaxPages:         100,
        Concurrency:      2,
        RequestDelay:     200 * time.Millisecond,
        RespectRobotsTxt: true,
    })
    if err != nil {
        log.Fatal(err)
    }

    // Create processor
    processor := docsaf.NewProcessor(source, docsaf.DefaultRegistry())

    // Process with streaming callback
    err = processor.ProcessWithCallback(context.Background(), func(sections []docsaf.DocumentSection) error {
        for _, section := range sections {
            fmt.Printf("Section: %s - %s\n", section.Title, section.URL)
        }
        return nil
    })
    if err != nil {
        log.Fatal(err)
    }
}

Content Sources

FilesystemSource

Traverses local directories and yields files matching specified patterns.

source := docsaf.NewFilesystemSource(docsaf.FilesystemSourceConfig{
    // Required: Base directory to traverse
    BaseDir: "./docs",

    // Required: Base URL for generating document links
    BaseURL: "https://example.com/docs",

    // Optional: Glob patterns for files to include (default: all files)
    IncludePatterns: []string{"**/*.md", "**/*.html"},

    // Optional: Glob patterns for files to exclude
    ExcludePatterns: []string{"**/node_modules/**", "**/.git/**"},
})
WebSource

Crawls websites using go-colly with support for sitemaps, rate limiting, and more.

source, err := docsaf.NewWebSource(docsaf.WebSourceConfig{
    // Required: Starting URL for the crawl
    StartURL: "https://docs.example.com",

    // Optional: Base URL for document links (default: derived from StartURL)
    BaseURL: "https://docs.example.com",

    // Optional: Restrict crawling to these domains (default: StartURL domain)
    AllowedDomains: []string{"docs.example.com"},

    // Optional: URL path patterns to include
    IncludePatterns: []string{"/docs/**", "/api/**"},

    // Optional: URL path patterns to exclude (has defaults for static assets)
    ExcludePatterns: []string{"/blog/**"},

    // Optional: Maximum crawl depth (default: 0 = unlimited)
    MaxDepth: 5,

    // Optional: Maximum pages to crawl (default: 0 = unlimited)
    MaxPages: 1000,

    // Optional: Concurrent requests (default: 2)
    Concurrency: 4,

    // Optional: Delay between requests (default: 100ms)
    RequestDelay: 200 * time.Millisecond,

    // Optional: Custom User-Agent string
    UserAgent: "MyBot/1.0",

    // Optional: Enable sitemap-based URL discovery
    UseSitemap: true,

    // Optional: Custom sitemap URL (default: /sitemap.xml)
    SitemapURL: "https://docs.example.com/sitemap.xml",

    // Optional: Only crawl URLs from sitemap, disable link discovery
    SitemapOnly: false,

    // Optional: Respect robots.txt directives (default: true in colly)
    RespectRobotsTxt: true,

    // Optional: Number of retry attempts for failed requests (default: 3)
    MaxRetries: 3,

    // Optional: Base delay for exponential backoff (default: 1s)
    RetryDelay: 1 * time.Second,

    // Optional: Enable response caching with TTL (default: 0 = disabled)
    CacheTTL: 5 * time.Minute,

    // Optional: Maximum cached items (default: 1000)
    CacheMaxItems: 500,

    // Optional: Enable URL normalization (default: true)
    NormalizeURLs: true,
})
GitSource

Clones Git repositories and traverses their contents. Supports GitHub/GitLab shorthand, authentication, and branch/tag selection.

// Using GitHub shorthand
source, err := docsaf.NewGitSource(docsaf.GitSourceConfig{
    // GitHub shorthand - automatically expanded to https://github.com/owner/repo.git
    URL: "owner/repo",

    // Optional: Branch, tag, or commit to checkout (default: default branch)
    Ref: "main",
})

// Full configuration
source, err := docsaf.NewGitSource(docsaf.GitSourceConfig{
    // Required: Git repository URL
    // Supports: https://, git://, git@, ssh://, or GitHub shorthand (owner/repo)
    URL: "https://github.com/owner/repo.git",

    // Optional: Branch, tag, or commit to checkout
    Ref: "v1.0.0",

    // Optional: Base URL for document links (auto-derived for GitHub/GitLab)
    BaseURL: "https://github.com/owner/repo/blob/v1.0.0",

    // Optional: Subdirectory to traverse (useful for monorepos)
    SubPath: "docs",

    // Optional: Glob patterns for files to include
    IncludePatterns: []string{"**/*.md", "**/*.html"},

    // Optional: Glob patterns for files to exclude (has defaults)
    ExcludePatterns: []string{".git/**", "node_modules/**"},

    // Optional: Use shallow clone with depth 1 (default: true)
    ShallowClone: true,

    // Optional: Directory to clone into (default: temp directory)
    CloneDir: "/path/to/clone",

    // Optional: Keep clone directory after traversal (default: false)
    KeepClone: false,

    // Optional: Authentication for private repositories
    Auth: &docsaf.GitAuth{
        Username: "user",
        Password: "token-or-password",
        // Or use SSH key:
        // SSHKeyPath: "/path/to/id_rsa",
    },
})
Processing a Git Repository
package main

import (
    "context"
    "fmt"
    "log"

    "github.com/antflydb/antfly-go/docsaf"
)

func main() {
    // Clone and process a GitHub repository
    source, err := docsaf.NewGitSource(docsaf.GitSourceConfig{
        URL:             "owner/repo",
        Ref:             "main",
        SubPath:         "docs",
        IncludePatterns: []string{"**/*.md"},
    })
    if err != nil {
        log.Fatal(err)
    }

    processor := docsaf.NewProcessor(source, docsaf.DefaultRegistry())

    sections, err := processor.Process(context.Background())
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("Processed %d sections from repository\n", len(sections))
}
S3Source

Traverses objects in S3-compatible buckets (AWS S3, MinIO, R2, etc.) and yields files matching specified patterns.

// Basic configuration with MinIO
source, err := docsaf.NewS3Source(docsaf.S3SourceConfig{
    // Required: S3 credentials
    Credentials: s3.Credentials{
        Endpoint:        "s3.amazonaws.com",
        AccessKeyId:     "your-access-key",
        SecretAccessKey: "your-secret-key",
        UseSsl:          true,
    },

    // Required: Bucket name
    Bucket: "my-docs-bucket",

    // Optional: Key prefix to filter objects (e.g., "docs/" for only docs folder)
    Prefix: "docs/",

    // Optional: Base URL for generating document links
    BaseURL: "https://docs.example.com",

    // Optional: Glob patterns for objects to include (default: all objects)
    IncludePatterns: []string{"**/*.md", "**/*.mdx"},

    // Optional: Glob patterns for objects to exclude
    ExcludePatterns: []string{"**/drafts/**", "**/.DS_Store"},

    // Optional: Concurrent downloads (default: 5)
    Concurrency: 10,
})
Using with MinIO
source, err := docsaf.NewS3Source(docsaf.S3SourceConfig{
    Credentials: s3.Credentials{
        Endpoint:        "localhost:9000",
        AccessKeyId:     "minioadmin",
        SecretAccessKey: "minioadmin",
        UseSsl:          false,  // Disable SSL for local MinIO
    },
    Bucket:          "documentation",
    IncludePatterns: []string{"**/*.md"},
})
Using with AWS S3 and Environment Variables

The S3 source supports environment variable fallbacks for credentials:

// Credentials will fall back to AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
source, err := docsaf.NewS3Source(docsaf.S3SourceConfig{
    Credentials: s3.Credentials{
        Endpoint: "s3.amazonaws.com",
        UseSsl:   true,
    },
    Bucket: "my-bucket",
    Prefix: "documentation/",
})
Processing S3 Content
package main

import (
    "context"
    "fmt"
    "log"

    "github.com/antflydb/antfly-go/docsaf"
    "github.com/antflydb/antfly/antfly-go/libaf/s3"
)

func main() {
    // Create S3 source
    source, err := docsaf.NewS3Source(docsaf.S3SourceConfig{
        Credentials: s3.Credentials{
            Endpoint:        "s3.amazonaws.com",
            AccessKeyId:     "your-access-key",
            SecretAccessKey: "your-secret-key",
            UseSsl:          true,
        },
        Bucket:          "my-docs",
        Prefix:          "documentation/",
        BaseURL:         "https://docs.example.com",
        IncludePatterns: []string{"**/*.md", "**/*.mdx"},
        ExcludePatterns: []string{"**/drafts/**"},
        Concurrency:     5,
    })
    if err != nil {
        log.Fatal(err)
    }

    processor := docsaf.NewProcessor(source, docsaf.DefaultRegistry())

    sections, err := processor.Process(context.Background())
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("Processed %d sections from S3\n", len(sections))
}

Content Processors

Processors extract structured content from raw bytes. Each processor implements:

type ContentProcessor interface {
    CanProcess(contentType, path string) bool
    Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
}
Built-in Processors
Processor            File Extensions                             Content Types
MarkdownProcessor    .md, .markdown                              text/markdown
HTMLProcessor        .html, .htm                                 text/html
PDFProcessor         .pdf                                        application/pdf
OpenAPIProcessor     .yaml, .yml, .json (with OpenAPI content)   -
WholeFileProcessor   Any                                         Any (fallback)
Registries
// Default registry: Markdown, OpenAPI, HTML, PDF
registry := docsaf.DefaultRegistry()

// Whole file registry: treats entire files as single sections
registry := docsaf.NewWholeFileRegistry()

// Custom registry
registry := docsaf.NewRegistry()
registry.Register(&docsaf.MarkdownProcessor{})
registry.Register(&MyCustomProcessor{})
Custom Processors
type MyProcessor struct{}

func (p *MyProcessor) CanProcess(contentType, path string) bool {
    return strings.HasSuffix(path, ".custom")
}

func (p *MyProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]docsaf.DocumentSection, error) {
    // Parse content and return sections
    return []docsaf.DocumentSection{
        {
            Title:   "My Section",
            Content: string(content),
            URL:     baseURL + path,
        },
    }, nil
}

DocumentSection

The output of processing is a slice of DocumentSection:

type DocumentSection struct {
    ID          string         // Unique ID (generated from path + identifier)
    FilePath    string         // Source path (relative path or URL path)
    Title       string         // Section title (e.g., heading text)
    Content     string         // Section content as text
    Type        string         // Document type (markdown_section, openapi_path, etc.)
    URL         string         // Full URL to the section (with anchor if applicable)
    SectionPath []string       // Heading hierarchy path
    Questions   []string       // Questions associated with this section
    Metadata    map[string]any // Additional type-specific metadata
}

Questions Extraction

docsaf can extract questions from documentation as a separate top-level concept. Questions are useful for building FAQ systems, search optimization, or understanding what users want to know about your documentation.

Question Type
type Question struct {
    ID         string         // Unique identifier
    Text       string         // The question text
    SourcePath string         // File path where found
    SourceURL  string         // URL to the source document
    SourceType string         // Origin: "frontmatter", "mdx_component", "openapi_info", etc.
    Context    string         // Document title, operation ID, or schema name
    Metadata   map[string]any // Additional source-specific data
}
Extracting Questions from MDX/Markdown

Questions can be defined in two ways:

1. Frontmatter questions field:

---
title: Installation Guide
questions:
  - How do I install on Windows?
  - How do I install on macOS?
  - text: What are the system requirements?
    category: prerequisites
---

2. Inline <Questions> MDX components:

# Getting Started

<Questions>
- How do I install Antfly?
- Where can I download the CLI?
- What are the prerequisites?
</Questions>

Follow these steps to get started...

Usage:

mp := &docsaf.MarkdownProcessor{}
content, _ := os.ReadFile("guide.mdx")

questions := mp.ExtractQuestions("guide.mdx", "https://docs.example.com/guide", content)

for _, q := range questions {
    fmt.Printf("[%s] %s\n", q.SourceType, q.Text)
}
// Output:
// [frontmatter] How do I install on Windows?
// [frontmatter] How do I install on macOS?
// [mdx_component] How do I install Antfly?
// [mdx_component] Where can I download the CLI?
Extracting Questions from OpenAPI Specs

Use the x-docsaf-questions extension at any level in your OpenAPI spec:

openapi: "3.0.0"
info:
  title: User API
  version: "1.0"
  x-docsaf-questions:
    - How do I authenticate with this API?
    - What rate limits apply?

paths:
  /users:
    x-docsaf-questions:
      - How do I list all users?
    get:
      operationId: getUsers
      summary: Get all users
      x-docsaf-questions:
        - Can I paginate results?
        - How do I filter by status?

components:
  schemas:
    User:
      type: object
      x-docsaf-questions:
        - What fields are required?
        - How do I format the date field?

Usage:

op := &docsaf.OpenAPIProcessor{}
content, _ := os.ReadFile("api.yaml")

questions, err := op.ExtractQuestions("api.yaml", "https://api.example.com/spec", content)
if err != nil {
    log.Fatal(err)
}

for _, q := range questions {
    fmt.Printf("[%s] %s (context: %s)\n", q.SourceType, q.Text, q.Context)
}
// Output:
// [openapi_info] How do I authenticate with this API? (context: User API)
// [openapi_path] How do I list all users? (context: /users)
// [openapi_operation] Can I paginate results? (context: getUsers)
// [openapi_schema] What fields are required? (context: User)
Extracting Questions from HTML

Use data-docsaf-questions attributes or elements with the docsaf-questions class:

<!-- Using data attribute with JSON array -->
<div data-docsaf-questions='["How do I sign up?", "What payment methods are accepted?"]'>
  <h1>Getting Started</h1>
  ...
</div>

<!-- Using a dedicated questions container -->
<ul class="docsaf-questions">
  <li>How do I reset my password?</li>
  <li>Where can I find my API key?</li>
</ul>

Usage:

hp := &docsaf.HTMLProcessor{}
content, _ := os.ReadFile("page.html")

questions := hp.ExtractQuestions("page.html", "https://docs.example.com/page", content)
Question Metadata

Questions can include additional metadata when using the object format:

# In frontmatter or OpenAPI
questions:
  - text: How do I authenticate?
    category: security
    priority: high
    related_to: /docs/auth

This metadata is preserved in the Question.Metadata field.

URL Normalization

When NormalizeURLs is enabled (default), URLs are canonicalized for consistent deduplication:

  • Lowercase scheme and host: HTTPS://Example.COM → https://example.com
  • Remove default ports: https://example.com:443 → https://example.com
  • Remove trailing slashes: https://example.com/path/ → https://example.com/path
  • Remove fragments: https://example.com/path#section → https://example.com/path
  • Normalize empty paths: https://example.com → https://example.com/
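
For illustration, the rules above can be sketched with Go's net/url package. This mirrors the documented behavior and is not the library's internal implementation:

import (
    "net/url"
    "strings"
)

func normalizeURL(raw string) (string, error) {
    u, err := url.Parse(raw)
    if err != nil {
        return "", err
    }
    u.Scheme = strings.ToLower(u.Scheme)
    u.Host = strings.ToLower(u.Host)
    // Drop default ports for the scheme
    if (u.Scheme == "http" && u.Port() == "80") || (u.Scheme == "https" && u.Port() == "443") {
        u.Host = u.Hostname()
    }
    u.Fragment = ""                          // remove fragments
    u.Path = strings.TrimSuffix(u.Path, "/") // remove trailing slashes
    if u.Path == "" {
        u.Path = "/" // normalize empty paths
    }
    return u.String(), nil
}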

Retry Logic

Failed requests are automatically retried with exponential backoff:

  • Network errors and 5xx status codes trigger retries
  • Default: 3 retries with 1s base delay (1s, 2s, 4s)
  • Maximum delay capped at 30 seconds
  • Respects context cancellation
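
The schedule above is plain exponential backoff with a cap. A minimal sketch of the delay computation (the function name is illustrative, not the library's internals):

// delayFor returns the backoff delay before retry attempt n (0-based),
// doubling from the base delay and capping at 30 seconds.
func delayFor(base time.Duration, n int) time.Duration {
    d := base << n // base * 2^n: 1s, 2s, 4s, ...
    if d > 30*time.Second {
        d = 30 * time.Second
    }
    return d
}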

Caching

Simple In-Memory Cache

Enable basic in-memory response caching:

source, _ := docsaf.NewWebSource(docsaf.WebSourceConfig{
    StartURL:      "https://docs.example.com",
    CacheTTL:      10 * time.Minute,  // Cache responses for 10 minutes
    CacheMaxItems: 500,               // Keep at most 500 responses
})
Advanced Caching Features

Enable HTTP-aware caching with disk persistence and content deduplication:

source, _ := docsaf.NewWebSource(docsaf.WebSourceConfig{
    StartURL: "https://docs.example.com",

    // Basic cache settings
    CacheTTL:      1 * time.Hour,
    CacheMaxItems: 1000,

    // Persistent disk cache
    CacheDir: "/tmp/docsaf-cache",

    // Respect HTTP cache headers (Cache-Control, ETag, Last-Modified)
    CacheRespectHeaders: true,

    // Deduplicate identical content across URLs
    CacheDeduplication: true,
})
Cache Features
Feature                 Description
HTTP Headers            Respects Cache-Control, max-age, ETag, Last-Modified
Conditional Requests    Sends If-None-Match and If-Modified-Since for revalidation
Disk Persistence        Cache survives restarts when CacheDir is set
Content Deduplication   Identical content stored once via SHA-256 hashing
LRU Eviction            Oldest entries removed when cache is full
Cache Management
// Check cache statistics
stats := source.CacheStats()
if stats != nil {
    fmt.Printf("Cached entries: %d\n", stats.MemoryEntries)
    fmt.Printf("Unique contents: %d\n", stats.UniqueContents)
    fmt.Printf("Total size: %d bytes\n", stats.TotalSizeBytes)
}

// Clear the cache
source.ClearCache()

// Check if a URL is cached
if source.IsCached("https://docs.example.com/page") {
    fmt.Println("Page is cached")
}

Pattern Matching

Include and exclude patterns use doublestar glob syntax:

Pattern          Matches
*.md             Markdown files in the current directory
**/*.md          Markdown files in any subdirectory
/docs/**         Everything under the /docs path
/api/*           Direct children of /api (not nested)
/**/*.{css,js}   All CSS and JS files
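
The "doublestar" name suggests the github.com/bmatcuk/doublestar package; assuming that package, a pattern can be sanity-checked directly:

import "github.com/bmatcuk/doublestar/v4"

matched, _ := doublestar.Match("**/*.md", "docs/guide/intro.md")
// matched == true: ** spans any number of path segments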

Architecture

┌─────────────────┐     ┌──────────────────┐     ┌───────────────────┐
│  ContentSource  │────▶│    Processor     │────▶│ DocumentSection[] │
│                 │     │                  │     │                   │
│ - Filesystem    │     │ Uses registry to │     │ - Title           │
│ - Web           │     │ find appropriate │     │ - Content         │
│ - Git           │     │ ContentProcessor │     │ - URL             │
└─────────────────┘     └──────────────────┘     └───────────────────┘
                               │
                               ▼
                    ┌──────────────────────┐
                    │  ProcessorRegistry   │
                    │                      │
                    │ - MarkdownProcessor  │
                    │ - HTMLProcessor      │
                    │ - PDFProcessor       │
                    │ - OpenAPIProcessor   │
                    └──────────────────────┘

License

See repository root for license information.

Documentation

Constants

This section is empty.

Variables

var CommonEnglishWords = map[string]bool{}/* 311 elements not displayed */

CommonEnglishWords contains common English words used for word segmentation. This is a subset of the top 10,000 most common words; in production, you'd want the full list.

var CommonReversedWords = map[string]string{
	"eht": "the", "dna": "and", "rof": "for", "era": "are", "tub": "but",
	"ton": "not", "uoy": "you", "lla": "all", "nac": "can", "dah": "had",
	"reh": "her", "saw": "was", "eno": "one", "ruo": "our", "tuo": "out",
	"yad": "day", "teg": "get", "sah": "has", "mih": "him", "sih": "his",
	"woh": "how", "nam": "man", "wen": "new", "won": "now", "dlo": "old",
	"ees": "see", "emit": "time", "yrev": "very", "nehw": "when", "ohw": "who",
	"yob": "boy", "did": "did", "sti": "its", "tel": "let", "tup": "put",
	"yas": "say", "ehs": "she", "owt": "two", "yaw": "way", "taht": "that",
	"siht": "this", "htiw": "with", "evah": "have", "morf": "from", "yeht": "they",
	"neeb": "been", "evol": "love", "ekam": "make", "erom": "more", "ylno": "only",
	"revo": "over", "hcus": "such", "ekat": "take", "naht": "than", "meht": "them",
	"neht": "then", "eseht": "these", "gniht": "thing", "kniht": "think",
	"erehw": "where", "hcihw": "which", "dlrow": "world", "dluow": "would",
	"tuoba": "about", "retfa": "after", "niaga": "again", "tsniaga": "against",
}

CommonReversedWords contains common English words and their reversed forms. Used as a secondary check for mirrored text.

var CommonSymbolSubstitutions = map[rune]rune{
	'$': 'A', '%': 'B', '&': 'C', '\'': 'D', '(': 'E', ')': 'F',
	'*': 'G', '+': 'H', ',': 'I', '-': 'J', '.': 'K', '/': 'L',
	'0': 'M', '1': 'N', '2': 'O', '3': 'P', '4': 'Q', '5': 'R',
	'6': 'S', '7': 'T', '8': 'U', '9': 'V', ':': 'W', ';': 'X',
	'<': 'Y', '=': 'Z',
}

CommonSymbolSubstitutions maps common symbol substitutions in encoded PDFs. Keys are the encoded symbols; values are the decoded characters.

var DepositionLayoutConfig = LayoutConfig{
	ColumnGapThreshold:  12.0,
	RowTolerance:        2.0,
	MinRowsForColumnPct: 75,
	FilterLineNumbers:   true,
}

DepositionLayoutConfig provides optimized settings for deposition transcripts.

var EnglishBigramFrequency = map[string]float64{
	"th": 0.0356, "he": 0.0307, "in": 0.0243, "er": 0.0205, "an": 0.0199,
	"re": 0.0185, "on": 0.0176, "at": 0.0149, "en": 0.0145, "nd": 0.0135,
	"ti": 0.0134, "es": 0.0134, "or": 0.0128, "te": 0.0120, "of": 0.0117,
	"ed": 0.0117, "is": 0.0113, "it": 0.0112, "al": 0.0109, "ar": 0.0107,
	"st": 0.0105, "to": 0.0104, "nt": 0.0104, "ng": 0.0095, "se": 0.0093,
	"ha": 0.0093, "as": 0.0087, "ou": 0.0087, "io": 0.0083, "le": 0.0083,
	"ve": 0.0083, "co": 0.0079, "me": 0.0079, "de": 0.0076, "hi": 0.0076,
	"ri": 0.0073, "ro": 0.0073, "ic": 0.0070, "ne": 0.0069, "ea": 0.0069,
	"ra": 0.0069, "ce": 0.0065, "li": 0.0062, "ch": 0.0060, "ll": 0.0058,
	"be": 0.0058, "ma": 0.0057, "si": 0.0055, "om": 0.0055, "ur": 0.0054,
}

EnglishBigramFrequency contains the top English bigram frequencies. These are used to detect reversed text by comparing bigram distributions.

var EnglishLetterFrequency = map[rune]float64{
	'e': 0.127, 't': 0.091, 'a': 0.082, 'o': 0.075, 'i': 0.070,
	'n': 0.067, 's': 0.063, 'h': 0.061, 'r': 0.060, 'd': 0.043,
	'l': 0.040, 'c': 0.028, 'u': 0.028, 'm': 0.024, 'w': 0.024,
	'f': 0.022, 'g': 0.020, 'y': 0.020, 'p': 0.019, 'b': 0.015,
	'v': 0.010, 'k': 0.008, 'j': 0.002, 'x': 0.002, 'q': 0.001,
	'z': 0.001,
}

EnglishLetterFrequency contains standard English letter frequencies.

var ExtendedGlyphNames = map[string]rune{}/* 114 elements not displayed */

ExtendedGlyphNames provides additional glyph name mappings beyond what the ledongthuc/pdf library includes.

var StandardEncoding = [256]rune{}/* 256 elements not displayed */

StandardEncoding is the PostScript standard encoding. Used by Type 1 fonts when no other encoding is specified. See PDF Reference Table D.1

var SubscriptMap = map[rune]rune{
	'0': '₀', '1': '₁', '2': '₂', '3': '₃', '4': '₄',
	'5': '₅', '6': '₆', '7': '₇', '8': '₈', '9': '₉',
	'+': '₊', '-': '₋', '=': '₌', '(': '₍', ')': '₎',
	'a': 'ₐ', 'e': 'ₑ', 'h': 'ₕ', 'i': 'ᵢ', 'j': 'ⱼ',
	'k': 'ₖ', 'l': 'ₗ', 'm': 'ₘ', 'n': 'ₙ', 'o': 'ₒ',
	'p': 'ₚ', 'r': 'ᵣ', 's': 'ₛ', 't': 'ₜ', 'u': 'ᵤ',
	'v': 'ᵥ', 'x': 'ₓ',
}

SubscriptMap maps regular digits and letters to their Unicode subscript equivalents.

var SubscriptToNormal = map[rune]rune{
	'₀': '0', '₁': '1', '₂': '2', '₃': '3', '₄': '4',
	'₅': '5', '₆': '6', '₇': '7', '₈': '8', '₉': '9',
	'₊': '+', '₋': '-', '₌': '=', '₍': '(', '₎': ')',
	'ₐ': 'a', 'ₑ': 'e', 'ₕ': 'h', 'ᵢ': 'i', 'ⱼ': 'j',
	'ₖ': 'k', 'ₗ': 'l', 'ₘ': 'm', 'ₙ': 'n', 'ₒ': 'o',
	'ₚ': 'p', 'ᵣ': 'r', 'ₛ': 's', 'ₜ': 't', 'ᵤ': 'u',
	'ᵥ': 'v', 'ₓ': 'x',
}

SubscriptToNormal maps Unicode subscript characters back to normal characters.

var SuperscriptMap = map[rune]rune{
	'0': '⁰', '1': '¹', '2': '²', '3': '³', '4': '⁴',
	'5': '⁵', '6': '⁶', '7': '⁷', '8': '⁸', '9': '⁹',
	'+': '⁺', '-': '⁻', '=': '⁼', '(': '⁽', ')': '⁾',
	'a': 'ᵃ', 'b': 'ᵇ', 'c': 'ᶜ', 'd': 'ᵈ', 'e': 'ᵉ',
	'f': 'ᶠ', 'g': 'ᵍ', 'h': 'ʰ', 'i': 'ⁱ', 'j': 'ʲ',
	'k': 'ᵏ', 'l': 'ˡ', 'm': 'ᵐ', 'n': 'ⁿ', 'o': 'ᵒ',
	'p': 'ᵖ', 'r': 'ʳ', 's': 'ˢ', 't': 'ᵗ', 'u': 'ᵘ',
	'v': 'ᵛ', 'w': 'ʷ', 'x': 'ˣ', 'y': 'ʸ', 'z': 'ᶻ',
}

SuperscriptMap maps regular digits and letters to their Unicode superscript equivalents.

var SuperscriptToNormal = map[rune]rune{
	'⁰': '0', '¹': '1', '²': '2', '³': '3', '⁴': '4',
	'⁵': '5', '⁶': '6', '⁷': '7', '⁸': '8', '⁹': '9',
	'⁺': '+', '⁻': '-', '⁼': '=', '⁽': '(', '⁾': ')',
	'ᵃ': 'a', 'ᵇ': 'b', 'ᶜ': 'c', 'ᵈ': 'd', 'ᵉ': 'e',
	'ᶠ': 'f', 'ᵍ': 'g', 'ʰ': 'h', 'ⁱ': 'i', 'ʲ': 'j',
	'ᵏ': 'k', 'ˡ': 'l', 'ᵐ': 'm', 'ⁿ': 'n', 'ᵒ': 'o',
	'ᵖ': 'p', 'ʳ': 'r', 'ˢ': 's', 'ᵗ': 't', 'ᵘ': 'u',
	'ᵛ': 'v', 'ʷ': 'w', 'ˣ': 'x', 'ʸ': 'y', 'ᶻ': 'z',
}

SuperscriptToNormal maps Unicode superscript characters back to normal characters.

var SymbolEncoding = [256]rune{}/* 256 elements not displayed */

SymbolEncoding is for the Symbol font (Greek letters, math symbols). See PDF Reference Table D.5

var SymbolToLatinMap = map[rune]rune{

	'Α': 'A',
	'Β': 'B',
	'Χ': 'C',
	'Δ': 'D',
	'Ε': 'E',
	'Φ': 'F',
	'Γ': 'G',
	'Η': 'H',
	'Ι': 'I',
	'ϑ': 'J',
	'Κ': 'K',
	'Λ': 'L',
	'Μ': 'M',
	'Ν': 'N',
	'Ο': 'O',
	'Π': 'P',
	'Θ': 'Q',
	'Ρ': 'R',
	'Σ': 'S',
	'Τ': 'T',
	'Υ': 'U',
	'ς': 'V',
	'Ω': 'W',
	'Ξ': 'X',
	'Ψ': 'Y',
	'Ζ': 'Z',

	'α': 'a',
	'β': 'b',
	'χ': 'c',
	'δ': 'd',
	'ε': 'e',
	'φ': 'f',
	'γ': 'g',
	'η': 'h',
	'ι': 'i',
	'ϕ': 'j',
	'κ': 'k',
	'λ': 'l',
	'μ': 'm',
	'ν': 'n',
	'ο': 'o',
	'π': 'p',
	'θ': 'q',
	'ρ': 'r',
	'σ': 's',
	'τ': 't',
	'υ': 'u',
	'ϖ': 'v',
	'ω': 'w',
	'ξ': 'x',
	'ψ': 'y',
	'ζ': 'z',

	'∃': 'A',
	'∋': 'D',
	'∴': 'Y',
	'∀': '"',
	'∗': '*',
	'−': '-',

	'≅': '@',
	'⊥': '_',
	'∼': '~',
}

SymbolToLatinMap provides reverse mapping from Symbol font Greek letters back to their ASCII Latin equivalents. This handles PDFs where Symbol font was used to represent English text (common in legal documents).

var ZapfDingbatsEncoding = [256]rune{}/* 256 elements not displayed */

ZapfDingbatsEncoding is for the ZapfDingbats font (decorative symbols). See PDF Reference Table D.6

Functions

func CleanBoxDrawingChars

func CleanBoxDrawingChars(text string) string

CleanBoxDrawingChars removes box drawing characters that appear as artifacts from PDF text extraction (common in forms and legal documents).

func CleanZeroWidthChars

func CleanZeroWidthChars(text string) string

CleanZeroWidthChars removes invisible zero-width and control characters that can break word matching and text processing.

func CountPUAChars

func CountPUAChars(text string) (count int, ratio float64)

CountPUAChars returns the count and ratio of PUA characters in text.

func DecodePUAWithShift

func DecodePUAWithShift(text string, shift int) string

DecodePUAWithShift decodes PUA-preserved bytes by applying a character shift. This handles custom font encodings where characters are shifted by a fixed offset. For example, a shift of 29 would decode PUA byte 65 as 65+29=94 ('n' in some encodings).

func DetectAndFormatLists

func DetectAndFormatLists(text string) string

DetectAndFormatLists identifies list structures and formats them consistently. Normalizes bullet styles and adds consistent indentation.

func DetectContentType

func DetectContentType(path string, content []byte) string

DetectContentType detects the MIME type from a file path.

func DetectFootnoteReferences

func DetectFootnoteReferences(text string) []int

DetectFootnoteReferences finds patterns like "text¹" or "word²³" that indicate footnote references. Returns a list of positions where footnote superscripts were detected.

func DetectParagraphBreaks

func DetectParagraphBreaks(text string) string

DetectParagraphBreaks analyzes lines and marks paragraph boundaries. Returns text with paragraph breaks as double newlines. Uses heuristics: spacing gaps, indentation, short lines, sentence endings.

func DetectSymbolFont

func DetectSymbolFont(text string) string

DetectSymbolFont checks if text appears to be from a Symbol or Dingbats font based on the character patterns present.

func DetectSymbolGreekText

func DetectSymbolGreekText(text string) float64

DetectSymbolGreekText checks if text appears to use Symbol font Greek letters in place of Latin letters. Returns the ratio of Greek letters to total letters.

func EnhancedParagraphDetection

func EnhancedParagraphDetection(text string, config ParagraphConfig) string

EnhancedParagraphDetection applies sophisticated paragraph detection. Returns text with paragraph breaks as double newlines.

func ExpandFootnoteReferences

func ExpandFootnoteReferences(text string) string

ExpandFootnoteReferences converts superscript footnote markers to bracketed format. Example: "statement¹" → "statement[1]"

func HasPUAChars

func HasPUAChars(text string) bool

HasPUAChars returns true if the text contains any PUA characters.

func IsPUAChar

func IsPUAChar(r rune) bool

IsPUAChar returns true if the rune is in the Private Use Area range used to preserve unmapped font bytes (U+E000-U+E0FF).

func JoinHyphenatedWords

func JoinHyphenatedWords(text string) string

JoinHyphenatedWords joins words that were split across lines with hyphens. Handles both hard hyphens (deliberate) and soft hyphens (formatting). Example: "state-\nment" → "statement"

func MapGlyphName

func MapGlyphName(name string) (rune, bool)

MapGlyphName tries to map a glyph name to Unicode using extended mappings. Returns the rune and true if found, or 0 and false if not.

func NormalizeSubSuperscripts

func NormalizeSubSuperscripts(text string) string

NormalizeSubSuperscripts normalizes both subscripts and superscripts to regular characters.

func NormalizeSubscripts

func NormalizeSubscripts(text string) string

NormalizeSubscripts converts Unicode subscript characters to regular characters. Useful for text search and indexing where H₂O should match H2O.

func NormalizeSuperscripts

func NormalizeSuperscripts(text string) string

NormalizeSuperscripts converts Unicode superscript characters to regular characters. Useful for text search and indexing where H²O should match H2O.
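
A quick usage sketch; the outputs follow from the mapping tables above:

water := docsaf.NormalizeSubscripts("H₂O")        // "H2O"
square := docsaf.NormalizeSuperscripts("x²")      // "x2"
both := docsaf.NormalizeSubSuperscripts("H₂O x²") // "H2O x2"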

func NormalizeUnicode

func NormalizeUnicode(text string) string

NormalizeUnicode applies NFC normalization to text. This converts combining character sequences to their composed forms, making text more consistent and searchable. Example: "é" (e + combining accent) → "é" (single character)

func RepairSymbolGreekText

func RepairSymbolGreekText(text string) string

RepairSymbolGreekText converts Symbol font Greek letters back to Latin. Only applies if a significant portion of the text appears to be Greek.

Types

type CacheConfig

type CacheConfig struct {
	// Enabled enables caching (default: false)
	Enabled bool

	// Dir is the directory for persistent cache storage.
	// If empty, only in-memory caching is used.
	Dir string

	// TTL is the default time-to-live for cached entries.
	// HTTP Cache-Control headers take precedence when present.
	TTL time.Duration

	// MaxMemoryItems is the maximum number of items to keep in memory (default: 1000).
	MaxMemoryItems int

	// MaxDiskSize is the maximum disk cache size in bytes (default: 100MB).
	// Set to 0 for unlimited.
	MaxDiskSize int64

	// RespectCacheHeaders enables HTTP cache header parsing (default: true).
	// When enabled, Cache-Control, ETag, and Last-Modified are respected.
	RespectCacheHeaders bool

	// EnableDeduplication enables content hash deduplication (default: true).
	// Identical content from different URLs will be stored only once.
	EnableDeduplication bool
}

CacheConfig configures the content cache behavior.
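
A construction sketch; the values are illustrative, and NewContentCache is documented below:

cache, err := docsaf.NewContentCache(docsaf.CacheConfig{
    Enabled:             true,
    Dir:                 "/tmp/docsaf-cache", // persists across restarts
    TTL:                 time.Hour,
    RespectCacheHeaders: true,
    EnableDeduplication: true,
})
if err != nil {
    log.Fatal(err)
}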

type CacheEntry

type CacheEntry struct {
	// URL is the original request URL.
	URL string `json:"url"`

	// Body is the response body content.
	Body []byte `json:"body,omitempty"`

	// ContentType is the Content-Type header value.
	ContentType string `json:"content_type"`

	// StatusCode is the HTTP status code.
	StatusCode int `json:"status_code"`

	// ETag is the ETag header for conditional requests.
	ETag string `json:"etag,omitempty"`

	// LastModified is the Last-Modified header for conditional requests.
	LastModified string `json:"last_modified,omitempty"`

	// Expires is when this entry expires.
	Expires time.Time `json:"expires"`

	// CachedAt is when this entry was cached.
	CachedAt time.Time `json:"cached_at"`

	// ContentHash is the SHA-256 hash of the body for deduplication.
	ContentHash string `json:"content_hash,omitempty"`

	// BodyFile is the filename for disk-cached body (when using deduplication).
	BodyFile string `json:"body_file,omitempty"`
}

CacheEntry represents a cached HTTP response.

func (*CacheEntry) CanRevalidate

func (e *CacheEntry) CanRevalidate() bool

CanRevalidate returns true if the entry has validators for conditional requests.

func (*CacheEntry) IsExpired

func (e *CacheEntry) IsExpired() bool

IsExpired returns true if the cache entry has expired.

func (*CacheEntry) IsStale

func (e *CacheEntry) IsStale() bool

IsStale returns true if the entry is expired but may be revalidated.

type CacheStats

type CacheStats struct {
	MemoryEntries  int
	UniqueContents int
	TotalSizeBytes int64
	DiskSizeBytes  int64
}

CacheStats contains cache statistics.

type CachingTransport

type CachingTransport struct {
	Transport http.RoundTripper
	Cache     *ContentCache
}

CachingTransport wraps http.RoundTripper with caching support.

func NewCachingTransport

func NewCachingTransport(transport http.RoundTripper, cache *ContentCache) *CachingTransport

NewCachingTransport creates a new caching transport.

func (*CachingTransport) RoundTrip

func (ct *CachingTransport) RoundTrip(req *http.Request) (*http.Response, error)

RoundTrip implements http.RoundTripper with caching.
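
A sketch of wiring the transport into a standard http.Client, given a *ContentCache from NewContentCache:

client := &http.Client{
    Transport: docsaf.NewCachingTransport(http.DefaultTransport, cache),
}
resp, err := client.Get("https://docs.example.com/page") // served from or written to the cache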

type Column

type Column struct {
	Left, Right float64
	Blocks      []TextBlock
}

Column represents a detected column region.

type ContentCache

type ContentCache struct {
	// contains filtered or unexported fields
}

ContentCache provides HTTP-aware caching with optional disk persistence.

func NewContentCache

func NewContentCache(config CacheConfig) (*ContentCache, error)

NewContentCache creates a new content cache with the given configuration.

func (*ContentCache) AddConditionalHeaders

func (c *ContentCache) AddConditionalHeaders(req *http.Request) bool

AddConditionalHeaders adds If-None-Match and If-Modified-Since headers to a request if we have cached validators.

func (*ContentCache) Clear

func (c *ContentCache) Clear() error

Clear removes all entries from the cache.

func (*ContentCache) ContentHash

func (c *ContentCache) ContentHash(body []byte) string

ContentHash returns the hash of the given content.

func (*ContentCache) Get

func (c *ContentCache) Get(url string) *CacheEntry

Get retrieves a cached entry for the given URL. Returns nil if not found or expired (and not revalidatable).

func (*ContentCache) GetWithContentHash

func (c *ContentCache) GetWithContentHash(hash string) []byte

GetWithContentHash retrieves a cached entry by content hash (for deduplication).

func (*ContentCache) HandleNotModified

func (c *ContentCache) HandleNotModified(url string, headers http.Header) *CacheEntry

HandleNotModified updates an existing cache entry when a 304 is received.

func (*ContentCache) IsDuplicate

func (c *ContentCache) IsDuplicate(body []byte) (bool, string)

IsDuplicate checks if content with this hash already exists.

func (*ContentCache) Set

func (c *ContentCache) Set(url string, body []byte, headers http.Header, statusCode int)

Set stores a response in the cache.

func (*ContentCache) SetFromResponse

func (c *ContentCache) SetFromResponse(resp *http.Response, body []byte)

SetFromResponse stores an HTTP response in the cache.

func (*ContentCache) Stats

func (c *ContentCache) Stats() CacheStats

Stats returns cache statistics.

type ContentItem

type ContentItem struct {
	// Path is the relative path or URL path for the content
	Path string

	// SourceURL is the full URL for web sources (empty for filesystem sources)
	SourceURL string

	// Content is the raw content bytes
	Content []byte

	// ContentType is the MIME type (e.g., "text/html", "application/pdf")
	ContentType string

	// Metadata contains source-specific metadata (HTTP headers, file info, etc.)
	Metadata map[string]any
}

ContentItem represents a single piece of content from any source (filesystem, web, etc.)

type ContentProcessor

type ContentProcessor interface {
	// CanProcess returns true if this processor can handle the given content.
	// contentType is the MIME type (may be empty)
	// path is the file path or URL path
	CanProcess(contentType, path string) bool

	// Process processes content bytes and returns document sections.
	// path: relative path or URL path for the content
	// sourceURL: the original URL (for web) or empty (for filesystem)
	// baseURL: the base URL for generating links
	// content: raw bytes to process
	Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)
}

ContentProcessor processes content bytes into document sections. It works with raw bytes, making it suitable for both filesystem and web sources.

type ContentSource

type ContentSource interface {
	// Type returns the source type identifier (e.g., "filesystem", "web")
	Type() string

	// BaseURL returns the base URL for generating document links
	BaseURL() string

	// Traverse iterates over all content items from the source.
	// It returns a channel of ContentItems and a channel for errors.
	// The implementation should close both channels when done.
	Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)
}

ContentSource represents a source of documents that can be traversed. Implementations include filesystem directories and web crawlers.
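
A sketch of draining a source directly, without a Processor. Both channels are read until closed, per the contract above:

items, errs := source.Traverse(context.Background())
for items != nil || errs != nil {
    select {
    case item, ok := <-items:
        if !ok {
            items = nil // drained
            continue
        }
        fmt.Printf("%s (%s, %d bytes)\n", item.Path, item.ContentType, len(item.Content))
    case err, ok := <-errs:
        if !ok {
            errs = nil
            continue
        }
        log.Println(err)
    }
}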

type DocumentSection

type DocumentSection struct {
	ID          string         // Unique ID for the section (generated from path + identifier)
	FilePath    string         // Source path (relative path or URL path)
	Title       string         // Section title (from heading or frontmatter)
	Content     string         // Section content (markdown/text)
	Type        string         // Document type (markdown_section, mdx_section, openapi_path, etc.)
	URL         string         // URL to the document section (base URL + path + anchor)
	SectionPath []string       // Heading hierarchy path (e.g., ["Getting Started", "Installation", "Prerequisites"])
	Questions   []string       // Questions associated with this section (just the question text)
	Metadata    map[string]any // Additional type-specific metadata
}

DocumentSection represents a generic document section extracted from content. It contains the content, metadata, and type information needed to index the section in Antfly.

func (*DocumentSection) ToDocument

func (ds *DocumentSection) ToDocument() map[string]any

ToDocument converts a DocumentSection to a document map suitable for storage in Antfly.
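
For example, converting processed sections into plain maps before indexing:

docs := make([]map[string]any, 0, len(sections))
for i := range sections {
    docs = append(docs, sections[i].ToDocument())
}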

type EncodingFallbackDecoder

type EncodingFallbackDecoder struct {
	// contains filtered or unexported fields
}

EncodingFallbackDecoder tries multiple encodings when text contains undecoded characters. This helps recover text from PDFs where the library failed to decode properly.

func NewEncodingFallbackDecoder

func NewEncodingFallbackDecoder() *EncodingFallbackDecoder

NewEncodingFallbackDecoder creates a new decoder.

func (*EncodingFallbackDecoder) DecodeWithFallback

func (d *EncodingFallbackDecoder) DecodeWithFallback(text string) (string, string)

DecodeWithFallback attempts to decode text using multiple encodings if needed. Returns the decoded text and the encoding used.

type EnhancedTextCleaner

type EnhancedTextCleaner struct {
	// contains filtered or unexported fields
}

EnhancedTextCleaner provides more aggressive text cleanup.

func NewEnhancedTextCleaner

func NewEnhancedTextCleaner() *EnhancedTextCleaner

NewEnhancedTextCleaner creates a cleaner with all components initialized.

func (*EnhancedTextCleaner) Clean

func (etc *EnhancedTextCleaner) Clean(text string) string

Clean applies all cleaning steps to extracted text.

func (*EnhancedTextCleaner) CleanForSearch

func (etc *EnhancedTextCleaner) CleanForSearch(text string) string

CleanForSearch applies all cleaning steps plus normalization for text search. This normalizes subscripts/superscripts so that H₂O matches H2O.

func (*EnhancedTextCleaner) CleanFull

func (etc *EnhancedTextCleaner) CleanFull(text string) string

CleanFull applies all Phase 1 and Phase 2 cleaning for maximum text quality. Includes: basic cleaning, enhanced paragraphs, list formatting, and footnote expansion.

func (*EnhancedTextCleaner) CleanWithEnhancedParagraphs

func (etc *EnhancedTextCleaner) CleanWithEnhancedParagraphs(text string) string

CleanWithEnhancedParagraphs applies cleaning with sophisticated paragraph detection. Uses ML-like heuristics to detect headers, lists, indentation, and spacing patterns.

func (*EnhancedTextCleaner) CleanWithEnhancedParagraphsConfig

func (etc *EnhancedTextCleaner) CleanWithEnhancedParagraphsConfig(text string, config ParagraphConfig) string

CleanWithEnhancedParagraphsConfig applies cleaning with configurable paragraph detection.

func (*EnhancedTextCleaner) CleanWithFootnotes

func (etc *EnhancedTextCleaner) CleanWithFootnotes(text string) string

CleanWithFootnotes applies all cleaning and expands footnote references. Example: "statement¹" → "statement[1]"

func (*EnhancedTextCleaner) CleanWithLists

func (etc *EnhancedTextCleaner) CleanWithLists(text string) string

CleanWithLists applies cleaning and normalizes list formatting. Ensures consistent bullet styles and indentation.

func (*EnhancedTextCleaner) CleanWithParagraphs

func (etc *EnhancedTextCleaner) CleanWithParagraphs(text string) string

CleanWithParagraphs applies all cleaning steps plus paragraph detection. Use this when semantic paragraph structure is desired.
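
A usage sketch; pick the method that matches how much restructuring you want:

cleaner := docsaf.NewEnhancedTextCleaner()

basic := cleaner.Clean(raw)              // core cleanup
forSearch := cleaner.CleanForSearch(raw) // plus sub/superscript normalization (H₂O matches H2O)
maximal := cleaner.CleanFull(raw)        // plus paragraphs, lists, footnote expansion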

type FilesystemSource

type FilesystemSource struct {
	// contains filtered or unexported fields
}

FilesystemSource traverses a local filesystem directory and yields content items.

func NewFilesystemSource

func NewFilesystemSource(config FilesystemSourceConfig) *FilesystemSource

NewFilesystemSource creates a new filesystem content source.

func (*FilesystemSource) BaseURL

func (fs *FilesystemSource) BaseURL() string

BaseURL returns the base URL for this source.

func (*FilesystemSource) Traverse

func (fs *FilesystemSource) Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)

Traverse walks the directory tree and yields content items for all matching files. It returns a channel of ContentItems and a channel for errors.

func (*FilesystemSource) Type

func (fs *FilesystemSource) Type() string

Type returns "filesystem" as the source type.

type FilesystemSourceConfig

type FilesystemSourceConfig struct {
	// BaseDir is the base directory to traverse
	BaseDir string

	// BaseURL is the base URL for generating document links (optional).
	BaseURL string

	// IncludePatterns is a list of glob patterns to include.
	// Files matching any include pattern will be processed.
	// If empty, all files are included (subject to exclude patterns).
	// Supports ** wildcards for recursive matching.
	IncludePatterns []string

	// ExcludePatterns is a list of glob patterns to exclude.
	// Files matching any exclude pattern will be skipped.
	// Default excludes are: .git/**
	// Supports ** wildcards for recursive matching.
	ExcludePatterns []string
}

FilesystemSourceConfig holds configuration for a FilesystemSource.

type FontDecoder

type FontDecoder struct {
	// contains filtered or unexported fields
}

FontDecoder handles font encoding issues common in PDFs.

func NewFontDecoder

func NewFontDecoder() *FontDecoder

NewFontDecoder creates a FontDecoder with common substitutions.

func (*FontDecoder) Decode

func (fd *FontDecoder) Decode(text string) string

Decode normalizes text by expanding ligatures and fixing encoding issues.

func (*FontDecoder) DecodeROT3

func (fd *FontDecoder) DecodeROT3(text string) string

DecodeROT3 attempts to decode ROT3-encoded text (common in some PDFs). ROT3 shifts each letter by 3 positions in the alphabet.

func (*FontDecoder) IsLikelyROT3

func (fd *FontDecoder) IsLikelyROT3(text string) bool

IsLikelyROT3 checks if text appears to be ROT3 encoded.
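
A guarded-repair sketch combining the detection and decode steps:

fd := docsaf.NewFontDecoder()
text = fd.Decode(text) // expand ligatures, fix common encoding issues
if fd.IsLikelyROT3(text) {
    text = fd.DecodeROT3(text)
}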

type GitAuth

type GitAuth struct {
	// Username for HTTPS authentication.
	Username string

	// Password or personal access token for HTTPS authentication.
	Password string

	// SSHKeyPath is the path to an SSH private key file.
	SSHKeyPath string
}

GitAuth holds authentication credentials for private repositories.

type GitSource

type GitSource struct {
	// contains filtered or unexported fields
}

GitSource clones a Git repository and traverses its contents.

func NewGitSource

func NewGitSource(config GitSourceConfig) (*GitSource, error)

NewGitSource creates a new Git content source.

func (*GitSource) BaseURL

func (gs *GitSource) BaseURL() string

BaseURL returns the base URL for this source.

func (*GitSource) Cleanup

func (gs *GitSource) Cleanup()

Cleanup removes the cloned directory if it was a temporary directory.

func (*GitSource) Clone

func (gs *GitSource) Clone(ctx context.Context) error

Clone clones the repository. Called automatically by Traverse if not already cloned.

func (*GitSource) CloneDir

func (gs *GitSource) CloneDir() string

CloneDir returns the path to the cloned repository. Returns empty string if not yet cloned.

func (*GitSource) Traverse

func (gs *GitSource) Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)

Traverse clones the repository (if not already) and yields content items.

func (*GitSource) Type

func (gs *GitSource) Type() string

Type returns "git" as the source type.

type GitSourceConfig

type GitSourceConfig struct {
	// URL is the Git repository URL (required).
	// Supports:
	//   - Full URLs: https://github.com/owner/repo.git
	//   - GitHub shorthand: owner/repo (automatically expanded to https://github.com/owner/repo.git)
	//   - SSH URLs: git@github.com:owner/repo.git
	URL string

	// Ref is the branch, tag, or commit to checkout (default: default branch).
	Ref string

	// BaseURL is the base URL for generating document links.
	// If empty, it will be derived from the repository URL.
	BaseURL string

	// SubPath is an optional subdirectory within the repo to traverse.
	// Useful for monorepos or repos where docs are in a specific folder.
	SubPath string

	// IncludePatterns is a list of glob patterns for files to include.
	// If empty, all files are included (subject to exclude patterns).
	IncludePatterns []string

	// ExcludePatterns is a list of glob patterns for files to exclude.
	// Default excludes common non-content paths (.git, node_modules, etc.).
	ExcludePatterns []string

	// ShallowClone enables shallow cloning with depth 1 (default: true).
	// Set to false for full history (needed for some operations).
	ShallowClone bool

	// CloneDir is an optional directory to clone into.
	// If empty, a temporary directory is created and cleaned up after traversal.
	CloneDir string

	// KeepClone prevents cleanup of the cloned directory after traversal.
	// Only applies when CloneDir is empty (temp directories).
	KeepClone bool

	// Auth holds optional authentication credentials.
	Auth *GitAuth
}

GitSourceConfig holds configuration for a GitSource.

type GlyphMapper

type GlyphMapper struct {
	// contains filtered or unexported fields
}

GlyphMapper handles Private Use Area (PUA) character mapping. Many PDFs map custom fonts to PUA characters (U+E000-U+F8FF).

func NewGlyphMapper

func NewGlyphMapper() *GlyphMapper

NewGlyphMapper creates a GlyphMapper with common PUA mappings.

func (*GlyphMapper) LearnFromContext

func (gm *GlyphMapper) LearnFromContext(texts []pdf.Text)

LearnFromContext tries to learn PUA mappings from surrounding context. This is a heuristic approach that looks for patterns.

func (*GlyphMapper) Map

func (gm *GlyphMapper) Map(text string) string

Map converts PUA characters to their ASCII equivalents if known.

type GoogleDriveSource

type GoogleDriveSource struct {
	// contains filtered or unexported fields
}

GoogleDriveSource traverses files in a Google Drive folder and yields content items.

func NewGoogleDriveSource

func NewGoogleDriveSource(ctx context.Context, config GoogleDriveSourceConfig) (*GoogleDriveSource, error)

NewGoogleDriveSource creates a new Google Drive content source.

func (*GoogleDriveSource) BaseURL

func (s *GoogleDriveSource) BaseURL() string

BaseURL returns the base URL for this source.

func (*GoogleDriveSource) Traverse

func (s *GoogleDriveSource) Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)

Traverse lists files in the Google Drive folder and yields content items.

func (*GoogleDriveSource) Type

func (s *GoogleDriveSource) Type() string

Type returns "google_drive" as the source type.

type GoogleDriveSourceConfig

type GoogleDriveSourceConfig struct {
	// CredentialsJSON is a service account key JSON string or file path.
	// Either CredentialsJSON or AccessToken must be provided.
	CredentialsJSON string

	// AccessToken is a pre-obtained OAuth2 access token.
	// Either CredentialsJSON or AccessToken must be provided.
	AccessToken string

	// FolderID is the Google Drive folder ID or full folder URL (required).
	// Supports URLs like https://drive.google.com/drive/folders/<ID> or
	// https://drive.google.com/drive/u/0/folders/<ID>.
	FolderID string

	// BaseURL is the base URL for generating document links (optional).
	// If empty, defaults to the Google Drive folder URL.
	BaseURL string

	// IncludePatterns is a list of glob patterns to include.
	// If empty, all files are included (subject to exclude patterns).
	// Supports ** wildcards for recursive matching.
	IncludePatterns []string

	// ExcludePatterns is a list of glob patterns to exclude.
	// Supports ** wildcards for recursive matching.
	ExcludePatterns []string

	// Concurrency controls how many parallel downloads run at once.
	// Default: 5
	Concurrency int

	// Recursive controls whether subfolders are traversed.
	// Default: true
	Recursive bool

	// IncludeSharedDrives enables listing files from shared/team drives.
	IncludeSharedDrives bool

	// ExportFormats overrides the default export MIME type for Google Workspace files.
	// Keys are Google Workspace MIME types, values are the export MIME types.
	ExportFormats map[string]string
}

GoogleDriveSourceConfig holds configuration for a GoogleDriveSource.
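
A construction sketch; the credentials path and folder URL are placeholders:

source, err := docsaf.NewGoogleDriveSource(ctx, docsaf.GoogleDriveSourceConfig{
    CredentialsJSON: "/path/to/service-account.json",
    FolderID:        "https://drive.google.com/drive/folders/<ID>",
    IncludePatterns: []string{"**/*.md"},
})
if err != nil {
    log.Fatal(err)
}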

type HTMLProcessor

type HTMLProcessor struct{}

HTMLProcessor processes HTML (.html, .htm) content using goquery. It chunks content into sections by headings and extracts metadata from the document head.

func (*HTMLProcessor) CanProcess

func (hp *HTMLProcessor) CanProcess(contentType, path string) bool

CanProcess returns true for HTML content types or .html/.htm extensions.

func (*HTMLProcessor) ExtractQuestions

func (hp *HTMLProcessor) ExtractQuestions(path, sourceURL string, content []byte) []Question

ExtractQuestions extracts questions from HTML content. It looks for questions in: 1. data-docsaf-questions attributes (JSON array of strings or objects) 2. Elements with class "docsaf-questions" (extracts li text content) Questions are associated with the section they appear in based on preceding headings.

func (*HTMLProcessor) Process

func (hp *HTMLProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)

Process processes HTML content and returns document sections. Questions found in the HTML are associated with their containing sections.

type LayoutAnalyzer

type LayoutAnalyzer struct {
	// Configuration
	ColumnGapThreshold  float64 // Minimum gap width to consider as column separator (in points)
	RowTolerance        float64 // Y-coordinate tolerance for grouping into rows
	TableCellMinWidth   float64 // Minimum cell width to consider for table detection
	WordSpaceMultiplier float64 // Multiplier of font size to detect word boundaries

	// Extended options
	MinRowsForColumnPct int  // Minimum percentage of rows that must have gap for column detection (default 25)
	FilterLineNumbers   bool // Whether to filter out line number columns (for depositions)
	AutoDetectLayout    bool // Automatically detect and use optimal layout settings
	UseAdaptiveSpacing  bool // Use dynamic spacing threshold based on actual character spacing (default true)
}

LayoutAnalyzer provides advanced PDF text extraction with column detection, table recognition, and improved reading order reconstruction.

func NewLayoutAnalyzer

func NewLayoutAnalyzer() *LayoutAnalyzer

NewLayoutAnalyzer creates a LayoutAnalyzer with sensible defaults.

func NewLayoutAnalyzerWithConfig

func NewLayoutAnalyzerWithConfig(cfg LayoutConfig) *LayoutAnalyzer

NewLayoutAnalyzerWithConfig creates a LayoutAnalyzer with custom configuration. Note: UseAdaptiveSpacing defaults to false when using explicit configuration. Set it to true manually if you want adaptive spacing with custom settings.

func (*LayoutAnalyzer) WithDepositionMode

func (la *LayoutAnalyzer) WithDepositionMode() *LayoutAnalyzer

WithDepositionMode configures the analyzer for deposition transcript extraction. This uses tighter column detection and filters out line number columns.
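
Two ways to obtain deposition-tuned settings; that the two are exactly equivalent is an assumption based on the doc comments above:

la := docsaf.NewLayoutAnalyzer().WithDepositionMode()
// or construct explicitly from the predefined config:
la = docsaf.NewLayoutAnalyzerWithConfig(docsaf.DepositionLayoutConfig)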

type LayoutConfig

type LayoutConfig struct {
	ColumnGapThreshold  float64 // Minimum gap for column detection
	RowTolerance        float64 // Y tolerance for row grouping
	MinRowsForColumnPct int     // % of rows that must have gap for column detection
	FilterLineNumbers   bool    // Whether to filter out line number columns
}

LayoutConfig allows customization of layout analysis parameters.

func DefaultLayoutConfig

func DefaultLayoutConfig() LayoutConfig

DefaultLayoutConfig returns standard layout configuration.

type LineInfo

type LineInfo struct {
	Text           string
	TrimmedText    string
	Indent         int     // Leading whitespace count
	Length         int     // Trimmed length
	EndsWithPeriod bool    // Sentence ending
	IsBullet       bool    // Starts with bullet marker
	IsNumbered     bool    // Starts with number marker (1. or 1))
	IsShort        bool    // Significantly shorter than median
	IsEmpty        bool    // Whitespace only
	IsHeader       bool    // Appears to be a header
	FontSizeHint   float64 // Relative font size (1.0 = normal)
}

LineInfo holds analyzed information about a single line of text.

func AnalyzeLine

func AnalyzeLine(line string, medianLen int) LineInfo

AnalyzeLine extracts information about a line for paragraph detection.

type MarkdownProcessor

type MarkdownProcessor struct {
	// MinTokensPerSection is the minimum token count before splitting into a new section.
	// If 0, defaults to 500 tokens. Set to 1 to split on every heading (original behavior).
	MinTokensPerSection int
}

MarkdownProcessor processes Markdown (.md) and MDX (.mdx) content using goldmark. It chunks content into sections by headings and extracts YAML frontmatter. Sections are merged if they would be too small (under MinTokensPerSection tokens).
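
For example, to restore per-heading splitting, register a customized processor using the registry API shown in the README:

registry := docsaf.NewRegistry()
registry.Register(&docsaf.MarkdownProcessor{MinTokensPerSection: 1}) // split on every heading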

func (*MarkdownProcessor) CanProcess

func (mp *MarkdownProcessor) CanProcess(contentType, path string) bool

CanProcess returns true for markdown content types or .md/.mdx extensions.

func (*MarkdownProcessor) ExtractQuestions

func (mp *MarkdownProcessor) ExtractQuestions(path, sourceURL string, content []byte) []Question

ExtractQuestions extracts questions from markdown/MDX content. It looks for questions in:

 1. Frontmatter "questions" field
 2. <Questions> MDX components inline in the content
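
A sketch of frontmatter-based extraction, assuming the questions field is a plain list of strings:

content := []byte(`---
title: Install
questions:
  - How do I install the CLI?
---

# Install
`)

mp := &docsaf.MarkdownProcessor{}
for _, q := range mp.ExtractQuestions("docs/install.mdx", "https://example.com/docs/install", content) {
	fmt.Println(q.SourceType, q.Text) // e.g. "frontmatter How do I install the CLI?"
}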

func (*MarkdownProcessor) ExtractQuestionsWithSections

func (mp *MarkdownProcessor) ExtractQuestionsWithSections(path, sourceURL string, content []byte) []Question

ExtractQuestionsWithSections extracts questions with section path information.

func (*MarkdownProcessor) Process

func (mp *MarkdownProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)

Process processes markdown content and returns document sections.

type OpenAPIProcessor

type OpenAPIProcessor struct{}

OpenAPIProcessor processes OpenAPI specification content using libopenapi. It extracts API info, paths, and schemas as separate document sections.

func (*OpenAPIProcessor) CanProcess

func (op *OpenAPIProcessor) CanProcess(contentType, path string) bool

CanProcess returns true for .yaml, .yml, and .json files. Note: The content will only be processed if it's a valid OpenAPI v3 specification.

func (*OpenAPIProcessor) ExtractQuestions

func (op *OpenAPIProcessor) ExtractQuestions(path, sourceURL string, content []byte) ([]Question, error)

ExtractQuestions extracts x-docsaf-questions from OpenAPI extensions. It looks for questions at:

 1. Top-level document info
 2. Individual paths/operations
 3. Component schemas

func (*OpenAPIProcessor) Process

func (op *OpenAPIProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)

Process processes OpenAPI specification content and returns document sections. Returns an error if the content is not a valid OpenAPI v3 specification. Questions from x-docsaf-questions extensions are associated with their sections.

type PDFProcessor

type PDFProcessor struct {

	// EnableHeaderFooterDetection enables cross-page header/footer detection.
	// When enabled, the processor makes two passes: first to detect patterns,
	// then to extract text with headers/footers removed.
	EnableHeaderFooterDetection bool

	// EnableMirroredTextRepair enables automatic detection and repair of mirrored/reversed text.
	// Uses bigram frequency analysis to detect text that has been horizontally flipped.
	EnableMirroredTextRepair bool

	// ProgressFunc is called to report processing progress.
	// If nil, no progress is reported.
	ProgressFunc PDFProgressFunc

	// ProgressInterval controls how often ProgressFunc is called.
	// If 0, defaults to every 100 pages.
	ProgressInterval int
	// contains filtered or unexported fields
}

PDFProcessor processes PDF (.pdf) content using the ledongthuc/pdf library. It chunks content into sections by pages and extracts metadata from the PDF Info dictionary.

func (*PDFProcessor) CanProcess

func (pp *PDFProcessor) CanProcess(contentType, path string) bool

CanProcess returns true for PDF content types or .pdf extensions.

func (*PDFProcessor) Process

func (pp *PDFProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)

Process processes PDF content and returns document sections. Each page becomes a separate section, with text extracted via GetTextByRow() for better handling of tables and complex layouts.

type PDFProgress

type PDFProgress struct {
	Phase      string // "header_detection", "extraction"
	Page       int    // Current page number (1-based)
	TotalPages int    // Total pages in document
	FilePath   string // Path to the PDF file
}

PDFProgress reports progress during PDF processing.

type PDFProgressFunc

type PDFProgressFunc func(progress PDFProgress) error

PDFProgressFunc is called to report PDF processing progress. Return an error to abort processing.
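
A sketch wiring progress reporting into a PDFProcessor and registering it:

pp := &docsaf.PDFProcessor{
	EnableHeaderFooterDetection: true,
	ProgressInterval:            50, // report every 50 pages instead of the default 100
	ProgressFunc: func(p docsaf.PDFProgress) error {
		log.Printf("%s: %s, page %d/%d", p.FilePath, p.Phase, p.Page, p.TotalPages)
		return nil // returning a non-nil error would abort processing
	},
}

reg := docsaf.NewRegistry()
reg.Register(pp)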

type ParagraphConfig

type ParagraphConfig struct {
	// MinLineSpacingRatio: ratio of line spacing to median that indicates paragraph break
	// Default 1.5 means 50% more spacing than median triggers break
	MinLineSpacingRatio float64

	// MinIndentChars: minimum indentation (in characters) for first-line indent detection
	MinIndentChars int

	// DetectLists: whether to detect and preserve bullet/numbered lists
	DetectLists bool

	// DetectHeaders: whether to detect headers based on font size patterns
	DetectHeaders bool

	// PreserveBlankLines: keep existing blank lines as paragraph breaks
	PreserveBlankLines bool
}

ParagraphConfig configures enhanced paragraph detection behavior.

func DefaultParagraphConfig

func DefaultParagraphConfig() ParagraphConfig

DefaultParagraphConfig returns sensible defaults for paragraph detection.

type Processor

type Processor struct {
	// contains filtered or unexported fields
}

Processor processes content from any source using registered processors. It abstracts the traversal mechanism, allowing the same processing logic to work with filesystem, web, and other content sources.

func NewProcessor

func NewProcessor(source ContentSource, registry ProcessorRegistry) *Processor

NewProcessor creates a new processor. The source provides content items, and the registry provides processors to handle them.

func (*Processor) Process

func (p *Processor) Process(ctx context.Context) ([]DocumentSection, error)

Process traverses the source and processes all content items. Returns a slice of all extracted DocumentSections.

func (*Processor) ProcessWithCallback

func (p *Processor) ProcessWithCallback(ctx context.Context, callback func([]DocumentSection) error) error

ProcessWithCallback traverses the source and calls the callback for each batch of sections. This is useful for streaming large amounts of content without holding everything in memory.

func (*Processor) SetBaseURL

func (p *Processor) SetBaseURL(baseURL string)

SetBaseURL sets the base URL for generated links. This overrides the base URL from the source.

func (*Processor) SourceType

func (p *Processor) SourceType() string

SourceType returns the type of the underlying content source.

type ProcessorRegistry

type ProcessorRegistry interface {
	// Register adds a processor to the registry.
	Register(processor ContentProcessor)

	// GetProcessor returns the first processor that can handle the given content.
	// Returns nil if no processor can handle the content.
	GetProcessor(contentType, path string) ContentProcessor

	// Processors returns all registered processors.
	Processors() []ContentProcessor
}

ProcessorRegistry manages a collection of ContentProcessors.

func DefaultRegistry

func DefaultRegistry() ProcessorRegistry

DefaultRegistry creates a registry with all built-in processors registered. This includes MarkdownProcessor, OpenAPIProcessor, HTMLProcessor, and PDFProcessor.

func NewRegistry

func NewRegistry() ProcessorRegistry

NewRegistry creates a new empty processor registry. Use this to build a custom registry with only the processors you need.

func NewWholeFileRegistry

func NewWholeFileRegistry() ProcessorRegistry

NewWholeFileRegistry creates a registry with only the WholeFileProcessor. This processor returns each file's entire content as a single section without any chunking, allowing Antfly's internal chunking (e.g., Termite) to handle document segmentation during the embedding process.
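
A minimal sketch, assuming source is any ContentSource created elsewhere (filesystem, web, git, or S3):

// Defer all document segmentation to Antfly's embedding pipeline.
processor := docsaf.NewProcessor(source, docsaf.NewWholeFileRegistry())
sections, err := processor.Process(context.Background())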

type Question

type Question struct {
	// ID is a unique identifier for the question (generated from source + question text)
	ID string

	// Text is the question text itself
	Text string

	// SourcePath is the file path where the question was found
	SourcePath string

	// SourceURL is the URL to the source document (if available)
	SourceURL string

	// SourceType indicates where the question came from:
	// "frontmatter", "mdx_component", "openapi_info", "openapi_path", "openapi_schema"
	SourceType string

	// Context provides additional context about where the question appears
	// For MDX: section title or document title
	// For OpenAPI: operation ID, path, or schema name
	Context string

	// SectionPath is the heading hierarchy path where the question appears
	// (e.g., ["Getting Started", "Installation", "Prerequisites"])
	SectionPath []string

	// Metadata contains source-specific metadata
	Metadata map[string]any
}

Question represents a question extracted from documentation. Questions can come from MDX frontmatter, <Questions> MDX components, or x-docsaf-questions OpenAPI extensions.

func (*Question) ToDocument

func (q *Question) ToDocument() map[string]any

ToDocument converts a Question to a document map suitable for storage.

type QuestionsExtractor

type QuestionsExtractor struct{}

QuestionsExtractor extracts questions from various content sources.

func (*QuestionsExtractor) ExtractFromMDXContent

func (qe *QuestionsExtractor) ExtractFromMDXContent(path, sourceURL string, content []byte, frontmatter map[string]any) []Question

ExtractFromMDXContent extracts questions from MDX/Markdown content. It looks for:

 1. Questions in frontmatter (questions: [...])
 2. <Questions> MDX components in the content

func (*QuestionsExtractor) ExtractFromOpenAPI

func (qe *QuestionsExtractor) ExtractFromOpenAPI(path, sourceURL, sourceType, context string, extensions map[string]any) []Question

ExtractFromOpenAPI extracts x-docsaf-questions from OpenAPI extensions. The extensions map should contain the questions as a string array.

type S3Source

type S3Source struct {
	// contains filtered or unexported fields
}

S3Source traverses objects in an S3-compatible bucket and yields content items.

func NewS3Source

func NewS3Source(config S3SourceConfig) (*S3Source, error)

NewS3Source creates a new S3 content source.

func (*S3Source) BaseURL

func (s *S3Source) BaseURL() string

BaseURL returns the base URL for this source. If not configured, returns an s3:// URL.

func (*S3Source) Traverse

func (s *S3Source) Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)

Traverse lists objects in the S3 bucket and yields content items for all matching objects. It returns a channel of ContentItems and a channel for errors.

func (*S3Source) Type

func (s *S3Source) Type() string

Type returns "s3" as the source type.

type S3SourceConfig

type S3SourceConfig struct {
	// Credentials holds S3/MinIO connection credentials.
	// Supports keystore syntax and environment variable fallbacks.
	Credentials s3.Credentials

	// Bucket is the S3 bucket name (required).
	Bucket string

	// Prefix is an optional key prefix to filter objects.
	// Only objects with this prefix will be listed.
	// Example: "docs/" to only process objects in the docs/ folder.
	Prefix string

	// BaseURL is the base URL for generating document links (optional).
	// If empty, URLs will use the s3:// scheme.
	BaseURL string

	// IncludePatterns is a list of glob patterns to include.
	// Objects matching any include pattern will be processed.
	// If empty, all objects are included (subject to exclude patterns).
	// Supports ** wildcards for recursive matching.
	// Patterns are matched against the object key (with prefix stripped if configured).
	IncludePatterns []string

	// ExcludePatterns is a list of glob patterns to exclude.
	// Objects matching any exclude pattern will be skipped.
	// Supports ** wildcards for recursive matching.
	// Patterns are matched against the object key (with prefix stripped if configured).
	ExcludePatterns []string

	// Concurrency controls how many S3 GetObject requests run in parallel.
	// Default: 5
	Concurrency int
}

S3SourceConfig holds configuration for an S3Source.
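
A sketch of processing Markdown and PDF objects from a bucket; the bucket name is hypothetical, and Credentials is left zero-valued to rely on the documented environment variable fallbacks:

source, err := docsaf.NewS3Source(docsaf.S3SourceConfig{
	Bucket:          "my-docs-bucket", // hypothetical
	Prefix:          "docs/",
	BaseURL:         "https://example.com/docs",
	IncludePatterns: []string{"**/*.md", "**/*.pdf"},
	Concurrency:     5,
})
if err != nil {
	log.Fatal(err)
}

processor := docsaf.NewProcessor(source, docsaf.DefaultRegistry())
sections, err := processor.Process(context.Background())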

type Table

type Table struct {
	X, Y          float64
	Width, Height float64
	Rows          int
	Cols          int
	Cells         [][]TableCell
}

Table represents a detected table structure.

type TableCell

type TableCell struct {
	Row, Col      int
	X, Y          float64
	Width, Height float64
	Text          string
}

TableCell represents a cell in a detected table.

type TextBlock

type TextBlock struct {
	X, Y          float64
	Width, Height float64
	Text          string
	FontSize      float64
	Chars         []pdf.Text // Original characters
}

TextBlock represents a block of text with position and content.

type TextRepair

type TextRepair struct {
	// Configuration
	HeaderFooterMargin float64 // Percentage of page height to consider as header/footer region (0.1 = 10%)
	MinPagesSeen       int     // Minimum pages to analyze before detecting headers/footers
	// contains filtered or unexported fields
}

TextRepair provides utilities for detecting and fixing common PDF text extraction issues.

func NewTextRepair

func NewTextRepair() *TextRepair

NewTextRepair creates a new TextRepair with sensible defaults.

func (*TextRepair) AutoDecodePUA

func (tr *TextRepair) AutoDecodePUA(text string) (decoded string, description string)

AutoDecodePUA attempts to automatically decode PUA-preserved bytes. Returns the decoded text and a description of the decoding applied.

func (*TextRepair) AutoDecodeText

func (tr *TextRepair) AutoDecodeText(text string) (decoded string, fixed string)

AutoDecodeText automatically detects and decodes text with encoding issues. Returns the decoded text and a description of what was fixed.

func (*TextRepair) AutoRepairMirroredText

func (tr *TextRepair) AutoRepairMirroredText(text string) (string, bool)

AutoRepairMirroredText detects and repairs mirrored text if confidence is high enough. Returns the repaired text and whether repair was applied.

func (*TextRepair) CalculateLineEntropy

func (tr *TextRepair) CalculateLineEntropy(line string) float64

CalculateLineEntropy calculates Shannon entropy of a line of text. Higher entropy indicates more randomness (potential garbled content).

func (*TextRepair) DecodeShiftedText

func (tr *TextRepair) DecodeShiftedText(text string, shift int) string

DecodeShiftedText decodes text that has been shifted by the specified amount.

func (*TextRepair) DecodeSymbolSubstitution

func (tr *TextRepair) DecodeSymbolSubstitution(text string, substMap map[rune]rune) string

DecodeSymbolSubstitution applies symbol substitution decoding to text.

func (*TextRepair) DetectDepositionLayout

func (tr *TextRepair) DetectDepositionLayout(texts []pdf.Text) bool

DetectDepositionLayout checks if a page appears to be a deposition transcript. Deposition transcripts have:

  - Line numbers 1-25 (or similar) in a narrow left column
  - Consistent line spacing
  - Q: and A: question/answer format

func (*TextRepair) DetectEncodedPattern

func (tr *TextRepair) DetectEncodedPattern(text string) (patternType string, description string)

DetectEncodedPattern checks if a string matches a pattern that suggests it's an encoded version of a known format (like case numbers). Returns the detected pattern type and a description.

func (*TextRepair) DetectEncodingShift

func (tr *TextRepair) DetectEncodingShift(text string) (shift int, confidence float64)

DetectEncodingShift analyzes text to detect if it uses a shifted alphabet encoding. Returns the detected shift (0-25) and a confidence score (0.0-1.0). A shift of 0 means no encoding detected or text is normal.
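
A sketch pairing detection with DecodeShiftedText, assuming text is a previously extracted string and that the detected shift feeds straight into the decoder; the 0.8 confidence cutoff is an assumption, not a documented threshold:

tr := docsaf.NewTextRepair()
if shift, confidence := tr.DetectEncodingShift(text); shift != 0 && confidence > 0.8 {
	text = tr.DecodeShiftedText(text, shift) // illustrative pairing of detect and decode
}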

func (*TextRepair) DetectFontEncodingCorruption

func (tr *TextRepair) DetectFontEncodingCorruption(text string) float64

DetectFontEncodingCorruption checks if text appears to be using a corrupted or non-standard font encoding. This happens when PDF fonts have custom glyph mappings that don't match standard character codes.

Characteristics of font-encoding corruption:

  - Text looks like random letters but has structure (same length as expected)
  - Unusual mix of uppercase/lowercase in patterns that don't match English
  - High proportion of consonant clusters that are phonetically impossible
  - No recognizable words despite looking like text

func (*TextRepair) DetectMirroredText

func (tr *TextRepair) DetectMirroredText(text string) float64

DetectMirroredText analyzes text to detect if it appears to be reversed/mirrored. Returns a confidence score (0.0-1.0) where higher values indicate more likely mirroring.

func (*TextRepair) DetectPUAShift

func (tr *TextRepair) DetectPUAShift(text string) (shift int, confidence float64)

DetectPUAShift analyzes text with PUA characters to detect the encoding shift. Returns the best shift value and confidence score based on resulting English frequency.

func (*TextRepair) DetectSymbolSubstitution

func (tr *TextRepair) DetectSymbolSubstitution(text string) (map[rune]rune, float64)

DetectSymbolSubstitution checks if text uses symbol-to-letter substitution. Returns the substitution map and confidence score.

func (*TextRepair) FilterFontEncodingCorruption

func (tr *TextRepair) FilterFontEncodingCorruption(text string) string

FilterFontEncodingCorruption removes or replaces lines with severe font encoding issues. Less severe issues are left in place; MarkFontEncodingIssues can be used to flag those instead.

func (*TextRepair) FilterLineNumberColumn

func (tr *TextRepair) FilterLineNumberColumn(texts []pdf.Text) []pdf.Text

FilterLineNumberColumn removes the leftmost column if it contains only line numbers. It identifies line numbers (1-25) in the left margin and filters them out along with any other content in that narrow column, preserving the main text content.

func (*TextRepair) FilterNoiseLines

func (tr *TextRepair) FilterNoiseLines(text string) string

FilterNoiseLines removes lines detected as garbled/corrupted content.

func (*TextRepair) GetDetectedFooters

func (tr *TextRepair) GetDetectedFooters() []string

GetDetectedFooters returns footers detected across multiple pages.

func (*TextRepair) GetDetectedHeaders

func (tr *TextRepair) GetDetectedHeaders() []string

GetDetectedHeaders returns headers detected across multiple pages. A line is considered a header if it appears (with edit distance tolerance) on most pages.

func (*TextRepair) IsFontEncodingCorrupted

func (tr *TextRepair) IsFontEncodingCorrupted(text string) bool

IsFontEncodingCorrupted returns true if text appears to have font encoding issues.

func (*TextRepair) IsNoiseLine

func (tr *TextRepair) IsNoiseLine(line string) bool

IsNoiseLine detects if a line is likely garbled/corrupted content. Uses entropy and character pattern analysis.

func (*TextRepair) MarkFontEncodingIssues

func (tr *TextRepair) MarkFontEncodingIssues(text string) string

MarkFontEncodingIssues wraps text that appears to have font encoding issues with markers for downstream processing or flagging.

func (*TextRepair) RecordPageContent

func (tr *TextRepair) RecordPageContent(pageText string)

RecordPageContent records the first and last lines of a page for pattern detection.

func (*TextRepair) RemoveHeadersFooters

func (tr *TextRepair) RemoveHeadersFooters(pageText string, headers, footers []string) string

RemoveHeadersFooters removes detected headers and footers from page text.
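
RecordPageContent, GetDetectedHeaders, GetDetectedFooters, and RemoveHeadersFooters combine into a two-pass cleanup. A sketch, assuming pageTexts holds the raw text of each page (PDFProcessor automates this when EnableHeaderFooterDetection is set):

tr := docsaf.NewTextRepair()

// Pass 1: record each page so repeated first/last lines can be detected.
for _, page := range pageTexts {
	tr.RecordPageContent(page)
}

// Pass 2: strip the detected headers and footers from every page.
headers, footers := tr.GetDetectedHeaders(), tr.GetDetectedFooters()
for i, page := range pageTexts {
	pageTexts[i] = tr.RemoveHeadersFooters(page, headers, footers)
}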

func (*TextRepair) RemoveInterleavedReplacements

func (tr *TextRepair) RemoveInterleavedReplacements(text string) string

RemoveInterleavedReplacements removes U+FFFD replacement characters that appear to be interleaved with real text (pattern: char, FFFD, char, FFFD, ...). This fixes PDFs where font encoding issues produce patterns like "C·O·N·F·I·D·E·N·T·I·A·L", with each · standing in for U+FFFD.

func (*TextRepair) RepairMirroredText

func (tr *TextRepair) RepairMirroredText(text string) string

RepairMirroredText reverses text that has been detected as mirrored. It can operate at word level or full text level.

func (*TextRepair) RepairMisspelledWords

func (tr *TextRepair) RepairMisspelledWords(text string) string

RepairMisspelledWords uses edit distance to correct likely OCR errors. Conservative approach: only fixes words with small edit distance to known words.

func (*TextRepair) SegmentWords

func (tr *TextRepair) SegmentWords(text string) string

SegmentWords uses dynamic programming to find optimal word boundaries in merged text. This handles PDFs with zero-gap character positioning that results in merged words like "UNITEDSTATESDISTRICTCOURT" -> "UNITED STATES DISTRICT COURT".
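
A sketch using the description's own example:

tr := docsaf.NewTextRepair()
fmt.Println(tr.SegmentWords("UNITEDSTATESDISTRICTCOURT"))
// per the description above: "UNITED STATES DISTRICT COURT"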

type WebSource

type WebSource struct {
	// contains filtered or unexported fields
}

WebSource crawls websites and yields content items.

func NewWebSource

func NewWebSource(config WebSourceConfig) (*WebSource, error)

NewWebSource creates a new web content source.

func (*WebSource) BaseURL

func (ws *WebSource) BaseURL() string

BaseURL returns the base URL for this source.

func (*WebSource) CacheStats

func (ws *WebSource) CacheStats() *CacheStats

CacheStats returns statistics about the cache. Returns nil if caching is not enabled.

func (*WebSource) ClearCache

func (ws *WebSource) ClearCache() error

ClearCache removes all entries from the cache.

func (*WebSource) IsCached

func (ws *WebSource) IsCached(url string) bool

IsCached checks if a URL is in the cache.

func (*WebSource) Traverse

func (ws *WebSource) Traverse(ctx context.Context) (<-chan ContentItem, <-chan error)

Traverse crawls the website and yields content items.

func (*WebSource) Type

func (ws *WebSource) Type() string

Type returns "web" as the source type.

type WebSourceConfig

type WebSourceConfig struct {
	// StartURL is the starting URL to crawl (required)
	StartURL string

	// BaseURL is the base URL for generating document links.
	// If empty, it will be derived from StartURL.
	BaseURL string

	// AllowedDomains restricts crawling to these domains.
	// If empty, only the domain from StartURL is allowed.
	AllowedDomains []string

	// IncludePatterns is a list of glob patterns for URL paths to include.
	// If empty, all paths are included (subject to exclude patterns).
	// Patterns match against the URL path (e.g., "/docs/**", "/guides/*")
	IncludePatterns []string

	// ExcludePatterns is a list of glob patterns for URL paths to exclude.
	// Default excludes common non-content paths.
	ExcludePatterns []string

	// MaxDepth is the maximum crawl depth (0 = unlimited).
	MaxDepth int

	// MaxPages is the maximum number of pages to crawl (0 = unlimited).
	MaxPages int

	// Concurrency is the number of concurrent requests (default: 2).
	Concurrency int

	// RequestDelay is the delay between requests (default: 100ms).
	RequestDelay time.Duration

	// UserAgent is the User-Agent string to use for requests.
	UserAgent string

	// UseSitemap enables sitemap-based crawling.
	// When enabled, the crawler will first fetch and parse the sitemap
	// to discover URLs before following links.
	UseSitemap bool

	// SitemapURL is the URL of the sitemap (optional).
	// If empty and UseSitemap is true, it will try /sitemap.xml.
	SitemapURL string

	// SitemapOnly restricts crawling to URLs found in the sitemap only.
	// When true, link discovery is disabled.
	SitemapOnly bool

	// RespectRobotsTxt enables robots.txt parsing (default: true).
	RespectRobotsTxt bool

	// MaxRetries is the number of retry attempts for failed requests (default: 3).
	MaxRetries int

	// RetryDelay is the base delay for exponential backoff retry (default: 1s).
	// The actual delay doubles with each retry: 1s, 2s, 4s, etc.
	RetryDelay time.Duration

	// CacheTTL is how long to cache responses (default: 0 = disabled).
	// Set to a positive duration to enable caching.
	CacheTTL time.Duration

	// CacheMaxItems is the maximum number of items to cache (default: 1000).
	CacheMaxItems int

	// CacheDir is the directory for persistent cache storage.
	// If empty, only in-memory caching is used.
	CacheDir string

	// CacheRespectHeaders enables HTTP cache header parsing (default: true when caching enabled).
	// When enabled, Cache-Control, ETag, and Last-Modified are respected.
	CacheRespectHeaders bool

	// CacheDeduplication enables content hash deduplication (default: true when caching enabled).
	// Identical content from different URLs will be stored only once.
	CacheDeduplication bool

	// NormalizeURLs enables URL normalization for deduplication (default: true).
	// Includes lowercasing host, removing default ports, removing trailing slashes.
	NormalizeURLs bool
}

WebSourceConfig holds configuration for a WebSource.
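
For example, a sketch of a cache-enabled crawl; the cache directory is a hypothetical path:

source, err := docsaf.NewWebSource(docsaf.WebSourceConfig{
	StartURL:      "https://docs.example.com",
	CacheTTL:      24 * time.Hour,  // a positive TTL enables caching
	CacheDir:      ".docsaf-cache", // hypothetical; persists the cache across runs
	CacheMaxItems: 5000,
})
if err != nil {
	log.Fatal(err)
}

// ... crawl via a Processor as usual, then inspect the cache:
if stats := source.CacheStats(); stats != nil {
	fmt.Printf("cache: %+v\n", *stats)
}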

type WholeFileProcessor

type WholeFileProcessor struct{}

WholeFileProcessor processes content by returning it as a single section without any chunking. This is useful when you want Antfly's internal chunking (e.g., Termite) to handle document segmentation.

func (*WholeFileProcessor) CanProcess

func (wfp *WholeFileProcessor) CanProcess(contentType, path string) bool

CanProcess returns true for common text-based file types.

func (*WholeFileProcessor) Process

func (wfp *WholeFileProcessor) Process(path, sourceURL, baseURL string, content []byte) ([]DocumentSection, error)

Process returns the entire content as a single DocumentSection.
