providers

package
v0.2.3 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 11, 2025 License: MIT Imports: 32 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AddPageContextToPrompt

func AddPageContextToPrompt(filename, originalPrompt string) string

AddPageContextToPrompt enhances the prompt with page-specific context for multi-page documents. It extracts page information from filenames with format "document.pdf-page-2-of-5" or "document.pdf-page-2" and adds appropriate context to help the OCR provider understand the document structure and place headings.

func BuildPrompt

func BuildPrompt(base, appendPrompt, filename, format string) string

func DefaultMessageFormatter

func DefaultMessageFormatter(prefix string) func(inProgress, completed, failed, total int64) string

DefaultMessageFormatter creates a default message formatter with the given prefix

func NewExpandSinglePdfStage

func NewExpandSinglePdfStage(service *ProviderService) fluxus.StageFunc[*PipelineItem, []*PipelineItem]

NewExpandSinglePdfStage creates a PDF expansion stage with the given provider

func NewGrayScaleStage

func NewGrayScaleStage(processor image.GrayScaleProcessor) fluxus.StageFunc[*PipelineItem, *PipelineItem]

NewGrayScaleStage creates a grayscale processing stage with the given processor

func NewLastStageProgressStage

func NewLastStageProgressStage(progress *ProgressTracker) fluxus.StageFunc[*PipelineItem, *PipelineItem]

func NewSingleInputOCRStageWithProgress

func NewSingleInputOCRStageWithProgress(ocrFunc ocrFunc, progress *ProgressTracker) fluxus.StageFunc[*PipelineItem, *BatchResult]

func NewTextCorrectionStage

func NewTextCorrectionStage(textCorrectionFunc textCorrectionFunc, progress *ProgressTracker) fluxus.StageFunc[*OCRResult, *OCRResult]

NewTextCorrectionStage creates a text correction stage for post-processing OCR results

func StartOCRPipeline

func StartOCRPipeline(ops []imageio.ImageIO, service *ProviderService) error

StartOCRPipeline orchestrates the OCR workflow. It accepts a list of imageIO operations and the provider service used to run OCR on them.

func WithDoclingBaseURL

func WithDoclingBaseURL(baseURL string) func(*DoclingClient)

Types

type BatchResult

type BatchResult struct {
	Item   *PipelineItem
	Result *OCRResult
	Error  error
}

BatchResult holds the processed result and its corresponding input item, used for stitching results back together in case of PDF files.

func ProcessBatch

func ProcessBatch(ctx context.Context, items []*PipelineItem, ocrFunc ocrFunc, concurrency int, progress *ProgressTracker) ([]*BatchResult, error)

ProcessBatch processes a collection of pipeline items concurrently using a go-fluxus pipeline, with progress tracking built in. The 'concurrency' arg controls the throughput of the worker pool. The 'ocrFunc' arg is the function executed for each item. Note: this comment also describes a 'limiter' arg used to rate limit the OCR calls, but no such parameter appears in the current signature.

type Config

type Config struct {
	OCR            ProviderConfig       `yaml:"ocr"`
	Pipeline       PipelineConfig       `yaml:"pipeline"`
	RateLimit      RateLimitConfig      `yaml:"rate_limit"`
	TextCorrection TextCorrectionConfig `yaml:"text_correction"`

	// Provider-specific options that implement the ProviderOptionsInterface
	DoclingOptions *DoclingOptions `yaml:"docling_options,omitempty"`
	OpenAIOptions  *OpenAIOptions  `yaml:"openai_options,omitempty"`
	OllamaOptions  *OllamaOptions  `yaml:"ollama_options,omitempty"`
}

type Configurable

type Configurable interface {
	GetConfig() Config
}

type DoclingCliClient

type DoclingCliClient struct {
	Available  bool
	BinaryPath string
}

DoclingCliClient holds the docling CLI information

func NewDoclingCliClient

func NewDoclingCliClient() *DoclingCliClient

NewDoclingCliClient creates a new CLI client and checks if docling CLI is available

func (*DoclingCliClient) IsAvailable

func (c *DoclingCliClient) IsAvailable() bool

func (*DoclingCliClient) ProcessFile

func (c *DoclingCliClient) ProcessFile(ctx context.Context, fileBytes []byte, filename string, options map[string]string, outputDir string) (*DoclingConvertDocumentResponse, error)

ProcessFile processes a file using the docling CLI

type DoclingClient

type DoclingClient struct {
	Client  *http.Client
	BaseURL string
}

func NewDoclingClient

func NewDoclingClient(opts ...func(*DoclingClient)) *DoclingClient

func (*DoclingClient) HealthCheck

func (d *DoclingClient) HealthCheck(ctx context.Context) error

func (*DoclingClient) ProcessFile

func (d *DoclingClient) ProcessFile(ctx context.Context, imageBytes []byte, filename string, options map[string]string) (*DoclingConvertDocumentResponse, error)

type DoclingConvertDocumentResponse

type DoclingConvertDocumentResponse struct {
	Document       DoclingDocumentResponse `json:"document"`
	Status         string                  `json:"status"` // "success", "partial_success", "skipped", "failure"
	Errors         []any                   `json:"errors"`
	ProcessingTime float64                 `json:"processing_time"`
	Timings        map[string]any          `json:"timings"`
}

type DoclingDocumentResponse

type DoclingDocumentResponse struct {
	Filename       string          `json:"filename"`
	MDContent      string          `json:"md_content"`
	JSONContent    json.RawMessage `json:"json_content"`
	HTMLContent    string          `json:"html_content"`
	TextContent    string          `json:"text_content"`
	DocTagsContent string          `json:"doctags_content"`
}

type DoclingHealthResponse

type DoclingHealthResponse struct {
	Status string `json:"status"`
}

type DoclingOptions

type DoclingOptions struct {
	OCR       *bool `yaml:"do_ocr,omitempty"`
	Force_OCR *bool `yaml:"force_ocr,omitempty"`

	Pipeline string `yaml:"pipeline,omitempty"`

	PDF_Backend      string `yaml:"pdf_backend,omitempty"`
	Abort_On_Error   *bool  `yaml:"abort_on_error,omitempty"`
	Document_Timeout string `yaml:"document_timeout,omitempty"`
	Num_Threads      string `yaml:"num_threads,omitempty"`
	Device           string `yaml:"device,omitempty"`
	Verbose          string `yaml:"verbose,omitempty"`
}

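DoclingOptions holds the docling-specific options. Boolean options are declared as *bool so they have three states (unset, true, false); this works around Go's inability to distinguish an unset bool field from one explicitly set to false.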

func (*DoclingOptions) Apply

func (d *DoclingOptions) Apply(defaults any, config Config) (any, error)

Apply merges schema options with defaults and returns map[string]string

type DoclingProcessPayload

type DoclingProcessPayload struct {
	FileBytes []byte
	Filename  string
	Options   map[string]string
}

type DoclingProvider

type DoclingProvider struct {
	Client    *DoclingClient
	CliClient *DoclingCliClient
	Config    Config
}

DoclingProvider implements the OCRProvider interface

func (*DoclingProvider) ApplyOptions

func (p *DoclingProvider) ApplyOptions() (map[string]string, error)

ApplyOptions merges the schema.yml docling-specific options with the default options.

func (*DoclingProvider) GetConfig

func (p *DoclingProvider) GetConfig() Config

func (*DoclingProvider) InputToMessages

func (p *DoclingProvider) InputToMessages(input OCRInput, ctx context.Context) (*DoclingProcessPayload, error)

func (*DoclingProvider) OCR

func (p *DoclingProvider) OCR(ctx context.Context, input OCRInput) (*OCRResult, error)

func (*DoclingProvider) SupportsPDF

func (p *DoclingProvider) SupportsPDF() bool

func (*DoclingProvider) WithImage

func (p *DoclingProvider) WithImage(ctx context.Context, input OCRInput) (*DoclingProcessPayload, error)

func (*DoclingProvider) WithPDF

type GeminiProvider

type GeminiProvider struct {
	// contains filtered or unexported fields
}

func (*GeminiProvider) Complete

func (g *GeminiProvider) Complete(ctx context.Context, text string) (string, error)

func (*GeminiProvider) GetConfig

func (g *GeminiProvider) GetConfig() Config

func (*GeminiProvider) InputToMessages

func (g *GeminiProvider) InputToMessages(input OCRInput) ([]*genai.Part, error)

func (*GeminiProvider) OCR

func (g *GeminiProvider) OCR(ctx context.Context, input OCRInput) (*OCRResult, error)

type InputType

type InputType int
const (
	InputTypeImage InputType = iota
	InputTypePDF
)

type MistralOcrDimensions

type MistralOcrDimensions struct {
	DPI    int `json:"dpi"`
	Height int `json:"height"`
	Width  int `json:"width"`
}

type MistralOcrDocument

type MistralOcrDocument struct {
	ImageURL     string `json:"image_url,omitempty"`
	DocumentURL  string `json:"document_url,omitempty"`
	DocumentName string `json:"document_name,omitempty"`
	Type         string `json:"type"`
}

type MistralOcrImage

type MistralOcrImage struct {
	ID           string `json:"id"`
	TopLeftX     int    `json:"top_left_x"`
	TopLeftY     int    `json:"top_left_y"`
	BottomRightX int    `json:"bottom_right_x"`
	BottomLeftX  int    `json:"bottom_left_x"`
	ImageBase64  string `json:"image_base64"`
}

type MistralOcrPage

type MistralOcrPage struct {
	Index      int                  `json:"index"`
	Markdown   string               `json:"markdown"`
	Images     []MistralOcrImage    `json:"images"`
	Dimensions MistralOcrDimensions `json:"dimensions"`
}

type MistralOcrResponse

type MistralOcrResponse struct {
	Pages     []MistralOcrPage `json:"pages"`
	Model     string           `json:"model"`
	UsageInfo map[string]any   `json:"usage_info"`
}

type MistralOcrResquest

type MistralOcrResquest struct {
	Model              string             `json:"model"`
	ID                 string             `json:"id,omitempty"`
	Document           MistralOcrDocument `json:"document"`
	Pages              []int              `json:"pages,omitempty"`
	IncludeImageBase64 bool               `json:"include_image_base64"`
	ImageLimit         int                `json:"image_limit,omitempty"`
	ImageMinSize       int                `json:"image_min_size,omitempty"`
}

MistralOcrResquest is the request payload for a Mistral single-image OCR call.

type MistralProvider

type MistralProvider struct {
	// contains filtered or unexported fields
}

MistralProvider implements the Provider Interface

func (*MistralProvider) GetConfig

func (m *MistralProvider) GetConfig() Config

func (*MistralProvider) InputToMessages

func (m *MistralProvider) InputToMessages(input OCRInput) (MistralOcrResquest, error)

func (*MistralProvider) OCR

func (m *MistralProvider) OCR(ctx context.Context, input OCRInput) (*OCRResult, error)

func (*MistralProvider) SupportsPDF

func (m *MistralProvider) SupportsPDF() bool

type OCRImage

type OCRImage struct {
	MistralImages []MistralOcrImage
}

type OCRInput

type OCRInput struct {
	Type     InputType
	Image    image.Image
	PDFData  []byte
	Filename string
}

func MapToOCRInput

func MapToOCRInput(ops []imageio.ImageIO) ([]*OCRInput, map[int]int, error)

MapToOCRInput loads files from the given imageIO operations and returns a list of OCRInputs and a mapping between the OCRInputs and the original imageIO operations.

type OCRProvider

type OCRProvider interface {
	OCR(ctx context.Context, input OCRInput) (*OCRResult, error)
}

func NewDoclingProvider

func NewDoclingProvider(config Config) (OCRProvider, error)

func NewGeminiProvider

func NewGeminiProvider(config Config) (OCRProvider, error)

func NewMistralProvider

func NewMistralProvider(config Config) (OCRProvider, error)

func NewOCRProvider

func NewOCRProvider(config Config) (OCRProvider, error)

func NewOllamaProvider

func NewOllamaProvider(config Config) (OCRProvider, error)

func NewOpenAIProvider

func NewOpenAIProvider(config Config) (OCRProvider, error)

NewOpenAIProvider creates a new OpenAI provider

func NewProvider

func NewProvider(providerName, model string, config Config) (OCRProvider, error)

func NewTesseractProvider

func NewTesseractProvider(config Config) (OCRProvider, error)

type OCRResult

type OCRResult struct {
	Text     string
	Images   OCRImage
	Metadata map[string]string
}

func MistralToOCRResult

func MistralToOCRResult(res *MistralOcrResponse) (*OCRResult, error)

MistralToOCRResult maps the external MistralOcrResponse into our internal OCRResult

type OllamaMessage

type OllamaMessage struct {
	Role    string   `json:"role"`
	Content string   `json:"content"`
	Images  []string `json:"images,omitempty"` // Array of base64 encoded images
}

type OllamaOptions

type OllamaOptions struct {
	MaxTokens   int64   `yaml:"max_tokens"`
	Temperature float64 `yaml:"temperature"`
}

func (*OllamaOptions) Apply

func (o *OllamaOptions) Apply(defaults any, config Config) (any, error)

type OllamaProvider

type OllamaProvider struct {
	// contains filtered or unexported fields
}

OllamaProvider implements the Provider Interface

func (*OllamaProvider) ApplyOptions

func (o *OllamaProvider) ApplyOptions() (*OllamaOptions, error)

ApplyOptions merges the schema.yml ollama specific options with the default options

func (*OllamaProvider) Complete

func (o *OllamaProvider) Complete(ctx context.Context, text string) (string, error)

func (*OllamaProvider) GetConfig

func (o *OllamaProvider) GetConfig() Config

func (*OllamaProvider) OCR

func (o *OllamaProvider) OCR(ctx context.Context, input OCRInput) (*OCRResult, error)

func (*OllamaProvider) SupportsPDF

func (o *OllamaProvider) SupportsPDF() bool

type OllamaRequest

type OllamaRequest struct {
	Model    string          `json:"model"`
	Messages []OllamaMessage `json:"messages"`
	Stream   bool            `json:"stream"`
	Options  map[string]any  `json:"options"`
}

type OllamaResponse

type OllamaResponse struct {
	Model     string `json:"model"`
	CreatedAt string `json:"created_at"`
	Message   struct {
		Role    string `json:"role"`
		Content string `json:"content"`
	} `json:"message"`
	Done          bool  `json:"done"`
	TotalDuration int64 `json:"total_duration"`
}

type OpenAIOptions

type OpenAIOptions struct {
	MaxTokens   int64   `yaml:"max_tokens"`
	Temperature float64 `yaml:"temperature"`
}

func (*OpenAIOptions) Apply

func (o *OpenAIOptions) Apply(defaults any, config Config) (any, error)

type OpenAIProvider

type OpenAIProvider struct {
	// contains filtered or unexported fields
}

OpenAIProvider implements the OCRProvider interface

func (*OpenAIProvider) ApplyOptions

func (o *OpenAIProvider) ApplyOptions() (*OpenAIOptions, error)

ApplyOptions merges the schema.yml openai specific options with the default options

func (*OpenAIProvider) Complete

func (o *OpenAIProvider) Complete(ctx context.Context, text string) (string, error)

func (*OpenAIProvider) GetConfig

func (o *OpenAIProvider) GetConfig() Config

func (*OpenAIProvider) InputToMessages

func (o *OpenAIProvider) InputToMessages(input OCRInput) ([]openai.ChatCompletionMessageParamUnion, error)

func (*OpenAIProvider) OCR

func (o *OpenAIProvider) OCR(ctx context.Context, input OCRInput) (*OCRResult, error)

OCR OCRs a single image and returns an OCRResult

func (*OpenAIProvider) SupportsPDF

func (o *OpenAIProvider) SupportsPDF() bool

func (*OpenAIProvider) WithImage

func (o *OpenAIProvider) WithImage(base64Image string, prompt string) openai.ChatCompletionMessageParamUnion

func (*OpenAIProvider) WithPDF

func (o *OpenAIProvider) WithPDF(base64PDF string, prompt string) openai.ChatCompletionMessageParamUnion

type PDFCapable

type PDFCapable interface {
	SupportsPDF() bool
}

type PipelineConfig

type PipelineConfig struct {
	DPI            float64 `yaml:"dpi"`
	Concurrency    int     `yaml:"concurrency"`
	OCRConcurrency int     `yaml:"ocr_concurrency"`
}

type PipelineItem

type PipelineItem struct {
	Input         *OCRInput
	OriginalIndex int
	PageIndex     int // -1 for non-PDF pages or single-page docs
}

PipelineItem tracks an item through the processing pipeline. It maps expanded items (like PDF pages) back to their original source file.

type ProgressTracker

type ProgressTracker struct {
	// contains filtered or unexported fields
}

ProgressTracker handles progress tracking with atomic counters and periodic updates

func NewProgressTracker

func NewProgressTracker(config ProgressTrackerConfig) *ProgressTracker

NewProgressTracker creates a new progress tracker

func WithCustomProgress

func WithCustomProgress(total int, formatter func(inProgress, completed, failed, total int64) string) *ProgressTracker

WithCustomProgress creates a progress tracker with custom message formatter

func WithGenericProgress

func WithGenericProgress(total int) *ProgressTracker

WithGenericProgress creates a progress tracker with default settings

func WithPrefixProgress

func WithPrefixProgress(total int, prefix string) *ProgressTracker

WithPrefixProgress creates a progress tracker with a custom prefix

func (*ProgressTracker) AppendToPrefix

func (pt *ProgressTracker) AppendToPrefix(suffix string)

AppendToPrefix concatenates a string to the current prefix (only works with default formatter)

func (*ProgressTracker) GetCounters

func (pt *ProgressTracker) GetCounters() (inProgress, completed, failed, total int64)

func (*ProgressTracker) GetPrefix

func (pt *ProgressTracker) GetPrefix() string

GetPrefix returns the current prefix (only meaningful with default formatter)

func (*ProgressTracker) IncrementCompleted

func (pt *ProgressTracker) IncrementCompleted()

func (*ProgressTracker) IncrementFailed

func (pt *ProgressTracker) IncrementFailed()

func (*ProgressTracker) IncrementInProgress

func (pt *ProgressTracker) IncrementInProgress()

func (*ProgressTracker) IsComplete

func (pt *ProgressTracker) IsComplete() bool

func (*ProgressTracker) SetPrefix

func (pt *ProgressTracker) SetPrefix(newPrefix string)

SetPrefix updates the prefix for the progress tracker (only works with default formatter)

func (*ProgressTracker) SetTotal

func (pt *ProgressTracker) SetTotal(newTotal int64)

func (*ProgressTracker) Start

func (pt *ProgressTracker) Start()

Start begins the progress tracking with periodic updates

func (*ProgressTracker) Stop

func (pt *ProgressTracker) Stop(message string)

type ProgressTrackerConfig

type ProgressTrackerConfig struct {
	Total            int64
	UpdateFrequency  time.Duration
	MessageFormatter func(inProgress, completed, failed, total int64) string
	Prefix           string
}

type ProviderConfig

type ProviderConfig struct {
	Provider     string `yaml:"provider"`
	Model        string `yaml:"model"`
	Prompt       string `yaml:"prompt"`
	AppendPrompt string `yaml:"append_prompt"`
	Language     string `yaml:"language"`
	Format       string `yaml:"format"`
	SupportsPDF  bool   `yaml:"supports_pdf"`
}

type ProviderOptionsInterface

type ProviderOptionsInterface interface {
	// Apply merges default options with schema options and returns a config type the provider can use.
	Apply(defaults any, config Config) (any, error)
}

ProviderOptionsInterface allows each provider to define their own options struct

type ProviderService

type ProviderService struct {
	// contains filtered or unexported fields
}

ProviderService composes providers with their dependencies

func NewProviderService

func NewProviderService(provider OCRProvider, config Config) *ProviderService

func (*ProviderService) Complete

func (s *ProviderService) Complete(ctx context.Context, text string) (string, error)

func (*ProviderService) GetConfig

func (s *ProviderService) GetConfig() Config

func (*ProviderService) OCR

func (s *ProviderService) OCR(ctx context.Context, input OCRInput) (*OCRResult, error)

func (*ProviderService) SupportsPDF

func (s *ProviderService) SupportsPDF() bool

type RateLimitConfig

type RateLimitConfig struct {
	RPS   float64 `yaml:"rps"`
	Burst int     `yaml:"burst"`
}

type RateLimiter

type RateLimiter struct {
	// contains filtered or unexported fields
}

func NewRateLimiter

func NewRateLimiter(config RateLimitConfig) *RateLimiter

func (*RateLimiter) IsEnabled

func (r *RateLimiter) IsEnabled() bool

func (*RateLimiter) Wait

func (r *RateLimiter) Wait(ctx context.Context) error

type TesseractClient

type TesseractClient struct{}

func (*TesseractClient) ConstructTesseractCommand

func (c *TesseractClient) ConstructTesseractCommand(ctx context.Context, args []string) *exec.Cmd

func (*TesseractClient) IsTesseractInstalled

func (c *TesseractClient) IsTesseractInstalled() bool

IsTesseractInstalled checks if tesseract is installed in $PATH

func (*TesseractClient) OCRImageCmd

func (c *TesseractClient) OCRImageCmd(ctx context.Context, image image.Image, lang string) (*OCRResult, error)

func (*TesseractClient) RunTesseractCommand

func (c *TesseractClient) RunTesseractCommand(ctx context.Context, cmd *exec.Cmd, input *bytes.Buffer) (string, error)

type TesseractProvider

type TesseractProvider struct {
	// contains filtered or unexported fields
}

TesseractProvider implements the Provider interface.

func (*TesseractProvider) GetConfig

func (p *TesseractProvider) GetConfig() Config

func (*TesseractProvider) HOCRImage

func (p *TesseractProvider) HOCRImage(ctx context.Context, input OCRInput) (*OCRResult, error)

func (*TesseractProvider) OCR

func (p *TesseractProvider) OCR(ctx context.Context, input OCRInput) (*OCRResult, error)

type TextCorrectionConfig

type TextCorrectionConfig struct {
	Enabled   bool            `yaml:"enabled"`
	Provider  ProviderConfig  `yaml:"provider"`
	RateLimit RateLimitConfig `yaml:"rate_limit"`
}

type TextProcessor

type TextProcessor interface {
	Complete(ctx context.Context, text string) (string, error)
}

func NewTextCorrectionProvider

func NewTextCorrectionProvider(config Config) (TextProcessor, error)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL