Documentation
¶
Overview ¶
Package ocr provides optical character recognition capabilities for document processing.
Index ¶
- Constants
- func GetDefaultModelForProvider(provider ProviderType) string
- func ValidateProviderConfig(cfg *VisionClientConfig) error
- type AnthropicVisionClient
- func (a *AnthropicVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollamaTypes.OCRWord, error)
- func (a *AnthropicVisionClient) HealthCheck(ctx context.Context, model string) error
- func (a *AnthropicVisionClient) Name() string
- func (a *AnthropicVisionClient) SupportedModels() []string
- type Config
- type DocumentOCR
- type GoogleVisionClient
- func (g *GoogleVisionClient) Close() error
- func (g *GoogleVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollamaTypes.OCRWord, error)
- func (g *GoogleVisionClient) HealthCheck(ctx context.Context, model string) error
- func (g *GoogleVisionClient) Name() string
- func (g *GoogleVisionClient) SupportedModels() []string
- type Line
- type OllamaVisionClient
- func (o *OllamaVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollama.OCRWord, error)
- func (o *OllamaVisionClient) HealthCheck(ctx context.Context, model string) error
- func (o *OllamaVisionClient) Name() string
- func (o *OllamaVisionClient) SupportedModels() []string
- type OpenAIVisionClient
- func (o *OpenAIVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollamaTypes.OCRWord, error)
- func (o *OpenAIVisionClient) HealthCheck(ctx context.Context, model string) error
- func (o *OpenAIVisionClient) Name() string
- func (o *OpenAIVisionClient) SupportedModels() []string
- type PageOCR
- type Paragraph
- type Processor
- type ProviderType
- type Rectangle
- type Result
- type VisionClient
- type VisionClientConfig
- type Word
Constants ¶
const (
	// DefaultModel is the default Ollama model for OCR
	DefaultModel = "llava"
	// DefaultTemperature is the default temperature for OCR (0.0 for deterministic output)
	DefaultTemperature = 0.0
)
Variables ¶
This section is empty.
Functions ¶
func GetDefaultModelForProvider ¶ added in v1.1.0
func GetDefaultModelForProvider(provider ProviderType) string
GetDefaultModelForProvider returns a recommended default model for the given provider
func ValidateProviderConfig ¶ added in v1.1.0
func ValidateProviderConfig(cfg *VisionClientConfig) error
ValidateProviderConfig validates that the provider configuration is complete and correct
Types ¶
type AnthropicVisionClient ¶ added in v1.1.0
type AnthropicVisionClient struct {
// contains filtered or unexported fields
}
AnthropicVisionClient implements VisionClient for Anthropic's Claude API
func NewAnthropicVisionClient ¶ added in v1.1.0
func NewAnthropicVisionClient(apiKey string, temperature float64, maxRetries int, log *logger.Logger) *AnthropicVisionClient
NewAnthropicVisionClient creates a new Anthropic Claude vision client
func (*AnthropicVisionClient) GenerateOCR ¶ added in v1.1.0
func (a *AnthropicVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollamaTypes.OCRWord, error)
GenerateOCR performs OCR using Anthropic's Claude vision API
func (*AnthropicVisionClient) HealthCheck ¶ added in v1.1.0
func (a *AnthropicVisionClient) HealthCheck(ctx context.Context, model string) error
HealthCheck verifies that the Anthropic API is accessible
func (*AnthropicVisionClient) Name ¶ added in v1.1.0
func (a *AnthropicVisionClient) Name() string
Name returns the provider name
func (*AnthropicVisionClient) SupportedModels ¶ added in v1.1.0
func (a *AnthropicVisionClient) SupportedModels() []string
SupportedModels returns a list of Anthropic Claude models with vision capabilities
type Config ¶
type Config struct {
Logger *logger.Logger
VisionClient VisionClient // Pre-configured vision client (optional, for advanced usage)
// Legacy Ollama-specific fields (deprecated, use VisionConfig instead)
OllamaEndpoint string // default: "http://localhost:11434"
Model string // default: "llava"
Temperature float64 // default: 0.0 for deterministic output
MaxRetries int // default: 3
// New unified configuration
VisionConfig *VisionClientConfig // Vision client configuration (preferred)
}
Config holds configuration for the OCR processor
type DocumentOCR ¶
type DocumentOCR struct {
// DocumentID is the unique identifier for the document
DocumentID string
// Pages contains OCR results for each page
Pages []PageOCR
// TotalPages is the total number of pages processed
TotalPages int
// TotalWords is the total number of words recognized
TotalWords int
// AverageConfidence is the average confidence across all pages
AverageConfidence float64
// ProcessingTime is the time taken to process the document (in seconds)
ProcessingTime float64
// Language is the OCR language(s) used
Language string
}
DocumentOCR represents OCR results for an entire document
func NewDocumentOCR ¶
func NewDocumentOCR(documentID string, language string) *DocumentOCR
NewDocumentOCR creates a new DocumentOCR
func (*DocumentOCR) AddPage ¶
func (d *DocumentOCR) AddPage(page PageOCR)
AddPage adds a page to the document OCR results
func (*DocumentOCR) Finalize ¶
func (d *DocumentOCR) Finalize()
Finalize calculates summary statistics after all pages are processed
type GoogleVisionClient ¶ added in v1.1.0
type GoogleVisionClient struct {
// contains filtered or unexported fields
}
GoogleVisionClient implements VisionClient for Google's Gemini API
func NewGoogleVisionClient ¶ added in v1.1.0
func NewGoogleVisionClient(ctx context.Context, apiKey string, temperature float64, maxRetries int, log *logger.Logger) (*GoogleVisionClient, error)
NewGoogleVisionClient creates a new Google Gemini vision client
func (*GoogleVisionClient) Close ¶ added in v1.1.0
func (g *GoogleVisionClient) Close() error
Close closes the Google client
func (*GoogleVisionClient) GenerateOCR ¶ added in v1.1.0
func (g *GoogleVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollamaTypes.OCRWord, error)
GenerateOCR performs OCR using Google's Gemini vision API
func (*GoogleVisionClient) HealthCheck ¶ added in v1.1.0
func (g *GoogleVisionClient) HealthCheck(ctx context.Context, model string) error
HealthCheck verifies that the Gemini API is accessible
func (*GoogleVisionClient) Name ¶ added in v1.1.0
func (g *GoogleVisionClient) Name() string
Name returns the provider name
func (*GoogleVisionClient) SupportedModels ¶ added in v1.1.0
func (g *GoogleVisionClient) SupportedModels() []string
SupportedModels returns a list of Google Gemini vision models
type Line ¶
type Line struct {
// Words contains the words in this line
Words []Word
// BoundingBox is the bounding box for the entire line
BoundingBox Rectangle
// Text is the concatenated text of all words in the line
Text string
// Confidence is the average confidence of all words in the line
Confidence float64
}
Line represents a line of text (multiple words)
type OllamaVisionClient ¶ added in v1.1.0
type OllamaVisionClient struct {
// contains filtered or unexported fields
}
OllamaVisionClient is an adapter that implements VisionClient for Ollama
func NewOllamaVisionClient ¶ added in v1.1.0
func NewOllamaVisionClient(endpoint string, maxRetries int, log *logger.Logger) *OllamaVisionClient
NewOllamaVisionClient creates a new Ollama vision client
func (*OllamaVisionClient) GenerateOCR ¶ added in v1.1.0
func (o *OllamaVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollama.OCRWord, error)
GenerateOCR performs OCR on a base64-encoded image and returns structured word data
func (*OllamaVisionClient) HealthCheck ¶ added in v1.1.0
func (o *OllamaVisionClient) HealthCheck(ctx context.Context, model string) error
HealthCheck verifies that Ollama is accessible and the model is available
func (*OllamaVisionClient) Name ¶ added in v1.1.0
func (o *OllamaVisionClient) Name() string
Name returns the provider name
func (*OllamaVisionClient) SupportedModels ¶ added in v1.1.0
func (o *OllamaVisionClient) SupportedModels() []string
SupportedModels returns a list of commonly used Ollama vision models
type OpenAIVisionClient ¶ added in v1.1.0
type OpenAIVisionClient struct {
// contains filtered or unexported fields
}
OpenAIVisionClient implements VisionClient for OpenAI's GPT-4 Vision API
func NewOpenAIVisionClient ¶ added in v1.1.0
func NewOpenAIVisionClient(apiKey string, temperature float64, maxRetries int, log *logger.Logger) *OpenAIVisionClient
NewOpenAIVisionClient creates a new OpenAI vision client
func (*OpenAIVisionClient) GenerateOCR ¶ added in v1.1.0
func (o *OpenAIVisionClient) GenerateOCR(ctx context.Context, model string, imageData string) ([]ollamaTypes.OCRWord, error)
GenerateOCR performs OCR using OpenAI's vision API
func (*OpenAIVisionClient) HealthCheck ¶ added in v1.1.0
func (o *OpenAIVisionClient) HealthCheck(ctx context.Context, model string) error
HealthCheck verifies that the OpenAI API is accessible
func (*OpenAIVisionClient) Name ¶ added in v1.1.0
func (o *OpenAIVisionClient) Name() string
Name returns the provider name
func (*OpenAIVisionClient) SupportedModels ¶ added in v1.1.0
func (o *OpenAIVisionClient) SupportedModels() []string
SupportedModels returns a list of OpenAI vision models
type PageOCR ¶
type PageOCR struct {
// PageNumber is the page number (1-indexed)
PageNumber int
// Words contains all recognized words on the page with their positions
Words []Word
// Text is the full text content of the page (for convenience)
Text string
// Confidence is the overall confidence score for the page (0-100)
Confidence float64
// Width is the page width in pixels
Width int
// Height is the page height in pixels
Height int
// Language is the detected or configured language
Language string
}
PageOCR represents OCR results for a single page
func NewPageOCR ¶
NewPageOCR creates a new PageOCR result
func (*PageOCR) BuildText ¶
func (p *PageOCR) BuildText()
BuildText concatenates all word text to build the full page text
func (*PageOCR) CalculateConfidence ¶
func (p *PageOCR) CalculateConfidence()
CalculateConfidence calculates the average confidence for the page
type Paragraph ¶
type Paragraph struct {
// Lines contains the lines in this paragraph
Lines []Line
// BoundingBox is the bounding box for the entire paragraph
BoundingBox Rectangle
// Text is the concatenated text of all lines in the paragraph
Text string
// Confidence is the average confidence of all lines in the paragraph
Confidence float64
}
Paragraph represents a paragraph (multiple lines)
type Processor ¶
type Processor struct {
// contains filtered or unexported fields
}
Processor handles OCR processing using a vision client
func (*Processor) HealthCheck ¶
HealthCheck verifies that the vision client is accessible and the model is available
func (*Processor) ProcessImage ¶
ProcessImage performs OCR on an image and returns structured results
type ProviderType ¶ added in v1.1.0
type ProviderType string
ProviderType represents the type of LLM provider
const (
	// ProviderOllama represents a local Ollama instance
	ProviderOllama ProviderType = "ollama"
	// ProviderOpenAI represents OpenAI's GPT-4 Vision API
	ProviderOpenAI ProviderType = "openai"
	// ProviderAnthropic represents Anthropic's Claude API with vision
	ProviderAnthropic ProviderType = "anthropic"
	// ProviderGoogle represents Google's Gemini API
	ProviderGoogle ProviderType = "google"
)
type Rectangle ¶
type Rectangle struct {
// X is the left coordinate (pixels from left edge)
X int
// Y is the top coordinate (pixels from top edge)
Y int
// Width is the width of the rectangle in pixels
Width int
// Height is the height of the rectangle in pixels
Height int
}
Rectangle represents a rectangular bounding box
func NewRectangle ¶
NewRectangle creates a new Rectangle
func (Rectangle) Intersects ¶
Intersects returns true if this rectangle intersects with another
type Result ¶
type Result struct {
// DocumentOCR contains the OCR results
DocumentOCR *DocumentOCR
// Success indicates if OCR completed successfully
Success bool
// Error contains any error message if Success is false
Error string
}
Result represents the result of an OCR operation
type VisionClient ¶ added in v1.1.0
type VisionClient interface {
// GenerateOCR performs OCR on a base64-encoded image and returns structured word data
GenerateOCR(ctx context.Context, model string, imageData string) ([]ollama.OCRWord, error)
// HealthCheck verifies that the provider is accessible and the model is available
HealthCheck(ctx context.Context, model string) error
// Name returns the name of the provider (e.g., "ollama", "openai", "anthropic", "google")
Name() string
// SupportedModels returns a list of supported model names for this provider
SupportedModels() []string
}
VisionClient is an interface for vision-capable LLM providers that can perform OCR
func NewVisionClient ¶ added in v1.1.0
func NewVisionClient(ctx context.Context, cfg *VisionClientConfig, log *logger.Logger) (VisionClient, error)
NewVisionClient creates a vision client based on the provider configuration
type VisionClientConfig ¶ added in v1.1.0
type VisionClientConfig struct {
// Provider is the LLM provider type (ollama, openai, anthropic, google)
Provider ProviderType
// Model is the specific model to use (e.g., "llava", "gpt-4-vision-preview", "claude-3-5-sonnet-20241022", "gemini-1.5-pro")
Model string
// Endpoint is the API endpoint (required for Ollama, optional for cloud providers)
Endpoint string
// APIKey is the API key for cloud providers (read from env vars)
APIKey string
// MaxRetries is the maximum number of retry attempts
MaxRetries int
// Temperature controls randomness (0.0 = deterministic, recommended for OCR)
Temperature float64
}
VisionClientConfig holds common configuration for all vision clients
type Word ¶
type Word struct {
// Text is the recognized text content
Text string
// BoundingBox is the position and size of the word on the page
BoundingBox Rectangle
// Confidence is the recognition confidence score (0-100)
Confidence float64
// FontSize is the estimated font size in points
FontSize float64
// Bold indicates if the word appears to be bold
Bold bool
// Italic indicates if the word appears to be italic
Italic bool
}
Word represents a single recognized word with its bounding box