extract

package

v1.49.0 (latest) | Published: Feb 23, 2026 | License: Apache-2.0 | Imports: 16 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/cpcloud/micasa

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func BuildExtractionPrompt(in ExtractionPromptInput) []llm.Message
func ExtractText(data []byte, mime string, timeout time.Duration) (string, error)
func ExtractWithProgress(ctx context.Context, data []byte, mime string, extractors []Extractor) <-chan ExtractProgress
func ExtractorMaxPages(extractors []Extractor) int
func ExtractorTimeout(extractors []Extractor) time.Duration
func FormatDDLBlock(ddl map[string]string, tables []string) string
func FormatEntityRows(label string, rows []EntityRow) string
func HasMatchingExtractor(extractors []Extractor, tool string, mime string) bool
func HasPDFImages() bool
func HasPDFToHTML() bool
func HasPDFToPPM() bool
func HasPDFToText() bool
func HasTesseract() bool
func ImageOCRAvailable() bool
func IsImageMIME(mime string) bool
func IsScanned(extractedText string) bool
func NeedsOCR(extractors []Extractor, mime string) bool
func OCRAvailable() bool
func OperationsSchema() map[string]any
func StripCodeFences(s string) string
func ValidateOperations(ops []Operation, allowed map[string]AllowedOps) error
type AllowedOps
type EntityRow
type ExtractProgress
type ExtractionPromptInput
type Extractor
- func DefaultExtractors(maxPages int, timeout time.Duration) []Extractor
type ImageOCRExtractor
- func (e *ImageOCRExtractor) Available() bool
- func (e *ImageOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource, error)
- func (e *ImageOCRExtractor) Matches(mime string) bool
- func (e *ImageOCRExtractor) Tool() string
type Operation
- func ParseOperations(raw string) ([]Operation, error)
type OperationPreviewRow
- func OperationPreview(op Operation) *OperationPreviewRow
type PDFOCRExtractor
- func (e *PDFOCRExtractor) Available() bool
- func (e *PDFOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource, error)
- func (e *PDFOCRExtractor) Matches(mime string) bool
- func (e *PDFOCRExtractor) Tool() string
type PDFTextExtractor
- func (e *PDFTextExtractor) Available() bool
- func (e *PDFTextExtractor) Extract(ctx context.Context, data []byte) (TextSource, error)
- func (e *PDFTextExtractor) Matches(mime string) bool
- func (e *PDFTextExtractor) Tool() string
type Pipeline
- func (p *Pipeline) Run(ctx context.Context, data []byte, filename string, mime string) *Result
type PlainTextExtractor
- func (e *PlainTextExtractor) Available() bool
- func (e *PlainTextExtractor) Extract(_ context.Context, data []byte) (TextSource, error)
- func (e *PlainTextExtractor) Matches(mime string) bool
- func (e *PlainTextExtractor) Tool() string
type Result
- func (r *Result) HasSource(tool string) bool
- func (r *Result) SourceByTool(tool string) *TextSource
- func (r *Result) Text() string
type SchemaContext
type TextSource

Constants ¶

View Source

const (
	ActionCreate = "create"
	ActionUpdate = "update"
)

Action constants for Operation.Action.

View Source

const DefaultMaxExtractPages = 20

DefaultMaxExtractPages is the default page limit for extraction. The most useful information (specs, warranty, maintenance) is typically front-loaded in the first pages of a document.

View Source

const DefaultTextTimeout = 30 * time.Second

DefaultTextTimeout is the default timeout for pdftotext.

View Source

const MIMEApplicationPDF = "application/pdf"

MIMEApplicationPDF is the MIME type for PDF documents.

Variables ¶

View Source

var ExtractionAllowedOps = map[string]AllowedOps{
	"documents":         {Insert: true, Update: true},
	"vendors":           {Insert: true},
	"quotes":            {Insert: true},
	"maintenance_items": {Insert: true},
	"appliances":        {Insert: true},
}

ExtractionAllowedOps defines which operations the LLM may perform on each table. Used by ValidateOperations.

View Source

var ExtractionTables = []string{
	"documents",
	"vendors",
	"quotes",
	"maintenance_items",
	"appliances",
	"projects",
	"project_types",
	"maintenance_categories",
}

ExtractionTables is the set of tables the LLM receives DDL for and may reference in its output.

Functions ¶

func BuildExtractionPrompt ¶

func BuildExtractionPrompt(in ExtractionPromptInput) []llm.Message

BuildExtractionPrompt creates the system and user messages for document extraction. The system prompt includes the database DDL and existing entity rows; the LLM responds with a JSON list of operations (see OperationsSchema and ParseOperations for the {"operations": [...]} response shape).

func ExtractText ¶

func ExtractText(data []byte, mime string, timeout time.Duration) (string, error)

ExtractText pulls plain text from document content based on MIME type. It returns an empty string (not an error) for unsupported MIME types. PDF extraction uses pdftotext (from poppler-utils) when available; when the tool is missing, PDFs also yield an empty string. The timeout parameter caps how long pdftotext can run (0 means DefaultTextTimeout).

This is a convenience wrapper that delegates to PDFTextExtractor and PlainTextExtractor. For full pipeline extraction, use Pipeline.Run.

func ExtractWithProgress ¶ added in v1.47.0

func ExtractWithProgress(
	ctx context.Context,
	data []byte,
	mime string,
	extractors []Extractor,
) <-chan ExtractProgress

ExtractWithProgress runs extraction asynchronously, sending per-page progress updates on the returned channel. The channel is closed when processing completes. The extractors list is consulted to determine whether to run image or PDF OCR. Unsupported MIME types produce a single Done message with empty text.

func ExtractorMaxPages ¶ added in v1.47.0

func ExtractorMaxPages(extractors []Extractor) int

ExtractorMaxPages returns the MaxPages value of the first PDFOCRExtractor in the list, or 0 (meaning "use default") if none is found.

func ExtractorTimeout ¶ added in v1.47.0

func ExtractorTimeout(extractors []Extractor) time.Duration

ExtractorTimeout returns the Timeout value of the first PDFTextExtractor in the list, or 0 (meaning "use default") if none is found.

func FormatDDLBlock ¶ added in v1.49.0

func FormatDDLBlock(ddl map[string]string, tables []string) string

FormatDDLBlock formats the DDL map as a SQL comment block for inclusion in the LLM system prompt.

func FormatEntityRows ¶ added in v1.49.0

func FormatEntityRows(label string, rows []EntityRow) string

FormatEntityRows formats a named set of entity rows as SQL comments for inclusion in the LLM system prompt.

func HasMatchingExtractor ¶ added in v1.47.0

func HasMatchingExtractor(extractors []Extractor, tool string, mime string) bool

HasMatchingExtractor reports whether any extractor in the list with the given tool name matches the MIME type and is available.

func HasPDFImages ¶ added in v1.49.0

func HasPDFImages() bool

HasPDFImages reports whether the pdfimages binary (from poppler-utils) is on PATH. The result is cached for the process lifetime.

func HasPDFToHTML ¶ added in v1.49.0

func HasPDFToHTML() bool

HasPDFToHTML reports whether the pdftohtml binary (from poppler-utils) is on PATH. The result is cached for the process lifetime.

func HasPDFToPPM ¶

func HasPDFToPPM() bool

HasPDFToPPM reports whether the pdftoppm binary (from poppler-utils) is on PATH. The result is cached for the process lifetime.

func HasPDFToText ¶

func HasPDFToText() bool

HasPDFToText reports whether the pdftotext binary (from poppler-utils) is on PATH. The result is cached for the process lifetime.

func HasTesseract ¶

func HasTesseract() bool

HasTesseract reports whether the tesseract binary is on PATH. The result is cached for the process lifetime.

func ImageOCRAvailable ¶

func ImageOCRAvailable() bool

ImageOCRAvailable reports whether tesseract is available for direct image OCR (no pdftoppm needed for image files).

func IsImageMIME ¶

func IsImageMIME(mime string) bool

IsImageMIME reports whether the MIME type is an image format that tesseract can process.

func IsScanned ¶

func IsScanned(extractedText string) bool

IsScanned reports whether the extracted text is empty or whitespace-only, indicating the document likely needs OCR.

func NeedsOCR ¶ added in v1.47.0

func NeedsOCR(extractors []Extractor, mime string) bool

NeedsOCR reports whether any OCR-capable extractor in the list matches the MIME type and is available. Use this instead of checking tool names directly so callers don't couple to extractor internals.

func OCRAvailable ¶

func OCRAvailable() bool

OCRAvailable reports whether tesseract and at least one PDF image extraction tool (pdfimages, pdftohtml, or pdftoppm) are available.

func OperationsSchema ¶ added in v1.49.0

func OperationsSchema() map[string]any

OperationsSchema returns the JSON Schema for structured extraction output. The schema constrains model output to {"operations": [...]}, where each operation has action, table, and data fields.

func StripCodeFences ¶

func StripCodeFences(s string) string

StripCodeFences removes Markdown code fences that LLMs sometimes wrap around JSON output. It handles fences anywhere in the text (not just at the start), since LLMs may produce commentary before the fenced block.

func ValidateOperations ¶ added in v1.49.0

func ValidateOperations(ops []Operation, allowed map[string]AllowedOps) error

ValidateOperations checks each operation against the allowed tables and action types. Returns an error describing the first violation found.

Types ¶

type AllowedOps ¶ added in v1.49.0

type AllowedOps struct {
	Insert bool
	Update bool
}

AllowedOps specifies which operations are permitted on a table. Insert maps to the "create" action (ActionCreate); Update maps to the "update" action (ActionUpdate).

type EntityRow ¶ added in v1.49.0

type EntityRow struct {
	ID   uint
	Name string
}

EntityRow is a lightweight (id, name) pair for FK context in LLM prompts.

type ExtractProgress ¶ added in v1.47.0

type ExtractProgress struct {
	Tool  string // extractor tool name (set on Done)
	Desc  string // human description (set on Done)
	Phase string // e.g. "rasterize", "extract"
	Page  int    // current page (1-indexed)
	Total int    // total pages (0 until known)
	Done  bool   // all phases finished
	Text  string // accumulated text (set on Done)
	Data  []byte // structured data (set on Done)
	Err   error  // set on failure
}

ExtractProgress reports incremental progress from ExtractWithProgress.

type ExtractionPromptInput ¶

type ExtractionPromptInput struct {
	DocID     uint
	Filename  string
	MIME      string
	SizeBytes int64
	Schema    SchemaContext
	Sources   []TextSource
}

ExtractionPromptInput holds the inputs for building an extraction prompt.

type Extractor ¶ added in v1.47.0

type Extractor interface {
	Tool() string
	Matches(mime string) bool
	Available() bool
	Extract(ctx context.Context, data []byte) (TextSource, error)
}

Extractor extracts text from document bytes.

func DefaultExtractors ¶ added in v1.47.0

func DefaultExtractors(maxPages int, timeout time.Duration) []Extractor

DefaultExtractors returns the standard extractors in priority order: pdftotext, plaintext, PDF OCR, image OCR. Zero values for maxPages and timeout cause the concrete extractors to use their own defaults.

type ImageOCRExtractor ¶ added in v1.47.0

type ImageOCRExtractor struct{}

ImageOCRExtractor wraps ocrImage for direct image OCR.

func (*ImageOCRExtractor) Available ¶ added in v1.47.0

func (e *ImageOCRExtractor) Available() bool

func (*ImageOCRExtractor) Extract ¶ added in v1.47.0

func (e *ImageOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource, error)

func (*ImageOCRExtractor) Matches ¶ added in v1.47.0

func (e *ImageOCRExtractor) Matches(mime string) bool

func (*ImageOCRExtractor) Tool ¶ added in v1.47.0

func (e *ImageOCRExtractor) Tool() string

type Operation ¶ added in v1.49.0

type Operation struct {
	Action string         `json:"action"` // ActionCreate or ActionUpdate
	Table  string         `json:"table"`
	Data   map[string]any `json:"data"`
}

Operation is a single create/update action the LLM wants to perform.

func ParseOperations ¶ added in v1.49.0

func ParseOperations(raw string) ([]Operation, error)

ParseOperations unmarshals the schema-constrained {"operations": [...]} response from the LLM.

type OperationPreviewRow ¶ added in v1.49.0

type OperationPreviewRow struct {
	Table   string
	Op      string // "create" or "update"
	RowID   uint   // nonzero for update (from data["id"] or separate field)
	Columns []string
	Values  []string
}

OperationPreviewRow holds the column-value pairs from an Operation for rendering as a mini table in the extraction overlay.

func OperationPreview ¶ added in v1.49.0

func OperationPreview(op Operation) *OperationPreviewRow

OperationPreview extracts column-value pairs from an Operation for display.

type PDFOCRExtractor ¶ added in v1.47.0

type PDFOCRExtractor struct {
	MaxPages int
}

PDFOCRExtractor wraps ocrPDF for scanned PDF pages.

func (*PDFOCRExtractor) Available ¶ added in v1.47.0

func (e *PDFOCRExtractor) Available() bool

func (*PDFOCRExtractor) Extract ¶ added in v1.47.0

func (e *PDFOCRExtractor) Extract(ctx context.Context, data []byte) (TextSource, error)

func (*PDFOCRExtractor) Matches ¶ added in v1.47.0

func (e *PDFOCRExtractor) Matches(mime string) bool

func (*PDFOCRExtractor) Tool ¶ added in v1.47.0

func (e *PDFOCRExtractor) Tool() string

type PDFTextExtractor ¶ added in v1.47.0

type PDFTextExtractor struct {
	Timeout time.Duration
}

PDFTextExtractor wraps pdftotext for digital PDF text extraction.

func (*PDFTextExtractor) Available ¶ added in v1.47.0

func (e *PDFTextExtractor) Available() bool

func (*PDFTextExtractor) Extract ¶ added in v1.47.0

func (e *PDFTextExtractor) Extract(ctx context.Context, data []byte) (TextSource, error)

func (*PDFTextExtractor) Matches ¶ added in v1.47.0

func (e *PDFTextExtractor) Matches(mime string) bool

func (*PDFTextExtractor) Tool ¶ added in v1.47.0

func (e *PDFTextExtractor) Tool() string

type Pipeline ¶

type Pipeline struct {
	LLMClient  *llm.Client   // nil = skip LLM extraction
	Extractors []Extractor   // nil = DefaultExtractors(0, 0)
	Schema     SchemaContext // DDL + entity rows for prompt
	DocID      uint          // document ID for UPDATE operations
}

Pipeline orchestrates the document extraction layers: text extraction, OCR, and LLM-powered structured extraction. Each layer is independent and gracefully degrades when its dependencies are unavailable.

func (*Pipeline) Run ¶

func (p *Pipeline) Run(
	ctx context.Context,
	data []byte,
	filename string,
	mime string,
) *Result

Run executes the extraction pipeline on the given document data. It never returns a Go error; all failures are captured in Result.Err so the caller can save the document regardless.

type PlainTextExtractor ¶ added in v1.47.0

type PlainTextExtractor struct{}

PlainTextExtractor normalizes whitespace from text/* content.

func (*PlainTextExtractor) Available ¶ added in v1.47.0

func (e *PlainTextExtractor) Available() bool

func (*PlainTextExtractor) Extract ¶ added in v1.47.0

func (e *PlainTextExtractor) Extract(_ context.Context, data []byte) (TextSource, error)

func (*PlainTextExtractor) Matches ¶ added in v1.47.0

func (e *PlainTextExtractor) Matches(mime string) bool

func (*PlainTextExtractor) Tool ¶ added in v1.47.0

func (e *PlainTextExtractor) Tool() string

type Result ¶

type Result struct {
	Sources    []TextSource // text from each extraction method
	Operations []Operation  // nil if LLM unavailable or failed
	LLMRaw     string       // raw LLM output (for display)
	LLMUsed    bool
	Err        error // non-fatal extraction error; document still saves
}

Result holds the output of a pipeline run.

func (*Result) HasSource ¶ added in v1.47.0

func (r *Result) HasSource(tool string) bool

HasSource reports whether any source matches the given tool name.

func (*Result) SourceByTool ¶ added in v1.47.0

func (r *Result) SourceByTool(tool string) *TextSource

SourceByTool returns the first source matching the given tool name, or nil if not found.

func (*Result) Text ¶ added in v1.47.0

func (r *Result) Text() string

Text returns the first non-empty text from the extraction sources.

type SchemaContext ¶ added in v1.49.0

type SchemaContext struct {
	DDL                   map[string]string // table name -> CREATE TABLE SQL
	Vendors               []EntityRow
	Projects              []EntityRow
	Appliances            []EntityRow
	MaintenanceCategories []EntityRow
	ProjectTypes          []EntityRow
}

SchemaContext provides the schema and entity data the LLM needs to generate correct operations against the database.

type TextSource ¶ added in v1.47.0

type TextSource struct {
	Tool string // "pdftotext", "plaintext", "tesseract"
	Desc string // human description for LLM context
	Text string
	Data []byte // optional structured data (e.g. OCR TSV)
}

TextSource holds text from a single extraction method.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL