parser

package
v0.0.8 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 20, 2025 License: MIT Imports: 11 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Block

// Block is the basic unit of Textract data. It exposes a block's identity,
// content, table position, selection status, relationships and geometry
// through argument-less accessor methods.
type Block interface {
	// Identification
	ID() string
	BlockType() BlockType
	EntityTypes() []EntityType
	Page() int

	// Content
	Text() string
	// NOTE(review): returns a plain string even though the package also
	// declares a TextType named type.
	TextType() string
	Confidence() float64

	// Table specific
	// NOTE(review): presumably only meaningful for table cell blocks; what
	// these return for other block types is not shown here — confirm.
	RowIndex() int
	ColumnIndex() int
	RowSpan() int
	ColumnSpan() int

	// Selection elements
	// NOTE(review): returns a plain string even though the package also
	// declares a SelectionStatus named type.
	SelectionStatus() string

	// Relationships
	Children() []Block
	Parents() []Block
	Relationships() []Relationship

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}

Block is the basic unit of Textract data

type BlockProcessor

// BlockProcessor is an interface for custom block processing.
// DocumentOptions.CustomProcessors holds a map from BlockType to BlockProcessor.
type BlockProcessor interface {
	// ProcessBlock takes a context and a raw *textract.Block and returns the
	// package's Block abstraction, or an error.
	ProcessBlock(ctx context.Context, block *textract.Block) (Block, error)
}

BlockProcessor is an interface for custom block processing

type BlockType

// BlockType represents the type of a Textract block. It is a string-backed
// named type; the declared values are the BlockType* constants.
type BlockType string

BlockType represents the type of a Textract block

// Declared BlockType values; each constant holds the upper-case string shown.
// NOTE(review): no constant for merged-cell blocks is declared in this group
// even though the package exposes a MergedCell type — confirm whether that is
// intentional.
const (
	BlockTypePage             BlockType = "PAGE"
	BlockTypeLine             BlockType = "LINE"
	BlockTypeWord             BlockType = "WORD"
	BlockTypeTable            BlockType = "TABLE"
	BlockTypeCell             BlockType = "CELL"
	BlockTypeKeyValueSet      BlockType = "KEY_VALUE_SET"
	BlockTypeSelectionElement BlockType = "SELECTION_ELEMENT"
	BlockTypeSignature        BlockType = "SIGNATURE"
	BlockTypeQuery            BlockType = "QUERY"
	BlockTypeQueryResult      BlockType = "QUERY_RESULT"
)

type BoundingBox

// BoundingBox represents a coarse-grained boundary as four float64 values.
// NOTE(review): the units and coordinate system are not shown here;
// presumably the page-relative ratios used by AWS Textract — confirm.
type BoundingBox struct {
	Width  float64
	Height float64
	Left   float64
	Top    float64
}

BoundingBox represents a coarse-grained boundary, described by its Width, Height, Left, and Top values.

type Cell

// Cell represents a table cell. MergedCell embeds this interface, so every
// method declared here is also part of MergedCell.
type Cell interface {
	// Content
	Text() string
	Confidence() float64
	EntityTypes() []EntityType
	IsColumnHeader() bool

	// Position
	// NOTE(review): whether indices are 0- or 1-based is not shown here — confirm.
	RowIndex() int
	ColumnIndex() int
	RowSpan() int
	ColumnSpan() int

	// Navigation
	// Table returns a Table; presumably the table that owns this cell — confirm.
	Table() Table

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}

Cell represents a table cell

type Document

// Document is the primary container for Textract response data. Values are
// produced by NewDocument, LoadFromJSON, LoadFromJSONReader and
// Parser.ParseDocument.
type Document interface {
	// Core access
	Pages() []Page
	// Raw returns a *TextractResponse, the type NewDocument is built from.
	Raw() *TextractResponse

	// Navigation
	// NOTE(review): whether idx is 0- or 1-based, and which conditions yield
	// the error, are not shown here — confirm against the implementation.
	GetPageByIndex(idx int) (Page, error)

	// Search and filtering
	// NOTE(review): the matching rule for key (exact, case-insensitive,
	// substring) is not shown here.
	FindKeyValuePairs(key string) []KeyValue
	FilterBlocks(opts FilterOptions) []Block

	// Metadata
	PageCount() int
	DocumentMetadata() DocumentMetadata
}

Document is the primary container for Textract response data

func LoadFromJSON

// LoadFromJSON creates Documents from a JSON file.
// It takes the file name and returns a slice of Document, or an error.
// NOTE(review): why a single file can yield several Documents is not shown
// here — confirm.
func LoadFromJSON(filename string) ([]Document, error)

LoadFromJSON creates Documents from a JSON file

func LoadFromJSONReader

// LoadFromJSONReader creates Documents from a JSON reader.
// It takes an io.Reader and returns a slice of Document, or an error.
// NOTE(review): presumably the stream decodes into JSONResponse values — confirm.
func LoadFromJSONReader(r io.Reader) ([]Document, error)

LoadFromJSONReader creates Documents from a JSON reader

func NewDocument

// NewDocument creates a new Document from a TextractResponse.
// Zero or more DocumentOption values may be passed to configure parsing.
// NOTE(review): the behavior for a nil response is not shown here — confirm.
func NewDocument(response *TextractResponse, opts ...DocumentOption) (Document, error)

NewDocument creates a new Document from a TextractResponse

type DocumentMetadata

// DocumentMetadata contains document-level metadata. It is the value returned
// by Document.DocumentMetadata.
type DocumentMetadata struct {
	// Pages is an int; presumably the number of pages in the document — confirm.
	Pages int
}

DocumentMetadata contains document-level metadata

type DocumentOption

// DocumentOption configures document parsing behavior. It is a function that
// receives a *DocumentOptions; NewDocument and Parser.ParseDocument accept
// these variadically.
// NOTE(review): no constructor returning a DocumentOption is listed in this
// view, so callers presumably write their own closures — confirm.
type DocumentOption func(*DocumentOptions)

DocumentOption configures document parsing behavior

type DocumentOptions

// DocumentOptions configures document processing behavior. It is the struct a
// DocumentOption function receives by pointer.
// NOTE(review): default values and the scale of ConfidenceThreshold
// (0-1 vs 0-100) are not shown here — confirm.
type DocumentOptions struct {
	ConfidenceThreshold float64
	EnableMergedCells   bool
	// CustomProcessors maps a BlockType to the BlockProcessor registered for it.
	CustomProcessors    map[BlockType]BlockProcessor
}

DocumentOptions configures document processing behavior

type EntityType

// EntityType represents the type of entity in a Textract block. It is a
// string-backed named type; the declared values are the EntityType* constants.
type EntityType string

EntityType represents the type of entity in a Textract block

// Declared EntityType values; each constant holds the upper-case string shown.
const (
	EntityTypeKey                 EntityType = "KEY"
	EntityTypeValue               EntityType = "VALUE"
	EntityTypeColumnHeader        EntityType = "COLUMN_HEADER"
	EntityTypeTableTitle          EntityType = "TABLE_TITLE"
	EntityTypeTableSectionTitle   EntityType = "TABLE_SECTION_TITLE"
	EntityTypeTableFooter         EntityType = "TABLE_FOOTER"
	EntityTypeTableSummary        EntityType = "TABLE_SUMMARY"
	EntityTypeStructuredTable     EntityType = "STRUCTURED_TABLE"
	EntityTypeSemiStructuredTable EntityType = "SEMI_STRUCTURED_TABLE"
)

type FilterOptions

// FilterOptions provides filtering criteria for blocks. It is the argument
// type of Document.FilterBlocks.
// NOTE(review): how the three criteria combine, and whether an empty slice
// means "no restriction", is not shown here — confirm.
type FilterOptions struct {
	MinConfidence float64
	BlockTypes    []BlockType
	EntityTypes   []EntityType
}

FilterOptions provides filtering criteria for blocks

type Form

// Form represents a form structure. Page.Forms returns a slice of these.
type Form interface {
	// Content
	Fields() []KeyValue
	SelectionElements() []SelectionElement

	// Search
	// NOTE(review): GetFieldByKey returns a single KeyValue and no error; what
	// it returns when no field matches is not shown here — confirm.
	GetFieldByKey(key string) KeyValue
	SearchFieldsByKey(key string) []KeyValue

	// Navigation
	Page() Page
}

Form represents a form structure

type Geometry

// Geometry represents position information for blocks: a BoundingBox value
// and a slice of Point values.
type Geometry struct {
	BoundingBox BoundingBox
	Polygon     []Point
}

Geometry represents position information for blocks

type JSONBlock

// JSONBlock represents a Textract block in JSON format.
//
// The struct tags fix the JSON key of each field. Fields tagged omitempty are
// left out of encoded output when they hold their zero value; ID, BlockType,
// Confidence and Geometry carry no omitempty tag.
type JSONBlock struct {
	// ID uses the JSON key "Id", not "ID".
	ID              string             `json:"Id"`
	BlockType       string             `json:"BlockType"`
	EntityTypes     []string           `json:"EntityTypes,omitempty"`
	Text            string             `json:"Text,omitempty"`
	Confidence      float64            `json:"Confidence"`
	// Geometry is a pointer and may therefore be nil.
	Geometry        *JSONGeometry      `json:"Geometry"`
	Relationships   []JSONRelationship `json:"Relationships,omitempty"`
	RowIndex        int                `json:"RowIndex,omitempty"`
	ColumnIndex     int                `json:"ColumnIndex,omitempty"`
	RowSpan         int                `json:"RowSpan,omitempty"`
	ColumnSpan      int                `json:"ColumnSpan,omitempty"`
	SelectionStatus string             `json:"SelectionStatus,omitempty"`
	Page            int                `json:"Page,omitempty"`
}

JSONBlock represents a Textract block in JSON format

type JSONBoundingBox

// JSONBoundingBox represents a bounding box in JSON format. Its four float64
// fields use JSON keys identical to the field names.
type JSONBoundingBox struct {
	Width  float64 `json:"Width"`
	Height float64 `json:"Height"`
	Left   float64 `json:"Left"`
	Top    float64 `json:"Top"`
}

JSONBoundingBox represents a bounding box in JSON format

type JSONDocumentMetadata

// JSONDocumentMetadata represents document metadata in JSON format. It is
// referenced by pointer from JSONResponse.DocumentMetadata.
type JSONDocumentMetadata struct {
	Pages int `json:"Pages"`
}

JSONDocumentMetadata represents document metadata in JSON format

type JSONGeometry

// JSONGeometry represents geometry information in JSON format.
type JSONGeometry struct {
	// BoundingBox is a pointer and may therefore be nil.
	BoundingBox *JSONBoundingBox `json:"BoundingBox"`
	Polygon     []JSONPoint      `json:"Polygon"`
}

JSONGeometry represents geometry information in JSON format

type JSONPoint

// JSONPoint represents a point in JSON format, with float64 X and Y
// coordinates stored under the JSON keys "X" and "Y".
type JSONPoint struct {
	X float64 `json:"X"`
	Y float64 `json:"Y"`
}

JSONPoint represents a point in JSON format

type JSONRelationship

// JSONRelationship represents a relationship in JSON format.
type JSONRelationship struct {
	Type string   `json:"Type"`
	// IDs uses the JSON key "Ids", not "IDs".
	IDs  []string `json:"Ids"`
}

JSONRelationship represents a relationship in JSON format

type JSONResponse

// JSONResponse represents a complete Textract response in JSON format:
// optional document metadata plus a list of blocks.
type JSONResponse struct {
	// DocumentMetadata is a pointer and may therefore be nil.
	DocumentMetadata *JSONDocumentMetadata `json:"DocumentMetadata"`
	// Blocks holds pointers to JSONBlock values.
	Blocks           []*JSONBlock          `json:"Blocks"`
}

JSONResponse represents a complete Textract response in JSON format

type KeyValue

// KeyValue represents a key-value pair. Values of this type are returned by
// Form.Fields, Form.GetFieldByKey, Form.SearchFieldsByKey and
// Document.FindKeyValuePairs.
type KeyValue interface {
	// Content
	Key() Block
	Value() Block

	// Helper methods
	KeyText() string
	ValueText() string
	// NOTE(review): whether Confidence reflects the key, the value, or a
	// combination of both is not shown here — confirm.
	Confidence() float64

	// Navigation
	Form() Form
}

KeyValue represents a key-value pair

type Line

// Line represents a text line. Page.Lines returns a slice of these.
type Line interface {
	// Content
	Text() string
	// Words returns plain strings rather than Block values.
	Words() []string
	Confidence() float64
	EntityTypes() []EntityType

	// Navigation
	Page() Page

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}

Line represents a text line

type MergedCell

// MergedCell represents a cell that spans multiple rows or columns.
//
// It embeds Cell, so every Cell method (including EntityTypes) is already part
// of MergedCell's method set. Only the merged-cell-specific accessors are
// declared here; re-declaring an embedded method is redundant and is rejected
// by Go toolchains older than 1.14.
type MergedCell interface {
	Cell

	// MergedRowSpan returns an int; presumably the number of rows the merged
	// cell covers — confirm against the implementation.
	MergedRowSpan() int
	// MergedColumnSpan returns an int; presumably the number of columns the
	// merged cell covers — confirm against the implementation.
	MergedColumnSpan() int
	// ContainedCells returns a slice of Cell; presumably the individual cells
	// that make up the merged cell — confirm against the implementation.
	ContainedCells() []Cell
}

MergedCell represents a cell that spans multiple rows or columns

type Page

// Page represents a single document page. Document.Pages and
// Document.GetPageByIndex return values of this type.
type Page interface {
	// Content access
	Lines() []Line
	Tables() []Table
	Forms() []Form
	// Words returns plain strings rather than Block values.
	Words() []string

	// Navigation
	Document() Document
	// NOTE(review): whether Number is 0- or 1-based is not shown here — confirm.
	Number() int

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
	EntityTypes() []EntityType
}

Page represents a single document page

type Parser

// Parser is the main interface for creating Document instances.
type Parser interface {
	// ParseDocument takes a context, a *TextractResponse and optional
	// DocumentOption values, and returns a Document or an error. Apart from the
	// context, the parameters and results match those of NewDocument.
	ParseDocument(ctx context.Context, response *TextractResponse, opts ...DocumentOption) (Document, error)
}

Parser is the main interface for creating Document instances

type Point

// Point represents a coordinate pair of float64 X and Y values. The Polygon
// accessors and Geometry.Polygon use slices of Point.
type Point struct {
	X float64
	Y float64
}

Point represents a coordinate pair

type Query

// Query represents a question asked of a document and its results.
// NOTE(review): no accessor returning Query values appears on Page or
// Document in this view; how callers obtain one is not shown here.
type Query interface {
	// Text returns the query text
	Text() string
	EntityTypes() []EntityType

	// Alias returns the query alias if one was specified
	Alias() string

	// Results returns all answers found for this query
	Results() []QueryResult

	// Page returns the parent page containing this query
	Page() Page
}

Query represents a question asked of a document and its results

type QueryResult

// QueryResult represents an answer to a query. Query.Results returns a slice
// of these.
type QueryResult interface {
	// Text returns the result text
	Text() string
	EntityTypes() []EntityType

	// Confidence returns the confidence score for this result
	Confidence() float64

	// Query returns the parent query
	Query() Query

	// Block returns the underlying block
	Block() Block
}

QueryResult represents an answer to a query

type Relationship

// Relationship represents a relationship between blocks. Block.Relationships
// returns a slice of these.
type Relationship struct {
	// Type is a plain string; the set of valid values is not declared in this view.
	Type string
	// IDs is a slice of strings; presumably the IDs of the related blocks — confirm.
	IDs  []string
}

Relationship represents a relationship between blocks

type SelectionElement

// SelectionElement represents a checkbox or radio button in a form.
// Form.SelectionElements returns a slice of these.
// NOTE(review): no method here returns a SelectionElementType, so this
// interface does not expose whether an element is a checkbox or a radio button.
type SelectionElement interface {
	// Status
	IsSelected() bool
	// SelectionStatus returns the typed SelectionStatus, unlike
	// Block.SelectionStatus, which returns a plain string.
	SelectionStatus() SelectionStatus
	Confidence() float64
	EntityTypes() []EntityType

	// Navigation
	Block() Block
	Form() Form

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}

SelectionElement represents a checkbox or radio button in a form

type SelectionElementType

// SelectionElementType represents the type of selection element. It is a
// string-backed named type; the declared values are the
// SelectionElementType* constants.
type SelectionElementType string

SelectionElementType represents the type of selection element

// Declared SelectionElementType values; each constant holds the upper-case
// string shown.
const (
	SelectionElementTypeCheckbox    SelectionElementType = "CHECKBOX"
	SelectionElementTypeRadioButton SelectionElementType = "RADIO_BUTTON"
)

type SelectionStatus

// SelectionStatus represents the selection status of a block. It is a
// string-backed named type; the declared values are the SelectionStatus*
// constants, and SelectionElement.SelectionStatus returns it.
type SelectionStatus string

SelectionStatus represents the selection status of a block

// Declared SelectionStatus values; each constant holds the upper-case string
// shown.
const (
	SelectionStatusSelected    SelectionStatus = "SELECTED"
	SelectionStatusNotSelected SelectionStatus = "NOT_SELECTED"
)

type Table

// Table represents a table structure. Page.Tables returns a slice of these.
type Table interface {
	// Structure
	Rows() []TableRow
	// Cells returns a slice of Cell slices; presumably indexed by row and then
	// column — confirm against the implementation.
	Cells() [][]Cell
	MergedCells() []MergedCell
	GetHeaders() []Cell

	// Metadata
	RowCount() int
	ColumnCount() int

	// Navigation
	Page() Page
	// NOTE(review): whether row and col are 0- or 1-based, and which conditions
	// yield the error, are not shown here — confirm.
	GetCellByPosition(row, col int) (Cell, error)

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
	EntityTypes() []EntityType
}

Table represents a table structure

type TableRow

// TableRow represents a row in a table. Table.Rows returns a slice of these.
type TableRow interface {
	Cells() []Cell
	// NOTE(review): whether RowIndex is 0- or 1-based is not shown here — confirm.
	RowIndex() int
	Table() Table
}

TableRow represents a row in a table

type TextType

// TextType represents the type of text in a block. It is a string-backed
// named type.
// NOTE(review): no TextType constants are declared in this view, and
// Block.TextType returns a plain string rather than this type.
type TextType string

TextType represents the type of text in a block

type TextractBlock

// TextractBlock represents the raw AWS Textract Block structure.
//
// Every scalar field is a pointer, so any of them may be nil; numeric indices
// and spans are *int64.
// NOTE(review): unlike JSONBlock, this struct declares no SelectionStatus
// field — confirm whether that omission is intentional.
type TextractBlock struct {
	BlockType     *string
	Confidence    *float64
	Text          *string
	RowIndex      *int64
	ColumnIndex   *int64
	RowSpan       *int64
	ColumnSpan    *int64
	Geometry      *TextractGeometry
	ID            *string
	Relationships []*TextractRelationship
	EntityTypes   []*string
	Page          *int64
}

TextractBlock represents the raw AWS Textract Block structure

type TextractBoundingBox

// TextractBoundingBox represents a coarse-grained boundary. It carries the
// same four measurements as BoundingBox, but as *float64, so each may be nil.
type TextractBoundingBox struct {
	Width  *float64
	Height *float64
	Left   *float64
	Top    *float64
}

TextractBoundingBox represents a coarse-grained boundary, described by its Width, Height, Left, and Top values; each is a pointer and may be nil.

type TextractDocument

// TextractDocument represents the top-level document structure. It is the
// result type of ConvertFromAWS.
// NOTE(review): NewDocument and Parser.ParseDocument accept *TextractResponse,
// not *TextractDocument; how a TextractDocument feeds into them is not shown here.
type TextractDocument struct {
	DocumentMetadata *TextractDocumentMetadata
	Blocks           []*TextractBlock
}

TextractDocument represents the top-level document structure

func ConvertFromAWS

// ConvertFromAWS converts from AWS Textract types to our internal types.
// It takes a *textract.GetDocumentAnalysisOutput and returns a
// *TextractDocument, or an error.
// NOTE(review): the behavior for a nil response is not shown here — confirm.
func ConvertFromAWS(response *textract.GetDocumentAnalysisOutput) (*TextractDocument, error)

ConvertFromAWS converts from AWS Textract types to our internal types

type TextractDocumentMetadata

// TextractDocumentMetadata contains document-level metadata.
type TextractDocumentMetadata struct {
	// Pages is *int64 and may be nil; DocumentMetadata.Pages is a plain int.
	Pages *int64
}

TextractDocumentMetadata contains document-level metadata

type TextractGeometry

// TextractGeometry represents the position information for a block.
type TextractGeometry struct {
	// BoundingBox is a pointer and may therefore be nil.
	BoundingBox *TextractBoundingBox
	// Polygon holds pointers to TextractPoint values.
	Polygon     []*TextractPoint
}

TextractGeometry represents the position information for a block

type TextractPoint

// TextractPoint represents a coordinate pair. X and Y are *float64, so either
// may be nil.
type TextractPoint struct {
	X *float64
	Y *float64
}

TextractPoint represents a coordinate pair

type TextractRelationship

// TextractRelationship represents a relationship between blocks.
type TextractRelationship struct {
	// Type is *string and may be nil.
	Type *string
	// IDs holds *string elements, so individual entries may be nil.
	IDs  []*string
}

TextractRelationship represents a relationship between blocks

type TextractResponse

// TextractResponse represents the raw response from AWS Textract. It is the
// input to NewDocument and Parser.ParseDocument and the result of Document.Raw.
//
// Its fields use the textract package's own types (textract.Block,
// textract.DocumentMetadata) rather than this package's Textract* structs.
type TextractResponse struct {
	Blocks   []*textract.Block
	Metadata *textract.DocumentMetadata
}

TextractResponse represents the raw response from AWS Textract

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL