Documentation
¶
Index ¶
- type Block
- type BlockProcessor
- type BlockType
- type BoundingBox
- type Cell
- type Document
- type DocumentMetadata
- type DocumentOption
- type DocumentOptions
- type EntityType
- type FilterOptions
- type Form
- type Geometry
- type JSONBlock
- type JSONBoundingBox
- type JSONDocumentMetadata
- type JSONGeometry
- type JSONPoint
- type JSONRelationship
- type JSONResponse
- type KeyValue
- type Line
- type MergedCell
- type Page
- type Parser
- type Point
- type Query
- type QueryResult
- type Relationship
- type SelectionElement
- type SelectionElementType
- type SelectionStatus
- type Table
- type TableRow
- type TextType
- type TextractBlock
- type TextractBoundingBox
- type TextractDocument
- type TextractDocumentMetadata
- type TextractGeometry
- type TextractPoint
- type TextractRelationship
- type TextractResponse
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Block ¶
// Block is the accessor interface for a single Textract block. Every method
// takes no arguments and returns a value; the methods are grouped below by
// concern (identification, content, table position, selection state,
// relationships, geometry).
type Block interface {
// Identification
ID() string
BlockType() BlockType
EntityTypes() []EntityType
// Page returns an int here, unlike Line.Page / Table.Page, which return a Page.
// NOTE(review): presumably the page number — confirm the index base.
Page() int
// Content
Text() string
// NOTE(review): returns a plain string although the package index lists a
// TextType type — confirm whether the typed value was intended.
TextType() string
// NOTE(review): the confidence scale (0-1 vs 0-100) is not visible here — confirm.
Confidence() float64
// Table specific
// NOTE(review): presumably only meaningful for cell blocks; what other block
// types return is not visible here — confirm.
RowIndex() int
ColumnIndex() int
RowSpan() int
ColumnSpan() int
// Selection elements
// NOTE(review): returns a plain string, whereas SelectionElement.SelectionStatus
// returns the typed SelectionStatus — confirm the difference is intentional.
SelectionStatus() string
// Relationships
Children() []Block
Parents() []Block
Relationships() []Relationship
// Geometry
BoundingBox() BoundingBox
Polygon() []Point
}
Block is the basic unit of Textract data
type BlockProcessor ¶
// BlockProcessor is a single-method interface that turns a raw *textract.Block
// into this package's Block. DocumentOptions.CustomProcessors stores
// implementations keyed by BlockType.
type BlockProcessor interface {
// ProcessBlock takes a context and one raw block and returns either the
// resulting Block or an error.
ProcessBlock(ctx context.Context, block *textract.Block) (Block, error)
}
BlockProcessor is an interface for custom block processing
type BlockType ¶
// BlockType is a string-based type; its named values are the BlockType*
// constants. It is returned by Block.BlockType and used as the key type of
// DocumentOptions.CustomProcessors and the element type of FilterOptions.BlockTypes.
type BlockType string
BlockType represents the type of a Textract block
const ( BlockTypePage BlockType = "PAGE" BlockTypeLine BlockType = "LINE" BlockTypeWord BlockType = "WORD" BlockTypeTable BlockType = "TABLE" BlockTypeCell BlockType = "CELL" BlockTypeKeyValueSet BlockType = "KEY_VALUE_SET" BlockTypeSelectionElement BlockType = "SELECTION_ELEMENT" BlockTypeSignature BlockType = "SIGNATURE" BlockTypeQuery BlockType = "QUERY" BlockTypeQueryResult BlockType = "QUERY_RESULT" )
type BoundingBox ¶
BoundingBox represents a coarse-grained boundary
type Cell ¶
// Cell is the accessor interface for one table cell: its content, its position
// and span in the grid, a Table accessor, and its geometry. MergedCell embeds
// this interface.
type Cell interface {
// Content
Text() string
Confidence() float64
EntityTypes() []EntityType
// NOTE(review): presumably true when EntityTypes contains
// EntityTypeColumnHeader — confirm against the implementation.
IsColumnHeader() bool
// Position
// NOTE(review): whether indices are 0- or 1-based is not visible here — confirm.
RowIndex() int
ColumnIndex() int
RowSpan() int
ColumnSpan() int
// Navigation
Table() Table
// Geometry
BoundingBox() BoundingBox
Polygon() []Point
}
Cell represents a table cell
type Document ¶
// Document is the top-level accessor interface. It exposes the pages, the raw
// TextractResponse, key/value search, block filtering and document metadata.
// NewDocument and Parser.ParseDocument both return a Document.
type Document interface {
// Core access
Pages() []Page
// Raw returns a *TextractResponse, the same type NewDocument accepts.
Raw() *TextractResponse
// Navigation
// GetPageByIndex returns an error alongside the Page.
// NOTE(review): presumably the error reports an out-of-range idx; the index
// base is not visible here — confirm.
GetPageByIndex(idx int) (Page, error)
// Search and filtering
// NOTE(review): the matching rule for key (exact, case-insensitive, substring)
// is not visible here — confirm.
FindKeyValuePairs(key string) []KeyValue
FilterBlocks(opts FilterOptions) []Block
// Metadata
PageCount() int
DocumentMetadata() DocumentMetadata
}
Document is the primary container for Textract response data
func LoadFromJSON ¶
LoadFromJSON creates Documents from a JSON file
func LoadFromJSONReader ¶
LoadFromJSONReader creates Documents from a JSON reader
func NewDocument ¶
func NewDocument(response *TextractResponse, opts ...DocumentOption) (Document, error)
NewDocument creates a new Document from a TextractResponse
type DocumentMetadata ¶
// DocumentMetadata is the value type returned by Document.DocumentMetadata.
type DocumentMetadata struct {
// NOTE(review): presumably the number of pages in the document — confirm it
// always agrees with Document.PageCount.
Pages int
}
DocumentMetadata contains document-level metadata
type DocumentOption ¶
// DocumentOption is a function that receives a *DocumentOptions. NewDocument
// and Parser.ParseDocument each accept a variadic list of them.
type DocumentOption func(*DocumentOptions)
DocumentOption configures document parsing behavior
type DocumentOptions ¶
// DocumentOptions is the settings struct that DocumentOption functions receive
// by pointer.
type DocumentOptions struct {
// NOTE(review): presumably blocks below this confidence are excluded; the
// scale (0-1 vs 0-100) is not visible here — confirm.
ConfidenceThreshold float64
// NOTE(review): presumably toggles construction of MergedCell values — confirm.
EnableMergedCells bool
// CustomProcessors maps a BlockType to a BlockProcessor.
CustomProcessors map[BlockType]BlockProcessor
}
DocumentOptions configures document processing behavior
type EntityType ¶
// EntityType is a string-based type; its named values are the EntityType*
// constants. Slices of it are returned by the EntityTypes methods on Block,
// Cell, Line, Page, Table and others.
type EntityType string
EntityType represents the type of entity in a Textract block
const ( EntityTypeKey EntityType = "KEY" EntityTypeValue EntityType = "VALUE" EntityTypeColumnHeader EntityType = "COLUMN_HEADER" EntityTypeTableTitle EntityType = "TABLE_TITLE" EntityTypeTableSectionTitle EntityType = "TABLE_SECTION_TITLE" EntityTypeTableSummary EntityType = "TABLE_SUMMARY" EntityTypeStructuredTable EntityType = "STRUCTURED_TABLE" EntityTypeSemiStructuredTable EntityType = "SEMI_STRUCTURED_TABLE" )
type FilterOptions ¶
// FilterOptions is the argument type of Document.FilterBlocks.
// NOTE(review): how the three criteria combine (AND vs OR) and whether an empty
// slice means "match everything" is not visible here — confirm.
type FilterOptions struct {
// NOTE(review): presumably a lower bound on Block.Confidence — confirm the scale.
MinConfidence float64
BlockTypes []BlockType
EntityTypes []EntityType
}
FilterOptions provides filtering criteria for blocks
type Form ¶
// Form is the accessor interface for a form: its key/value fields, its
// selection elements, key-based lookup, and a Page accessor. Page.Forms returns
// a slice of Form.
type Form interface {
// Content
Fields() []KeyValue
SelectionElements() []SelectionElement
// Search
// GetFieldByKey returns a single KeyValue with no error or ok result.
// NOTE(review): presumably returns nil when no field matches — confirm.
GetFieldByKey(key string) KeyValue
// SearchFieldsByKey returns a slice, unlike GetFieldByKey.
// NOTE(review): the matching rule (exact vs partial) is not visible here — confirm.
SearchFieldsByKey(key string) []KeyValue
// Navigation
Page() Page
}
Form represents a form structure
type Geometry ¶
// Geometry pairs a BoundingBox with a Polygon given as a slice of Point.
type Geometry struct {
BoundingBox BoundingBox
Polygon []Point
}
Geometry represents position information for blocks
type JSONBlock ¶
// JSONBlock is the JSON-tagged form of a block. Fields tagged omitempty are
// left out when marshaled with their zero value; Id, BlockType, Confidence and
// Geometry are always emitted.
// NOTE(review): there is no TextType field here although Block exposes
// TextType() — confirm whether that value is dropped when loading from JSON.
type JSONBlock struct {
// ID is serialized under the key "Id" (not "ID").
ID string `json:"Id"`
BlockType string `json:"BlockType"`
EntityTypes []string `json:"EntityTypes,omitempty"`
Text string `json:"Text,omitempty"`
Confidence float64 `json:"Confidence"`
// Geometry is a pointer, so it may be nil after unmarshaling.
Geometry *JSONGeometry `json:"Geometry"`
Relationships []JSONRelationship `json:"Relationships,omitempty"`
RowIndex int `json:"RowIndex,omitempty"`
ColumnIndex int `json:"ColumnIndex,omitempty"`
RowSpan int `json:"RowSpan,omitempty"`
ColumnSpan int `json:"ColumnSpan,omitempty"`
SelectionStatus string `json:"SelectionStatus,omitempty"`
Page int `json:"Page,omitempty"`
}
JSONBlock represents a Textract block in JSON format
type JSONBoundingBox ¶
// JSONBoundingBox is the JSON-tagged bounding box referenced by JSONGeometry.
// All four fields are always emitted when marshaled (no omitempty).
// NOTE(review): presumably values are ratios of the page size — confirm the units.
type JSONBoundingBox struct {
Width float64 `json:"Width"`
Height float64 `json:"Height"`
Left float64 `json:"Left"`
Top float64 `json:"Top"`
}
JSONBoundingBox represents a bounding box in JSON format
type JSONDocumentMetadata ¶
// JSONDocumentMetadata is the JSON-tagged metadata object referenced by
// JSONResponse.DocumentMetadata.
type JSONDocumentMetadata struct {
// Pages is serialized under the key "Pages".
Pages int `json:"Pages"`
}
JSONDocumentMetadata represents document metadata in JSON format
type JSONGeometry ¶
// JSONGeometry is the JSON-tagged geometry object referenced by
// JSONBlock.Geometry.
type JSONGeometry struct {
// BoundingBox is a pointer, so it may be nil after unmarshaling.
BoundingBox *JSONBoundingBox `json:"BoundingBox"`
Polygon []JSONPoint `json:"Polygon"`
}
JSONGeometry represents geometry information in JSON format
type JSONRelationship ¶
JSONRelationship represents a relationship in JSON format
type JSONResponse ¶
// JSONResponse is the JSON-tagged top-level object: a metadata pointer plus a
// slice of block pointers, under the keys "DocumentMetadata" and "Blocks".
type JSONResponse struct {
// DocumentMetadata is a pointer, so it may be nil after unmarshaling.
DocumentMetadata *JSONDocumentMetadata `json:"DocumentMetadata"`
Blocks []*JSONBlock `json:"Blocks"`
}
JSONResponse represents a complete Textract response in JSON format
type KeyValue ¶
// KeyValue is the accessor interface for one key/value pair. It exposes the key
// and value as Blocks, text helpers for each, a confidence, and a Form accessor.
type KeyValue interface {
// Content
Key() Block
Value() Block
// Helper methods
// NOTE(review): presumably shortcuts for the text of Key() and Value() — confirm.
KeyText() string
ValueText() string
// NOTE(review): whether this is the key's, the value's, or a combined
// confidence is not visible here — confirm.
Confidence() float64
// Navigation
Form() Form
}
KeyValue represents a key-value pair
type Line ¶
// Line is the accessor interface for one text line: its text, its words, a
// confidence, entity types, a Page accessor and geometry. Page.Lines returns a
// slice of Line.
type Line interface {
// Content
Text() string
// Words returns plain strings, not Block values.
Words() []string
Confidence() float64
EntityTypes() []EntityType
// Navigation
// Page returns a Page here, unlike Block.Page, which returns an int.
Page() Page
// Geometry
BoundingBox() BoundingBox
Polygon() []Point
}
Line represents a text line
type MergedCell ¶
// MergedCell extends Cell with the merged span and the cells it contains.
// Table.MergedCells returns a slice of MergedCell.
//
// EntityTypes is not redeclared here: the embedded Cell already provides it
// with the same signature, so the method set is unchanged.
type MergedCell interface {
	Cell
	// NOTE(review): how MergedRowSpan/MergedColumnSpan differ from the embedded
	// Cell's RowSpan/ColumnSpan is not visible here — confirm.
	MergedRowSpan() int
	MergedColumnSpan() int
	ContainedCells() []Cell
}
MergedCell represents a cell that spans multiple rows or columns
type Page ¶
// Page is the accessor interface for one page: its lines, tables, forms and
// words, a Document accessor, a page number, and geometry. Document.Pages
// returns a slice of Page.
type Page interface {
// Content access
Lines() []Line
Tables() []Table
Forms() []Form
// Words returns plain strings, not Block values.
Words() []string
// Navigation
Document() Document
// NOTE(review): whether numbering starts at 0 or 1 is not visible here — confirm.
Number() int
// Geometry
BoundingBox() BoundingBox
Polygon() []Point
// EntityTypes is listed in the geometry group but returns entity types, not geometry.
EntityTypes() []EntityType
}
Page represents a single document page
type Parser ¶
// Parser is a single-method interface that builds a Document from a
// *TextractResponse.
type Parser interface {
// ParseDocument takes the same response and options as NewDocument, plus a
// leading context.
ParseDocument(ctx context.Context, response *TextractResponse, opts ...DocumentOption) (Document, error)
}
Parser is the main interface for creating Document instances
type Query ¶
// Query is the accessor interface for one query: its text, entity types,
// alias, results and parent page.
type Query interface {
// Text returns the query text
Text() string
// EntityTypes returns a slice of EntityType for this query
EntityTypes() []EntityType
// Alias returns the query alias if one was specified
// NOTE(review): presumably the empty string when no alias was given — confirm.
Alias() string
// Results returns all answers found for this query
Results() []QueryResult
// Page returns the parent page containing this query
Page() Page
}
Query represents a question asked of a document and its results
type QueryResult ¶
// QueryResult is the accessor interface for one query answer: its text, entity
// types, confidence, parent Query and underlying Block. Query.Results returns a
// slice of QueryResult.
type QueryResult interface {
// Text returns the result text
Text() string
// EntityTypes returns a slice of EntityType for this result
EntityTypes() []EntityType
// Confidence returns the confidence score for this result
Confidence() float64
// Query returns the parent query
Query() Query
// Block returns the underlying block
Block() Block
}
QueryResult represents an answer to a query
type Relationship ¶
Relationship represents a relationship between blocks
type SelectionElement ¶
// SelectionElement is the accessor interface for one selection element: its
// selection state, confidence, entity types, underlying Block, a Form accessor
// and geometry. Form.SelectionElements returns a slice of SelectionElement.
type SelectionElement interface {
// Status
// NOTE(review): presumably true exactly when SelectionStatus() is
// SelectionStatusSelected — confirm.
IsSelected() bool
// SelectionStatus returns the typed SelectionStatus, unlike
// Block.SelectionStatus, which returns a plain string.
SelectionStatus() SelectionStatus
Confidence() float64
EntityTypes() []EntityType
// Navigation
Block() Block
Form() Form
// Geometry
BoundingBox() BoundingBox
Polygon() []Point
}
SelectionElement represents a checkbox or radio button in a form
type SelectionElementType ¶
// SelectionElementType is a string-based type; its named values are the
// SelectionElementType* constants.
// NOTE(review): no method or field visible in this listing uses this type —
// confirm where it is consumed.
type SelectionElementType string
SelectionElementType represents the type of selection element
const ( SelectionElementTypeCheckbox SelectionElementType = "CHECKBOX" SelectionElementTypeRadioButton SelectionElementType = "RADIO_BUTTON" )
type SelectionStatus ¶
// SelectionStatus is a string-based type; its named values are the
// SelectionStatus* constants. It is returned by SelectionElement.SelectionStatus.
type SelectionStatus string
SelectionStatus represents the selection status of a block
const ( SelectionStatusSelected SelectionStatus = "SELECTED" SelectionStatusNotSelected SelectionStatus = "NOT_SELECTED" )
type Table ¶
// Table is the accessor interface for one table: its rows, cells, merged cells
// and headers, its dimensions, a Page accessor, positional cell lookup, and
// geometry. Page.Tables returns a slice of Table.
type Table interface {
// Structure
Rows() []TableRow
// Cells returns a two-dimensional slice.
// NOTE(review): presumably indexed [row][column] — confirm.
Cells() [][]Cell
MergedCells() []MergedCell
GetHeaders() []Cell
// Metadata
RowCount() int
ColumnCount() int
// Navigation
Page() Page
// GetCellByPosition returns an error alongside the Cell.
// NOTE(review): presumably the error reports an out-of-range position; the
// index base is not visible here — confirm.
GetCellByPosition(row, col int) (Cell, error)
// Geometry
BoundingBox() BoundingBox
Polygon() []Point
// EntityTypes is listed in the geometry group but returns entity types, not geometry.
EntityTypes() []EntityType
}
Table represents a table structure
type TextractBlock ¶
// TextractBlock is a block record whose scalar fields are all pointers, so any
// of them may be nil. TextractDocument.Blocks holds pointers to it.
// NOTE(review): there is no SelectionStatus or TextType field here, although
// JSONBlock has SelectionStatus and Block exposes both accessors — confirm
// whether ConvertFromAWS drops those values.
type TextractBlock struct {
BlockType *string
Confidence *float64
Text *string
RowIndex *int64
ColumnIndex *int64
RowSpan *int64
ColumnSpan *int64
Geometry *TextractGeometry
ID *string
Relationships []*TextractRelationship
EntityTypes []*string
Page *int64
}
TextractBlock represents the raw AWS Textract Block structure
type TextractBoundingBox ¶
TextractBoundingBox represents a coarse-grained boundary
type TextractDocument ¶
// TextractDocument pairs a metadata pointer with a slice of *TextractBlock. It
// is the type ConvertFromAWS returns.
// NOTE(review): NewDocument and Parser.ParseDocument accept *TextractResponse,
// not *TextractDocument — confirm how a ConvertFromAWS result is meant to be
// turned into a Document.
type TextractDocument struct {
DocumentMetadata *TextractDocumentMetadata
Blocks []*TextractBlock
}
TextractDocument represents the top-level document structure
func ConvertFromAWS ¶
func ConvertFromAWS(response *textract.GetDocumentAnalysisOutput) (*TextractDocument, error)
ConvertFromAWS converts from AWS Textract types to our internal types
type TextractDocumentMetadata ¶
// TextractDocumentMetadata is the metadata record referenced by
// TextractDocument.DocumentMetadata.
type TextractDocumentMetadata struct {
// Pages is a pointer and may be nil; it is *int64 here, whereas
// DocumentMetadata.Pages is a plain int.
Pages *int64
}
TextractDocumentMetadata contains document-level metadata
type TextractGeometry ¶
// TextractGeometry is the geometry record referenced by TextractBlock.Geometry:
// a bounding-box pointer plus a polygon given as a slice of point pointers.
type TextractGeometry struct {
// BoundingBox is a pointer and may be nil.
BoundingBox *TextractBoundingBox
Polygon []*TextractPoint
}
TextractGeometry represents the position information for a block
type TextractPoint ¶
TextractPoint represents a coordinate pair
type TextractRelationship ¶
TextractRelationship represents a relationship between blocks
type TextractResponse ¶
// TextractResponse holds blocks and metadata using the textract package's own
// types (*textract.Block, *textract.DocumentMetadata). It is the input type of
// NewDocument and Parser.ParseDocument and the return type of Document.Raw.
type TextractResponse struct {
Blocks []*textract.Block
// Metadata is a pointer and may be nil.
Metadata *textract.DocumentMetadata
}
TextractResponse represents the raw response from AWS Textract