Documentation
¶
Index ¶
- type Block
- type BlockProcessor
- type BlockType
- type BoundingBox
- type Cell
- type Document
- type DocumentMetadata
- type DocumentOption
- type DocumentOptions
- type EntityType
- type FilterOptions
- type Form
- type Geometry
- type JSONBlock
- type JSONBoundingBox
- type JSONDocumentMetadata
- type JSONGeometry
- type JSONPoint
- type JSONRelationship
- type JSONResponse
- type KeyValue
- type Line
- type MergedCell
- type Page
- type Parser
- type Point
- type Query
- type QueryResult
- type Relationship
- type SelectionElement
- type SelectionElementType
- type SelectionStatus
- type Table
- type TableRow
- type TextType
- type TextractBlock
- type TextractBoundingBox
- type TextractDocument
- type TextractDocumentMetadata
- type TextractGeometry
- type TextractPoint
- type TextractRelationship
- type TextractResponse
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Block ¶
type Block interface {
	// Identification
	ID() string
	BlockType() BlockType
	EntityTypes() []EntityType
	Page() int

	// Content
	Text() string
	TextType() string
	Confidence() float64

	// Table specific
	RowIndex() int
	ColumnIndex() int
	RowSpan() int
	ColumnSpan() int

	// Selection elements
	SelectionStatus() string

	// Relationships
	Children() []Block
	Parents() []Block
	Relationships() []Relationship

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}
Block is the basic unit of Textract data
type BlockProcessor ¶
type BlockProcessor interface {
ProcessBlock(ctx context.Context, block *textract.Block) (Block, error)
}
BlockProcessor is an interface for custom block processing
type BlockType ¶
type BlockType string
BlockType represents the type of a Textract block
const (
	BlockTypePage             BlockType = "PAGE"
	BlockTypeLine             BlockType = "LINE"
	BlockTypeWord             BlockType = "WORD"
	BlockTypeTable            BlockType = "TABLE"
	BlockTypeCell             BlockType = "CELL"
	BlockTypeKeyValueSet      BlockType = "KEY_VALUE_SET"
	BlockTypeSelectionElement BlockType = "SELECTION_ELEMENT"
	BlockTypeSignature        BlockType = "SIGNATURE"
	BlockTypeQuery            BlockType = "QUERY"
	BlockTypeQueryResult      BlockType = "QUERY_RESULT"
)
type BoundingBox ¶
BoundingBox represents a coarse-grained boundary
type Cell ¶
type Cell interface {
	// Content
	Text() string
	Confidence() float64
	EntityTypes() []EntityType
	IsColumnHeader() bool

	// Position
	RowIndex() int
	ColumnIndex() int
	RowSpan() int
	ColumnSpan() int

	// Navigation
	Table() Table

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}
Cell represents a table cell
type Document ¶
type Document interface {
	// Core access
	Pages() []Page
	Raw() *TextractResponse

	// Navigation
	GetPageByIndex(idx int) (Page, error)

	// Search and filtering
	FindKeyValuePairs(key string) []KeyValue
	FilterBlocks(opts FilterOptions) []Block

	// Metadata
	PageCount() int
	DocumentMetadata() DocumentMetadata
}
Document is the primary container for Textract response data
func LoadFromJSON ¶
LoadFromJSON creates Documents from a JSON file
func LoadFromJSONReader ¶
LoadFromJSONReader creates Documents from a JSON reader
func NewDocument ¶
func NewDocument(response *TextractResponse, opts ...DocumentOption) (Document, error)
NewDocument creates a new Document from a TextractResponse
type DocumentMetadata ¶
type DocumentMetadata struct {
Pages int
}
DocumentMetadata contains document-level metadata
type DocumentOption ¶
type DocumentOption func(*DocumentOptions)
DocumentOption configures document parsing behavior
type DocumentOptions ¶
type DocumentOptions struct {
	ConfidenceThreshold float64
	EnableMergedCells   bool
	CustomProcessors    map[BlockType]BlockProcessor
}
DocumentOptions configures document processing behavior
type EntityType ¶
type EntityType string
EntityType represents the type of entity in a Textract block
const (
	EntityTypeKey                 EntityType = "KEY"
	EntityTypeValue               EntityType = "VALUE"
	EntityTypeColumnHeader        EntityType = "COLUMN_HEADER"
	EntityTypeTableTitle          EntityType = "TABLE_TITLE"
	EntityTypeTableSectionTitle   EntityType = "TABLE_SECTION_TITLE"
	EntityTypeTableSummary        EntityType = "TABLE_SUMMARY"
	EntityTypeStructuredTable     EntityType = "STRUCTURED_TABLE"
	EntityTypeSemiStructuredTable EntityType = "SEMI_STRUCTURED_TABLE"
)
type FilterOptions ¶
type FilterOptions struct {
	MinConfidence float64
	BlockTypes    []BlockType
	EntityTypes   []EntityType
}
FilterOptions provides filtering criteria for blocks
type Form ¶
type Form interface {
	// Content
	Fields() []KeyValue
	SelectionElements() []SelectionElement

	// Search
	GetFieldByKey(key string) KeyValue
	SearchFieldsByKey(key string) []KeyValue

	// Navigation
	Page() Page
}
Form represents a form structure
type Geometry ¶
type Geometry struct {
	BoundingBox BoundingBox
	Polygon     []Point
}
Geometry represents position information for blocks
type JSONBlock ¶
type JSONBlock struct {
	ID              string             `json:"Id"`
	BlockType       string             `json:"BlockType"`
	EntityTypes     []string           `json:"EntityTypes,omitempty"`
	Text            string             `json:"Text,omitempty"`
	Confidence      float64            `json:"Confidence"`
	Geometry        *JSONGeometry      `json:"Geometry"`
	Relationships   []JSONRelationship `json:"Relationships,omitempty"`
	RowIndex        int                `json:"RowIndex,omitempty"`
	ColumnIndex     int                `json:"ColumnIndex,omitempty"`
	RowSpan         int                `json:"RowSpan,omitempty"`
	ColumnSpan      int                `json:"ColumnSpan,omitempty"`
	SelectionStatus string             `json:"SelectionStatus,omitempty"`
	Page            int                `json:"Page,omitempty"`
}
JSONBlock represents a Textract block in JSON format
type JSONBoundingBox ¶
type JSONBoundingBox struct {
	Width  float64 `json:"Width"`
	Height float64 `json:"Height"`
	Left   float64 `json:"Left"`
	Top    float64 `json:"Top"`
}
JSONBoundingBox represents a bounding box in JSON format
type JSONDocumentMetadata ¶
type JSONDocumentMetadata struct {
Pages int `json:"Pages"`
}
JSONDocumentMetadata represents document metadata in JSON format
type JSONGeometry ¶
type JSONGeometry struct {
	BoundingBox *JSONBoundingBox `json:"BoundingBox"`
	Polygon     []JSONPoint      `json:"Polygon"`
}
JSONGeometry represents geometry information in JSON format
type JSONRelationship ¶
JSONRelationship represents a relationship in JSON format
type JSONResponse ¶
type JSONResponse struct {
	DocumentMetadata *JSONDocumentMetadata `json:"DocumentMetadata"`
	Blocks           []*JSONBlock          `json:"Blocks"`
}
JSONResponse represents a complete Textract response in JSON format
type KeyValue ¶
type KeyValue interface {
	// Content
	Key() Block
	Value() Block

	// Helper methods
	KeyText() string
	ValueText() string
	Confidence() float64

	// Navigation
	Form() Form
}
KeyValue represents a key-value pair
type Line ¶
type Line interface {
	// Content
	Text() string
	Words() []string
	Confidence() float64
	EntityTypes() []EntityType

	// Navigation
	Page() Page

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}
Line represents a text line
type MergedCell ¶
type MergedCell interface {
	Cell
	MergedRowSpan() int
	MergedColumnSpan() int
	ContainedCells() []Cell
	EntityTypes() []EntityType
}
MergedCell represents a cell that spans multiple rows or columns
type Page ¶
type Page interface {
	// Content access
	Lines() []Line
	Tables() []Table
	Forms() []Form
	Words() []string

	// Navigation
	Document() Document
	Number() int

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
	EntityTypes() []EntityType
}
Page represents a single document page
type Parser ¶
type Parser interface {
ParseDocument(ctx context.Context, response *TextractResponse, opts ...DocumentOption) (Document, error)
}
Parser is the main interface for creating Document instances
type Query ¶
type Query interface {
	// Text returns the query text
	Text() string
	EntityTypes() []EntityType
	// Alias returns the query alias if one was specified
	Alias() string
	// Results returns all answers found for this query
	Results() []QueryResult
	// Page returns the parent page containing this query
	Page() Page
}
Query represents a question asked of a document and its results
type QueryResult ¶
type QueryResult interface {
	// Text returns the result text
	Text() string
	EntityTypes() []EntityType
	// Confidence returns the confidence score for this result
	Confidence() float64
	// Query returns the parent query
	Query() Query
	// Block returns the underlying block
	Block() Block
}
QueryResult represents an answer to a query
type Relationship ¶
Relationship represents a relationship between blocks
type SelectionElement ¶
type SelectionElement interface {
	// Status
	IsSelected() bool
	SelectionStatus() SelectionStatus
	Confidence() float64
	EntityTypes() []EntityType

	// Navigation
	Block() Block
	Form() Form

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
}
SelectionElement represents a checkbox or radio button in a form
type SelectionElementType ¶
type SelectionElementType string
SelectionElementType represents the type of selection element
const (
	SelectionElementTypeCheckbox    SelectionElementType = "CHECKBOX"
	SelectionElementTypeRadioButton SelectionElementType = "RADIO_BUTTON"
)
type SelectionStatus ¶
type SelectionStatus string
SelectionStatus represents the selection status of a block
const (
	SelectionStatusSelected    SelectionStatus = "SELECTED"
	SelectionStatusNotSelected SelectionStatus = "NOT_SELECTED"
)
type Table ¶
type Table interface {
	// Structure
	Rows() []TableRow
	Cells() [][]Cell
	MergedCells() []MergedCell
	GetHeaders() []Cell

	// Metadata
	RowCount() int
	ColumnCount() int

	// Navigation
	Page() Page
	GetCellByPosition(row, col int) (Cell, error)

	// Geometry
	BoundingBox() BoundingBox
	Polygon() []Point
	EntityTypes() []EntityType
}
Table represents a table structure
type TextractBlock ¶
type TextractBlock struct {
	BlockType     *string
	Confidence    *float64
	Text          *string
	RowIndex      *int64
	ColumnIndex   *int64
	RowSpan       *int64
	ColumnSpan    *int64
	Geometry      *TextractGeometry
	ID            *string
	Relationships []*TextractRelationship
	EntityTypes   []*string
	Page          *int64
}
TextractBlock represents the raw AWS Textract Block structure
type TextractBoundingBox ¶
TextractBoundingBox represents a coarse-grained boundary
type TextractDocument ¶
type TextractDocument struct {
	DocumentMetadata *TextractDocumentMetadata
	Blocks           []*TextractBlock
}
TextractDocument represents the top-level document structure
func ConvertFromAWS ¶
func ConvertFromAWS(response *textract.GetDocumentAnalysisOutput) (*TextractDocument, error)
ConvertFromAWS converts from AWS Textract types to our internal types
type TextractDocumentMetadata ¶
type TextractDocumentMetadata struct {
Pages *int64
}
TextractDocumentMetadata contains document-level metadata
type TextractGeometry ¶
type TextractGeometry struct {
	BoundingBox *TextractBoundingBox
	Polygon     []*TextractPoint
}
TextractGeometry represents the position information for a block
type TextractPoint ¶
TextractPoint represents a coordinate pair
type TextractRelationship ¶
TextractRelationship represents a relationship between blocks
type TextractResponse ¶
type TextractResponse struct {
	Blocks   []*textract.Block
	Metadata *textract.DocumentMetadata
}
TextractResponse represents the raw response from AWS Textract