Documentation
¶
Index ¶
Constants ¶
// XML namespace, schema instance and schema location used for PAGE XML
// documents.
const (
	// XMLNameSpace is the PAGE content namespace URI.
	XMLNameSpace = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15"

	// XMLSchemaInstance is the W3C XML Schema instance namespace URI.
	XMLSchemaInstance = "http://www.w3.org/2001/XMLSchema-instance"

	// XMLSchemaLocation is the namespace URI followed by a single space and
	// the URI of the pagecontent.xsd schema file below that namespace.
	XMLSchemaLocation = XMLNameSpace + " " + XMLNameSpace + "/pagecontent.xsd"
)
XML namespace, schema instance and location.
Variables ¶
// PcGtsXMLHeader lists the default XML attributes for the namespace header:
// the namespace value, the xmlns:xsi declaration and the xsi:schemaLocation.
var PcGtsXMLHeader = []xml.Attr{
	// NOTE(review): this entry has a zero Name (only Value is set) —
	// confirm that is intended wherever this slice is encoded.
	{Value: XMLNameSpace},
	{Name: xml.Name{Space: "xmlns", Local: "xsi"}, Value: XMLSchemaInstance},
	{Name: xml.Name{Space: "xsi", Local: "schemaLocation"}, Value: XMLSchemaLocation},
}
PcGtsXMLHeader defines the default xml namespace header.
Functions ¶
This section is empty.
Types ¶
type AlternativeImage ¶ added in v0.1.1
// AlternativeImage holds the attributes of an optional alternative image entry.
type AlternativeImage struct {
// Filename maps to the "filename" XML attribute.
Filename string `xml:"filename,attr"`
// Comments maps to the "comments" XML attribute.
Comments string `xml:"comments,attr"`
// Conf maps to the "conf" XML attribute (presumably a confidence value — TODO confirm range).
Conf float64 `xml:"conf,attr"`
}
AlternativeImage defines optional alternative image information.
type Coords ¶
Coords are rectangles of points.
func (*Coords) MarshalXML ¶
MarshalXML marshals a Coords instance. <Coords points="x0,y0 x1,y1 x2,y2,..."/>
func (*Coords) UnmarshalXML ¶
UnmarshalXML unmarshals a Coords instance.
type Metadata ¶
Metadata defines the metadata of a PcGts structure.
func (Metadata) MarshalXML ¶
MarshalXML marshals the Metadata of a PcGts structure to xml. <Metadata> <Creator>OCR-D</Creator> ... </Metadata>
func (Metadata) UnmarshalXML ¶
UnmarshalXML unmarshals the Metadata of a PcGts structure from xml.
type OrderedGroup ¶
// OrderedGroup is a collection of indexed region references.
type OrderedGroup struct {
// ID maps to the "id" XML attribute.
ID string `xml:"id,attr"`
// Caption maps to the "caption" XML attribute.
Caption string `xml:"caption,attr"`
// RegionRefIndexed holds the group's indexed region references (no struct tag).
RegionRefIndexed []RegionRefIndexed
}
OrderedGroup is a collection of regions.
type Page ¶
// Page is a page in a PcGts structure.
type Page struct {
// ImageFilename maps to the "imageFilename" XML attribute.
ImageFilename string `xml:"imageFilename,attr"`
// ImageHeight maps to the "imageHeight" XML attribute.
ImageHeight int `xml:"imageHeight,attr"`
// ImageWidth maps to the "imageWidth" XML attribute.
ImageWidth int `xml:"imageWidth,attr"`
// Type maps to the "type" XML attribute.
Type string `xml:"type,attr"`
// PrintSpace is the page's print space (no struct tag).
PrintSpace PrintSpace
// ReadingOrder is the page's reading order (no struct tag).
ReadingOrder ReadingOrder
// TextRegion holds the page's text regions (no struct tag).
TextRegion []TextRegion
}
Page is a page in a PcGts structure.
type PcGts ¶
// PcGts is the top level node of page XML files.
type PcGts struct {
// Attributes is tagged ",attr" without an explicit attribute name.
Attributes []xml.Attr `xml:",attr"`
// Metadata maps to the "Metadata" child element.
Metadata Metadata `xml:"Metadata"`
// Page maps to the "Page" child element.
Page Page `xml:"Page"`
}
PcGts is the top level node of page XML files.
func OpenFromHOCR ¶
OpenFromHOCR reads a hOCR file. Returns the hOCR content as PageXML structure. This method assumes one page per hOCR document.
func ReadFromHOCR ¶
ReadFromHOCR parses a hOCR file. Returns the hOCR content as PageXML structure. This method assumes one page per hOCR document.
func (*PcGts) UnmarshalXML ¶
UnmarshalXML unmarshals the top-level PcGts node of page xml files.
type PrintSpace ¶
// PrintSpace defines the print space of a page.
type PrintSpace struct {
// Coords holds the coordinates of the print space (no struct tag).
Coords Coords
}
PrintSpace defines the print space of a page.
type ReadingOrder ¶
// ReadingOrder is a collection of ordered groups.
type ReadingOrder struct {
// OrderedGroup holds the ordered groups (no struct tag).
OrderedGroup []OrderedGroup
}
ReadingOrder is a collection of ordered groups.
type RegionRefIndexed ¶
// RegionRefIndexed pairs an index with a region reference.
type RegionRefIndexed struct {
// Index maps to the "index" XML attribute.
Index int `xml:"index,attr"`
// RegionRef maps to the "regionRef" XML attribute
// (presumably the id of the referenced region — TODO confirm).
RegionRef string `xml:"regionRef,attr"`
}
RegionRefIndexed is an indexed reference to a region.
type TextEquiv ¶
// TextEquiv defines the text string of text regions.
type TextEquiv struct {
// PlainText holds plain-text strings (no struct tag).
PlainText []string
// Unicode holds Unicode strings (no struct tag).
Unicode []string
// DT maps to the "dataType" XML attribute; omitted when empty.
DT string `xml:"dataType,attr,omitempty"`
// DTD maps to the "dataTypeDetails" XML attribute; omitted when empty.
DTD string `xml:"dataTypeDetails,attr,omitempty"`
// Index maps to the "index" XML attribute; omitted when zero.
Index int `xml:"index,attr,omitempty"`
// Conf maps to the "conf" XML attribute; omitted when zero.
Conf float64 `xml:"conf,attr,omitempty"`
}
TextEquiv defines the text string of text regions.
type TextLine ¶
// TextLine is a line of text in a text region.
type TextLine struct {
// TextRegionBase is embedded, so its fields are promoted onto TextLine.
TextRegionBase
// BaseLine maps to the "Baseline" child element.
BaseLine Coords `xml:"Baseline"`
// Word holds the words of the line (no struct tag).
Word []Word
}
TextLine is a line of text in a text region.
func (*TextLine) UpdateWords ¶
UpdateWords updates the words of this text line using the given string (most likely TextEquiv.Unicode[0]). Any existing words are discarded. The bounding boxes are calculated approximately, based on the number of (Unicode) characters in each word.
type TextRegion ¶
// TextRegion is a region of text (paragraph, block, ...).
type TextRegion struct {
// TextRegionBase is embedded, so its fields are promoted onto TextRegion.
TextRegionBase
// Type maps to the "type" XML attribute.
Type string `xml:"type,attr"`
// TextLine holds the text lines of the region (no struct tag).
TextLine []TextLine
}
TextRegion is a region of text (paragraph, block, ...)
type TextRegionBase ¶
// TextRegionBase is the base data structure embedded by TextRegion and TextLine.
type TextRegionBase struct {
// ID maps to the "id" XML attribute.
ID string `xml:"id,attr"`
// Custom maps to the "custom" XML attribute.
Custom string `xml:"custom,attr"`
// Coords holds the coordinates of the region (no struct tag).
Coords Coords
// TextStyle holds the font information of the region (no struct tag).
TextStyle TextStyle
TextEquiv TextEquiv // TODO: multiple TextEquivs are allowed
// AlternativeImage maps to the "AlternativeImage" child element; being a
// pointer tagged omitempty, it is left out when nil.
AlternativeImage *AlternativeImage `xml:"AlternativeImage,omitempty"`
}
TextRegionBase defines the base data structure for all text regions (TextRegion, Line, Word, Glyph) in a page XML document.
type TextStyle ¶
// TextStyle specifies font information of any text region.
// Every field is an XML attribute tagged omitempty.
type TextStyle struct {
// FontFamaily maps to the "fontFamily" XML attribute.
// NOTE(review): the field name is misspelled ("Famaily"); renaming it would
// break callers, so it is left unchanged.
FontFamaily string `xml:"fontFamily,attr,omitempty"`
// Serif maps to the "serif" XML attribute.
Serif bool `xml:"serif,attr,omitempty"`
// Monospace maps to the "monospace" XML attribute.
Monospace bool `xml:"monospace,attr,omitempty"`
// FontSize maps to the "fontSize" XML attribute.
FontSize float32 `xml:"fontSize,attr,omitempty"`
// Kerning maps to the "kerning" XML attribute.
Kerning int `xml:"kerning,attr,omitempty"`
// TextColor maps to the "textColour" XML attribute (British spelling in the tag).
TextColor string `xml:"textColour,attr,omitempty"`
// TextColorRGB maps to the "textColourRgb" XML attribute.
TextColorRGB int `xml:"textColourRgb,attr,omitempty"`
}
TextStyle specifies font information of any text region.