parser

package

Version: v0.6.0 (latest) · Published: Feb 24, 2026 · License: MIT · Imports: 15 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/coregx/gxpdf

Links

Open Source Insights

Documentation ¶

Overview ¶

Package parser implements PDF lexical analysis (tokenization) according to PDF 1.7 specification, Section 7.2 (Lexical Conventions).

Package parser implements the PDF primitive object types as defined in the PDF specification.

The PDF specification defines 8 basic object types:

- Boolean values
- Integer and Real numbers
- Strings (literal and hexadecimal)
- Names
- Arrays
- Dictionaries
- Streams
- The null object

Reference: PDF 1.7 specification, Section 7.3 "Objects"

Package parser implements PDF document reading and parsing.

Package parser implements PDF cross-reference table parsing according to PDF 1.7 specification, Sections 7.5.4–7.5.8 (File Structure).

Index ¶

Constants
func IsContentStreamOperator(s string) bool
func IsKeyword(s string) bool
func ReadPDFInfo(filename string) (version string, pageCount int, err error)
type Array
- func NewArray() *Array
- func NewArrayFromSlice(objects []PdfObject) *Array
- func NewArrayWithCapacity(capacity int) *Array
- func (a *Array) Append(obj PdfObject)
- func (a *Array) AppendAll(objects ...PdfObject)
- func (a *Array) Clear()
- func (a *Array) Clone() *Array
- func (a *Array) Elements() []PdfObject
- func (a *Array) Get(i int) PdfObject
- func (a *Array) Insert(i int, obj PdfObject) error
- func (a *Array) Len() int
- func (a *Array) Remove(i int) error
- func (a *Array) Set(i int, obj PdfObject) error
- func (a *Array) String() string
- func (a *Array) WriteTo(w io.Writer) (int64, error)
type Boolean
- func NewBoolean(value bool) *Boolean
- func (b *Boolean) String() string
- func (b *Boolean) Value() bool
- func (b *Boolean) WriteTo(w io.Writer) (int64, error)
type Dictionary
- func NewDictionary() *Dictionary
- func NewDictionaryWithCapacity(capacity int) *Dictionary
- func (d *Dictionary) Clear()
- func (d *Dictionary) Clone() *Dictionary
- func (d *Dictionary) Get(key string) PdfObject
- func (d *Dictionary) GetArray(key string) *Array
- func (d *Dictionary) GetBoolean(key string) bool
- func (d *Dictionary) GetDictionary(key string) *Dictionary
- func (d *Dictionary) GetInteger(key string) int64
- func (d *Dictionary) GetName(key string) *Name
- func (d *Dictionary) GetReal(key string) float64
- func (d *Dictionary) GetString(key string) string
- func (d *Dictionary) Has(key string) bool
- func (d *Dictionary) Keys() []string
- func (d *Dictionary) KeysSorted() []string
- func (d *Dictionary) Len() int
- func (d *Dictionary) Merge(other *Dictionary)
- func (d *Dictionary) Remove(key string)
- func (d *Dictionary) Set(key string, value PdfObject)
- func (d *Dictionary) SetBoolean(key string, value bool)
- func (d *Dictionary) SetInteger(key string, value int64)
- func (d *Dictionary) SetName(key, value string)
- func (d *Dictionary) SetReal(key string, value float64)
- func (d *Dictionary) SetString(key, value string)
- func (d *Dictionary) String() string
- func (d *Dictionary) WriteTo(w io.Writer) (int64, error)
type DocInfo
type IndirectObject
- func NewIndirectObject(number, generation int, obj PdfObject) *IndirectObject
- func (o *IndirectObject) String() string
- func (o *IndirectObject) WriteTo(w io.Writer) (int64, error)
type IndirectReference
- func NewIndirectReference(number, generation int) *IndirectReference
- func (r *IndirectReference) Clone() *IndirectReference
- func (r *IndirectReference) Equals(other *IndirectReference) bool
- func (r *IndirectReference) String() string
- func (r *IndirectReference) WriteTo(w io.Writer) (int64, error)
type Integer
- func NewInteger(value int64) *Integer
- func (i *Integer) Int() int
- func (i *Integer) String() string
- func (i *Integer) Value() int64
- func (i *Integer) WriteTo(w io.Writer) (int64, error)
type Lexer
- func NewLexer(r io.Reader) *Lexer
- func (l *Lexer) NextToken() (Token, error)
- func (l *Lexer) Peek() (Token, error)
- func (l *Lexer) Position() (line, column int)
- func (l *Lexer) ReadAll() ([]Token, error)
- func (l *Lexer) Reset(r io.Reader)
type Name
- func NewName(value string) *Name
- func (n *Name) Equals(other *Name) bool
- func (n *Name) String() string
- func (n *Name) Value() string
- func (n *Name) WriteTo(w io.Writer) (int64, error)
type Null
- func NewNull() *Null
- func (n *Null) String() string
- func (n *Null) WriteTo(w io.Writer) (int64, error)
type Parser
- func NewParser(r io.Reader) *Parser
- func NewParserFromLexer(lexer *Lexer) *Parser
- func (p *Parser) ParseIndirectObject() (*IndirectObject, error)
- func (p *Parser) ParseObject() (PdfObject, error)
- func (p *Parser) ParseObjectStream(decodedData []byte, numObjects, firstOffset int) (map[int]PdfObject, error)
- func (p *Parser) ParseStartXRef() (int64, error)
- func (p *Parser) ParseXRef() (*XRefTable, error)
- func (p *Parser) ParseXRefStream() (*XRefTable, error)
- func (p *Parser) ParseXRefStreamWithFileAccess(file io.ReadSeeker, xrefOffset int64) (*XRefTable, error)
- func (p *Parser) Position() (line, column int)
- func (p *Parser) Reset(r io.Reader)
type PdfObject
- func Clone(obj PdfObject) PdfObject
- func Resolve(obj PdfObject) PdfObject
type Reader
- func NewReader(filename string) *Reader
- func OpenPDF(filename string) (*Reader, error)
- func OpenPDFWithPassword(filename, password string) (*Reader, error)
- func (r *Reader) Close() error
- func (r *Reader) GetAcroForm() (*Dictionary, error)
- func (r *Reader) GetCatalog() (*Dictionary, error)
- func (r *Reader) GetDocumentInfo() DocInfo
- func (r *Reader) GetObject(objectNum int) (PdfObject, error)
- func (r *Reader) GetPage(pageNum int) (*Dictionary, error)
- func (r *Reader) GetPageCount() (int, error)
- func (r *Reader) GetPages() (*Dictionary, error)
- func (r *Reader) Open() error
- func (r *Reader) ResolveArray(obj PdfObject) (*Array, error)
- func (r *Reader) ResolveReferences(obj PdfObject) PdfObject
- func (r *Reader) String() string
- func (r *Reader) Trailer() *Dictionary
- func (r *Reader) Version() string
- func (r *Reader) XRefTable() *XRefTable
type Real
- func NewReal(value float64) *Real
- func (r *Real) String() string
- func (r *Real) Value() float64
- func (r *Real) WriteTo(w io.Writer) (int64, error)
type Stream
- func NewStream(dict *Dictionary, content []byte) *Stream
- func (s *Stream) Bytes() []byte
- func (s *Stream) Clone() *Stream
- func (s *Stream) Content() []byte
- func (s *Stream) Decode() ([]byte, error)
- func (s *Stream) Dictionary() *Dictionary
- func (s *Stream) Encode(_ []string) error
- func (s *Stream) GetDecodeParams() PdfObject
- func (s *Stream) GetFilter() PdfObject
- func (s *Stream) Length() int64
- func (s *Stream) Reader() io.Reader
- func (s *Stream) SetContent(content []byte)
- func (s *Stream) String() string
- func (s *Stream) WriteTo(w io.Writer) (int64, error)
type String
- func NewHexString(value string) *String
- func NewString(value string) *String
- func NewStringBytes(value []byte) *String
- func (s *String) Bytes() []byte
- func (s *String) IsHex() bool
- func (s *String) String() string
- func (s *String) Value() string
- func (s *String) WriteTo(w io.Writer) (int64, error)
type Token
- func EOFToken(line, column int) Token
- func ErrorToken(msg string, line, column int) Token
- func NewToken(typ TokenType, value string, line, column int) Token
- func Tokenize(input string) ([]Token, error)
- func (t Token) String() string
type TokenType
- func (t TokenType) String() string
type Type
- func TypeOf(obj PdfObject) Type
- func (t Type) String() string
type XRefEntry
- func NewXRefEntry(objectNum int, entryType XRefEntryType, offset int64, generation int) *XRefEntry
- func (e *XRefEntry) IsFree() bool
- func (e *XRefEntry) IsInUse() bool
- func (e *XRefEntry) String() string
type XRefEntryType
- func (t XRefEntryType) String() string
type XRefStream
- func NewXRefStream(stream *Stream) *XRefStream
type XRefTable
- func NewXRefTable() *XRefTable
- func (t *XRefTable) AddEntry(entry *XRefEntry)
- func (t *XRefTable) GetEntry(objectNum int) (*XRefEntry, bool)
- func (t *XRefTable) GetFreeEntries() []*XRefEntry
- func (t *XRefTable) GetInUseEntries() []*XRefEntry
- func (t *XRefTable) GetTrailer() *Dictionary
- func (t *XRefTable) HasObject(objectNum int) bool
- func (t *XRefTable) MergeOlder(older *XRefTable)
- func (t *XRefTable) SetTrailer(trailer *Dictionary)
- func (t *XRefTable) Size() int
- func (t *XRefTable) String() string

Constants ¶

View Source

const (
	KeywordObj       = "obj"
	KeywordEndobj    = "endobj"
	KeywordStream    = "stream"
	KeywordEndstream = "endstream"
	KeywordXref      = "xref"
	KeywordTrailer   = "trailer"
	KeywordStartxref = "startxref"
)

PDF keyword string constants.

Variables ¶

This section is empty.

Functions ¶

func IsContentStreamOperator ¶

func IsContentStreamOperator(s string) bool

IsContentStreamOperator checks if a string is a PDF content stream operator. These are operators used in content streams for graphics and text operations.

Reference: PDF 1.7 specification, Appendix A (Operator Summary).

func IsKeyword ¶

func IsKeyword(s string) bool

IsKeyword checks if a string is a PDF keyword.

func ReadPDFInfo ¶

func ReadPDFInfo(filename string) (version string, pageCount int, err error)

ReadPDFInfo is a convenience function that reads basic PDF information without loading the entire document structure.

Returns: version, page count, error.

This is useful for quickly checking PDF properties without loading all objects into memory.

Types ¶

type Array ¶

type Array struct {
	// contains filtered or unexported fields
}

Array represents a PDF array object. Arrays are ordered collections of objects, written as [obj1 obj2 obj3]. Arrays can contain any PDF objects, including other arrays and dictionaries.

func NewArrayFromSlice ¶

func NewArrayFromSlice(objects []PdfObject) *Array

NewArrayFromSlice creates an Array from a slice of objects.

func NewArrayWithCapacity ¶

func NewArrayWithCapacity(capacity int) *Array

NewArrayWithCapacity creates a new Array with specified capacity.

func (*Array) Append ¶

func (a *Array) Append(obj PdfObject)

Append adds an element to the end of the array.

func (*Array) AppendAll ¶

func (a *Array) AppendAll(objects ...PdfObject)

AppendAll adds multiple elements to the end of the array.

func (*Array) Clear ¶

func (a *Array) Clear()

Clear removes all elements from the array.

func (*Array) Clone ¶

func (a *Array) Clone() *Array

Clone creates a deep copy of the array.

func (*Array) Elements ¶

func (a *Array) Elements() []PdfObject

Elements returns a copy of all elements. Returns a new slice to prevent external modification.

func (*Array) Get ¶

func (a *Array) Get(i int) PdfObject

Get returns the element at index i. Returns nil if index is out of bounds.

func (*Array) Insert ¶

func (a *Array) Insert(i int, obj PdfObject) error

Insert inserts an element at index i.

func (*Array) Len ¶

func (a *Array) Len() int

Len returns the number of elements in the array.

func (*Array) Remove ¶

func (a *Array) Remove(i int) error

Remove removes the element at index i.

func (*Array) Set ¶

func (a *Array) Set(i int, obj PdfObject) error

Set sets the element at index i. Returns error if index is out of bounds.

func (*Array) String ¶

func (a *Array) String() string

String returns a string representation of the array.

func (*Array) WriteTo ¶

func (a *Array) WriteTo(w io.Writer) (int64, error)

WriteTo writes the PDF representation of the array to w.

type Boolean ¶

type Boolean struct {
	// contains filtered or unexported fields
}

Boolean represents a PDF boolean value (true or false).

func (*Boolean) String ¶

func (b *Boolean) String() string

String returns "true" or "false".

func (*Boolean) Value ¶

func (b *Boolean) Value() bool

Value returns the boolean value.

func (*Boolean) WriteTo ¶

func (b *Boolean) WriteTo(w io.Writer) (int64, error)

WriteTo writes "true" or "false" to w.

type Dictionary ¶

type Dictionary struct {
	// contains filtered or unexported fields
}

Dictionary represents a PDF dictionary object. Dictionaries are associative tables, written as << /Key1 value1 /Key2 value2 >>. Keys are always Name objects; values can be any PDF object.

func NewDictionary ¶

func NewDictionary() *Dictionary

NewDictionary creates a new empty Dictionary.

func NewDictionaryWithCapacity ¶

func NewDictionaryWithCapacity(capacity int) *Dictionary

NewDictionaryWithCapacity creates a new Dictionary with specified capacity.

func (*Dictionary) Clear ¶

func (d *Dictionary) Clear()

Clear removes all entries from the dictionary.

func (*Dictionary) Clone ¶

func (d *Dictionary) Clone() *Dictionary

Clone creates a deep copy of the dictionary.

func (*Dictionary) Get ¶

func (d *Dictionary) Get(key string) PdfObject

Get returns the value for a key. Returns nil if key doesn't exist.

func (*Dictionary) GetArray ¶

func (d *Dictionary) GetArray(key string) *Array

GetArray is a convenience method to get an Array value. Returns nil if key doesn't exist or value is not an Array.

func (*Dictionary) GetBoolean ¶

func (d *Dictionary) GetBoolean(key string) bool

GetBoolean is a convenience method to get a Boolean value. Returns false if key doesn't exist or value is not a Boolean.

func (*Dictionary) GetDictionary ¶

func (d *Dictionary) GetDictionary(key string) *Dictionary

GetDictionary is a convenience method to get a Dictionary value. Returns nil if key doesn't exist or value is not a Dictionary.

func (*Dictionary) GetInteger ¶

func (d *Dictionary) GetInteger(key string) int64

GetInteger is a convenience method to get an Integer value. Returns 0 if key doesn't exist or value is not an Integer.

func (*Dictionary) GetName ¶

func (d *Dictionary) GetName(key string) *Name

GetName is a convenience method to get a Name value. Returns nil if key doesn't exist or value is not a Name.

func (*Dictionary) GetReal ¶

func (d *Dictionary) GetReal(key string) float64

GetReal is a convenience method to get a Real value. Returns 0.0 if key doesn't exist or value is not a Real.

func (*Dictionary) GetString ¶

func (d *Dictionary) GetString(key string) string

GetString is a convenience method to get a String value. Returns empty string if key doesn't exist or value is not a String.

func (*Dictionary) Has ¶

func (d *Dictionary) Has(key string) bool

Has checks if a key exists in the dictionary.

func (*Dictionary) Keys ¶

func (d *Dictionary) Keys() []string

Keys returns all keys in insertion order.

func (*Dictionary) KeysSorted ¶

func (d *Dictionary) KeysSorted() []string

KeysSorted returns all keys in alphabetical order.

func (*Dictionary) Len ¶

func (d *Dictionary) Len() int

Len returns the number of entries in the dictionary.

func (*Dictionary) Merge ¶

func (d *Dictionary) Merge(other *Dictionary)

Merge merges another dictionary into this one. Existing keys are overwritten.

func (*Dictionary) Remove ¶

func (d *Dictionary) Remove(key string)

Remove removes a key from the dictionary.

func (*Dictionary) Set ¶

func (d *Dictionary) Set(key string, value PdfObject)

Set sets a key-value pair in the dictionary. If key already exists, its value is replaced.

func (*Dictionary) SetBoolean ¶

func (d *Dictionary) SetBoolean(key string, value bool)

SetBoolean is a convenience method to set a Boolean value.

func (*Dictionary) SetInteger ¶

func (d *Dictionary) SetInteger(key string, value int64)

SetInteger is a convenience method to set an Integer value.

func (*Dictionary) SetName ¶

func (d *Dictionary) SetName(key, value string)

SetName is a convenience method to set a Name value.

func (*Dictionary) SetReal ¶

func (d *Dictionary) SetReal(key string, value float64)

SetReal is a convenience method to set a Real value.

func (*Dictionary) SetString ¶

func (d *Dictionary) SetString(key, value string)

SetString is a convenience method to set a String value.

func (*Dictionary) String ¶

func (d *Dictionary) String() string

String returns a string representation of the dictionary.

func (*Dictionary) WriteTo ¶

func (d *Dictionary) WriteTo(w io.Writer) (int64, error)

WriteTo writes the PDF representation of the dictionary to w.

type DocInfo ¶

type DocInfo struct {
	Version   string
	Title     string
	Author    string
	Subject   string
	Keywords  string
	Creator   string
	Producer  string
	Encrypted bool
}

DocInfo contains document metadata from the Info dictionary.

type IndirectObject ¶

type IndirectObject struct {
	Number     int       // Object number
	Generation int       // Generation number
	Object     PdfObject // The actual object
}

IndirectObject represents an indirect PDF object. Format: objNum genNum obj ... endobj

Example: 1 0 obj (Hello) endobj

Reference: PDF 1.7 specification, Section 7.3.10 (Indirect Objects).

func NewIndirectObject ¶

func NewIndirectObject(number, generation int, obj PdfObject) *IndirectObject

NewIndirectObject creates a new indirect object.

func (*IndirectObject) String ¶

func (o *IndirectObject) String() string

String returns a string representation of the indirect object.

func (*IndirectObject) WriteTo ¶

func (o *IndirectObject) WriteTo(w io.Writer) (int64, error)

WriteTo writes the PDF representation to w.

type IndirectReference ¶

type IndirectReference struct {
	Number     int // Object number being referenced
	Generation int // Generation number being referenced
}

IndirectReference represents a reference to an indirect object. Format: objNum genNum R

Example: 1 0 R (refers to object 1, generation 0)

Reference: PDF 1.7 specification, Section 7.3.10 (Indirect Objects).

func NewIndirectReference ¶

func NewIndirectReference(number, generation int) *IndirectReference

NewIndirectReference creates a new indirect reference.

func (*IndirectReference) Clone ¶

func (r *IndirectReference) Clone() *IndirectReference

Clone creates a copy of the indirect reference.

func (*IndirectReference) Equals ¶

func (r *IndirectReference) Equals(other *IndirectReference) bool

Equals checks if two references point to the same object.

func (*IndirectReference) String ¶

func (r *IndirectReference) String() string

String returns the PDF representation of the reference.

func (*IndirectReference) WriteTo ¶

func (r *IndirectReference) WriteTo(w io.Writer) (int64, error)

WriteTo writes the PDF representation to w.

type Integer ¶

type Integer struct {
	// contains filtered or unexported fields
}

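Integer represents a PDF integer object. The value is stored as an int64. Note that the implementation limits listed in the PDF 1.7 specification (Annex C) only cover the signed 32-bit range, so larger values may not be portable across PDF consumers.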

func (*Integer) Int ¶

func (i *Integer) Int() int

Int returns the value as int (may overflow on 32-bit systems).

func (*Integer) String ¶

func (i *Integer) String() string

String returns the string representation of the integer.

func (*Integer) Value ¶

func (i *Integer) Value() int64

Value returns the integer value.

func (*Integer) WriteTo ¶

func (i *Integer) WriteTo(w io.Writer) (int64, error)

WriteTo writes the integer to w.

type Lexer ¶

type Lexer struct {
	// contains filtered or unexported fields
}

Lexer tokenizes a PDF byte stream according to PDF 1.7 specification, Section 7.2 (Lexical Conventions).

func NewLexer ¶

func NewLexer(r io.Reader) *Lexer

NewLexer creates a new lexer that reads from the given reader.

func (*Lexer) NextToken ¶

func (l *Lexer) NextToken() (Token, error)

NextToken returns the next token from the input stream.

func (*Lexer) Peek ¶

func (l *Lexer) Peek() (Token, error)

Peek returns the next token without consuming it.

func (*Lexer) Position ¶

func (l *Lexer) Position() (line, column int)

Position returns the current line and column.

func (*Lexer) ReadAll ¶

func (l *Lexer) ReadAll() ([]Token, error)

ReadAll reads all tokens from the input until EOF. Useful for debugging and testing.

func (*Lexer) Reset ¶

func (l *Lexer) Reset(r io.Reader)

Reset resets the lexer to read from a new reader.

type Name ¶

type Name struct {
	// contains filtered or unexported fields
}

Name represents a PDF name object. Names are unique identifiers and always start with '/'.

func NewName ¶

func NewName(value string) *Name

NewName creates a new Name object. The leading '/' is added automatically if not present.

func (*Name) Equals ¶

func (n *Name) Equals(other *Name) bool

Equals checks if two names are equal.

func (*Name) String ¶

func (n *Name) String() string

String returns the name with leading '/'.

func (*Name) Value ¶

func (n *Name) Value() string

Value returns the name without the leading '/'.

func (*Name) WriteTo ¶

func (n *Name) WriteTo(w io.Writer) (int64, error)

WriteTo writes the name to w.

func (*Null) String ¶

func (n *Null) String() string

String returns "null".

func (*Null) WriteTo ¶

func (n *Null) WriteTo(w io.Writer) (int64, error)

WriteTo writes "null" to w.

type Parser ¶

type Parser struct {
	// contains filtered or unexported fields
}

Parser parses PDF objects from a token stream. It builds higher-level objects (arrays, dictionaries, streams, indirect objects) from tokens produced by the Lexer.

Reference: PDF 1.7 specification, Section 7.3 (Objects).

func NewParser ¶

func NewParser(r io.Reader) *Parser

NewParser creates a new parser that reads from the given reader.

func NewParserFromLexer ¶

func NewParserFromLexer(lexer *Lexer) *Parser

NewParserFromLexer creates a new parser from an existing lexer.

func (*Parser) ParseIndirectObject ¶

func (p *Parser) ParseIndirectObject() (*IndirectObject, error)

ParseIndirectObject parses an indirect object: N G obj ... endobj.

func (*Parser) ParseObject ¶

func (p *Parser) ParseObject() (PdfObject, error)

ParseObject parses any PDF direct object. Returns the parsed object or an error.

func (*Parser) ParseObjectStream ¶

func (p *Parser) ParseObjectStream(decodedData []byte, numObjects, firstOffset int) (map[int]PdfObject, error)

ParseObjectStream parses an Object Stream (PDF 1.5+) and returns the contained objects.

Object Streams compress multiple objects together for efficiency. The format is:

N 0 obj
<< /Type /ObjStm /N numObjects /First firstByteOffset /Length ... >>
stream
obj1_num offset1 obj2_num offset2 ... objN_num offsetN
[object1_data] [object2_data] ... [objectN_data]
endstream
endobj

The first part contains pairs of (object_number, offset_from_First). The second part (starting at /First) contains the actual object data.

Reference: PDF 1.7 specification, Section 7.5.7 (Object Streams).

Parameters:

decodedData: The decoded stream content (after decompression)
numObjects: The /N value (number of objects)
firstOffset: The /First value (offset to first object data)

Returns: Map of object number -> parsed object.

func (*Parser) ParseStartXRef ¶

func (p *Parser) ParseStartXRef() (int64, error)

ParseStartXRef parses the startxref section at the end of a PDF file.

Expected format:

startxref
byte_offset
%%EOF

Returns the byte offset of the cross-reference table.

Reference: PDF 1.7 specification, Section 7.5.5 (File Trailer).

func (*Parser) ParseXRef ¶

func (p *Parser) ParseXRef() (*XRefTable, error)

ParseXRef parses a cross-reference table and trailer.

Handles both traditional xref tables (PDF < 1.5) and xref streams (PDF 1.5+):

Traditional format:

xref
startNum count
offset1 generation1 type1
offset2 generation2 type2
...
trailer
<< trailer dictionary >>

XRef stream format (PDF 1.5+):

90 0 obj
<< /Type /XRef /Size 100 /W [1 3 2] ... >>
stream
...compressed xref data...
endstream
endobj

Reference: PDF 1.7 specification, Sections 7.5.4 (Cross-Reference Table) and 7.5.8 (Cross-Reference Streams).

func (*Parser) ParseXRefStream ¶

func (p *Parser) ParseXRefStream() (*XRefTable, error)

ParseXRefStream parses a cross-reference stream object (PDF 1.5+).

When a PDF uses xref streams, the startxref pointer points to an indirect object (e.g., "90 0 obj") instead of the "xref" keyword.

The stream dictionary contains:

/Type /XRef
/Size: number of entries
/W [w1 w2 w3]: field widths for parsing binary data
/Index: optional array of [start count ...] pairs (default [0 Size])
Trailer entries: /Root, /Info, /ID, etc.

Reference: PDF 1.7 specification, Section 7.5.8 (Cross-Reference Streams).

func (*Parser) ParseXRefStreamWithFileAccess ¶

func (p *Parser) ParseXRefStreamWithFileAccess(file io.ReadSeeker, xrefOffset int64) (*XRefTable, error)

ParseXRefStreamWithFileAccess parses a cross-reference stream with direct file access.

This version is used when we have access to an io.ReadSeeker (file handle) and can seek to the exact stream data position, avoiding lexer buffer issues.

func (*Parser) Position ¶

func (p *Parser) Position() (line, column int)

Position returns the current parser position (line, column).

func (*Parser) Reset ¶

func (p *Parser) Reset(r io.Reader)

Reset resets the parser with a new reader.

type PdfObject ¶

type PdfObject interface {
	// String returns a string representation of the object.
	String() string

	// WriteTo writes the PDF representation to w.
	// Returns the number of bytes written and any error.
	WriteTo(w io.Writer) (int64, error)
}

PdfObject is the base interface for all PDF objects. All PDF primitive types implement this interface.

func Clone ¶

func Clone(obj PdfObject) PdfObject

Clone creates a deep copy of a PDF object. This is useful when you need to modify an object without affecting the original.

func Resolve ¶

func Resolve(obj PdfObject) PdfObject

Resolve resolves indirect references to direct objects. For direct objects, it returns the object itself.

Note: Full indirect object support (e.g., "1 0 R" references) will be implemented in Phase 2 (PDF Parser) as part of the document reader. See SUMMARY.md for the complete roadmap.

type Reader ¶

type Reader struct {
	// contains filtered or unexported fields
}

Reader reads and parses PDF documents, providing access to document structure.

The Reader ties together all parser components (Lexer, Parser, XRef) to read actual PDF files according to PDF 1.7 specification.

PDF File Structure (Section 7.5):

Header: %PDF-X.Y
Body: Indirect objects
Cross-reference table: Object locations
Trailer: Document metadata
startxref: XRef table offset
%%EOF: End of file marker

Thread Safety: Reader is thread-safe for concurrent reads using sync.RWMutex for cache and sync.Mutex for file access. Multiple goroutines can safely call GetObject() simultaneously.

Reference: PDF 1.7 specification, Section 7.5 (File Structure).

func NewReader ¶

func NewReader(filename string) *Reader

NewReader creates a new PDF document reader.

The filename is stored but the file is not opened until Open() is called. This allows for resource management and lazy loading.

func OpenPDF ¶

func OpenPDF(filename string) (*Reader, error)

OpenPDF is a convenience function that creates a Reader and opens the PDF.

This is equivalent to:

reader := NewReader(filename)
err := reader.Open()

Remember to call Close() when done:

defer reader.Close()

func OpenPDFWithPassword ¶ added in v0.6.0

func OpenPDFWithPassword(filename, password string) (*Reader, error)

OpenPDFWithPassword is a convenience function that creates a Reader and opens an encrypted PDF with the given password.

For PDFs with an empty user password (permissions-only encryption), use OpenPDF instead — it handles empty passwords transparently.

Example:

reader, err := parser.OpenPDFWithPassword("encrypted.pdf", "secret")
if err != nil {
    log.Fatal(err)
}
defer reader.Close()

func (*Reader) Close ¶

func (r *Reader) Close() error

Close closes the PDF file and releases resources.

func (*Reader) GetAcroForm ¶ added in v0.2.0

func (r *Reader) GetAcroForm() (*Dictionary, error)

GetAcroForm returns the interactive form dictionary (AcroForm).

Returns nil if the document has no interactive form. The AcroForm dictionary contains form field definitions and settings.

Reference: PDF 1.7 specification, Section 12.7 (Interactive Forms).

func (*Reader) GetCatalog ¶

func (r *Reader) GetCatalog() (*Dictionary, error)

GetCatalog returns the document catalog (root object).

The catalog must be loaded via Open() before calling this method.

Reference: PDF 1.7 specification, Section 7.7.2 (Document Catalog).

func (*Reader) GetDocumentInfo ¶

func (r *Reader) GetDocumentInfo() DocInfo

GetDocumentInfo returns document metadata from the Info dictionary.

Reference: PDF 1.7 specification, Section 14.3.3 (Document Information Dictionary).

func (*Reader) GetObject ¶

func (r *Reader) GetObject(objectNum int) (PdfObject, error)

GetObject retrieves and resolves an indirect object by number.

The object is looked up in the cross-reference table, loaded from the file at the specified offset, and cached for future access.

For PDF 1.5+ compressed objects (stored in Object Streams), the method automatically loads and parses the containing ObjStm.

Nested indirect references are automatically resolved.

Thread-safe: Multiple goroutines can call this method concurrently.

Returns error if object is not found or cannot be parsed.

func (*Reader) GetPage ¶

func (r *Reader) GetPage(pageNum int) (*Dictionary, error)

GetPage returns the page dictionary for the specified page number.

Page numbers are 0-based (first page is 0).

The method traverses the page tree to find the requested page. The page tree can have intermediate nodes (/Type /Pages) and leaf nodes (/Type /Page).

Reference: PDF 1.7 specification, Section 7.7.3 (Page Tree).

func (*Reader) GetPageCount ¶

func (r *Reader) GetPageCount() (int, error)

GetPageCount returns the total number of pages in the document.

The count is read from the /Count entry in the page tree root.

Reference: PDF 1.7 specification, Section 7.7.3.2 (Page Tree Nodes).

func (*Reader) GetPages ¶

func (r *Reader) GetPages() (*Dictionary, error)

GetPages returns the page tree root.

The page tree is a hierarchical structure containing all pages.

Reference: PDF 1.7 specification, Section 7.7.3 (Page Tree).

func (*Reader) Open ¶

func (r *Reader) Open() error

Open opens the PDF file and parses its structure.

For encrypted PDFs with an empty user password (the most common case for "permissions-only" encryption), Open will transparently decrypt the document. For PDFs requiring a non-empty password, use OpenPDFWithPassword.

Steps performed:

1. Open file
2. Read and validate PDF header
3. Find startxref offset
4. Parse cross-reference table and trailer
5. Initialize decryption (if encrypted)
6. Load document catalog
7. Load page tree root

Returns error if file cannot be opened or is not a valid PDF.

Reference: PDF 1.7 specification, Section 7.5 (File Structure).

func (*Reader) ResolveArray ¶ added in v0.2.0

func (r *Reader) ResolveArray(obj PdfObject) (*Array, error)

ResolveArray resolves an object and ensures it's an array. This is the exported version of resolveArray.

func (*Reader) ResolveReferences ¶ added in v0.2.0

func (r *Reader) ResolveReferences(obj PdfObject) PdfObject

ResolveReferences recursively resolves indirect references in an object. This is the exported version of resolveReferences.

func (*Reader) String ¶

func (r *Reader) String() string

String returns a string representation of the reader's state.

func (*Reader) Trailer ¶

func (r *Reader) Trailer() *Dictionary

Trailer returns the trailer dictionary.

The trailer contains document-level metadata like:

/Size: Number of entries in xref table
/Root: Reference to catalog
/Info: Document information dictionary
/ID: File identifier array

Reference: PDF 1.7 specification, Section 7.5.5 (File Trailer).

func (*Reader) Version ¶

func (r *Reader) Version() string

Version returns the PDF version string from the file header.

Returns empty string if Open() has not been called.

Reference: PDF 1.7 specification, Section 7.5.1 (File Header).

func (*Reader) XRefTable ¶

func (r *Reader) XRefTable() *XRefTable

XRefTable returns the cross-reference table.

The xref table maps object numbers to byte offsets in the file.

Reference: PDF 1.7 specification, Section 7.5.4 (Cross-Reference Table).

type Real ¶

type Real struct {
	// contains filtered or unexported fields
}

Real represents a PDF real (floating-point) number.

func NewReal ¶

func NewReal(value float64) *Real

NewReal creates a new Real object.

func (*Real) String ¶

func (r *Real) String() string

String returns the string representation of the real number. Uses minimal precision to keep PDF files compact.

func (*Real) Value ¶

func (r *Real) Value() float64

Value returns the float64 value.

func (*Real) WriteTo ¶

func (r *Real) WriteTo(w io.Writer) (int64, error)

WriteTo writes the real number to w.

type Stream ¶

type Stream struct {
	// contains filtered or unexported fields
}

Stream represents a PDF stream object. A stream consists of a dictionary followed by zero or more bytes bracketed between the keywords stream (followed by newline) and endstream.

Reference: PDF 1.7 specification, Section 7.3.8 (Stream Objects).

func NewStream ¶

func NewStream(dict *Dictionary, content []byte) *Stream

NewStream creates a new Stream with the given dictionary and content.

func (*Stream) Bytes ¶

func (s *Stream) Bytes() []byte

Bytes returns the raw stream content as a byte slice. Alias for Content() for convenience.

func (*Stream) Clone ¶

func (s *Stream) Clone() *Stream

Clone creates a deep copy of the stream.

func (*Stream) Content ¶

func (s *Stream) Content() []byte

Content returns the raw stream content.

func (*Stream) Decode ¶

func (s *Stream) Decode() ([]byte, error)

Decode decodes the stream content based on the filters in the dictionary. This is a placeholder for Phase 3 (Stream Processing). Currently returns the raw content.

func (*Stream) Dictionary ¶

func (s *Stream) Dictionary() *Dictionary

Dictionary returns the stream's dictionary.

func (*Stream) Encode ¶

func (s *Stream) Encode(_ []string) error

Encode encodes the stream content with the specified filters. This is a placeholder for Phase 3 (Stream Processing).

func (*Stream) GetDecodeParams ¶

func (s *Stream) GetDecodeParams() PdfObject

GetDecodeParams returns the decode parameters for the filters. Returns nil if no decode parameters are specified.

func (*Stream) GetFilter ¶

func (s *Stream) GetFilter() PdfObject

GetFilter returns the filter(s) applied to this stream. Returns nil if no filters are applied.

func (*Stream) Length ¶

func (s *Stream) Length() int64

Length returns the length of the stream content.

func (*Stream) Reader ¶

func (s *Stream) Reader() io.Reader

Reader returns an io.Reader for the stream content.

func (*Stream) SetContent ¶

func (s *Stream) SetContent(content []byte)

SetContent sets the stream content and updates the Length entry in the dictionary.

func (*Stream) String ¶

func (s *Stream) String() string

String returns a string representation of the stream. Only shows the dictionary and length, not the full content.

func (*Stream) WriteTo ¶

func (s *Stream) WriteTo(w io.Writer) (int64, error)

WriteTo writes the PDF representation of the stream to w. Format: dictionary\nstream\ncontent\nendstream.

type String ¶

type String struct {
	// contains filtered or unexported fields
}

String represents a PDF string object. PDF strings can be literal strings (text) or hexadecimal strings (<hex>).

func NewHexString ¶

func NewHexString(value string) *String

NewHexString creates a new hexadecimal String object.

func NewString ¶

func NewString(value string) *String

NewString creates a new literal String object.

func NewStringBytes ¶

func NewStringBytes(value []byte) *String

NewStringBytes creates a new literal String from bytes.

func (*String) Bytes ¶

func (s *String) Bytes() []byte

Bytes returns the raw bytes.

func (*String) IsHex ¶

func (s *String) IsHex() bool

IsHex returns true if this is a hexadecimal string.

func (*String) String ¶

func (s *String) String() string

String returns a string representation intended for debugging. It is not the PDF serialization; use WriteTo to produce the PDF representation.

func (*String) Value ¶

func (s *String) Value() string

Value returns the string value as a Go string.

func (*String) WriteTo ¶

func (s *String) WriteTo(w io.Writer) (int64, error)

WriteTo writes the PDF representation to w.

type Token ¶

type Token struct {
	Type   TokenType // Type of the token
	Value  string    // String value of the token
	Line   int       // Line number (1-based)
	Column int       // Column number (1-based)
}

Token represents a single token from the PDF byte stream.

func EOFToken ¶

func EOFToken(line, column int) Token

EOFToken creates an end-of-file token.

func ErrorToken ¶

func ErrorToken(msg string, line, column int) Token

ErrorToken creates an error token with the given message.

func NewToken ¶

func NewToken(typ TokenType, value string, line, column int) Token

NewToken creates a new token with the given type, value, line, and column.

func Tokenize ¶

func Tokenize(input string) ([]Token, error)

Tokenize is a convenience function that tokenizes the entire input string.

func (Token) String ¶

func (t Token) String() string

String returns a string representation of the token for debugging.

type TokenType ¶

type TokenType int

TokenType represents the type of a PDF token.

const (
	// TokenError represents an error during tokenization.
	TokenError TokenType = iota

	// TokenEOF represents end of file.
	TokenEOF

	// Basic types.
	TokenInteger   // 123, -456, +789
	TokenReal      // 3.14, -2.5, .5
	TokenString    // (Hello) - literal string
	TokenHexString // <48656C6C6F> - hexadecimal string
	TokenName      // /Type, /Page, /Name#20With#20Spaces
	TokenBoolean   // true, false
	TokenNull      // null

	// Keywords.
	TokenKeyword // obj, endobj, stream, endstream, xref, trailer, startxref, R, n, f

	// Delimiters.
	TokenArrayStart // [
	TokenArrayEnd   // ]
	TokenDictStart  // <<
	TokenDictEnd    // >>
)

Token types recognized by the PDF lexer. Based on PDF 1.7 specification, Section 7.2 (Lexical Conventions).

func (TokenType) String ¶

func (t TokenType) String() string

String returns the string representation of a TokenType.

type Type ¶

type Type int

Type represents the type of a PDF object. This is useful for type assertions and debugging.

const (
	// TypeNull represents the null type.
	TypeNull Type = iota
	TypeBoolean
	TypeInteger
	TypeReal
	TypeString
	TypeName
	TypeArray
	TypeDictionary
	TypeStream
	TypeIndirect
	TypeReference
)

PDF object type constants.

func TypeOf ¶

func TypeOf(obj PdfObject) Type

TypeOf returns the type of a PDF object.

func (Type) String ¶

func (t Type) String() string

String returns the name of the type.

type XRefEntry ¶

type XRefEntry struct {
	Type       XRefEntryType // Entry type (free, in-use, compressed)
	Offset     int64         // Byte offset (in-use) or next free object (free)
	Generation int           // Generation number
	ObjectNum  int           // Object number (for convenience)
}

XRefEntry represents a single entry in the cross-reference table.

For in-use entries (type 'n'):

Offset: byte offset in file where object starts
Generation: generation number of the object

For free entries (type 'f'):

Offset: object number of next free object (or 0 if last)
Generation: generation number to use when object is reused

Reference: PDF 1.7 specification, Section 7.5.4 (Cross-Reference Table).

func NewXRefEntry ¶

func NewXRefEntry(objectNum int, entryType XRefEntryType, offset int64, generation int) *XRefEntry

NewXRefEntry creates a new cross-reference entry.

func (*XRefEntry) IsFree ¶

func (e *XRefEntry) IsFree() bool

IsFree returns true if this entry represents a free (deleted) object.

func (*XRefEntry) IsInUse ¶

func (e *XRefEntry) IsInUse() bool

IsInUse returns true if this entry represents an in-use object.

func (*XRefEntry) String ¶

func (e *XRefEntry) String() string

String returns a string representation of the entry.

type XRefEntryType ¶

type XRefEntryType int

XRefEntryType represents the type of cross-reference entry.

const (
	// XRefEntryFree represents a free (deleted) object entry.
	// Format: next_free_object_num generation f.
	XRefEntryFree XRefEntryType = iota

	// XRefEntryInUse represents an in-use object entry.
	// Format: byte_offset generation n.
	XRefEntryInUse

	// XRefEntryCompressed represents a compressed object entry (PDF 1.5+).
	// Found in XRef streams, format varies based on stream /W array.
	XRefEntryCompressed
)

func (XRefEntryType) String ¶

func (t XRefEntryType) String() string

String returns the string representation of the XRefEntryType.

type XRefStream ¶

type XRefStream struct {
	Stream  *Stream      // The stream object containing compressed xref data
	Entries []*XRefEntry // Parsed entries (if decoded)
	W       []int        // Field widths from /W array in stream dictionary
	Index   []int        // Object number ranges from /Index array
}

XRefStream represents a compressed cross-reference stream (PDF 1.5+).

Cross-reference streams provide a more compact alternative to traditional cross-reference tables by using stream compression.

Note: Full XRef stream support requires stream decoding (decompression), which will be implemented in a later phase.

Reference: PDF 1.7 specification, Section 7.5.8 (Cross-Reference Streams).

func NewXRefStream ¶

func NewXRefStream(stream *Stream) *XRefStream

NewXRefStream creates a new XRef stream structure.

type XRefTable ¶

type XRefTable struct {
	Entries map[int]*XRefEntry // Map: object number -> XRef entry
	Trailer *Dictionary        // Trailer dictionary
}

XRefTable represents a PDF cross-reference table.

The cross-reference table contains information about the location of objects in the PDF file. Each object is identified by an object number and generation number.

Reference: PDF 1.7 specification, Section 7.5.4 (Cross-Reference Table).

func NewXRefTable ¶

func NewXRefTable() *XRefTable

NewXRefTable creates a new empty cross-reference table.

func (*XRefTable) AddEntry ¶

func (t *XRefTable) AddEntry(entry *XRefEntry)

AddEntry adds an entry to the cross-reference table.

func (*XRefTable) GetEntry ¶

func (t *XRefTable) GetEntry(objectNum int) (*XRefEntry, bool)

GetEntry retrieves an entry by object number. Returns a nil entry and false if the entry doesn't exist.

func (*XRefTable) GetFreeEntries ¶

func (t *XRefTable) GetFreeEntries() []*XRefEntry

GetFreeEntries returns all free entries in the table.

func (*XRefTable) GetInUseEntries ¶

func (t *XRefTable) GetInUseEntries() []*XRefEntry

GetInUseEntries returns all in-use entries in the table.

func (*XRefTable) GetTrailer ¶

func (t *XRefTable) GetTrailer() *Dictionary

GetTrailer returns the trailer dictionary.

func (*XRefTable) HasObject ¶

func (t *XRefTable) HasObject(objectNum int) bool

HasObject returns true if the table contains an entry for the given object number.

func (*XRefTable) MergeOlder ¶ added in v0.2.1

func (t *XRefTable) MergeOlder(older *XRefTable)

MergeOlder merges entries from an older cross-reference table.

Entries already present in this table (newer) are preserved. Only entries missing from this table are added from the older table. This implements the PDF incremental update semantics where newer xref sections take precedence over older ones.

Reference: PDF 1.7 specification, Section 7.5.6 (Incremental Updates).

func (*XRefTable) SetTrailer ¶

func (t *XRefTable) SetTrailer(trailer *Dictionary)

SetTrailer sets the trailer dictionary.

func (*XRefTable) Size ¶

func (t *XRefTable) Size() int

Size returns the number of entries in the table.

func (*XRefTable) String ¶

func (t *XRefTable) String() string

String returns a string representation of the XRef table.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Overview ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func IsContentStreamOperator ¶

func IsKeyword ¶

func ReadPDFInfo ¶

Types ¶

type Array ¶

func NewArray ¶

func NewArrayFromSlice ¶

func NewArrayWithCapacity ¶

func (*Array) Append ¶

func (*Array) AppendAll ¶

func (*Array) Clear ¶

func (*Array) Clone ¶

func (*Array) Elements ¶

func (*Array) Get ¶

func (*Array) Insert ¶

func (*Array) Len ¶

func (*Array) Remove ¶

func (*Array) Set ¶

func (*Array) String ¶

func (*Array) WriteTo ¶

type Boolean ¶

func NewBoolean ¶

func (*Boolean) String ¶

func (*Boolean) Value ¶

func (*Boolean) WriteTo ¶

type Dictionary ¶

func NewDictionary ¶

func NewDictionaryWithCapacity ¶

func (*Dictionary) Clear ¶

func (*Dictionary) Clone ¶

func (*Dictionary) Get ¶

func (*Dictionary) GetArray ¶

func (*Dictionary) GetBoolean ¶

func (*Dictionary) GetDictionary ¶

func (*Dictionary) GetInteger ¶

func (*Dictionary) GetName ¶

func (*Dictionary) GetReal ¶

func (*Dictionary) GetString ¶

func (*Dictionary) Has ¶

func (*Dictionary) Keys ¶

func (*Dictionary) KeysSorted ¶

func (*Dictionary) Len ¶

func (*Dictionary) Merge ¶

func (*Dictionary) Remove ¶

func (*Dictionary) Set ¶

func (*Dictionary) SetBoolean ¶

func (*Dictionary) SetInteger ¶

func (*Dictionary) SetName ¶

func (*Dictionary) SetReal ¶

func (*Dictionary) SetString ¶

func (*Dictionary) String ¶

func (*Dictionary) WriteTo ¶

type DocInfo ¶

type IndirectObject ¶

func NewIndirectObject ¶

func (*IndirectObject) String ¶

func (*IndirectObject) WriteTo ¶

type IndirectReference ¶

func NewIndirectReference ¶

func (*IndirectReference) Clone ¶

func (*IndirectReference) Equals ¶

func (*IndirectReference) String ¶

func (*IndirectReference) WriteTo ¶

type Integer ¶

func NewInteger ¶

func (*Integer) Int ¶

func (*Integer) String ¶

func (*Integer) Value ¶

func (*Integer) WriteTo ¶

type Lexer ¶

func NewLexer ¶

func (*Lexer) NextToken ¶

func (*Lexer) Peek ¶

func (*Lexer) Position ¶