Documentation
¶
Overview ¶
Package parser provides PDF parsing functionality.
Quick Start ¶
Open a PDF:
pdf, err := parser.Open(pdfBytes)
if err != nil {
log.Fatal(err)
}
Get an object:
obj, err := pdf.GetObject(5)
For encrypted PDFs:
pdf, err := parser.OpenWithOptions(pdfBytes, parser.ParseOptions{
Password: []byte("secret"),
})
For byte-perfect reconstruction:
pdf, err := parser.OpenWithOptions(pdfBytes, parser.ParseOptions{
BytePerfect: true,
})
reconstructed := pdf.Bytes()
Package parser provides PDF parsing functionality, including byte-perfect reconstruction support, stream filters, and incremental updates.
Index ¶
- func CountRevisions(pdfBytes []byte) int
- func DecodeASCII85(data []byte) ([]byte, error)
- func DecodeASCIIHex(data []byte) ([]byte, error)
- func DecodeDCTDecode(data []byte) ([]byte, error)
- func DecodeFilter(data []byte, filterName string) ([]byte, error)
- func DecodeFlateDecode(data []byte) ([]byte, error)
- func DecodeRunLength(data []byte) ([]byte, error)
- func EncodeASCII85(data []byte) []byte
- func EncodeASCIIHex(data []byte) []byte
- func EncodeDCTDecode(data []byte) ([]byte, error)
- func EncodeRunLength(data []byte) []byte
- func ExtractRevision(pdfBytes []byte, revisionNum int) ([]byte, error)
- func FindAllEOFMarkers(pdfBytes []byte) []int
- func FindObjectByNumber(pdfBytes []byte, objNum int, encryptInfo *types.PDFEncryption, verbose bool) (int, error)
- func GetDirectObject(pdfBytes []byte, objNum int, offset int64, encryptInfo *types.PDFEncryption, ...) ([]byte, error)
- func GetObject(pdfBytes []byte, objNum int, encryptInfo *types.PDFEncryption, verbose bool) ([]byte, error)
- func GetObjectFromStream(pdfBytes []byte, objNum int, streamObjNum int, indexInStream int, ...) ([]byte, error)
- func GetRevisionBoundaries(pdfBytes []byte) []int
- func GetSampleObjectNumbers(objMap map[int]int64, max int) []int
- func ParseCrossReferenceTable(pdfBytes []byte, startXRef int64) (map[int]int64, error)
- func ParseCrossReferenceTableWithEncryption(pdfBytes []byte, startXRef int64, encryptInfo *types.PDFEncryption, ...) (map[int]int64, error)
- func ParseTraditionalXRefTable(pdfBytes []byte, startXRef int64) (map[int]int64, error)
- func ParseXFAFields(xfaXML string, verbose bool) ([]types.FormField, error)
- func ParseXRefStream(pdfBytes []byte, startXRef int64) (map[int]int64, error)
- func ParseXRefStreamWithEncryption(pdfBytes []byte, startXRef int64, encryptInfo *types.PDFEncryption, ...) (map[int]int64, error)
- type ObjectLocation
- type ObjectRef
- type ObjectStreamEntry
- type PDF
- func (p *PDF) Bytes() []byte
- func (p *PDF) Document() *PDFDocument
- func (p *PDF) Encryption() *types.PDFEncryption
- func (p *PDF) GetObject(objNum int) ([]byte, error)
- func (p *PDF) GetRawObject(objNum int) (*PDFRawObject, error)
- func (p *PDF) HasObject(objNum int) bool
- func (p *PDF) IsEncrypted() bool
- func (p *PDF) ObjectCount() int
- func (p *PDF) Objects() []int
- func (p *PDF) Raw() []byte
- func (p *PDF) RevisionCount() int
- func (p *PDF) Trailer() *TrailerInfo
- func (p *PDF) Version() string
- type PDFDocument
- func (d *PDFDocument) AllObjects() map[int]*PDFRawObject
- func (d *PDFDocument) Bytes() []byte
- func (d *PDFDocument) GetObject(objNum int) *PDFRawObject
- func (d *PDFDocument) GetObjectInRevision(objNum int, revisionNum int) *PDFRawObject
- func (d *PDFDocument) LatestRevision() *PDFRevision
- func (d *PDFDocument) ObjectCount() int
- func (d *PDFDocument) RevisionCount() int
- type PDFHeader
- type PDFRawObject
- type PDFRevision
- type PDFTrailer
- type ParseOptions
- type TrailerData
- type TrailerInfo
- type XRef
- type XRefData
- type XRefEntry
- type XRefResult
- type XRefType
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CountRevisions ¶
CountRevisions returns the number of revisions in the PDF
func DecodeASCII85 ¶
DecodeASCII85 decodes ASCII85Decode (also known as btoa) filter data. ASCII85 encodes 4 bytes as 5 ASCII characters (base 85). Special cases: 'z' represents 4 zero bytes, and '~>' marks the end of data.
func DecodeASCIIHex ¶
DecodeASCIIHex decodes ASCIIHexDecode filter data. ASCIIHexDecode converts pairs of hex digits to bytes. Whitespace is ignored, and '>' marks the end of data.
func DecodeDCTDecode ¶
DecodeDCTDecode decodes DCTDecode filter data. DCTDecode is a pass-through filter for JPEG-compressed image data: the data is already in JPEG format, so it is returned as-is.
func DecodeFilter ¶
DecodeFilter applies the appropriate filter to decode stream data. Supported filters: FlateDecode, ASCIIHexDecode, ASCII85Decode, RunLengthDecode, DCTDecode.
func DecodeFlateDecode ¶
DecodeFlateDecode decompresses zlib/deflate compressed data
func DecodeRunLength ¶
DecodeRunLength decodes RunLengthDecode filter data. Format: a length byte followed by data. Length 0-127: copy the next (length+1) bytes literally. Length 129-255: repeat the next byte (257-length) times. Length 128: end of data.
func EncodeASCII85 ¶
EncodeASCII85 encodes data using the ASCII85Decode format. It returns the data wrapped in <~ ... ~>.
func EncodeASCIIHex ¶
EncodeASCIIHex encodes data using the ASCIIHexDecode format. It returns a hex string with a '>' terminator.
func EncodeDCTDecode ¶
EncodeDCTDecode encodes data using the DCTDecode filter. This is a pass-through; it assumes the data is already valid JPEG.
func EncodeRunLength ¶
EncodeRunLength encodes data using RunLengthDecode format
func ExtractRevision ¶
ExtractRevision extracts a specific revision (1-indexed) from the PDF. It returns the PDF bytes up to and including that revision's %%EOF.
func FindAllEOFMarkers ¶
FindAllEOFMarkers returns the byte offsets of all %%EOF markers in the PDF
func FindObjectByNumber ¶
func FindObjectByNumber(pdfBytes []byte, objNum int, encryptInfo *types.PDFEncryption, verbose bool) (int, error)
FindObjectByNumber finds a PDF object by its number
func GetDirectObject ¶
func GetDirectObject(pdfBytes []byte, objNum int, offset int64, encryptInfo *types.PDFEncryption, verbose bool) ([]byte, error)
GetDirectObject reads a PDF object at a specific byte offset
func GetObject ¶
func GetObject(pdfBytes []byte, objNum int, encryptInfo *types.PDFEncryption, verbose bool) ([]byte, error)
GetObject retrieves a PDF object, handling both direct objects and objects in object streams. This is the equivalent of PyPDF's get_object() method.
func GetObjectFromStream ¶
func GetObjectFromStream(pdfBytes []byte, objNum int, streamObjNum int, indexInStream int, encryptInfo *types.PDFEncryption, verbose bool) ([]byte, error)
GetObjectFromStream extracts an object from an object stream (ObjStm). This implements PyPDF's _get_object_from_stream method.
func GetRevisionBoundaries ¶
GetRevisionBoundaries returns the byte offsets where each revision ends (%%EOF positions)
func GetSampleObjectNumbers ¶
GetSampleObjectNumbers returns a sample of object numbers from the map (for debugging)
func ParseCrossReferenceTable ¶
ParseCrossReferenceTable parses the PDF cross-reference table. This allows finding objects even in encrypted PDFs. It handles both traditional xref tables and cross-reference streams.
func ParseCrossReferenceTableWithEncryption ¶
func ParseCrossReferenceTableWithEncryption(pdfBytes []byte, startXRef int64, encryptInfo *types.PDFEncryption, verbose bool) (map[int]int64, error)
ParseCrossReferenceTableWithEncryption parses xref table with optional decryption
func ParseTraditionalXRefTable ¶
ParseTraditionalXRefTable parses a traditional PDF cross-reference table
func ParseXFAFields ¶
ParseXFAFields parses XFA XML and extracts form fields, returning FormField types. This is general-purpose parsing: it extracts fields from XFA XML regardless of source.
func ParseXRefStream ¶
ParseXRefStream parses a PDF cross-reference stream. Cross-reference streams are compressed and may be encrypted. They contain object offsets in binary format.
func ParseXRefStreamWithEncryption ¶
func ParseXRefStreamWithEncryption(pdfBytes []byte, startXRef int64, encryptInfo *types.PDFEncryption, verbose bool) (map[int]int64, error)
ParseXRefStreamWithEncryption parses a PDF cross-reference stream with optional decryption
Types ¶
type ObjectLocation ¶
type ObjectLocation struct {
IsDirect bool // True if object is at a direct byte offset
ByteOffset int64 // For direct objects: byte offset in PDF
StreamObjNum int // For object stream objects: containing stream's object number
IndexInStream int // For object stream objects: index within the stream
}
ObjectLocation describes where an object is located
func FindObjectLocation ¶
func FindObjectLocation(pdfBytes []byte, objNum int, verbose bool) (*ObjectLocation, error)
FindObjectLocation finds where an object is located (direct or in object stream)
type ObjectRef ¶
type ObjectRef struct {
Number int // Object number
Generation int // Generation number (usually 0)
Offset int64 // Byte offset in PDF (0 if in object stream)
InStream bool // True if object is stored in an object stream
StreamObjNum int // Object stream number (if InStream is true)
StreamIndex int // Index within object stream (if InStream is true)
}
ObjectRef describes where a PDF object is located.
type ObjectStreamEntry ¶
type ObjectStreamEntry struct {
StreamObjNum int // The object stream that contains this object
IndexInStream int // Index within the object stream
}
ObjectStreamEntry represents an object stored in an object stream (Type 2 xref entry)
type PDF ¶
type PDF struct {
// contains filtered or unexported fields
}
PDF represents a parsed PDF document. This is the main entry point for working with PDF files.
func Open ¶
Open parses a PDF from bytes with default options. For encrypted PDFs or byte-perfect parsing, use OpenWithOptions.
func OpenWithOptions ¶
func OpenWithOptions(data []byte, opts ParseOptions) (*PDF, error)
OpenWithOptions parses a PDF with custom options.
func (*PDF) Bytes ¶
Bytes returns the PDF as bytes. If parsed with BytePerfect option, returns byte-identical reconstruction. Otherwise, returns the original input bytes.
func (*PDF) Document ¶
func (p *PDF) Document() *PDFDocument
Document returns the underlying PDFDocument (only for BytePerfect mode)
func (*PDF) Encryption ¶
func (p *PDF) Encryption() *types.PDFEncryption
Encryption returns encryption info (nil if unencrypted)
func (*PDF) GetObject ¶
GetObject returns the content of a PDF object by number. Returns the raw bytes between "N G obj" and "endobj".
func (*PDF) GetRawObject ¶
func (p *PDF) GetRawObject(objNum int) (*PDFRawObject, error)
GetRawObject returns a PDFRawObject with full byte preservation. Only available when parsed with BytePerfect option.
func (*PDF) IsEncrypted ¶
IsEncrypted returns true if the PDF is encrypted
func (*PDF) ObjectCount ¶
ObjectCount returns the number of objects in the PDF
func (*PDF) RevisionCount ¶
RevisionCount returns the number of revisions (1 for non-incremental PDFs)
type PDFDocument ¶
type PDFDocument struct {
RawBytes []byte // Original complete PDF bytes
Header *PDFHeader // PDF header info
Revisions []*PDFRevision // All revisions, oldest first
}
PDFDocument represents a complete PDF with all revisions and raw bytes preserved. This enables byte-perfect reconstruction of the original PDF.
func ParsePDFDocument ¶
func ParsePDFDocument(pdfBytes []byte) (*PDFDocument, error)
ParsePDFDocument parses a complete PDF preserving all raw bytes for reconstruction
func (*PDFDocument) AllObjects ¶
func (d *PDFDocument) AllObjects() map[int]*PDFRawObject
AllObjects returns all objects from the merged view (latest version of each)
func (*PDFDocument) Bytes ¶
func (d *PDFDocument) Bytes() []byte
Bytes returns the complete PDF bytes for reconstruction. This should produce output identical to the original RawBytes.
func (*PDFDocument) GetObject ¶
func (d *PDFDocument) GetObject(objNum int) *PDFRawObject
GetObject returns an object by number, searching from newest to oldest revision
func (*PDFDocument) GetObjectInRevision ¶
func (d *PDFDocument) GetObjectInRevision(objNum int, revisionNum int) *PDFRawObject
GetObjectInRevision returns an object from a specific revision
func (*PDFDocument) LatestRevision ¶
func (d *PDFDocument) LatestRevision() *PDFRevision
LatestRevision returns the most recent revision
func (*PDFDocument) ObjectCount ¶
func (d *PDFDocument) ObjectCount() int
ObjectCount returns the total number of unique objects across all revisions
func (*PDFDocument) RevisionCount ¶
func (d *PDFDocument) RevisionCount() int
RevisionCount returns the number of revisions in the PDF
type PDFHeader ¶
type PDFHeader struct {
Version string // PDF version (e.g., "1.7")
MajorVersion int // Major version number (e.g., 1)
MinorVersion int // Minor version number (e.g., 7)
RawBytes []byte // Exact header bytes including binary marker (up to first object)
}
PDFHeader contains PDF header information with exact bytes preserved
func ParsePDFHeader ¶
ParsePDFHeader parses the PDF header with exact bytes preserved
type PDFRawObject ¶
type PDFRawObject struct {
Number int // Object number
Generation int // Generation number
Offset int64 // Byte offset in file where object starts
EndOffset int64 // Byte offset where object ends (after "endobj")
RawBytes []byte // Complete raw bytes from "N G obj" through "endobj" (inclusive)
// Parsed stream components (populated for stream objects)
IsStream bool // True if this is a stream object
DictRaw []byte // Raw dictionary bytes (including << >>)
StreamRaw []byte // Raw stream data (between "stream\n" and "\nendstream", excluding keywords)
DictStart int // Offset within RawBytes where dictionary starts
DictEnd int // Offset within RawBytes where dictionary ends
StreamStart int // Offset within RawBytes where stream data starts
StreamEnd int // Offset within RawBytes where stream data ends
}
PDFRawObject contains the raw bytes of a PDF object. This preserves exact formatting for byte-perfect reconstruction.
func ParseRawObject ¶
ParseRawObject parses a PDF object preserving all raw bytes
func ParseRawObjectAt ¶
func ParseRawObjectAt(pdfBytes []byte, offset int64) (*PDFRawObject, error)
ParseRawObjectAt is a convenience function to parse an object at a byte offset without knowing its object number (extracts from header)
func (*PDFRawObject) Bytes ¶
func (o *PDFRawObject) Bytes() []byte
Bytes returns the raw bytes of this object
func (*PDFRawObject) Content ¶
func (o *PDFRawObject) Content() []byte
Content returns the content between "N G obj" and "endobj"
func (*PDFRawObject) StreamData ¶
func (o *PDFRawObject) StreamData() []byte
StreamData returns the decompressed stream data (if this is a stream object)
type PDFRevision ¶
type PDFRevision struct {
Number int // Revision number (1-indexed)
Objects map[int]*PDFRawObject // Objects added/modified in this revision (keyed by object number)
XRef *XRefData // Cross-reference data for this revision
Trailer *TrailerData // Trailer dictionary for this revision
StartXRef int64 // startxref value for this revision
EOFOffset int64 // Byte offset where %%EOF starts
EndOffset int64 // Byte offset after %%EOF (and any trailing newlines)
}
PDFRevision represents a single revision of the PDF. A PDF can have multiple revisions when it has been incrementally updated.
func (*PDFRevision) RevisionBytes ¶
func (r *PDFRevision) RevisionBytes(doc *PDFDocument) []byte
RevisionBytes returns the complete bytes for this revision (all objects + xref + trailer + startxref + %%EOF)
type PDFTrailer ¶
type PDFTrailer struct {
RootRef string // Root reference (e.g., "/Root 204 0 R")
EncryptRef string // Encrypt reference if present
InfoRef string // Info reference if present
StartXRef int64 // Byte offset from startxref
}
PDFTrailer represents simplified PDF trailer information. This is a lightweight type for quick trailer parsing without byte preservation. For byte-perfect reconstruction, use TrailerData instead.
func ParsePDFTrailer ¶
func ParsePDFTrailer(pdfBytes []byte) (*PDFTrailer, error)
ParsePDFTrailer parses the PDF trailer to find object references
type ParseOptions ¶
type ParseOptions struct {
Password []byte // Password for encrypted PDFs (empty for unencrypted)
Verbose bool // Enable verbose logging
BytePerfect bool // Preserve exact bytes for reconstruction
}
ParseOptions configures PDF parsing behavior
type TrailerData ¶
type TrailerData struct {
Offset int64 // Byte offset where "trailer" keyword starts (0 for xref stream)
RawBytes []byte // Raw bytes of trailer dictionary (including "trailer\n<<...>>")
// Parsed values for convenience
Size int // /Size value
Root string // /Root reference (e.g., "1 0 R")
Encrypt string // /Encrypt reference (if encrypted)
Info string // /Info reference (if present)
Prev int64 // /Prev value (offset of previous xref, 0 if none)
ID [][]byte // /ID array (two byte strings)
}
TrailerData represents trailer information with exact bytes preserved
func ParseTrailerDataRaw ¶
func ParseTrailerDataRaw(pdfBytes []byte, startXRef int64, eofOffset int) (*TrailerData, error)
ParseTrailerDataRaw parses trailer data preserving raw bytes
func (*TrailerData) Bytes ¶
func (t *TrailerData) Bytes() []byte
Bytes returns the raw bytes of this trailer
type TrailerInfo ¶
type TrailerInfo struct {
Size int // Number of objects in the file
RootRef string // Reference to document catalog (e.g., "1 0 R")
InfoRef string // Reference to document info dictionary
EncryptRef string // Reference to encryption dictionary
IDArray []byte // File identifier array
}
TrailerInfo contains parsed trailer dictionary information.
type XRef ¶
type XRef struct {
Objects map[int]*ObjectRef // Object number -> reference info
Size int // Total number of objects
}
XRef represents consolidated cross-reference data for all objects in the PDF. It merges data from all revisions (for incremental updates) into a single view.
type XRefData ¶
type XRefData struct {
Type XRefType // Traditional table or stream
Offset int64 // Byte offset where xref section starts
RawBytes []byte // Complete raw bytes of xref section (for traditional: "xref" through entries)
// Parsed entries for convenience
Entries []XRefEntry
// For xref streams: the object containing the stream
StreamObject *PDFRawObject
}
XRefData represents cross-reference data with exact bytes preserved
func ParseXRefDataRaw ¶
ParseXRefDataRaw parses cross-reference data preserving raw bytes
type XRefEntry ¶
type XRefEntry struct {
ObjectNum int // Object number
Generation int // Generation number
Offset int64 // For type 1: byte offset in file
InUse bool // true = 'n' (in use), false = 'f' (free)
// For type 2 entries (object in object stream)
InObjectStream bool // True if this object is in an object stream
StreamObjNum int // Object stream number (if InObjectStream)
IndexInStream int // Index within object stream (if InObjectStream)
// Raw entry bytes for byte-perfect reconstruction (20 bytes for traditional)
RawBytes []byte
}
XRefEntry represents a single cross-reference entry
type XRefResult ¶
type XRefResult struct {
// Regular objects (Type 1): objNum -> byte offset
Objects map[int]int64
// Objects in object streams (Type 2): objNum -> ObjectStreamEntry
ObjectStreams map[int]ObjectStreamEntry
}
XRefResult contains both regular object offsets and object stream entries
func ParseXRefStreamFull ¶
func ParseXRefStreamFull(pdfBytes []byte, startXRef int64, verbose bool) (*XRefResult, error)
ParseXRefStreamFull parses a PDF cross-reference stream and returns both regular and compressed object info