parse

package
v0.8.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 11, 2026 License: MIT Imports: 13 Imported by: 0

Documentation

Overview

Package parser provides PDF parsing functionality.

Quick Start

Open a PDF:

pdf, err := parser.Open(pdfBytes)
if err != nil {
    log.Fatal(err)
}

Get an object:

obj, err := pdf.GetObject(5)

For encrypted PDFs:

pdf, err := parser.OpenWithOptions(pdfBytes, parser.ParseOptions{
    Password: []byte("secret"),
})

For byte-perfect reconstruction:

pdf, err := parser.OpenWithOptions(pdfBytes, parser.ParseOptions{
    BytePerfect: true,
})
reconstructed := pdf.Bytes()

Package parser provides PDF parsing with byte-perfect reconstruction support

Package parser provides PDF parsing with byte-perfect reconstruction support

Package parser provides PDF parsing functionality including stream filters

Package parser provides PDF parsing functionality including incremental updates

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CountRevisions

func CountRevisions(pdfBytes []byte) int

CountRevisions returns the number of revisions in the PDF

func DecodeASCII85

func DecodeASCII85(data []byte) ([]byte, error)

DecodeASCII85 decodes ASCII85Decode (also known as btoa) filter data. ASCII85 encodes 4 bytes as 5 ASCII characters (base 85). Special cases: 'z' represents 4 zero bytes, and '~>' marks the end of data.

func DecodeASCIIHex

func DecodeASCIIHex(data []byte) ([]byte, error)

DecodeASCIIHex decodes ASCIIHexDecode filter data. ASCIIHexDecode converts pairs of hex digits to bytes. Whitespace is ignored, and '>' marks the end of data.

func DecodeDCTDecode

func DecodeDCTDecode(data []byte) ([]byte, error)

DecodeDCTDecode decodes DCTDecode filter data. DCTDecode is a pass-through filter for JPEG-compressed image data: the data is already in JPEG format, so it is returned as-is.

func DecodeFilter

func DecodeFilter(data []byte, filterName string) ([]byte, error)

DecodeFilter applies the appropriate filter to decode stream data. Supports: FlateDecode, ASCIIHexDecode, ASCII85Decode, RunLengthDecode, DCTDecode.

func DecodeFlateDecode

func DecodeFlateDecode(data []byte) ([]byte, error)

DecodeFlateDecode decompresses zlib/deflate compressed data

func DecodeRunLength

func DecodeRunLength(data []byte) ([]byte, error)

DecodeRunLength decodes RunLengthDecode filter data. Format: a length byte followed by data. Length 0-127: copy the next (length+1) bytes literally; length 129-255: repeat the next byte (257-length) times; length 128: end of data.

func EncodeASCII85

func EncodeASCII85(data []byte) []byte

EncodeASCII85 encodes data using ASCII85Decode format. Returns the data wrapped in <~ ... ~>.

func EncodeASCIIHex

func EncodeASCIIHex(data []byte) []byte

EncodeASCIIHex encodes data using ASCIIHexDecode format. Returns a hex string with a '>' terminator.

func EncodeDCTDecode

func EncodeDCTDecode(data []byte) ([]byte, error)

EncodeDCTDecode encodes data using the DCTDecode filter. This is a pass-through; it assumes the data is already valid JPEG.

func EncodeRunLength

func EncodeRunLength(data []byte) []byte

EncodeRunLength encodes data using RunLengthDecode format

func ExtractRevision

func ExtractRevision(pdfBytes []byte, revisionNum int) ([]byte, error)

ExtractRevision extracts a specific revision (1-indexed) from the PDF. Returns the PDF bytes up to and including that revision's %%EOF.

func FindAllEOFMarkers

func FindAllEOFMarkers(pdfBytes []byte) []int

FindAllEOFMarkers returns the byte offsets of all %%EOF markers in the PDF

func FindObjectByNumber

func FindObjectByNumber(pdfBytes []byte, objNum int, encryptInfo *types.PDFEncryption, verbose bool) (int, error)

FindObjectByNumber finds a PDF object by its number

func GetDirectObject

func GetDirectObject(pdfBytes []byte, objNum int, offset int64, encryptInfo *types.PDFEncryption, verbose bool) ([]byte, error)

GetDirectObject reads a PDF object at a specific byte offset

func GetObject

func GetObject(pdfBytes []byte, objNum int, encryptInfo *types.PDFEncryption, verbose bool) ([]byte, error)

GetObject retrieves a PDF object, handling both direct objects and objects in object streams. This is the equivalent of PyPDF's get_object() method.

func GetObjectFromStream

func GetObjectFromStream(pdfBytes []byte, objNum int, streamObjNum int, indexInStream int, encryptInfo *types.PDFEncryption, verbose bool) ([]byte, error)

GetObjectFromStream extracts an object from an object stream (ObjStm). This implements PyPDF's _get_object_from_stream method.

func GetRevisionBoundaries

func GetRevisionBoundaries(pdfBytes []byte) []int

GetRevisionBoundaries returns the byte offsets where each revision ends (%%EOF positions)

func GetSampleObjectNumbers

func GetSampleObjectNumbers(objMap map[int]int64, max int) []int

GetSampleObjectNumbers returns a sample of object numbers from the map (for debugging)

func ParseCrossReferenceTable

func ParseCrossReferenceTable(pdfBytes []byte, startXRef int64) (map[int]int64, error)

ParseCrossReferenceTable parses the PDF cross-reference table. This allows finding objects even in encrypted PDFs. Handles both traditional xref tables and cross-reference streams.

func ParseCrossReferenceTableWithEncryption

func ParseCrossReferenceTableWithEncryption(pdfBytes []byte, startXRef int64, encryptInfo *types.PDFEncryption, verbose bool) (map[int]int64, error)

ParseCrossReferenceTableWithEncryption parses xref table with optional decryption

func ParseTraditionalXRefTable

func ParseTraditionalXRefTable(pdfBytes []byte, startXRef int64) (map[int]int64, error)

ParseTraditionalXRefTable parses a traditional PDF cross-reference table

func ParseXFAFields

func ParseXFAFields(xfaXML string, verbose bool) ([]types.FormField, error)

ParseXFAFields parses XFA XML and extracts form fields, returning FormField types. This is general-purpose parsing: it extracts fields from XFA XML regardless of source.

func ParseXRefStream

func ParseXRefStream(pdfBytes []byte, startXRef int64) (map[int]int64, error)

ParseXRefStream parses a PDF cross-reference stream. Cross-reference streams are compressed and may be encrypted. They contain object offsets in binary format.

func ParseXRefStreamWithEncryption

func ParseXRefStreamWithEncryption(pdfBytes []byte, startXRef int64, encryptInfo *types.PDFEncryption, verbose bool) (map[int]int64, error)

ParseXRefStreamWithEncryption parses a PDF cross-reference stream with optional decryption

Types

type ObjectLocation

type ObjectLocation struct {
	IsDirect      bool  // True if object is at a direct byte offset
	ByteOffset    int64 // For direct objects: byte offset in PDF
	StreamObjNum  int   // For object stream objects: containing stream's object number
	IndexInStream int   // For object stream objects: index within the stream
}

ObjectLocation describes where an object is located

func FindObjectLocation

func FindObjectLocation(pdfBytes []byte, objNum int, verbose bool) (*ObjectLocation, error)

FindObjectLocation finds where an object is located (direct or in object stream)

type ObjectRef

type ObjectRef struct {
	Number       int   // Object number
	Generation   int   // Generation number (usually 0)
	Offset       int64 // Byte offset in PDF (0 if in object stream)
	InStream     bool  // True if object is stored in an object stream
	StreamObjNum int   // Object stream number (if InStream is true)
	StreamIndex  int   // Index within object stream (if InStream is true)
}

ObjectRef describes where a PDF object is located.

type ObjectStreamEntry

type ObjectStreamEntry struct {
	StreamObjNum  int // The object stream that contains this object
	IndexInStream int // Index within the object stream
}

ObjectStreamEntry represents an object stored in an object stream (Type 2 xref entry)

type PDF

type PDF struct {
	// contains filtered or unexported fields
}

PDF represents a parsed PDF document. This is the main entry point for working with PDF files.

func Open

func Open(data []byte) (*PDF, error)

Open parses a PDF from bytes with default options. For encrypted PDFs or byte-perfect parsing, use OpenWithOptions.

func OpenWithOptions

func OpenWithOptions(data []byte, opts ParseOptions) (*PDF, error)

OpenWithOptions parses a PDF with custom options.

func (*PDF) Bytes

func (p *PDF) Bytes() []byte

Bytes returns the PDF as bytes. If parsed with BytePerfect option, returns byte-identical reconstruction. Otherwise, returns the original input bytes.

func (*PDF) Document

func (p *PDF) Document() *PDFDocument

Document returns the underlying PDFDocument (only for BytePerfect mode)

func (*PDF) Encryption

func (p *PDF) Encryption() *types.PDFEncryption

Encryption returns encryption info (nil if unencrypted)

func (*PDF) GetObject

func (p *PDF) GetObject(objNum int) ([]byte, error)

GetObject returns the content of a PDF object by number. Returns the raw bytes between "N G obj" and "endobj".

func (*PDF) GetRawObject

func (p *PDF) GetRawObject(objNum int) (*PDFRawObject, error)

GetRawObject returns a PDFRawObject with full byte preservation. Only available when parsed with BytePerfect option.

func (*PDF) HasObject

func (p *PDF) HasObject(objNum int) bool

HasObject returns true if the object exists

func (*PDF) IsEncrypted

func (p *PDF) IsEncrypted() bool

IsEncrypted returns true if the PDF is encrypted

func (*PDF) ObjectCount

func (p *PDF) ObjectCount() int

ObjectCount returns the number of objects in the PDF

func (*PDF) Objects

func (p *PDF) Objects() []int

Objects returns a list of all object numbers in the PDF

func (*PDF) Raw

func (p *PDF) Raw() []byte

Raw returns the original input bytes (always available)

func (*PDF) RevisionCount

func (p *PDF) RevisionCount() int

RevisionCount returns the number of revisions (1 for non-incremental PDFs)

func (*PDF) Trailer

func (p *PDF) Trailer() *TrailerInfo

Trailer returns the trailer information

func (*PDF) Version

func (p *PDF) Version() string

Version returns the PDF version string (e.g., "1.7")

type PDFDocument

type PDFDocument struct {
	RawBytes  []byte         // Original complete PDF bytes
	Header    *PDFHeader     // PDF header info
	Revisions []*PDFRevision // All revisions, oldest first
}

PDFDocument represents a complete PDF with all revisions and raw bytes preserved. This enables byte-perfect reconstruction of the original PDF.

func ParsePDFDocument

func ParsePDFDocument(pdfBytes []byte) (*PDFDocument, error)

ParsePDFDocument parses a complete PDF preserving all raw bytes for reconstruction

func (*PDFDocument) AllObjects

func (d *PDFDocument) AllObjects() map[int]*PDFRawObject

AllObjects returns all objects from the merged view (latest version of each)

func (*PDFDocument) Bytes

func (d *PDFDocument) Bytes() []byte

Bytes returns the complete PDF bytes for reconstruction. This should produce output identical to the original RawBytes.

func (*PDFDocument) GetObject

func (d *PDFDocument) GetObject(objNum int) *PDFRawObject

GetObject returns an object by number, searching from newest to oldest revision

func (*PDFDocument) GetObjectInRevision

func (d *PDFDocument) GetObjectInRevision(objNum int, revisionNum int) *PDFRawObject

GetObjectInRevision returns an object from a specific revision

func (*PDFDocument) LatestRevision

func (d *PDFDocument) LatestRevision() *PDFRevision

LatestRevision returns the most recent revision

func (*PDFDocument) ObjectCount

func (d *PDFDocument) ObjectCount() int

ObjectCount returns the total number of unique objects across all revisions

func (*PDFDocument) RevisionCount

func (d *PDFDocument) RevisionCount() int

RevisionCount returns the number of revisions in the PDF

type PDFHeader

type PDFHeader struct {
	Version      string // PDF version (e.g., "1.7")
	MajorVersion int    // Major version number (e.g., 1)
	MinorVersion int    // Minor version number (e.g., 7)
	RawBytes     []byte // Exact header bytes including binary marker (up to first object)
}

PDFHeader contains PDF header information with exact bytes preserved

func ParsePDFHeader

func ParsePDFHeader(pdfBytes []byte) (*PDFHeader, error)

ParsePDFHeader parses the PDF header with exact bytes preserved

type PDFRawObject

type PDFRawObject struct {
	Number     int    // Object number
	Generation int    // Generation number
	Offset     int64  // Byte offset in file where object starts
	EndOffset  int64  // Byte offset where object ends (after "endobj")
	RawBytes   []byte // Complete raw bytes from "N G obj" through "endobj" (inclusive)

	// Parsed stream components (populated for stream objects)
	IsStream    bool   // True if this is a stream object
	DictRaw     []byte // Raw dictionary bytes (including << >>)
	StreamRaw   []byte // Raw stream data (between "stream\n" and "\nendstream", excluding keywords)
	DictStart   int    // Offset within RawBytes where dictionary starts
	DictEnd     int    // Offset within RawBytes where dictionary ends
	StreamStart int    // Offset within RawBytes where stream data starts
	StreamEnd   int    // Offset within RawBytes where stream data ends
}

PDFRawObject contains the raw bytes of a PDF object. This preserves exact formatting for byte-perfect reconstruction.

func ParseRawObject

func ParseRawObject(pdfBytes []byte, objNum int, objGen int, offset int64) (*PDFRawObject, error)

ParseRawObject parses a PDF object preserving all raw bytes

func ParseRawObjectAt

func ParseRawObjectAt(pdfBytes []byte, offset int64) (*PDFRawObject, error)

ParseRawObjectAt is a convenience function to parse an object at a byte offset without knowing its object number (extracts from header)

func (*PDFRawObject) Bytes

func (o *PDFRawObject) Bytes() []byte

Bytes returns the raw bytes of this object

func (*PDFRawObject) Content

func (o *PDFRawObject) Content() []byte

Content returns the content between "N G obj" and "endobj"

func (*PDFRawObject) StreamData

func (o *PDFRawObject) StreamData() []byte

StreamData returns the decompressed stream data (if this is a stream object)

type PDFRevision

type PDFRevision struct {
	Number    int                   // Revision number (1-indexed)
	Objects   map[int]*PDFRawObject // Objects added/modified in this revision (keyed by object number)
	XRef      *XRefData             // Cross-reference data for this revision
	Trailer   *TrailerData          // Trailer dictionary for this revision
	StartXRef int64                 // startxref value for this revision
	EOFOffset int64                 // Byte offset where %%EOF starts
	EndOffset int64                 // Byte offset after %%EOF (and any trailing newlines)
}

PDFRevision represents a single revision of the PDF. A PDF can have multiple revisions when it has been incrementally updated.

func (*PDFRevision) RevisionBytes

func (r *PDFRevision) RevisionBytes(doc *PDFDocument) []byte

RevisionBytes returns the complete bytes for this revision (all objects + xref + trailer + startxref + %%EOF)

type PDFTrailer

type PDFTrailer struct {
	RootRef    string // Root reference (e.g., "/Root 204 0 R")
	EncryptRef string // Encrypt reference if present
	InfoRef    string // Info reference if present
	StartXRef  int64  // Byte offset from startxref
}

PDFTrailer represents simplified PDF trailer information. This is a lightweight type for quick trailer parsing without byte preservation. For byte-perfect reconstruction, use TrailerData instead.

func ParsePDFTrailer

func ParsePDFTrailer(pdfBytes []byte) (*PDFTrailer, error)

ParsePDFTrailer parses the PDF trailer to find object references

type ParseOptions

type ParseOptions struct {
	Password    []byte // Password for encrypted PDFs (empty for unencrypted)
	Verbose     bool   // Enable verbose logging
	BytePerfect bool   // Preserve exact bytes for reconstruction
}

ParseOptions configures PDF parsing behavior

type TrailerData

type TrailerData struct {
	Offset   int64  // Byte offset where "trailer" keyword starts (0 for xref stream)
	RawBytes []byte // Raw bytes of trailer dictionary (including "trailer\n<<...>>")

	// Parsed values for convenience
	Size    int      // /Size value
	Root    string   // /Root reference (e.g., "1 0 R")
	Encrypt string   // /Encrypt reference (if encrypted)
	Info    string   // /Info reference (if present)
	Prev    int64    // /Prev value (offset of previous xref, 0 if none)
	ID      [][]byte // /ID array (two byte strings)
}

TrailerData represents trailer information with exact bytes preserved

func ParseTrailerDataRaw

func ParseTrailerDataRaw(pdfBytes []byte, startXRef int64, eofOffset int) (*TrailerData, error)

ParseTrailerDataRaw parses trailer data preserving raw bytes

func (*TrailerData) Bytes

func (t *TrailerData) Bytes() []byte

Bytes returns the raw bytes of this trailer

type TrailerInfo

type TrailerInfo struct {
	Size       int    // Number of objects in the file
	RootRef    string // Reference to document catalog (e.g., "1 0 R")
	InfoRef    string // Reference to document info dictionary
	EncryptRef string // Reference to encryption dictionary
	IDArray    []byte // File identifier array
}

TrailerInfo contains parsed trailer dictionary information.

type XRef

type XRef struct {
	Objects map[int]*ObjectRef // Object number -> reference info
	Size    int                // Total number of objects
}

XRef represents consolidated cross-reference data for all objects in the PDF. It merges data from all revisions (for incremental updates) into a single view.

type XRefData

type XRefData struct {
	Type     XRefType // Traditional table or stream
	Offset   int64    // Byte offset where xref section starts
	RawBytes []byte   // Complete raw bytes of xref section (for traditional: "xref" through entries)

	// Parsed entries for convenience
	Entries []XRefEntry

	// For xref streams: the object containing the stream
	StreamObject *PDFRawObject
}

XRefData represents cross-reference data with exact bytes preserved

func ParseXRefDataRaw

func ParseXRefDataRaw(pdfBytes []byte, startXRef int64) (*XRefData, error)

ParseXRefDataRaw parses cross-reference data preserving raw bytes

func (*XRefData) Bytes

func (x *XRefData) Bytes() []byte

Bytes returns the raw bytes of this xref section

type XRefEntry

type XRefEntry struct {
	ObjectNum  int   // Object number
	Generation int   // Generation number
	Offset     int64 // For type 1: byte offset in file
	InUse      bool  // true = 'n' (in use), false = 'f' (free)

	// For type 2 entries (object in object stream)
	InObjectStream bool // True if this object is in an object stream
	StreamObjNum   int  // Object stream number (if InObjectStream)
	IndexInStream  int  // Index within object stream (if InObjectStream)

	// Raw entry bytes for byte-perfect reconstruction (20 bytes for traditional)
	RawBytes []byte
}

XRefEntry represents a single cross-reference entry

type XRefResult

type XRefResult struct {
	// Regular objects (Type 1): objNum -> byte offset
	Objects map[int]int64
	// Objects in object streams (Type 2): objNum -> ObjectStreamEntry
	ObjectStreams map[int]ObjectStreamEntry
}

XRefResult contains both regular object offsets and object stream entries

func ParseXRefStreamFull

func ParseXRefStreamFull(pdfBytes []byte, startXRef int64, verbose bool) (*XRefResult, error)

ParseXRefStreamFull parses a PDF cross-reference stream and returns both regular and compressed object info

type XRefType

type XRefType int

XRefType indicates the type of cross-reference section

const (
	XRefTypeTable  XRefType = iota // Traditional "xref" table
	XRefTypeStream                 // Cross-reference stream (PDF 1.5+)
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL