internal

package

v0.0.0-...-5c329e6 Latest Go to latest Published: Apr 30, 2026 License: Apache-2.0 Imports: 35 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/DataDog/iceberg-go

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
func BigEndianToDecimal(buf []byte) (decimal.Decimal128, error)
func MakeSequencedChan[T any](bufferSize uint, source <-chan T, comesAfter, isNext func(a, b *T) bool, ...) <-chan T
func MapExec[T, S any](ctx context.Context, nWorkers int, slice iter.Seq[T], fn func(T) (S, error)) iter.Seq2[S, error]
func PartitionRecordValue(field iceberg.PartitionField, val iceberg.Literal, schema *iceberg.Schema) (iceberg.Optional[iceberg.Literal], error)
func TablePropertiesFromContext(ctx context.Context) iceberg.Properties
func TruncateUpperBoundBinary(val []byte, trunc int) []byte
func TruncateUpperBoundText(s string, trunc int) string
func WithTableProperties(ctx context.Context, props iceberg.Properties) context.Context
type DataFileOpts
type DataFileStatistics
- func (d *DataFileStatistics) PartitionValue(field iceberg.PartitionField, sc *iceberg.Schema) any
- func (d *DataFileStatistics) ToDataFile(opts DataFileOpts) iceberg.DataFile
type Enumerated
type FileFormat
- func FormatFromFileName(fileName string) FileFormat
- func GetFileFormat(format iceberg.FileFormat) FileFormat
type FileReader
type FileSource
- func GetFile(ctx context.Context, fs iceio.IO, dataFile iceberg.DataFile, isPosDeletes bool) (FileSource, error)
type FileWriter
type Metadata
type MetricModeType
type MetricsMode
- func MatchMetricsMode(mode string) (MetricsMode, error)
type ParquetFileSource
- func (pfs *ParquetFileSource) GetReader(ctx context.Context) (FileReader, error)
type ParquetFileWriter
- func (w *ParquetFileWriter) BytesWritten() int64
- func (w *ParquetFileWriter) Close() (_ iceberg.DataFile, err error)
- func (w *ParquetFileWriter) Write(batch arrow.RecordBatch) error
type ParquetRowGroupTester
type RowGroupBloomPred
type StatisticsCollector
type StatsAgg
type TypedStats
type WriteFileInfo

Constants ¶

View Source

const (
	ParquetRowGroupSizeBytesKey              = "write.parquet.row-group-size-bytes"
	ParquetRowGroupSizeBytesDefault          = 128 * 1024 * 1024 // 128 MB
	ParquetRowGroupLimitKey                  = "write.parquet.row-group-limit"
	ParquetRowGroupLimitDefault              = 1048576
	ParquetPageSizeBytesKey                  = "write.parquet.page-size-bytes"
	ParquetPageSizeBytesDefault              = 1024 * 1024 // 1 MB
	ParquetPageRowLimitKey                   = "write.parquet.page-row-limit"
	ParquetPageRowLimitDefault               = 20000
	ParquetDictSizeBytesKey                  = "write.parquet.dict-size-bytes"
	ParquetDictSizeBytesDefault              = 2 * 1024 * 1024 // 2 MB
	ParquetPageVersionKey                    = "write.parquet.page-version"
	ParquetPageVersionDefault                = "2"
	ParquetCompressionKey                    = "write.parquet.compression-codec"
	ParquetCompressionDefault                = "zstd"
	ParquetCompressionLevelKey               = "write.parquet.compression-level"
	ParquetCompressionLevelDefault           = -1
	ParquetBloomFilterMaxBytesKey            = "write.parquet.bloom-filter-max-bytes"
	ParquetBloomFilterMaxBytesDefault        = 1024 * 1024
	ParquetBloomFilterColumnEnabledKeyPrefix = "write.parquet.bloom-filter-enabled.column"

	ParquetBatchSizeKey     = "read.parquet.batch-size"
	ParquetBatchSizeDefault = 1 << 17 // 131072 rows
)

Variables ¶

This section is empty.

Functions ¶

func BigEndianToDecimal ¶

func BigEndianToDecimal(buf []byte) (decimal.Decimal128, error)

func MakeSequencedChan ¶

func MakeSequencedChan[T any](bufferSize uint, source <-chan T, comesAfter, isNext func(a, b *T) bool, initial T) <-chan T

MakeSequencedChan creates a channel that outputs values in a given order based on the comesAfter and isNext functions. The values are read in from the provided source and then re-ordered before being sent to the output.

func MapExec ¶

func MapExec[T, S any](ctx context.Context, nWorkers int, slice iter.Seq[T], fn func(T) (S, error)) iter.Seq2[S, error]

func PartitionRecordValue ¶

func PartitionRecordValue(field iceberg.PartitionField, val iceberg.Literal, schema *iceberg.Schema) (iceberg.Optional[iceberg.Literal], error)

func TablePropertiesFromContext ¶

func TablePropertiesFromContext(ctx context.Context) iceberg.Properties

TablePropertiesFromContext retrieves table properties from context. Returns nil if not set.

func TruncateUpperBoundBinary ¶

func TruncateUpperBoundBinary(val []byte, trunc int) []byte

func TruncateUpperBoundText ¶

func TruncateUpperBoundText(s string, trunc int) string

func WithTableProperties ¶

func WithTableProperties(ctx context.Context, props iceberg.Properties) context.Context

WithTableProperties returns a new context with the given table properties attached. These properties are used by readers to configure read behavior (e.g. batch size).

Types ¶

type DataFileOpts ¶

type DataFileOpts struct {
	Schema          *iceberg.Schema
	Spec            iceberg.PartitionSpec
	Path            string
	Format          iceberg.FileFormat
	Content         iceberg.ManifestEntryContent
	FileSize        int64
	PartitionValues map[int]any
	SortOrderID     int
}

DataFileOpts groups the fields needed to finalize a DataFile from DataFileStatistics. Collapsing the former positional arguments into a struct keeps call sites readable and makes future additions source-compatible.

type DataFileStatistics ¶

type DataFileStatistics struct {
	RecordCount      int64
	ColSizes         map[int]int64
	ValueCounts      map[int]int64
	NullValueCounts  map[int]int64
	NanValueCounts   map[int]int64
	ColAggs          map[int]StatsAgg
	SplitOffsets     []int64
	EqualityFieldIDs []int
}

func (*DataFileStatistics) PartitionValue ¶

func (d *DataFileStatistics) PartitionValue(field iceberg.PartitionField, sc *iceberg.Schema) any

func (*DataFileStatistics) ToDataFile ¶

func (d *DataFileStatistics) ToDataFile(opts DataFileOpts) iceberg.DataFile

type Enumerated ¶

type Enumerated[T any] struct {
	Value T
	Index int
	Last  bool
}

Enumerated is a quick way to represent a sequenced value that can be processed in parallel and then needs to be reordered.

type FileFormat ¶

type FileFormat interface {
	Open(context.Context, iceio.IO, string) (FileReader, error)
	PathToIDMapping(*iceberg.Schema) (map[string]int, error)
	DataFileStatsFromMeta(rdr Metadata, statsCols map[int]StatisticsCollector, colMapping map[string]int) *DataFileStatistics
	GetWriteProperties(iceberg.Properties) any
	WriteDataFile(ctx context.Context, fs iceio.WriteFileIO, partitionValues map[int]any, info WriteFileInfo, batches []arrow.RecordBatch) (iceberg.DataFile, error)
	NewFileWriter(ctx context.Context, fs iceio.WriteFileIO, partitionValues map[int]any, info WriteFileInfo, arrowSchema *arrow.Schema) (FileWriter, error)
}

func FormatFromFileName ¶

func FormatFromFileName(fileName string) FileFormat

func GetFileFormat ¶

func GetFileFormat(format iceberg.FileFormat) FileFormat

type FileReader ¶

type FileReader interface {
	io.Closer

	Metadata() Metadata
	SourceFileSize() int64
	Schema() (*arrow.Schema, error)
	// PrunedSchema takes in the list of projected field IDs and returns the arrow schema
	// that represents the underlying file schema with only the projected fields. It also
	// returns the indexes of the projected columns to allow reading *only* the needed
	// columns.
	PrunedSchema(projectedIDs map[int]struct{}, mapping iceberg.NameMapping) (*arrow.Schema, []int, error)
	// GetRecords returns a record reader for only the provided columns (using nil will read
	// all of the columns of the underlying file.) The `tester` is a function that can be used,
	// if non-nil, to filter aspects of the file such as skipping row groups in a parquet file.
	GetRecords(ctx context.Context, cols []int, tester any) (array.RecordReader, error)
	// ReadTable reads the entire file and returns it as an arrow table.
	ReadTable(context.Context) (arrow.Table, error)
}

type FileSource ¶

type FileSource interface {
	GetReader(context.Context) (FileReader, error)
}

func GetFile ¶

func GetFile(ctx context.Context, fs iceio.IO, dataFile iceberg.DataFile, isPosDeletes bool) (FileSource, error)

GetFile opens the given file using the provided file system.

The FileSource interface allows abstracting away the underlying file format while providing utilities to read the file as Arrow record batches.

type FileWriter ¶

type FileWriter interface {
	Write(arrow.RecordBatch) error
	BytesWritten() int64
	Close() (iceberg.DataFile, error)
}

FileWriter is an incremental single-file writer with open/write/close lifecycle. It writes Arrow record batches and tracks bytes written for rolling file decisions.

type Metadata ¶

type Metadata any

type MetricModeType ¶

type MetricModeType string

const (
	MetricModeTruncate MetricModeType = "truncate"
	MetricModeNone     MetricModeType = "none"
	MetricModeCounts   MetricModeType = "counts"
	MetricModeFull     MetricModeType = "full"
)

type MetricsMode ¶

type MetricsMode struct {
	Typ MetricModeType
	Len int
}

func MatchMetricsMode ¶

func MatchMetricsMode(mode string) (MetricsMode, error)

type ParquetFileSource ¶

type ParquetFileSource struct {
	// contains filtered or unexported fields
}

func (*ParquetFileSource) GetReader ¶

func (pfs *ParquetFileSource) GetReader(ctx context.Context) (FileReader, error)

type ParquetFileWriter ¶

type ParquetFileWriter struct {
	// contains filtered or unexported fields
}

ParquetFileWriter is an incremental single-file writer with open/write/close lifecycle. It writes Arrow record batches to a Parquet file and tracks bytes written for rolling file decisions.

func (*ParquetFileWriter) BytesWritten ¶

func (w *ParquetFileWriter) BytesWritten() int64

BytesWritten returns the number of bytes flushed to the output so far.

func (*ParquetFileWriter) Close ¶

func (w *ParquetFileWriter) Close() (_ iceberg.DataFile, err error)

Close finalizes the Parquet file and returns the resulting DataFile with accurate file statistics and size.

func (*ParquetFileWriter) Write ¶

func (w *ParquetFileWriter) Write(batch arrow.RecordBatch) error

Write appends a record batch to the Parquet file.

type ParquetRowGroupTester ¶

type ParquetRowGroupTester struct {
	StatsFn    func(*metadata.RowGroupMetaData, []int) (bool, error)
	BloomPreds []RowGroupBloomPred // nil = no bloom filter pass
}

ParquetRowGroupTester combines stats-based and bloom filter row group pruning. Pass it as the tester argument to wrapPqArrowReader.GetRecords.

type RowGroupBloomPred ¶

type RowGroupBloomPred struct {
	FieldID   int
	PhysBytes [][]byte // one entry for EqualTo; one per value for In
}

RowGroupBloomPred holds the physical-encoded bytes for each literal in a bloom-filterable predicate on one field. A row group can be skipped when NONE of the bytes appear in the column's bloom filter.

type StatisticsCollector ¶

type StatisticsCollector struct {
	FieldID    int
	IcebergTyp iceberg.PrimitiveType
	Mode       MetricsMode
	ColName    string
}

type StatsAgg ¶

type StatsAgg interface {
	Min() iceberg.Literal
	Max() iceberg.Literal
	Update(stats interface{ HasMinMax() bool })
	MinAsBytes() ([]byte, error)
	MaxAsBytes() ([]byte, error)
}

type TypedStats ¶

type TypedStats[T iceberg.LiteralType] interface {
	Min() T
	Max() T
}

type WriteFileInfo ¶

type WriteFileInfo struct {
	FileSchema       *iceberg.Schema
	Spec             iceberg.PartitionSpec
	FileName         string
	StatsCols        map[int]StatisticsCollector
	WriteProps       any
	Content          iceberg.ManifestEntryContent
	EqualityFieldIDs []int
	SortOrderID      int
}

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL