Documentation
¶
Index ¶
- Constants
- func BigEndianToDecimal(buf []byte) (decimal.Decimal128, error)
- func MakeSequencedChan[T any](bufferSize uint, source <-chan T, comesAfter, isNext func(a, b *T) bool, ...) <-chan T
- func MapExec[T, S any](ctx context.Context, nWorkers int, slice iter.Seq[T], fn func(T) (S, error)) iter.Seq2[S, error]
- func PartitionRecordValue(field iceberg.PartitionField, val iceberg.Literal, schema *iceberg.Schema) (iceberg.Optional[iceberg.Literal], error)
- func TablePropertiesFromContext(ctx context.Context) iceberg.Properties
- func TruncateUpperBoundBinary(val []byte, trunc int) []byte
- func TruncateUpperBoundText(s string, trunc int) string
- func VariantFieldIDsFromSchema(sc *iceberg.Schema) map[int]struct{}
- func WithTableProperties(ctx context.Context, props iceberg.Properties) context.Context
- type DataFileOpts
- type DataFileStatistics
- type Enumerated
- type FileFormat
- type FileReader
- type FileSource
- type FileWriter
- type Metadata
- type MetricModeType
- type MetricsMode
- type ParquetFileSource
- type ParquetFileWriter
- type ParquetRowGroupTester
- type RowGroupBloomPred
- type StatisticsCollector
- type StatsAgg
- type TypedStats
- type WriteFileInfo
Constants ¶
const (
ParquetRowGroupSizeBytesKey = "write.parquet.row-group-size-bytes"
ParquetRowGroupSizeBytesDefault = 128 * 1024 * 1024 // 128 MB
ParquetRowGroupLimitKey = "write.parquet.row-group-limit"
ParquetRowGroupLimitDefault = 1048576
ParquetPageSizeBytesKey = "write.parquet.page-size-bytes"
ParquetPageSizeBytesDefault = 1024 * 1024 // 1 MB
ParquetPageRowLimitKey = "write.parquet.page-row-limit"
ParquetPageRowLimitDefault = 20000
ParquetDictSizeBytesKey = "write.parquet.dict-size-bytes"
ParquetDictSizeBytesDefault = 2 * 1024 * 1024 // 2 MB
ParquetPageVersionKey = "write.parquet.page-version"
ParquetPageVersionDefault = "2"
ParquetCompressionKey = "write.parquet.compression-codec"
ParquetCompressionDefault = "zstd"
ParquetCompressionLevelKey = "write.parquet.compression-level"
ParquetCompressionLevelDefault = -1
ParquetBloomFilterMaxBytesKey = "write.parquet.bloom-filter-max-bytes"
ParquetBloomFilterMaxBytesDefault = 1024 * 1024
ParquetBloomFilterColumnEnabledKeyPrefix = "write.parquet.bloom-filter-enabled.column"
ParquetBatchSizeKey = "read.parquet.batch-size"
ParquetBatchSizeDefault = 1 << 17 // 131072 rows
)
Variables ¶
This section is empty.
Functions ¶
func BigEndianToDecimal ¶ added in v0.2.0
func BigEndianToDecimal(buf []byte) (decimal.Decimal128, error)
func MakeSequencedChan ¶
func MakeSequencedChan[T any](bufferSize uint, source <-chan T, comesAfter, isNext func(a, b *T) bool, initial T) <-chan T
MakeSequencedChan creates a channel that outputs values in a given order based on the comesAfter and isNext functions. The values are read in from the provided source and then re-ordered before being sent to the output.
func PartitionRecordValue ¶ added in v0.3.0
func TablePropertiesFromContext ¶ added in v0.6.0
func TablePropertiesFromContext(ctx context.Context) iceberg.Properties
TablePropertiesFromContext retrieves table properties from context. Returns nil if not set.
func TruncateUpperBoundBinary ¶ added in v0.3.0
func TruncateUpperBoundText ¶ added in v0.3.0
func VariantFieldIDsFromSchema ¶ added in v0.6.0
func WithTableProperties ¶ added in v0.6.0
WithTableProperties returns a new context with the given table properties attached. These properties are used by readers to configure read behavior (e.g. batch size).
Types ¶
type DataFileOpts ¶ added in v0.6.0
type DataFileOpts struct {
Schema *iceberg.Schema
Spec iceberg.PartitionSpec
Path string
Format iceberg.FileFormat
Content iceberg.ManifestEntryContent
FileSize int64
PartitionValues map[int]any
SortOrderID int
}
DataFileOpts groups the fields needed to finalize a DataFile from DataFileStatistics. Collapsing the former positional arguments into a struct keeps call sites readable and makes future additions source-compatible.
type DataFileStatistics ¶ added in v0.3.0
type DataFileStatistics struct {
RecordCount int64
ColSizes map[int]int64
ValueCounts map[int]int64
NullValueCounts map[int]int64
NanValueCounts map[int]int64
ColAggs map[int]StatsAgg
SplitOffsets []int64
EqualityFieldIDs []int
}
func (*DataFileStatistics) PartitionValue ¶ added in v0.3.0
func (d *DataFileStatistics) PartitionValue(field iceberg.PartitionField, sc *iceberg.Schema) any
func (*DataFileStatistics) ToDataFile ¶ added in v0.3.0
func (d *DataFileStatistics) ToDataFile(opts DataFileOpts) iceberg.DataFile
type Enumerated ¶
Enumerated is a quick way to represent a sequenced value that can be processed in parallel and then needs to be reordered.
type FileFormat ¶ added in v0.3.0
type FileFormat interface {
Open(context.Context, iceio.IO, string) (FileReader, error)
PathToIDMapping(*iceberg.Schema) (map[string]int, error)
DataFileStatsFromMeta(rdr Metadata, statsCols map[int]StatisticsCollector, colMapping map[string]int, variantFieldIDs map[int]struct{}) *DataFileStatistics
GetWriteProperties(iceberg.Properties) any
WriteDataFile(ctx context.Context, fs iceio.WriteFileIO, partitionValues map[int]any, info WriteFileInfo, batches []arrow.RecordBatch) (iceberg.DataFile, error)
NewFileWriter(ctx context.Context, fs iceio.WriteFileIO, partitionValues map[int]any, info WriteFileInfo, arrowSchema *arrow.Schema) (FileWriter, error)
}
func FormatFromFileName ¶ added in v0.3.0
func FormatFromFileName(fileName string) FileFormat
func GetFileFormat ¶ added in v0.3.0
func GetFileFormat(format iceberg.FileFormat) FileFormat
type FileReader ¶
type FileReader interface {
io.Closer
Metadata() Metadata
SourceFileSize() int64
Schema() (*arrow.Schema, error)
// PrunedSchema takes in the list of projected field IDs and returns the arrow schema
// that represents the underlying file schema with only the projected fields. It also
// returns the indexes of the projected columns to allow reading *only* the needed
// columns.
PrunedSchema(projectedIDs map[int]struct{}, mapping iceberg.NameMapping) (*arrow.Schema, []int, error)
// GetRecords returns a record reader for only the provided columns (using nil will read
// all of the columns of the underlying file.) The `tester` is a function that can be used,
// if non-nil, to filter aspects of the file such as skipping row groups in a parquet file.
GetRecords(ctx context.Context, cols []int, tester any) (array.RecordReader, error)
// ReadTable reads the entire file and returns it as an arrow table.
ReadTable(context.Context) (arrow.Table, error)
}
type FileSource ¶
type FileSource interface {
GetReader(context.Context) (FileReader, error)
}
func GetFile ¶
func GetFile(ctx context.Context, fs iceio.IO, dataFile iceberg.DataFile, isPosDeletes bool) (FileSource, error)
GetFile opens the given file using the provided file system.
The FileSource interface allows abstracting away the underlying file format while providing utilities to read the file as Arrow record batches.
type FileWriter ¶ added in v0.6.0
type FileWriter interface {
Write(arrow.RecordBatch) error
BytesWritten() int64
Close() (iceberg.DataFile, error)
// Abort closes the underlying file handle without finalizing the
// file format (e.g. Parquet footer). It is safe to call regardless
// of how many rows have been written and should be used on error
// paths where Close() may panic or produce an invalid file.
Abort() error
}
FileWriter is an incremental single-file writer with open/write/close lifecycle. It writes Arrow record batches and tracks bytes written for rolling file decisions.
type MetricModeType ¶ added in v0.3.0
type MetricModeType string
const (
MetricModeTruncate MetricModeType = "truncate"
MetricModeNone MetricModeType = "none"
MetricModeCounts MetricModeType = "counts"
MetricModeFull MetricModeType = "full"
)
type MetricsMode ¶ added in v0.3.0
type MetricsMode struct {
Typ MetricModeType
Len int
}
func MatchMetricsMode ¶ added in v0.3.0
func MatchMetricsMode(mode string) (MetricsMode, error)
type ParquetFileSource ¶
type ParquetFileSource struct {
// contains filtered or unexported fields
}
func (*ParquetFileSource) GetReader ¶
func (pfs *ParquetFileSource) GetReader(ctx context.Context) (FileReader, error)
type ParquetFileWriter ¶ added in v0.6.0
type ParquetFileWriter struct {
// contains filtered or unexported fields
}
ParquetFileWriter is an incremental single-file writer with open/write/close lifecycle. It writes Arrow record batches to a Parquet file and tracks bytes written for rolling file decisions.
func (*ParquetFileWriter) Abort ¶ added in v0.6.0
func (w *ParquetFileWriter) Abort() error
func (*ParquetFileWriter) BytesWritten ¶ added in v0.6.0
func (w *ParquetFileWriter) BytesWritten() int64
BytesWritten returns the number of bytes flushed to the output so far.
func (*ParquetFileWriter) Close ¶ added in v0.6.0
func (w *ParquetFileWriter) Close() (_ iceberg.DataFile, err error)
Close finalizes the Parquet file and returns the resulting DataFile with accurate file statistics and size.
func (*ParquetFileWriter) Write ¶ added in v0.6.0
func (w *ParquetFileWriter) Write(batch arrow.RecordBatch) error
Write appends a record batch to the Parquet file.
type ParquetRowGroupTester ¶ added in v0.6.0
type ParquetRowGroupTester struct {
StatsFn func(*metadata.RowGroupMetaData, []int) (bool, error)
BloomPreds []RowGroupBloomPred // nil = no bloom filter pass
}
ParquetRowGroupTester combines stats-based and bloom filter row group pruning. Pass it as the tester argument to wrapPqArrowReader.GetRecords.
type RowGroupBloomPred ¶ added in v0.6.0
type RowGroupBloomPred struct {
FieldID int
PhysBytes [][]byte // one entry for EqualTo; one per value for In
}
RowGroupBloomPred holds the physical-encoded bytes for each literal in a bloom-filterable predicate on one field. A row group can be skipped when NONE of the bytes appear in the column's bloom filter.
type StatisticsCollector ¶ added in v0.3.0
type StatisticsCollector struct {
FieldID int
IcebergTyp iceberg.PrimitiveType
Mode MetricsMode
ColName string
}
type TypedStats ¶ added in v0.3.0
type TypedStats[T iceberg.LiteralType] interface {
Min() T
Max() T
}
type WriteFileInfo ¶ added in v0.3.0
type WriteFileInfo struct {
FileSchema *iceberg.Schema
Spec iceberg.PartitionSpec
FileName string
StatsCols map[int]StatisticsCollector
WriteProps any
Content iceberg.ManifestEntryContent
EqualityFieldIDs []int
SortOrderID int
}