Documentation
¶
Index ¶
- Constants
- Variables
- func ArrowSchemaToIceberg(sc *arrow.Schema, downcastNsTimestamp bool, nameMapping iceberg.NameMapping) (*iceberg.Schema, error)
- func ArrowSchemaToIcebergWithFreshIDs(sc *arrow.Schema, downcastNsTimestamp bool) (*iceberg.Schema, error)
- func ArrowTypeToIceberg(dt arrow.DataType, downcastNsTimestamp bool) (iceberg.Type, error)
- func CollectSafePositionDeletes(tasks []FileScanTask) []iceberg.DataFile
- func GetPartitionRecord(dataFile iceberg.DataFile, partitionType *iceberg.StructType) iceberg.StructLike
- func IsAncestorOf(snapshotID, ancestorID int64, lookup SnapshotLookup) bool
- func NewAddEncryptionKeyUpdate(key EncryptionKey) *addEncryptionKeyUpdate
- func NewAddPartitionSpecUpdate(spec *iceberg.PartitionSpec, initial bool) *addPartitionSpecUpdate
- func NewAddSchemaUpdate(schema *iceberg.Schema) *addSchemaUpdate
- func NewAddSnapshotUpdate(snapshot *Snapshot) *addSnapshotUpdate
- func NewAddSortOrderUpdate(sortOrder *SortOrder) *addSortOrderUpdate
- func NewAssignUUIDUpdate(uuid uuid.UUID) *assignUUIDUpdate
- func NewRemoveEncryptionKeyUpdate(keyID string) *removeEncryptionKeyUpdate
- func NewRemovePartitionStatisticsUpdate(snapshotID int64) *removePartitionStatisticsUpdate
- func NewRemovePropertiesUpdate(removals []string) *removePropertiesUpdate
- func NewRemoveSchemasUpdate(schemaIds []int) *removeSchemasUpdate
- func NewRemoveSnapshotRefUpdate(ref string) *removeSnapshotRefUpdate
- func NewRemoveSnapshotsUpdate(ids []int64, postCommit bool) *removeSnapshotsUpdate
- func NewRemoveSpecUpdate(specIds []int) *removeSpecUpdate
- func NewRemoveStatisticsUpdate(snapshotID int64) *removeStatisticsUpdate
- func NewSetCurrentSchemaUpdate(id int) *setCurrentSchemaUpdate
- func NewSetDefaultSortOrderUpdate(id int) *setDefaultSortOrderUpdate
- func NewSetDefaultSpecUpdate(id int) *setDefaultSpecUpdate
- func NewSetPartitionStatisticsUpdate(stats PartitionStatisticsFile) *setPartitionStatisticsUpdate
- func NewSetPropertiesUpdate(updates iceberg.Properties) *setPropertiesUpdate
- func NewSetSnapshotRefUpdate(name string, snapshotID int64, refType RefType, ...) *setSnapshotRefUpdate
- func NewSetStatisticsUpdate(stats StatisticsFile) *setStatisticsUpdate
- func NewUpgradeFormatVersionUpdate(formatVersion int) *upgradeFormatVersionUpdate
- func SchemaToArrowSchema(sc *iceberg.Schema, metadata map[string]string, ...) (*arrow.Schema, error)
- func ToRequestedSchema(ctx context.Context, requested, fileSchema *iceberg.Schema, ...) (arrow.RecordBatch, error)
- func TypeToArrowType(t iceberg.Type, includeFieldIDs bool, useLargeTypes bool) (arrow.DataType, error)
- func VisitArrowSchema[T any](sc *arrow.Schema, visitor ArrowSchemaVisitor[T]) (res T, err error)
- func WithMaxRefAgeMs(maxRefAgeMs int64) setSnapshotRefOption
- func WithMaxSnapshotAgeMs(maxSnapshotAgeMs int64) setSnapshotRefOption
- func WithMinSnapshotsToKeep(minSnapshotsToKeep int) setSnapshotRefOption
- func WriteRecords(ctx context.Context, tbl *Table, schema *arrow.Schema, ...) iter.Seq2[iceberg.DataFile, error]
- type AddFilesOption
- type ArrowSchemaVisitor
- type BlobMetadata
- type BlobType
- type CatalogIO
- type ColumnUpdate
- type CompactionGroupOption
- type CompactionGroupResult
- type CompactionTaskGroup
- type DeleteOption
- type EncryptionKey
- type ErrIncompatibleSchema
- type ExpireSnapshotsOpt
- type FSysF
- type FileScanTask
- type Identifier
- type IncompatibleField
- type InvalidDefault
- type IsolationLevel
- type LocationProvider
- type Metadata
- func NewMetadata(sc *iceberg.Schema, partitions *iceberg.PartitionSpec, sortOrder SortOrder, ...) (Metadata, error)
- func NewMetadataWithUUID(sc *iceberg.Schema, partitions *iceberg.PartitionSpec, sortOrder SortOrder, ...) (Metadata, error)
- func ParseMetadata(r io.Reader) (Metadata, error)
- func ParseMetadataBytes(b []byte) (Metadata, error)
- func ParseMetadataString(s string) (Metadata, error)
- func UpdateTableMetadata(base Metadata, updates []Update, metadataLoc string) (Metadata, error)
- type MetadataBuilder
- func (b *MetadataBuilder) AddEncryptionKey(key EncryptionKey) error
- func (b *MetadataBuilder) AddPartitionSpec(spec *iceberg.PartitionSpec, initial bool) error
- func (b *MetadataBuilder) AddSchema(schema *iceberg.Schema) error
- func (b *MetadataBuilder) AddSnapshot(snapshot *Snapshot) error
- func (b *MetadataBuilder) AddSnapshotUpdate(u *addSnapshotUpdate) error
- func (b *MetadataBuilder) AddSortOrder(sortOrder *SortOrder) error
- func (b *MetadataBuilder) AppendMetadataLog(entry MetadataLogEntry) *MetadataBuilder
- func (b *MetadataBuilder) Build() (Metadata, error)
- func (b *MetadataBuilder) CurrentSchema() *iceberg.Schema
- func (b *MetadataBuilder) CurrentSpec() (*iceberg.PartitionSpec, error)
- func (b *MetadataBuilder) GetSchemaByID(id int) (*iceberg.Schema, error)
- func (b *MetadataBuilder) GetSortOrderByID(id int) (*SortOrder, error)
- func (b *MetadataBuilder) GetSpecByID(id int) (*iceberg.PartitionSpec, error)
- func (b *MetadataBuilder) HasChanges() bool
- func (b *MetadataBuilder) LastColumnID() int
- func (b *MetadataBuilder) LastUpdatedMS() int64
- func (b *MetadataBuilder) NameMapping() iceberg.NameMapping
- func (b *MetadataBuilder) NextRowID() int64
- func (b *MetadataBuilder) RemoveEncryptionKey(keyID string) error
- func (b *MetadataBuilder) RemovePartitionSpecs(ints []int) error
- func (b *MetadataBuilder) RemovePartitionStatistics(snapshotID int64) error
- func (b *MetadataBuilder) RemoveProperties(keys []string) error
- func (b *MetadataBuilder) RemoveSchemas(ints []int) error
- func (b *MetadataBuilder) RemoveSnapshotRef(name string) error
- func (b *MetadataBuilder) RemoveSnapshots(snapshotIds []int64, postCommit bool) error
- func (b *MetadataBuilder) RemoveStatistics(snapshotID int64) error
- func (b *MetadataBuilder) SetCurrentSchemaID(currentSchemaID int) error
- func (b *MetadataBuilder) SetDefaultSortOrderID(defaultSortOrderID int) error
- func (b *MetadataBuilder) SetDefaultSpecID(defaultSpecID int) error
- func (b *MetadataBuilder) SetFormatVersion(formatVersion int) error
- func (b *MetadataBuilder) SetLastUpdatedMS() *MetadataBuilder
- func (b *MetadataBuilder) SetLoc(loc string) error
- func (b *MetadataBuilder) SetPartitionStatistics(stats PartitionStatisticsFile) error
- func (b *MetadataBuilder) SetProperties(props iceberg.Properties) error
- func (b *MetadataBuilder) SetSnapshotRef(name string, snapshotID int64, refType RefType, ...) error
- func (b *MetadataBuilder) SetStatistics(stats StatisticsFile) error
- func (b *MetadataBuilder) SetUUID(uuid uuid.UUID) error
- func (b *MetadataBuilder) SnapshotByID(id int64) (*Snapshot, error)
- func (b *MetadataBuilder) TrimMetadataLogs(maxEntries int) *MetadataBuilder
- type MetadataLogEntry
- type MoveOp
- type NullOrder
- type Operation
- type OrphanCleanupOption
- func WithDeleteFunc(deleteFunc func(string) error) OrphanCleanupOption
- func WithDryRun(enabled bool) OrphanCleanupOption
- func WithEqualAuthorities(authorities map[string]string) OrphanCleanupOption
- func WithEqualSchemes(schemes map[string]string) OrphanCleanupOption
- func WithFilesOlderThan(duration time.Duration) OrphanCleanupOption
- func WithLocation(location string) OrphanCleanupOption
- func WithMaxConcurrency(maxWorkers int) OrphanCleanupOption
- func WithPrefixMismatchMode(mode PrefixMismatchMode) OrphanCleanupOption
- type OrphanCleanupResult
- type OverwriteOption
- type PartitionStatisticsFile
- type PrefixMismatchMode
- type ReassignedIds
- type RefType
- type Requirement
- func AssertCreate() Requirement
- func AssertCurrentSchemaID(id int) Requirement
- func AssertDefaultSortOrderID(id int) Requirement
- func AssertDefaultSpecID(id int) Requirement
- func AssertLastAssignedFieldID(id int) Requirement
- func AssertLastAssignedPartitionID(id int) Requirement
- func AssertRefSnapshotID(ref string, id *int64) Requirement
- func AssertTableUUID(uuid uuid.UUID) Requirement
- func ParseRequirement(r io.Reader) (Requirement, error)
- func ParseRequirementBytes(b []byte) (Requirement, error)
- func ParseRequirementString(s string) (Requirement, error)
- type Requirements
- type RewriteDataFilesOptions
- type RewriteFiles
- func (r *RewriteFiles) AddDataFile(df iceberg.DataFile) *RewriteFiles
- func (r *RewriteFiles) Apply(deletes, adds, safeDeletes []iceberg.DataFile) *RewriteFiles
- func (r *RewriteFiles) ApplyResult(gr CompactionGroupResult) *RewriteFiles
- func (r *RewriteFiles) Commit(ctx context.Context) error
- func (r *RewriteFiles) DeleteFile(df iceberg.DataFile) *RewriteFiles
- type RewriteResult
- type RollingDataWriter
- type RowDelta
- type Scan
- func (scan *Scan) PlanFiles(ctx context.Context) ([]FileScanTask, error)
- func (scan *Scan) Projection() (*iceberg.Schema, error)
- func (scan *Scan) ReadTasks(ctx context.Context, tasks []FileScanTask) (*arrow.Schema, iter.Seq2[arrow.RecordBatch, error], error)
- func (scan *Scan) Snapshot() *Snapshot
- func (scan *Scan) ToArrowRecords(ctx context.Context) (*arrow.Schema, iter.Seq2[arrow.RecordBatch, error], error)
- func (scan *Scan) ToArrowTable(ctx context.Context) (arrow.Table, error)
- func (scan *Scan) UseRef(name string) (*Scan, error)
- func (scan *Scan) UseRowLimit(n int64) *Scan
- type ScanOption
- func WitMaxConcurrency(n int) ScanOption
- func WithCaseSensitive(b bool) ScanOption
- func WithLimit(n int64) ScanOption
- func WithOptions(opts iceberg.Properties) ScanOption
- func WithRowFilter(e iceberg.BooleanExpression) ScanOption
- func WithSelectedFields(fields ...string) ScanOption
- func WithSnapshotAsOf(timeStampMs int64) ScanOption
- func WithSnapshotID(n int64) ScanOption
- type SchemaOptions
- type SequenceNumberValidator
- type Snapshot
- type SnapshotLogEntry
- type SnapshotLookup
- type SnapshotRef
- type SnapshotSummaryCollector
- type SortDirection
- type SortField
- type SortOrder
- func (s *SortOrder) CheckCompatibility(schema *iceberg.Schema) error
- func (s SortOrder) Equals(rhs SortOrder) bool
- func (s SortOrder) Fields() iter.Seq2[int, SortField]
- func (s SortOrder) IsUnsorted() bool
- func (s SortOrder) Len() int
- func (s SortOrder) MarshalJSON() ([]byte, error)
- func (s SortOrder) OrderID() int
- func (s SortOrder) String() string
- func (s *SortOrder) UnmarshalJSON(b []byte) error
- type StagedTable
- type StatisticsFile
- type Summary
- type Table
- func (t Table) AllManifests(ctx context.Context) iter.Seq2[iceberg.ManifestFile, error]
- func (t Table) Append(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties) (*Table, error)
- func (t Table) AppendTable(ctx context.Context, tbl arrow.Table, batchSize int64, ...) (*Table, error)
- func (t Table) CurrentSnapshot() *Snapshot
- func (t Table) Delete(ctx context.Context, filter iceberg.BooleanExpression, ...) (*Table, error)
- func (t Table) DeleteOrphanFiles(ctx context.Context, opts ...OrphanCleanupOption) (OrphanCleanupResult, error)
- func (t Table) Equals(other Table) bool
- func (t Table) FS(ctx context.Context) (icebergio.IO, error)
- func (t Table) Identifier() Identifier
- func (t Table) Location() string
- func (t Table) LocationProvider() (LocationProvider, error)
- func (t Table) Metadata() Metadata
- func (t Table) MetadataLocation() string
- func (t Table) NameMapping() iceberg.NameMapping
- func (t Table) NewTransaction() *Transaction
- func (t Table) NewTransactionOnBranch(branch string) *Transaction
- func (t Table) Overwrite(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties, ...) (*Table, error)
- func (t Table) OverwriteTable(ctx context.Context, tbl arrow.Table, batchSize int64, ...) (*Table, error)
- func (t Table) Properties() iceberg.Properties
- func (t *Table) Refresh(ctx context.Context) error
- func (t Table) Scan(opts ...ScanOption) *Scan
- func (t Table) Schema() *iceberg.Schema
- func (t Table) Schemas() map[int]*iceberg.Schema
- func (t Table) SnapshotAsOf(timestampMs int64, inclusive bool) *Snapshot
- func (t Table) SnapshotByID(id int64) *Snapshot
- func (t Table) SnapshotByName(name string) *Snapshot
- func (t Table) SortOrder() SortOrder
- func (t Table) Spec() iceberg.PartitionSpec
- type TableCommit
- type Transaction
- func (t *Transaction) AddDataFiles(ctx context.Context, dataFiles []iceberg.DataFile, ...) error
- func (t *Transaction) AddFiles(ctx context.Context, filePaths []string, snapshotProps iceberg.Properties, ...) error
- func (t *Transaction) Append(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties) error
- func (t *Transaction) AppendTable(ctx context.Context, tbl arrow.Table, batchSize int64, ...) error
- func (t *Transaction) Commit(ctx context.Context) (*Table, error)
- func (t *Transaction) Delete(ctx context.Context, filter iceberg.BooleanExpression, ...) (err error)
- func (t *Transaction) ExpireSnapshots(opts ...ExpireSnapshotsOpt) error
- func (t *Transaction) MarkCommitted()
- func (t *Transaction) NewRewrite(snapshotProps iceberg.Properties) *RewriteFiles
- func (t *Transaction) NewRowDelta(snapshotProps iceberg.Properties) *RowDelta
- func (t *Transaction) Overwrite(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties, ...) error
- func (t *Transaction) OverwriteTable(ctx context.Context, tbl arrow.Table, batchSize int64, ...) error
- func (t *Transaction) ReplaceDataFiles(ctx context.Context, filesToDelete, filesToAdd []string, ...) error
- func (t *Transaction) ReplaceDataFilesWithDataFiles(ctx context.Context, filesToDelete, filesToAdd []iceberg.DataFile, ...) error
- func (t *Transaction) ReplaceFiles(ctx context.Context, ...) error
- func (t *Transaction) RewriteDataFiles(ctx context.Context, groups []CompactionTaskGroup, ...) (*RewriteResult, error)
- func (t *Transaction) RollbackToSnapshot(snapshotID int64) error
- func (t *Transaction) Scan(opts ...ScanOption) (*Scan, error)
- func (t *Transaction) SetProperties(props iceberg.Properties) error
- func (t *Transaction) StagedTable() (*StagedTable, error)
- func (t *Transaction) TableCommit() (TableCommit, error)
- func (t *Transaction) UpdateSchema(caseSensitive bool, allowIncompatibleChanges bool, opts ...UpdateSchemaOption) *UpdateSchema
- func (t *Transaction) UpdateSpec(caseSensitive bool) *UpdateSpec
- func (t *Transaction) UpgradeFormatVersion(version int) error
- func (t *Transaction) WriteEqualityDeletes(ctx context.Context, equalityFieldIDs []int, ...) ([]iceberg.DataFile, error)
- type UnsupportedType
- type Update
- type UpdateSchema
- func (u *UpdateSchema) AddColumn(path []string, fieldType iceberg.Type, doc string, required bool, ...) *UpdateSchema
- func (u *UpdateSchema) Apply() (*iceberg.Schema, error)
- func (u *UpdateSchema) BuildUpdates() ([]Update, []Requirement, error)
- func (u *UpdateSchema) Commit() error
- func (u *UpdateSchema) DeleteColumn(path []string) *UpdateSchema
- func (u *UpdateSchema) MoveAfter(path, relativeTo []string) *UpdateSchema
- func (u *UpdateSchema) MoveBefore(path, relativeTo []string) *UpdateSchema
- func (u *UpdateSchema) MoveColumn(op MoveOp, path, relativeTo []string) *UpdateSchema
- func (u *UpdateSchema) MoveFirst(path []string) *UpdateSchema
- func (u *UpdateSchema) RenameColumn(path []string, newName string) *UpdateSchema
- func (u *UpdateSchema) SetIdentifierField(paths [][]string) *UpdateSchema
- func (u *UpdateSchema) UpdateColumn(path []string, update ColumnUpdate) *UpdateSchema
- type UpdateSchemaOption
- type UpdateSpec
- func (us *UpdateSpec) AddField(sourceColName string, transform iceberg.Transform, partitionFieldName string) *UpdateSpec
- func (us *UpdateSpec) AddIdentity(sourceColName string) *UpdateSpec
- func (us *UpdateSpec) Apply() (iceberg.PartitionSpec, error)
- func (us *UpdateSpec) BuildUpdates() ([]Update, []Requirement, error)
- func (us *UpdateSpec) Commit() error
- func (us *UpdateSpec) RemoveField(name string) *UpdateSpec
- func (us *UpdateSpec) RenameField(name string, newName string) *UpdateSpec
- type Updates
- type WriteOption
- type WriteRecordOption
- type WriteTask
Constants ¶
const ( ScanOptionArrowUseLargeTypes = "arrow.use_large_types" ScanOptionRowLineageEnabled = "row_lineage.enabled" )
const ( ArrowFieldDocKey = "doc" // Arrow schemas that are generated from the Parquet library will utilize // this key to identify the field id of the source Parquet field. // We use this when converting to Iceberg to provide field IDs ArrowParquetFieldIDKey = "PARQUET:field_id" )
Constants used as keys when reading Arrow field metadata.
const ( // WriteDeleteIsolationLevelKey controls isolation for DeleteFiles // and RowDelta eq-delete commits. Default: serializable (matches // Java trunk). Older Java releases and some pyiceberg codepaths // used "snapshot"; set this property explicitly to snapshot if // you are migrating from a pipeline that relied on that behavior. WriteDeleteIsolationLevelKey = "write.delete.isolation-level" WriteDeleteIsolationLevelDefault = IsolationSerializable // WriteUpdateIsolationLevelKey controls isolation for overwrite / // merge commits. Default: serializable (matches Java trunk). // Older Java releases and some pyiceberg codepaths used // "snapshot"; set this property explicitly to snapshot if you are // migrating from a pipeline that relied on that behavior. WriteUpdateIsolationLevelKey = "write.update.isolation-level" WriteUpdateIsolationLevelDefault = IsolationSerializable )
Property keys for configuring isolation per write operation. Names match the Java TableProperties constants so configs can be ported directly between engines.
const ( WriteDataPathKey = "write.data.path" WriteMetadataPathKey = "write.metadata.path" WriteMetadataLocationKey = "write.metadata.location" WriteObjectStorePartitionedPathsKey = "write.object-storage.partitioned-paths" WriteObjectStorePartitionedPathsDefault = true ObjectStoreEnabledKey = "write.object-storage.enabled" ObjectStoreEnabledDefault = false DefaultNameMappingKey = "schema.name-mapping.default" MetricsModeColumnConfPrefix = "write.metadata.metrics.column" DefaultWriteMetricsModeKey = "write.metadata.metrics.default" DefaultWriteMetricsModeDefault = "truncate(16)" ParquetRowGroupSizeBytesKey = internal.ParquetRowGroupSizeBytesKey ParquetRowGroupSizeBytesDefault = internal.ParquetRowGroupSizeBytesDefault ParquetRowGroupLimitKey = internal.ParquetRowGroupLimitKey ParquetRowGroupLimitDefault = internal.ParquetRowGroupLimitDefault ParquetPageSizeBytesKey = internal.ParquetPageSizeBytesKey ParquetPageSizeBytesDefault = internal.ParquetPageSizeBytesDefault ParquetPageRowLimitKey = internal.ParquetPageRowLimitKey ParquetPageRowLimitDefault = internal.ParquetPageRowLimitDefault ParquetDictSizeBytesKey = internal.ParquetDictSizeBytesKey ParquetDictSizeBytesDefault = internal.ParquetDictSizeBytesDefault ParquetPageVersionKey = internal.ParquetPageVersionKey ParquetPageVersionDefault = internal.ParquetPageVersionDefault ParquetCompressionKey = internal.ParquetCompressionKey ParquetCompressionDefault = internal.ParquetCompressionDefault ParquetCompressionLevelKey = internal.ParquetCompressionLevelKey ParquetCompressionLevelDefault = internal.ParquetCompressionLevelDefault ParquetBloomFilterMaxBytesKey = internal.ParquetBloomFilterMaxBytesKey ParquetBloomFilterMaxBytesDefault = internal.ParquetBloomFilterMaxBytesDefault ParquetBloomFilterColumnEnabledKeyPrefix = internal.ParquetBloomFilterColumnEnabledKeyPrefix ParquetBatchSizeKey = internal.ParquetBatchSizeKey ParquetBatchSizeDefault = internal.ParquetBatchSizeDefault ManifestMergeEnabledKey = 
"commit.manifest-merge.enabled" ManifestMergeEnabledDefault = false ManifestTargetSizeBytesKey = "commit.manifest.target-size-bytes" ManifestTargetSizeBytesDefault = 8 * 1024 * 1024 // 8 MB ManifestMinMergeCountKey = "commit.manifest.min-count-to-merge" ManifestMinMergeCountDefault = 100 WritePartitionSummaryLimitKey = "write.summary.partition-limit" WritePartitionSummaryLimitDefault = 0 WriteDeleteModeKey = "write.delete.mode" WriteDeleteModeDefault = WriteModeCopyOnWrite MetadataDeleteAfterCommitEnabledKey = "write.metadata.delete-after-commit.enabled" MetadataDeleteAfterCommitEnabledDefault = false MetadataPreviousVersionsMaxKey = "write.metadata.previous-versions-max" MetadataPreviousVersionsMaxDefault = 100 MetadataCompressionKey = "write.metadata.compression-codec" MetadataCompressionDefault = "none" WriteFormatDefaultKey = "write.format.default" WriteFormatDefaultDefault = "parquet" WriteTargetFileSizeBytesKey = "write.target-file-size-bytes" WriteTargetFileSizeBytesDefault = 512 * 1024 * 1024 // 512 MB MinSnapshotsToKeepKey = "min-snapshots-to-keep" MinSnapshotsToKeepDefault = math.MaxInt MaxSnapshotAgeMsKey = "max-snapshot-age-ms" MaxSnapshotAgeMsDefault = math.MaxInt MaxRefAgeMsKey = "max-ref-age-ms" MaxRefAgeMsDefault = math.MaxInt // CommitNumRetriesKey is the number of commit retry attempts before // giving up on ErrCommitFailed from the catalog. // // The default is 0 (no retries) until refresh-and-replay lands; a // retry loop that reuses the original updates/requirements will // fail deterministically on genuine OCC conflicts and only slow // down the final error. Callers that observe transient catalog // flakiness (dropped connections, brief 409 during leader // election) can raise this to recover. CommitNumRetriesKey = "commit.retry.num-retries" CommitNumRetriesDefault = 0 // CommitMinRetryWaitMsKey is the initial wait time in milliseconds // for exponential backoff between commit retry attempts. Default: 100ms. 
CommitMinRetryWaitMsKey = "commit.retry.min-wait-ms" CommitMinRetryWaitMsDefault = 100 // CommitMaxRetryWaitMsKey is the maximum wait time in milliseconds // between commit retry attempts. Default: 60s. CommitMaxRetryWaitMsKey = "commit.retry.max-wait-ms" CommitMaxRetryWaitMsDefault = 60 * 1000 // CommitTotalRetryTimeoutMsKey bounds the total time spent across all // retry attempts. Default: 30 minutes. CommitTotalRetryTimeoutMsKey = "commit.retry.total-timeout-ms" CommitTotalRetryTimeoutMsDefault = 30 * 60 * 1000 )
const ( PropertyFormatVersion = "format-version" PropertyUuid = "uuid" PropertySnapshotCount = "snapshot-count" PropertyCurrentSnapshotId = "current-snapshot-id" PropertyCurrentSnapshotSummary = "current-snapshot-summary" PropertyCurrentSnapshotTimestamp = "current-snapshot-timestamp" PropertyCurrentSchema = "current-schema" PropertyDefaultPartitionSpec = "default-partition-spec" PropertyDefaultSortOrder = "default-sort-order" )
Reserved properties
const ( MetadataCompressionCodecNone = "none" MetadataCompressionCodecGzip = "gzip" MetadataCompressionCodecZstd = "zstd" )
Metadata compression codecs
const ( WriteModeCopyOnWrite = "copy-on-write" WriteModeMergeOnRead = "merge-on-read" )
Write modes
const ( InitialSortOrderID = 1 UnsortedSortOrderID = 0 )
const ( UpdateAddSpec = "add-spec" UpdateAddSchema = "add-schema" UpdateAddSnapshot = "add-snapshot" UpdateAddSortOrder = "add-sort-order" UpdateAssignUUID = "assign-uuid" UpdateAddEncryptionKey = "add-encryption-key" UpdateRemoveEncryptionKey = "remove-encryption-key" UpdateRemovePartitionStatistics = "remove-partition-statistics" UpdateRemoveProperties = "remove-properties" UpdateRemoveSchemas = "remove-schemas" UpdateRemoveSnapshots = "remove-snapshots" UpdateRemoveSnapshotRef = "remove-snapshot-ref" UpdateRemoveSpec = "remove-partition-specs" UpdateRemoveStatistics = "remove-statistics" UpdateSetCurrentSchema = "set-current-schema" UpdateSetDefaultSortOrder = "set-default-sort-order" UpdateSetDefaultSpec = "set-default-spec" UpdateSetLocation = "set-location" UpdateSetPartitionStatistics = "set-partition-statistics" UpdateSetProperties = "set-properties" UpdateSetSnapshotRef = "set-snapshot-ref" UpdateSetStatistics = "set-statistics" UpdateUpgradeFormatVersion = "upgrade-format-version" )
These are the various update actions defined in the Iceberg spec.
const DefaultFormatVersion = 2
const MainBranch = "main"
const ScanNoLimit = -1
const TableRootID = -1
Variables ¶
var ( // ErrConflictingDataFiles is returned when a concurrent commit // added data files that satisfy the committer's filter. ErrConflictingDataFiles = fmt.Errorf("%w: concurrent data files added", ErrCommitFailed) // ErrConflictingDeleteFiles is returned when a concurrent commit // added delete files that could mask rows the committer is // writing or replacing. ErrConflictingDeleteFiles = fmt.Errorf("%w: concurrent delete files added", ErrCommitFailed) // ErrDataFilesMissing is returned when a concurrent commit deleted // data files the committer explicitly references (e.g. a position // delete pointing at a file that is no longer reachable). ErrDataFilesMissing = fmt.Errorf("%w: referenced data files missing", ErrCommitFailed) )
Retryable conflict sentinels. All wrap ErrCommitFailed so the retry loop in doCommit treats them as retryable. Callers that need to distinguish which kind of conflict occurred can match on the specific sentinel via errors.Is.
var ( ErrInvalidMetadataFormatVersion = errors.New("invalid or missing format-version in table metadata") ErrInvalidMetadata = errors.New("invalid metadata") ErrPartitionSpecNotFound = errors.New("partition spec not found") )
var ( ErrInvalidOperation = errors.New("invalid operation value") ErrMissingOperation = errors.New("missing operation key") ErrInvalidRowLineage = errors.New("invalid row lineage") )
var ( ErrInvalidSortOrderID = errors.New("invalid sort order ID") ErrInvalidTransform = errors.New("invalid transform, must be a valid transform string or a transform object") ErrInvalidSortDirection = errors.New("invalid sort direction, must be 'asc' or 'desc'") ErrInvalidNullOrder = errors.New("invalid null order, must be 'nulls-first' or 'nulls-last'") )
var ErrCommitDiverged = errors.New("commit diverged: base snapshot is no longer on the branch")
ErrCommitDiverged is returned when the committer's base snapshot is no longer on the branch, so conflict validation cannot enumerate concurrent snapshots. The committer must refresh and rebuild their commit from the new base — naive retry would fail identically.
Unlike the other conflict sentinels in this file, ErrCommitDiverged does NOT wrap ErrCommitFailed: it is terminal for the current attempt. This mirrors Java's ValidationException, which the SnapshotProducer retry machinery does not catch.
var ErrCommitFailed = errors.New("commit failed, refresh and try again")
ErrCommitFailed is the sentinel error returned by catalogs when a commit fails due to a concurrent modification (e.g. HTTP 409 Conflict from the REST catalog). Catalog implementations should wrap this error so that callers using errors.Is(err, table.ErrCommitFailed) can detect retryable commit conflicts.
Currently only catalog/rest wraps this sentinel; Glue, SQL, and Hive catalogs return their conflict errors raw and will not trigger retries until follow-up work wires them through (tracked under issue #830).
var ErrEmptyEqualityFieldIDs = errors.New("equality field IDs must not be empty")
var ErrInvalidRefType = errors.New("invalid snapshot ref type, should be 'branch' or 'tag'")
var ErrInvalidRequirement = errors.New("invalid requirement")
var ErrSnapshotNotFound = errors.New("snapshot not found")
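ErrSnapshotNotFound is returned (wrapped) by metadata lookups and by computeOwnManifests when a snapshot ID does not exist in the table's snapshot list. Because it is returned wrapped, callers (and the tests that pin this behavior) should match it with errors.Is(err, ErrSnapshotNotFound) rather than a direct comparison.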
var ErrWriteIORequired = errors.New("commit: file system does not implement WriteFileIO")
ErrWriteIORequired is returned by doCommit when the table's file system does not implement io.WriteFileIO. Manifest-list rebuild on retry requires write access; failing fast here is preferable to silently skipping the rebuild and reintroducing the stale-parent data-loss bug. Callers that need to detect this condition should use errors.Is(err, ErrWriteIORequired).
var PositionalDeleteArrowSchema, _ = SchemaToArrowSchema(iceberg.PositionalDeleteSchema, nil, true, false)
var ReservedProperties = [9]string{ PropertyFormatVersion, PropertyUuid, PropertySnapshotCount, PropertyCurrentSnapshotId, PropertyCurrentSnapshotSummary, PropertyCurrentSnapshotTimestamp, PropertyCurrentSchema, PropertyDefaultPartitionSpec, PropertyDefaultSortOrder, }
var UnsortedSortOrder = SortOrder{/* contains filtered or unexported fields */}
A default Sort Order indicating no sort order at all
Functions ¶
func ArrowSchemaToIceberg ¶
func ArrowSchemaToIcebergWithFreshIDs ¶ added in v0.3.0
func ArrowTypeToIceberg ¶
func CollectSafePositionDeletes ¶ added in v0.6.0
func CollectSafePositionDeletes(tasks []FileScanTask) []iceberg.DataFile
CollectSafePositionDeletes returns position delete files from the given tasks that are safe to remove during compaction.
A position delete file is safe to remove when it was matched to a data file (via scan planning) and that data file is being rewritten in this compaction group. Since ReadTasks applies the deletes during reading, the new output files will not contain the deleted rows.
Only position deletes (EntryContentPosDeletes) are considered. Equality deletes are decided by [compaction.DecideDeadEqualityDeletes] (which needs partition-wide visibility, not just the task scope). Deletion vectors will be handled when DV read support lands.
Caller contract: every data file referenced by a returned pos-delete must be in the caller's rewrite set across the entire commit. This function only sees one group's tasks, but a pos-delete file can reference data files across multiple groups (the planner bin-packs within a partition via [compaction.Config.PlanCompaction] and skips files via MinInputFiles). If a pos-delete is reported safe by one group but references a still-live data file in another group — or a file the planner skipped — committing only this group's rewrite would orphan the still-live data file's deletes. Coordinators that aggregate multiple groups into one rewrite snapshot are responsible for re-checking against the full set of rewritten paths, or for moving this computation leader-side once worker outputs have been aggregated.
ExecuteCompactionGroup calls this internally to populate [CompactionGroupResult.SafePosDeletes]. It is kept exported for custom workers that want the spec-shaped predicate without taking the rest of ExecuteCompactionGroup's read+write pipeline.
func GetPartitionRecord ¶ added in v0.6.0
func GetPartitionRecord(dataFile iceberg.DataFile, partitionType *iceberg.StructType) iceberg.StructLike
GetPartitionRecord converts a DataFile's partition map into a positional record ordered by the fields of the given partition struct type.
func IsAncestorOf ¶ added in v0.6.0
func IsAncestorOf(snapshotID, ancestorID int64, lookup SnapshotLookup) bool
IsAncestorOf returns true if ancestorID is in the parent chain of snapshotID (or equal to snapshotID, provided snapshotID resolves).
Returns false if snapshotID cannot be resolved, if ancestorID is not reachable from snapshotID via ParentSnapshotID, or if a cycle prevents reaching ancestorID.
Note: every snapshot on the walked chain must be resolvable via lookup. This diverges from Java's SnapshotUtil.isAncestorOf which walks parent *ids* regardless of snapshot resolvability. The Go semantics are stricter (conservative) — broken chains return false rather than matching an unreachable ancestor id.
func NewAddEncryptionKeyUpdate ¶ added in v0.6.0
func NewAddEncryptionKeyUpdate(key EncryptionKey) *addEncryptionKeyUpdate
NewAddEncryptionKeyUpdate creates a new Update that adds or replaces an encryption key (indexed by its key-id) in the table metadata.
func NewAddPartitionSpecUpdate ¶ added in v0.2.0
func NewAddPartitionSpecUpdate(spec *iceberg.PartitionSpec, initial bool) *addPartitionSpecUpdate
NewAddPartitionSpecUpdate creates a new update that adds the given partition spec to the table metadata. If the initial flag is set to true, the spec is considered the initial spec of the table, and all other previously added specs in the metadata builder are removed.
func NewAddSchemaUpdate ¶ added in v0.2.0
NewAddSchemaUpdate creates a new update that adds the given schema and updates the lastColumnID based on the schema.
func NewAddSnapshotUpdate ¶ added in v0.2.0
func NewAddSnapshotUpdate(snapshot *Snapshot) *addSnapshotUpdate
NewAddSnapshotUpdate creates a new update that adds the given snapshot to the table metadata.
func NewAddSortOrderUpdate ¶ added in v0.2.0
func NewAddSortOrderUpdate(sortOrder *SortOrder) *addSortOrderUpdate
NewAddSortOrderUpdate creates a new update that adds the given sort order to the table metadata. If the initial flag is set to true, the sort order is considered the initial sort order of the table, and all previously added sort orders in the metadata builder are removed.
func NewAssignUUIDUpdate ¶ added in v0.2.0
NewAssignUUIDUpdate creates a new update to assign a UUID to the table metadata.
func NewRemoveEncryptionKeyUpdate ¶ added in v0.6.0
func NewRemoveEncryptionKeyUpdate(keyID string) *removeEncryptionKeyUpdate
NewRemoveEncryptionKeyUpdate creates a new Update that removes the encryption key with the given key-id from the table metadata.
func NewRemovePartitionStatisticsUpdate ¶ added in v0.6.0
func NewRemovePartitionStatisticsUpdate(snapshotID int64) *removePartitionStatisticsUpdate
NewRemovePartitionStatisticsUpdate creates a new Update that removes the partition statistics file for the given snapshot ID from the table metadata.
func NewRemovePropertiesUpdate ¶ added in v0.2.0
func NewRemovePropertiesUpdate(removals []string) *removePropertiesUpdate
NewRemovePropertiesUpdate creates a new update that removes properties from the table metadata. The properties are identified by their names, and if a property with the given name does not exist, it is ignored.
func NewRemoveSchemasUpdate ¶ added in v0.4.0
func NewRemoveSchemasUpdate(schemaIds []int) *removeSchemasUpdate
NewRemoveSchemasUpdate creates a new Update that removes a list of schemas from the table metadata.
func NewRemoveSnapshotRefUpdate ¶ added in v0.2.0
func NewRemoveSnapshotRefUpdate(ref string) *removeSnapshotRefUpdate
NewRemoveSnapshotRefUpdate creates a new update that removes a snapshot reference from the table metadata.
func NewRemoveSnapshotsUpdate ¶ added in v0.2.0
NewRemoveSnapshotsUpdate creates a new update that removes all snapshots from the table metadata with the given snapshot IDs.
func NewRemoveSpecUpdate ¶ added in v0.4.0
func NewRemoveSpecUpdate(specIds []int) *removeSpecUpdate
NewRemoveSpecUpdate creates a new Update that removes a list of partition specs from the table metadata.
func NewRemoveStatisticsUpdate ¶ added in v0.6.0
func NewRemoveStatisticsUpdate(snapshotID int64) *removeStatisticsUpdate
NewRemoveStatisticsUpdate creates a new Update that removes the statistics file for the given snapshot ID from the table metadata.
func NewSetCurrentSchemaUpdate ¶ added in v0.2.0
func NewSetCurrentSchemaUpdate(id int) *setCurrentSchemaUpdate
NewSetCurrentSchemaUpdate creates a new update that sets the current schema of the table metadata to the given schema ID.
func NewSetDefaultSortOrderUpdate ¶ added in v0.2.0
func NewSetDefaultSortOrderUpdate(id int) *setDefaultSortOrderUpdate
NewSetDefaultSortOrderUpdate creates a new update that sets the default sort order of the table metadata to the given sort order ID.
func NewSetDefaultSpecUpdate ¶ added in v0.2.0
func NewSetDefaultSpecUpdate(id int) *setDefaultSpecUpdate
NewSetDefaultSpecUpdate creates a new update that sets the default partition spec of the table metadata to the given spec ID.
func NewSetPartitionStatisticsUpdate ¶ added in v0.6.0
func NewSetPartitionStatisticsUpdate(stats PartitionStatisticsFile) *setPartitionStatisticsUpdate
NewSetPartitionStatisticsUpdate creates a new Update that adds or replaces the partition statistics file for the given snapshot ID in the table metadata.
func NewSetPropertiesUpdate ¶ added in v0.2.0
func NewSetPropertiesUpdate(updates iceberg.Properties) *setPropertiesUpdate
NewSetPropertiesUpdate creates a new update that sets the given properties in the table metadata.
func NewSetSnapshotRefUpdate ¶ added in v0.2.0
func NewSetSnapshotRefUpdate( name string, snapshotID int64, refType RefType, maxRefAgeMs, maxSnapshotAgeMs int64, minSnapshotsToKeep int, ) *setSnapshotRefUpdate
NewSetSnapshotRefUpdate creates a new update that sets the given snapshot reference as the current snapshot of the table metadata. MaxRefAgeMs, MaxSnapshotAgeMs, and MinSnapshotsToKeep are optional, and any non-positive values are ignored.
func NewSetStatisticsUpdate ¶ added in v0.6.0
func NewSetStatisticsUpdate(stats StatisticsFile) *setStatisticsUpdate
NewSetStatisticsUpdate creates a new Update that adds or replaces the statistics file for the given snapshot ID in the table metadata.
func NewUpgradeFormatVersionUpdate ¶ added in v0.2.0
func NewUpgradeFormatVersionUpdate(formatVersion int) *upgradeFormatVersionUpdate
NewUpgradeFormatVersionUpdate creates a new update that upgrades the format version of the table metadata to the given formatVersion.
func SchemaToArrowSchema ¶
func SchemaToArrowSchema(sc *iceberg.Schema, metadata map[string]string, includeFieldIDs, useLargeTypes bool) (*arrow.Schema, error)
SchemaToArrowSchema converts an Iceberg schema to an Arrow schema. If the metadata parameter is non-nil, it will be included as the top-level metadata in the schema. If includeFieldIDs is true, then each field of the schema will contain a metadata key PARQUET:field_id set to the field id from the iceberg schema.
func ToRequestedSchema ¶
func ToRequestedSchema(ctx context.Context, requested, fileSchema *iceberg.Schema, batch arrow.RecordBatch, opts SchemaOptions) (arrow.RecordBatch, error)
ToRequestedSchema constructs a new record batch matching the requested iceberg schema, casting columns where necessary.
func TypeToArrowType ¶
func TypeToArrowType(t iceberg.Type, includeFieldIDs bool, useLargeTypes bool) (arrow.DataType, error)
TypeToArrowType converts a given iceberg type into the equivalent Arrow data type. For nested types (List, Struct, Map), if includeFieldIDs is true, then the child fields will contain a metadata key PARQUET:field_id set to the field id.
func VisitArrowSchema ¶
func VisitArrowSchema[T any](sc *arrow.Schema, visitor ArrowSchemaVisitor[T]) (res T, err error)
func WithMaxRefAgeMs ¶ added in v0.2.0
func WithMaxRefAgeMs(maxRefAgeMs int64) setSnapshotRefOption
func WithMaxSnapshotAgeMs ¶ added in v0.2.0
func WithMaxSnapshotAgeMs(maxSnapshotAgeMs int64) setSnapshotRefOption
func WithMinSnapshotsToKeep ¶ added in v0.2.0
func WithMinSnapshotsToKeep(minSnapshotsToKeep int) setSnapshotRefOption
func WriteRecords ¶ added in v0.6.0
func WriteRecords(ctx context.Context, tbl *Table, schema *arrow.Schema, records iter.Seq2[arrow.RecordBatch, error], opts ...WriteRecordOption, ) iter.Seq2[iceberg.DataFile, error]
WriteRecords writes Arrow record batches to Parquet data files for the given table, returning an iterator of the resulting DataFile objects.
The provided Arrow schema must be compatible with the table's current Iceberg schema: each field in the Arrow schema is matched to the table schema by field ID (or by name via the table's name mapping if field IDs are absent). The Arrow schema may be a subset of the table schema (projection), but every field present must have a type that is promotable to the corresponding table field type.
WriteRecords releases each RecordBatch it consumes. If the caller needs a batch to remain valid after it has been yielded, it must call Retain before yielding and is then responsible for the corresponding Release.
Types ¶
type AddFilesOption ¶ added in v0.6.0
type AddFilesOption func(addFilesOp *addFilesOperation)
func WithAddFilesConcurrency ¶ added in v0.6.0
func WithAddFilesConcurrency(concurrency int) AddFilesOption
WithAddFilesConcurrency overwrites the default concurrency for add files operation. Default: runtime.GOMAXPROCS(0)
type ArrowSchemaVisitor ¶
type ArrowSchemaVisitor[T any] interface { Schema(*arrow.Schema, T) T Struct(*arrow.StructType, []T) T Field(arrow.Field, T) T List(arrow.ListLikeType, T) T Map(mt *arrow.MapType, keyResult T, valueResult T) T Primitive(arrow.DataType) T }
ArrowSchemaVisitor is an interface that can be implemented and passed to VisitArrowSchema to traverse an Arrow schema.
type BlobMetadata ¶ added in v0.4.0
type BlobMetadata struct {
Type BlobType `json:"type"`
SnapshotID int64 `json:"snapshot-id"`
SequenceNumber int64 `json:"sequence-number"`
Fields []int32 `json:"fields"`
Properties map[string]string `json:"properties"`
}
BlobMetadata is the metadata of a statistics or indices blob.
type BlobType ¶ added in v0.4.0
type BlobType string
BlobType is the type of blob in a Puffin file
func (*BlobType) UnmarshalJSON ¶ added in v0.4.0
type CatalogIO ¶ added in v0.2.0
type CatalogIO interface {
LoadTable(context.Context, Identifier) (*Table, error)
CommitTable(context.Context, Identifier, []Requirement, []Update) (Metadata, string, error)
}
type ColumnUpdate ¶ added in v0.5.0
type CompactionGroupOption ¶ added in v0.6.0
type CompactionGroupOption func(*compactionGroupConfig)
CompactionGroupOption configures a single ExecuteCompactionGroup call. Use the With* helpers to construct values.
func WithCompactionScanConcurrency ¶ added in v0.6.0
func WithCompactionScanConcurrency(n int) CompactionGroupOption
WithCompactionScanConcurrency sets the scan concurrency used when reading the group's tasks. Forwarded to Table.Scan as WitMaxConcurrency. Zero (the default) means runtime.GOMAXPROCS.
TODO: the WitMaxConcurrency link enshrines a pre-existing typo (missing `h`). Update this reference when that symbol is renamed.
func WithCompactionTargetFileSize ¶ added in v0.6.0
func WithCompactionTargetFileSize(size int64) CompactionGroupOption
WithCompactionTargetFileSize sets the size target for output files written by ExecuteCompactionGroup. Forwarded to WriteRecords as WithTargetFileSize. A non-positive value (including the zero default) means inherit the table's `write.target-file-size-bytes` property.
type CompactionGroupResult ¶ added in v0.6.0
type CompactionGroupResult struct {
// PartitionKey mirrors [CompactionTaskGroup.PartitionKey] for
// display/logging on the coordinator.
PartitionKey string
// OldDataFiles are the data files this group replaces.
OldDataFiles []iceberg.DataFile
// NewDataFiles are the consolidated outputs the worker wrote.
NewDataFiles []iceberg.DataFile
// SafePosDeletes are position-delete files referenced by tasks in
// this group whose target data file is being rewritten, computed
// via [CollectSafePositionDeletes]. They are safe to expunge in
// the rewrite snapshot.
SafePosDeletes []iceberg.DataFile
// BytesBefore is [CompactionTaskGroup.TotalSizeBytes] passed
// through, recorded so the coordinator can roll up metrics
// without re-reading the plan.
BytesBefore int64
// BytesAfter is the sum of [iceberg.DataFile.FileSizeBytes] across
// NewDataFiles.
BytesAfter int64
}
CompactionGroupResult is the per-group output of a compaction worker: the new files written, the old files being replaced, and the position delete files safe to expunge in the rewrite snapshot.
A distributed coordinator aggregates results from N workers and applies them to a RewriteFiles builder via RewriteFiles.Apply to commit a single atomic snapshot. Each field is plain data ([]iceberg.DataFile values plus scalars) — callers serialize the contained DataFiles across process boundaries themselves; the typical pattern is to have the worker write a manifest containing the new files and ship the manifest path to the coordinator, which re-reads it.
func ExecuteCompactionGroup ¶ added in v0.6.0
func ExecuteCompactionGroup(ctx context.Context, tbl *Table, group CompactionTaskGroup, opts ...CompactionGroupOption) (CompactionGroupResult, error)
ExecuteCompactionGroup reads a compaction group's tasks (with deletes applied), writes consolidated output files via WriteRecords, and computes the position-delete files safe to expunge in the rewrite snapshot. It does not commit — the caller hands the result to a coordinator that uses Transaction.NewRewrite + RewriteFiles.Apply + RewriteFiles.Commit to stage the atomic commit.
Empty groups return a zero CompactionGroupResult without doing any I/O.
In-process callers should prefer Transaction.RewriteDataFiles, which drives this and the commit step in one call.
Tunables are exposed via CompactionGroupOption. The clustered write path is always used (a compaction group is single-partition by construction so its read stream is trivially clustered).
type CompactionTaskGroup ¶ added in v0.6.0
type CompactionTaskGroup struct {
// PartitionKey is an opaque grouping key for display/logging.
PartitionKey string
// Tasks are the FileScanTasks to compact.
Tasks []FileScanTask
// TotalSizeBytes is the sum of data file sizes in this group.
TotalSizeBytes int64
}
CompactionTaskGroup is a set of scan tasks in the same partition that should be compacted together. This bridges the compaction planner (table/compaction package) and the executor, avoiding a circular import between table and table/compaction.
Use [compaction.Config.PlanCompaction] to produce groups, then convert [compaction.Group] → CompactionTaskGroup to call Transaction.RewriteDataFiles or ExecuteCompactionGroup.
type DeleteOption ¶ added in v0.5.0
type DeleteOption func(deleteOp *deleteOperation)
func WithDeleteCaseInsensitive ¶ added in v0.5.0
func WithDeleteCaseInsensitive() DeleteOption
WithDeleteCaseInsensitive changes the binding of the filter to be case insensitive instead of the default, which is case sensitive. Note that the sensitivity only applies to the field name and not to the evaluation of the literals on string fields.
func WithDeleteConcurrency ¶ added in v0.5.0
func WithDeleteConcurrency(concurrency int) DeleteOption
WithDeleteConcurrency overwrites the default concurrency for delete operations. Default: runtime.GOMAXPROCS(0)
type EncryptionKey ¶ added in v0.6.0
type EncryptionKey struct {
KeyID string `json:"key-id"`
EncryptedKeyMetadata string `json:"encrypted-key-metadata"`
EncryptedByID *string `json:"encrypted-by-id,omitempty"`
Properties map[string]string `json:"properties,omitempty"`
}
EncryptionKey represents an encryption key stored in table metadata (V3+).
func (EncryptionKey) Equals ¶ added in v0.6.0
func (e EncryptionKey) Equals(other EncryptionKey) bool
type ErrIncompatibleSchema ¶ added in v0.5.0
type ErrIncompatibleSchema struct {
// contains filtered or unexported fields
}
func (ErrIncompatibleSchema) Error ¶ added in v0.5.0
func (e ErrIncompatibleSchema) Error() string
func (ErrIncompatibleSchema) Unwrap ¶ added in v0.5.0
func (e ErrIncompatibleSchema) Unwrap() error
type ExpireSnapshotsOpt ¶ added in v0.4.0
type ExpireSnapshotsOpt func(*expireSnapshotsCfg)
func WithOlderThan ¶ added in v0.4.0
func WithOlderThan(t time.Duration) ExpireSnapshotsOpt
func WithPostCommit ¶ added in v0.5.0
func WithPostCommit(postCommit bool) ExpireSnapshotsOpt
WithPostCommit controls whether orphaned files (manifests, manifest lists, data files) are deleted immediately after expiring snapshots. Defaults to true. Set to false to defer file deletion to a separate maintenance job, avoiding conflicts with in-flight queries that may still reference those files.
func WithRetainLast ¶ added in v0.4.0
func WithRetainLast(n int) ExpireSnapshotsOpt
type FileScanTask ¶
type FileScanTask struct {
File iceberg.DataFile
DeleteFiles []iceberg.DataFile // positional delete files
EqualityDeleteFiles []iceberg.DataFile // equality delete files
DeletionVectorFiles []iceberg.DataFile // deletion vectors (puffin files)
Start, Length int64
// Row lineage (v3): constants used when reading to synthesize _row_id and _last_updated_sequence_number.
// FirstRowID is the effective first_row_id for this file (from manifest entry, after inheritance).
// DataSequenceNumber is the data sequence number of the file's manifest entry.
FirstRowID *int64
DataSequenceNumber *int64
}
type Identifier ¶
type Identifier = []string
type IncompatibleField ¶ added in v0.5.0
type IncompatibleField struct {
Field iceberg.NestedField
ColName string
UnsupportedType *UnsupportedType
InvalidDefault *InvalidDefault
}
type InvalidDefault ¶ added in v0.5.0
type IsolationLevel ¶ added in v0.6.0
type IsolationLevel string
IsolationLevel controls how strictly a commit rejects concurrent writes on the same branch. It mirrors org.apache.iceberg.IsolationLevel from the Java reference.
const ( // IsolationSerializable rejects any concurrent commit that added // data files matching the committer's filter since the base // snapshot. This is the strongest guarantee: a serializable commit // behaves as if it ran in isolation against the base snapshot. IsolationSerializable IsolationLevel = "serializable" // IsolationSnapshot only rejects concurrent commits that touched // files the committer explicitly references (e.g. a position // delete whose referenced data file was removed). Concurrent // appends into the committer's filter region are allowed. IsolationSnapshot IsolationLevel = "snapshot" )
type LocationProvider ¶ added in v0.2.0
type LocationProvider interface {
NewDataLocation(dataFileName string) string
NewTableMetadataFileLocation(newVersion int) (string, error)
NewMetadataLocation(metadataFileName string) string
}
func LoadLocationProvider ¶ added in v0.2.0
func LoadLocationProvider(tableLocation string, tableProps iceberg.Properties) (LocationProvider, error)
type Metadata ¶
type Metadata interface {
// Version indicates the version of this metadata, 1 for V1, 2 for V2, etc.
Version() int
// TableUUID returns a UUID that identifies the table, generated when the
// table is created. Implementations must throw an exception if a table's
// UUID does not match the expected UUID after refreshing metadata.
TableUUID() uuid.UUID
// Location is the table's base location. This is used by writerFactory to determine
// where to store data files, manifest files, and table metadata files.
Location() string
// LastUpdatedMillis is the timestamp in milliseconds from the unix epoch when
// the table was last updated. Each table metadata file should update this
// field just before writing.
LastUpdatedMillis() int64
// LastColumnID returns the highest assigned column ID for the table.
// This is used to ensure fields are always assigned an unused ID when
// evolving schemas.
LastColumnID() int
// Schemas returns the list of schemas, stored as objects with their
// schema-id.
Schemas() []*iceberg.Schema
// CurrentSchema returns the table's current schema.
CurrentSchema() *iceberg.Schema
// PartitionSpecs returns the list of all partition specs in the table.
PartitionSpecs() []iceberg.PartitionSpec
// PartitionSpec returns the current partition spec that the table is using.
PartitionSpec() iceberg.PartitionSpec
// PartitionSpecByID returns the partition spec with the given ID. Returns
// nil if the ID is not found in the list of partition specs.
PartitionSpecByID(int) *iceberg.PartitionSpec
// DefaultPartitionSpec is the ID of the current spec that writerFactory should
// use by default.
DefaultPartitionSpec() int
// LastPartitionSpecID is the highest assigned partition field ID across
// all partition specs for the table. This is used to ensure partition
// fields are always assigned an unused ID when evolving specs.
LastPartitionSpecID() *int
// Snapshots returns the list of valid snapshots. Valid snapshots are
// snapshots for which all data files exist in the file system. A data
// file must not be deleted from the file system until the last snapshot
// in which it was listed is garbage collected.
Snapshots() []Snapshot
// SnapshotByID finds and returns a specific snapshot by its ID. Returns
// nil if the ID is not found in the list of snapshots.
SnapshotByID(int64) *Snapshot
// SnapshotByName searches the list of snapshots for a snapshot with a given
// ref name. Returns nil if there's no ref with this name for a snapshot.
SnapshotByName(name string) *Snapshot
// CurrentSnapshot returns the table's current snapshot.
CurrentSnapshot() *Snapshot
// Ref returns the snapshot ref for the main branch.
Ref() SnapshotRef
// Refs returns a list of snapshot name/reference pairs.
Refs() iter.Seq2[string, SnapshotRef]
// SnapshotLogs returns the list of snapshot logs for the table.
SnapshotLogs() iter.Seq[SnapshotLogEntry]
// SortOrder returns the table's current sort order, ie: the one with the
// ID that matches the default-sort-order-id.
SortOrder() SortOrder
// SortOrders returns the list of sort orders in the table.
SortOrders() []SortOrder
// DefaultSortOrder returns the ID of the current sort order that writerFactory
// should use by default.
DefaultSortOrder() int
// Properties is a string to string map of table properties. This is used
// to control settings that affect reading and writing and is not intended
// to be used for arbitrary metadata. For example, commit.retry.num-retries
// is used to control the number of commit retries.
Properties() iceberg.Properties
// PreviousFiles returns the list of metadata log entries for the table.
PreviousFiles() iter.Seq[MetadataLogEntry]
Equals(Metadata) bool
NameMapping() iceberg.NameMapping
LastSequenceNumber() int64
// NextRowID returns the next available row ID for v3 tables.
// Returns 0 for v1/v2 tables or if not set.
NextRowID() int64
// Statistics returns an optional list of table statistics.
// Table statistics files are valid Puffin files.
// StatisticsFile are informational. A reader can choose to ignore statistics information.
// StatisticsFile support is not required to read the table correctly.
// A table can contain many statistics files associated with different table snapshots.
Statistics() iter.Seq[StatisticsFile]
// PartitionStatistics returns an optional list of partition statistics files.
// Partition statistics are not required for reading or planning
// and readers may ignore them. Each table snapshot may be associated
// with at most one partition statistics file. A writer can optionally
// write the partition statistics file during each write operation,
// or it can also be computed on demand.
PartitionStatistics() iter.Seq[PartitionStatisticsFile]
// EncryptionKeys returns the list of encryption keys stored in table metadata (V3+).
// Returns an empty sequence for V1/V2 tables.
EncryptionKeys() iter.Seq[EncryptionKey]
}
Metadata for an iceberg table as specified in the Iceberg spec
https://iceberg.apache.org/spec/#iceberg-table-spec
func NewMetadata ¶ added in v0.2.0
func NewMetadata(sc *iceberg.Schema, partitions *iceberg.PartitionSpec, sortOrder SortOrder, location string, props iceberg.Properties) (Metadata, error)
NewMetadata creates a new table metadata object using the provided schema, partition spec, sort order, location, and properties, generating a fresh UUID for the new table metadata. By default, this will generate a V2 table metadata, but this can be modified by adding a "format-version" property to the props map. An error will be returned if the "format-version" property exists and is not a valid version number.
func NewMetadataWithUUID ¶ added in v0.2.0
func NewMetadataWithUUID(sc *iceberg.Schema, partitions *iceberg.PartitionSpec, sortOrder SortOrder, location string, props iceberg.Properties, tableUuid uuid.UUID) (Metadata, error)
NewMetadataWithUUID is like NewMetadata, but allows the caller to specify the UUID of the table rather than creating a new one.
func ParseMetadata ¶
ParseMetadata parses json metadata provided by the passed in reader, returning an error if one is encountered.
func ParseMetadataBytes ¶
ParseMetadataBytes is like ParseMetadataString but for a byte slice.
func ParseMetadataString ¶
ParseMetadataString is like ParseMetadata, but for a string rather than an io.Reader.
type MetadataBuilder ¶ added in v0.2.0
type MetadataBuilder struct {
// contains filtered or unexported fields
}
MetadataBuilder is a struct used for building and updating Iceberg table metadata.
It keeps track of applied changes in the `updates` field. This can be used to commit changes made to a table to the catalog.
func MetadataBuilderFromBase ¶ added in v0.2.0
func MetadataBuilderFromBase(metadata Metadata, currentFileLocation string) (*MetadataBuilder, error)
MetadataBuilderFromBase creates a MetadataBuilder from an existing Metadata object. currentFileLocation is the location where the current version of the metadata file is stored. This is used to update the metadata log. If currentFileLocation is empty, the metadata log will not be updated. This should only be used to stage-create tables.
func NewMetadataBuilder ¶ added in v0.2.0
func NewMetadataBuilder(formatVersion int) (*MetadataBuilder, error)
func (*MetadataBuilder) AddEncryptionKey ¶ added in v0.6.0
func (b *MetadataBuilder) AddEncryptionKey(key EncryptionKey) error
AddEncryptionKey adds or replaces an encryption key indexed by its key-id. Encryption keys are only supported for format version 3 and above.
func (*MetadataBuilder) AddPartitionSpec ¶ added in v0.2.0
func (b *MetadataBuilder) AddPartitionSpec(spec *iceberg.PartitionSpec, initial bool) error
func (*MetadataBuilder) AddSchema ¶ added in v0.2.0
func (b *MetadataBuilder) AddSchema(schema *iceberg.Schema) error
func (*MetadataBuilder) AddSnapshot ¶ added in v0.2.0
func (b *MetadataBuilder) AddSnapshot(snapshot *Snapshot) error
func (*MetadataBuilder) AddSnapshotUpdate ¶ added in v0.6.0
func (b *MetadataBuilder) AddSnapshotUpdate(u *addSnapshotUpdate) error
AddSnapshotUpdate adds a snapshot to the builder and stores the supplied *addSnapshotUpdate as the corresponding entry in builder.updates, preserving runtime-only fields (such as the manifest-list rebuild closure used by the OCC retry path) that would be lost if a fresh update object were constructed.
Callers without runtime-only fields should keep using AddSnapshot, which constructs a default *addSnapshotUpdate internally.
func (*MetadataBuilder) AddSortOrder ¶ added in v0.2.0
func (b *MetadataBuilder) AddSortOrder(sortOrder *SortOrder) error
func (*MetadataBuilder) AppendMetadataLog ¶ added in v0.2.0
func (b *MetadataBuilder) AppendMetadataLog(entry MetadataLogEntry) *MetadataBuilder
func (*MetadataBuilder) Build ¶ added in v0.2.0
func (b *MetadataBuilder) Build() (Metadata, error)
func (*MetadataBuilder) CurrentSchema ¶ added in v0.2.0
func (b *MetadataBuilder) CurrentSchema() *iceberg.Schema
func (*MetadataBuilder) CurrentSpec ¶ added in v0.2.0
func (b *MetadataBuilder) CurrentSpec() (*iceberg.PartitionSpec, error)
func (*MetadataBuilder) GetSchemaByID ¶ added in v0.2.0
func (b *MetadataBuilder) GetSchemaByID(id int) (*iceberg.Schema, error)
func (*MetadataBuilder) GetSortOrderByID ¶ added in v0.2.0
func (b *MetadataBuilder) GetSortOrderByID(id int) (*SortOrder, error)
func (*MetadataBuilder) GetSpecByID ¶ added in v0.2.0
func (b *MetadataBuilder) GetSpecByID(id int) (*iceberg.PartitionSpec, error)
func (*MetadataBuilder) HasChanges ¶ added in v0.2.0
func (b *MetadataBuilder) HasChanges() bool
func (*MetadataBuilder) LastColumnID ¶ added in v0.6.0
func (b *MetadataBuilder) LastColumnID() int
LastColumnID returns the highest field id ever assigned in this table's lifetime, as tracked by the Iceberg spec's last-column-id counter.
func (*MetadataBuilder) LastUpdatedMS ¶ added in v0.2.0
func (b *MetadataBuilder) LastUpdatedMS() int64
func (*MetadataBuilder) NameMapping ¶ added in v0.2.0
func (b *MetadataBuilder) NameMapping() iceberg.NameMapping
func (*MetadataBuilder) NextRowID ¶ added in v0.5.0
func (b *MetadataBuilder) NextRowID() int64
NextRowID returns the next available row ID (for v3 row lineage). For v1/v2 returns 0.
func (*MetadataBuilder) RemoveEncryptionKey ¶ added in v0.6.0
func (b *MetadataBuilder) RemoveEncryptionKey(keyID string) error
RemoveEncryptionKey removes the encryption key with the given key-id. It is not an error if no such key exists.
func (*MetadataBuilder) RemovePartitionSpecs ¶ added in v0.4.0
func (b *MetadataBuilder) RemovePartitionSpecs(ints []int) error
func (*MetadataBuilder) RemovePartitionStatistics ¶ added in v0.6.0
func (b *MetadataBuilder) RemovePartitionStatistics(snapshotID int64) error
RemovePartitionStatistics removes the partition statistics file associated with the given snapshot ID. It is not an error if no such file exists.
func (*MetadataBuilder) RemoveProperties ¶ added in v0.2.0
func (b *MetadataBuilder) RemoveProperties(keys []string) error
func (*MetadataBuilder) RemoveSchemas ¶ added in v0.4.0
func (b *MetadataBuilder) RemoveSchemas(ints []int) error
func (*MetadataBuilder) RemoveSnapshotRef ¶ added in v0.4.0
func (b *MetadataBuilder) RemoveSnapshotRef(name string) error
func (*MetadataBuilder) RemoveSnapshots ¶ added in v0.4.0
func (b *MetadataBuilder) RemoveSnapshots(snapshotIds []int64, postCommit bool) error
func (*MetadataBuilder) RemoveStatistics ¶ added in v0.6.0
func (b *MetadataBuilder) RemoveStatistics(snapshotID int64) error
RemoveStatistics removes the statistics file associated with the given snapshot ID. It is not an error if no such file exists.
func (*MetadataBuilder) SetCurrentSchemaID ¶ added in v0.2.0
func (b *MetadataBuilder) SetCurrentSchemaID(currentSchemaID int) error
func (*MetadataBuilder) SetDefaultSortOrderID ¶ added in v0.2.0
func (b *MetadataBuilder) SetDefaultSortOrderID(defaultSortOrderID int) error
func (*MetadataBuilder) SetDefaultSpecID ¶ added in v0.2.0
func (b *MetadataBuilder) SetDefaultSpecID(defaultSpecID int) error
func (*MetadataBuilder) SetFormatVersion ¶ added in v0.2.0
func (b *MetadataBuilder) SetFormatVersion(formatVersion int) error
func (*MetadataBuilder) SetLastUpdatedMS ¶ added in v0.2.0
func (b *MetadataBuilder) SetLastUpdatedMS() *MetadataBuilder
func (*MetadataBuilder) SetLoc ¶ added in v0.2.0
func (b *MetadataBuilder) SetLoc(loc string) error
func (*MetadataBuilder) SetPartitionStatistics ¶ added in v0.6.0
func (b *MetadataBuilder) SetPartitionStatistics(stats PartitionStatisticsFile) error
SetPartitionStatistics adds or replaces a partition statistics file for the given snapshot. If a partition statistics file with the same snapshot ID already exists it is replaced, otherwise the file is appended.
func (*MetadataBuilder) SetProperties ¶ added in v0.2.0
func (b *MetadataBuilder) SetProperties(props iceberg.Properties) error
func (*MetadataBuilder) SetSnapshotRef ¶ added in v0.2.0
func (b *MetadataBuilder) SetSnapshotRef( name string, snapshotID int64, refType RefType, options ...setSnapshotRefOption, ) error
func (*MetadataBuilder) SetStatistics ¶ added in v0.6.0
func (b *MetadataBuilder) SetStatistics(stats StatisticsFile) error
SetStatistics adds or replaces a statistics file for the given snapshot. If a statistics file with the same snapshot ID already exists it is replaced, otherwise the file is appended.
func (*MetadataBuilder) SetUUID ¶ added in v0.2.0
func (b *MetadataBuilder) SetUUID(uuid uuid.UUID) error
func (*MetadataBuilder) SnapshotByID ¶ added in v0.2.0
func (b *MetadataBuilder) SnapshotByID(id int64) (*Snapshot, error)
func (*MetadataBuilder) TrimMetadataLogs ¶ added in v0.2.0
func (b *MetadataBuilder) TrimMetadataLogs(maxEntries int) *MetadataBuilder
type MetadataLogEntry ¶
type Operation ¶
type Operation string
func ValidOperation ¶
ValidOperation ensures that a given string is one of the valid operation types: append, replace, overwrite, delete.
type OrphanCleanupOption ¶ added in v0.4.0
type OrphanCleanupOption func(*orphanCleanupConfig)
func WithDeleteFunc ¶ added in v0.4.0
func WithDeleteFunc(deleteFunc func(string) error) OrphanCleanupOption
WithDeleteFunc sets a custom delete function. If not provided, the table's FileIO delete method will be used.
func WithDryRun ¶ added in v0.4.0
func WithDryRun(enabled bool) OrphanCleanupOption
func WithEqualAuthorities ¶ added in v0.4.0
func WithEqualAuthorities(authorities map[string]string) OrphanCleanupOption
WithEqualAuthorities specifies authorities that should be considered equivalent. For example, map["endpoint1.s3.amazonaws.com,endpoint2.s3.amazonaws.com"] = "s3.amazonaws.com" treats different S3 endpoints as equivalent. The key can be a comma-separated list.
func WithEqualSchemes ¶ added in v0.4.0
func WithEqualSchemes(schemes map[string]string) OrphanCleanupOption
WithEqualSchemes specifies schemes that should be considered equivalent. For example, map["s3,s3a,s3n"] = "s3" treats all S3 scheme variants as equivalent. The key can be a comma-separated list of schemes that map to the value scheme.
func WithFilesOlderThan ¶ added in v0.4.0
func WithFilesOlderThan(duration time.Duration) OrphanCleanupOption
func WithLocation ¶ added in v0.4.0
func WithLocation(location string) OrphanCleanupOption
func WithMaxConcurrency ¶ added in v0.4.0
func WithMaxConcurrency(maxWorkers int) OrphanCleanupOption
WithMaxConcurrency sets the maximum number of goroutines for parallel deletion. Defaults to a reasonable number based on the system. Only used when deleteFunc is nil or when the FileIO doesn't support bulk operations.
func WithPrefixMismatchMode ¶ added in v0.4.0
func WithPrefixMismatchMode(mode PrefixMismatchMode) OrphanCleanupOption
WithPrefixMismatchMode sets how to handle situations when metadata references files that match listed files except for authority/scheme differences.
type OrphanCleanupResult ¶ added in v0.4.0
type OverwriteOption ¶ added in v0.5.0
type OverwriteOption func(op *overwriteOperation)
OverwriteOption applies options to overwrite operations
func WithOverwriteCaseInsensitive ¶ added in v0.5.0
func WithOverwriteCaseInsensitive() OverwriteOption
WithOverwriteCaseInsensitive overwrites the default case sensitivity that applies on the binding of the filter. Default: case sensitive. Note that the sensitivity only applies to the field name and not the evaluation of the literals on string fields.
func WithOverwriteConcurrency ¶ added in v0.5.0
func WithOverwriteConcurrency(concurrency int) OverwriteOption
WithOverwriteConcurrency overwrites the default concurrency for overwrite operations. Default: runtime.GOMAXPROCS(0)
func WithOverwriteFilter ¶ added in v0.5.0
func WithOverwriteFilter(filter iceberg.BooleanExpression) OverwriteOption
WithOverwriteFilter overwrites the default deletion filter on overwrite operations. Default: iceberg.AlwaysTrue
type PartitionStatisticsFile ¶ added in v0.4.0
type PartitionStatisticsFile struct {
SnapshotID int64 `json:"snapshot-id"`
StatisticsPath string `json:"statistics-path"`
FileSizeInBytes int64 `json:"file-size-in-bytes"`
}
PartitionStatisticsFile represents a partition statistics file that can be used to read table data more efficiently.
Statistics are informational. A reader can choose to ignore statistics information. Statistics support is not required to read the table correctly.
type PrefixMismatchMode ¶ added in v0.4.0
type PrefixMismatchMode int
PrefixMismatchMode defines how to handle cases where candidate files have different URI schemes or authorities compared to table location during orphan cleanup. This is useful when files may be referenced using different but equivalent schemes (e.g., s3:// vs s3a:// vs s3n://) or when cleaning up files across different locations.
const ( // PrefixMismatchError causes cleanup to fail with an error when candidate files // have URI schemes/authorities that don't match the table location and are not // covered by configured equivalences. This is the safest default behavior. PrefixMismatchError PrefixMismatchMode = iota // default // PrefixMismatchIgnore skips candidate files that have mismatched URI schemes/authorities // without treating it as an error. Files are silently ignored and not considered for deletion. PrefixMismatchIgnore // PrefixMismatchDelete treats candidate files with mismatched URI schemes/authorities // as orphans and includes them for deletion. Use with caution as this may delete // files from unexpected locations. PrefixMismatchDelete )
func (PrefixMismatchMode) String ¶ added in v0.4.0
func (p PrefixMismatchMode) String() string
type ReassignedIds ¶ added in v0.4.0
type ReassignedIds struct {
// contains filtered or unexported fields
}
type Requirement ¶ added in v0.2.0
type Requirement interface {
// Validate checks that the current table metadata satisfies the requirement.
Validate(Metadata) error
GetType() string
}
A Requirement is a validation rule that must be satisfied before attempting to make and commit changes to a table. Requirements are used to ensure that the table is in a valid state before making changes.
func AssertCreate ¶ added in v0.2.0
func AssertCreate() Requirement
AssertCreate creates a requirement that the table does not already exist.
func AssertCurrentSchemaID ¶ added in v0.2.0
func AssertCurrentSchemaID(id int) Requirement
AssertCurrentSchemaID creates a requirement that the table's current schema ID matches the given id.
func AssertDefaultSortOrderID ¶ added in v0.2.0
func AssertDefaultSortOrderID(id int) Requirement
AssertDefaultSortOrderID creates a requirement that the table's default sort order ID matches the given id.
func AssertDefaultSpecID ¶ added in v0.2.0
func AssertDefaultSpecID(id int) Requirement
AssertDefaultSpecID creates a requirement that the table's default partition spec ID matches the given id.
func AssertLastAssignedFieldID ¶ added in v0.2.0
func AssertLastAssignedFieldID(id int) Requirement
AssertLastAssignedFieldID validates that the table's last assigned column ID matches the given id.
func AssertLastAssignedPartitionID ¶ added in v0.2.0
func AssertLastAssignedPartitionID(id int) Requirement
AssertLastAssignedPartitionID creates a requirement that the table's last assigned partition ID matches the given id.
func AssertRefSnapshotID ¶ added in v0.2.0
func AssertRefSnapshotID(ref string, id *int64) Requirement
AssertRefSnapshotID creates a requirement which ensures that the table branch or tag identified by the given ref must reference the given snapshot id. If the id is nil, the ref must not already exist.
func AssertTableUUID ¶ added in v0.2.0
func AssertTableUUID(uuid uuid.UUID) Requirement
AssertTableUUID creates a requirement that the table UUID matches the given UUID.
func ParseRequirement ¶ added in v0.3.0
func ParseRequirement(r io.Reader) (Requirement, error)
ParseRequirement parses json data provided by the reader into a Requirement
func ParseRequirementBytes ¶ added in v0.3.0
func ParseRequirementBytes(b []byte) (Requirement, error)
ParseRequirementBytes parses json bytes into a Requirement
func ParseRequirementString ¶ added in v0.3.0
func ParseRequirementString(s string) (Requirement, error)
ParseRequirementString parses json string into a Requirement
type Requirements ¶ added in v0.4.0
type Requirements []Requirement
func (*Requirements) UnmarshalJSON ¶ added in v0.4.0
func (r *Requirements) UnmarshalJSON(data []byte) error
type RewriteDataFilesOptions ¶ added in v0.6.0
type RewriteDataFilesOptions struct {
// PartialProgress, when true, stages each group as its own
// rewrite snapshot inside the loop so a mid-loop write failure
// leaves the already-completed groups staged on this transaction
// (the in-memory transaction can be discarded by group rather
// than wholesale). When false (the default), every group lands in
// a single atomic rewrite snapshot.
//
// In both modes the catalog commit happens once at
// [Transaction.Commit] time, so a process crash mid-loop loses
// every staged group regardless of this flag. Callers who need
// true per-group catalog durability (matching Java's behavior)
// should drive [Transaction.NewRewrite] themselves and commit a
// fresh transaction per group.
PartialProgress bool
// SnapshotProps are added to the rewrite snapshot's summary.
// In partial-progress mode the same properties land on every
// per-group snapshot rather than being summed or split.
SnapshotProps iceberg.Properties
// ExtraDeleteFilesToRemove are delete files (typically equality
// deletes that are dead after the rewrite) that the caller wants
// expunged in the same snapshot as the rewrite. Honored only when
// PartialProgress is false.
//
// Use [compaction.CollectDeadEqualityDeletes] to compute this list
// from the current snapshot. Position delete files that are fully
// applied are removed automatically and do NOT need to be passed
// in here.
ExtraDeleteFilesToRemove []iceberg.DataFile
// GroupOptions are forwarded to every [ExecuteCompactionGroup]
// call to tune the per-group read+write pipeline (target file
// size, scan concurrency). See the With* helpers returning
// [CompactionGroupOption].
GroupOptions []CompactionGroupOption
}
RewriteDataFilesOptions bundles the per-rewrite knobs for Transaction.RewriteDataFiles.
type RewriteFiles ¶ added in v0.6.0
type RewriteFiles struct {
// contains filtered or unexported fields
}
RewriteFiles is the snapshot-operation builder for rewrite (compaction) commits. It is the snapshot-level sibling of RowDelta and mirrors Java's org.apache.iceberg.RewriteFiles interface (returned by Table.newRewrite() in Java).
Compared to a raw Transaction.ReplaceFiles call, the builder owns the rewrite-specific isolation contract internally:
- The overwrite producer's default isolation validator is suppressed (concurrent appends into rewritten partitions are allowed; this is the defining behavior of a rewrite).
- A rewrite-specific conflict validator is registered so concurrent pos/eq-delete files targeting any rewritten data file are rejected pre-flight at Transaction.Commit time. The pos-delete branch only fires when the concurrent writer populated the manifest's referenced_data_file column (field id 143). That column is V2-optional and V3-required for deletion-vector deletes; V2 pos-delete writers commonly leave it empty, in which case only the conservative eq-delete-during-rewrite rule fires.
Distributed compaction coordinators construct one RewriteFiles on the leader transaction, feed worker outputs in via RewriteFiles.Apply, and commit one snapshot. In-process callers can use Transaction.RewriteDataFiles which drives this builder internally.
The builder follows the same fail-fast pattern as [view.MetadataBuilder]: a method that hits an invalid input stages the error and short-circuits all subsequent calls until RewriteFiles.Commit drains it. The builder is single-use; once Commit has been called, a second call returns an error regardless of whether the first call succeeded.
Adding new delete files (e.g., rewriting position deletes into deletion vectors) is not yet supported; RewriteFiles.AddDataFile rejects pos/eq-delete inputs at insertion time. Add the support to the underlying Transaction.ReplaceFiles before lifting that restriction.
func (*RewriteFiles) AddDataFile ¶ added in v0.6.0
func (r *RewriteFiles) AddDataFile(df iceberg.DataFile) *RewriteFiles
AddDataFile queues a new data file. Adding delete files is not yet supported by the underlying snapshot machinery; a pos/eq-delete here stages an error that is returned from the next RewriteFiles.Commit call. The error names the offending file path so callers driving the builder via RewriteFiles.Apply can identify it without tracking queue order.
func (*RewriteFiles) Apply ¶ added in v0.6.0
func (r *RewriteFiles) Apply(deletes, adds, safeDeletes []iceberg.DataFile) *RewriteFiles
Apply is a bulk shortcut that routes three slices onto this builder: every entry in deletes and safeDeletes is queued via RewriteFiles.DeleteFile (which routes data vs. delete files by content type), and every entry in adds via RewriteFiles.AddDataFile.
Distributed coordinators should prefer RewriteFiles.ApplyResult, which takes a CompactionGroupResult directly: the three positional same-typed slices here transpose silently under refactor.
func (*RewriteFiles) ApplyResult ¶ added in v0.6.0
func (r *RewriteFiles) ApplyResult(gr CompactionGroupResult) *RewriteFiles
ApplyResult is the typed coordinator entry point: it queues a worker's CompactionGroupResult onto this builder by routing OldDataFiles (via DeleteFile), NewDataFiles (via AddDataFile), and SafePosDeletes (via DeleteFile) in one call. Prefer this over RewriteFiles.Apply when feeding worker outputs — the field names line up with the builder semantics, so a refactor of CompactionGroupResult cannot silently transpose roles.
Typical distributed-coordinator pattern:
rewrite := leaderTxn.NewRewrite(snapshotProps)
for _, gr := range workerResults {
rewrite.ApplyResult(gr)
}
if err := rewrite.Commit(ctx); err != nil { ... }
func (*RewriteFiles) Commit ¶ added in v0.6.0
func (r *RewriteFiles) Commit(ctx context.Context) error
Commit stages the rewrite snapshot on the underlying transaction. The catalog commit happens once, later, at Transaction.Commit time.
Commit is single-shot: any second call returns an error regardless of whether the first call succeeded, and neither re-stages the rewrite nor re-registers the conflict validator. Returns an error if any file passed to RewriteFiles.AddDataFile or RewriteFiles.DeleteFile had an unsupported content type, if the builder has no file changes, or if the underlying Transaction.ReplaceFiles call fails.
func (*RewriteFiles) DeleteFile ¶ added in v0.6.0
func (r *RewriteFiles) DeleteFile(df iceberg.DataFile) *RewriteFiles
DeleteFile marks a file for removal in this rewrite. Routes by content type: data files are queued as data-file replacements; pos/eq-delete files are queued for delete-file removal alongside the data rewrite (typical when a delete is fully applied to data files being rewritten and is therefore safe to expunge).
Any other content type stages an error that is returned from the next RewriteFiles.Commit call.
type RewriteResult ¶ added in v0.6.0
type RewriteResult struct {
// RewrittenGroups is the number of compaction groups committed.
RewrittenGroups int
// AddedDataFiles is the total number of new data files written.
AddedDataFiles int
// RemovedDataFiles is the total number of old data files replaced.
RemovedDataFiles int
// RemovedPositionDeleteFiles is the count of position delete files
// removed because their referenced data file was rewritten.
RemovedPositionDeleteFiles int
// RemovedEqualityDeleteFiles is the count of equality delete files
// removed via [RewriteDataFilesOptions.ExtraDeleteFilesToRemove].
// The caller computes which eq-deletes are dead — typically via
// [compaction.CollectDeadEqualityDeletes] — and passes the list in.
RemovedEqualityDeleteFiles int
// BytesBefore is the total size of input data files (from the compaction plan).
BytesBefore int64
// BytesAfter is the total size of output data files (measured from written files).
BytesAfter int64
}
RewriteResult summarizes a completed compaction.
type RollingDataWriter ¶ added in v0.4.0
type RollingDataWriter struct {
// contains filtered or unexported fields
}
RollingDataWriter writes Arrow records for a specific partition, rolling to new data files when the actual compressed file size reaches the target.
func (*RollingDataWriter) Add ¶ added in v0.4.0
func (r *RollingDataWriter) Add(record arrow.RecordBatch) error
Add appends a record to the writer's buffer.
type RowDelta ¶ added in v0.6.0
type RowDelta struct {
// contains filtered or unexported fields
}
RowDelta encodes a set of row-level changes to a table: new data files (inserts) and delete files (equality or position deletes). All changes are committed atomically in a single snapshot.
The operation type of the produced snapshot is determined automatically:
- Data files only → OpAppend
- Delete files only → OpDelete
- Both data and delete files → OpOverwrite
This matches the semantics of Java's BaseRowDelta. It is the primary API for CDC/streaming workloads where INSERTs, UPDATEs, and DELETEs must be committed together.
Client-side conflict validation runs before the commit is sent to the catalog:
- Position deletes: referenced data files must still be reachable from the current branch head (validateDataFilesExist).
- Equality deletes under write.delete.isolation-level=serializable (the default): concurrent data files in the same partition(s) as the equality deletes are rejected. For partitioned tables an OR-of-equalities filter is built from the eq-delete files' partition tuples and routed through validateAddedDataFilesMatchingFilter (spec-evolution safe, manifest-summary pruning, type-aware evaluation). For unpartitioned tables the check is conservative (AlwaysTrue — any concurrent append is a conflict). Opt out by setting write.delete.isolation-level=snapshot.
Refresh-and-replay between retries is deferred to a follow-up PR; today the pre-flight runs once on the first attempt.
Usage:
rd := tx.NewRowDelta(snapshotProps)
rd.AddRows(dataFile1, dataFile2)
rd.AddDeletes(equalityDeleteFile1)
err := rd.Commit(ctx)
func (*RowDelta) AddDeletes ¶ added in v0.6.0
AddDeletes adds delete files (equality or position) to this RowDelta. Equality delete files must have ContentType == EntryContentEqDeletes and non-empty EqualityFieldIDs referencing valid schema columns. Position delete files must have ContentType == EntryContentPosDeletes.
func (*RowDelta) AddRows ¶ added in v0.6.0
AddRows adds data files containing new rows (inserts) to this RowDelta.
type Scan ¶
type Scan struct {
// contains filtered or unexported fields
}
func (*Scan) PlanFiles ¶
func (scan *Scan) PlanFiles(ctx context.Context) ([]FileScanTask, error)
PlanFiles orchestrates the fetching and filtering of manifests, and then building a list of FileScanTasks that match the current Scan criteria.
func (*Scan) ReadTasks ¶ added in v0.6.0
func (scan *Scan) ReadTasks(ctx context.Context, tasks []FileScanTask) (*arrow.Schema, iter.Seq2[arrow.RecordBatch, error], error)
ReadTasks reads Arrow records from a specific set of FileScanTasks, applying the scan's projection, row filters, and positional delete handling. This is useful when the caller has already planned or selected specific tasks to read.
func (*Scan) ToArrowRecords ¶
func (scan *Scan) ToArrowRecords(ctx context.Context) (*arrow.Schema, iter.Seq2[arrow.RecordBatch, error], error)
ToArrowRecords returns the arrow schema of the expected records and an iterator that can be used with a range expression to read the records as they are available. If an error is encountered during the planning and setup, then this will return the error directly. If the error occurs while iterating the records, it will be returned by the iterator.
The purpose for returning the schema up front is to handle the case where there are no rows returned. The resulting Arrow Schema of the projection will still be known.
func (*Scan) ToArrowTable ¶
ToArrowTable calls ToArrowRecords and then gathers all of the records together and returns an arrow.Table made from those records.
func (*Scan) UseRowLimit ¶
type ScanOption ¶
type ScanOption func(*Scan)
func WitMaxConcurrency ¶
func WitMaxConcurrency(n int) ScanOption
WitMaxConcurrency sets the maximum concurrency for table scan and plan operations. When unset it defaults to runtime.GOMAXPROCS.
func WithCaseSensitive ¶
func WithCaseSensitive(b bool) ScanOption
func WithLimit ¶
func WithLimit(n int64) ScanOption
func WithOptions ¶
func WithOptions(opts iceberg.Properties) ScanOption
func WithRowFilter ¶
func WithRowFilter(e iceberg.BooleanExpression) ScanOption
func WithSelectedFields ¶
func WithSelectedFields(fields ...string) ScanOption
func WithSnapshotAsOf ¶ added in v0.4.0
func WithSnapshotAsOf(timeStampMs int64) ScanOption
func WithSnapshotID ¶
func WithSnapshotID(n int64) ScanOption
type SchemaOptions ¶ added in v0.6.0
type SchemaOptions struct {
DowncastTimestamp bool
IncludeFieldIDs bool
UseLargeTypes bool
UseWriteDefault bool
}
SchemaOptions controls the behaviour of ToRequestedSchema.
type SequenceNumberValidator ¶ added in v0.5.0
type SequenceNumberValidator interface {
LastSequenceNumber() int64
}
SequenceNumberValidator defines an interface for types that can validate sequence numbers
type Snapshot ¶
type Snapshot struct {
SnapshotID int64 `json:"snapshot-id"`
ParentSnapshotID *int64 `json:"parent-snapshot-id,omitempty"`
SequenceNumber int64 `json:"sequence-number"`
TimestampMs int64 `json:"timestamp-ms"`
ManifestList string `json:"manifest-list,omitempty"`
Summary *Summary `json:"summary,omitempty"`
SchemaID *int `json:"schema-id,omitempty"`
FirstRowID *int64 `json:"first-row-id,omitempty"` // V3: Starting row ID for this snapshot
AddedRows *int64 `json:"added-rows,omitempty"` // V3: Number of rows added by this snapshot
}
func AncestorsBetween ¶ added in v0.6.0
func AncestorsBetween(latestID, baseID int64, lookup SnapshotLookup) ([]Snapshot, bool)
AncestorsBetween returns the snapshots from latestID (inclusive) down to but not including baseID, by walking the parent chain from latestID backward. The second return value (baseFound) is true when baseID was actually reached during the walk.
These snapshots are the "concurrent" snapshots that a writer based on baseID needs to examine for conflict detection.
When baseFound is false, the walk terminated without reaching baseID — either because latestID was unknown, baseID is not in latestID's ancestry (diverged branch or expired base), the chain was broken by a missing intermediate snapshot, or a cycle was detected in malformed metadata. In all of these cases the returned slice is diagnostic context only — it is NOT an enumerable "concurrent snapshots" list. Callers performing conflict detection MUST treat baseFound=false as divergent and refuse the commit.
Returns (nil, true) when latestID == baseID (no concurrent snapshots).
Snapshots are returned by value in reverse-chronological order.
func AncestorsOf ¶ added in v0.6.0
func AncestorsOf(snapshotID int64, lookup SnapshotLookup) []Snapshot
AncestorsOf walks the parent chain starting from snapshotID and yields every snapshot reachable via ParentSnapshotID, including the starting snapshot itself. Iteration stops when a snapshot has no parent, the parent cannot be resolved via lookup, or a cycle is detected in malformed metadata.
Returns snapshots by value in reverse-chronological order (newest first). Returns an empty slice if snapshotID cannot be resolved.
The returned slice may be truncated if an intermediate snapshot is missing from the lookup (e.g. expired) or if a cycle is encountered. Callers that need to distinguish a complete walk from a truncated one should use AncestorsOfChecked instead.
func AncestorsOfChecked ¶ added in v0.6.0
func AncestorsOfChecked(snapshotID int64, lookup SnapshotLookup) ([]Snapshot, bool)
AncestorsOfChecked is AncestorsOf with completeness tracking. The second return value is true when the walk terminated at a snapshot with no parent (a clean root). It is false when the walk was truncated by an unresolvable starting snapshot, a missing intermediate snapshot, or a cycle in malformed metadata.
Callers performing conflict detection (where a truncated ancestry equates to under-counting concurrent snapshots) MUST treat complete=false as divergent and refuse the commit, mirroring AncestorsBetween's baseFound=false contract. When complete is false the returned slice is the partial walk before truncation — diagnostic context only, NOT an enumerable ancestry.
Snapshots are returned by value in reverse-chronological order. Returns an empty slice and false when snapshotID cannot be resolved.
func (Snapshot) ValidateRowLineage ¶ added in v0.5.0
type SnapshotLogEntry ¶
type SnapshotLookup ¶ added in v0.6.0
SnapshotLookup returns the snapshot for the given id, or nil if not found. It matches the signature of Metadata.SnapshotByID.
type SnapshotRef ¶
type SnapshotRef struct {
SnapshotID int64 `json:"snapshot-id"`
SnapshotRefType RefType `json:"type"`
MinSnapshotsToKeep *int `json:"min-snapshots-to-keep,omitempty"`
MaxSnapshotAgeMs *int64 `json:"max-snapshot-age-ms,omitempty"`
MaxRefAgeMs *int64 `json:"max-ref-age-ms,omitempty"`
}
SnapshotRef represents the reference information for a specific snapshot
func (*SnapshotRef) Equals ¶
func (s *SnapshotRef) Equals(rhs SnapshotRef) bool
func (*SnapshotRef) UnmarshalJSON ¶
func (s *SnapshotRef) UnmarshalJSON(b []byte) error
type SnapshotSummaryCollector ¶ added in v0.2.0
type SnapshotSummaryCollector struct {
// contains filtered or unexported fields
}
type SortDirection ¶
type SortDirection string
const ( SortASC SortDirection = "asc" SortDESC SortDirection = "desc" )
type SortField ¶
type SortField struct {
// SourceIDs contains the source column ids from the table's schema.
// For single-argument transforms this will have exactly one element.
// For multi-argument transforms this will have multiple elements.
SourceIDs []int `json:"-"`
// Transform is the transformation used to produce values to be
// sorted on from the source column.
Transform iceberg.Transform `json:"transform"`
// Direction is an enum indicating ascending or descending direction.
Direction SortDirection `json:"direction"`
// NullOrder describes the order of null values when sorting
// should be only either nulls-first or nulls-last enum values.
NullOrder NullOrder `json:"null-order"`
}
SortField describes a field used in a sort order definition.
func (*SortField) MarshalJSON ¶
func (*SortField) UnmarshalJSON ¶
type SortOrder ¶
type SortOrder struct {
// contains filtered or unexported fields
}
SortOrder describes how the data is sorted within the table.
Data can be sorted within partitions by columns to gain performance. The order of the sort fields within the list defines the order in which the sort is applied to the data.
func AssignFreshSortOrderIDs ¶ added in v0.2.0
AssignFreshSortOrderIDs updates and reassigns the field source IDs from the old schema to the corresponding fields in the fresh schema, while also giving the Sort Order a fresh ID of 0 (the initial Sort Order ID).
func AssignFreshSortOrderIDsWithID ¶ added in v0.2.0
func AssignFreshSortOrderIDsWithID(sortOrder SortOrder, old, fresh *iceberg.Schema, sortOrderID int) (SortOrder, error)
AssignFreshSortOrderIDsWithID is like AssignFreshSortOrderIDs but allows specifying the id of the returned SortOrder.
func NewSortOrder ¶ added in v0.4.0
NewSortOrder creates a new SortOrder.
The orderID must be greater than or equal to 0. If orderID is 0, no fields can be passed; this is equivalent to UnsortedSortOrder. Fields need to have non-nil Transform, valid Direction and NullOrder values.
func (*SortOrder) CheckCompatibility ¶ added in v0.4.0
func (SortOrder) IsUnsorted ¶ added in v0.4.0
func (SortOrder) MarshalJSON ¶ added in v0.4.0
func (*SortOrder) UnmarshalJSON ¶
type StagedTable ¶ added in v0.2.0
type StagedTable struct {
*Table
}
func (*StagedTable) Refresh ¶ added in v0.2.0
func (s *StagedTable) Refresh(ctx context.Context) (*Table, error)
func (*StagedTable) Scan ¶ added in v0.2.0
func (s *StagedTable) Scan(opts ...ScanOption) *Scan
type StatisticsFile ¶ added in v0.4.0
type StatisticsFile struct {
SnapshotID int64 `json:"snapshot-id"`
StatisticsPath string `json:"statistics-path"`
FileSizeInBytes int64 `json:"file-size-in-bytes"`
KeyMetadata *string `json:"key-metadata,omitempty"`
BlobMetadata []BlobMetadata `json:"blob-metadata"`
}
StatisticsFile represents a statistics file in the Puffin format, that can be used to read table data more efficiently.
Statistics are informational. A reader can choose to ignore statistics information. Statistics support is not required to read the table correctly.
type Summary ¶
type Summary struct {
Operation Operation
Properties iceberg.Properties
}
Summary stores the summary information for a snapshot indicating the operation that created the snapshot, and various properties which might exist in the summary.
func (*Summary) MarshalJSON ¶
func (*Summary) UnmarshalJSON ¶
type Table ¶
type Table struct {
// contains filtered or unexported fields
}
func NewFromLocation ¶
func (Table) AllManifests ¶ added in v0.2.0
func (Table) Append ¶ added in v0.3.0
func (t Table) Append(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties) (*Table, error)
Append is a shortcut for NewTransaction().Append() and then committing the transaction
func (Table) AppendTable ¶ added in v0.3.0
func (t Table) AppendTable(ctx context.Context, tbl arrow.Table, batchSize int64, snapshotProps iceberg.Properties) (*Table, error)
AppendTable is a shortcut for NewTransaction().AppendTable() and then committing the transaction
func (Table) CurrentSnapshot ¶
func (Table) Delete ¶ added in v0.5.0
func (t Table) Delete(ctx context.Context, filter iceberg.BooleanExpression, snapshotProps iceberg.Properties, opts ...DeleteOption) (*Table, error)
Delete is a shortcut for NewTransaction().Delete() and then committing the transaction.
The provided filter acts as a row-level predicate on existing data:
- Files where all rows match the filter (strict match) are completely deleted
- Files where some rows match and others don't (partial match) are rewritten to keep only non-matching rows
- Files where no rows match the filter are kept unchanged
The filter uses both inclusive and strict metrics evaluators on file statistics to classify files:
- Inclusive evaluator identifies candidate files that may contain matching rows
- Strict evaluator determines if all rows in a file must match the filter
- Files that pass inclusive but not strict evaluation are rewritten with filtered data
The concurrency parameter controls the level of parallelism for manifest processing and file rewriting and can be overridden using the WithOverwriteConcurrency option. Defaults to runtime.GOMAXPROCS(0).
func (Table) DeleteOrphanFiles ¶ added in v0.4.0
func (t Table) DeleteOrphanFiles(ctx context.Context, opts ...OrphanCleanupOption) (OrphanCleanupResult, error)
func (Table) Identifier ¶
func (t Table) Identifier() Identifier
func (Table) LocationProvider ¶ added in v0.2.0
func (t Table) LocationProvider() (LocationProvider, error)
func (Table) MetadataLocation ¶
func (Table) NameMapping ¶ added in v0.2.0
func (t Table) NameMapping() iceberg.NameMapping
func (Table) NewTransaction ¶ added in v0.2.0
func (t Table) NewTransaction() *Transaction
func (Table) NewTransactionOnBranch ¶ added in v0.6.0
func (t Table) NewTransactionOnBranch(branch string) *Transaction
NewTransactionOnBranch creates a new transaction that commits to the named branch. Use [NewTransaction] to commit to the default "main" branch.
func (Table) Overwrite ¶ added in v0.5.0
func (t Table) Overwrite(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties, opts ...OverwriteOption) (*Table, error)
Overwrite is a shortcut for NewTransaction().Overwrite() and then committing the transaction.
An optional filter (see WithOverwriteFilter) determines which existing data to delete or rewrite:
- If filter is nil or AlwaysTrue, all existing data files are deleted and replaced with new data.
- If a filter is provided, it acts as a row-level predicate on existing data:
- Files where all rows match the filter (strict match) are completely deleted
- Files where some rows match and others don't (partial match) are rewritten to keep only non-matching rows
- Files where no rows match the filter are kept unchanged
The filter uses both inclusive and strict metrics evaluators on file statistics to classify files:
- Inclusive evaluator identifies candidate files that may contain matching rows
- Strict evaluator determines if all rows in a file must match the filter
- Files that pass inclusive but not strict evaluation are rewritten with filtered data
New data from the provided RecordReader is written to the table regardless of the filter.
The concurrency parameter controls the level of parallelism for manifest processing and file rewriting and can be overridden using the WithOverwriteConcurrency option. Defaults to runtime.GOMAXPROCS(0).
func (Table) OverwriteTable ¶ added in v0.5.0
func (t Table) OverwriteTable(ctx context.Context, tbl arrow.Table, batchSize int64, snapshotProps iceberg.Properties, opts ...OverwriteOption) (*Table, error)
OverwriteTable is a shortcut for NewTransaction().OverwriteTable() and then committing the transaction.
An optional filter (see WithOverwriteFilter) determines which existing data to delete or rewrite:
- If filter is nil or AlwaysTrue, all existing data files are deleted and replaced with new data.
- If a filter is provided, it acts as a row-level predicate on existing data:
  - Files where all rows match the filter (strict match) are completely deleted
  - Files where some rows match and others don't (partial match) are rewritten to keep only non-matching rows
  - Files where no rows match the filter are kept unchanged
The filter uses both inclusive and strict metrics evaluators on file statistics to classify files:
- Inclusive evaluator identifies candidate files that may contain matching rows
- Strict evaluator determines if all rows in a file must match the filter
- Files that pass inclusive but not strict evaluation are rewritten with filtered data
New data from the provided table is written to the table regardless of the filter.
The batchSize parameter refers to the batch size for reading the input data, not the batch size for writes. The level of parallelism for manifest processing and file rewriting is not a function parameter; it is set with the WithOverwriteConcurrency option and defaults to runtime.GOMAXPROCS(0).
func (Table) Properties ¶
func (t Table) Properties() iceberg.Properties
func (Table) Scan ¶
func (t Table) Scan(opts ...ScanOption) *Scan
func (Table) SnapshotAsOf ¶ added in v0.4.0
SnapshotAsOf finds the snapshot that was current as of or right before the given timestamp.
func (Table) SnapshotByID ¶
func (Table) SnapshotByName ¶
func (Table) Spec ¶
func (t Table) Spec() iceberg.PartitionSpec
type TableCommit ¶ added in v0.6.0
type TableCommit struct {
Identifier Identifier
Requirements []Requirement
Updates []Update
}
TableCommit holds the identifier, requirements, and updates for a single table within a multi-table transaction. It is used with [catalog.TransactionalCatalog.CommitTransaction] to atomically commit changes across multiple tables.
type Transaction ¶ added in v0.2.0
type Transaction struct {
// contains filtered or unexported fields
}
func (*Transaction) AddDataFiles ¶ added in v0.5.0
func (t *Transaction) AddDataFiles(ctx context.Context, dataFiles []iceberg.DataFile, snapshotProps iceberg.Properties, opts ...WriteOption) error
AddDataFiles adds pre-built DataFiles to the table without scanning them from storage. This is useful for clients who have already constructed DataFile objects with metadata, avoiding the need to read files to extract schema and statistics.
Unlike AddFiles, this method does not read files from storage. It validates only metadata that can be checked without opening files (for example spec-id and partition field IDs).
By default this method automatically sets the schema name mapping in table properties if one does not already exist. Pass WithoutAutoNameMapping to disable this behavior, for example when working with catalogs that reject the name mapping property.
Callers are responsible for ensuring each DataFile is valid and consistent with the table. Supplying incorrect DataFile metadata can produce an invalid snapshot and break reads.
func (*Transaction) AddFiles ¶ added in v0.2.0
func (t *Transaction) AddFiles(ctx context.Context, filePaths []string, snapshotProps iceberg.Properties, ignoreDuplicates bool, opts ...AddFilesOption) error
func (*Transaction) Append ¶ added in v0.2.0
func (t *Transaction) Append(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties) error
func (*Transaction) AppendTable ¶ added in v0.3.0
func (t *Transaction) AppendTable(ctx context.Context, tbl arrow.Table, batchSize int64, snapshotProps iceberg.Properties) error
func (*Transaction) Commit ¶ added in v0.2.0
func (t *Transaction) Commit(ctx context.Context) (*Table, error)
func (*Transaction) Delete ¶ added in v0.5.0
func (t *Transaction) Delete(ctx context.Context, filter iceberg.BooleanExpression, snapshotProps iceberg.Properties, opts ...DeleteOption) (err error)
Delete deletes records matching the provided filter.
The provided filter acts as a row-level predicate on existing data:
- Files where all rows match the filter (strict match) are completely deleted
- Files where some rows match and others don't (partial match) are rewritten to keep only non-matching rows
- Files where no rows match the filter are kept unchanged
The filter uses both inclusive and strict metrics evaluators on file statistics to classify files:
- Inclusive evaluator identifies candidate files that may contain matching rows
- Strict evaluator determines if all rows in a file must match the filter
- Files that pass inclusive but not strict evaluation are rewritten with filtered data
The level of parallelism for manifest processing and file rewriting is not a function parameter; it can be overridden through the DeleteOption arguments passed in opts. Defaults to runtime.GOMAXPROCS(0).
func (*Transaction) ExpireSnapshots ¶ added in v0.4.0
func (t *Transaction) ExpireSnapshots(opts ...ExpireSnapshotsOpt) error
func (*Transaction) MarkCommitted ¶ added in v0.6.0
func (t *Transaction) MarkCommitted()
MarkCommitted marks the transaction as committed, preventing further use. This should be called after a successful multi-table commit via [catalog.TransactionalCatalog.CommitTransaction].
func (*Transaction) NewRewrite ¶ added in v0.6.0
func (t *Transaction) NewRewrite(snapshotProps iceberg.Properties) *RewriteFiles
NewRewrite returns a RewriteFiles builder bound to this transaction. Mirrors Java's org.apache.iceberg.Table#newRewrite. snapshotProps is cloned and the clone is added to the rewrite snapshot's summary; pass nil for none.
Usage:
rewrite := tx.NewRewrite(nil)
rewrite.DeleteFile(oldDataFile)
rewrite.AddDataFile(newDataFile)
if err := rewrite.Commit(ctx); err != nil { ... }
committed, err := tx.Commit(ctx)
func (*Transaction) NewRowDelta ¶ added in v0.6.0
func (t *Transaction) NewRowDelta(snapshotProps iceberg.Properties) *RowDelta
NewRowDelta creates a new RowDelta for committing row-level changes within this transaction. The provided properties are included in the snapshot summary.
func (*Transaction) Overwrite ¶ added in v0.5.0
func (t *Transaction) Overwrite(ctx context.Context, rdr array.RecordReader, snapshotProps iceberg.Properties, opts ...OverwriteOption) error
Overwrite overwrites the table data using a RecordReader.
An optional filter (see WithOverwriteFilter) determines which existing data to delete or rewrite:
- If filter is nil or AlwaysTrue, all existing data files are deleted and replaced with new data.
- If a filter is provided, it acts as a row-level predicate on existing data:
  - Files where all rows match the filter (strict match) are completely deleted
  - Files where some rows match and others don't (partial match) are rewritten to keep only non-matching rows
  - Files where no rows match the filter are kept unchanged
The filter uses both inclusive and strict metrics evaluators on file statistics to classify files:
- Inclusive evaluator identifies candidate files that may contain matching rows
- Strict evaluator determines if all rows in a file must match the filter
- Files that pass inclusive but not strict evaluation are rewritten with filtered data
New data from the provided RecordReader is written to the table regardless of the filter.
The level of parallelism for manifest processing and file rewriting is not a function parameter; it is set with the WithOverwriteConcurrency option. If the configured concurrency is <= 0, it defaults to runtime.GOMAXPROCS(0).
func (*Transaction) OverwriteTable ¶ added in v0.5.0
func (t *Transaction) OverwriteTable(ctx context.Context, tbl arrow.Table, batchSize int64, snapshotProps iceberg.Properties, opts ...OverwriteOption) error
OverwriteTable overwrites the table data using an Arrow Table.
An optional filter (see WithOverwriteFilter) determines which existing data to delete or rewrite:
- If filter is nil or AlwaysTrue, all existing data files are deleted and replaced with new data.
- If a filter is provided, it acts as a row-level predicate on existing data:
  - Files where all rows match the filter (strict match) are completely deleted
  - Files where some rows match and others don't (partial match) are rewritten to keep only non-matching rows
  - Files where no rows match the filter are kept unchanged
The filter uses both inclusive and strict metrics evaluators on file statistics to classify files:
- Inclusive evaluator identifies candidate files that may contain matching rows
- Strict evaluator determines if all rows in a file must match the filter
- Files that pass inclusive but not strict evaluation are rewritten with filtered data
New data from the provided table is written to the table regardless of the filter.
The batchSize parameter refers to the batch size for reading the input data, not the batch size for writes. The level of parallelism for manifest processing and file rewriting is not a function parameter; it is set with the WithOverwriteConcurrency option. If the configured concurrency is <= 0, it defaults to runtime.GOMAXPROCS(0).
func (*Transaction) ReplaceDataFiles ¶ added in v0.3.0
func (t *Transaction) ReplaceDataFiles(ctx context.Context, filesToDelete, filesToAdd []string, snapshotProps iceberg.Properties) error
ReplaceDataFiles is actually just an overwrite operation with multiple files deleted and added.
TODO: technically, this could be a REPLACE operation but we aren't performing any validation here that there are no changes to the underlying data. A REPLACE operation is only valid if the data is exactly the same as the previous snapshot.
For now, we'll keep using an overwrite operation.
func (*Transaction) ReplaceDataFilesWithDataFiles ¶ added in v0.5.0
func (t *Transaction) ReplaceDataFilesWithDataFiles(ctx context.Context, filesToDelete, filesToAdd []iceberg.DataFile, snapshotProps iceberg.Properties, opts ...WriteOption) error
ReplaceDataFilesWithDataFiles replaces files using pre-built DataFile objects. This avoids scanning files to extract schema and statistics - the caller provides DataFile objects directly with all required metadata.
For the files to add, use iceberg.NewDataFileBuilder to construct DataFile objects with the appropriate metadata (path, record count, file size, partition values).
This method does not open files. It validates only metadata that can be checked without reading file contents.
By default this method automatically sets the schema name mapping in table properties if one does not already exist. Pass WithoutAutoNameMapping to disable this behavior, for example when working with catalogs that reject the name mapping property.
Callers are responsible for ensuring each DataFile is valid and consistent with the table. Supplying incorrect DataFile metadata can produce an invalid snapshot and break reads.
This is useful when:
- Files are written via a separate I/O path and metadata is already known
- Avoiding file scanning improves performance or reliability
- Working with storage systems where immediate file reads may be unreliable
func (*Transaction) ReplaceFiles ¶ added in v0.6.0
func (t *Transaction) ReplaceFiles(ctx context.Context, dataFilesToDelete, dataFilesToAdd, deleteFilesToRemove []iceberg.DataFile, snapshotProps iceberg.Properties, opts ...WriteOption) error
ReplaceFiles atomically replaces data files and removes associated delete files in a single snapshot. This is the commit primitive for compaction: old data files are replaced with new (compacted) data files, and delete files that are fully applied are removed.
func (*Transaction) RewriteDataFiles ¶ added in v0.6.0
func (t *Transaction) RewriteDataFiles(ctx context.Context, groups []CompactionTaskGroup, opts RewriteDataFilesOptions) (*RewriteResult, error)
RewriteDataFiles compacts the given groups by reading data with deletes applied, writing new consolidated files, and atomically replacing the old files. Position delete files that are fully applied (every referenced data file is in the rewrite set) are removed automatically.
Equality-delete cleanup is the caller's responsibility: compute the dead set with [compaction.CollectDeadEqualityDeletes] (against the same snapshot the rewrite is staged on) and pass it via [RewriteDataFilesOptions.ExtraDeleteFilesToRemove]. The executor only orchestrates the commit; it does not impose a cleanup policy. This split keeps the pure spec predicate in table/compaction and the unexported snapshot machinery in table.
Use [compaction.Config.PlanCompaction] to produce the groups, then convert [compaction.Group] → CompactionTaskGroup and pass them here. Distributed coordinators stage worker results via ExecuteCompactionGroup and commit them via Transaction.NewRewrite + RewriteFiles.Apply + RewriteFiles.Commit instead.
func (*Transaction) RollbackToSnapshot ¶ added in v0.6.0
func (t *Transaction) RollbackToSnapshot(snapshotID int64) error
func (*Transaction) Scan ¶ added in v0.2.0
func (t *Transaction) Scan(opts ...ScanOption) (*Scan, error)
func (*Transaction) SetProperties ¶ added in v0.2.0
func (t *Transaction) SetProperties(props iceberg.Properties) error
func (*Transaction) StagedTable ¶ added in v0.2.0
func (t *Transaction) StagedTable() (*StagedTable, error)
func (*Transaction) TableCommit ¶ added in v0.6.0
func (t *Transaction) TableCommit() (TableCommit, error)
TableCommit returns a TableCommit representing the pending changes in this transaction without actually committing them. This is intended for multi-table transactions where several TableCommit values are collected and submitted together via [catalog.TransactionalCatalog.CommitTransaction].
Most callers should use [catalog.MultiTableTransaction] instead of calling this method directly — it handles extraction, commit, and lifecycle management automatically.
The method automatically includes an AssertTableUUID requirement, matching the behavior of Transaction.Commit.
TableCommit does not mark the transaction as committed — the caller is responsible for either calling Commit (single-table) or submitting the returned TableCommit via CommitTransaction (multi-table). After a successful multi-table commit the caller should call MarkCommitted to prevent accidental reuse.
PostCommit hooks are NOT executed by this method. Because the multi-table commit endpoint returns 204 No Content (no metadata), callers must LoadTable after a successful CommitTransaction if they need updated state.
func (*Transaction) UpdateSchema ¶ added in v0.5.0
func (t *Transaction) UpdateSchema(caseSensitive bool, allowIncompatibleChanges bool, opts ...UpdateSchemaOption) *UpdateSchema
UpdateSchema creates a new UpdateSchema instance for managing schema changes within this transaction.
Parameters:
- caseSensitive: If true, field name lookups are case-sensitive; if false, field names are matched case-insensitively.
- allowIncompatibleChanges: If true, allows schema changes that would normally be rejected for being incompatible (e.g., adding required fields without default values, changing field types in non-promotable ways, or changing column nullability from optional to required).
- opts: Optional configuration functions to customize the UpdateSchema behavior.
Returns an UpdateSchema instance that can be used to build and apply schema changes.
func (*Transaction) UpdateSpec ¶ added in v0.4.0
func (t *Transaction) UpdateSpec(caseSensitive bool) *UpdateSpec
func (*Transaction) UpgradeFormatVersion ¶ added in v0.6.0
func (t *Transaction) UpgradeFormatVersion(version int) error
UpgradeFormatVersion upgrades the table to the given format version. Downgrading is not allowed. If the table is already at the given version, this is a no-op.
func (*Transaction) WriteEqualityDeletes ¶ added in v0.6.0
func (t *Transaction) WriteEqualityDeletes(ctx context.Context, equalityFieldIDs []int, records iter.Seq2[arrow.RecordBatch, error]) ([]iceberg.DataFile, error)
WriteEqualityDeletes writes Arrow record batches as equality delete Parquet files and returns the resulting DataFiles. The returned files have ContentType == EntryContentEqDeletes and EqualityFieldIDs set, ready to be passed to RowDelta.AddDeletes.
The equalityFieldIDs identify which columns in the table schema form the delete key. The provided records must contain exactly those columns.
The table must use format version 2 or higher.
For partitioned tables, the provided records must include the partition source columns in addition to the equality key columns so that records can be routed to the correct partition directories. If the partition source columns overlap with the equality key columns, no extra columns are needed.
Usage:
deleteFiles, err := tx.WriteEqualityDeletes(ctx, []int{1, 2}, records)
rd := tx.NewRowDelta(nil)
rd.AddDeletes(deleteFiles...)
err = rd.Commit(ctx)
type UnsupportedType ¶ added in v0.5.0
type UnsupportedType struct {
MinFormatVersion int
}
type Update ¶ added in v0.2.0
type Update interface {
// Action returns the name of the action that the update represents.
Action() string
// Apply applies the update to the given metadata builder.
Apply(*MetadataBuilder) error
// PostCommit is called after successful commit of the update
PostCommit(context.Context, *Table, *Table) error
}
Update represents a change to a table's metadata.
func NewSetLocationUpdate ¶ added in v0.2.0
NewSetLocationUpdate creates a new update that sets the location of the table metadata.
type UpdateSchema ¶ added in v0.5.0
type UpdateSchema struct {
// contains filtered or unexported fields
}
UpdateSchema manages schema evolution operations within a transaction. It supports adding, deleting, renaming, updating, and reordering columns, and ensures all changes are validated before being committed.
Operations can be chained together and are applied in the order they are called. Changes are not persisted until Commit() is called.
Basic Usage:
txn := table.NewTransaction()
updateSchema := txn.UpdateSchema(true, false)
// Add a new column
updateSchema.AddColumn([]string{"email"}, iceberg.PrimitiveTypes.String, "Email address", false, nil)
// Commit changes
if err := updateSchema.Commit(); err != nil {
return err
}
if _, err := txn.Commit(ctx); err != nil {
return err
}
Chaining Operations:
updateSchema.
AddColumn([]string{"age"}, iceberg.PrimitiveTypes.Int, "User age", false, nil).
RenameColumn([]string{"name"}, "full_name").
MoveFirst([]string{"id"}).
Commit()
Adding Nested Columns:
// Add a column to a struct field
updateSchema.AddColumn([]string{"address", "country"}, iceberg.PrimitiveTypes.String, "Country code", false, iceberg.StringLiteral("US"))
// Commit the schema update
if err := updateSchema.Commit(); err != nil {
return err
}
if _, err := txn.Commit(ctx); err != nil {
return err
}
func NewUpdateSchema ¶ added in v0.5.0
func NewUpdateSchema(txn *Transaction, caseSensitive bool, allowIncompatibleChanges bool, opts ...UpdateSchemaOption) *UpdateSchema
NewUpdateSchema creates a new UpdateSchema instance for managing schema changes within a transaction.
Parameters:
- txn: The transaction that this schema update will be applied to.
- caseSensitive: If true, field name lookups are case-sensitive; if false, field names are matched case-insensitively.
- allowIncompatibleChanges: If true, allows schema changes that would normally be rejected for being incompatible (e.g., adding required fields without default values, changing field types in non-promotable ways, or changing column nullability from optional to required).
- opts: Optional configuration functions to customize the UpdateSchema behavior.
Returns an UpdateSchema instance that can be used to build and apply schema changes.
func (*UpdateSchema) AddColumn ¶ added in v0.5.0
func (u *UpdateSchema) AddColumn(path []string, fieldType iceberg.Type, doc string, required bool, defaultValue iceberg.Literal) *UpdateSchema
func (*UpdateSchema) Apply ¶ added in v0.5.0
func (u *UpdateSchema) Apply() (*iceberg.Schema, error)
func (*UpdateSchema) BuildUpdates ¶ added in v0.5.0
func (u *UpdateSchema) BuildUpdates() ([]Update, []Requirement, error)
func (*UpdateSchema) Commit ¶ added in v0.5.0
func (u *UpdateSchema) Commit() error
func (*UpdateSchema) DeleteColumn ¶ added in v0.5.0
func (u *UpdateSchema) DeleteColumn(path []string) *UpdateSchema
func (*UpdateSchema) MoveAfter ¶ added in v0.5.0
func (u *UpdateSchema) MoveAfter(path, relativeTo []string) *UpdateSchema
func (*UpdateSchema) MoveBefore ¶ added in v0.5.0
func (u *UpdateSchema) MoveBefore(path, relativeTo []string) *UpdateSchema
func (*UpdateSchema) MoveColumn ¶ added in v0.5.0
func (u *UpdateSchema) MoveColumn(op MoveOp, path, relativeTo []string) *UpdateSchema
func (*UpdateSchema) MoveFirst ¶ added in v0.5.0
func (u *UpdateSchema) MoveFirst(path []string) *UpdateSchema
func (*UpdateSchema) RenameColumn ¶ added in v0.5.0
func (u *UpdateSchema) RenameColumn(path []string, newName string) *UpdateSchema
func (*UpdateSchema) SetIdentifierField ¶ added in v0.5.0
func (u *UpdateSchema) SetIdentifierField(paths [][]string) *UpdateSchema
func (*UpdateSchema) UpdateColumn ¶ added in v0.5.0
func (u *UpdateSchema) UpdateColumn(path []string, update ColumnUpdate) *UpdateSchema
type UpdateSchemaOption ¶ added in v0.5.0
type UpdateSchemaOption func(*UpdateSchema)
UpdateSchemaOption is a functional option for configuring UpdateSchema.
func WithNameMapping ¶ added in v0.5.0
func WithNameMapping(nameMapping iceberg.NameMapping) UpdateSchemaOption
WithNameMapping configures the UpdateSchema to use the provided name mapping for tracking field name changes and ensuring consistency during schema evolution.
type UpdateSpec ¶ added in v0.4.0
type UpdateSpec struct {
// contains filtered or unexported fields
}
UpdateSpec implements a builder for evolving a table's partition specification.
It accumulates a sequence of partition spec update operations (e.g., AddField, RemoveField, RenameField) which are applied during BuildUpdates.
Use the builder methods to chain operations, and call BuildUpdates to apply them and produce the final set of partition fields and update requirements, or call Commit to apply the updates in the transaction.
func NewUpdateSpec ¶ added in v0.4.0
func NewUpdateSpec(t *Transaction, caseSensitive bool) *UpdateSpec
func (*UpdateSpec) AddField ¶ added in v0.4.0
func (us *UpdateSpec) AddField(sourceColName string, transform iceberg.Transform, partitionFieldName string) *UpdateSpec
func (*UpdateSpec) AddIdentity ¶ added in v0.4.0
func (us *UpdateSpec) AddIdentity(sourceColName string) *UpdateSpec
func (*UpdateSpec) Apply ¶ added in v0.4.0
func (us *UpdateSpec) Apply() (iceberg.PartitionSpec, error)
func (*UpdateSpec) BuildUpdates ¶ added in v0.4.0
func (us *UpdateSpec) BuildUpdates() ([]Update, []Requirement, error)
func (*UpdateSpec) Commit ¶ added in v0.4.0
func (us *UpdateSpec) Commit() error
func (*UpdateSpec) RemoveField ¶ added in v0.4.0
func (us *UpdateSpec) RemoveField(name string) *UpdateSpec
func (*UpdateSpec) RenameField ¶ added in v0.4.0
func (us *UpdateSpec) RenameField(name string, newName string) *UpdateSpec
type Updates ¶ added in v0.4.0
type Updates []Update
func (*Updates) UnmarshalJSON ¶ added in v0.4.0
type WriteOption ¶ added in v0.6.0
type WriteOption func(*dataFileCfg)
WriteOption is an option for methods that operate on pre-built DataFile objects.
func WithoutAutoNameMapping ¶ added in v0.6.0
func WithoutAutoNameMapping() WriteOption
WithoutAutoNameMapping disables the automatic setting of the schema name mapping in table properties. By default, methods like Transaction.AddDataFiles and Transaction.ReplaceDataFilesWithDataFiles will set the name mapping if one does not already exist. This option is useful when working with catalogs (such as Databricks Unity Catalog) that reject the name mapping property.
func WithoutDuplicateCheck ¶ added in v0.6.0
func WithoutDuplicateCheck() WriteOption
WithoutDuplicateCheck disables the duplicate file path check against existing data files in the current snapshot. By default, Transaction.AddDataFiles scans all manifests to ensure no file being added already exists in the table. For tables with many manifests this scan can be expensive because each manifest must be read from storage. Use this option when the caller can guarantee that the files being added are not already in the table.
type WriteRecordOption ¶ added in v0.6.0
type WriteRecordOption func(*writeRecordConfig)
WriteRecordOption configures the behavior of WriteRecords.
func WithClusteredWrite ¶ added in v0.6.0
func WithClusteredWrite() WriteRecordOption
WithClusteredWrite enables the memory-efficient clustered write path for partitioned tables. It keeps at most one partition writer open at a time: when a record arrives for a new partition, the current writer is flushed and closed before a new one is opened.
The input must be clustered by partition across batches: once a partition's writer has been closed, encountering further records for that partition returns an error. Within a single batch the writer reclusters rows by partition, so interleaved values like [a,b,a,b] are accepted; the strict check fires only across batch boundaries. This is the natural order for compaction, where each source data file typically belongs to a single partition. If the input is not clustered across batches, use the fanout writer (the default) instead.
Combining this option with WithMaxWriteWorkers is rejected by WriteRecords: the clustered path is single-threaded by design.
func WithMaxWriteWorkers ¶ added in v0.6.0
func WithMaxWriteWorkers(n int) WriteRecordOption
WithMaxWriteWorkers overrides the default number of fanout workers used for partitioned writes. Each worker processes record batches, partitions them, and writes to the appropriate partition files. Fewer workers means fewer concurrent parquet writers compressing pages simultaneously, which reduces peak memory. A value of 0 (the default) uses config.EnvConfig.MaxWorkers.
Combining this option with WithClusteredWrite is rejected by WriteRecords: the clustered write path is single-threaded by design, so the two options have no meaningful interaction.
func WithTargetFileSize ¶ added in v0.6.0
func WithTargetFileSize(size int64) WriteRecordOption
WithTargetFileSize overrides the table's default target file size.
func WithWriteUUID ¶ added in v0.6.0
func WithWriteUUID(id uuid.UUID) WriteRecordOption
WithWriteUUID sets a specific UUID for file naming.
type WriteTask ¶ added in v0.3.0
type WriteTask struct {
Uuid uuid.UUID
ID int
PartitionID int // PartitionID is the partition identifier used in data file naming.
FileCount int // FileCount is a sequential counter for files written by this task.
Schema *iceberg.Schema
Batches []arrow.RecordBatch
SortOrderID int
}
func (WriteTask) GenerateDataFileName ¶ added in v0.3.0
Source Files
¶
- arrow_scanner.go
- arrow_utils.go
- clustered_writer.go
- commit.go
- conflict_validation.go
- encryption.go
- equality_delete_reader.go
- equality_delete_writer.go
- evaluators.go
- gen_geo_fixtures.go
- locations.go
- metadata.go
- metadata_schema_compatibility.go
- orphan_cleanup.go
- partitioned_fanout_writer.go
- pos_delete_partitioned_fanout_writer.go
- properties.go
- refs.go
- requirements.go
- rewrite_data_files.go
- rewrite_files.go
- rolling_data_writer.go
- row_delta.go
- scanner.go
- snapshot_ancestry.go
- snapshot_producers.go
- snapshots.go
- sorting.go
- statistics.go
- table.go
- transaction.go
- update_schema.go
- update_spec.go
- updates.go
- write_records.go
- writer.go
Directories
¶
| Path | Synopsis |
|---|---|
| compaction | Package compaction provides bin-pack compaction planning for Iceberg tables. |