parquetx

package
v1.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 11, 2026 License: MIT Imports: 16 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AppendEpisodeParquet

func AppendEpisodeParquet(ctx context.Context, dst, src string, opts AppendEpisodeOptions) error

func BuildArrowSchema

func BuildArrowSchema(features map[string]meta.FeatureSpec) (*arrow.Schema, error)

func BuildHFSchemaMetadata

func BuildHFSchemaMetadata(features map[string]meta.FeatureSpec) (arrow.Metadata, error)

func ColumnsForStats

func ColumnsForStats(ctx context.Context, path string) (map[string]any, error)

ColumnsForStats extracts numeric columns from an episode parquet for stats computation.

func ConcatTables

func ConcatTables(alloc memory.Allocator, tables []arrow.Table) (arrow.Table, error)

func ExtractFloat64Column

func ExtractFloat64Column(tbl arrow.Table, name string) ([]float64, error)

func ExtractInt64Column

func ExtractInt64Column(tbl arrow.Table, name string) ([]int64, error)

func ImageArrowType

func ImageArrowType() arrow.DataType

func LoadAllDataTables

func LoadAllDataTables(ctx context.Context, paths []string, alloc memory.Allocator) (arrow.Table, error)

func MergeParquetFiles

func MergeParquetFiles(ctx context.Context, dst string, sources []string, alloc memory.Allocator) error

func ReadTable

func ReadTable(ctx context.Context, path string, alloc memory.Allocator) (arrow.Table, error)

func ReadTasksParquet

func ReadTasksParquet(ctx context.Context, path string) (map[string]int, error)

ReadTasksParquet returns task string -> task_index.

func ReplaceInt64Column

func ReplaceInt64Column(tbl arrow.Table, name string, values []int64, alloc memory.Allocator) (arrow.Table, error)

func RewriteEpisodeParquet

func RewriteEpisodeParquet(ctx context.Context, src string, opts AppendEpisodeOptions, alloc memory.Allocator) (arrow.Table, error)

func SchemaWithHFMetadata

func SchemaWithHFMetadata(schema *arrow.Schema, features map[string]meta.FeatureSpec) (*arrow.Schema, error)

func SliceTable

func SliceTable(tbl arrow.Table, offset, length int64) (arrow.Table, error)

func SliceTableByIndexRange

func SliceTableByIndexRange(tbl arrow.Table, fromIndex, toIndex int64) (arrow.Table, error)

func TableNumRows

func TableNumRows(path string) (int64, error)

func UnflattenEpisodeStats

func UnflattenEpisodeStats(fields map[string][]float64) map[string]any

UnflattenEpisodeStats converts parquet columns named stats/feature/key into the v2.1 JSONL shape.

func WriteEpisodeBatch

func WriteEpisodeBatch(ctx context.Context, dst string, entries []EpisodeBatchEntry) error

func WriteEpisodesParquet

func WriteEpisodesParquet(root string, rows []EpisodeMetaInput, fileSizeLimitMB int) error

WriteEpisodesParquet writes meta/episodes/*.parquet with chunk rotation by on-disk size.

func WriteTable

func WriteTable(path string, tbl arrow.Table, alloc memory.Allocator) error

func WriteTasksParquet

func WriteTasksParquet(root string, taskMap map[string]int) error

WriteTasksParquet writes meta/tasks.parquet in a format compatible with lerobot's load_tasks(). Task strings are stored as the pandas index column named "task".

Types

type AppendEpisodeOptions

// AppendEpisodeOptions bundles the per-episode parameters taken by
// AppendEpisodeParquet and RewriteEpisodeParquet, and carried by
// EpisodeBatchEntry.Options.
type AppendEpisodeOptions struct {
	// NOTE(review): presumably the dataset-wide frame index at which this
	// episode starts — confirm against the implementation.
	GlobalFrameIndex int64
	// NOTE(review): presumably the index assigned to the episode in the
	// destination dataset — confirm.
	EpisodeIndex     int64
	// NOTE(review): presumably the task strings associated with the episode — confirm.
	EpisodeTasks     []string
	// NOTE(review): same map type that ReadTasksParquet returns
	// (task string -> task_index); presumably used to resolve EpisodeTasks — confirm.
	GlobalTaskIndex  map[string]int
}

type AppendWriter

// AppendWriter is an opaque writer type: all of its fields are unexported.
// Values are obtained from NewAppendWriter, NewAppendWriterWithFeatures, or
// OpenAppendWriter, and the type exposes the methods Close,
// WriteEpisodeColumns, WriteRecordColumns, and WriteTable.
type AppendWriter struct {
	// contains filtered or unexported fields
}

AppendWriter mirrors Python's pq.ParquetWriter, using snappy compression and dictionary encoding.

func NewAppendWriter

func NewAppendWriter(path string, schema *arrow.Schema) (*AppendWriter, error)

func NewAppendWriterWithFeatures

func NewAppendWriterWithFeatures(path string, schema *arrow.Schema, features map[string]meta.FeatureSpec) (*AppendWriter, error)

func OpenAppendWriter

func OpenAppendWriter(path string, schema *arrow.Schema) (*AppendWriter, error)

func (*AppendWriter) Close

func (w *AppendWriter) Close() error

func (*AppendWriter) WriteEpisodeColumns

func (w *AppendWriter) WriteEpisodeColumns(columns map[string]any, length int, features map[string]meta.FeatureSpec) error

func (*AppendWriter) WriteRecordColumns added in v1.1.0

func (w *AppendWriter) WriteRecordColumns(columns map[string]any, length int, features map[string]meta.FeatureSpec) error

func (*AppendWriter) WriteTable added in v1.1.0

func (w *AppendWriter) WriteTable(tbl arrow.Table, chunkSize int64) error

type EpisodeBatchEntry

// EpisodeBatchEntry is the element type of the entries slice passed to
// WriteEpisodeBatch. It pairs a path with the AppendEpisodeOptions for it.
type EpisodeBatchEntry struct {
	// NOTE(review): presumably the path of the source episode parquet file — confirm.
	SourcePath string
	// Options holds the per-episode parameters (see AppendEpisodeOptions).
	Options    AppendEpisodeOptions
}

type EpisodeMetaInput

// EpisodeMetaInput is the element type of the rows slice passed to
// WriteEpisodesParquet.
type EpisodeMetaInput struct {
	// NOTE(review): presumably the episode's index within the dataset — confirm.
	EpisodeIndex int
	// NOTE(review): presumably the task strings for the episode — confirm.
	Tasks        []string
	// NOTE(review): presumably the number of frames in the episode — confirm.
	Length       int
	// NOTE(review): free-form extra values keyed by name; how they map to
	// parquet columns is not visible here — confirm.
	Fields       map[string]any
	// Stats uses the EpisodeStats type from the stats package.
	Stats        stats.EpisodeStats
	// Features uses the same map[string]meta.FeatureSpec type accepted by
	// BuildArrowSchema and BuildHFSchemaMetadata.
	Features     map[string]meta.FeatureSpec
}

type EpisodeMetaRow

// EpisodeMetaRow is the element type of the slice returned by ReadEpisodesMeta.
type EpisodeMetaRow struct {
	// NOTE(review): presumably the episode's index within the dataset — confirm.
	EpisodeIndex     int64
	// NOTE(review): presumably the number of frames in the episode — confirm.
	Length           int64
	// NOTE(review): presumably the task strings for the episode — confirm.
	Tasks            []string
	// NOTE(review): DatasetFromIndex/DatasetToIndex presumably bound the
	// episode's frames in the global dataset index; whether the upper bound
	// is inclusive is not visible here — confirm.
	DatasetFromIndex int64
	DatasetToIndex   int64
	// NOTE(review): DataChunkIndex/DataFileIndex presumably identify the data
	// parquet file that holds the episode — confirm.
	DataChunkIndex   int64
	DataFileIndex    int64
	// NOTE(review): presumably video-related metadata columns keyed by name — confirm.
	VideoFields      map[string]any
	// StatsFields has the same map[string][]float64 type that
	// UnflattenEpisodeStats takes as input.
	StatsFields      map[string][]float64
}

func ReadEpisodesMeta

func ReadEpisodesMeta(root string) ([]EpisodeMetaRow, error)

type ImageCell

// ImageCell holds one image value as a byte slice plus a path string.
type ImageCell struct {
	// NOTE(review): presumably the encoded image bytes — confirm.
	Bytes []byte
	// NOTE(review): presumably the image's file path; whether it may be empty
	// when Bytes is set is not visible here — confirm.
	Path  string
}

ImageCell matches HuggingFace datasets.Image embedded parquet storage.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL