Documentation
¶
Index ¶
- func AppendEpisodeParquet(ctx context.Context, dst, src string, opts AppendEpisodeOptions) error
- func BuildArrowSchema(features map[string]meta.FeatureSpec) (*arrow.Schema, error)
- func BuildHFSchemaMetadata(features map[string]meta.FeatureSpec) (arrow.Metadata, error)
- func ColumnsForStats(ctx context.Context, path string) (map[string]any, error)
- func ConcatTables(alloc memory.Allocator, tables []arrow.Table) (arrow.Table, error)
- func ExtractFloat64Column(tbl arrow.Table, name string) ([]float64, error)
- func ExtractInt64Column(tbl arrow.Table, name string) ([]int64, error)
- func ImageArrowType() arrow.DataType
- func LoadAllDataTables(ctx context.Context, paths []string, alloc memory.Allocator) (arrow.Table, error)
- func MergeParquetFiles(ctx context.Context, dst string, sources []string, alloc memory.Allocator) error
- func ReadTable(ctx context.Context, path string, alloc memory.Allocator) (arrow.Table, error)
- func ReadTasksParquet(ctx context.Context, path string) (map[string]int, error)
- func ReplaceInt64Column(tbl arrow.Table, name string, values []int64, alloc memory.Allocator) (arrow.Table, error)
- func RewriteEpisodeParquet(ctx context.Context, src string, opts AppendEpisodeOptions, ...) (arrow.Table, error)
- func SchemaWithHFMetadata(schema *arrow.Schema, features map[string]meta.FeatureSpec) (*arrow.Schema, error)
- func SliceTable(tbl arrow.Table, offset, length int64) (arrow.Table, error)
- func SliceTableByIndexRange(tbl arrow.Table, fromIndex, toIndex int64) (arrow.Table, error)
- func TableNumRows(path string) (int64, error)
- func UnflattenEpisodeStats(fields map[string][]float64) map[string]any
- func WriteEpisodeBatch(ctx context.Context, dst string, entries []EpisodeBatchEntry) error
- func WriteEpisodesParquet(root string, rows []EpisodeMetaInput, fileSizeLimitMB int) error
- func WriteTable(path string, tbl arrow.Table, alloc memory.Allocator) error
- func WriteTasksParquet(root string, taskMap map[string]int) error
- type AppendEpisodeOptions
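- type AppendWriter
- func NewAppendWriter(path string, schema *arrow.Schema) (*AppendWriter, error)
- func NewAppendWriterWithFeatures(path string, schema *arrow.Schema, features map[string]meta.FeatureSpec) (*AppendWriter, error)
- func OpenAppendWriter(path string, schema *arrow.Schema) (*AppendWriter, error)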
- func (w *AppendWriter) Close() error
- func (w *AppendWriter) WriteEpisodeColumns(columns map[string]any, length int, features map[string]meta.FeatureSpec) error
- func (w *AppendWriter) WriteRecordColumns(columns map[string]any, length int, features map[string]meta.FeatureSpec) error
- func (w *AppendWriter) WriteTable(tbl arrow.Table, chunkSize int64) error
- type EpisodeBatchEntry
- type EpisodeMetaInput
- type EpisodeMetaRow
- func ReadEpisodesMeta(root string) ([]EpisodeMetaRow, error)
- type ImageCell
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AppendEpisodeParquet ¶
func AppendEpisodeParquet(ctx context.Context, dst, src string, opts AppendEpisodeOptions) error
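func BuildArrowSchema ¶
func BuildArrowSchema(features map[string]meta.FeatureSpec) (*arrow.Schema, error)
func BuildHFSchemaMetadata ¶
func BuildHFSchemaMetadata(features map[string]meta.FeatureSpec) (arrow.Metadata, error)
func ColumnsForStats ¶
func ColumnsForStats(ctx context.Context, path string) (map[string]any, error)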
ColumnsForStats extracts numeric columns from an episode parquet for stats computation.
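func ConcatTables ¶
func ConcatTables(alloc memory.Allocator, tables []arrow.Table) (arrow.Table, error)
func ExtractFloat64Column ¶
func ExtractFloat64Column(tbl arrow.Table, name string) ([]float64, error)
func ImageArrowType ¶
func ImageArrowType() arrow.DataType
func LoadAllDataTables ¶
func LoadAllDataTables(ctx context.Context, paths []string, alloc memory.Allocator) (arrow.Table, error)
func MergeParquetFiles ¶
func MergeParquetFiles(ctx context.Context, dst string, sources []string, alloc memory.Allocator) error
func ReadTasksParquet ¶
func ReadTasksParquet(ctx context.Context, path string) (map[string]int, error)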
ReadTasksParquet returns a map from each task string to its task_index.
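func ReplaceInt64Column ¶
func ReplaceInt64Column(tbl arrow.Table, name string, values []int64, alloc memory.Allocator) (arrow.Table, error)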
func RewriteEpisodeParquet ¶
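func SchemaWithHFMetadata ¶
func SchemaWithHFMetadata(schema *arrow.Schema, features map[string]meta.FeatureSpec) (*arrow.Schema, error)
func SliceTableByIndexRange ¶
func SliceTableByIndexRange(tbl arrow.Table, fromIndex, toIndex int64) (arrow.Table, error)
func TableNumRows ¶
func TableNumRows(path string) (int64, error)
func UnflattenEpisodeStats ¶
func UnflattenEpisodeStats(fields map[string][]float64) map[string]any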
UnflattenEpisodeStats converts flattened stats/<feature>/<key> parquet columns into the nested v2.1 JSONL shape.
func WriteEpisodeBatch ¶
func WriteEpisodeBatch(ctx context.Context, dst string, entries []EpisodeBatchEntry) error
func WriteEpisodesParquet ¶
func WriteEpisodesParquet(root string, rows []EpisodeMetaInput, fileSizeLimitMB int) error
WriteEpisodesParquet writes meta/episodes/*.parquet with chunk rotation by on-disk size.
Types ¶
type AppendEpisodeOptions ¶
type AppendWriter ¶
type AppendWriter struct {
// contains filtered or unexported fields
}
AppendWriter mirrors Python's pq.ParquetWriter, using Snappy compression and dictionary encoding.
func NewAppendWriter ¶
func NewAppendWriter(path string, schema *arrow.Schema) (*AppendWriter, error)
func NewAppendWriterWithFeatures ¶
func NewAppendWriterWithFeatures(path string, schema *arrow.Schema, features map[string]meta.FeatureSpec) (*AppendWriter, error)
func OpenAppendWriter ¶
func OpenAppendWriter(path string, schema *arrow.Schema) (*AppendWriter, error)
func (*AppendWriter) Close ¶
func (w *AppendWriter) Close() error
func (*AppendWriter) WriteEpisodeColumns ¶
func (w *AppendWriter) WriteEpisodeColumns(columns map[string]any, length int, features map[string]meta.FeatureSpec) error
func (*AppendWriter) WriteRecordColumns ¶ added in v1.1.0
func (w *AppendWriter) WriteRecordColumns(columns map[string]any, length int, features map[string]meta.FeatureSpec) error
func (*AppendWriter) WriteTable ¶ added in v1.1.0
func (w *AppendWriter) WriteTable(tbl arrow.Table, chunkSize int64) error
type EpisodeBatchEntry ¶
type EpisodeBatchEntry struct {
SourcePath string
Options AppendEpisodeOptions
}
type EpisodeMetaInput ¶
type EpisodeMetaInput struct {
EpisodeIndex int
Tasks []string
Length int
Fields map[string]any
Stats stats.EpisodeStats
Features map[string]meta.FeatureSpec
}
type EpisodeMetaRow ¶
type EpisodeMetaRow struct {
EpisodeIndex int64
Length int64
Tasks []string
DatasetFromIndex int64
DatasetToIndex int64
DataChunkIndex int64
DataFileIndex int64
VideoFields map[string]any
StatsFields map[string][]float64
}
func ReadEpisodesMeta ¶
func ReadEpisodesMeta(root string) ([]EpisodeMetaRow, error)
Click to show internal directories.
Click to hide internal directories.