Documentation ¶
Overview ¶
Package arrow provides an Apache Arrow-backed compute engine for the dataset package. It implements dataset.ColumnFactory, dataset.BuilderFactory, dataset.Aggregator, and dataset.Caster using Arrow arrays and arrow/math SIMD kernels.
Usage:
eng := arrow.NewEngine(memory.DefaultAllocator)
f := eng.(dataset.ColumnFactory)
ds, _ := f.FromColumns(
dataset.NewSchema(dataset.FloatCol("x"), dataset.StringCol("label")),
f.NewFloat64Column("x", []float64{1, 2, 3}),
f.NewStringColumn("label", []string{"a", "b", "c"}),
)
Index ¶
- type Engine
- func (e *Engine) Abs(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Acos(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) AddCols(a, b dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) AddScalar(col dataset.AnyColumn, val float64) (dataset.AnyColumn, error)
- func (e *Engine) Alloc() memory.Allocator
- func (e *Engine) Asin(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Atan(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Atan2(y, x dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) BitAnd(a, b dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) BitNot(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) BitOr(a, b dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) BitShiftLeft(col dataset.AnyColumn, n int) (dataset.AnyColumn, error)
- func (e *Engine) BitShiftRight(col dataset.AnyColumn, n int) (dataset.AnyColumn, error)
- func (e *Engine) BitXor(a, b dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Cast(col dataset.AnyColumn, target dataset.DType) (dataset.AnyColumn, error)
- func (e *Engine) Ceil(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Combine(datasets ...dataset.Table) (dataset.Table, error)
- func (e *Engine) Complete(ds dataset.Table, cols ...string) (dataset.Table, error)
- func (e *Engine) Concatenate(ds dataset.Table, col string, from []string, sep string) (dataset.Table, error)
- func (e *Engine) Cos(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Count(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) CumMax(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) CumMin(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) CumSum(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) DenseRank(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) DivCols(a, b dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) DropNA(ds dataset.Table, cols ...string) (dataset.Table, error)
- func (e *Engine) Erf(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Exp(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Fill(col dataset.AnyColumn, dir dataset.FillDirection) (dataset.AnyColumn, error)
- func (e *Engine) Filter(ds dataset.Table, mask dataset.Masker) (dataset.Table, error)
- func (e *Engine) FilterIndices(mask []bool) []int
- func (e *Engine) Floor(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) FromColumns(schema *dataset.Schema, cols ...dataset.AnyColumn) (dataset.Table, error)
- func (e *Engine) Join(left, right dataset.Table, spec dataset.JoinSpec) (dataset.Table, error)
- func (e *Engine) Lag(col dataset.AnyColumn, offset int) (dataset.AnyColumn, error)
- func (e *Engine) Lead(col dataset.AnyColumn, offset int) (dataset.AnyColumn, error)
- func (e *Engine) Ln(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Log2(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Log10(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Mean(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Median(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) MinMax(col dataset.AnyColumn) (dataset.AnyColumn, dataset.AnyColumn, error)
- func (e *Engine) MulCols(a, b dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) MulScalar(col dataset.AnyColumn, val float64) (dataset.AnyColumn, error)
- func (e *Engine) Name() string
- func (e *Engine) Neg(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) NewBoolColumn(name string, data []bool) dataset.AnyColumn
- func (e *Engine) NewBuilder(schema *dataset.Schema) dataset.Builder
- func (e *Engine) NewFloat64Column(name string, data []float64) dataset.AnyColumn
- func (e *Engine) NewInt64Column(name string, data []int64) dataset.AnyColumn
- func (e *Engine) NewStringColumn(name string, data []string) dataset.AnyColumn
- func (e *Engine) NewTimestampColumn(name string, data []int64) dataset.AnyColumn
- func (e *Engine) PercentRank(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) PivotLonger(ds dataset.Table, spec dataset.PivotLongerSpec) (dataset.Table, error)
- func (e *Engine) PivotWider(ds dataset.Table, spec dataset.PivotWiderSpec) (dataset.Table, error)
- func (e *Engine) Pow(col dataset.AnyColumn, exp float64) (dataset.AnyColumn, error)
- func (e *Engine) Rank(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) ReadCSV(ctx context.Context, r io.Reader, cfg dataset.CSVConfig) (dataset.Table, error)
- func (e *Engine) ReadParquet(ctx context.Context, r io.ReaderAt, size int64, cfg dataset.ParquetConfig) (dataset.Table, error)
- func (e *Engine) ReplaceNA(col dataset.AnyColumn, defaultVal float64) (dataset.AnyColumn, error)
- func (e *Engine) Round(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) RowNumber(n int) (dataset.AnyColumn, error)
- func (e *Engine) Select(col dataset.AnyColumn, indices []int) (dataset.AnyColumn, error)
- func (e *Engine) Separate(ds dataset.Table, col string, into []string, sep string) (dataset.Table, error)
- func (e *Engine) Sigmoid(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Sign(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Sin(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Slice(col dataset.AnyColumn, start, end int) (dataset.AnyColumn, error)
- func (e *Engine) SortIndices(col dataset.AnyColumn) ([]int, error)
- func (e *Engine) Sqrt(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Stack(datasets ...dataset.Table) (dataset.Table, error)
- func (e *Engine) SubCols(a, b dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Sum(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Tan(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Tanh(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) Variance(col dataset.AnyColumn) (dataset.AnyColumn, error)
- func (e *Engine) WriteCSV(ctx context.Context, w io.Writer, ds dataset.Table, cfg dataset.CSVConfig) error
- func (e *Engine) WriteParquet(ctx context.Context, w io.Writer, ds dataset.Table, cfg dataset.ParquetConfig) error
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type Engine ¶
type Engine struct {
// contains filtered or unexported fields
}
Engine is the Arrow compute backend.
func (*Engine) BitShiftLeft ¶
func (*Engine) BitShiftRight ¶
func (*Engine) Complete ¶
Complete generates all combinations of the specified columns' unique values.
func (*Engine) Concatenate ¶
func (e *Engine) Concatenate(ds dataset.Table, col string, from []string, sep string) (dataset.Table, error)
Concatenate joins multiple string columns into one with a separator.
func (*Engine) FilterIndices ¶
func (*Engine) FromColumns ¶
func (*Engine) Join ¶
Join implements the Joiner interface with a hash-join algorithm. It supports Inner, Left, Right, Full, Semi, and Anti joins.
func (*Engine) NewBoolColumn ¶
func (*Engine) NewFloat64Column ¶
func (*Engine) NewInt64Column ¶
func (*Engine) NewStringColumn ¶
func (*Engine) NewTimestampColumn ¶
func (*Engine) PercentRank ¶
func (*Engine) PivotLonger ¶
PivotLonger reshapes a wide dataset to long format.
func (*Engine) PivotWider ¶
PivotWider reshapes a long dataset to wide format.
func (*Engine) ReadCSV ¶
func (e *Engine) ReadCSV(ctx context.Context, r io.Reader, cfg dataset.CSVConfig) (dataset.Table, error)
ReadCSV reads CSV data using arrow/csv.NewInferringReader with chunked streaming. The default chunk size is 64K rows per batch, which bounds memory use for large files.
func (*Engine) ReadParquet ¶
func (e *Engine) ReadParquet(ctx context.Context, r io.ReaderAt, size int64, cfg dataset.ParquetConfig) (dataset.Table, error)
ReadParquet reads Parquet data using pqarrow for zero-copy columnar ingest.
func (*Engine) Separate ¶
func (e *Engine) Separate(ds dataset.Table, col string, into []string, sep string) (dataset.Table, error)
Separate splits a string column by a delimiter into multiple columns.
func (*Engine) SortIndices ¶
SortIndices uses Arrow compute's SortIndicesArray kernel. Arrow's implementation handles null placement and type dispatch natively.