dataframe

package

v0.4.0 (latest) · Published: Aug 2, 2025 · License: Apache-2.0, MIT · Imports: 20 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/paveg/gorilla

Links

Open Source Insights

Documentation ¶

Overview ¶

Package dataframe provides high-performance DataFrame operations.

Package dataframe provides high-performance DataFrame operations with optimized HAVING clause evaluation.

Index ¶

Constants
type AccessPattern
type ArrowValueArray
type BinaryExprInterface
type CacheLocalityHint
type ColumnAccessInfo
type CompiledExpression
type CompiledHavingEvaluator
- func NewCompiledHavingEvaluator(predicate expr.Expr, hint PerformanceHint) (*CompiledHavingEvaluator, error)
- func (c *CompiledHavingEvaluator) CompileExpression() error
- func (c *CompiledHavingEvaluator) EvaluateAggregated(aggregatedData map[string]arrow.Array) (*array.Boolean, error)
- func (c *CompiledHavingEvaluator) GetMetrics() HavingPerformanceMetrics
- func (c *CompiledHavingEvaluator) Release()
type CompiledOperation
type ConstantFoldingRule
- func (r *ConstantFoldingRule) Apply(plan *ExecutionPlan) *ExecutionPlan
- func (r *ConstantFoldingRule) Name() string
type DataFrame
- func New(series ...ISeries) *DataFrame
- func (df *DataFrame) Column(name string) (ISeries, bool)
- func (df *DataFrame) Columns() []string
- func (df *DataFrame) Concat(others ...*DataFrame) *DataFrame
- func (df *DataFrame) Correlation(col1, col2 string) (float64, error)
- func (df *DataFrame) Drop(names ...string) *DataFrame
- func (df *DataFrame) GroupBy(columns ...string) *GroupBy
- func (df *DataFrame) HasColumn(name string) bool
- func (df *DataFrame) Join(right *DataFrame, options *JoinOptions) (*DataFrame, error)
- func (df *DataFrame) Lazy() *LazyFrame
- func (df *DataFrame) Len() int
- func (df *DataFrame) NumRows() int
- func (df *DataFrame) OptimizedJoin(right *DataFrame, options *JoinOptions) (*DataFrame, error)
- func (df *DataFrame) Release()
- func (df *DataFrame) RollingWindow(column string, windowSize int, operation string) (*DataFrame, error)
- func (df *DataFrame) SafeCollectParallel() (*DataFrame, error)
- func (df *DataFrame) SafeCollectParallelWithMonitoring() (*DataFrame, error)
- func (df *DataFrame) Select(names ...string) *DataFrame
- func (df *DataFrame) Slice(start, end int) *DataFrame
- func (df *DataFrame) Sort(column string, ascending bool) (*DataFrame, error)
- func (df *DataFrame) SortBy(columns []string, ascending []bool) (*DataFrame, error)
- func (df *DataFrame) String() string
- func (df *DataFrame) Width() int
- func (df *DataFrame) WindowFunction(function string, columns ...string) (*DataFrame, error)
- func (df *DataFrame) WindowFunctionWithPartition(function, orderBy, partitionBy string) (*DataFrame, error)
- func (df *DataFrame) WithConfig(opConfig config.OperationConfig) *DataFrame
type ExecutionPlan
- func CreateExecutionPlan(source *DataFrame, operations []LazyOperation) *ExecutionPlan
type ExpressionPlan
type FilterFusionRule
- func (r *FilterFusionRule) Apply(plan *ExecutionPlan) *ExecutionPlan
- func (r *FilterFusionRule) Name() string
type FilterMaskOperation
type FilterMemoryPoolManager
type FilterOperation
- func (f *FilterOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (f *FilterOperation) String() string
type FunctionExprInterface
type GroupBy
- func (gb *GroupBy) Agg(aggregations ...*expr.AggregationExpr) *DataFrame
type GroupByHavingOperation
- func (gh *GroupByHavingOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (gh *GroupByHavingOperation) Release()
- func (gh *GroupByHavingOperation) String() string
type GroupByOperation
- func NewGroupByOperation(groupByCols []string, aggregations []*expr.AggregationExpr) *GroupByOperation
- func NewGroupByOperationWithHaving(groupByCols []string, aggregations []*expr.AggregationExpr, ...) *GroupByOperation
- func (g *GroupByOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (g *GroupByOperation) String() string
type HavingMemoryPool
- func NewHavingMemoryPool(allocator memory.Allocator) *HavingMemoryPool
- func (p *HavingMemoryPool) GetAllocator() memory.Allocator
- func (p *HavingMemoryPool) GetBooleanBuilder() *array.BooleanBuilder
- func (p *HavingMemoryPool) PutBooleanBuilder(builder *array.BooleanBuilder)
- func (p *HavingMemoryPool) Release()
type HavingOperation
- func NewHavingOperation(predicate expr.Expr) *HavingOperation
- func (h *HavingOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (h *HavingOperation) Name() string
- func (h *HavingOperation) String() string
type HavingPerformanceMetrics
type ISeries
type JoinOperation
- func (j *JoinOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (j *JoinOperation) String() string
type JoinOptimizer
- func NewJoinOptimizer(left, right *DataFrame) *JoinOptimizer
- func (jo *JoinOptimizer) SelectStrategy(leftKeys, rightKeys []string) JoinStrategy
type JoinOptions
type JoinStrategy
type JoinType
type LazyFrame
- func (lf *LazyFrame) Collect(ctx ...context.Context) (*DataFrame, error)
- func (lf *LazyFrame) Filter(predicate expr.Expr) *LazyFrame
- func (lf *LazyFrame) GroupBy(columns ...string) *LazyGroupBy
- func (lf *LazyFrame) Join(right *LazyFrame, options *JoinOptions) *LazyFrame
- func (lf *LazyFrame) Release()
- func (lf *LazyFrame) SafeCollectParallel() (*DataFrame, error)
- func (lf *LazyFrame) SafeCollectParallelWithMonitoring() (*DataFrame, error)
- func (lf *LazyFrame) Select(columns ...string) *LazyFrame
- func (lf *LazyFrame) Sort(column string, ascending bool) *LazyFrame
- func (lf *LazyFrame) SortBy(columns []string, ascending []bool) *LazyFrame
- func (lf *LazyFrame) String() string
- func (lf *LazyFrame) WithColumn(name string, expr expr.Expr) *LazyFrame
type LazyGroupBy
- func (lgb *LazyGroupBy) Agg(aggregations ...*expr.AggregationExpr) *LazyFrame
- func (lgb *LazyGroupBy) AggWithHaving(havingPredicate expr.Expr, aggregations ...*expr.AggregationExpr) *LazyFrame
- func (lgb *LazyGroupBy) Count(column string) *LazyFrame
- func (lgb *LazyGroupBy) Having(predicate expr.Expr) *LazyFrame
- func (lgb *LazyGroupBy) Max(column string) *LazyFrame
- func (lgb *LazyGroupBy) Mean(column string) *LazyFrame
- func (lgb *LazyGroupBy) Min(column string) *LazyFrame
- func (lgb *LazyGroupBy) Sum(column string) *LazyFrame
type LazyOperation
type MemoryLayoutInfo
type MergeJoinOptimizer
- func NewMergeJoinOptimizer(leftSorted, rightSorted bool, sortKeys []string) *MergeJoinOptimizer
- func (mjo *MergeJoinOptimizer) Join(left, right *DataFrame, options *JoinOptions) (*DataFrame, error)
type OperationFusionRule
- func (r *OperationFusionRule) Apply(plan *ExecutionPlan) *ExecutionPlan
- func (r *OperationFusionRule) Name() string
type OperationType
type OptimizationRule
type OptimizedHashMap
- func NewOptimizedHashMap(estimatedSize int) *OptimizedHashMap
- func (ohm *OptimizedHashMap) Get(key string) ([]int, bool)
- func (ohm *OptimizedHashMap) Put(key string, value int)
type PerformanceHint
type PlanMetadata
type PredicatePushdownRule
- func (r *PredicatePushdownRule) Apply(plan *ExecutionPlan) *ExecutionPlan
- func (r *PredicatePushdownRule) Name() string
type ProjectionPushdownRule
- func (r *ProjectionPushdownRule) Apply(plan *ExecutionPlan) *ExecutionPlan
- func (r *ProjectionPushdownRule) Name() string
type QueryOptimizer
- func NewQueryOptimizer() *QueryOptimizer
- func (qo *QueryOptimizer) Optimize(plan *ExecutionPlan) *ExecutionPlan
type SelectOperation
- func (s *SelectOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (s *SelectOperation) String() string
type SortOperation
- func (s *SortOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (s *SortOperation) String() string
type TableStats
type UnaryExprInterface
type WithColumnOperation
- func (w *WithColumnOperation) Apply(df *DataFrame) (*DataFrame, error)
- func (w *WithColumnOperation) String() string

Constants ¶

View Source

const (
	// DefaultMaxPoolSize defines the default maximum memory pool size (64MB).
	DefaultMaxPoolSize = 64 * 1024 * 1024
	// MinParallelGroupThreshold defines the minimum groups needed for parallel execution.
	MinParallelGroupThreshold = 100
	// DefaultChunkSize defines the default chunk size for parallel processing.
	DefaultChunkSize = 1000
	// MinChunkSize defines the minimum chunk size for processing.
	MinChunkSize = 100
	// MaxChunkSize defines the maximum chunk size for processing.
	MaxChunkSize = 10000
	// MultiColumnThreshold defines threshold for multi-column cache locality.
	MultiColumnThreshold = 3
)

Variables ¶

This section is empty.

Functions ¶

This section is empty.

Types ¶

type AccessPattern ¶ added in v0.3.0

type AccessPattern int

AccessPattern describes how the column will be accessed.

const (
	AccessSequential AccessPattern = iota
	AccessRandom
	AccessGrouped
)

type ArrowValueArray ¶

type ArrowValueArray[T any] interface {
	Len() int
	IsNull(int) bool
	Value(int) T
}

ArrowValueArray represents an Arrow array with value access.

type BinaryExprInterface ¶

type BinaryExprInterface interface {
	expr.Expr
	Left() expr.Expr
	Op() expr.BinaryOp
	Right() expr.Expr
}

BinaryExprInterface defines the interface for binary expressions.

type CacheLocalityHint ¶ added in v0.3.0

type CacheLocalityHint int

CacheLocalityHint provides cache optimization hints.

const (
	CacheLocalitySequential CacheLocalityHint = iota
	CacheLocalityRandom
	CacheLocalityGrouped
)

type ColumnAccessInfo ¶ added in v0.3.0

type ColumnAccessInfo struct {
	// contains filtered or unexported fields
}

ColumnAccessInfo contains optimized column access information.

type CompiledExpression ¶ added in v0.3.0

type CompiledExpression struct {
	// contains filtered or unexported fields
}

CompiledExpression represents a pre-compiled expression for repeated evaluation.

type CompiledHavingEvaluator ¶ added in v0.3.0

type CompiledHavingEvaluator struct {
	// contains filtered or unexported fields
}

CompiledHavingEvaluator provides optimized evaluation for HAVING predicates.

func NewCompiledHavingEvaluator ¶ added in v0.3.0

func NewCompiledHavingEvaluator(predicate expr.Expr, hint PerformanceHint) (*CompiledHavingEvaluator, error)

NewCompiledHavingEvaluator creates a new optimized HAVING evaluator.

func (*CompiledHavingEvaluator) CompileExpression ¶ added in v0.3.0

func (c *CompiledHavingEvaluator) CompileExpression() error

CompileExpression compiles the HAVING predicate for optimized evaluation.

func (*CompiledHavingEvaluator) EvaluateAggregated ¶ added in v0.3.0

func (c *CompiledHavingEvaluator) EvaluateAggregated(aggregatedData map[string]arrow.Array) (*array.Boolean, error)

EvaluateAggregated evaluates the HAVING predicate against aggregated data.

func (*CompiledHavingEvaluator) GetMetrics ¶ added in v0.3.0

func (c *CompiledHavingEvaluator) GetMetrics() HavingPerformanceMetrics

GetMetrics returns the current performance metrics.

func (*CompiledHavingEvaluator) Release ¶ added in v0.3.0

func (c *CompiledHavingEvaluator) Release()

Release releases resources held by the evaluator.

type CompiledOperation ¶ added in v0.3.0

type CompiledOperation struct {
	// contains filtered or unexported fields
}

CompiledOperation represents a single optimized operation in the expression plan.

type ConstantFoldingRule ¶

type ConstantFoldingRule struct{}

ConstantFoldingRule evaluates constant expressions at planning time.

func (*ConstantFoldingRule) Apply ¶

func (r *ConstantFoldingRule) Apply(plan *ExecutionPlan) *ExecutionPlan

func (*ConstantFoldingRule) Name ¶

func (r *ConstantFoldingRule) Name() string

type DataFrame ¶

type DataFrame struct {
	// contains filtered or unexported fields
}

DataFrame represents a two-dimensional table of data with named, typed columns.

DataFrame provides efficient data manipulation operations through lazy evaluation and automatic parallelization. All operations preserve data integrity and type safety while optimizing for performance with large datasets.

Key characteristics:

- Columnar data storage using Apache Arrow arrays
- Lazy evaluation for query optimization
- Automatic parallel processing for operations on large datasets (>1000 rows)
- Type-safe operations with compile-time verification
- Memory-efficient operations with proper resource management

Example usage:

mem := memory.NewGoAllocator()
names := series.New("name", []string{"Alice", "Bob"}, mem)
ages := series.New("age", []int64{25, 30}, mem)

df := dataframe.New(names, ages)
defer df.Release() // Essential for memory management

// Lazy operations are chained and optimized
result, err := df.Lazy().
    Filter(expr.Col("age").Gt(expr.Lit(25))).
    Select("name").
    Collect()

Memory Management: DataFrames manage Apache Arrow memory and require explicit cleanup via Release(). Always use defer to ensure proper resource cleanup, even in error conditions.

func New ¶

func New(series ...ISeries) *DataFrame

New creates a new DataFrame from one or more Series columns.

All provided Series must have the same length, or the DataFrame creation will panic. Column names must be unique across all Series.

Parameters:

series: One or more ISeries representing the columns of the DataFrame.
        Each series becomes a column with the series name as the column name.

Returns:

*DataFrame: A new DataFrame containing the provided columns.

Panics:

- If series have different lengths
- If duplicate column names are provided
- If no series are provided

Example:

mem := memory.NewGoAllocator()
names := series.New("employee", []string{"Alice", "Bob", "Charlie"}, mem)
ages := series.New("age", []int64{25, 30, 35}, mem)
salaries := series.New("salary", []float64{50000.0, 60000.0, 70000.0}, mem)

df := dataframe.New(names, ages, salaries)
defer df.Release()

Memory Management: The DataFrame takes ownership of the provided Series. You must call Release() on the DataFrame to properly clean up memory. The original Series references remain valid but should also be released when no longer needed.

func (*DataFrame) Column ¶

func (df *DataFrame) Column(name string) (ISeries, bool)

Column returns the series for the given column name.

func (*DataFrame) Columns ¶

func (df *DataFrame) Columns() []string

Columns returns the names of all columns in order.

func (*DataFrame) Concat ¶

func (df *DataFrame) Concat(others ...*DataFrame) *DataFrame

Concat concatenates multiple DataFrames vertically (row-wise). All DataFrames must have the same column structure.

func (*DataFrame) Correlation ¶

func (df *DataFrame) Correlation(col1, col2 string) (float64, error)

Correlation calculates the Pearson correlation coefficient between two numeric columns.

func (*DataFrame) Drop ¶

func (df *DataFrame) Drop(names ...string) *DataFrame

Drop returns a new DataFrame without the specified columns.

func (*DataFrame) GroupBy ¶

func (df *DataFrame) GroupBy(columns ...string) *GroupBy

GroupBy creates a GroupBy object for the specified columns.

func (*DataFrame) HasColumn ¶

func (df *DataFrame) HasColumn(name string) bool

HasColumn checks if a column exists.

func (*DataFrame) Join ¶

func (df *DataFrame) Join(right *DataFrame, options *JoinOptions) (*DataFrame, error)

Join performs a join operation between two DataFrames.

func (*DataFrame) Lazy ¶

func (df *DataFrame) Lazy() *LazyFrame

Lazy converts a DataFrame to a LazyFrame for deferred operations.

This method creates a LazyFrame that wraps the current DataFrame, enabling lazy evaluation and query optimization. Operations added to the LazyFrame are not executed immediately but are instead accumulated in a query plan.

Returns:

*LazyFrame: A new LazyFrame wrapping this DataFrame with an empty operation queue.

Example:

// Convert DataFrame to LazyFrame for chained operations
lazy := df.Lazy()

// Chain operations without immediate execution
result, err := lazy.
    Filter(expr.Col("status").Eq(expr.Lit("active"))).
    Select("id", "name").
    Collect() // Operations execute here

Performance Benefits:

- Operations are optimized before execution
- Automatic parallelization for large datasets
- Memory-efficient streaming processing
- Predicate pushdown reduces data movement

The LazyFrame maintains a reference to the original DataFrame, so both objects should be properly released when no longer needed.

func (*DataFrame) Len ¶

func (df *DataFrame) Len() int

Len returns the number of rows (assumes all columns have the same length).

func (*DataFrame) NumRows ¶ added in v0.3.0

func (df *DataFrame) NumRows() int

NumRows returns the number of rows (alias for Len for compatibility).

func (*DataFrame) OptimizedJoin ¶

func (df *DataFrame) OptimizedJoin(right *DataFrame, options *JoinOptions) (*DataFrame, error)

OptimizedJoin performs join using the selected optimal strategy.

func (*DataFrame) Release ¶

func (df *DataFrame) Release()

Release releases all underlying Arrow memory.

func (*DataFrame) RollingWindow ¶

func (df *DataFrame) RollingWindow(column string, windowSize int, operation string) (*DataFrame, error)

RollingWindow applies a rolling window operation to a column.

func (*DataFrame) SafeCollectParallel ¶

func (df *DataFrame) SafeCollectParallel() (*DataFrame, error)

SafeCollectParallel performs memory-safe parallel collection using the new infrastructure.

func (*DataFrame) SafeCollectParallelWithMonitoring ¶

func (df *DataFrame) SafeCollectParallelWithMonitoring() (*DataFrame, error)

SafeCollectParallelWithMonitoring performs memory-safe parallel collection with memory monitoring.

func (*DataFrame) Select ¶

func (df *DataFrame) Select(names ...string) *DataFrame

Select returns a new DataFrame containing only the specified columns.

The operation preserves the order of columns as specified in the names parameter. If a column name doesn't exist, it is silently ignored. This is an eager operation that immediately creates a new DataFrame.

Parameters:

names: Variable number of column names to include in the result.
       Column names are case-sensitive and must match exactly.

Returns:

*DataFrame: A new DataFrame with only the selected columns.
            If no valid column names are provided, returns an empty DataFrame.

Example:

// Original DataFrame has columns: "name", "age", "salary", "department"
selected := df.Select("name", "age")          // Two columns
nameOnly := df.Select("name")                 // Single column
reordered := df.Select("salary", "name")      // Different order
partial := df.Select("name", "invalid")       // "invalid" ignored

Performance: Select is an O(k) operation where k is the number of selected columns. No data copying occurs - the new DataFrame shares references to the same underlying Arrow arrays.

Memory Management: The returned DataFrame shares column references with the original DataFrame. Both DataFrames should be released independently when no longer needed.

func (*DataFrame) Slice ¶

func (df *DataFrame) Slice(start, end int) *DataFrame

Slice creates a new DataFrame containing rows from start (inclusive) to end (exclusive).

func (*DataFrame) Sort ¶

func (df *DataFrame) Sort(column string, ascending bool) (*DataFrame, error)

Sort returns a new DataFrame sorted by the specified column.

func (*DataFrame) SortBy ¶

func (df *DataFrame) SortBy(columns []string, ascending []bool) (*DataFrame, error)

SortBy returns a new DataFrame sorted by multiple columns.

func (*DataFrame) String ¶

func (df *DataFrame) String() string

String returns a string representation of the DataFrame.

func (*DataFrame) Width ¶

func (df *DataFrame) Width() int

Width returns the number of columns.

func (*DataFrame) WindowFunction ¶

func (df *DataFrame) WindowFunction(function string, columns ...string) (*DataFrame, error)

WindowFunction applies a window function to the DataFrame.

func (*DataFrame) WindowFunctionWithPartition ¶

func (df *DataFrame) WindowFunctionWithPartition(function, orderBy, partitionBy string) (*DataFrame, error)

WindowFunctionWithPartition applies a window function with optional partitioning.

func (*DataFrame) WithConfig ¶

func (df *DataFrame) WithConfig(opConfig config.OperationConfig) *DataFrame

WithConfig returns a new DataFrame with the specified operation configuration.

type ExecutionPlan ¶

type ExecutionPlan struct {
	// contains filtered or unexported fields
}

ExecutionPlan represents a planned query execution with metadata.

func CreateExecutionPlan ¶

func CreateExecutionPlan(source *DataFrame, operations []LazyOperation) *ExecutionPlan

CreateExecutionPlan analyzes operations and creates an execution plan.

type ExpressionPlan ¶ added in v0.3.0

type ExpressionPlan struct {
	// contains filtered or unexported fields
}

ExpressionPlan represents an optimized execution plan for an expression.

type FilterFusionRule ¶

type FilterFusionRule struct{}

FilterFusionRule combines multiple filter operations into a single operation.

func (*FilterFusionRule) Apply ¶

func (r *FilterFusionRule) Apply(plan *ExecutionPlan) *ExecutionPlan

func (*FilterFusionRule) Name ¶

func (r *FilterFusionRule) Name() string

type FilterMaskOperation ¶ added in v0.3.1

type FilterMaskOperation interface {
	// contains filtered or unexported methods
}

FilterMaskOperation represents an operation that can apply boolean filter masks.

type FilterMemoryPoolManager ¶ added in v0.3.1

type FilterMemoryPoolManager struct {
	// contains filtered or unexported fields
}

FilterMemoryPoolManager manages the global memory pool for filter operations.

type FilterOperation ¶

type FilterOperation struct {
	// contains filtered or unexported fields
}

FilterOperation represents a filter operation.

func (*FilterOperation) Apply ¶

func (f *FilterOperation) Apply(df *DataFrame) (*DataFrame, error)

func (*FilterOperation) String ¶

func (f *FilterOperation) String() string

type FunctionExprInterface ¶

type FunctionExprInterface interface {
	expr.Expr
	Name() string
	Args() []expr.Expr
}

FunctionExprInterface defines the interface for function expressions.

type GroupBy ¶

type GroupBy struct {
	// contains filtered or unexported fields
}

GroupBy represents a grouped DataFrame for aggregation operations.

func (*GroupBy) Agg ¶

func (gb *GroupBy) Agg(aggregations ...*expr.AggregationExpr) *DataFrame

Agg performs aggregation operations on the grouped data.

type GroupByHavingOperation ¶ added in v0.3.0

type GroupByHavingOperation struct {
	// contains filtered or unexported fields
}

GroupByHavingOperation combines GroupBy, aggregation, and Having filtering.

func (*GroupByHavingOperation) Apply ¶ added in v0.3.0

func (gh *GroupByHavingOperation) Apply(df *DataFrame) (*DataFrame, error)

Apply performs the group-by, extracts the aggregations referenced by the predicate, computes them, and filters the grouped result.

func (*GroupByHavingOperation) Release ¶ added in v0.3.0

func (gh *GroupByHavingOperation) Release()

Release returns the cached allocator to the pool for reuse.

func (*GroupByHavingOperation) String ¶ added in v0.3.0

func (gh *GroupByHavingOperation) String() string

String returns a string representation of the operation.

type GroupByOperation ¶

type GroupByOperation struct {
	// contains filtered or unexported fields
}

GroupByOperation represents a group by and aggregation operation.

func NewGroupByOperation ¶ added in v0.3.0

func NewGroupByOperation(groupByCols []string, aggregations []*expr.AggregationExpr) *GroupByOperation

NewGroupByOperation creates a new GroupByOperation without a HAVING predicate (for backward compatibility).

func NewGroupByOperationWithHaving ¶ added in v0.3.0

func NewGroupByOperationWithHaving(
	groupByCols []string,
	aggregations []*expr.AggregationExpr,
	havingPredicate expr.Expr,
) *GroupByOperation

NewGroupByOperationWithHaving creates a new GroupByOperation with an optional HAVING predicate.

func (*GroupByOperation) Apply ¶

func (g *GroupByOperation) Apply(df *DataFrame) (*DataFrame, error)

func (*GroupByOperation) String ¶

func (g *GroupByOperation) String() string

type HavingMemoryPool ¶ added in v0.3.0

type HavingMemoryPool struct {
	// contains filtered or unexported fields
}

HavingMemoryPool manages memory allocation for HAVING operations.

func NewHavingMemoryPool ¶ added in v0.3.0

func NewHavingMemoryPool(allocator memory.Allocator) *HavingMemoryPool

NewHavingMemoryPool creates a new memory pool for HAVING operations.

func (*HavingMemoryPool) GetAllocator ¶ added in v0.3.0

func (p *HavingMemoryPool) GetAllocator() memory.Allocator

GetAllocator returns the underlying memory allocator.

func (*HavingMemoryPool) GetBooleanBuilder ¶ added in v0.3.0

func (p *HavingMemoryPool) GetBooleanBuilder() *array.BooleanBuilder

GetBooleanBuilder gets a boolean array builder from the pool.

func (*HavingMemoryPool) PutBooleanBuilder ¶ added in v0.3.0

func (p *HavingMemoryPool) PutBooleanBuilder(builder *array.BooleanBuilder)

PutBooleanBuilder returns a boolean array builder to the pool.

func (*HavingMemoryPool) Release ¶ added in v0.3.0

func (p *HavingMemoryPool) Release()

Release releases the memory pool.

type HavingOperation ¶ added in v0.3.0

type HavingOperation struct {
	// contains filtered or unexported fields
}

HavingOperation represents a HAVING clause that filters grouped data based on aggregation predicates.

func NewHavingOperation ¶ added in v0.3.0

func NewHavingOperation(predicate expr.Expr) *HavingOperation

NewHavingOperation creates a new HavingOperation with the given predicate.

func (*HavingOperation) Apply ¶ added in v0.3.0

func (h *HavingOperation) Apply(df *DataFrame) (*DataFrame, error)

Apply filters grouped DataFrame based on the aggregation predicate.

func (*HavingOperation) Name ¶ added in v0.3.0

func (h *HavingOperation) Name() string

Name returns the operation name for debugging.

func (*HavingOperation) String ¶ added in v0.3.0

func (h *HavingOperation) String() string

String returns a string representation of the HAVING operation.

type HavingPerformanceMetrics ¶ added in v0.3.0

type HavingPerformanceMetrics struct {
	EvaluationTime        time.Duration
	CompilationTime       time.Duration
	MemoryAllocations     int64
	ParallelEfficiency    float64
	ThroughputMBps        float64
	CacheHitRate          float64
	SelectivityActual     float64
	ColumnAccessTime      time.Duration
	ExpressionEvalTime    time.Duration
	FilterApplicationTime time.Duration
	MemoryAllocationTime  time.Duration
	// contains filtered or unexported fields
}

HavingPerformanceMetrics tracks performance metrics for HAVING operations.

type ISeries ¶

type ISeries interface {
	Name() string
	Len() int
	DataType() arrow.DataType
	IsNull(index int) bool
	String() string
	Array() arrow.Array
	Release()
	GetAsString(index int) string
}

ISeries provides a type-erased interface for Series of any type.

type JoinOperation ¶

type JoinOperation struct {
	// contains filtered or unexported fields
}

JoinOperation represents a join operation.

func (*JoinOperation) Apply ¶

func (j *JoinOperation) Apply(df *DataFrame) (*DataFrame, error)

func (*JoinOperation) String ¶

func (j *JoinOperation) String() string

type JoinOptimizer ¶

type JoinOptimizer struct {
	// contains filtered or unexported fields
}

JoinOptimizer selects optimal join strategy based on table statistics.

func NewJoinOptimizer ¶

func NewJoinOptimizer(left, right *DataFrame) *JoinOptimizer

NewJoinOptimizer creates a new join optimizer with table statistics.

func (*JoinOptimizer) SelectStrategy ¶

func (jo *JoinOptimizer) SelectStrategy(leftKeys, rightKeys []string) JoinStrategy

SelectStrategy chooses the optimal join strategy based on table characteristics.

type JoinOptions ¶

type JoinOptions struct {
	Type      JoinType
	LeftKey   string   // Single join key for left DataFrame
	RightKey  string   // Single join key for right DataFrame
	LeftKeys  []string // Multiple join keys for left DataFrame
	RightKeys []string // Multiple join keys for right DataFrame
}

JoinOptions specifies parameters for join operations.

type JoinStrategy ¶

type JoinStrategy int

JoinStrategy represents different join algorithms.

const (
	HashJoinStrategy JoinStrategy = iota
	BroadcastJoinStrategy
	MergeJoinStrategy
	OptimizedHashJoinStrategy
)

type JoinType ¶

type JoinType int

JoinType represents the type of join operation.

const (
	InnerJoin JoinType = iota
	LeftJoin
	RightJoin
	FullOuterJoin
)

type LazyFrame ¶

type LazyFrame struct {
	// contains filtered or unexported fields
}

LazyFrame represents a DataFrame with deferred operations for optimized execution.

LazyFrame implements lazy evaluation, building up a query plan of operations without executing them immediately. This enables powerful optimizations including:

- Query optimization (predicate pushdown, operation fusion)
- Automatic parallelization for large datasets
- Memory-efficient processing through streaming
- Operation reordering for better performance

Operations are only executed when Collect() is called, at which point the entire query plan is optimized and executed in the most efficient manner.

Key characteristics:

- Zero-cost operation chaining until execution
- Automatic parallel execution for datasets > 1000 rows
- Query optimization with predicate pushdown
- Memory-efficient streaming for large operations
- Thread-safe parallel chunk processing

Example usage:

result, err := df.Lazy().
    Filter(expr.Col("age").Gt(expr.Lit(25))).
    Select("name", "department").
    GroupBy("department").
    Agg(expr.Count(expr.Col("*")).As("employee_count")).
    Collect()

The above builds a query plan and executes it optimally, potentially reordering operations and using parallel processing automatically.

func (*LazyFrame) Collect ¶

func (lf *LazyFrame) Collect(ctx ...context.Context) (*DataFrame, error)

Collect executes all accumulated operations and returns the final DataFrame.

This method triggers the execution of the entire query plan built up through lazy operations. The execution is optimized with:

- Query optimization (predicate pushdown, operation fusion)
- Automatic parallelization for large datasets (>1000 rows)
- Memory-efficient chunk processing
- Operation reordering for performance

Parameters:

ctx: Optional context for cancellation support. If provided, the operation
     can be canceled before completion.

Returns:

*DataFrame: The result of executing all operations in the query plan.
error: Any error encountered during execution.

Example:

result, err := df.Lazy().
    Filter(expr.Col("age").Gt(expr.Lit(25))).
    GroupBy("department").
    Agg(expr.Mean(expr.Col("salary")).As("avg_salary")).
    Collect()
if err != nil {
    log.Fatal(err)
}
defer result.Release()

Performance:

- Automatic parallel execution for datasets with >1000 rows
- Uses worker pools sized to available CPU cores
- Optimized memory allocation through pooling
- Query plan optimization reduces unnecessary operations

Memory Management: The returned DataFrame is independent and must be released when no longer needed. The LazyFrame and its source DataFrame remain valid after Collect().

Cancellation: If a context is provided, the operation will check for cancellation and return early if the context is canceled. This is useful for long-running operations.

func (*LazyFrame) Filter ¶

func (lf *LazyFrame) Filter(predicate expr.Expr) *LazyFrame

Filter adds a filter operation to the lazy frame.

func (*LazyFrame) GroupBy ¶

func (lf *LazyFrame) GroupBy(columns ...string) *LazyGroupBy

GroupBy starts a group-by on the specified columns and returns a LazyGroupBy, on which aggregations can then be applied.

func (*LazyFrame) Join ¶

func (lf *LazyFrame) Join(right *LazyFrame, options *JoinOptions) *LazyFrame

Join adds a join operation to the lazy frame.

func (*LazyFrame) Release ¶

func (lf *LazyFrame) Release()

Release releases resources.

func (*LazyFrame) SafeCollectParallel ¶

func (lf *LazyFrame) SafeCollectParallel() (*DataFrame, error)

SafeCollectParallel executes all deferred operations using memory-safe parallel processing.

func (*LazyFrame) SafeCollectParallelWithMonitoring ¶

func (lf *LazyFrame) SafeCollectParallelWithMonitoring() (*DataFrame, error)

SafeCollectParallelWithMonitoring executes operations with memory monitoring and adaptive parallelism.

func (*LazyFrame) Select ¶

func (lf *LazyFrame) Select(columns ...string) *LazyFrame

Select adds a column selection operation to the lazy frame.

func (*LazyFrame) Sort ¶

func (lf *LazyFrame) Sort(column string, ascending bool) *LazyFrame

Sort adds a sort operation to the lazy frame.

func (*LazyFrame) SortBy ¶

func (lf *LazyFrame) SortBy(columns []string, ascending []bool) *LazyFrame

SortBy adds a multi-column sort operation to the lazy frame.

func (*LazyFrame) String ¶

func (lf *LazyFrame) String() string

String returns a string representation of the lazy frame and its operations.

func (*LazyFrame) WithColumn ¶

func (lf *LazyFrame) WithColumn(name string, expr expr.Expr) *LazyFrame

WithColumn adds a column creation/modification operation to the lazy frame.

type LazyGroupBy ¶

type LazyGroupBy struct {
	// contains filtered or unexported fields
}

LazyGroupBy represents a lazy groupby operation that can be followed by aggregations.

func (*LazyGroupBy) Agg ¶

func (lgb *LazyGroupBy) Agg(aggregations ...*expr.AggregationExpr) *LazyFrame

Agg performs aggregation operations and returns a new LazyFrame.

func (*LazyGroupBy) AggWithHaving ¶ added in v0.3.0

func (lgb *LazyGroupBy) AggWithHaving(havingPredicate expr.Expr, aggregations ...*expr.AggregationExpr) *LazyFrame

AggWithHaving performs aggregation operations with an optional HAVING predicate and returns a new LazyFrame.

func (*LazyGroupBy) Count ¶

func (lgb *LazyGroupBy) Count(column string) *LazyFrame

Count creates a count aggregation for the specified column.

func (*LazyGroupBy) Having ¶ added in v0.3.0

func (lgb *LazyGroupBy) Having(predicate expr.Expr) *LazyFrame

Having adds a HAVING clause to filter grouped data based on aggregation predicates.

func (*LazyGroupBy) Max ¶

func (lgb *LazyGroupBy) Max(column string) *LazyFrame

Max creates a max aggregation for the specified column.

func (*LazyGroupBy) Mean ¶

func (lgb *LazyGroupBy) Mean(column string) *LazyFrame

Mean creates a mean aggregation for the specified column.

func (*LazyGroupBy) Min ¶

func (lgb *LazyGroupBy) Min(column string) *LazyFrame

Min creates a min aggregation for the specified column.

func (*LazyGroupBy) Sum ¶

func (lgb *LazyGroupBy) Sum(column string) *LazyFrame

Sum creates a sum aggregation for the specified column.

type LazyOperation ¶

type LazyOperation interface {
	Apply(df *DataFrame) (*DataFrame, error)
	String() string
}

LazyOperation represents a deferred operation on a DataFrame.

type MemoryLayoutInfo ¶ added in v0.3.0

type MemoryLayoutInfo struct {
	// contains filtered or unexported fields
}

MemoryLayoutInfo contains memory optimization hints.

type MergeJoinOptimizer ¶

type MergeJoinOptimizer struct {
	// contains filtered or unexported fields
}

MergeJoinOptimizer tracks whether the DataFrames being joined are pre-sorted.

func NewMergeJoinOptimizer ¶

func NewMergeJoinOptimizer(leftSorted, rightSorted bool, sortKeys []string) *MergeJoinOptimizer

NewMergeJoinOptimizer creates a merge join optimizer.

func (*MergeJoinOptimizer) Join ¶

func (mjo *MergeJoinOptimizer) Join(left, right *DataFrame, options *JoinOptions) (*DataFrame, error)

Join performs an optimized merge join.

type OperationFusionRule ¶

type OperationFusionRule struct{}

OperationFusionRule combines compatible operations for better performance.

func (*OperationFusionRule) Apply ¶

func (r *OperationFusionRule) Apply(plan *ExecutionPlan) *ExecutionPlan

func (*OperationFusionRule) Name ¶

func (r *OperationFusionRule) Name() string

type OperationType ¶ added in v0.3.0

type OperationType int

OperationType represents the type of compiled operation.

const (
	OpTypeColumnAccess OperationType = iota
	OpTypeLiteral
	OpTypeAggregation
	OpTypeComparison
	OpTypeArithmetic
	OpTypeLogical
)

type OptimizationRule ¶

type OptimizationRule interface {
	Apply(plan *ExecutionPlan) *ExecutionPlan
	Name() string
}

OptimizationRule represents a single optimization transformation.

type OptimizedHashMap ¶

type OptimizedHashMap struct {
	// contains filtered or unexported fields
}

OptimizedHashMap uses xxhash for better performance.

func NewOptimizedHashMap ¶

func NewOptimizedHashMap(estimatedSize int) *OptimizedHashMap

NewOptimizedHashMap creates a new optimized hash map.

func (*OptimizedHashMap) Get ¶

func (ohm *OptimizedHashMap) Get(key string) ([]int, bool)

Get retrieves values for a key.

func (*OptimizedHashMap) Put ¶

func (ohm *OptimizedHashMap) Put(key string, value int)

Put adds a key-value pair to the hash map.

type PerformanceHint ¶ added in v0.3.0

type PerformanceHint struct {
	ExpectedGroupCount       int
	ExpectedSelectivity      float64
	PreferMemoryOptimization bool
	EnableParallelization    bool
	MaxMemoryUsage           int64
}

PerformanceHint provides optimization hints for evaluation.

type PlanMetadata ¶

type PlanMetadata struct {
	// contains filtered or unexported fields
}

PlanMetadata contains analysis information about the execution plan.

type PredicatePushdownRule ¶

type PredicatePushdownRule struct{}

PredicatePushdownRule moves filter operations earlier in the pipeline.

func (*PredicatePushdownRule) Apply ¶

func (r *PredicatePushdownRule) Apply(plan *ExecutionPlan) *ExecutionPlan

func (*PredicatePushdownRule) Name ¶

func (r *PredicatePushdownRule) Name() string

type ProjectionPushdownRule ¶

type ProjectionPushdownRule struct{}

ProjectionPushdownRule pushes column selections earlier to reduce data processing.

func (*ProjectionPushdownRule) Apply ¶

func (r *ProjectionPushdownRule) Apply(plan *ExecutionPlan) *ExecutionPlan

func (*ProjectionPushdownRule) Name ¶

func (r *ProjectionPushdownRule) Name() string

type QueryOptimizer ¶

type QueryOptimizer struct {
	// contains filtered or unexported fields
}

QueryOptimizer applies optimization rules to improve query performance.

func NewQueryOptimizer ¶

func NewQueryOptimizer() *QueryOptimizer

NewQueryOptimizer creates a new optimizer with default rules.

func (*QueryOptimizer) Optimize ¶

func (qo *QueryOptimizer) Optimize(plan *ExecutionPlan) *ExecutionPlan

Optimize applies all optimization rules to the execution plan.

type SelectOperation ¶

type SelectOperation struct {
	// contains filtered or unexported fields
}

SelectOperation represents a column selection operation.

func (*SelectOperation) Apply ¶

func (s *SelectOperation) Apply(df *DataFrame) (*DataFrame, error)

func (*SelectOperation) String ¶

func (s *SelectOperation) String() string

type SortOperation ¶

type SortOperation struct {
	// contains filtered or unexported fields
}

SortOperation represents a sort operation.

func (*SortOperation) Apply ¶

func (s *SortOperation) Apply(df *DataFrame) (*DataFrame, error)

func (*SortOperation) String ¶

func (s *SortOperation) String() string

type TableStats ¶

type TableStats struct {
	RowCount      int
	SortedColumns map[string]bool
	Cardinality   map[string]int // distinct values per column
}

TableStats holds statistics about a DataFrame for optimization decisions.

type UnaryExprInterface ¶

type UnaryExprInterface interface {
	expr.Expr
	Op() expr.UnaryOp
	Operand() expr.Expr
}

UnaryExprInterface defines the interface for unary expressions.

type WithColumnOperation ¶

type WithColumnOperation struct {
	// contains filtered or unexported fields
}

WithColumnOperation represents adding/modifying a column.

func (*WithColumnOperation) Apply ¶

func (w *WithColumnOperation) Apply(df *DataFrame) (*DataFrame, error)

func (*WithColumnOperation) String ¶

func (w *WithColumnOperation) String() string

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL