Documentation
¶
Overview ¶
Package dataframe provides high-performance DataFrame operations
Index ¶
- Constants
- type ActualCost
- type AnalysisReport
- type ArrowValueArray
- type BinaryExprInterface
- type Bottleneck
- type ConstantFoldingRule
- type DataFrame
- func (df *DataFrame) Column(name string) (ISeries, bool)
- func (df *DataFrame) Columns() []string
- func (df *DataFrame) Concat(others ...*DataFrame) *DataFrame
- func (df *DataFrame) Correlation(col1, col2 string) (float64, error)
- func (df *DataFrame) Debug() *DataFrame
- func (df *DataFrame) Drop(names ...string) *DataFrame
- func (df *DataFrame) GroupBy(columns ...string) *GroupBy
- func (df *DataFrame) HasColumn(name string) bool
- func (df *DataFrame) Join(right *DataFrame, options *JoinOptions) (*DataFrame, error)
- func (df *DataFrame) Lazy() *LazyFrame
- func (df *DataFrame) Len() int
- func (df *DataFrame) OptimizedJoin(right *DataFrame, options *JoinOptions) (*DataFrame, error)
- func (df *DataFrame) Release()
- func (df *DataFrame) RollingWindow(column string, windowSize int, operation string) (*DataFrame, error)
- func (df *DataFrame) SafeCollectParallel() (*DataFrame, error)
- func (df *DataFrame) SafeCollectParallelWithMonitoring() (*DataFrame, error)
- func (df *DataFrame) Select(names ...string) *DataFrame
- func (df *DataFrame) Slice(start, end int) *DataFrame
- func (df *DataFrame) Sort(column string, ascending bool) (*DataFrame, error)
- func (df *DataFrame) SortBy(columns []string, ascending []bool) (*DataFrame, error)
- func (df *DataFrame) String() string
- func (df *DataFrame) Width() int
- func (df *DataFrame) WindowFunction(function string, columns ...string) (*DataFrame, error)
- func (df *DataFrame) WindowFunctionWithPartition(function, orderBy, partitionBy string) (*DataFrame, error)
- func (df *DataFrame) WithConfig(opConfig config.OperationConfig) *DataFrame
- func (df *DataFrame) WithDebugConfig(config DebugConfig) *DataFrame
- type DataFrameStats
- type DebugConfig
- type DebugExecutionPlan
- type DebugPlanMetadata
- type EstimatedCost
- type ExecutionPlan
- type FilterFusionRule
- type FilterOperation
- type FunctionExprInterface
- type GroupBy
- type GroupByOperation
- type ISeries
- type JoinOperation
- type JoinOptimizer
- type JoinOptions
- type JoinStrategy
- type JoinType
- type LazyFrame
- func (lf *LazyFrame) Collect() (*DataFrame, error)
- func (lf *LazyFrame) Explain() DebugExecutionPlan
- func (lf *LazyFrame) ExplainAnalyze() (DebugExecutionPlan, error)
- func (lf *LazyFrame) Filter(predicate expr.Expr) *LazyFrame
- func (lf *LazyFrame) GroupBy(columns ...string) *LazyGroupBy
- func (lf *LazyFrame) Join(right *LazyFrame, options *JoinOptions) *LazyFrame
- func (lf *LazyFrame) Release()
- func (lf *LazyFrame) SafeCollectParallel() (*DataFrame, error)
- func (lf *LazyFrame) SafeCollectParallelWithMonitoring() (*DataFrame, error)
- func (lf *LazyFrame) Select(columns ...string) *LazyFrame
- func (lf *LazyFrame) Sort(column string, ascending bool) *LazyFrame
- func (lf *LazyFrame) SortBy(columns []string, ascending []bool) *LazyFrame
- func (lf *LazyFrame) String() string
- func (lf *LazyFrame) WithColumn(name string, expr expr.Expr) *LazyFrame
- type LazyGroupBy
- func (lgb *LazyGroupBy) Agg(aggregations ...*expr.AggregationExpr) *LazyFrame
- func (lgb *LazyGroupBy) Count(column string) *LazyFrame
- func (lgb *LazyGroupBy) Max(column string) *LazyFrame
- func (lgb *LazyGroupBy) Mean(column string) *LazyFrame
- func (lgb *LazyGroupBy) Min(column string) *LazyFrame
- func (lgb *LazyGroupBy) Sum(column string) *LazyFrame
- type LazyOperation
- type LogLevel
- type MemoryStats
- type MergeJoinOptimizer
- type OperationFusionRule
- type OperationSummary
- type OperationTrace
- type OptimizationRule
- type OptimizedHashMap
- type OutputFormat
- type PlanCost
- type PlanMetadata
- type PlanNode
- type PlanStats
- type PredicatePushdownRule
- type ProjectionPushdownRule
- type QueryAnalyzer
- type QueryOptimizer
- type SelectOperation
- type SortOperation
- type TableStats
- type UnaryExprInterface
- type WithColumnOperation
Constants ¶
const ( // AvgBytesPerCell is the estimated average bytes per DataFrame cell AvgBytesPerCell = 8 // ParallelThreshold is the minimum number of rows to trigger parallel execution ParallelThreshold = 1000 // FilterSelectivity is the default assumed selectivity for filter operations FilterSelectivity = 0.5 )
Package-level constants for consistent usage across debug functionality
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type ActualCost ¶
type ActualCost struct {
Rows int64 `json:"rows"`
Memory int64 `json:"memory"`
CPU time.Duration `json:"cpu"`
Duration time.Duration `json:"duration"`
}
ActualCost contains actual cost metrics after execution
type AnalysisReport ¶
type AnalysisReport struct {
Operations []OperationTrace `json:"operations"`
Summary OperationSummary `json:"summary"`
Bottlenecks []Bottleneck `json:"bottlenecks"`
Suggestions []string `json:"suggestions"`
}
AnalysisReport contains the complete analysis report
type ArrowValueArray ¶
ArrowValueArray represents an Arrow array with value access
type BinaryExprInterface ¶
type BinaryExprInterface interface {
expr.Expr
Left() expr.Expr
Op() expr.BinaryOp
Right() expr.Expr
}
BinaryExprInterface defines the interface for binary expressions
type Bottleneck ¶
type Bottleneck struct {
Operation string `json:"operation"`
Duration time.Duration `json:"duration"`
Reason string `json:"reason"`
}
Bottleneck represents a performance bottleneck
type ConstantFoldingRule ¶
type ConstantFoldingRule struct{}
ConstantFoldingRule evaluates constant expressions at planning time
func (*ConstantFoldingRule) Apply ¶
func (r *ConstantFoldingRule) Apply(plan *ExecutionPlan) *ExecutionPlan
func (*ConstantFoldingRule) Name ¶
func (r *ConstantFoldingRule) Name() string
type DataFrame ¶
type DataFrame struct {
// contains filtered or unexported fields
}
DataFrame represents a table of data with typed columns
func (*DataFrame) Concat ¶
Concat concatenates multiple DataFrames vertically (row-wise) All DataFrames must have the same column structure
func (*DataFrame) Correlation ¶
Correlation calculates the Pearson correlation coefficient between two numeric columns
func (*DataFrame) Join ¶
func (df *DataFrame) Join(right *DataFrame, options *JoinOptions) (*DataFrame, error)
Join performs a join operation between two DataFrames
func (*DataFrame) OptimizedJoin ¶
func (df *DataFrame) OptimizedJoin(right *DataFrame, options *JoinOptions) (*DataFrame, error)
OptimizedJoin performs join using the selected optimal strategy
func (*DataFrame) Release ¶
func (df *DataFrame) Release()
Release releases all underlying Arrow memory
func (*DataFrame) RollingWindow ¶
func (df *DataFrame) RollingWindow(column string, windowSize int, operation string) (*DataFrame, error)
RollingWindow applies a rolling window operation to a column
func (*DataFrame) SafeCollectParallel ¶
SafeCollectParallel performs memory-safe parallel collection using the new infrastructure
func (*DataFrame) SafeCollectParallelWithMonitoring ¶
SafeCollectParallelWithMonitoring performs memory-safe parallel collection with memory monitoring
func (*DataFrame) Slice ¶
Slice creates a new DataFrame containing rows from start (inclusive) to end (exclusive)
func (*DataFrame) WindowFunction ¶
WindowFunction applies a window function to the DataFrame
func (*DataFrame) WindowFunctionWithPartition ¶
func (df *DataFrame) WindowFunctionWithPartition(function, orderBy, partitionBy string) (*DataFrame, error)
WindowFunctionWithPartition applies a window function with optional partitioning
func (*DataFrame) WithConfig ¶
func (df *DataFrame) WithConfig(opConfig config.OperationConfig) *DataFrame
WithConfig returns a new DataFrame with the specified operation configuration
func (*DataFrame) WithDebugConfig ¶
func (df *DataFrame) WithDebugConfig(config DebugConfig) *DataFrame
WithDebugConfig sets the debug configuration for the DataFrame
type DataFrameStats ¶
type DataFrameStats struct {
Rows int `json:"rows"`
Columns int `json:"columns"`
Memory int64 `json:"memory"`
Schema []string `json:"schema"`
}
DataFrameStats contains statistics about a DataFrame
type DebugConfig ¶
type DebugConfig struct {
Enabled bool `json:"enabled"`
LogLevel LogLevel `json:"log_level"`
ProfileOperations bool `json:"profile_operations"`
TrackMemory bool `json:"track_memory"`
ShowOptimizations bool `json:"show_optimizations"`
OutputFormat OutputFormat `json:"output_format"`
}
DebugConfig configures debug mode settings
type DebugExecutionPlan ¶
type DebugExecutionPlan struct {
RootNode *PlanNode `json:"root"`
Estimated PlanStats `json:"estimated"`
Actual PlanStats `json:"actual,omitempty"`
Metadata DebugPlanMetadata `json:"metadata"`
}
DebugExecutionPlan represents the execution plan for DataFrame operations with debug information
func (*DebugExecutionPlan) RenderJSON ¶
func (plan *DebugExecutionPlan) RenderJSON() ([]byte, error)
RenderJSON renders the execution plan as JSON
func (*DebugExecutionPlan) RenderText ¶
func (plan *DebugExecutionPlan) RenderText() string
RenderText renders the execution plan as text
type DebugPlanMetadata ¶
type DebugPlanMetadata struct {
CreatedAt time.Time `json:"created_at"`
OptimizedAt time.Time `json:"optimized_at,omitempty"`
ExecutedAt time.Time `json:"executed_at,omitempty"`
Optimizations []string `json:"optimizations,omitempty"`
}
DebugPlanMetadata contains metadata about the execution plan
type EstimatedCost ¶
type EstimatedCost struct {
Rows int64 `json:"rows"`
Memory int64 `json:"memory"`
CPU time.Duration `json:"cpu"`
}
EstimatedCost contains estimated cost metrics
type ExecutionPlan ¶
type ExecutionPlan struct {
// contains filtered or unexported fields
}
ExecutionPlan represents a planned query execution with metadata
func CreateExecutionPlan ¶
func CreateExecutionPlan(source *DataFrame, operations []LazyOperation) *ExecutionPlan
CreateExecutionPlan analyzes operations and creates an execution plan
type FilterFusionRule ¶
type FilterFusionRule struct{}
FilterFusionRule combines multiple filter operations into a single operation
func (*FilterFusionRule) Apply ¶
func (r *FilterFusionRule) Apply(plan *ExecutionPlan) *ExecutionPlan
func (*FilterFusionRule) Name ¶
func (r *FilterFusionRule) Name() string
type FilterOperation ¶
type FilterOperation struct {
// contains filtered or unexported fields
}
FilterOperation represents a filter operation
func (*FilterOperation) String ¶
func (f *FilterOperation) String() string
type FunctionExprInterface ¶
FunctionExprInterface defines the interface for function expressions
type GroupBy ¶
type GroupBy struct {
// contains filtered or unexported fields
}
GroupBy represents a grouped DataFrame for aggregation operations
type GroupByOperation ¶
type GroupByOperation struct {
// contains filtered or unexported fields
}
GroupByOperation represents a group by and aggregation operation
func (*GroupByOperation) Apply ¶
func (g *GroupByOperation) Apply(df *DataFrame) (*DataFrame, error)
func (*GroupByOperation) String ¶
func (g *GroupByOperation) String() string
type ISeries ¶
type ISeries interface {
Name() string
Len() int
DataType() arrow.DataType
IsNull(index int) bool
String() string
Array() arrow.Array
Release()
}
ISeries provides a type-erased interface for Series of any type
type JoinOperation ¶
type JoinOperation struct {
// contains filtered or unexported fields
}
JoinOperation represents a join operation
func (*JoinOperation) String ¶
func (j *JoinOperation) String() string
type JoinOptimizer ¶
type JoinOptimizer struct {
// contains filtered or unexported fields
}
JoinOptimizer selects optimal join strategy based on table statistics
func NewJoinOptimizer ¶
func NewJoinOptimizer(left, right *DataFrame) *JoinOptimizer
NewJoinOptimizer creates a new join optimizer with table statistics
func (*JoinOptimizer) SelectStrategy ¶
func (jo *JoinOptimizer) SelectStrategy(leftKeys, rightKeys []string) JoinStrategy
SelectStrategy chooses the optimal join strategy based on table characteristics
type JoinOptions ¶
type JoinOptions struct {
Type JoinType
LeftKey string // Single join key for left DataFrame
RightKey string // Single join key for right DataFrame
LeftKeys []string // Multiple join keys for left DataFrame
RightKeys []string // Multiple join keys for right DataFrame
}
JoinOptions specifies parameters for join operations
type JoinStrategy ¶
type JoinStrategy int
JoinStrategy represents different join algorithms
const ( HashJoinStrategy JoinStrategy = iota BroadcastJoinStrategy MergeJoinStrategy OptimizedHashJoinStrategy )
type LazyFrame ¶
type LazyFrame struct {
// contains filtered or unexported fields
}
LazyFrame holds a DataFrame and a sequence of deferred operations
func (*LazyFrame) Collect ¶
Collect executes all deferred operations and returns the resulting DataFrame
func (*LazyFrame) Explain ¶
func (lf *LazyFrame) Explain() DebugExecutionPlan
Explain generates an execution plan without executing the operations
func (*LazyFrame) ExplainAnalyze ¶
func (lf *LazyFrame) ExplainAnalyze() (DebugExecutionPlan, error)
ExplainAnalyze generates an execution plan and executes it with profiling
func (*LazyFrame) GroupBy ¶
func (lf *LazyFrame) GroupBy(columns ...string) *LazyGroupBy
GroupBy adds a group by and aggregation operation to the lazy frame
func (*LazyFrame) Join ¶
func (lf *LazyFrame) Join(right *LazyFrame, options *JoinOptions) *LazyFrame
Join adds a join operation to the lazy frame
func (*LazyFrame) SafeCollectParallel ¶
SafeCollectParallel executes all deferred operations using memory-safe parallel processing
func (*LazyFrame) SafeCollectParallelWithMonitoring ¶
SafeCollectParallelWithMonitoring executes operations with memory monitoring and adaptive parallelism
type LazyGroupBy ¶
type LazyGroupBy struct {
// contains filtered or unexported fields
}
LazyGroupBy represents a lazy groupby operation that can be followed by aggregations
func (*LazyGroupBy) Agg ¶
func (lgb *LazyGroupBy) Agg(aggregations ...*expr.AggregationExpr) *LazyFrame
Agg performs aggregation operations and returns a new LazyFrame
func (*LazyGroupBy) Count ¶
func (lgb *LazyGroupBy) Count(column string) *LazyFrame
Count creates a count aggregation for the specified column
func (*LazyGroupBy) Max ¶
func (lgb *LazyGroupBy) Max(column string) *LazyFrame
Max creates a max aggregation for the specified column
func (*LazyGroupBy) Mean ¶
func (lgb *LazyGroupBy) Mean(column string) *LazyFrame
Mean creates a mean aggregation for the specified column
func (*LazyGroupBy) Min ¶
func (lgb *LazyGroupBy) Min(column string) *LazyFrame
Min creates a min aggregation for the specified column
func (*LazyGroupBy) Sum ¶
func (lgb *LazyGroupBy) Sum(column string) *LazyFrame
Sum creates a sum aggregation for the specified column
type LazyOperation ¶
LazyOperation represents a deferred operation on a DataFrame
type MemoryStats ¶
type MemoryStats struct {
Before int64 `json:"before"`
After int64 `json:"after"`
Delta int64 `json:"delta"`
}
MemoryStats contains memory usage statistics
type MergeJoinOptimizer ¶
type MergeJoinOptimizer struct {
// contains filtered or unexported fields
}
MergeJoinOptimizer tracks if DataFrames are pre-sorted
func NewMergeJoinOptimizer ¶
func NewMergeJoinOptimizer(leftSorted, rightSorted bool, sortKeys []string) *MergeJoinOptimizer
NewMergeJoinOptimizer creates a merge join optimizer
func (*MergeJoinOptimizer) Join ¶
func (mjo *MergeJoinOptimizer) Join(left, right *DataFrame, options *JoinOptions) (*DataFrame, error)
Join performs optimized merge join
type OperationFusionRule ¶
type OperationFusionRule struct{}
OperationFusionRule combines compatible operations for better performance
func (*OperationFusionRule) Apply ¶
func (r *OperationFusionRule) Apply(plan *ExecutionPlan) *ExecutionPlan
func (*OperationFusionRule) Name ¶
func (r *OperationFusionRule) Name() string
type OperationSummary ¶
type OperationSummary struct {
TotalOperations int `json:"total_operations"`
TotalDuration time.Duration `json:"total_duration"`
TotalMemory int64 `json:"total_memory"`
ParallelOps int `json:"parallel_ops"`
}
OperationSummary contains summary statistics
type OperationTrace ¶
type OperationTrace struct {
ID string `json:"id"`
Operation string `json:"operation"`
Input DataFrameStats `json:"input"`
Output DataFrameStats `json:"output"`
Duration time.Duration `json:"duration"`
Memory MemoryStats `json:"memory"`
Parallel bool `json:"parallel"`
WorkerCount int `json:"worker_count,omitempty"`
Properties map[string]string `json:"properties"`
}
OperationTrace represents a traced operation during execution
type OptimizationRule ¶
type OptimizationRule interface {
Apply(plan *ExecutionPlan) *ExecutionPlan
Name() string
}
OptimizationRule represents a single optimization transformation
type OptimizedHashMap ¶
type OptimizedHashMap struct {
// contains filtered or unexported fields
}
OptimizedHashMap uses xxhash for better performance
func NewOptimizedHashMap ¶
func NewOptimizedHashMap(estimatedSize int) *OptimizedHashMap
NewOptimizedHashMap creates a new optimized hash map
func (*OptimizedHashMap) Get ¶
func (ohm *OptimizedHashMap) Get(key string) ([]int, bool)
Get retrieves values for a key
func (*OptimizedHashMap) Put ¶
func (ohm *OptimizedHashMap) Put(key string, value int)
Put adds a key-value pair to the hash map
type OutputFormat ¶
type OutputFormat int
OutputFormat defines the output format for debug information
const ( OutputFormatText OutputFormat = iota OutputFormatJSON )
type PlanCost ¶
type PlanCost struct {
Estimated EstimatedCost `json:"estimated"`
Actual ActualCost `json:"actual,omitempty"`
}
PlanCost contains cost information for a plan node
type PlanMetadata ¶
type PlanMetadata struct {
// contains filtered or unexported fields
}
PlanMetadata contains analysis information about the execution plan
type PlanNode ¶
type PlanNode struct {
ID string `json:"id"`
Type string `json:"type"` // "Filter", "Select", "GroupBy", etc.
Description string `json:"description"` // Human-readable operation description
Children []*PlanNode `json:"children,omitempty"`
Cost PlanCost `json:"cost"`
Properties map[string]string `json:"properties"`
}
PlanNode represents a node in the execution plan tree
type PlanStats ¶
type PlanStats struct {
TotalRows int64 `json:"total_rows"`
TotalMemory int64 `json:"total_memory"`
TotalDuration time.Duration `json:"total_duration"`
PeakMemory int64 `json:"peak_memory"`
ParallelOps int `json:"parallel_ops"`
}
PlanStats contains overall plan statistics
type PredicatePushdownRule ¶
type PredicatePushdownRule struct{}
PredicatePushdownRule moves filter operations earlier in the pipeline
func (*PredicatePushdownRule) Apply ¶
func (r *PredicatePushdownRule) Apply(plan *ExecutionPlan) *ExecutionPlan
func (*PredicatePushdownRule) Name ¶
func (r *PredicatePushdownRule) Name() string
type ProjectionPushdownRule ¶
type ProjectionPushdownRule struct{}
ProjectionPushdownRule pushes column selections earlier to reduce data processing
func (*ProjectionPushdownRule) Apply ¶
func (r *ProjectionPushdownRule) Apply(plan *ExecutionPlan) *ExecutionPlan
func (*ProjectionPushdownRule) Name ¶
func (r *ProjectionPushdownRule) Name() string
type QueryAnalyzer ¶
type QueryAnalyzer struct {
// contains filtered or unexported fields
}
QueryAnalyzer analyzes and traces query execution
func NewQueryAnalyzer ¶
func NewQueryAnalyzer(config DebugConfig) *QueryAnalyzer
NewQueryAnalyzer creates a new query analyzer
func (*QueryAnalyzer) GenerateReport ¶
func (qa *QueryAnalyzer) GenerateReport() AnalysisReport
GenerateReport generates an analysis report
func (*QueryAnalyzer) TraceOperation ¶
func (qa *QueryAnalyzer) TraceOperation( op string, input *DataFrame, fn func() (*DataFrame, error), ) (*DataFrame, error)
TraceOperation traces an operation execution
type QueryOptimizer ¶
type QueryOptimizer struct {
// contains filtered or unexported fields
}
QueryOptimizer applies optimization rules to improve query performance
func NewQueryOptimizer ¶
func NewQueryOptimizer() *QueryOptimizer
NewQueryOptimizer creates a new optimizer with default rules
func (*QueryOptimizer) Optimize ¶
func (qo *QueryOptimizer) Optimize(plan *ExecutionPlan) *ExecutionPlan
Optimize applies all optimization rules to the execution plan
type SelectOperation ¶
type SelectOperation struct {
// contains filtered or unexported fields
}
SelectOperation represents a column selection operation
func (*SelectOperation) String ¶
func (s *SelectOperation) String() string
type SortOperation ¶
type SortOperation struct {
// contains filtered or unexported fields
}
SortOperation represents a sort operation
func (*SortOperation) String ¶
func (s *SortOperation) String() string
type TableStats ¶
type TableStats struct {
RowCount int
SortedColumns map[string]bool
Cardinality map[string]int // distinct values per column
}
TableStats holds statistics about a DataFrame for optimization decisions
type UnaryExprInterface ¶
UnaryExprInterface defines the interface for unary expressions
type WithColumnOperation ¶
type WithColumnOperation struct {
// contains filtered or unexported fields
}
WithColumnOperation represents adding/modifying a column
func (*WithColumnOperation) Apply ¶
func (w *WithColumnOperation) Apply(df *DataFrame) (*DataFrame, error)
func (*WithColumnOperation) String ¶
func (w *WithColumnOperation) String() string