Documentation ¶

Overview ¶
Package evaluation provides a framework for evaluating LLM outputs.
The evaluation framework consists of:
- Metrics: Interfaces and base types for implementing evaluation metrics
- Scores: Result types for metric evaluations
- Engine: Runs metrics against inputs concurrently
Sub-packages ¶
- heuristic: Rule-based metrics (string matching, JSON validation, text similarity)
- llm: LLM-based judge metrics (relevance, hallucination, factuality)
Basic Usage ¶
import (
	"github.com/agentplexus/go-opik/evaluation"
	"github.com/agentplexus/go-opik/evaluation/heuristic"
)

// Create metrics
metrics := []evaluation.Metric{
	heuristic.NewEquals(false),
	heuristic.NewContains(false),
	heuristic.NewIsJSON(),
}

// Create engine
engine := evaluation.NewEngine(metrics,
	evaluation.WithConcurrency(4),
)

// Create input
input := evaluation.NewMetricInput("What is 2+2?", "4")
input = input.WithExpected("4")

// Evaluate
result := engine.EvaluateOne(ctx, input)
fmt.Printf("Score: %.2f\n", result.AverageScore())
Custom Metrics ¶
type MyMetric struct {
	evaluation.BaseMetric
}

func NewMyMetric() *MyMetric {
	return &MyMetric{
		BaseMetric: evaluation.NewBaseMetric("my_metric"),
	}
}

func (m *MyMetric) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult {
	// Custom evaluation logic
	return evaluation.NewScoreResult(m.Name(), 0.95)
}
Index ¶
- func DefaultInputMapper(inputKey, outputKey, expectedKey string) func(item map[string]any) MetricInput
- type AsyncMetric
- type BaseMetric
- type BatchMetric
- type CompositeMetric
- type ConditionalMetric
- type DatasetEvaluator
- type Engine
- func (e *Engine) EvaluateMany(ctx context.Context, inputs []MetricInput) EvaluationResults
- func (e *Engine) EvaluateOne(ctx context.Context, input MetricInput) *EvaluationResult
- func (e *Engine) EvaluateWithIDs(ctx context.Context, items map[string]MetricInput) EvaluationResults
- func (e *Engine) Metrics() []Metric
- type EngineOption
- type EvaluationCallback
- type EvaluationResult
- type EvaluationResults
- type Metric
- type MetricFunc
- type MetricInput
- func (m MetricInput) Get(key string) (any, bool)
- func (m MetricInput) GetString(key string) string
- func (m MetricInput) GetStringSlice(key string) []string
- func (m MetricInput) WithContext(ctx string) MetricInput
- func (m MetricInput) WithExpected(expected string) MetricInput
- func (m MetricInput) WithMetadata(key string, value any) MetricInput
- type ScoreResult
- func BooleanScore(name string, value bool) *ScoreResult
- func BooleanScoreWithReason(name string, value bool, reason string) *ScoreResult
- func NewFailedScoreResult(name string, err error) *ScoreResult
- func NewScoreResult(name string, value float64) *ScoreResult
- func NewScoreResultWithReason(name string, value float64, reason string) *ScoreResult
- type ScoreResults
- func (r ScoreResults) AllByName(name string) ScoreResults
- func (r ScoreResults) Average() float64
- func (r ScoreResults) AverageByName(name string) float64
- func (r ScoreResults) ByName(name string) *ScoreResult
- func (r ScoreResults) Failed() ScoreResults
- func (r ScoreResults) Successful() ScoreResults
- type WeightedMetric
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func DefaultInputMapper ¶
func DefaultInputMapper(inputKey, outputKey, expectedKey string) func(item map[string]any) MetricInput
DefaultInputMapper creates a default input mapper for common dataset structures.
Types ¶
type AsyncMetric ¶
type AsyncMetric interface {
	Metric

	// ScoreAsync evaluates the metric asynchronously.
	ScoreAsync(ctx context.Context, input MetricInput) <-chan *ScoreResult
}
AsyncMetric is a metric that can be evaluated asynchronously.
type BaseMetric ¶
type BaseMetric struct {
	// contains filtered or unexported fields
}
BaseMetric provides common functionality for metrics.
func NewBaseMetric ¶
func NewBaseMetric(name string) BaseMetric
NewBaseMetric creates a new base metric with the given name.
type BatchMetric ¶
type BatchMetric interface {
	Metric

	// ScoreBatch evaluates multiple inputs and returns results for each.
	ScoreBatch(ctx context.Context, inputs []MetricInput) ScoreResults
}
BatchMetric is a metric that can evaluate multiple inputs at once.
type CompositeMetric ¶
type CompositeMetric struct {
	BaseMetric
	// contains filtered or unexported fields
}
CompositeMetric combines multiple metrics into one.
func NewCompositeMetric ¶
func NewCompositeMetric(name string, metrics ...Metric) *CompositeMetric
NewCompositeMetric creates a new composite metric.
func (*CompositeMetric) Metrics ¶
func (m *CompositeMetric) Metrics() []Metric
Metrics returns the contained metrics.
func (*CompositeMetric) Score ¶
func (m *CompositeMetric) Score(ctx context.Context, input MetricInput) *ScoreResult
Score evaluates all contained metrics and returns the average score.
func (*CompositeMetric) ScoreAll ¶
func (m *CompositeMetric) ScoreAll(ctx context.Context, input MetricInput) ScoreResults
ScoreAll evaluates all contained metrics and returns all results.
type ConditionalMetric ¶
type ConditionalMetric struct {
	BaseMetric
	// contains filtered or unexported fields
}
ConditionalMetric evaluates a metric only if a condition is met.
func NewConditionalMetric ¶
func NewConditionalMetric(name string, condition func(input MetricInput) bool, metric Metric) *ConditionalMetric
NewConditionalMetric creates a new conditional metric.
func (*ConditionalMetric) Score ¶
func (m *ConditionalMetric) Score(ctx context.Context, input MetricInput) *ScoreResult
Score evaluates the metric if the condition is met.
type DatasetEvaluator ¶
type DatasetEvaluator struct {
	// contains filtered or unexported fields
}
DatasetEvaluator evaluates metrics against a dataset.
func NewDatasetEvaluator ¶
func NewDatasetEvaluator(engine *Engine, mapper func(item map[string]any) MetricInput) *DatasetEvaluator
NewDatasetEvaluator creates a new dataset evaluator.
func (*DatasetEvaluator) Evaluate ¶
func (d *DatasetEvaluator) Evaluate(ctx context.Context, items []map[string]any) EvaluationResults
Evaluate evaluates the metrics against dataset items.
type Engine ¶
type Engine struct {
	// contains filtered or unexported fields
}
Engine runs evaluation metrics against data.
func NewEngine ¶
func NewEngine(metrics []Metric, opts ...EngineOption) *Engine
NewEngine creates a new evaluation engine.
func (*Engine) EvaluateMany ¶
func (e *Engine) EvaluateMany(ctx context.Context, inputs []MetricInput) EvaluationResults
EvaluateMany evaluates multiple inputs against all metrics.
func (*Engine) EvaluateOne ¶
func (e *Engine) EvaluateOne(ctx context.Context, input MetricInput) *EvaluationResult
EvaluateOne evaluates a single input against all metrics.
func (*Engine) EvaluateWithIDs ¶
func (e *Engine) EvaluateWithIDs(ctx context.Context, items map[string]MetricInput) EvaluationResults
EvaluateWithIDs evaluates inputs with explicit IDs.
type EngineOption ¶
type EngineOption func(*Engine)
EngineOption configures the evaluation engine.
func WithCallback ¶
func WithCallback(cb EvaluationCallback) EngineOption
WithCallback adds a callback for progress updates.
func WithConcurrency ¶
func WithConcurrency(n int) EngineOption
WithConcurrency sets the number of concurrent evaluations.
type EvaluationCallback ¶
type EvaluationCallback func(completed, total int, result *EvaluationResult)
EvaluationCallback is called during evaluation for progress updates.
type EvaluationResult ¶
type EvaluationResult struct {
	// ItemID is the identifier for the evaluated item.
	ItemID string

	// Input is the input that was evaluated.
	Input MetricInput

	// Scores contains all metric scores for this item.
	Scores ScoreResults

	// Error is set if evaluation failed entirely.
	Error error
}
EvaluationResult represents the result of evaluating a single item.
func EvaluateSingle ¶
func EvaluateSingle(ctx context.Context, metrics []Metric, input MetricInput) *EvaluationResult
EvaluateSingle is a convenience function to evaluate a single input.
func (*EvaluationResult) AverageScore ¶
func (r *EvaluationResult) AverageScore() float64
AverageScore returns the average of all successful scores.
func (*EvaluationResult) IsSuccess ¶
func (r *EvaluationResult) IsSuccess() bool
IsSuccess returns true if evaluation completed without error.
type EvaluationResults ¶
type EvaluationResults []*EvaluationResult
EvaluationResults is a collection of evaluation results.
func Evaluate ¶
func Evaluate(ctx context.Context, metrics []Metric, inputs []MetricInput, opts ...EngineOption) EvaluationResults
Evaluate is a convenience function to evaluate inputs with metrics.
func (EvaluationResults) AverageByMetric ¶
func (r EvaluationResults) AverageByMetric(metricName string) float64
AverageByMetric returns the average score for a specific metric across all items.
func (EvaluationResults) Failed ¶
func (r EvaluationResults) Failed() EvaluationResults
Failed returns only the failed evaluation results.
func (EvaluationResults) Successful ¶
func (r EvaluationResults) Successful() EvaluationResults
Successful returns only the successful evaluation results.
func (EvaluationResults) Summary ¶
func (r EvaluationResults) Summary() map[string]float64
Summary returns a summary of scores by metric name.
type Metric ¶
type Metric interface {
	// Name returns the name of the metric.
	Name() string

	// Score evaluates the metric and returns a score result.
	Score(ctx context.Context, input MetricInput) *ScoreResult
}
Metric is the interface for all evaluation metrics.
type MetricFunc ¶
type MetricFunc struct {
	BaseMetric
	// contains filtered or unexported fields
}
MetricFunc is a function-based metric implementation.
func NewMetricFunc ¶
func NewMetricFunc(name string, fn func(ctx context.Context, input MetricInput) *ScoreResult) *MetricFunc
NewMetricFunc creates a new metric from a function.
func (*MetricFunc) Score ¶
func (m *MetricFunc) Score(ctx context.Context, input MetricInput) *ScoreResult
Score evaluates the metric.
type MetricInput ¶
type MetricInput struct {
	// Input is the input to the LLM/model.
	Input string

	// Output is the output from the LLM/model.
	Output string

	// Expected is the expected/reference output (for comparison metrics).
	Expected string

	// Context is additional context provided to the model.
	Context string

	// Metadata contains additional key-value pairs.
	Metadata map[string]any
}
MetricInput contains the inputs for metric evaluation.
func NewMetricInput ¶
func NewMetricInput(input, output string) MetricInput
NewMetricInput creates a new MetricInput with the given input and output.
func (MetricInput) Get ¶
func (m MetricInput) Get(key string) (any, bool)
Get retrieves a value from metadata.
func (MetricInput) GetString ¶
func (m MetricInput) GetString(key string) string
GetString retrieves a string value from metadata.
func (MetricInput) GetStringSlice ¶
func (m MetricInput) GetStringSlice(key string) []string
GetStringSlice retrieves a string slice from metadata.
func (MetricInput) WithContext ¶
func (m MetricInput) WithContext(ctx string) MetricInput
WithContext returns a copy of the input with the context value set.
func (MetricInput) WithExpected ¶
func (m MetricInput) WithExpected(expected string) MetricInput
WithExpected returns a copy of the input with the expected value set.
func (MetricInput) WithMetadata ¶
func (m MetricInput) WithMetadata(key string, value any) MetricInput
WithMetadata returns a copy of the input with additional metadata.
type ScoreResult ¶
type ScoreResult struct {
	// Name is the name of the metric.
	Name string `json:"name"`

	// Value is the numeric score value (typically 0.0 to 1.0).
	Value float64 `json:"value"`

	// Reason is an optional explanation for the score.
	Reason string `json:"reason,omitempty"`

	// Metadata contains additional information about the score.
	Metadata map[string]any `json:"metadata,omitempty"`

	// Error is set if the metric evaluation failed.
	Error error `json:"error,omitempty"`
}
ScoreResult represents the result of a metric evaluation.
func BooleanScore ¶
func BooleanScore(name string, value bool) *ScoreResult
BooleanScore converts a boolean to a score (1.0 for true, 0.0 for false).
func BooleanScoreWithReason ¶
func BooleanScoreWithReason(name string, value bool, reason string) *ScoreResult
BooleanScoreWithReason converts a boolean to a score with a reason.
func NewFailedScoreResult ¶
func NewFailedScoreResult(name string, err error) *ScoreResult
NewFailedScoreResult creates a new failed score result.
func NewScoreResult ¶
func NewScoreResult(name string, value float64) *ScoreResult
NewScoreResult creates a new successful score result.
func NewScoreResultWithReason ¶
func NewScoreResultWithReason(name string, value float64, reason string) *ScoreResult
NewScoreResultWithReason creates a new score result with a reason.
func (*ScoreResult) IsSuccess ¶
func (s *ScoreResult) IsSuccess() bool
IsSuccess returns true if the score was computed successfully.
func (*ScoreResult) String ¶
func (s *ScoreResult) String() string
String returns a human-readable representation of the score.
func (*ScoreResult) ToJSON ¶
func (s *ScoreResult) ToJSON() ([]byte, error)
ToJSON returns the score as JSON bytes.
type ScoreResults ¶
type ScoreResults []*ScoreResult
ScoreResults is a collection of score results.
func (ScoreResults) AllByName ¶
func (r ScoreResults) AllByName(name string) ScoreResults
AllByName returns all score results with the given name.
func (ScoreResults) Average ¶
func (r ScoreResults) Average() float64
Average returns the average value of successful scores.
func (ScoreResults) AverageByName ¶
func (r ScoreResults) AverageByName(name string) float64
AverageByName returns the average value of scores with the given name.
func (ScoreResults) ByName ¶
func (r ScoreResults) ByName(name string) *ScoreResult
ByName returns the first score result with the given name.
func (ScoreResults) Failed ¶
func (r ScoreResults) Failed() ScoreResults
Failed returns only the failed score results.
func (ScoreResults) Successful ¶
func (r ScoreResults) Successful() ScoreResults
Successful returns only the successful score results.
type WeightedMetric ¶
type WeightedMetric struct {
	// contains filtered or unexported fields
}
WeightedMetric applies a weight to a metric's score.
func NewWeightedMetric ¶
func NewWeightedMetric(metric Metric, weight float64) *WeightedMetric
NewWeightedMetric creates a new weighted metric.
func (*WeightedMetric) Name ¶
func (m *WeightedMetric) Name() string
Name returns the name of the underlying metric.
func (*WeightedMetric) Score ¶
func (m *WeightedMetric) Score(ctx context.Context, input MetricInput) *ScoreResult
Score evaluates the metric and applies the weight.
func (*WeightedMetric) Weight ¶
func (m *WeightedMetric) Weight() float64
Weight returns the weight factor.
Directories ¶

| Path | Synopsis |
|---|---|
| heuristic | Package heuristic provides rule-based evaluation metrics that don't require LLM calls. |
| llm | Package llm provides LLM-based evaluation metrics. |