package evaluation // import "github.com/agentplexus/go-opik/evaluation"

Documentation

Overview

Package evaluation provides a framework for evaluating LLM outputs.

The evaluation framework consists of:

  • Metrics: Interfaces and base types for implementing evaluation metrics
  • Scores: Result types for metric evaluations
  • Engine: Runs metrics against inputs concurrently

Sub-packages

  • heuristic: Rule-based metrics (string matching, JSON validation, text similarity)
  • llm: LLM-based judge metrics (relevance, hallucination, factuality)

Basic Usage

import (
    "context"
    "fmt"

    "github.com/agentplexus/go-opik/evaluation"
    "github.com/agentplexus/go-opik/evaluation/heuristic"
)

// Create metrics
metrics := []evaluation.Metric{
    heuristic.NewEquals(false),
    heuristic.NewContains(false),
    heuristic.NewIsJSON(),
}

// Create engine
engine := evaluation.NewEngine(metrics,
    evaluation.WithConcurrency(4),
)

// Create input
input := evaluation.NewMetricInput("What is 2+2?", "4")
input = input.WithExpected("4")

// Evaluate
ctx := context.Background()
result := engine.EvaluateOne(ctx, input)
fmt.Printf("Score: %.2f\n", result.AverageScore())

Custom Metrics

type MyMetric struct {
    evaluation.BaseMetric
}

func NewMyMetric() *MyMetric {
    return &MyMetric{
        BaseMetric: evaluation.NewBaseMetric("my_metric"),
    }
}

func (m *MyMetric) Score(ctx context.Context, input evaluation.MetricInput) *evaluation.ScoreResult {
    // Custom evaluation logic
    return evaluation.NewScoreResult(m.Name(), 0.95)
}

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func DefaultInputMapper

func DefaultInputMapper(inputKey, outputKey, expectedKey string) func(item map[string]any) MetricInput

DefaultInputMapper creates a default input mapper for common dataset structures.
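
For example, one might map dataset items whose keys happen to be named
"input", "output", and "expected" (the key names here are illustrative):

mapper := evaluation.DefaultInputMapper("input", "output", "expected")
in := mapper(map[string]any{
    "input":    "What is 2+2?",
    "output":   "4",
    "expected": "4",
})
// in is a MetricInput ready to be passed to an Engine.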

Types

type AsyncMetric

type AsyncMetric interface {
	Metric
	// ScoreAsync evaluates the metric asynchronously.
	ScoreAsync(ctx context.Context, input MetricInput) <-chan *ScoreResult
}

AsyncMetric is a metric that can be evaluated asynchronously.
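
A minimal sketch of satisfying AsyncMetric by running a synchronous
metric in a goroutine (the wrapper type is hypothetical, not part of
this package):

type asyncWrapper struct {
    evaluation.Metric
}

func (a asyncWrapper) ScoreAsync(ctx context.Context, input evaluation.MetricInput) <-chan *evaluation.ScoreResult {
    ch := make(chan *evaluation.ScoreResult, 1)
    go func() {
        defer close(ch)
        ch <- a.Score(ctx, input) // delegate to the embedded synchronous metric
    }()
    return ch
}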

type BaseMetric

type BaseMetric struct {
	// contains filtered or unexported fields
}

BaseMetric provides common functionality for metrics.

func NewBaseMetric

func NewBaseMetric(name string) BaseMetric

NewBaseMetric creates a new base metric with the given name.

func (BaseMetric) Name

func (m BaseMetric) Name() string

Name returns the name of the metric.

type BatchMetric

type BatchMetric interface {
	Metric
	// ScoreBatch evaluates multiple inputs and returns results for each.
	ScoreBatch(ctx context.Context, inputs []MetricInput) ScoreResults
}

BatchMetric is a metric that can evaluate multiple inputs at once.
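
When no batch-level optimization is available, ScoreBatch can simply
loop over Score; a sketch reusing MyMetric from the Overview:

func (m *MyMetric) ScoreBatch(ctx context.Context, inputs []evaluation.MetricInput) evaluation.ScoreResults {
    results := make(evaluation.ScoreResults, 0, len(inputs))
    for _, in := range inputs {
        results = append(results, m.Score(ctx, in))
    }
    return results
}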

type CompositeMetric

type CompositeMetric struct {
	BaseMetric
	// contains filtered or unexported fields
}

CompositeMetric combines multiple metrics into one.

func NewCompositeMetric

func NewCompositeMetric(name string, metrics ...Metric) *CompositeMetric

NewCompositeMetric creates a new composite metric.

func (*CompositeMetric) Metrics

func (m *CompositeMetric) Metrics() []Metric

Metrics returns the contained metrics.

func (*CompositeMetric) Score

func (m *CompositeMetric) Score(ctx context.Context, input MetricInput) *ScoreResult

Score evaluates all contained metrics and returns the average score.

func (*CompositeMetric) ScoreAll

func (m *CompositeMetric) ScoreAll(ctx context.Context, input MetricInput) ScoreResults

ScoreAll evaluates all contained metrics and returns all results.
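
For example, combining two of the heuristic metrics from Basic Usage
into one metric (the composite name is illustrative):

composite := evaluation.NewCompositeMetric("text_checks",
    heuristic.NewEquals(false),
    heuristic.NewIsJSON(),
)
avg := composite.Score(ctx, input)    // single averaged result
all := composite.ScoreAll(ctx, input) // one ScoreResult per contained metric
fmt.Printf("%.2f across %d metrics\n", avg.Value, len(all))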

type ConditionalMetric

type ConditionalMetric struct {
	BaseMetric
	// contains filtered or unexported fields
}

ConditionalMetric evaluates a metric only if a condition is met.

func NewConditionalMetric

func NewConditionalMetric(name string, condition func(input MetricInput) bool, metric Metric) *ConditionalMetric

NewConditionalMetric creates a new conditional metric.

func (*ConditionalMetric) Score

func (m *ConditionalMetric) Score(ctx context.Context, input MetricInput) *ScoreResult

Score evaluates the metric if the condition is met.
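
A sketch that runs an equality check only when a reference answer is
present (the condition and metric name are illustrative):

hasExpected := func(in evaluation.MetricInput) bool { return in.Expected != "" }
cond := evaluation.NewConditionalMetric("equals_if_expected",
    hasExpected, heuristic.NewEquals(false))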

type DatasetEvaluator

type DatasetEvaluator struct {
	// contains filtered or unexported fields
}

DatasetEvaluator evaluates metrics against a dataset.

func NewDatasetEvaluator

func NewDatasetEvaluator(engine *Engine, mapper func(item map[string]any) MetricInput) *DatasetEvaluator

NewDatasetEvaluator creates a new dataset evaluator.

func (*DatasetEvaluator) Evaluate

func (d *DatasetEvaluator) Evaluate(ctx context.Context, items []map[string]any) EvaluationResults

Evaluate evaluates the metrics against dataset items.
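
Putting the pieces together with DefaultInputMapper (the item keys are
illustrative, and engine is an *Engine built as in Basic Usage):

mapper := evaluation.DefaultInputMapper("input", "output", "expected")
evaluator := evaluation.NewDatasetEvaluator(engine, mapper)
results := evaluator.Evaluate(ctx, items) // items is a []map[string]any
fmt.Println(results.Summary())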

type Engine

type Engine struct {
	// contains filtered or unexported fields
}

Engine runs evaluation metrics against data.

func NewEngine

func NewEngine(metrics []Metric, opts ...EngineOption) *Engine

NewEngine creates a new evaluation engine.

func (*Engine) EvaluateMany

func (e *Engine) EvaluateMany(ctx context.Context, inputs []MetricInput) EvaluationResults

EvaluateMany evaluates multiple inputs against all metrics.

func (*Engine) EvaluateOne

func (e *Engine) EvaluateOne(ctx context.Context, input MetricInput) *EvaluationResult

EvaluateOne evaluates a single input against all metrics.

func (*Engine) EvaluateWithIDs

func (e *Engine) EvaluateWithIDs(ctx context.Context, items map[string]MetricInput) EvaluationResults

EvaluateWithIDs evaluates inputs with explicit IDs.

func (*Engine) Metrics

func (e *Engine) Metrics() []Metric

Metrics returns the metrics used by this engine.

type EngineOption

type EngineOption func(*Engine)

EngineOption configures the evaluation engine.

func WithCallback

func WithCallback(cb EvaluationCallback) EngineOption

WithCallback adds a callback for progress updates.

func WithConcurrency

func WithConcurrency(n int) EngineOption

WithConcurrency sets the number of concurrent evaluations.

type EvaluationCallback

type EvaluationCallback func(completed, total int, result *EvaluationResult)

EvaluationCallback is called during evaluation for progress updates.
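
A sketch wiring a progress callback into the engine:

engine := evaluation.NewEngine(metrics,
    evaluation.WithConcurrency(4),
    evaluation.WithCallback(func(completed, total int, result *evaluation.EvaluationResult) {
        fmt.Printf("%d/%d (item %s): %.2f\n", completed, total, result.ItemID, result.AverageScore())
    }),
)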

type EvaluationResult

type EvaluationResult struct {
	// ItemID is the identifier for the evaluated item.
	ItemID string
	// Input is the input that was evaluated.
	Input MetricInput
	// Scores contains all metric scores for this item.
	Scores ScoreResults
	// Error is set if evaluation failed entirely.
	Error error
}

EvaluationResult represents the result of evaluating a single item.

func EvaluateSingle

func EvaluateSingle(ctx context.Context, metrics []Metric, input MetricInput) *EvaluationResult

EvaluateSingle is a convenience function to evaluate a single input.

func (*EvaluationResult) AverageScore

func (r *EvaluationResult) AverageScore() float64

AverageScore returns the average of all successful scores.

func (*EvaluationResult) IsSuccess

func (r *EvaluationResult) IsSuccess() bool

IsSuccess returns true if evaluation completed without error.

type EvaluationResults

type EvaluationResults []*EvaluationResult

EvaluationResults is a collection of evaluation results.

func Evaluate

func Evaluate(ctx context.Context, metrics []Metric, inputs []MetricInput, opts ...EngineOption) EvaluationResults

Evaluate is a convenience function to evaluate inputs with metrics.
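
A one-shot run without constructing an Engine explicitly (a sketch):

results := evaluation.Evaluate(ctx, metrics, inputs,
    evaluation.WithConcurrency(8),
)
fmt.Printf("%d of %d items evaluated without error\n",
    len(results.Successful()), len(results))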

func (EvaluationResults) AverageByMetric

func (r EvaluationResults) AverageByMetric(metricName string) float64

AverageByMetric returns the average score for a specific metric across all items.

func (EvaluationResults) Failed

func (r EvaluationResults) Failed() EvaluationResults

Failed returns only the failed evaluation results.

func (EvaluationResults) Successful

func (r EvaluationResults) Successful() EvaluationResults

Successful returns only the successful evaluation results.

func (EvaluationResults) Summary

func (r EvaluationResults) Summary() map[string]float64

Summary returns a summary of scores by metric name.
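
For example, printing per-metric averages after a run (the metric name
passed to AverageByMetric is illustrative):

for name, avg := range results.Summary() {
    fmt.Printf("%-12s %.3f\n", name, avg)
}
fmt.Println(results.AverageByMetric("equals"))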

type Metric

type Metric interface {
	// Name returns the name of the metric.
	Name() string
	// Score evaluates the metric and returns a score result.
	Score(ctx context.Context, input MetricInput) *ScoreResult
}

Metric is the interface for all evaluation metrics.

type MetricFunc

type MetricFunc struct {
	BaseMetric
	// contains filtered or unexported fields
}

MetricFunc is a function-based metric implementation.

func NewMetricFunc

func NewMetricFunc(name string, fn func(ctx context.Context, input MetricInput) *ScoreResult) *MetricFunc

NewMetricFunc creates a new metric from a function.

func (*MetricFunc) Score

func (m *MetricFunc) Score(ctx context.Context, input MetricInput) *ScoreResult

Score evaluates the metric.
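
MetricFunc allows ad-hoc metrics without declaring a type; a sketch that
flags overly long outputs (the name and threshold are arbitrary):

shortOutput := evaluation.NewMetricFunc("short_output",
    func(ctx context.Context, in evaluation.MetricInput) *evaluation.ScoreResult {
        return evaluation.BooleanScore("short_output", len(in.Output) <= 200)
    },
)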

type MetricInput

type MetricInput struct {
	// Input is the input to the LLM/model.
	Input string
	// Output is the output from the LLM/model.
	Output string
	// Expected is the expected/reference output (for comparison metrics).
	Expected string
	// Context is additional context provided to the model.
	Context string
	// Metadata contains additional key-value pairs.
	Metadata map[string]any
}

MetricInput contains the inputs for metric evaluation.

func NewMetricInput

func NewMetricInput(input, output string) MetricInput

NewMetricInput creates a new MetricInput with the given input and output.

func (MetricInput) Get

func (m MetricInput) Get(key string) (any, bool)

Get retrieves a value from metadata.

func (MetricInput) GetString

func (m MetricInput) GetString(key string) string

GetString retrieves a string value from metadata.

func (MetricInput) GetStringSlice

func (m MetricInput) GetStringSlice(key string) []string

GetStringSlice retrieves a string slice from metadata.

func (MetricInput) WithContext

func (m MetricInput) WithContext(ctx string) MetricInput

WithContext returns a copy of the input with the context value set.

func (MetricInput) WithExpected

func (m MetricInput) WithExpected(expected string) MetricInput

WithExpected returns a copy of the input with the expected value set.

func (MetricInput) WithMetadata

func (m MetricInput) WithMetadata(key string, value any) MetricInput

WithMetadata returns a copy of the input with additional metadata.
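
Because the With* methods return copies, calls chain naturally (the
values here are illustrative):

in := evaluation.NewMetricInput("What is the capital of France?", "Paris").
    WithExpected("Paris").
    WithContext("Geography QA set, chapter 3.").
    WithMetadata("source", "demo-dataset")
fmt.Println(in.GetString("source")) // "demo-dataset"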

type ScoreResult

type ScoreResult struct {
	// Name is the name of the metric.
	Name string `json:"name"`
	// Value is the numeric score value (typically 0.0 to 1.0).
	Value float64 `json:"value"`
	// Reason is an optional explanation for the score.
	Reason string `json:"reason,omitempty"`
	// Metadata contains additional information about the score.
	Metadata map[string]any `json:"metadata,omitempty"`
	// Error is set if the metric evaluation failed.
	Error error `json:"error,omitempty"`
}

ScoreResult represents the result of a metric evaluation.

func BooleanScore

func BooleanScore(name string, value bool) *ScoreResult

BooleanScore converts a boolean to a score (1.0 for true, 0.0 for false).

func BooleanScoreWithReason

func BooleanScoreWithReason(name string, value bool, reason string) *ScoreResult

BooleanScoreWithReason converts a boolean to a score with a reason.

func NewFailedScoreResult

func NewFailedScoreResult(name string, err error) *ScoreResult

NewFailedScoreResult creates a new failed score result.

func NewScoreResult

func NewScoreResult(name string, value float64) *ScoreResult

NewScoreResult creates a new successful score result.

func NewScoreResultWithReason

func NewScoreResultWithReason(name string, value float64, reason string) *ScoreResult

NewScoreResultWithReason creates a new score result with a reason.
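
A variant of the Overview's MyMetric.Score showing the failure and
boolean constructors (the error condition is illustrative; assumes
"errors" is imported):

func (m *MyMetric) Score(ctx context.Context, in evaluation.MetricInput) *evaluation.ScoreResult {
    if in.Expected == "" {
        return evaluation.NewFailedScoreResult(m.Name(), errors.New("no expected value provided"))
    }
    return evaluation.BooleanScoreWithReason(m.Name(),
        in.Output == in.Expected, "exact comparison against Expected")
}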

func (*ScoreResult) IsSuccess

func (s *ScoreResult) IsSuccess() bool

IsSuccess returns true if the score was computed successfully.

func (*ScoreResult) String

func (s *ScoreResult) String() string

String returns a human-readable representation of the score.

func (*ScoreResult) ToJSON

func (s *ScoreResult) ToJSON() ([]byte, error)

ToJSON returns the score as JSON bytes.

type ScoreResults

type ScoreResults []*ScoreResult

ScoreResults is a collection of score results.

func (ScoreResults) AllByName

func (r ScoreResults) AllByName(name string) ScoreResults

AllByName returns all score results with the given name.

func (ScoreResults) Average

func (r ScoreResults) Average() float64

Average returns the average value of successful scores.

func (ScoreResults) AverageByName

func (r ScoreResults) AverageByName(name string) float64

AverageByName returns the average value of scores with the given name.

func (ScoreResults) ByName

func (r ScoreResults) ByName(name string) *ScoreResult

ByName returns the first score result with the given name.

func (ScoreResults) Failed

func (r ScoreResults) Failed() ScoreResults

Failed returns only the failed score results.

func (ScoreResults) Successful

func (r ScoreResults) Successful() ScoreResults

Successful returns only the successful score results.
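
A sketch inspecting the scores of a single EvaluationResult (the metric
name is illustrative, and this assumes ByName returns nil when no score
matches):

for _, s := range result.Scores.Failed() {
    fmt.Printf("metric %s failed: %v\n", s.Name, s.Error)
}
if eq := result.Scores.ByName("equals"); eq != nil {
    fmt.Println(eq.String())
}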

type WeightedMetric

type WeightedMetric struct {
	// contains filtered or unexported fields
}

WeightedMetric applies a weight to a metric's score.

func NewWeightedMetric

func NewWeightedMetric(metric Metric, weight float64) *WeightedMetric

NewWeightedMetric creates a new weighted metric.

func (*WeightedMetric) Name

func (m *WeightedMetric) Name() string

Name returns the name of the underlying metric.

func (*WeightedMetric) Score

func (m *WeightedMetric) Score(ctx context.Context, input MetricInput) *ScoreResult

Score evaluates the metric and applies the weight.

func (*WeightedMetric) Weight

func (m *WeightedMetric) Weight() float64

Weight returns the weight factor.
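
A sketch giving one metric extra influence (the weight value is
arbitrary):

strict := evaluation.NewWeightedMetric(heuristic.NewEquals(false), 2.0)
fmt.Println(strict.Name(), strict.Weight())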

Directories

Path       Synopsis
heuristic  Package heuristic provides rule-based evaluation metrics that don't require LLM calls.
llm        Package llm provides LLM-based evaluation metrics.
