eval

package
v1.4.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 12, 2026 License: Apache-2.0 Imports: 6 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type Comparison

// Comparison holds a model-to-summary mapping.
type Comparison struct {
	// Models associates each map key with that entry's Summary; it is
	// serialized under the JSON key "models".
	// NOTE(review): keys are presumably the model names passed to
	// CompareModels — confirm against the implementation.
	Models map[string]Summary `json:"models"`
}

Comparison holds a model-to-summary mapping.

func (Comparison) JSON

func (c Comparison) JSON() []byte

type Evaluator

// Evaluator runs scenarios against models and aggregates metrics.
// It declares no fields, so the zero value carries no configuration.
type Evaluator struct{}

Evaluator runs scenarios against models and aggregates metrics.

func (*Evaluator) CompareModels

func (e *Evaluator) CompareModels(ctx context.Context, modelsMap map[string]models.Model, scenarios []Scenario) (map[string][]RunMetrics, Comparison)

CompareModels runs the same scenarios on multiple models and returns a comparison.

func (*Evaluator) EvaluateModel

func (e *Evaluator) EvaluateModel(ctx context.Context, m models.Model, scenarios []Scenario) ([]RunMetrics, Summary)

EvaluateModel runs scenarios on a single model.

type RunMetrics

// RunMetrics captures metrics for a single evaluation run.
type RunMetrics struct {
	// Model identifies the model the run was executed against.
	// NOTE(review): presumably the model's name — confirm against the evaluator.
	Model            string        `json:"model"`
	// Duration is the elapsed time of the run. time.Duration has no custom
	// JSON marshaler, so encoding/json writes it as an integer count of
	// nanoseconds.
	Duration         time.Duration `json:"duration"`
	// PromptTokens is the prompt token count recorded for the run.
	PromptTokens     int           `json:"prompt_tokens"`
	// CompletionTokens is the completion token count recorded for the run.
	CompletionTokens int           `json:"completion_tokens"`
	// TotalTokens is the total token count recorded for the run.
	// NOTE(review): presumably PromptTokens + CompletionTokens — confirm.
	TotalTokens      int           `json:"total_tokens"`
	// ToolCalls is the number of tool calls recorded for the run.
	ToolCalls        int           `json:"tool_calls"`
	// Success reports whether the run was counted as successful.
	Success          bool          `json:"success"`
	// Error holds an error message for the run; it is omitted from JSON
	// when empty.
	Error            string        `json:"error,omitempty"`
	// Timestamp is the time associated with the run.
	// NOTE(review): unclear whether this is the start or end time — confirm.
	Timestamp        time.Time     `json:"timestamp"`
}

RunMetrics captures metrics for a single evaluation run.

type Scenario

// Scenario represents a simple evaluation case.
type Scenario struct {
	// Input is the scenario's input text.
	// NOTE(review): presumably sent to the model as the prompt — confirm.
	Input string
	// ExpectedContains is the substring the model output is checked for
	// (a minimal oracle).
	// NOTE(review): behavior when this is empty is not visible here — confirm.
	ExpectedContains string
}

Scenario represents a simple evaluation case.

type Summary

// Summary aggregates metrics across runs.
type Summary struct {
	// Runs is the number of runs aggregated.
	Runs            int                `json:"runs"`
	// Successes is the number of runs counted as successful.
	Successes       int                `json:"successes"`
	// Failures is the number of runs counted as failed.
	Failures        int                `json:"failures"`
	// SuccessRate is the success rate across the aggregated runs.
	// NOTE(review): unclear whether this is a 0-1 fraction or a 0-100
	// percentage — confirm against the aggregation code.
	SuccessRate     float64            `json:"success_rate"`
	// AvgLatencyMS is the average run latency; the name and JSON key
	// indicate milliseconds.
	AvgLatencyMS    float64            `json:"avg_latency_ms"`
	// AvgPromptTokens is the average prompt token count per run.
	AvgPromptTokens float64            `json:"avg_prompt_tokens"`
	// AvgCompletion is the average completion token count per run; note the
	// JSON key is "avg_completion_tokens".
	AvgCompletion   float64            `json:"avg_completion_tokens"`
	// AvgTotalTokens is the average total token count per run.
	AvgTotalTokens  float64            `json:"avg_total_tokens"`
	// TotalToolCalls is the sum of tool calls across the aggregated runs.
	TotalToolCalls  int                `json:"total_tool_calls"`
	// ByModel optionally holds a nested Summary per map key; it is omitted
	// from JSON when nil or empty.
	// NOTE(review): keys are presumably model names — confirm.
	ByModel         map[string]Summary `json:"by_model,omitempty"`
}

Summary aggregates metrics across runs.

func (Summary) JSON

func (s Summary) JSON() []byte

JSON returns the Summary encoded as JSON for use in reports.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL