evaluation

package
v0.5.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Feb 14, 2026 License: MIT Imports: 11 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CompareModels

func CompareModels(benchmarks ...*ModelBenchmark) map[string]any

CompareModels compares benchmark results between models

func ExportResults

func ExportResults(benchmark *ModelBenchmark, filename string) error

ExportResults exports benchmark results to JSON

func FormatReport

func FormatReport(report *BenchmarkReport, format OutputFormat, w io.Writer) error

FormatReport formats a benchmark report in the specified format.

func SaveReport

func SaveReport(report *BenchmarkReport, path string) error

SaveReport saves a benchmark report to a JSON file.

Types

type BaselineCompare

// BaselineCompare represents a comparison of a benchmark against a baseline.
//
// BaselineCommit and BaselineDate describe the baseline being compared to.
// SuccessRateDiff, LatencyDiffMs (milliseconds, per the field name) and
// TokensDiff carry numeric differences; the sign convention (current minus
// baseline or the reverse) is not visible here — TODO confirm against
// CompareWithBaseline. Regressions and Improvements are tagged omitempty and
// are left out of the JSON encoding when empty, while HasRegression is always
// emitted.
type BaselineCompare struct {
	BaselineCommit  string    `json:"baseline_commit"`
	BaselineDate    time.Time `json:"baseline_date"`
	SuccessRateDiff float64   `json:"success_rate_diff"`
	LatencyDiffMs   int64     `json:"latency_diff_ms"`
	TokensDiff      int       `json:"tokens_diff"`
	Regressions     []string  `json:"regressions,omitempty"`
	Improvements    []string  `json:"improvements,omitempty"`
	HasRegression   bool      `json:"has_regression"`
}

BaselineCompare represents a comparison against a baseline.

func CompareWithBaseline

func CompareWithBaseline(current, baseline *ModelBenchmark, baselineCommit string, baselineDate time.Time) *BaselineCompare

CompareWithBaseline compares the current benchmark with a baseline.

type BenchmarkReport

// BenchmarkReport represents a complete benchmark report with metadata.
//
// Version, GeneratedAt and Benchmark are always present in the JSON encoding.
// GitCommit, GitBranch, Environment and Comparison are tagged omitempty and
// are left out when they hold their zero value (empty string or nil pointer).
// Benchmark is a pointer without omitempty, so a nil Benchmark encodes as
// JSON null rather than being dropped.
type BenchmarkReport struct {
	Version     string           `json:"version"`
	GeneratedAt time.Time        `json:"generated_at"`
	GitCommit   string           `json:"git_commit,omitempty"`
	GitBranch   string           `json:"git_branch,omitempty"`
	Environment string           `json:"environment,omitempty"`
	Benchmark   *ModelBenchmark  `json:"benchmark"`
	Comparison  *BaselineCompare `json:"comparison,omitempty"`
}

BenchmarkReport represents a complete benchmark report with metadata

func LoadReport

func LoadReport(path string) (*BenchmarkReport, error)

LoadReport loads a benchmark report from a JSON file.

type BenchmarkResult

// BenchmarkResult represents the result of a single test.
//
// TestCase is held by value rather than by pointer. ParsedResult and
// ErrorMessage are tagged omitempty and are omitted from JSON when nil or
// empty. Latency is a time.Duration; with encoding/json's default handling
// (no custom marshaler is visible here) it is written as an integer count of
// nanoseconds.
// NOTE(review): what Iterations counts is not visible here — confirm against
// the evaluator before relying on it.
type BenchmarkResult struct {
	TestCase     TestCase            `json:"test_case"`
	Success      bool                `json:"success"`
	ActualOutput string              `json:"actual_output"`
	ParsedResult *parser.ParseResult `json:"parsed_result,omitempty"`
	Latency      time.Duration       `json:"latency"`
	TokensUsed   int                 `json:"tokens_used"`
	Iterations   int                 `json:"iterations"`
	ErrorMessage string              `json:"error_message,omitempty"`
	Timestamp    time.Time           `json:"timestamp"`
}

BenchmarkResult represents the result of a single test

type BenchmarkSummary

// BenchmarkSummary provides summary statistics.
//
// AverageLatency, MedianLatency and P95Latency are time.Duration values; with
// encoding/json's default handling (no custom marshaler is visible here) each
// is written as an integer count of nanoseconds. CategoryBreakdown and
// DifficultyBreakdown map a string key to per-group statistics — presumably
// keyed by a test case's category and difficulty; verify against the code
// that fills them. Neither map is tagged omitempty, so a nil map encodes as
// JSON null.
// NOTE(review): whether SuccessRate is a 0–1 fraction or a 0–100 percentage
// is not visible here — confirm.
type BenchmarkSummary struct {
	TotalTests          int                         `json:"total_tests"`
	PassedTests         int                         `json:"passed_tests"`
	FailedTests         int                         `json:"failed_tests"`
	SuccessRate         float64                     `json:"success_rate"`
	AverageLatency      time.Duration               `json:"average_latency"`
	MedianLatency       time.Duration               `json:"median_latency"`
	P95Latency          time.Duration               `json:"p95_latency"`
	TotalTokens         int                         `json:"total_tokens"`
	AverageIterations   float64                     `json:"average_iterations"`
	CategoryBreakdown   map[string]*CategoryStats   `json:"category_breakdown"`
	DifficultyBreakdown map[string]*DifficultyStats `json:"difficulty_breakdown"`
}

BenchmarkSummary provides summary statistics

type CategoryStats

// CategoryStats provides statistics per category.
//
// Total and Passed are integer counts and SuccessRate is a float64; the scale
// of SuccessRate (fraction vs. percentage) is not visible here — TODO confirm.
type CategoryStats struct {
	Total       int     `json:"total"`
	Passed      int     `json:"passed"`
	SuccessRate float64 `json:"success_rate"`
}

CategoryStats provides statistics per category

type DifficultyStats

// DifficultyStats provides statistics per difficulty level.
//
// It has the same Total, Passed and SuccessRate fields as CategoryStats plus
// AvgLatency, a time.Duration that encoding/json writes as an integer count of
// nanoseconds by default (no custom marshaler is visible here). The scale of
// SuccessRate (fraction vs. percentage) is not visible here — TODO confirm.
type DifficultyStats struct {
	Total       int           `json:"total"`
	Passed      int           `json:"passed"`
	SuccessRate float64       `json:"success_rate"`
	AvgLatency  time.Duration `json:"avg_latency"`
}

DifficultyStats provides statistics per difficulty level

type Evaluator

// Evaluator runs benchmarks on LLM providers.
//
// Its fields are not shown in this listing (filtered or unexported); values
// are obtained from NewEvaluator.
type Evaluator struct {
	// contains filtered or unexported fields
}

Evaluator runs benchmarks on LLM providers

func NewEvaluator

func NewEvaluator(provider provider.Provider, modelName string) *Evaluator

NewEvaluator creates a new evaluator

func (*Evaluator) RunBenchmark

func (e *Evaluator) RunBenchmark(ctx context.Context, suite *TestSuite, modelName string) (*ModelBenchmark, error)

RunBenchmark runs a complete benchmark suite

type ModelBenchmark

// ModelBenchmark represents benchmark results for a model.
//
// Results holds the individual BenchmarkResult values and Summary the
// aggregate statistics, both stored by value. TestSuite is a plain string
// rather than a *TestSuite — presumably the suite's name; verify against
// RunBenchmark. StartTime and EndTime are time.Time values, which
// encoding/json writes as RFC 3339 strings.
type ModelBenchmark struct {
	ModelName string            `json:"model_name"`
	TestSuite string            `json:"test_suite"`
	Results   []BenchmarkResult `json:"results"`
	Summary   BenchmarkSummary  `json:"summary"`
	StartTime time.Time         `json:"start_time"`
	EndTime   time.Time         `json:"end_time"`
}

ModelBenchmark represents benchmark results for a model

type OutputFormat

// OutputFormat represents an output format type. It is a string type, so a
// value can be produced by converting a plain string.
type OutputFormat string

OutputFormat represents output format type

// OutputFormat values declared by this package. Each constant's underlying
// string is the lowercase name shown on its line.
const (
	FormatJSON     OutputFormat = "json"
	FormatMarkdown OutputFormat = "markdown"
	FormatText     OutputFormat = "text"
)

type TestCase

// TestCase represents a single test case for evaluation.
//
// ExpectedType is "tool_call" or "direct_answer" and Difficulty is easy,
// medium or hard, as the inline notes state. ExpectedTool and ExpectedArgs
// are tagged omitempty and are omitted from JSON when empty — presumably
// they only apply to "tool_call" cases; verify against the evaluator. Tags
// has no omitempty, so a nil slice encodes as JSON null.
type TestCase struct {
	ID           string         `json:"id"`
	Name         string         `json:"name"`
	Description  string         `json:"description"`
	Input        string         `json:"input"`
	ExpectedTool string         `json:"expected_tool,omitempty"`
	ExpectedArgs map[string]any `json:"expected_args,omitempty"`
	ExpectedType string         `json:"expected_type"` // "tool_call" or "direct_answer"
	Category     string         `json:"category"`
	Difficulty   string         `json:"difficulty"` // easy, medium, hard
	Tags         []string       `json:"tags"`
}

TestCase represents a single test case for evaluation

type TestSuite

// TestSuite represents a collection of test cases.
//
// TestCases stores the cases by value in a slice; it has no omitempty tag, so
// a nil slice encodes as JSON null.
type TestSuite struct {
	Name      string     `json:"name"`
	Version   string     `json:"version"`
	TestCases []TestCase `json:"test_cases"`
}

TestSuite represents a collection of test cases

func GetDefaultTestSuite

func GetDefaultTestSuite() *TestSuite

GetDefaultTestSuite returns a comprehensive test suite

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL