Documentation ¶
Index ¶
- func CompareModels(benchmarks ...*ModelBenchmark) map[string]any
- func ExportResults(benchmark *ModelBenchmark, filename string) error
- func FormatReport(report *BenchmarkReport, format OutputFormat, w io.Writer) error
- func SaveReport(report *BenchmarkReport, path string) error
- type BaselineCompare
- type BenchmarkReport
- type BenchmarkResult
- type BenchmarkSummary
- type CategoryStats
- type DifficultyStats
- type Evaluator
- type ModelBenchmark
- type OutputFormat
- type TestCase
- type TestSuite
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CompareModels ¶
func CompareModels(benchmarks ...*ModelBenchmark) map[string]any
CompareModels compares benchmark results between models
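An illustrative sketch (not part of the package's documented examples): compare two completed benchmark runs and print the returned map. The field values below are made up for illustration; in practice both benchmarks come from Evaluator.RunBenchmark, and identifiers are shown unqualified as they would appear from within the package.

	// Two hand-built benchmark runs; normally produced by RunBenchmark.
	b1 := &ModelBenchmark{
		ModelName: "model-a",
		Summary:   BenchmarkSummary{TotalTests: 10, PassedTests: 8, SuccessRate: 0.8},
	}
	b2 := &ModelBenchmark{
		ModelName: "model-b",
		Summary:   BenchmarkSummary{TotalTests: 10, PassedTests: 9, SuccessRate: 0.9},
	}

	comparison := CompareModels(b1, b2)
	for key, value := range comparison {
		fmt.Printf("%s: %v\n", key, value)
	}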
func ExportResults ¶
func ExportResults(benchmark *ModelBenchmark, filename string) error
ExportResults exports benchmark results to JSON
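A minimal sketch of exporting a finished run to a JSON file; the benchmark value is assumed to come from RunBenchmark and the filename is arbitrary.

	// benchmark is a *ModelBenchmark returned by RunBenchmark.
	if err := ExportResults(benchmark, "results/model-a.json"); err != nil {
		log.Fatalf("export failed: %v", err)
	}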
func FormatReport ¶
func FormatReport(report *BenchmarkReport, format OutputFormat, w io.Writer) error
FormatReport formats benchmark report in specified format
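For example, rendering a report as Markdown to standard output might look like the following sketch; the report value is assumed to have been assembled elsewhere.

	// report is a *BenchmarkReport built around a completed ModelBenchmark.
	if err := FormatReport(report, FormatMarkdown, os.Stdout); err != nil {
		log.Fatalf("format failed: %v", err)
	}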
func SaveReport ¶
func SaveReport(report *BenchmarkReport, path string) error
SaveReport saves benchmark report to JSON file
Types ¶
type BaselineCompare ¶
type BaselineCompare struct {
BaselineCommit string `json:"baseline_commit"`
BaselineDate time.Time `json:"baseline_date"`
SuccessRateDiff float64 `json:"success_rate_diff"`
LatencyDiffMs int64 `json:"latency_diff_ms"`
TokensDiff int `json:"tokens_diff"`
Regressions []string `json:"regressions,omitempty"`
Improvements []string `json:"improvements,omitempty"`
HasRegression bool `json:"has_regression"`
}
BaselineCompare represents comparison against a baseline
func CompareWithBaseline ¶
func CompareWithBaseline(current, baseline *ModelBenchmark, baselineCommit string, baselineDate time.Time) *BaselineCompare
CompareWithBaseline compares current benchmark with baseline
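A hedged sketch of checking a fresh run against a stored baseline; the commit hash and date are placeholders, and both *ModelBenchmark values are assumed to exist already.

	cmp := CompareWithBaseline(current, baseline, "abc1234", time.Now().AddDate(0, 0, -7))
	if cmp.HasRegression {
		fmt.Println("regressions:", cmp.Regressions)
	}
	fmt.Printf("success rate diff: %+.4f\n", cmp.SuccessRateDiff)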
type BenchmarkReport ¶
type BenchmarkReport struct {
Version string `json:"version"`
GeneratedAt time.Time `json:"generated_at"`
GitCommit string `json:"git_commit,omitempty"`
GitBranch string `json:"git_branch,omitempty"`
Environment string `json:"environment,omitempty"`
Benchmark *ModelBenchmark `json:"benchmark"`
Comparison *BaselineCompare `json:"comparison,omitempty"`
}
BenchmarkReport represents a complete benchmark report with metadata
func LoadReport ¶
func LoadReport(path string) (*BenchmarkReport, error)
LoadReport loads benchmark report from JSON file
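SaveReport and LoadReport pair naturally in a baseline workflow; a sketch of persisting one run and reloading it later (the path is arbitrary).

	// Persist the current report, then reload it as a baseline for a later run.
	if err := SaveReport(report, "baselines/main.json"); err != nil {
		log.Fatalf("save failed: %v", err)
	}

	baseline, err := LoadReport("baselines/main.json")
	if err != nil {
		log.Fatalf("load failed: %v", err)
	}
	fmt.Println("baseline model:", baseline.Benchmark.ModelName)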
type BenchmarkResult ¶
type BenchmarkResult struct {
TestCase TestCase `json:"test_case"`
Success bool `json:"success"`
ActualOutput string `json:"actual_output"`
ParsedResult *parser.ParseResult `json:"parsed_result,omitempty"`
Latency time.Duration `json:"latency"`
TokensUsed int `json:"tokens_used"`
Iterations int `json:"iterations"`
ErrorMessage string `json:"error_message,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
BenchmarkResult represents the result of a single test
type BenchmarkSummary ¶
type BenchmarkSummary struct {
TotalTests int `json:"total_tests"`
PassedTests int `json:"passed_tests"`
FailedTests int `json:"failed_tests"`
SuccessRate float64 `json:"success_rate"`
AverageLatency time.Duration `json:"average_latency"`
MedianLatency time.Duration `json:"median_latency"`
P95Latency time.Duration `json:"p95_latency"`
TotalTokens int `json:"total_tokens"`
AverageIterations float64 `json:"average_iterations"`
CategoryBreakdown map[string]*CategoryStats `json:"category_breakdown"`
DifficultyBreakdown map[string]*DifficultyStats `json:"difficulty_breakdown"`
}
BenchmarkSummary provides summary statistics
type CategoryStats ¶
type CategoryStats struct {
Total int `json:"total"`
Passed int `json:"passed"`
SuccessRate float64 `json:"success_rate"`
}
CategoryStats provides statistics per category
type DifficultyStats ¶
type DifficultyStats struct {
Total int `json:"total"`
Passed int `json:"passed"`
SuccessRate float64 `json:"success_rate"`
AvgLatency time.Duration `json:"avg_latency"`
}
DifficultyStats provides statistics per difficulty level
type Evaluator ¶
type Evaluator struct {
// contains filtered or unexported fields
}
Evaluator runs benchmarks on LLM providers
func NewEvaluator ¶
NewEvaluator creates a new evaluator
func (*Evaluator) RunBenchmark ¶
func (e *Evaluator) RunBenchmark(ctx context.Context, suite *TestSuite, modelName string) (*ModelBenchmark, error)
RunBenchmark runs a complete benchmark suite
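An end-to-end sketch, with the caveat that NewEvaluator's parameter list is not shown above: the provider argument below is an assumption standing in for whatever LLM provider value the constructor actually expects.

	// provider stands in for the LLM provider the evaluator wraps; the real
	// constructor arguments are not documented here and are assumed.
	evaluator := NewEvaluator(provider)

	suite := GetDefaultTestSuite()
	benchmark, err := evaluator.RunBenchmark(context.Background(), suite, "model-a")
	if err != nil {
		log.Fatalf("benchmark failed: %v", err)
	}
	fmt.Printf("passed %d of %d tests, success rate %.2f\n",
		benchmark.Summary.PassedTests,
		benchmark.Summary.TotalTests,
		benchmark.Summary.SuccessRate)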
type ModelBenchmark ¶
type ModelBenchmark struct {
ModelName string `json:"model_name"`
TestSuite string `json:"test_suite"`
Results []BenchmarkResult `json:"results"`
Summary BenchmarkSummary `json:"summary"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
}
ModelBenchmark represents benchmark results for a model
type OutputFormat ¶
type OutputFormat string
OutputFormat represents output format type
const (
	FormatJSON     OutputFormat = "json"
	FormatMarkdown OutputFormat = "markdown"
	FormatText     OutputFormat = "text"
)
type TestCase ¶
type TestCase struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Input string `json:"input"`
ExpectedTool string `json:"expected_tool,omitempty"`
ExpectedArgs map[string]any `json:"expected_args,omitempty"`
ExpectedType string `json:"expected_type"` // "tool_call" or "direct_answer"
Category string `json:"category"`
Difficulty string `json:"difficulty"` // easy, medium, hard
Tags []string `json:"tags"`
}
TestCase represents a single test case for evaluation
type TestSuite ¶
type TestSuite struct {
Name string `json:"name"`
Version string `json:"version"`
TestCases []TestCase `json:"test_cases"`
}
TestSuite represents a collection of test cases
func GetDefaultTestSuite ¶
func GetDefaultTestSuite() *TestSuite
GetDefaultTestSuite returns a comprehensive test suite
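Where the default suite does not cover a scenario, a custom TestSuite can be assembled directly; a minimal sketch with one hypothetical tool-call test case (the suite name, tool name, and arguments are invented for illustration).

	suite := &TestSuite{
		Name:    "custom-weather-suite", // hypothetical suite name
		Version: "0.1.0",
		TestCases: []TestCase{
			{
				ID:           "weather-001",
				Name:         "basic weather lookup",
				Description:  "Model should call the weather tool for a city query",
				Input:        "What is the weather in Paris right now?",
				ExpectedTool: "get_weather", // hypothetical tool name
				ExpectedArgs: map[string]any{"city": "Paris"},
				ExpectedType: "tool_call",
				Category:     "tool_use",
				Difficulty:   "easy",
				Tags:         []string{"weather", "single_tool"},
			},
		},
	}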