Documentation ¶
Index ¶
- func CompareModels(benchmarks ...*ModelBenchmark) map[string]any
- func ExportResults(benchmark *ModelBenchmark, filename string) error
- func FormatReport(report *BenchmarkReport, format OutputFormat, w io.Writer) error
- func SaveReport(report *BenchmarkReport, path string) error
- type BaselineCompare
- type BenchmarkReport
- type BenchmarkResult
- type BenchmarkSummary
- type CategoryStats
- type DifficultyStats
- type Evaluator
- type ModelBenchmark
- type OutputFormat
- type TestCase
- type TestSuite
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CompareModels ¶
func CompareModels(benchmarks ...*ModelBenchmark) map[string]any
CompareModels compares benchmark results between models
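An illustrative sketch (not part of the package's documented examples): compare two completed benchmark runs and print the returned map. The field values below are made up for illustration; in practice both benchmarks come from Evaluator.RunBenchmark, and identifiers are shown unqualified as they would appear from within the package.

	// Two hand-built benchmark runs; normally produced by RunBenchmark.
	b1 := &ModelBenchmark{
		ModelName: "model-a",
		Summary:   BenchmarkSummary{TotalTests: 10, PassedTests: 8, SuccessRate: 0.8},
	}
	b2 := &ModelBenchmark{
		ModelName: "model-b",
		Summary:   BenchmarkSummary{TotalTests: 10, PassedTests: 9, SuccessRate: 0.9},
	}

	comparison := CompareModels(b1, b2)
	for key, value := range comparison {
		fmt.Printf("%s: %v\n", key, value)
	}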
func ExportResults ¶
func ExportResults(benchmark *ModelBenchmark, filename string) error
ExportResults exports benchmark results to JSON
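A minimal sketch of exporting a finished run to a JSON file; the benchmark value is assumed to come from RunBenchmark and the filename is arbitrary.

	// benchmark is a *ModelBenchmark returned by RunBenchmark.
	if err := ExportResults(benchmark, "results/model-a.json"); err != nil {
		log.Fatalf("export failed: %v", err)
	}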
func FormatReport ¶
func FormatReport(report *BenchmarkReport, format OutputFormat, w io.Writer) error
FormatReport formats benchmark report in specified format
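For example, rendering a report as Markdown to standard output might look like the following sketch; the report value is assumed to have been assembled elsewhere.

	// report is a *BenchmarkReport built around a completed ModelBenchmark.
	if err := FormatReport(report, FormatMarkdown, os.Stdout); err != nil {
		log.Fatalf("format failed: %v", err)
	}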
func SaveReport ¶
func SaveReport(report *BenchmarkReport, path string) error
SaveReport saves benchmark report to JSON file
Types ¶
type BaselineCompare ¶
type BaselineCompare struct {
BaselineCommit string `json:"baseline_commit"`
BaselineDate time.Time `json:"baseline_date"`
SuccessRateDiff float64 `json:"success_rate_diff"`
LatencyDiffMs int64 `json:"latency_diff_ms"`
TokensDiff int `json:"tokens_diff"`
Regressions []string `json:"regressions,omitempty"`
Improvements []string `json:"improvements,omitempty"`
HasRegression bool `json:"has_regression"`
}
BaselineCompare represents comparison against a baseline
func CompareWithBaseline ¶
func CompareWithBaseline(current, baseline *ModelBenchmark, baselineCommit string, baselineDate time.Time) *BaselineCompare
CompareWithBaseline compares current benchmark with baseline
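A hedged sketch of checking a fresh run against a stored baseline; the commit hash and date are placeholders, and both *ModelBenchmark values are assumed to exist already.

	cmp := CompareWithBaseline(current, baseline, "abc1234", time.Now().AddDate(0, 0, -7))
	if cmp.HasRegression {
		fmt.Println("regressions:", cmp.Regressions)
	}
	fmt.Printf("success rate diff: %+.4f\n", cmp.SuccessRateDiff)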
type BenchmarkReport ¶
type BenchmarkReport struct {
Version string `json:"version"`
GeneratedAt time.Time `json:"generated_at"`
GitCommit string `json:"git_commit,omitempty"`
GitBranch string `json:"git_branch,omitempty"`
Environment string `json:"environment,omitempty"`
Benchmark *ModelBenchmark `json:"benchmark"`
Comparison *BaselineCompare `json:"comparison,omitempty"`
}
BenchmarkReport represents a complete benchmark report with metadata
func LoadReport ¶
func LoadReport(path string) (*BenchmarkReport, error)
LoadReport loads benchmark report from JSON file
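SaveReport and LoadReport pair naturally in a baseline workflow; a sketch of persisting one run and reloading it later (the path is arbitrary).

	// Persist the current report, then reload it as a baseline for a later run.
	if err := SaveReport(report, "baselines/main.json"); err != nil {
		log.Fatalf("save failed: %v", err)
	}

	baseline, err := LoadReport("baselines/main.json")
	if err != nil {
		log.Fatalf("load failed: %v", err)
	}
	fmt.Println("baseline model:", baseline.Benchmark.ModelName)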
type BenchmarkResult ¶
type BenchmarkResult struct {
TestCase TestCase `json:"test_case"`
Success bool `json:"success"`
ActualOutput string `json:"actual_output"`
ParsedResult *parser.ParseResult `json:"parsed_result,omitempty"`
Latency time.Duration `json:"latency"`
TokensUsed int `json:"tokens_used"`
Iterations int `json:"iterations"`
ErrorMessage string `json:"error_message,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
BenchmarkResult represents the result of a single test
type BenchmarkSummary ¶
type BenchmarkSummary struct {
TotalTests int `json:"total_tests"`
PassedTests int `json:"passed_tests"`
FailedTests int `json:"failed_tests"`
SuccessRate float64 `json:"success_rate"`
AverageLatency time.Duration `json:"average_latency"`
MedianLatency time.Duration `json:"median_latency"`
P95Latency time.Duration `json:"p95_latency"`
TotalTokens int `json:"total_tokens"`
AverageIterations float64 `json:"average_iterations"`
CategoryBreakdown map[string]*CategoryStats `json:"category_breakdown"`
DifficultyBreakdown map[string]*DifficultyStats `json:"difficulty_breakdown"`
}
BenchmarkSummary provides summary statistics
type CategoryStats ¶
type CategoryStats struct {
Total int `json:"total"`
Passed int `json:"passed"`
SuccessRate float64 `json:"success_rate"`
}
CategoryStats provides statistics per category
type DifficultyStats ¶
type DifficultyStats struct {
Total int `json:"total"`
Passed int `json:"passed"`
SuccessRate float64 `json:"success_rate"`
AvgLatency time.Duration `json:"avg_latency"`
}
DifficultyStats provides statistics per difficulty level
type Evaluator ¶
type Evaluator struct {
// contains filtered or unexported fields
}
Evaluator runs benchmarks on LLM providers
func NewEvaluator ¶
NewEvaluator creates a new evaluator
func (*Evaluator) RunBenchmark ¶
func (e *Evaluator) RunBenchmark(ctx context.Context, suite *TestSuite, modelName string) (*ModelBenchmark, error)
RunBenchmark runs a complete benchmark suite
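An end-to-end sketch, with the caveat that NewEvaluator's parameter list is not shown above: the provider argument below is an assumption standing in for whatever LLM provider value the constructor actually expects.

	// provider stands in for the LLM provider the evaluator wraps; the real
	// constructor arguments are not documented here and are assumed.
	evaluator := NewEvaluator(provider)

	suite := GetDefaultTestSuite()
	benchmark, err := evaluator.RunBenchmark(context.Background(), suite, "model-a")
	if err != nil {
		log.Fatalf("benchmark failed: %v", err)
	}
	fmt.Printf("passed %d of %d tests, success rate %.2f\n",
		benchmark.Summary.PassedTests,
		benchmark.Summary.TotalTests,
		benchmark.Summary.SuccessRate)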
type ModelBenchmark ¶
type ModelBenchmark struct {
ModelName string `json:"model_name"`
TestSuite string `json:"test_suite"`
Results []BenchmarkResult `json:"results"`
Summary BenchmarkSummary `json:"summary"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
}
ModelBenchmark represents benchmark results for a model
type OutputFormat ¶
type OutputFormat string
OutputFormat represents output format type
const (
	FormatJSON     OutputFormat = "json"
	FormatMarkdown OutputFormat = "markdown"
	FormatText     OutputFormat = "text"
)
type TestCase ¶
type TestCase struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Input string `json:"input"`
ExpectedTool string `json:"expected_tool,omitempty"`
ExpectedArgs map[string]any `json:"expected_args,omitempty"`
ExpectedType string `json:"expected_type"` // "tool_call" or "direct_answer"
Category string `json:"category"`
Difficulty string `json:"difficulty"` // easy, medium, hard
Tags []string `json:"tags"`
}
TestCase represents a single test case for evaluation
type TestSuite ¶
type TestSuite struct {
Name string `json:"name"`
Version string `json:"version"`
TestCases []TestCase `json:"test_cases"`
}
TestSuite represents a collection of test cases
func GetDefaultTestSuite ¶
func GetDefaultTestSuite() *TestSuite
GetDefaultTestSuite returns a comprehensive test suite
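Where the default suite does not cover a scenario, a custom TestSuite can be assembled directly; a minimal sketch with one hypothetical tool-call test case (the suite name, tool name, and arguments are invented for illustration).

	suite := &TestSuite{
		Name:    "custom-weather-suite", // hypothetical suite name
		Version: "0.1.0",
		TestCases: []TestCase{
			{
				ID:           "weather-001",
				Name:         "basic weather lookup",
				Description:  "Model should call the weather tool for a city query",
				Input:        "What is the weather in Paris right now?",
				ExpectedTool: "get_weather", // hypothetical tool name
				ExpectedArgs: map[string]any{"city": "Paris"},
				ExpectedType: "tool_call",
				Category:     "tool_use",
				Difficulty:   "easy",
				Tags:         []string{"weather", "single_tool"},
			},
		},
	}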