Documentation
Index
Constants
This section is empty.
Variables
This section is empty.
Functions
This section is empty.
Types
type Comparison
Comparison holds a model-to-summary mapping.
func (Comparison) JSON
func (c Comparison) JSON() []byte
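JSON serializes the comparison for storage or later inspection. A minimal usage sketch, assuming cmp is a Comparison returned by CompareModels and that os and log are imported:

    // Write the comparison report to disk for later inspection.
    // 0o644: owner read/write, group and world read.
    if err := os.WriteFile("comparison.json", cmp.JSON(), 0o644); err != nil {
        log.Fatal(err)
    }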
type Evaluator
type Evaluator struct{}
Evaluator runs scenarios against models and aggregates metrics.
func (*Evaluator) CompareModels
func (e *Evaluator) CompareModels(ctx context.Context, modelsMap map[string]models.Model, scenarios []Scenario) (map[string][]RunMetrics, Comparison)
CompareModels runs the same scenarios on multiple models and returns per-model run metrics alongside a comparison.
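A minimal sketch of a comparison run, assuming hypothetical import paths for this package (aliased evals) and its models dependency, with a modelsMap built elsewhere; compare is a hypothetical helper name:

    func compare(ctx context.Context, modelsMap map[string]models.Model) {
        e := &evals.Evaluator{}
        scenarios := []evals.Scenario{
            {Input: "Summarize Go's concurrency model in one sentence.", ExpectedContains: "goroutine"},
        }
        // perModel keys match modelsMap; cmp aggregates the per-model summaries.
        perModel, cmp := e.CompareModels(ctx, modelsMap, scenarios)
        for name, runs := range perModel {
            fmt.Printf("%s: %d runs\n", name, len(runs))
        }
        _ = cmp // see Comparison.JSON above for serialization
    }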
func (*Evaluator) EvaluateModel
func (e *Evaluator) EvaluateModel(ctx context.Context, m models.Model, scenarios []Scenario) ([]RunMetrics, Summary)
EvaluateModel runs scenarios on a single model and returns per-run metrics with an aggregate summary.
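A minimal sketch of a single-model evaluation, again with a hypothetical evals import and a models.Model value obtained elsewhere; evaluate is a hypothetical helper name:

    func evaluate(ctx context.Context, m models.Model) {
        e := &evals.Evaluator{}
        scenarios := []evals.Scenario{
            {Input: "What is 2+2?", ExpectedContains: "4"},
        }
        metrics, summary := e.EvaluateModel(ctx, m, scenarios)
        fmt.Printf("runs=%d successes=%d failures=%d\n",
            summary.Runs, summary.Successes, summary.Failures)
        // Report any failed runs with their recorded errors.
        for _, r := range metrics {
            if !r.Success {
                fmt.Printf("failed: %s\n", r.Error)
            }
        }
    }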
type RunMetrics
type RunMetrics struct {
    Model            string        `json:"model"`
    Duration         time.Duration `json:"duration"`
    PromptTokens     int           `json:"prompt_tokens"`
    CompletionTokens int           `json:"completion_tokens"`
    TotalTokens      int           `json:"total_tokens"`
    ToolCalls        int           `json:"tool_calls"`
    Success          bool          `json:"success"`
    Error            string        `json:"error,omitempty"`
    Timestamp        time.Time     `json:"timestamp"`
}
RunMetrics captures metrics for a single evaluation run.
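Each run records latency, token counts, and success. A sketch of a rough cost proxy over a metrics slice returned by EvaluateModel (metrics is assumed from the sketch above):

    // Sum token usage across runs as a rough cost proxy.
    var totalTokens int
    for _, r := range metrics {
        totalTokens += r.TotalTokens
    }
    fmt.Printf("total tokens across %d runs: %d\n", len(metrics), totalTokens)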
type Scenario
type Scenario struct {
    Input string
    // ExpectedContains checks if model output includes this substring (minimal oracle)
    ExpectedContains string
}
Scenario represents a simple evaluation case.
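A sketch of a small scenario suite; whether an empty ExpectedContains skips the substring check is an assumption, not confirmed by the source:

    scenarios := []evals.Scenario{
        {Input: "Name the capital of France.", ExpectedContains: "Paris"},
        {Input: "What is 2+2?", ExpectedContains: "4"},
        // No oracle: presumably the run counts as success if the call completes.
        {Input: "Write a haiku about the sea."},
    }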
type Summary
type Summary struct {
    Runs            int                `json:"runs"`
    Successes       int                `json:"successes"`
    Failures        int                `json:"failures"`
    SuccessRate     float64            `json:"success_rate"`
    AvgLatencyMS    float64            `json:"avg_latency_ms"`
    AvgPromptTokens float64            `json:"avg_prompt_tokens"`
    AvgCompletion   float64            `json:"avg_completion_tokens"`
    AvgTotalTokens  float64            `json:"avg_total_tokens"`
    TotalToolCalls  int                `json:"total_tool_calls"`
    ByModel         map[string]Summary `json:"by_model,omitempty"`
}
Summary aggregates metrics across runs.
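Summary carries JSON tags, so it marshals cleanly with the standard library's encoding/json. A sketch printing an indented report, assuming a summary value from EvaluateModel; nested per-model summaries appear when ByModel is set:

    b, err := json.MarshalIndent(summary, "", "  ")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(string(b))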