evaluation

package v1.19.0

Documentation

Overview

Package evaluation provides an evaluation framework for testing agents.

Constants

This section is empty.

Variables

This section is empty.

Functions

func GenerateRunName added in v1.19.0

func GenerateRunName() string

GenerateRunName creates a memorable name for an evaluation run.
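
A minimal usage sketch; the import path example.com/agent/evaluation is hypothetical and stands in for the package's real path:

	package main

	import (
		"fmt"

		"example.com/agent/evaluation" // hypothetical import path
	)

	func main() {
		// Print a memorable, human-readable name for a new evaluation run.
		fmt.Println(evaluation.GenerateRunName())
	}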

func Save

func Save(sess *session.Session, filename string) (string, error)

Save writes the session to filename and returns the path of the saved file.

func SaveRun added in v1.19.0

func SaveRun(run *EvalRun, outputDir string) (string, error)

SaveRun saves the evaluation run results to a JSON file.
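
A sketch of the call shape only; the stub EvalRun and output directory are illustrative, and a real run would come from Evaluate:

	package main

	import (
		"fmt"
		"log"
		"time"

		"example.com/agent/evaluation" // hypothetical import path
	)

	func main() {
		// A stub run, used only to show the call; Evaluate normally
		// produces the *EvalRun.
		run := &evaluation.EvalRun{
			Name:      evaluation.GenerateRunName(),
			Timestamp: time.Now(),
		}
		path, err := evaluation.SaveRun(run, "eval-results") // illustrative output dir
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println("results written to", path)
	}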

Types

type Config added in v1.19.0

type Config struct {
	JudgeModel  provider.Provider // Model for relevance checking (optional)
	Concurrency int               // Number of concurrent runs (0 = number of CPUs)
	TTYFd       int               // File descriptor for terminal size queries (e.g., int(os.Stdout.Fd()))
}

Config holds configuration for evaluation runs.
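
A sketch of populating Config, following the field comments above (whether leaving JudgeModel unset disables relevance checking is an assumption based on the "optional" note):

	package main

	import (
		"os"

		"example.com/agent/evaluation" // hypothetical import path
	)

	func main() {
		cfg := evaluation.Config{
			// JudgeModel left unset: the field is documented as optional.
			Concurrency: 0,                   // 0 = number of CPUs
			TTYFd:       int(os.Stdout.Fd()), // as suggested by the field comment
		}
		_ = cfg // passed to NewRunner in a real program
	}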

type EvalCriteria added in v1.19.0

type EvalCriteria struct {
	Relevance  []string `json:"relevance,omitempty"`   // Statements that should be true about the response
	WorkingDir string   `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
	Size       string   `json:"size,omitempty"`        // Expected response size: S, M, L, XL
}

EvalCriteria contains the evaluation criteria for a test case.
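
Given the struct tags, criteria marshal to JSON as this sketch shows (values are illustrative):

	package main

	import (
		"encoding/json"
		"fmt"

		"example.com/agent/evaluation" // hypothetical import path
	)

	func main() {
		crit := evaluation.EvalCriteria{
			Relevance:  []string{"mentions the configured retry limit"}, // statements that should hold
			WorkingDir: "sample_project",                                // under evals/working_dirs/
			Size:       "M",                                             // one of S, M, L, XL
		}
		b, err := json.MarshalIndent(crit, "", "  ")
		if err != nil {
			panic(err)
		}
		fmt.Println(string(b))
	}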

type EvalRun added in v1.19.0

type EvalRun struct {
	Name      string        `json:"name"`
	Timestamp time.Time     `json:"timestamp"`
	Duration  time.Duration `json:"duration"`
	Results   []Result      `json:"results"`
	Summary   Summary       `json:"summary"`
}

EvalRun contains the results and metadata for an evaluation run.

func Evaluate

func Evaluate(ctx context.Context, out io.Writer, isTTY bool, ttyFd int, agentFilename, evalsDir string, runConfig *config.RuntimeConfig, concurrency int, judgeModel provider.Provider) (*EvalRun, error)

Evaluate is the main entry point for running evaluations.
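
A sketch of a plain (non-TTY) invocation; the file names are illustrative, the nil judge model and zero concurrency lean on the Config field comments, and runConfig would be built by the application's config package:

	package main

	import (
		"context"
		"log"
		"os"

		"example.com/agent/config"     // hypothetical import paths
		"example.com/agent/evaluation"
	)

	func main() {
		// runConfig comes from the application's config package; its
		// construction is outside this package's scope.
		var runConfig *config.RuntimeConfig

		run, err := evaluation.Evaluate(
			context.Background(), os.Stdout,
			false, int(os.Stdout.Fd()), // isTTY, ttyFd
			"agent.yaml", "evals", // agentFilename, evalsDir (illustrative)
			runConfig,
			0,   // concurrency: 0 = number of CPUs, per Config
			nil, // judgeModel: optional, per Config
		)
		if err != nil {
			log.Fatal(err)
		}
		log.Printf("run %q finished with %d results", run.Name, len(run.Results))
	}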

func EvaluateWithName added in v1.19.0

func EvaluateWithName(ctx context.Context, out io.Writer, isTTY bool, ttyFd int, runName, agentFilename, evalsDir string, runConfig *config.RuntimeConfig, concurrency int, judgeModel provider.Provider) (*EvalRun, error)

EvaluateWithName runs evaluations with a specified run name.

type EvalSession added in v1.19.0

type EvalSession struct {
	session.Session
	Evals EvalCriteria `json:"evals"`
}

EvalSession extends session.Session with evaluation criteria.
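
Since EvalSession embeds session.Session alongside an "evals" key, a stored eval case can be decoded in one pass; this sketch assumes eval cases are stored as JSON files (the path is illustrative):

	package main

	import (
		"encoding/json"
		"fmt"
		"log"
		"os"

		"example.com/agent/evaluation" // hypothetical import path
	)

	func main() {
		data, err := os.ReadFile("evals/example.json") // illustrative path
		if err != nil {
			log.Fatal(err)
		}
		var es evaluation.EvalSession
		if err := json.Unmarshal(data, &es); err != nil {
			log.Fatal(err)
		}
		fmt.Println("expected size:", es.Evals.Size)
	}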

type Result

type Result struct {
	Title             string   `json:"title"`
	Question          string   `json:"question"`
	Response          string   `json:"response"`
	Cost              float64  `json:"cost"`
	OutputTokens      int64    `json:"output_tokens"`
	Size              string   `json:"size"`
	SizeExpected      string   `json:"size_expected"`
	ToolCallsScore    float64  `json:"tool_calls_score"`
	ToolCallsExpected float64  `json:"tool_calls_score_expected"`
	HandoffsMatch     bool     `json:"handoffs"`
	RelevancePassed   float64  `json:"relevance"`
	RelevanceExpected float64  `json:"relevance_expected"`
	FailedRelevance   []string `json:"failed_relevance,omitempty"`
	Error             string   `json:"error,omitempty"`
}

Result contains the evaluation results for a single test case.
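
A sketch of inspecting per-case results; reportFailures is a hypothetical helper, and the slice would come from Runner.Run or EvalRun.Results:

	package main

	import (
		"fmt"

		"example.com/agent/evaluation" // hypothetical import path
	)

	// reportFailures is a hypothetical helper that prints errored cases
	// and failed relevance statements.
	func reportFailures(results []evaluation.Result) {
		for _, res := range results {
			if res.Error != "" {
				fmt.Printf("%s: error: %s\n", res.Title, res.Error)
				continue
			}
			for _, stmt := range res.FailedRelevance {
				fmt.Printf("%s: failed relevance: %s\n", res.Title, stmt)
			}
		}
	}

	func main() {
		reportFailures(nil) // results would come from a real run
	}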

type Runner added in v1.19.0

type Runner struct {
	// contains filtered or unexported fields
}

Runner runs evaluations against an agent.

func NewRunner added in v1.19.0

func NewRunner(agentSource config.Source, runConfig *config.RuntimeConfig, evalsDir string, cfg Config) *Runner

NewRunner creates a new evaluation runner.

func (*Runner) Run added in v1.19.0

func (r *Runner) Run(ctx context.Context, out io.Writer, isTTY bool) ([]Result, error)

Run executes all evaluations concurrently and returns results.
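
A sketch of the Runner path, for callers that want raw results without EvalRun bookkeeping; agentSource and runConfig come from the config package and their construction is not shown:

	package main

	import (
		"context"
		"log"
		"os"

		"example.com/agent/config"     // hypothetical import paths
		"example.com/agent/evaluation"
	)

	func main() {
		var agentSource config.Source       // construction not shown
		var runConfig *config.RuntimeConfig // construction not shown

		r := evaluation.NewRunner(agentSource, runConfig, "evals", evaluation.Config{
			Concurrency: 0, // 0 = number of CPUs
			TTYFd:       int(os.Stdout.Fd()),
		})
		results, err := r.Run(context.Background(), os.Stdout, false) // isTTY = false
		if err != nil {
			log.Fatal(err)
		}
		log.Printf("%d results", len(results))
	}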

type Summary added in v1.19.0

type Summary struct {
	TotalEvals      int     `json:"total_evals"`
	TotalCost       float64 `json:"total_cost"`
	SizesPassed     int     `json:"sizes_passed"`
	SizesTotal      int     `json:"sizes_total"`
	ToolsPassed     float64 `json:"tools_passed"`
	ToolsTotal      float64 `json:"tools_total"`
	HandoffsPassed  int     `json:"handoffs_passed"`
	HandoffsTotal   int     `json:"handoffs_total"`
	RelevancePassed float64 `json:"relevance_passed"`
	RelevanceTotal  float64 `json:"relevance_total"`
}

Summary contains aggregate statistics across all evaluations.
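
The passed/total pairs divide naturally into pass rates; passRate below is a hypothetical helper and the Summary values are illustrative:

	package main

	import (
		"fmt"

		"example.com/agent/evaluation" // hypothetical import path
	)

	// passRate is a hypothetical helper: percentage of passed over total.
	func passRate(passed, total float64) float64 {
		if total == 0 {
			return 0
		}
		return 100 * passed / total
	}

	func main() {
		s := evaluation.Summary{ // illustrative values
			RelevancePassed: 7, RelevanceTotal: 8,
			ToolsPassed: 5, ToolsTotal: 6,
		}
		fmt.Printf("relevance %.1f%%, tools %.1f%%\n",
			passRate(s.RelevancePassed, s.RelevanceTotal),
			passRate(s.ToolsPassed, s.ToolsTotal))
	}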
