models

package
v0.26.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Apr 21, 2026 License: MIT Imports: 14 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AllGraderKinds

func AllGraderKinds() []string

func ComputeStdDev

func ComputeStdDev(values []float64) float64

ComputeStdDev returns the population standard deviation for a slice of float64 values.

Types

type ActionSequenceGraderParameters

type ActionSequenceGraderParameters struct {
	MatchingMode    ActionSequenceMatchingMode `yaml:"matching_mode,omitempty" json:"matching_mode,omitempty"`
	ExpectedActions []string                   `yaml:"expected_actions,omitempty" json:"expected_actions,omitempty"`
}

type ActionSequenceMatchingMode

type ActionSequenceMatchingMode string

ActionSequenceMatchingMode controls how actual tool calls are compared to expected actions.

const (
	ActionSequenceMatchingModeExact    ActionSequenceMatchingMode = "exact_match"
	ActionSequenceMatchingModeInOrder  ActionSequenceMatchingMode = "in_order_match"
	ActionSequenceMatchingModeAnyOrder ActionSequenceMatchingMode = "any_order_match"
)

type BehaviorGraderParameters

type BehaviorGraderParameters struct {
	MaxToolCalls   int      `yaml:"max_tool_calls,omitempty" json:"max_tool_calls,omitempty"`
	MaxTokens      int      `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
	RequiredTools  []string `yaml:"required_tools,omitempty" json:"required_tools,omitempty"`
	ForbiddenTools []string `yaml:"forbidden_tools,omitempty" json:"forbidden_tools,omitempty"`
	MaxDurationMS  int64    `yaml:"max_duration_ms,omitempty" json:"max_duration_ms,omitempty"`
}

type BehaviorRules

type BehaviorRules struct {
	MaxToolInvocations int      `yaml:"max_tool_calls,omitempty" json:"max_tool_invocations,omitempty"`
	MaxRounds          int      `yaml:"max_iterations,omitempty" json:"max_rounds,omitempty"`
	MaxTokens          int      `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
	MustUseTool        []string `yaml:"required_tools,omitempty" json:"must_use_tool,omitempty"`
	ForbidTool         []string `yaml:"forbidden_tools,omitempty" json:"forbid_tool,omitempty"`
}

type BenchmarkSpec

type BenchmarkSpec struct {
	SpecIdentity `yaml:",inline"`
	SkillName    string            `yaml:"skill"`
	Version      string            `yaml:"version"`
	Config       Config            `yaml:"config"`
	Hooks        hooks.HooksConfig `yaml:"hooks,omitempty"`
	Inputs       map[string]string `yaml:"inputs,omitempty" json:"inputs,omitempty"`
	TasksFrom    string            `yaml:"tasks_from,omitempty" json:"tasks_from,omitempty"`
	Range        [2]int            `yaml:"range,omitempty" json:"range,omitempty"`
	Graders      []GraderConfig    `yaml:"graders"`
	Metrics      []MeasurementDef  `yaml:"metrics"`
	Tasks        []string          `yaml:"tasks"`
	Baseline     bool              `yaml:"baseline,omitempty" json:"baseline,omitempty"`
}

BenchmarkSpec represents a complete evaluation specification.

func LoadBenchmarkSpec

func LoadBenchmarkSpec(path string) (*BenchmarkSpec, error)

LoadBenchmarkSpec loads a spec from a YAML file with strict validation.

Normally the schema validation will catch errors in the eval.yaml, but this also does strict YAML parsing to catch errors like unknown fields or type errors that the schema validation might miss.

func (*BenchmarkSpec) ResolveTestFiles

func (s *BenchmarkSpec) ResolveTestFiles(basePath string) ([]string, error)

ResolveTestFiles expands glob patterns to actual test files.

func (*BenchmarkSpec) Validate

func (s *BenchmarkSpec) Validate() error

Validate checks that the spec is valid.

type Config

type Config struct {
	TrialsPerTask  int            `yaml:"trials_per_task" json:"runs_per_test"`
	TimeoutSec     int            `yaml:"timeout_seconds" json:"timeout_sec"`
	Concurrent     bool           `yaml:"parallel" json:"concurrent"`
	Workers        int            `yaml:"workers,omitempty" json:"workers,omitempty"`
	StopOnError    bool           `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
	EngineType     string         `yaml:"executor" json:"engine_type"`
	ModelID        string         `yaml:"model" json:"model_id"`
	SkillPaths     []string       `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
	RequiredSkills []string       `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
	ServerConfigs  map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
	MaxAttempts    int            `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
	GroupBy        string         `yaml:"group_by,omitempty" json:"group_by,omitempty"`
	JudgeModel     string         `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
}

Config controls execution behavior.

type DiffExpectedFileParameters

type DiffExpectedFileParameters struct {
	// Path is the workspace-relative path to the file being checked.
	Path string `yaml:"path" json:"path"`

	// Snapshot is the path (relative to context/fixtures dir) of the expected file content.
	// When set, the workspace file must match this snapshot exactly.
	Snapshot string `yaml:"snapshot,omitempty" json:"snapshot,omitempty"`

	// Contains lists line fragments that must appear in the workspace file.
	// Prefixed with "+" means the line must be present; "-" means it must be absent.
	Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
}

DiffExpectedFileParameters defines a single file expectation for the diff grader. Either Snapshot or Contains (or both) must be specified.

type DiffGraderParameters

type DiffGraderParameters struct {
	ExpectedFiles   []DiffExpectedFileParameters `yaml:"expected_files,omitempty" json:"expected_files,omitempty"`
	ContextDir      string                       `yaml:"context_dir,omitempty" json:"context_dir,omitempty"`
	UpdateSnapshots bool                         `yaml:"update_snapshots,omitempty" json:"update_snapshots,omitempty"`
}

type EvaluationOutcome

type EvaluationOutcome struct {
	RunID           string                   `json:"eval_id"`
	SkillTested     string                   `json:"skill"`
	BenchName       string                   `json:"eval_name"`
	Timestamp       time.Time                `json:"timestamp"`
	Setup           OutcomeSetup             `json:"config"`
	Digest          OutcomeDigest            `json:"summary"`
	Measures        map[string]MeasureResult `json:"metrics"`
	TestOutcomes    []TestOutcome            `json:"tasks"`
	TriggerMetrics  *TriggerMetrics          `json:"trigger_metrics,omitempty"`
	TriggerResults  []TriggerResult          `json:"trigger_results,omitempty"`
	Metadata        map[string]any           `json:"metadata,omitempty"`
	IsBaseline      bool                     `json:"is_baseline,omitempty"`
	BaselineOutcome *EvaluationOutcome       `json:"baseline_outcome,omitempty"`
}

EvaluationOutcome represents the complete result of an evaluation run.

type FileContentPatternParameters

type FileContentPatternParameters struct {
	Path         string   `yaml:"path" json:"path"`
	MustMatch    []string `yaml:"must_match,omitempty" json:"must_match,omitempty"`
	MustNotMatch []string `yaml:"must_not_match,omitempty" json:"must_not_match,omitempty"`
}

type FileGraderParameters

type FileGraderParameters struct {
	MustExist       []string                       `yaml:"must_exist,omitempty" json:"must_exist,omitempty"`
	MustNotExist    []string                       `yaml:"must_not_exist,omitempty" json:"must_not_exist,omitempty"`
	ContentPatterns []FileContentPatternParameters `yaml:"content_patterns,omitempty" json:"content_patterns,omitempty"`
}

type GenericGraderParameters

type GenericGraderParameters map[string]any

GenericGraderParameters is used for unknown kinds to preserve raw config values.

type GradeOutcome added in v0.22.0

type GradeOutcome struct {
	OverallScore   float64                 `json:"overall_score"`
	Passed         bool                    `json:"passed"`
	Tasks          map[string]GradeOutcome `json:"tasks,omitempty"`
	GraderAverages map[string]float64      `json:"grader_averages,omitempty"`
}

type GraderConfig

type GraderConfig struct {
	Kind       GraderKind       `yaml:"type" json:"kind"`
	Identifier string           `yaml:"name" json:"identifier"`
	ScriptPath string           `yaml:"script,omitempty" json:"script_path,omitempty"`
	Rubric     string           `yaml:"rubric,omitempty" json:"rubric,omitempty"`
	ModelID    string           `yaml:"model,omitempty" json:"model_id,omitempty"`
	Weight     float64          `yaml:"weight,omitempty" json:"weight,omitempty"`
	Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}

GraderConfig defines a validator/grader.

func (*GraderConfig) EffectiveWeight

func (g *GraderConfig) EffectiveWeight() float64

EffectiveWeight returns the grader weight, defaulting to 1.0 if unset.

func (*GraderConfig) UnmarshalYAML

func (g *GraderConfig) UnmarshalYAML(node *yaml.Node) error

func (*GraderConfig) Validate added in v0.26.0

func (g *GraderConfig) Validate() error

Validate checks that the grader config has required fields for its type.

type GraderKind

type GraderKind string

GraderKind identifies the type of grader (e.g. regex, file, code).

const (
	GraderKindInlineScript    GraderKind = "code"
	GraderKindPrompt          GraderKind = "prompt"
	GraderKindText            GraderKind = "text"
	GraderKindFile            GraderKind = "file"
	GraderKindJSONSchema      GraderKind = "json_schema"
	GraderKindProgram         GraderKind = "program"
	GraderKindBehavior        GraderKind = "behavior"
	GraderKindActionSequence  GraderKind = "action_sequence"
	GraderKindSkillInvocation GraderKind = "skill_invocation"
	GraderKindTrigger         GraderKind = "trigger"
	GraderKindDiff            GraderKind = "diff"
	GraderKindToolConstraint  GraderKind = "tool_constraint"
)

type GraderParameters

type GraderParameters interface {
	// contains filtered or unexported methods
}

GraderParameters is a polymorphic grader config payload decoded from YAML based on GraderKind.

type GraderResults

type GraderResults struct {
	Name       string         `json:"identifier"`
	Type       GraderKind     `json:"type"`
	Score      float64        `json:"score"`
	Weight     float64        `json:"weight"`
	Passed     bool           `json:"passed"`
	Feedback   string         `json:"feedback"`
	Details    map[string]any `json:"details,omitempty"`
	DurationMs int64          `json:"duration_ms"`
}

type GroupStats

type GroupStats struct {
	Name     string  `json:"name"`
	Passed   int     `json:"passed"`
	Total    int     `json:"total"`
	AvgScore float64 `json:"avg_score"`
}

GroupStats holds aggregate statistics for a group of test outcomes.

type InlineScriptGraderParameters

type InlineScriptGraderParameters struct {
	Assertions []string `yaml:"assertions,omitempty" json:"assertions,omitempty"`

	// Language indicates which language the Assertions are written for. Defaults to [LanguagePython]
	Language Language `yaml:"language,omitempty" json:"language,omitempty"`
}

type JSONSchemaGraderParameters

type JSONSchemaGraderParameters struct {
	// Schema is an inline JSON schema object used for validation.
	Schema map[string]any `yaml:"schema,omitempty" json:"schema,omitempty"`

	// SchemaFile is a path to a JSON schema file. Used when Schema is not provided.
	SchemaFile string `yaml:"schema_file,omitempty" json:"schema_file,omitempty"`
}

JSONSchemaGraderParameters holds the arguments for creating a JSON schema grader.

type Language

type Language string
const (
	LanguagePython     Language = "python"
	LanguageJavascript Language = "javascript"
)

type MeasureResult

type MeasureResult struct {
	Identifier string         `json:"identifier"`
	Value      float64        `json:"value"`
	Threshold  float64        `json:"threshold"`
	Passed     bool           `json:"passed"`
	Weight     float64        `json:"weight"`
	Details    map[string]any `json:"details,omitempty"`
}

type MeasurementDef

type MeasurementDef struct {
	Identifier string  `yaml:"name" json:"identifier"`
	Weight     float64 `yaml:"weight" json:"weight"`
	Threshold  float64 `yaml:"threshold" json:"threshold"`
	Enabled    bool    `yaml:"enabled,omitempty" json:"enabled,omitempty"`
	Desc       string  `yaml:"description,omitempty" json:"desc,omitempty"`
}

MeasurementDef defines a metric.

type ModelScore

type ModelScore struct {
	ModelID        string             `json:"model_id"`
	HeuristicScore float64            `json:"heuristic_score"`
	Rank           int                `json:"rank"`
	Scores         map[string]float64 `json:"component_scores,omitempty"`
}

ModelScore holds the heuristic score and rank for a single model.

type ModelUsage

type ModelUsage struct {
	InputTokens      int     `json:"input_tokens"`
	OutputTokens     int     `json:"output_tokens"`
	CacheReadTokens  int     `json:"cache_read_tokens"`
	CacheWriteTokens int     `json:"cache_write_tokens"`
	RequestCount     float64 `json:"request_count"`
	RequestCost      float64 `json:"request_cost"`
}

ModelUsage holds per-model token and request usage.

type MultiSkillSummary

type MultiSkillSummary struct {
	Timestamp time.Time      `json:"timestamp"`
	Skills    []SkillSummary `json:"skills"`
	Overall   OverallSummary `json:"overall"`
}

MultiSkillSummary aggregates results across multiple skill evaluations.

type OutcomeDigest

type OutcomeDigest struct {
	TotalTests     int          `json:"total_tests"`
	Succeeded      int          `json:"succeeded"`
	Failed         int          `json:"failed"`
	Errors         int          `json:"errors"`
	Skipped        int          `json:"skipped"`
	SuccessRate    float64      `json:"success_rate"`
	AggregateScore float64      `json:"aggregate_score"`
	WeightedScore  float64      `json:"weighted_score"`
	MinScore       float64      `json:"min_score"`
	MaxScore       float64      `json:"max_score"`
	StdDev         float64      `json:"std_dev"`
	DurationMs     int64        `json:"duration_ms"`
	Groups         []GroupStats `json:"groups,omitempty"`
	Usage          *UsageStats  `json:"usage,omitempty"`

	// Statistical summary populated when trials_per_task > 1
	Statistics *StatisticalSummary `json:"statistics,omitempty"`
}

type OutcomeSetup

type OutcomeSetup struct {
	RunsPerTest int    `json:"runs_per_test"`
	ModelID     string `json:"model_id"`
	EngineType  string `json:"engine_type"`
	TimeoutSec  int    `json:"timeout_sec"`
	JudgeModel  string `json:"judge_model,omitempty"`
}

type OutcomeSpec

type OutcomeSpec struct {
	Category  string `yaml:"type" json:"category"`
	Value     any    `yaml:"value,omitempty" json:"value,omitempty"`
	Predicate string `yaml:"condition,omitempty" json:"predicate,omitempty"`
}

type OverallSummary

type OverallSummary struct {
	TotalSkills       int     `json:"total_skills"`
	TotalModels       int     `json:"total_models"`
	AvgPassRate       float64 `json:"avg_pass_rate"`
	AvgAggregateScore float64 `json:"avg_aggregate_score"`
}

OverallSummary contains cross-skill aggregated metrics.

type PairwiseResult

type PairwiseResult struct {
	Winner             string `json:"winner"`    // "baseline", "skill", or "tie"
	Magnitude          string `json:"magnitude"` // "much-better", "slightly-better", "equal", etc.
	Reasoning          string `json:"reasoning"`
	PositionConsistent bool   `json:"position_consistent"` // true if result held after position swap
}

PairwiseResult captures the outcome of a pairwise LLM judge comparison.

type ProgramGraderParameters

type ProgramGraderParameters struct {
	Command string   `yaml:"command,omitempty" json:"command,omitempty"`
	Args    []string `yaml:"args,omitempty" json:"args,omitempty"`
	Timeout int      `yaml:"timeout,omitempty" json:"timeout,omitempty"`
}

type PromptGraderMode

type PromptGraderMode string
const (
	PromptGraderModeIndependent PromptGraderMode = "independent"
	PromptGraderModePairwise    PromptGraderMode = "pairwise"
)

type PromptGraderParameters

type PromptGraderParameters struct {
	Prompt          string           `yaml:"prompt,omitempty" json:"prompt,omitempty"`
	Model           string           `yaml:"model,omitempty" json:"model,omitempty"`
	ContinueSession bool             `yaml:"continue_session,omitempty" json:"continue_session,omitempty"`
	Mode            PromptGraderMode `yaml:"mode,omitempty" json:"mode,omitempty"`
}

type Recommendation

type Recommendation struct {
	RecommendedModel string                `json:"recommended_model"`
	HeuristicScore   float64               `json:"heuristic_score"`
	Reason           string                `json:"reason"`
	WinnerMarginPct  float64               `json:"winner_margin_pct"`
	Weights          RecommendationWeights `json:"weights"`
	ModelScores      []ModelScore          `json:"all_models"`
}

Recommendation represents a heuristic recommendation for the best model across a multi-model evaluation run.

type RecommendationWeights

type RecommendationWeights struct {
	AggregateScore float64 `json:"aggregate_score"`
	PassRate       float64 `json:"pass_rate"`
	Consistency    float64 `json:"consistency"`
	Speed          float64 `json:"speed"`
}

RecommendationWeights defines the weighting scheme for heuristic scoring.

type ResourceRef

type ResourceRef struct {
	Location string `yaml:"path,omitempty" json:"location,omitempty"`
	Body     string `yaml:"content,omitempty" json:"body,omitempty"`
}

ResourceRef points to a file or inline content.

type RunResult

type RunResult struct {
	RunNumber int `json:"run_number"`
	Attempts  int `json:"attempts"`
	// Status contains the overall status of the run.
	// NOTE: if Status == [StatusError], then [ErrorMsg] will be set to the
	// message from the error.
	Status           Status                   `json:"status"`
	DurationMs       int64                    `json:"duration_ms"`
	Validations      map[string]GraderResults `json:"validations"`
	SessionDigest    SessionDigest            `json:"session_digest"`
	Transcript       []TranscriptEvent        `json:"transcript,omitempty"`
	FinalOutput      string                   `json:"final_output"`
	ErrorMsg         string                   `json:"error_msg,omitempty"`
	SkillInvocations []SkillInvocation        `json:"skill_invocations,omitempty"`
}

RunResult is the result of a single run/trial.

func (*RunResult) AllValidationsPassed

func (r *RunResult) AllValidationsPassed() bool

AllValidationsPassed reports whether all validations passed.

func (*RunResult) ComputeRunScore

func (r *RunResult) ComputeRunScore() float64

ComputeRunScore calculates the average score across all validations (unweighted, kept for backward compatibility).

func (*RunResult) ComputeWeightedRunScore

func (r *RunResult) ComputeWeightedRunScore() float64

ComputeWeightedRunScore calculates the weighted composite score (0.0–1.0) using each grader's Weight field. If all weights are zero, falls back to simple average.

type SessionDigest

type SessionDigest struct {
	ToolCallCount int         `json:"tool_call_count"`
	ToolsUsed     []string    `json:"tools_used"`
	ToolCalls     []ToolCall  `json:"tool_calls,omitempty"`
	Errors        []string    `json:"errors"`
	Usage         *UsageStats `json:"usage,omitempty"`
	SessionID     string      `json:"session_id,omitempty"`
}

type SkillImpactMetric

type SkillImpactMetric struct {
	PassRateWithSkills float64         `json:"pass_rate_with_skills"`
	PassRateBaseline   float64         `json:"pass_rate_baseline"`
	Delta              float64         `json:"delta"`
	PercentChange      float64         `json:"percent_change"`
	Pairwise           *PairwiseResult `json:"pairwise,omitempty"`
}

SkillImpactMetric represents the A/B comparison for a single task.

type SkillInvocation added in v0.22.0

type SkillInvocation struct {
	Name string `json:"name"`
	Path string `json:"path,omitempty"`
}

SkillInvocation records a skill invoked during an agent session.

type SkillInvocationGraderParameters

type SkillInvocationGraderParameters struct {
	RequiredSkills []string                    `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
	Mode           SkillInvocationMatchingMode `yaml:"mode,omitempty" json:"mode,omitempty"`
	AllowExtra     *bool                       `yaml:"allow_extra,omitempty" json:"allow_extra,omitempty"`
}

type SkillInvocationMatchingMode

type SkillInvocationMatchingMode string

SkillInvocationMatchingMode controls how actual skill invocations are compared to expected skills.

const (
	SkillMatchingModeExact    SkillInvocationMatchingMode = "exact_match"
	SkillMatchingModeInOrder  SkillInvocationMatchingMode = "in_order"
	SkillMatchingModeAnyOrder SkillInvocationMatchingMode = "any_order"
)

type SkillSummary

type SkillSummary struct {
	SkillName      string   `json:"skill_name"`
	Models         []string `json:"models"`
	PassRate       float64  `json:"pass_rate"`
	AggregateScore float64  `json:"aggregate_score"`
	OutputFiles    []string `json:"output_files"`
}

SkillSummary contains aggregated metrics for a single skill evaluation.

type SpecIdentity

type SpecIdentity struct {
	Name        string `yaml:"name" json:"name"`
	Description string `yaml:"description,omitempty" json:"description,omitempty"`
}

type StatisticalSummary

type StatisticalSummary struct {
	BootstrapCI    statistics.ConfidenceInterval `json:"bootstrap_ci"`
	IsSignificant  bool                          `json:"is_significant"`
	NormalizedGain *float64                      `json:"normalized_gain,omitempty"`
}

StatisticalSummary holds aggregate statistical data for the digest when trials > 1.

type Status

type Status string

Status represents the outcome status of a test or run.

const (
	StatusPassed  Status = "passed"
	StatusFailed  Status = "failed"
	StatusError   Status = "error"
	StatusSkipped Status = "skipped"
	// StatusNA is used in comparison reports when a task is not found in a result file.
	StatusNA Status = "n/a"
)

type TaskTranscript

type TaskTranscript struct {
	TaskID      string                   `json:"task_id"`
	TaskName    string                   `json:"task_name"`
	Status      Status                   `json:"status"`
	StartedAt   time.Time                `json:"started_at"`
	CompletedAt time.Time                `json:"completed_at"`
	DurationMs  int64                    `json:"duration_ms"`
	Prompt      string                   `json:"prompt"`
	FinalOutput string                   `json:"final_output"`
	Transcript  []TranscriptEvent        `json:"transcript"`
	Validations map[string]GraderResults `json:"validations,omitempty"`
	Session     SessionDigest            `json:"session"`
	ErrorMsg    string                   `json:"error_msg,omitempty"`
}

TaskTranscript is the per-task JSON file written to the transcript directory.

type TestCase

type TestCase struct {
	Active      *bool             `yaml:"enabled,omitempty" json:"active,omitempty"`
	ContextRoot string            `yaml:"context_dir,omitempty" json:"context_root,omitempty"`
	DisplayName string            `yaml:"name" json:"display_name"`
	Expectation TestExpectation   `yaml:"expected,omitempty" json:"expectation,omitempty"`
	Stimulus    TestStimulus      `yaml:"inputs" json:"stimulus"`
	Summary     string            `yaml:"description,omitempty" json:"summary,omitempty"`
	Tags        []string          `yaml:"tags,omitempty" json:"labels,omitempty"`
	TestID      string            `yaml:"id" json:"test_id"`
	TimeoutSec  *int              `yaml:"timeout_seconds,omitempty" json:"timeout_sec,omitempty"`
	Validators  []ValidatorInline `yaml:"graders,omitempty" json:"validators,omitempty"`
}

TestCase represents a single evaluation test.

func LoadTestCase

func LoadTestCase(path string) (*TestCase, error)

LoadTestCase loads a test case from YAML.

type TestExpectation

type TestExpectation struct {
	OutcomeSpecs    []OutcomeSpec  `yaml:"outcomes,omitempty" json:"outcome_specs,omitempty"`
	ToolPatterns    map[string]any `yaml:"tool_calls,omitempty" json:"tool_patterns,omitempty"`
	BehaviorRules   BehaviorRules  `yaml:"behavior,omitempty" json:"behavior_rules,omitempty"`
	MustInclude     []string       `yaml:"output_contains,omitempty" json:"must_include,omitempty"`
	MustExclude     []string       `yaml:"output_not_contains,omitempty" json:"must_exclude,omitempty"`
	ExpectedTrigger *bool          `yaml:"should_trigger,omitempty" json:"expected_trigger,omitempty"`
}

TestExpectation defines expected outcomes.

type TestOutcome

type TestOutcome struct {
	TestID      string             `json:"test_id"`
	DisplayName string             `json:"display_name"`
	Group       string             `json:"group,omitempty"`
	Status      Status             `json:"status"`
	Runs        []RunResult        `json:"runs"`
	Stats       *TestStats         `json:"stats,omitempty"`
	SkillImpact *SkillImpactMetric `json:"skill_impact,omitempty"`
}

TestOutcome represents the result of one test case.

type TestStats

type TestStats struct {
	PassRate         float64 `json:"pass_rate"`
	FlakinessPercent float64 `json:"flakiness_percent"`
	PassedRuns       int     `json:"passed_runs"`
	FailedRuns       int     `json:"failed_runs"`
	ErrorRuns        int     `json:"error_runs"`
	TotalRuns        int     `json:"total_runs"`
	AvgScore         float64 `json:"avg_score"`
	AvgWeightedScore float64 `json:"avg_weighted_score"`
	MinScore         float64 `json:"min_score"`
	MaxScore         float64 `json:"max_score"`
	StdDevScore      float64 `json:"std_dev_score"`
	ScoreVariance    float64 `json:"score_variance"`
	CI95Lo           float64 `json:"ci95_lo"`
	CI95Hi           float64 `json:"ci95_hi"`
	Flaky            bool    `json:"flaky"`
	AvgDurationMs    int64   `json:"avg_duration_ms"`

	// Bootstrap confidence interval over weighted scores (populated when trials > 1)
	BootstrapCI   *statistics.ConfidenceInterval `json:"bootstrap_ci,omitempty"`
	IsSignificant *bool                          `json:"is_significant,omitempty"`
}

type TestStimulus

type TestStimulus struct {
	Message     string            `yaml:"prompt" json:"message"`
	Metadata    map[string]any    `yaml:"context,omitempty" json:"metadata,omitempty"`
	Resources   []ResourceRef     `yaml:"files,omitempty" json:"resources,omitempty"`
	Environment map[string]string `yaml:"environment,omitempty" json:"environment,omitempty"`
}

TestStimulus defines the input for a test.

type TextGraderParameters

type TextGraderParameters struct {
	// Contains lists substrings that must appear in the output (case-insensitive).
	Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`

	// NotContains lists substrings that must NOT appear in the output (case-insensitive).
	NotContains []string `yaml:"not_contains,omitempty" json:"not_contains,omitempty"`

	// ContainsCS lists substrings that must appear in the output (case-sensitive).
	ContainsCS []string `yaml:"contains_cs,omitempty" json:"contains_cs,omitempty"`

	// NotContainsCS lists substrings that must NOT appear in the output (case-sensitive).
	NotContainsCS []string `yaml:"not_contains_cs,omitempty" json:"not_contains_cs,omitempty"`

	// RegexMatch lists regex patterns that must match somewhere in the output.
	RegexMatch []string `yaml:"regex_match,omitempty" json:"regex_match,omitempty"`

	// RegexNotMatch lists regex patterns that must NOT match anywhere in the output.
	RegexNotMatch []string `yaml:"regex_not_match,omitempty" json:"regex_not_match,omitempty"`
}

TextGraderParameters holds the arguments for creating a text grader.

type ToolCall

type ToolCall struct {
	Name      string          `json:"name"`
	Arguments ToolCallArgs    `json:"arguments,omitempty"`
	Result    *copilot.Result `json:"result,omitempty"`
	Success   bool            `json:"success"`
}

ToolCall represents a tool invocation.

func FilterToolCalls

func FilterToolCalls(sessionEvents []copilot.SessionEvent) []ToolCall

FilterToolCalls walks the list of session events and correlates each tool-start event with its result to determine Success.

type ToolCallArgs

type ToolCallArgs struct {
	// these are filled out for file-based tools (view/edit)
	Path     string `json:"path"      mapstructure:"path"`
	FileText string `json:"file_text" mapstructure:"file_text"`

	// filled out for tools like bash or powershell
	Command     string `json:"command"     mapstructure:"command"`
	Description string `json:"description" mapstructure:"description"`

	// filled out for skill invocations
	Skill string `json:"skill" mapstructure:"skill"`
}

type ToolConstraintGraderParameters

type ToolConstraintGraderParameters struct {
	ExpectTools []ToolSpecParameters `yaml:"expect_tools,omitempty" json:"expect_tools,omitempty"`
	RejectTools []ToolSpecParameters `yaml:"reject_tools,omitempty" json:"reject_tools,omitempty"`
}

type ToolSpecParameters

type ToolSpecParameters struct {
	Tool           string `yaml:"tool" json:"tool"`
	CommandPattern string `yaml:"command_pattern,omitempty" json:"command_pattern,omitempty"`
	SkillPattern   string `yaml:"skill_pattern,omitempty" json:"skill_pattern,omitempty"`
	PathPattern    string `yaml:"path_pattern,omitempty" json:"path_pattern,omitempty"`
}

type TranscriptEvent

type TranscriptEvent struct {
	copilot.SessionEvent `json:"-"`
}

func (TranscriptEvent) MarshalJSON

func (te TranscriptEvent) MarshalJSON() ([]byte, error)

func (*TranscriptEvent) UnmarshalJSON

func (te *TranscriptEvent) UnmarshalJSON(data []byte) error

type TriggerHeuristicGraderParameters

type TriggerHeuristicGraderParameters struct {
	SkillPath string   `yaml:"skill_path" json:"skill_path"`
	Mode      string   `yaml:"mode" json:"mode"`
	Threshold *float64 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
}

TriggerHeuristicGraderParameters holds the arguments for creating a trigger heuristic grader.

type TriggerMetrics

type TriggerMetrics struct {
	TP        int     `json:"true_positives"`
	FP        int     `json:"false_positives"`
	TN        int     `json:"true_negatives"`
	FN        int     `json:"false_negatives"`
	Errors    int     `json:"errors,omitempty"`
	Precision float64 `json:"precision"`
	Recall    float64 `json:"recall"`
	F1        float64 `json:"f1"`
	Accuracy  float64 `json:"accuracy"`
}

TriggerMetrics holds classification metrics for trigger accuracy.

func ComputeTriggerMetrics

func ComputeTriggerMetrics(results []TriggerResult) *TriggerMetrics

ComputeTriggerMetrics calculates precision, recall, F1, and accuracy from a set of trigger classification results. Results are weighted by confidence: "high" (or empty) counts as 1.0, "medium" as 0.5. Returns nil when results is empty.

type TriggerResult

type TriggerResult struct {
	Prompt        string            `json:"prompt"`
	Confidence    string            `json:"confidence,omitempty"`
	ShouldTrigger bool              `json:"should_trigger"`
	DidTrigger    bool              `json:"did_trigger"`
	ErrorMsg      string            `json:"error_msg,omitempty"`
	FinalOutput   string            `json:"final_output,omitempty"`
	Transcript    []TranscriptEvent `json:"transcript,omitempty"`
	ToolCalls     []ToolCall        `json:"tool_calls,omitempty"`
	SessionID     string            `json:"session_id,omitempty"`
}

TriggerResult pairs an expected trigger label with the actual outcome.

type UsageStats

type UsageStats struct {
	Turns            int                   `json:"turns"`
	InputTokens      int                   `json:"input_tokens"`
	OutputTokens     int                   `json:"output_tokens"`
	CacheReadTokens  int                   `json:"cache_read_tokens"`
	CacheWriteTokens int                   `json:"cache_write_tokens"`
	PremiumRequests  float64               `json:"premium_requests"`
	ModelMetrics     map[string]ModelUsage `json:"model_metrics,omitempty"`
}

UsageStats holds token and premium request usage data from a Copilot SDK session.

func AggregateUsageStats

func AggregateUsageStats(stats []*UsageStats) *UsageStats

AggregateUsageStats sums usage across multiple UsageStats (e.g. across runs).

func (*UsageStats) IsZero

func (u *UsageStats) IsZero() bool

IsZero returns true if no usage data has been recorded.

type ValidatorInline

type ValidatorInline struct {
	Identifier string           `yaml:"name" json:"identifier"`
	Kind       GraderKind       `yaml:"type,omitempty" json:"kind,omitempty"`
	Checks     []string         `yaml:"assertions,omitempty" json:"checks,omitempty"`
	Rubric     string           `yaml:"rubric,omitempty" json:"rubric,omitempty"`
	Weight     float64          `yaml:"weight,omitempty" json:"weight,omitempty"`
	Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}

ValidatorInline is a validator embedded in a test case.

func (*ValidatorInline) EffectiveWeight added in v0.22.0

func (v *ValidatorInline) EffectiveWeight() float64

func (*ValidatorInline) UnmarshalYAML

func (v *ValidatorInline) UnmarshalYAML(node *yaml.Node) error

func (*ValidatorInline) Validate added in v0.26.0

func (v *ValidatorInline) Validate() error

Validate checks that the validator config has required fields for its type.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL