Documentation
¶
Index ¶
- func AllGraderKinds() []string
- func ComputeStdDev(values []float64) float64
- type ActionSequenceGraderParameters
- type ActionSequenceMatchingMode
- type BehaviorGraderParameters
- type BehaviorRules
- type BenchmarkSpec
- type Config
- type DiffExpectedFileParameters
- type DiffGraderParameters
- type EvaluationOutcome
- type FileContentPatternParameters
- type FileGraderParameters
- type GenericGraderParameters
- type GradeOutcome
- type GraderConfig
- type GraderKind
- type GraderParameters
- type GraderResults
- type GroupStats
- type InlineScriptGraderParameters
- type JSONSchemaGraderParameters
- type Language
- type MeasureResult
- type MeasurementDef
- type ModelScore
- type ModelUsage
- type MultiSkillSummary
- type OutcomeDigest
- type OutcomeSetup
- type OutcomeSpec
- type OverallSummary
- type PairwiseResult
- type ProgramGraderParameters
- type PromptGraderMode
- type PromptGraderParameters
- type Recommendation
- type RecommendationWeights
- type ResourceRef
- type RunResult
- type SessionDigest
- type SkillImpactMetric
- type SkillInvocation
- type SkillInvocationGraderParameters
- type SkillInvocationMatchingMode
- type SkillSummary
- type SpecIdentity
- type StatisticalSummary
- type Status
- type TaskTranscript
- type TestCase
- type TestExpectation
- type TestOutcome
- type TestStats
- type TestStimulus
- type TextGraderParameters
- type ToolCall
- type ToolCallArgs
- type ToolCallsGraderParameters
- type ToolConstraintGraderParameters
- type ToolSpecParameters
- type TranscriptEvent
- type TriggerHeuristicGraderParameters
- type TriggerMetrics
- type TriggerResult
- type UsageStats
- type ValidatorInline
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AllGraderKinds ¶
func AllGraderKinds() []string
func ComputeStdDev ¶
ComputeStdDev returns the population standard deviation for a slice of float64 values.
Types ¶
type ActionSequenceGraderParameters ¶
type ActionSequenceGraderParameters struct {
MatchingMode ActionSequenceMatchingMode `yaml:"matching_mode,omitempty" json:"matching_mode,omitempty"`
ExpectedActions []string `yaml:"expected_actions,omitempty" json:"expected_actions,omitempty"`
}
type ActionSequenceMatchingMode ¶
type ActionSequenceMatchingMode string
ActionSequenceMatchingMode controls how actual tool calls are compared to expected actions.
const ( ActionSequenceMatchingModeExact ActionSequenceMatchingMode = "exact_match" ActionSequenceMatchingModeInOrder ActionSequenceMatchingMode = "in_order_match" ActionSequenceMatchingModeAnyOrder ActionSequenceMatchingMode = "any_order_match" )
type BehaviorGraderParameters ¶
type BehaviorGraderParameters struct {
MaxToolCalls int `yaml:"max_tool_calls,omitempty" json:"max_tool_calls,omitempty"`
MaxTokens int `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
RequiredTools []string `yaml:"required_tools,omitempty" json:"required_tools,omitempty"`
ForbiddenTools []string `yaml:"forbidden_tools,omitempty" json:"forbidden_tools,omitempty"`
MaxDurationMS int64 `yaml:"max_duration_ms,omitempty" json:"max_duration_ms,omitempty"`
}
type BehaviorRules ¶
type BehaviorRules struct {
MaxToolInvocations int `yaml:"max_tool_calls,omitempty" json:"max_tool_invocations,omitempty"`
MaxRounds int `yaml:"max_iterations,omitempty" json:"max_rounds,omitempty"`
MaxTokens int `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
MaxResponseTimeMs int64 `yaml:"max_response_time_ms,omitempty" json:"max_response_time_ms,omitempty"`
MustUseTool []string `yaml:"required_tools,omitempty" json:"must_use_tool,omitempty"`
ForbidTool []string `yaml:"forbidden_tools,omitempty" json:"forbid_tool,omitempty"`
}
type BenchmarkSpec ¶
type BenchmarkSpec struct {
SpecIdentity `yaml:",inline"`
SkillName string `yaml:"skill"`
Version string `yaml:"version"`
Config Config `yaml:"config"`
Hooks hooks.HooksConfig `yaml:"hooks,omitempty"`
Inputs map[string]string `yaml:"inputs,omitempty" json:"inputs,omitempty"`
TasksFrom string `yaml:"tasks_from,omitempty" json:"tasks_from,omitempty"`
Range [2]int `yaml:"range,omitempty" json:"range,omitempty"`
Graders []GraderConfig `yaml:"graders"`
Metrics []MeasurementDef `yaml:"metrics"`
Tasks []string `yaml:"tasks"`
Baseline bool `yaml:"baseline,omitempty" json:"baseline,omitempty"`
}
BenchmarkSpec represents a complete evaluation specification
func LoadBenchmarkSpec ¶
func LoadBenchmarkSpec(path string) (*BenchmarkSpec, error)
LoadBenchmarkSpec loads a spec from a YAML file with strict validation.
Normally the schema validation will catch errors in the eval.yaml, but this also does strict YAML parsing to catch errors like unknown fields or type errors that the schema validation might miss.
func (*BenchmarkSpec) ResolveTestFiles ¶
func (s *BenchmarkSpec) ResolveTestFiles(basePath string) ([]string, error)
ResolveTestFiles expands glob patterns to actual test files
func (*BenchmarkSpec) Validate ¶
func (s *BenchmarkSpec) Validate() error
Validate checks that the spec is valid
type Config ¶
type Config struct {
TrialsPerTask int `yaml:"trials_per_task" json:"runs_per_test"`
TimeoutSec int `yaml:"timeout_seconds" json:"timeout_sec"`
Concurrent bool `yaml:"parallel" json:"concurrent"`
Workers int `yaml:"workers,omitempty" json:"workers,omitempty"`
StopOnError bool `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
EngineType string `yaml:"executor" json:"engine_type"`
ModelID string `yaml:"model" json:"model_id"`
SkillPaths []string `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
ServerConfigs map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
MaxAttempts int `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
GroupBy string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
JudgeModel string `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
}
Config controls execution behavior
type DiffExpectedFileParameters ¶
type DiffExpectedFileParameters struct {
// Path is the workspace-relative path to the file being checked.
Path string `yaml:"path" json:"path"`
// Snapshot is the path (relative to context/fixtures dir) of the expected file content.
// When set, the workspace file must match this snapshot exactly.
Snapshot string `yaml:"snapshot,omitempty" json:"snapshot,omitempty"`
// Contains lists line fragments that must appear in the workspace file.
// Prefixed with "+" means the line must be present; "-" means it must be absent.
Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
}
DiffExpectedFileParameters defines a single file expectation for the diff grader. Either Snapshot or Contains (or both) must be specified.
type DiffGraderParameters ¶
type DiffGraderParameters struct {
ExpectedFiles []DiffExpectedFileParameters `yaml:"expected_files,omitempty" json:"expected_files,omitempty"`
ContextDir string `yaml:"context_dir,omitempty" json:"context_dir,omitempty"`
UpdateSnapshots bool `yaml:"update_snapshots,omitempty" json:"update_snapshots,omitempty"`
}
type EvaluationOutcome ¶
type EvaluationOutcome struct {
RunID string `json:"eval_id"`
SkillTested string `json:"skill"`
BenchName string `json:"eval_name"`
Timestamp time.Time `json:"timestamp"`
Setup OutcomeSetup `json:"config"`
Digest OutcomeDigest `json:"summary"`
Measures map[string]MeasureResult `json:"metrics"`
TestOutcomes []TestOutcome `json:"tasks"`
TriggerMetrics *TriggerMetrics `json:"trigger_metrics,omitempty"`
TriggerResults []TriggerResult `json:"trigger_results,omitempty"`
Metadata map[string]any `json:"metadata,omitempty"`
IsBaseline bool `json:"is_baseline,omitempty"`
BaselineOutcome *EvaluationOutcome `json:"baseline_outcome,omitempty"`
}
EvaluationOutcome represents the complete result of an evaluation run
type FileGraderParameters ¶
type FileGraderParameters struct {
MustExist []string `yaml:"must_exist,omitempty" json:"must_exist,omitempty"`
MustNotExist []string `yaml:"must_not_exist,omitempty" json:"must_not_exist,omitempty"`
ContentPatterns []FileContentPatternParameters `yaml:"content_patterns,omitempty" json:"content_patterns,omitempty"`
}
type GenericGraderParameters ¶
GenericGraderParameters is used for unknown kinds to preserve raw config values.
type GradeOutcome ¶ added in v0.22.0
type GraderConfig ¶
type GraderConfig struct {
Kind GraderKind `yaml:"type" json:"kind"`
Identifier string `yaml:"name" json:"identifier"`
ScriptPath string `yaml:"script,omitempty" json:"script_path,omitempty"`
Rubric string `yaml:"rubric,omitempty" json:"rubric,omitempty"`
ModelID string `yaml:"model,omitempty" json:"model_id,omitempty"`
Weight float64 `yaml:"weight,omitempty" json:"weight,omitempty"`
Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}
GraderConfig defines a validator/grader
func (*GraderConfig) EffectiveWeight ¶
func (g *GraderConfig) EffectiveWeight() float64
EffectiveWeight returns the grader weight, defaulting to 1.0 if unset.
func (*GraderConfig) UnmarshalYAML ¶
func (g *GraderConfig) UnmarshalYAML(node *yaml.Node) error
func (*GraderConfig) Validate ¶ added in v0.26.0
func (g *GraderConfig) Validate() error
Validate checks that the grader config has required fields for its type.
type GraderKind ¶
type GraderKind string
GraderKind identifies the type of grader (e.g. text, file, code).
const ( GraderKindInlineScript GraderKind = "code" GraderKindPrompt GraderKind = "prompt" GraderKindText GraderKind = "text" GraderKindFile GraderKind = "file" GraderKindJSONSchema GraderKind = "json_schema" GraderKindProgram GraderKind = "program" GraderKindBehavior GraderKind = "behavior" GraderKindActionSequence GraderKind = "action_sequence" GraderKindSkillInvocation GraderKind = "skill_invocation" GraderKindTrigger GraderKind = "trigger" GraderKindDiff GraderKind = "diff" GraderKindToolConstraint GraderKind = "tool_constraint" GraderKindToolCalls GraderKind = "tool_calls" )
type GraderParameters ¶
type GraderParameters interface {
// contains filtered or unexported methods
}
GraderParameters is a polymorphic grader config payload decoded from YAML based on GraderKind.
type GraderResults ¶
type GraderResults struct {
Name string `json:"identifier"`
Type GraderKind `json:"type"`
Score float64 `json:"score"`
Weight float64 `json:"weight"`
Passed bool `json:"passed"`
Feedback string `json:"feedback"`
Details map[string]any `json:"details,omitempty"`
DurationMs int64 `json:"duration_ms"`
}
type GroupStats ¶
type GroupStats struct {
Name string `json:"name"`
Passed int `json:"passed"`
Total int `json:"total"`
AvgScore float64 `json:"avg_score"`
}
GroupStats holds aggregate statistics for a group of test outcomes.
type JSONSchemaGraderParameters ¶
type JSONSchemaGraderParameters struct {
// Schema is an inline JSON schema object used for validation.
Schema map[string]any `yaml:"schema,omitempty" json:"schema,omitempty"`
// SchemaFile is a path to a JSON schema file. Used when Schema is not provided.
SchemaFile string `yaml:"schema_file,omitempty" json:"schema_file,omitempty"`
}
JSONSchemaGraderParameters holds the arguments for creating a JSON schema grader.
type MeasureResult ¶
type MeasurementDef ¶
type MeasurementDef struct {
Identifier string `yaml:"name" json:"identifier"`
Weight float64 `yaml:"weight" json:"weight"`
Threshold float64 `yaml:"threshold" json:"threshold"`
Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
Desc string `yaml:"description,omitempty" json:"desc,omitempty"`
}
MeasurementDef defines a metric
type ModelScore ¶
type ModelScore struct {
ModelID string `json:"model_id"`
HeuristicScore float64 `json:"heuristic_score"`
Rank int `json:"rank"`
Scores map[string]float64 `json:"component_scores,omitempty"`
}
ModelScore holds the heuristic score and rank for a single model.
type ModelUsage ¶
type ModelUsage struct {
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
CacheReadTokens int `json:"cache_read_tokens"`
CacheWriteTokens int `json:"cache_write_tokens"`
RequestCount float64 `json:"request_count"`
RequestCost float64 `json:"request_cost"`
}
ModelUsage holds per-model token and request usage.
type MultiSkillSummary ¶
type MultiSkillSummary struct {
Timestamp time.Time `json:"timestamp"`
Skills []SkillSummary `json:"skills"`
Overall OverallSummary `json:"overall"`
}
MultiSkillSummary aggregates results across multiple skill evaluations.
type OutcomeDigest ¶
type OutcomeDigest struct {
TotalTests int `json:"total_tests"`
Succeeded int `json:"succeeded"`
Failed int `json:"failed"`
Errors int `json:"errors"`
Skipped int `json:"skipped"`
SuccessRate float64 `json:"success_rate"`
AggregateScore float64 `json:"aggregate_score"`
WeightedScore float64 `json:"weighted_score"`
MinScore float64 `json:"min_score"`
MaxScore float64 `json:"max_score"`
StdDev float64 `json:"std_dev"`
DurationMs int64 `json:"duration_ms"`
Groups []GroupStats `json:"groups,omitempty"`
Usage *UsageStats `json:"usage,omitempty"`
// Statistical summary populated when trials_per_task > 1
Statistics *StatisticalSummary `json:"statistics,omitempty"`
}
type OutcomeSetup ¶
type OutcomeSpec ¶
type OverallSummary ¶
type OverallSummary struct {
TotalSkills int `json:"total_skills"`
TotalModels int `json:"total_models"`
AvgPassRate float64 `json:"avg_pass_rate"`
AvgAggregateScore float64 `json:"avg_aggregate_score"`
}
OverallSummary contains cross-skill aggregated metrics.
type PairwiseResult ¶
type PairwiseResult struct {
Winner string `json:"winner"` // "baseline", "skill", or "tie"
Magnitude string `json:"magnitude"` // "much-better", "slightly-better", "equal", etc.
Reasoning string `json:"reasoning"`
PositionConsistent bool `json:"position_consistent"` // true if result held after position swap
}
PairwiseResult captures the outcome of a pairwise LLM judge comparison.
type ProgramGraderParameters ¶
type PromptGraderMode ¶
type PromptGraderMode string
const ( PromptGraderModeIndependent PromptGraderMode = "independent" PromptGraderModePairwise PromptGraderMode = "pairwise" )
type PromptGraderParameters ¶
type PromptGraderParameters struct {
Prompt string `yaml:"prompt,omitempty" json:"prompt,omitempty"`
Model string `yaml:"model,omitempty" json:"model,omitempty"`
ContinueSession bool `yaml:"continue_session,omitempty" json:"continue_session,omitempty"`
Mode PromptGraderMode `yaml:"mode,omitempty" json:"mode,omitempty"`
}
type Recommendation ¶
type Recommendation struct {
RecommendedModel string `json:"recommended_model"`
HeuristicScore float64 `json:"heuristic_score"`
Reason string `json:"reason"`
WinnerMarginPct float64 `json:"winner_margin_pct"`
Weights RecommendationWeights `json:"weights"`
ModelScores []ModelScore `json:"all_models"`
}
Recommendation represents a heuristic recommendation for the best model across a multi-model evaluation run.
type RecommendationWeights ¶
type RecommendationWeights struct {
AggregateScore float64 `json:"aggregate_score"`
PassRate float64 `json:"pass_rate"`
Consistency float64 `json:"consistency"`
Speed float64 `json:"speed"`
}
RecommendationWeights defines the weighting scheme for heuristic scoring.
type ResourceRef ¶
type ResourceRef struct {
Location string `yaml:"path,omitempty" json:"location,omitempty"`
Body string `yaml:"content,omitempty" json:"body,omitempty"`
}
ResourceRef points to a file or inline content
type RunResult ¶
type RunResult struct {
RunNumber int `json:"run_number"`
Attempts int `json:"attempts"`
// Status contains the overall status of the run.
// NOTE: if Status == [StatusError], then [ErrorMsg] will be set to the
// message from the error.
Status Status `json:"status"`
DurationMs int64 `json:"duration_ms"`
Validations map[string]GraderResults `json:"validations"`
SessionDigest SessionDigest `json:"session_digest"`
Transcript []TranscriptEvent `json:"transcript,omitempty"`
FinalOutput string `json:"final_output"`
ErrorMsg string `json:"error_msg,omitempty"`
SkillInvocations []SkillInvocation `json:"skill_invocations,omitempty"`
}
RunResult is the result of a single run/trial
func (*RunResult) AllValidationsPassed ¶
AllValidationsPassed reports whether all validations passed.
func (*RunResult) ComputeRunScore ¶
ComputeRunScore calculates the unweighted average score across all validations (kept for backward compatibility).
func (*RunResult) ComputeWeightedRunScore ¶
ComputeWeightedRunScore calculates the weighted composite score (0.0–1.0) using each grader's Weight field. If all weights are zero, falls back to simple average.
type SessionDigest ¶
type SkillImpactMetric ¶
type SkillImpactMetric struct {
PassRateWithSkills float64 `json:"pass_rate_with_skills"`
PassRateBaseline float64 `json:"pass_rate_baseline"`
Delta float64 `json:"delta"`
PercentChange float64 `json:"percent_change"`
Pairwise *PairwiseResult `json:"pairwise,omitempty"`
}
SkillImpactMetric represents A/B comparison for a single task
type SkillInvocation ¶ added in v0.22.0
SkillInvocation records a skill invoked during an agent session.
type SkillInvocationGraderParameters ¶
type SkillInvocationGraderParameters struct {
RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
Mode SkillInvocationMatchingMode `yaml:"mode,omitempty" json:"mode,omitempty"`
AllowExtra *bool `yaml:"allow_extra,omitempty" json:"allow_extra,omitempty"`
}
type SkillInvocationMatchingMode ¶
type SkillInvocationMatchingMode string
SkillInvocationMatchingMode controls how actual skill invocations are compared to expected skills.
const ( SkillMatchingModeExact SkillInvocationMatchingMode = "exact_match" SkillMatchingModeInOrder SkillInvocationMatchingMode = "in_order" SkillMatchingModeAnyOrder SkillInvocationMatchingMode = "any_order" )
type SkillSummary ¶
type SkillSummary struct {
SkillName string `json:"skill_name"`
Models []string `json:"models"`
PassRate float64 `json:"pass_rate"`
AggregateScore float64 `json:"aggregate_score"`
OutputFiles []string `json:"output_files"`
}
SkillSummary contains aggregated metrics for a single skill evaluation.
type SpecIdentity ¶
type StatisticalSummary ¶
type StatisticalSummary struct {
BootstrapCI statistics.ConfidenceInterval `json:"bootstrap_ci"`
IsSignificant bool `json:"is_significant"`
NormalizedGain *float64 `json:"normalized_gain,omitempty"`
}
StatisticalSummary holds aggregate statistical data for the digest when trials > 1.
type TaskTranscript ¶
type TaskTranscript struct {
TaskID string `json:"task_id"`
TaskName string `json:"task_name"`
Status Status `json:"status"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
DurationMs int64 `json:"duration_ms"`
Prompt string `json:"prompt"`
FinalOutput string `json:"final_output"`
Transcript []TranscriptEvent `json:"transcript"`
Validations map[string]GraderResults `json:"validations,omitempty"`
Session SessionDigest `json:"session"`
ErrorMsg string `json:"error_msg,omitempty"`
}
TaskTranscript is the per-task JSON file written to the transcript directory.
type TestCase ¶
type TestCase struct {
Active *bool `yaml:"enabled,omitempty" json:"active,omitempty"`
ContextRoot string `yaml:"context_dir,omitempty" json:"context_root,omitempty"`
DisplayName string `yaml:"name" json:"display_name"`
Expectation TestExpectation `yaml:"expected,omitempty" json:"expectation,omitempty"`
Stimulus TestStimulus `yaml:"inputs" json:"stimulus"`
Summary string `yaml:"description,omitempty" json:"summary,omitempty"`
Tags []string `yaml:"tags,omitempty" json:"labels,omitempty"`
TestID string `yaml:"id" json:"test_id"`
TimeoutSec *int `yaml:"timeout_seconds,omitempty" json:"timeout_sec,omitempty"`
Validators []ValidatorInline `yaml:"graders,omitempty" json:"validators,omitempty"`
}
TestCase represents a single evaluation test
func LoadTestCase ¶
LoadTestCase loads a test case from YAML
type TestExpectation ¶
type TestExpectation struct {
OutcomeSpecs []OutcomeSpec `yaml:"outcomes,omitempty" json:"outcome_specs,omitempty"`
ToolPatterns map[string]any `yaml:"tool_calls,omitempty" json:"tool_patterns,omitempty"`
BehaviorRules BehaviorRules `yaml:"behavior,omitempty" json:"behavior_rules,omitempty"`
MustInclude []string `yaml:"output_contains,omitempty" json:"must_include,omitempty"`
MustExclude []string `yaml:"output_not_contains,omitempty" json:"must_exclude,omitempty"`
MayInclude []string `yaml:"output_contains_any,omitempty" json:"may_include,omitempty"`
ExpectedTrigger *bool `yaml:"should_trigger,omitempty" json:"expected_trigger,omitempty"`
}
TestExpectation defines expected outcomes
type TestOutcome ¶
type TestOutcome struct {
TestID string `json:"test_id"`
DisplayName string `json:"display_name"`
Group string `json:"group,omitempty"`
Status Status `json:"status"`
Runs []RunResult `json:"runs"`
Stats *TestStats `json:"stats,omitempty"`
SkillImpact *SkillImpactMetric `json:"skill_impact,omitempty"`
}
TestOutcome represents the result of one test case
type TestStats ¶
type TestStats struct {
PassRate float64 `json:"pass_rate"`
FlakinessPercent float64 `json:"flakiness_percent"`
PassedRuns int `json:"passed_runs"`
FailedRuns int `json:"failed_runs"`
ErrorRuns int `json:"error_runs"`
TotalRuns int `json:"total_runs"`
AvgScore float64 `json:"avg_score"`
AvgWeightedScore float64 `json:"avg_weighted_score"`
MinScore float64 `json:"min_score"`
MaxScore float64 `json:"max_score"`
StdDevScore float64 `json:"std_dev_score"`
ScoreVariance float64 `json:"score_variance"`
CI95Lo float64 `json:"ci95_lo"`
CI95Hi float64 `json:"ci95_hi"`
Flaky bool `json:"flaky"`
AvgDurationMs int64 `json:"avg_duration_ms"`
// Bootstrap confidence interval over weighted scores (populated when trials > 1)
BootstrapCI *statistics.ConfidenceInterval `json:"bootstrap_ci,omitempty"`
IsSignificant *bool `json:"is_significant,omitempty"`
}
type TestStimulus ¶
type TestStimulus struct {
Message string `yaml:"prompt" json:"message"`
MessageFile string `yaml:"prompt_file,omitempty" json:"message_file,omitempty"`
Metadata map[string]any `yaml:"context,omitempty" json:"metadata,omitempty"`
Resources []ResourceRef `yaml:"files,omitempty" json:"resources,omitempty"`
Environment map[string]string `yaml:"environment,omitempty" json:"environment,omitempty"`
FollowUps []string `yaml:"follow_up_prompts,omitempty" json:"follow_ups,omitempty"`
}
TestStimulus defines the input for a test
type TextGraderParameters ¶
type TextGraderParameters struct {
// Contains lists substrings that must appear in the output (case-insensitive).
Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
// NotContains lists substrings that must NOT appear in the output (case-insensitive).
NotContains []string `yaml:"not_contains,omitempty" json:"not_contains,omitempty"`
// ContainsCS lists substrings that must appear in the output (case-sensitive).
ContainsCS []string `yaml:"contains_cs,omitempty" json:"contains_cs,omitempty"`
// NotContainsCS lists substrings that must NOT appear in the output (case-sensitive).
NotContainsCS []string `yaml:"not_contains_cs,omitempty" json:"not_contains_cs,omitempty"`
// RegexMatch lists regex patterns that must match somewhere in the output.
RegexMatch []string `yaml:"regex_match,omitempty" json:"regex_match,omitempty"`
// RegexNotMatch lists regex patterns that must NOT match anywhere in the output.
RegexNotMatch []string `yaml:"regex_not_match,omitempty" json:"regex_not_match,omitempty"`
}
TextGraderParameters holds the arguments for creating a text grader.
type ToolCall ¶
type ToolCall struct {
Name string `json:"name"`
Arguments ToolCallArgs `json:"arguments,omitempty"`
Result *copilot.Result `json:"result,omitempty"`
Success bool `json:"success"`
}
ToolCall represents a tool invocation
func FilterToolCalls ¶
func FilterToolCalls(sessionEvents []copilot.SessionEvent) []ToolCall
FilterToolCalls goes through the list of session events and correlates each tool call's start event with its result in order to populate the Success field.
type ToolCallArgs ¶
type ToolCallArgs struct {
// these are filled out for file-based tools (view/edit)
Path string `json:"path" mapstructure:"path"`
FileText string `json:"file_text" mapstructure:"file_text"`
// filled out for tools like bash or powershell
Command string `json:"command" mapstructure:"command"`
Description string `json:"description" mapstructure:"description"`
// filled out for skill invocations
Skill string `json:"skill" mapstructure:"skill"`
}
type ToolCallsGraderParameters ¶ added in v0.27.0
type ToolCallsGraderParameters struct {
RequiredTools []string `yaml:"required_tools,omitempty" json:"required_tools,omitempty"`
ForbiddenTools []string `yaml:"forbidden_tools,omitempty" json:"forbidden_tools,omitempty"`
MinCalls *int `yaml:"min_calls,omitempty" json:"min_calls,omitempty"`
MaxCalls *int `yaml:"max_calls,omitempty" json:"max_calls,omitempty"`
}
ToolCallsGraderParameters validates which tools were called during a session.
type ToolConstraintGraderParameters ¶
type ToolConstraintGraderParameters struct {
ExpectTools []ToolSpecParameters `yaml:"expect_tools,omitempty" json:"expect_tools,omitempty"`
RejectTools []ToolSpecParameters `yaml:"reject_tools,omitempty" json:"reject_tools,omitempty"`
}
type ToolSpecParameters ¶
type ToolSpecParameters struct {
Tool string `yaml:"tool" json:"tool"`
CommandPattern string `yaml:"command_pattern,omitempty" json:"command_pattern,omitempty"`
SkillPattern string `yaml:"skill_pattern,omitempty" json:"skill_pattern,omitempty"`
PathPattern string `yaml:"path_pattern,omitempty" json:"path_pattern,omitempty"`
}
type TranscriptEvent ¶
type TranscriptEvent struct {
copilot.SessionEvent `json:"-"`
}
func (TranscriptEvent) MarshalJSON ¶
func (te TranscriptEvent) MarshalJSON() ([]byte, error)
func (*TranscriptEvent) UnmarshalJSON ¶
func (te *TranscriptEvent) UnmarshalJSON(data []byte) error
type TriggerHeuristicGraderParameters ¶
type TriggerHeuristicGraderParameters struct {
SkillPath string `yaml:"skill_path" json:"skill_path"`
Mode string `yaml:"mode" json:"mode"`
Threshold *float64 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
}
TriggerHeuristicGraderParameters holds the arguments for creating a trigger heuristic grader.
type TriggerMetrics ¶
type TriggerMetrics struct {
TP int `json:"true_positives"`
FP int `json:"false_positives"`
TN int `json:"true_negatives"`
FN int `json:"false_negatives"`
Errors int `json:"errors,omitempty"`
Precision float64 `json:"precision"`
Recall float64 `json:"recall"`
F1 float64 `json:"f1"`
Accuracy float64 `json:"accuracy"`
}
TriggerMetrics holds classification metrics for trigger accuracy.
func ComputeTriggerMetrics ¶
func ComputeTriggerMetrics(results []TriggerResult) *TriggerMetrics
ComputeTriggerMetrics calculates precision, recall, F1, and accuracy from a set of trigger classification results. Results are weighted by confidence: "high" (or empty) counts as 1.0, "medium" as 0.5. Returns nil when results is empty.
type TriggerResult ¶
type TriggerResult struct {
Prompt string `json:"prompt"`
Confidence string `json:"confidence,omitempty"`
ShouldTrigger bool `json:"should_trigger"`
DidTrigger bool `json:"did_trigger"`
ErrorMsg string `json:"error_msg,omitempty"`
FinalOutput string `json:"final_output,omitempty"`
Transcript []TranscriptEvent `json:"transcript,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
SessionID string `json:"session_id,omitempty"`
}
TriggerResult pairs an expected trigger label with the actual outcome.
type UsageStats ¶
type UsageStats struct {
Turns int `json:"turns"`
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
CacheReadTokens int `json:"cache_read_tokens"`
CacheWriteTokens int `json:"cache_write_tokens"`
PremiumRequests float64 `json:"premium_requests"`
ModelMetrics map[string]ModelUsage `json:"model_metrics,omitempty"`
}
UsageStats holds token and premium request usage data from a Copilot SDK session.
func AggregateUsageStats ¶
func AggregateUsageStats(stats []*UsageStats) *UsageStats
AggregateUsageStats sums usage across multiple UsageStats (e.g. across runs).
func (*UsageStats) IsZero ¶
func (u *UsageStats) IsZero() bool
IsZero returns true if no usage data has been recorded.
type ValidatorInline ¶
type ValidatorInline struct {
Identifier string `yaml:"name" json:"identifier"`
Kind GraderKind `yaml:"type,omitempty" json:"kind,omitempty"`
Checks []string `yaml:"assertions,omitempty" json:"checks,omitempty"`
Rubric string `yaml:"rubric,omitempty" json:"rubric,omitempty"`
Weight float64 `yaml:"weight,omitempty" json:"weight,omitempty"`
Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}
ValidatorInline is a validator embedded in a test case
func (*ValidatorInline) EffectiveWeight ¶ added in v0.22.0
func (v *ValidatorInline) EffectiveWeight() float64
func (*ValidatorInline) UnmarshalYAML ¶
func (v *ValidatorInline) UnmarshalYAML(node *yaml.Node) error
func (*ValidatorInline) Validate ¶ added in v0.26.0
func (v *ValidatorInline) Validate() error
Validate checks that the validator config has required fields for its type.