models

package
v0.21.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 12, 2026 License: MIT Imports: 13 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AllGraderKinds

func AllGraderKinds() []string

func ComputeStdDev

func ComputeStdDev(values []float64) float64

ComputeStdDev returns the population standard deviation for a slice of float64 values.

Types

type ActionSequenceGraderParameters

type ActionSequenceGraderParameters struct {
	MatchingMode    ActionSequenceMatchingMode `yaml:"matching_mode,omitempty" json:"matching_mode,omitempty"`
	ExpectedActions []string                   `yaml:"expected_actions,omitempty" json:"expected_actions,omitempty"`
}

type ActionSequenceMatchingMode

type ActionSequenceMatchingMode string

ActionSequenceMatchingMode controls how actual tool calls are compared to expected actions.

const (
	ActionSequenceMatchingModeExact    ActionSequenceMatchingMode = "exact_match"
	ActionSequenceMatchingModeInOrder  ActionSequenceMatchingMode = "in_order_match"
	ActionSequenceMatchingModeAnyOrder ActionSequenceMatchingMode = "any_order_match"
)

type BehaviorGraderParameters

type BehaviorGraderParameters struct {
	MaxToolCalls   int      `yaml:"max_tool_calls,omitempty" json:"max_tool_calls,omitempty"`
	MaxTokens      int      `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
	RequiredTools  []string `yaml:"required_tools,omitempty" json:"required_tools,omitempty"`
	ForbiddenTools []string `yaml:"forbidden_tools,omitempty" json:"forbidden_tools,omitempty"`
	MaxDurationMS  int64    `yaml:"max_duration_ms,omitempty" json:"max_duration_ms,omitempty"`
}

type BehaviorRules

type BehaviorRules struct {
	MaxToolInvocations int      `yaml:"max_tool_calls,omitempty" json:"max_tool_invocations,omitempty"`
	MaxRounds          int      `yaml:"max_iterations,omitempty" json:"max_rounds,omitempty"`
	MaxTokens          int      `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
	MustUseTool        []string `yaml:"required_tools,omitempty" json:"must_use_tool,omitempty"`
	ForbidTool         []string `yaml:"forbidden_tools,omitempty" json:"forbid_tool,omitempty"`
}

type BenchmarkSpec

type BenchmarkSpec struct {
	SpecIdentity `yaml:",inline"`
	SkillName    string            `yaml:"skill"`
	Version      string            `yaml:"version"`
	Config       Config            `yaml:"config"`
	Hooks        hooks.HooksConfig `yaml:"hooks,omitempty"`
	Inputs       map[string]string `yaml:"inputs,omitempty" json:"inputs,omitempty"`
	TasksFrom    string            `yaml:"tasks_from,omitempty" json:"tasks_from,omitempty"`
	Range        [2]int            `yaml:"range,omitempty" json:"range,omitempty"`
	Graders      []GraderConfig    `yaml:"graders"`
	Metrics      []MeasurementDef  `yaml:"metrics"`
	Tasks        []string          `yaml:"tasks"`
	Baseline     bool              `yaml:"baseline,omitempty" json:"baseline,omitempty"`
}

BenchmarkSpec represents a complete evaluation specification

func LoadBenchmarkSpec

func LoadBenchmarkSpec(path string) (*BenchmarkSpec, error)

LoadBenchmarkSpec loads a spec from a YAML file

func (*BenchmarkSpec) ResolveTestFiles

func (s *BenchmarkSpec) ResolveTestFiles(basePath string) ([]string, error)

ResolveTestFiles expands glob patterns to actual test files

func (*BenchmarkSpec) Validate

func (s *BenchmarkSpec) Validate() error

Validate checks that the spec is valid

type Config

type Config struct {
	RunsPerTest    int            `yaml:"trials_per_task" json:"runs_per_test"`
	TimeoutSec     int            `yaml:"timeout_seconds" json:"timeout_sec"`
	Concurrent     bool           `yaml:"parallel" json:"concurrent"`
	Workers        int            `yaml:"max_workers,omitempty" json:"workers,omitempty"`
	StopOnError    bool           `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
	EngineType     string         `yaml:"executor" json:"engine_type"`
	ModelID        string         `yaml:"model" json:"model_id"`
	SkillPaths     []string       `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
	RequiredSkills []string       `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
	ServerConfigs  map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
	MaxAttempts    int            `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
	GroupBy        string         `yaml:"group_by,omitempty" json:"group_by,omitempty"`
	JudgeModel     string         `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
}

Config controls execution behavior

type DiffExpectedFileParameters

type DiffExpectedFileParameters struct {
	// Path is the workspace-relative path to the file being checked.
	Path string `yaml:"path" json:"path"`

	// Snapshot is the path (relative to context/fixtures dir) of the expected file content.
	// When set, the workspace file must match this snapshot exactly.
	Snapshot string `yaml:"snapshot,omitempty" json:"snapshot,omitempty"`

	// Contains lists line fragments that must appear in the workspace file.
	// Prefixed with "+" means the line must be present; "-" means it must be absent.
	Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
}

DiffExpectedFileParameters defines a single file expectation for the diff grader. Either Snapshot or Contains (or both) must be specified.

type DiffGraderParameters

type DiffGraderParameters struct {
	ExpectedFiles   []DiffExpectedFileParameters `yaml:"expected_files,omitempty" json:"expected_files,omitempty"`
	ContextDir      string                       `yaml:"context_dir,omitempty" json:"context_dir,omitempty"`
	UpdateSnapshots bool                         `yaml:"update_snapshots,omitempty" json:"update_snapshots,omitempty"`
}

type EvaluationOutcome

type EvaluationOutcome struct {
	RunID           string                   `json:"eval_id"`
	SkillTested     string                   `json:"skill"`
	BenchName       string                   `json:"eval_name"`
	Timestamp       time.Time                `json:"timestamp"`
	Setup           OutcomeSetup             `json:"config"`
	Digest          OutcomeDigest            `json:"summary"`
	Measures        map[string]MeasureResult `json:"metrics"`
	TestOutcomes    []TestOutcome            `json:"tasks"`
	TriggerMetrics  *TriggerMetrics          `json:"trigger_metrics,omitempty"`
	TriggerResults  []TriggerResult          `json:"trigger_results,omitempty"`
	Metadata        map[string]any           `json:"metadata,omitempty"`
	IsBaseline      bool                     `json:"is_baseline,omitempty"`
	BaselineOutcome *EvaluationOutcome       `json:"baseline_outcome,omitempty"`
}

EvaluationOutcome represents the complete result of an evaluation run

type FileContentPatternParameters

type FileContentPatternParameters struct {
	Path         string   `yaml:"path" json:"path"`
	MustMatch    []string `yaml:"must_match,omitempty" json:"must_match,omitempty"`
	MustNotMatch []string `yaml:"must_not_match,omitempty" json:"must_not_match,omitempty"`
}

type FileGraderParameters

type FileGraderParameters struct {
	MustExist       []string                       `yaml:"must_exist,omitempty" json:"must_exist,omitempty"`
	MustNotExist    []string                       `yaml:"must_not_exist,omitempty" json:"must_not_exist,omitempty"`
	ContentPatterns []FileContentPatternParameters `yaml:"content_patterns,omitempty" json:"content_patterns,omitempty"`
}

type GenericGraderParameters

type GenericGraderParameters map[string]any

GenericGraderParameters is used for unknown kinds to preserve raw config values.

type GraderConfig

type GraderConfig struct {
	Kind       GraderKind       `yaml:"type" json:"kind"`
	Identifier string           `yaml:"name" json:"identifier"`
	ScriptPath string           `yaml:"script,omitempty" json:"script_path,omitempty"`
	Rubric     string           `yaml:"rubric,omitempty" json:"rubric,omitempty"`
	ModelID    string           `yaml:"model,omitempty" json:"model_id,omitempty"`
	Weight     float64          `yaml:"weight,omitempty" json:"weight,omitempty"`
	Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}

GraderConfig defines a validator/grader

func (*GraderConfig) EffectiveWeight

func (g *GraderConfig) EffectiveWeight() float64

EffectiveWeight returns the grader weight, defaulting to 1.0 if unset.

func (*GraderConfig) UnmarshalYAML

func (g *GraderConfig) UnmarshalYAML(node *yaml.Node) error

type GraderKind

type GraderKind string

GraderKind identifies the type of grader (e.g. regex, file, code).

const (
	GraderKindInlineScript    GraderKind = "code"
	GraderKindPrompt          GraderKind = "prompt"
	GraderKindText            GraderKind = "text"
	GraderKindFile            GraderKind = "file"
	GraderKindJSONSchema      GraderKind = "json_schema"
	GraderKindProgram         GraderKind = "program"
	GraderKindBehavior        GraderKind = "behavior"
	GraderKindActionSequence  GraderKind = "action_sequence"
	GraderKindSkillInvocation GraderKind = "skill_invocation"
	GraderKindTrigger         GraderKind = "trigger"
	GraderKindDiff            GraderKind = "diff"
	GraderKindToolConstraint  GraderKind = "tool_constraint"
)

type GraderParameters

type GraderParameters interface {
	// contains filtered or unexported methods
}

GraderParameters is a polymorphic grader config payload decoded from YAML based on GraderKind.

type GraderResults

type GraderResults struct {
	Name       string         `json:"identifier"`
	Type       GraderKind     `json:"type"`
	Score      float64        `json:"score"`
	Weight     float64        `json:"weight"`
	Passed     bool           `json:"passed"`
	Feedback   string         `json:"feedback"`
	Details    map[string]any `json:"details,omitempty"`
	DurationMs int64          `json:"duration_ms"`
}

type GroupStats

type GroupStats struct {
	Name     string  `json:"name"`
	Passed   int     `json:"passed"`
	Total    int     `json:"total"`
	AvgScore float64 `json:"avg_score"`
}

GroupStats holds aggregate statistics for a group of test outcomes.

type InlineScriptGraderParameters

type InlineScriptGraderParameters struct {
	Assertions []string `yaml:"assertions,omitempty" json:"assertions,omitempty"`

	// Language indicates which language the Assertions are written for. Defaults to [LanguagePython]
	Language Language `yaml:"language,omitempty" json:"language,omitempty"`
}

type JSONSchemaGraderParameters

type JSONSchemaGraderParameters struct {
	// Schema is an inline JSON schema object used for validation.
	Schema map[string]any `yaml:"schema,omitempty" json:"schema,omitempty"`

	// SchemaFile is a path to a JSON schema file. Used when Schema is not provided.
	SchemaFile string `yaml:"schema_file,omitempty" json:"schema_file,omitempty"`
}

JSONSchemaGraderParameters holds the arguments for creating a JSON schema grader.

type Language

type Language string
const (
	LanguagePython     Language = "python"
	LanguageJavascript Language = "javascript"
)

type MeasureResult

type MeasureResult struct {
	Identifier string         `json:"identifier"`
	Value      float64        `json:"value"`
	Threshold  float64        `json:"threshold"`
	Passed     bool           `json:"passed"`
	Weight     float64        `json:"weight"`
	Details    map[string]any `json:"details,omitempty"`
}

type MeasurementDef

type MeasurementDef struct {
	Identifier string  `yaml:"name" json:"identifier"`
	Weight     float64 `yaml:"weight" json:"weight"`
	Threshold  float64 `yaml:"threshold" json:"threshold"`
	Enabled    bool    `yaml:"enabled,omitempty" json:"enabled,omitempty"`
	Desc       string  `yaml:"description,omitempty" json:"desc,omitempty"`
}

MeasurementDef defines a metric

type ModelScore

type ModelScore struct {
	ModelID        string             `json:"model_id"`
	HeuristicScore float64            `json:"heuristic_score"`
	Rank           int                `json:"rank"`
	Scores         map[string]float64 `json:"component_scores,omitempty"`
}

ModelScore holds the heuristic score and rank for a single model.

type ModelUsage

type ModelUsage struct {
	InputTokens      int     `json:"input_tokens"`
	OutputTokens     int     `json:"output_tokens"`
	CacheReadTokens  int     `json:"cache_read_tokens"`
	CacheWriteTokens int     `json:"cache_write_tokens"`
	RequestCount     float64 `json:"request_count"`
	RequestCost      float64 `json:"request_cost"`
}

ModelUsage holds per-model token and request usage.

type MultiSkillSummary

type MultiSkillSummary struct {
	Timestamp time.Time      `json:"timestamp"`
	Skills    []SkillSummary `json:"skills"`
	Overall   OverallSummary `json:"overall"`
}

MultiSkillSummary aggregates results across multiple skill evaluations.

type OutcomeDigest

type OutcomeDigest struct {
	TotalTests     int          `json:"total_tests"`
	Succeeded      int          `json:"succeeded"`
	Failed         int          `json:"failed"`
	Errors         int          `json:"errors"`
	Skipped        int          `json:"skipped"`
	SuccessRate    float64      `json:"success_rate"`
	AggregateScore float64      `json:"aggregate_score"`
	WeightedScore  float64      `json:"weighted_score"`
	MinScore       float64      `json:"min_score"`
	MaxScore       float64      `json:"max_score"`
	StdDev         float64      `json:"std_dev"`
	DurationMs     int64        `json:"duration_ms"`
	Groups         []GroupStats `json:"groups,omitempty"`
	Usage          *UsageStats  `json:"usage,omitempty"`

	// Statistical summary populated when trials_per_task > 1
	Statistics *StatisticalSummary `json:"statistics,omitempty"`
}

type OutcomeSetup

type OutcomeSetup struct {
	RunsPerTest int    `json:"runs_per_test"`
	ModelID     string `json:"model_id"`
	EngineType  string `json:"engine_type"`
	TimeoutSec  int    `json:"timeout_sec"`
	JudgeModel  string `json:"judge_model,omitempty"`
}

type OutcomeSpec

type OutcomeSpec struct {
	Category  string `yaml:"type" json:"category"`
	Value     any    `yaml:"value,omitempty" json:"value,omitempty"`
	Predicate string `yaml:"condition,omitempty" json:"predicate,omitempty"`
}

type OverallSummary

type OverallSummary struct {
	TotalSkills       int     `json:"total_skills"`
	TotalModels       int     `json:"total_models"`
	AvgPassRate       float64 `json:"avg_pass_rate"`
	AvgAggregateScore float64 `json:"avg_aggregate_score"`
}

OverallSummary contains cross-skill aggregated metrics.

type PairwiseResult

type PairwiseResult struct {
	Winner             string `json:"winner"`    // "baseline", "skill", or "tie"
	Magnitude          string `json:"magnitude"` // "much-better", "slightly-better", "equal", etc.
	Reasoning          string `json:"reasoning"`
	PositionConsistent bool   `json:"position_consistent"` // true if result held after position swap
}

PairwiseResult captures the outcome of a pairwise LLM judge comparison.

type ProgramGraderParameters

type ProgramGraderParameters struct {
	Command string   `yaml:"command,omitempty" json:"command,omitempty"`
	Args    []string `yaml:"args,omitempty" json:"args,omitempty"`
	Timeout int      `yaml:"timeout,omitempty" json:"timeout,omitempty"`
}

type PromptGraderMode

type PromptGraderMode string
const (
	PromptGraderModeIndependent PromptGraderMode = "independent"
	PromptGraderModePairwise    PromptGraderMode = "pairwise"
)

type PromptGraderParameters

type PromptGraderParameters struct {
	Prompt          string           `yaml:"prompt,omitempty" json:"prompt,omitempty"`
	Model           string           `yaml:"model,omitempty" json:"model,omitempty"`
	ContinueSession bool             `yaml:"continue_session,omitempty" json:"continue_session,omitempty"`
	Mode            PromptGraderMode `yaml:"mode,omitempty" json:"mode,omitempty"`
}

type Recommendation

type Recommendation struct {
	RecommendedModel string                `json:"recommended_model"`
	HeuristicScore   float64               `json:"heuristic_score"`
	Reason           string                `json:"reason"`
	WinnerMarginPct  float64               `json:"winner_margin_pct"`
	Weights          RecommendationWeights `json:"weights"`
	ModelScores      []ModelScore          `json:"all_models"`
}

Recommendation represents a heuristic recommendation for the best model across a multi-model evaluation run.

type RecommendationWeights

type RecommendationWeights struct {
	AggregateScore float64 `json:"aggregate_score"`
	PassRate       float64 `json:"pass_rate"`
	Consistency    float64 `json:"consistency"`
	Speed          float64 `json:"speed"`
}

RecommendationWeights defines the weighting scheme for heuristic scoring.

type ResourceRef

type ResourceRef struct {
	Location string `yaml:"path,omitempty" json:"location,omitempty"`
	Body     string `yaml:"content,omitempty" json:"body,omitempty"`
}

ResourceRef points to a file or inline content

type RunResult

type RunResult struct {
	RunNumber int `json:"run_number"`
	Attempts  int `json:"attempts"`
	// Status contains the overall status of the run.
	// NOTE: if Status == [StatusError], then [ErrorMsg] will be set to the
	// message from the error.
	Status        Status                   `json:"status"`
	DurationMs    int64                    `json:"duration_ms"`
	Validations   map[string]GraderResults `json:"validations"`
	SessionDigest SessionDigest            `json:"session_digest"`
	Transcript    []TranscriptEvent        `json:"transcript,omitempty"`
	FinalOutput   string                   `json:"final_output"`
	ErrorMsg      string                   `json:"error_msg,omitempty"`
}

RunResult is the result of a single run/trial

func (*RunResult) AllValidationsPassed

func (r *RunResult) AllValidationsPassed() bool

AllValidationsPassed checks if all validations passed

func (*RunResult) ComputeRunScore

func (r *RunResult) ComputeRunScore() float64

ComputeRunScore calculates the average score across all validations (unweighted, for backward compat)

func (*RunResult) ComputeWeightedRunScore

func (r *RunResult) ComputeWeightedRunScore() float64

ComputeWeightedRunScore calculates the weighted composite score (0.0–1.0) using each grader's Weight field. If all weights are zero, falls back to simple average.

type SessionDigest

type SessionDigest struct {
	ToolCallCount int         `json:"tool_call_count"`
	ToolsUsed     []string    `json:"tools_used"`
	ToolCalls     []ToolCall  `json:"tool_calls,omitempty"`
	Errors        []string    `json:"errors"`
	Usage         *UsageStats `json:"usage,omitempty"`
	SessionID     string      `json:"session_id,omitempty"`
}

type SkillImpactMetric

type SkillImpactMetric struct {
	PassRateWithSkills float64         `json:"pass_rate_with_skills"`
	PassRateBaseline   float64         `json:"pass_rate_baseline"`
	Delta              float64         `json:"delta"`
	PercentChange      float64         `json:"percent_change"`
	Pairwise           *PairwiseResult `json:"pairwise,omitempty"`
}

SkillImpactMetric represents A/B comparison for a single task

type SkillInvocationGraderParameters

type SkillInvocationGraderParameters struct {
	RequiredSkills []string                    `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
	Mode           SkillInvocationMatchingMode `yaml:"mode,omitempty" json:"mode,omitempty"`
	AllowExtra     *bool                       `yaml:"allow_extra,omitempty" json:"allow_extra,omitempty"`
}

type SkillInvocationMatchingMode

type SkillInvocationMatchingMode string

SkillInvocationMatchingMode controls how actual skill invocations are compared to expected skills.

const (
	SkillMatchingModeExact    SkillInvocationMatchingMode = "exact_match"
	SkillMatchingModeInOrder  SkillInvocationMatchingMode = "in_order"
	SkillMatchingModeAnyOrder SkillInvocationMatchingMode = "any_order"
)

type SkillSummary

type SkillSummary struct {
	SkillName      string   `json:"skill_name"`
	Models         []string `json:"models"`
	PassRate       float64  `json:"pass_rate"`
	AggregateScore float64  `json:"aggregate_score"`
	OutputFiles    []string `json:"output_files"`
}

SkillSummary contains aggregated metrics for a single skill evaluation.

type SpecIdentity

type SpecIdentity struct {
	Name        string `yaml:"name" json:"name"`
	Description string `yaml:"description,omitempty" json:"description,omitempty"`
}

type StatisticalSummary

type StatisticalSummary struct {
	BootstrapCI    statistics.ConfidenceInterval `json:"bootstrap_ci"`
	IsSignificant  bool                          `json:"is_significant"`
	NormalizedGain *float64                      `json:"normalized_gain,omitempty"`
}

StatisticalSummary holds aggregate statistical data for the digest when trials > 1.

type Status

type Status string

Status represents the outcome status of a test or run.

const (
	StatusPassed Status = "passed"
	StatusFailed Status = "failed"
	StatusError  Status = "error"
	// StatusNA is used in comparison reports when a task is not found in a result file.
	StatusNA Status = "n/a"
)

type TaskTranscript

type TaskTranscript struct {
	TaskID      string                   `json:"task_id"`
	TaskName    string                   `json:"task_name"`
	Status      Status                   `json:"status"`
	StartedAt   time.Time                `json:"started_at"`
	CompletedAt time.Time                `json:"completed_at"`
	DurationMs  int64                    `json:"duration_ms"`
	Prompt      string                   `json:"prompt"`
	FinalOutput string                   `json:"final_output"`
	Transcript  []TranscriptEvent        `json:"transcript"`
	Validations map[string]GraderResults `json:"validations,omitempty"`
	Session     SessionDigest            `json:"session"`
	ErrorMsg    string                   `json:"error_msg,omitempty"`
}

TaskTranscript is the per-task JSON file written to the transcript directory.

type TestCase

type TestCase struct {
	Active      *bool             `yaml:"enabled,omitempty" json:"active,omitempty"`
	ContextRoot string            `yaml:"context_dir,omitempty" json:"context_root,omitempty"`
	DisplayName string            `yaml:"name" json:"display_name"`
	Expectation TestExpectation   `yaml:"expected,omitempty" json:"expectation,omitempty"`
	Stimulus    TestStimulus      `yaml:"inputs" json:"stimulus"`
	Summary     string            `yaml:"description,omitempty" json:"summary,omitempty"`
	Tags        []string          `yaml:"tags,omitempty" json:"labels,omitempty"`
	TestID      string            `yaml:"id" json:"test_id"`
	TimeoutSec  *int              `yaml:"timeout_seconds,omitempty" json:"timeout_sec,omitempty"`
	Validators  []ValidatorInline `yaml:"graders,omitempty" json:"validators,omitempty"`
}

TestCase represents a single evaluation test

func LoadTestCase

func LoadTestCase(path string) (*TestCase, error)

LoadTestCase loads a test case from YAML

type TestExpectation

type TestExpectation struct {
	OutcomeSpecs    []OutcomeSpec  `yaml:"outcomes,omitempty" json:"outcome_specs,omitempty"`
	ToolPatterns    map[string]any `yaml:"tool_calls,omitempty" json:"tool_patterns,omitempty"`
	BehaviorRules   BehaviorRules  `yaml:"behavior,omitempty" json:"behavior_rules,omitempty"`
	MustInclude     []string       `yaml:"output_contains,omitempty" json:"must_include,omitempty"`
	MustExclude     []string       `yaml:"output_not_contains,omitempty" json:"must_exclude,omitempty"`
	ExpectedTrigger *bool          `yaml:"should_trigger,omitempty" json:"expected_trigger,omitempty"`
}

TestExpectation defines expected outcomes

type TestOutcome

type TestOutcome struct {
	TestID      string             `json:"test_id"`
	DisplayName string             `json:"display_name"`
	Group       string             `json:"group,omitempty"`
	Status      Status             `json:"status"`
	Runs        []RunResult        `json:"runs"`
	Stats       *TestStats         `json:"stats,omitempty"`
	SkillImpact *SkillImpactMetric `json:"skill_impact,omitempty"`
}

TestOutcome represents the result of one test case

type TestStats

type TestStats struct {
	PassRate         float64 `json:"pass_rate"`
	FlakinessPercent float64 `json:"flakiness_percent"`
	PassedRuns       int     `json:"passed_runs"`
	FailedRuns       int     `json:"failed_runs"`
	ErrorRuns        int     `json:"error_runs"`
	TotalRuns        int     `json:"total_runs"`
	AvgScore         float64 `json:"avg_score"`
	AvgWeightedScore float64 `json:"avg_weighted_score"`
	MinScore         float64 `json:"min_score"`
	MaxScore         float64 `json:"max_score"`
	StdDevScore      float64 `json:"std_dev_score"`
	ScoreVariance    float64 `json:"score_variance"`
	CI95Lo           float64 `json:"ci95_lo"`
	CI95Hi           float64 `json:"ci95_hi"`
	Flaky            bool    `json:"flaky"`
	AvgDurationMs    int64   `json:"avg_duration_ms"`

	// Bootstrap confidence interval over weighted scores (populated when trials > 1)
	BootstrapCI   *statistics.ConfidenceInterval `json:"bootstrap_ci,omitempty"`
	IsSignificant *bool                          `json:"is_significant,omitempty"`
}

type TestStimulus

type TestStimulus struct {
	Message     string            `yaml:"prompt" json:"message"`
	Metadata    map[string]any    `yaml:"context,omitempty" json:"metadata,omitempty"`
	Resources   []ResourceRef     `yaml:"files,omitempty" json:"resources,omitempty"`
	Environment map[string]string `yaml:"environment,omitempty" json:"environment,omitempty"`
}

TestStimulus defines the input for a test

type TextGraderParameters

type TextGraderParameters struct {
	// Contains lists substrings that must appear in the output (case-insensitive).
	Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`

	// NotContains lists substrings that must NOT appear in the output (case-insensitive).
	NotContains []string `yaml:"not_contains,omitempty" json:"not_contains,omitempty"`

	// ContainsCS lists substrings that must appear in the output (case-sensitive).
	ContainsCS []string `yaml:"contains_cs,omitempty" json:"contains_cs,omitempty"`

	// NotContainsCS lists substrings that must NOT appear in the output (case-sensitive).
	NotContainsCS []string `yaml:"not_contains_cs,omitempty" json:"not_contains_cs,omitempty"`

	// RegexMatch lists regex patterns that must match somewhere in the output.
	RegexMatch []string `yaml:"regex_match,omitempty" json:"regex_match,omitempty"`

	// RegexNotMatch lists regex patterns that must NOT match anywhere in the output.
	RegexNotMatch []string `yaml:"regex_not_match,omitempty" json:"regex_not_match,omitempty"`
}

TextGraderParameters holds the arguments for creating a text grader.

type ToolCall

type ToolCall struct {
	Name      string          `json:"name"`
	Arguments ToolCallArgs    `json:"arguments,omitempty"`
	Result    *copilot.Result `json:"result,omitempty"`
	Success   bool            `json:"success"`
}

ToolCall represents a tool invocation

func FilterToolCalls

func FilterToolCalls(sessionEvents []copilot.SessionEvent) []ToolCall

FilterToolCalls goes through the list of session events and correlates each tool-start event with its completion result to populate the Success field.

type ToolCallArgs

type ToolCallArgs struct {
	// these are filled out for file-based tools (view/edit)
	Path     string `json:"path"      mapstructure:"path"`
	FileText string `json:"file_text" mapstructure:"file_text"`

	// filled out for tools like bash or powershell
	Command     string `json:"command"     mapstructure:"command"`
	Description string `json:"description" mapstructure:"description"`

	// filled out for skill invocations
	Skill string `json:"skill" mapstructure:"skill"`
}

type ToolConstraintGraderParameters

type ToolConstraintGraderParameters struct {
	ExpectTools []ToolSpecParameters `yaml:"expect_tools,omitempty" json:"expect_tools,omitempty"`
	RejectTools []ToolSpecParameters `yaml:"reject_tools,omitempty" json:"reject_tools,omitempty"`
}

type ToolSpecParameters

type ToolSpecParameters struct {
	Tool           string `yaml:"tool" json:"tool"`
	CommandPattern string `yaml:"command_pattern,omitempty" json:"command_pattern,omitempty"`
	SkillPattern   string `yaml:"skill_pattern,omitempty" json:"skill_pattern,omitempty"`
	PathPattern    string `yaml:"path_pattern,omitempty" json:"path_pattern,omitempty"`
}

type TranscriptEvent

type TranscriptEvent struct {
	copilot.SessionEvent `json:"-"`
}

func (TranscriptEvent) MarshalJSON

func (te TranscriptEvent) MarshalJSON() ([]byte, error)

func (*TranscriptEvent) UnmarshalJSON

func (te *TranscriptEvent) UnmarshalJSON(data []byte) error

type TriggerHeuristicGraderParameters

type TriggerHeuristicGraderParameters struct {
	SkillPath string   `yaml:"skill_path" json:"skill_path"`
	Mode      string   `yaml:"mode" json:"mode"`
	Threshold *float64 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
}

TriggerHeuristicGraderParameters holds the arguments for creating a trigger heuristic grader.

type TriggerMetrics

type TriggerMetrics struct {
	TP        int     `json:"true_positives"`
	FP        int     `json:"false_positives"`
	TN        int     `json:"true_negatives"`
	FN        int     `json:"false_negatives"`
	Errors    int     `json:"errors,omitempty"`
	Precision float64 `json:"precision"`
	Recall    float64 `json:"recall"`
	F1        float64 `json:"f1"`
	Accuracy  float64 `json:"accuracy"`
}

TriggerMetrics holds classification metrics for trigger accuracy.

func ComputeTriggerMetrics

func ComputeTriggerMetrics(results []TriggerResult) *TriggerMetrics

ComputeTriggerMetrics calculates precision, recall, F1, and accuracy from a set of trigger classification results. Results are weighted by confidence: "high" (or empty) counts as 1.0, "medium" as 0.5. Returns nil when results is empty.

type TriggerResult

type TriggerResult struct {
	Prompt        string            `json:"prompt"`
	Confidence    string            `json:"confidence,omitempty"`
	ShouldTrigger bool              `json:"should_trigger"`
	DidTrigger    bool              `json:"did_trigger"`
	ErrorMsg      string            `json:"error_msg,omitempty"`
	FinalOutput   string            `json:"final_output,omitempty"`
	Transcript    []TranscriptEvent `json:"transcript,omitempty"`
	ToolCalls     []ToolCall        `json:"tool_calls,omitempty"`
	SessionID     string            `json:"session_id,omitempty"`
}

TriggerResult pairs an expected trigger label with the actual outcome.

type UsageStats

type UsageStats struct {
	Turns            int                   `json:"turns"`
	InputTokens      int                   `json:"input_tokens"`
	OutputTokens     int                   `json:"output_tokens"`
	CacheReadTokens  int                   `json:"cache_read_tokens"`
	CacheWriteTokens int                   `json:"cache_write_tokens"`
	PremiumRequests  float64               `json:"premium_requests"`
	ModelMetrics     map[string]ModelUsage `json:"model_metrics,omitempty"`
}

UsageStats holds token and premium request usage data from a Copilot SDK session.

func AggregateUsageStats

func AggregateUsageStats(stats []*UsageStats) *UsageStats

AggregateUsageStats sums usage across multiple UsageStats (e.g. across runs).

func (*UsageStats) IsZero

func (u *UsageStats) IsZero() bool

IsZero returns true if no usage data has been recorded.

type ValidatorInline

type ValidatorInline struct {
	Identifier string           `yaml:"name" json:"identifier"`
	Kind       GraderKind       `yaml:"type,omitempty" json:"kind,omitempty"`
	Checks     []string         `yaml:"assertions,omitempty" json:"checks,omitempty"`
	Rubric     string           `yaml:"rubric,omitempty" json:"rubric,omitempty"`
	Weight     float64          `yaml:"weight,omitempty" json:"weight,omitempty"`
	Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}

ValidatorInline is a validator embedded in a test case

func (*ValidatorInline) UnmarshalYAML

func (v *ValidatorInline) UnmarshalYAML(node *yaml.Node) error

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL