Documentation
¶
Index ¶
- func AllGraderKinds() []string
- func ComputeStdDev(values []float64) float64
- type ActionSequenceGraderParameters
- type ActionSequenceMatchingMode
- type BehaviorGraderParameters
- type BehaviorRules
- type BenchmarkSpec
- type Config
- type DiffExpectedFileParameters
- type DiffGraderParameters
- type EvaluationOutcome
- type FileContentPatternParameters
- type FileGraderParameters
- type GenericGraderParameters
- type GradeOutcome
- type GraderConfig
- type GraderKind
- type GraderParameters
- type GraderResults
- type GroupStats
- type InlineScriptGraderParameters
- type JSONSchemaGraderParameters
- type Language
- type MeasureResult
- type MeasurementDef
- type ModelScore
- type ModelUsage
- type MultiSkillSummary
- type OutcomeDigest
- type OutcomeSetup
- type OutcomeSpec
- type OverallSummary
- type PairwiseResult
- type ProgramGraderParameters
- type PromptGraderMode
- type PromptGraderParameters
- type Recommendation
- type RecommendationWeights
- type ResourceRef
- type RunResult
- type SessionDigest
- type SkillImpactMetric
- type SkillInvocation
- type SkillInvocationGraderParameters
- type SkillInvocationMatchingMode
- type SkillSummary
- type SpecIdentity
- type StatisticalSummary
- type Status
- type TaskTranscript
- type TestCase
- type TestExpectation
- type TestOutcome
- type TestStats
- type TestStimulus
- type TextGraderParameters
- type ToolCall
- type ToolCallArgs
- type ToolCallsGraderParameters
- type ToolConstraintGraderParameters
- type ToolSpecParameters
- type TranscriptEvent
- type TriggerHeuristicGraderParameters
- type TriggerMetrics
- type TriggerResult
- type UsageStats
- type ValidatorInline
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AllGraderKinds ¶
func AllGraderKinds() []string
func ComputeStdDev ¶
ComputeStdDev returns the population standard deviation for a slice of float64 values.
Types ¶
type ActionSequenceGraderParameters ¶
type ActionSequenceGraderParameters struct {
MatchingMode ActionSequenceMatchingMode `yaml:"matching_mode,omitempty" json:"matching_mode,omitempty"`
ExpectedActions []string `yaml:"expected_actions,omitempty" json:"expected_actions,omitempty"`
}
type ActionSequenceMatchingMode ¶
type ActionSequenceMatchingMode string
ActionSequenceMatchingMode controls how actual tool calls are compared to expected actions.
const ( ActionSequenceMatchingModeExact ActionSequenceMatchingMode = "exact_match" ActionSequenceMatchingModeInOrder ActionSequenceMatchingMode = "in_order_match" ActionSequenceMatchingModeAnyOrder ActionSequenceMatchingMode = "any_order_match" )
type BehaviorGraderParameters ¶
type BehaviorGraderParameters struct {
MaxToolCalls int `yaml:"max_tool_calls,omitempty" json:"max_tool_calls,omitempty"`
MaxTokens int `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
RequiredTools []string `yaml:"required_tools,omitempty" json:"required_tools,omitempty"`
ForbiddenTools []string `yaml:"forbidden_tools,omitempty" json:"forbidden_tools,omitempty"`
MaxDurationMS int64 `yaml:"max_duration_ms,omitempty" json:"max_duration_ms,omitempty"`
}
type BehaviorRules ¶
type BehaviorRules struct {
MaxToolInvocations int `yaml:"max_tool_calls,omitempty" json:"max_tool_invocations,omitempty"`
MaxRounds int `yaml:"max_iterations,omitempty" json:"max_rounds,omitempty"`
MaxTokens int `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
MaxResponseTimeMs int64 `yaml:"max_response_time_ms,omitempty" json:"max_response_time_ms,omitempty"`
MustUseTool []string `yaml:"required_tools,omitempty" json:"must_use_tool,omitempty"`
ForbidTool []string `yaml:"forbidden_tools,omitempty" json:"forbid_tool,omitempty"`
}
type BenchmarkSpec ¶
type BenchmarkSpec struct {
SpecIdentity `yaml:",inline"`
SkillName string `yaml:"skill"`
Version string `yaml:"version"`
Config Config `yaml:"config"`
Hooks hooks.HooksConfig `yaml:"hooks,omitempty"`
Inputs map[string]string `yaml:"inputs,omitempty" json:"inputs,omitempty"`
TasksFrom string `yaml:"tasks_from,omitempty" json:"tasks_from,omitempty"`
Range [2]int `yaml:"range,omitempty" json:"range,omitempty"`
Graders []GraderConfig `yaml:"graders"`
Metrics []MeasurementDef `yaml:"metrics"`
Tasks []string `yaml:"tasks"`
Baseline bool `yaml:"baseline,omitempty" json:"baseline,omitempty"`
}
BenchmarkSpec represents a complete evaluation specification
func LoadBenchmarkSpec ¶
func LoadBenchmarkSpec(path string) (*BenchmarkSpec, error)
LoadBenchmarkSpec loads a spec from a YAML file with strict validation.
Normally the schema validation will catch errors in the eval.yaml, but this also does strict YAML parsing to catch errors like unknown fields or type errors that the schema validation might miss.
func (*BenchmarkSpec) ResolveTestFiles ¶
func (s *BenchmarkSpec) ResolveTestFiles(basePath string) ([]string, error)
ResolveTestFiles expands glob patterns to actual test files
func (*BenchmarkSpec) Validate ¶
func (s *BenchmarkSpec) Validate() error
Validate checks that the spec is valid
type Config ¶
type Config struct {
TrialsPerTask int `yaml:"trials_per_task" json:"runs_per_test"`
TimeoutSec int `yaml:"timeout_seconds" json:"timeout_sec"`
Concurrent bool `yaml:"parallel" json:"concurrent"`
Workers int `yaml:"workers,omitempty" json:"workers,omitempty"`
StopOnError bool `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
EngineType string `yaml:"executor" json:"engine_type"`
ModelID string `yaml:"model" json:"model_id"`
SkillPaths []string `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
ServerConfigs map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
MaxAttempts int `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
GroupBy string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
JudgeModel string `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
}
Config controls execution behavior
type DiffExpectedFileParameters ¶
type DiffExpectedFileParameters struct {
// Path is the workspace-relative path to the file being checked.
Path string `yaml:"path" json:"path"`
// Snapshot is the path (relative to context/fixtures dir) of the expected file content.
// When set, the workspace file must match this snapshot exactly.
Snapshot string `yaml:"snapshot,omitempty" json:"snapshot,omitempty"`
// Contains lists line fragments that must appear in the workspace file.
// Prefixed with "+" means the line must be present; "-" means it must be absent.
Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
}
DiffExpectedFileParameters defines a single file expectation for the diff grader. Either Snapshot or Contains (or both) must be specified.
type DiffGraderParameters ¶
type DiffGraderParameters struct {
ExpectedFiles []DiffExpectedFileParameters `yaml:"expected_files,omitempty" json:"expected_files,omitempty"`
ContextDir string `yaml:"context_dir,omitempty" json:"context_dir,omitempty"`
UpdateSnapshots bool `yaml:"update_snapshots,omitempty" json:"update_snapshots,omitempty"`
}
type EvaluationOutcome ¶
type EvaluationOutcome struct {
RunID string `json:"eval_id"`
SkillTested string `json:"skill"`
BenchName string `json:"eval_name"`
Timestamp time.Time `json:"timestamp"`
Setup OutcomeSetup `json:"config"`
Digest OutcomeDigest `json:"summary"`
Measures map[string]MeasureResult `json:"metrics"`
TestOutcomes []TestOutcome `json:"tasks"`
TriggerMetrics *TriggerMetrics `json:"trigger_metrics,omitempty"`
TriggerResults []TriggerResult `json:"trigger_results,omitempty"`
Metadata map[string]any `json:"metadata,omitempty"`
IsBaseline bool `json:"is_baseline,omitempty"`
BaselineOutcome *EvaluationOutcome `json:"baseline_outcome,omitempty"`
}
EvaluationOutcome represents the complete result of an evaluation run
type FileGraderParameters ¶
type FileGraderParameters struct {
MustExist []string `yaml:"must_exist,omitempty" json:"must_exist,omitempty"`
MustNotExist []string `yaml:"must_not_exist,omitempty" json:"must_not_exist,omitempty"`
ContentPatterns []FileContentPatternParameters `yaml:"content_patterns,omitempty" json:"content_patterns,omitempty"`
}
type GenericGraderParameters ¶
GenericGraderParameters is used for unknown kinds to preserve raw config values.
type GradeOutcome ¶ added in v0.22.0
type GraderConfig ¶
type GraderConfig struct {
Kind GraderKind `yaml:"type" json:"kind"`
Identifier string `yaml:"name" json:"identifier"`
ScriptPath string `yaml:"script,omitempty" json:"script_path,omitempty"`
Rubric string `yaml:"rubric,omitempty" json:"rubric,omitempty"`
ModelID string `yaml:"model,omitempty" json:"model_id,omitempty"`
Weight float64 `yaml:"weight,omitempty" json:"weight,omitempty"`
Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}
GraderConfig defines a validator/grader
func (*GraderConfig) EffectiveWeight ¶
func (g *GraderConfig) EffectiveWeight() float64
EffectiveWeight returns the grader weight, defaulting to 1.0 if unset.
func (*GraderConfig) UnmarshalYAML ¶
func (g *GraderConfig) UnmarshalYAML(node *yaml.Node) error
func (*GraderConfig) Validate ¶ added in v0.26.0
func (g *GraderConfig) Validate() error
Validate checks that the grader config has required fields for its type.
type GraderKind ¶
type GraderKind string
GraderKind identifies the type of grader (e.g. text, file, code).
const ( GraderKindInlineScript GraderKind = "code" GraderKindPrompt GraderKind = "prompt" GraderKindText GraderKind = "text" GraderKindFile GraderKind = "file" GraderKindJSONSchema GraderKind = "json_schema" GraderKindProgram GraderKind = "program" GraderKindBehavior GraderKind = "behavior" GraderKindActionSequence GraderKind = "action_sequence" GraderKindSkillInvocation GraderKind = "skill_invocation" GraderKindTrigger GraderKind = "trigger" GraderKindDiff GraderKind = "diff" GraderKindToolConstraint GraderKind = "tool_constraint" GraderKindToolCalls GraderKind = "tool_calls" )
type GraderParameters ¶
type GraderParameters interface {
// contains filtered or unexported methods
}
GraderParameters is a polymorphic grader config payload decoded from YAML based on GraderKind.
type GraderResults ¶
type GraderResults struct {
Name string `json:"identifier"`
Type GraderKind `json:"type"`
Score float64 `json:"score"`
Weight float64 `json:"weight"`
Passed bool `json:"passed"`
Feedback string `json:"feedback"`
Details map[string]any `json:"details,omitempty"`
DurationMs int64 `json:"duration_ms"`
}
type GroupStats ¶
type GroupStats struct {
Name string `json:"name"`
Passed int `json:"passed"`
Total int `json:"total"`
AvgScore float64 `json:"avg_score"`
}
GroupStats holds aggregate statistics for a group of test outcomes.
type JSONSchemaGraderParameters ¶
type JSONSchemaGraderParameters struct {
// Schema is an inline JSON schema object used for validation.
Schema map[string]any `yaml:"schema,omitempty" json:"schema,omitempty"`
// SchemaFile is a path to a JSON schema file. Used when Schema is not provided.
SchemaFile string `yaml:"schema_file,omitempty" json:"schema_file,omitempty"`
}
JSONSchemaGraderParameters holds the arguments for creating a JSON schema grader.
type MeasureResult ¶
type MeasurementDef ¶
type MeasurementDef struct {
Identifier string `yaml:"name" json:"identifier"`
Weight float64 `yaml:"weight" json:"weight"`
Threshold float64 `yaml:"threshold" json:"threshold"`
Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
Desc string `yaml:"description,omitempty" json:"desc,omitempty"`
}
MeasurementDef defines a metric
type ModelScore ¶
type ModelScore struct {
ModelID string `json:"model_id"`
HeuristicScore float64 `json:"heuristic_score"`
Rank int `json:"rank"`
Scores map[string]float64 `json:"component_scores,omitempty"`
}
ModelScore holds the heuristic score and rank for a single model.
type ModelUsage ¶
type ModelUsage struct {
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
CacheReadTokens int `json:"cache_read_tokens"`
CacheWriteTokens int `json:"cache_write_tokens"`
RequestCount float64 `json:"request_count"`
RequestCost float64 `json:"request_cost"`
}
ModelUsage holds per-model token and request usage.
type MultiSkillSummary ¶
type MultiSkillSummary struct {
Timestamp time.Time `json:"timestamp"`
Skills []SkillSummary `json:"skills"`
Overall OverallSummary `json:"overall"`
}
MultiSkillSummary aggregates results across multiple skill evaluations.
type OutcomeDigest ¶
type OutcomeDigest struct {
TotalTests int `json:"total_tests"`
Succeeded int `json:"succeeded"`
Failed int `json:"failed"`
Errors int `json:"errors"`
Skipped int `json:"skipped"`
SuccessRate float64 `json:"success_rate"`
AggregateScore float64 `json:"aggregate_score"`
WeightedScore float64 `json:"weighted_score"`
MinScore float64 `json:"min_score"`
MaxScore float64 `json:"max_score"`
StdDev float64 `json:"std_dev"`
DurationMs int64 `json:"duration_ms"`
Groups []GroupStats `json:"groups,omitempty"`
Usage *UsageStats `json:"usage,omitempty"`
// Statistical summary populated when trials_per_task > 1
Statistics *StatisticalSummary `json:"statistics,omitempty"`
}
type OutcomeSetup ¶
type OutcomeSpec ¶
type OverallSummary ¶
type OverallSummary struct {
TotalSkills int `json:"total_skills"`
TotalModels int `json:"total_models"`
AvgPassRate float64 `json:"avg_pass_rate"`
AvgAggregateScore float64 `json:"avg_aggregate_score"`
}
OverallSummary contains cross-skill aggregated metrics.
type PairwiseResult ¶
type PairwiseResult struct {
Winner string `json:"winner"` // "baseline", "skill", or "tie"
Magnitude string `json:"magnitude"` // "much-better", "slightly-better", "equal", etc.
Reasoning string `json:"reasoning"`
PositionConsistent bool `json:"position_consistent"` // true if result held after position swap
}
PairwiseResult captures the outcome of a pairwise LLM judge comparison.
type ProgramGraderParameters ¶
type PromptGraderMode ¶
type PromptGraderMode string
const ( PromptGraderModeIndependent PromptGraderMode = "independent" PromptGraderModePairwise PromptGraderMode = "pairwise" )
type PromptGraderParameters ¶
type PromptGraderParameters struct {
Prompt string `yaml:"prompt,omitempty" json:"prompt,omitempty"`
Model string `yaml:"model,omitempty" json:"model,omitempty"`
ContinueSession bool `yaml:"continue_session,omitempty" json:"continue_session,omitempty"`
Mode PromptGraderMode `yaml:"mode,omitempty" json:"mode,omitempty"`
}
type Recommendation ¶
type Recommendation struct {
RecommendedModel string `json:"recommended_model"`
HeuristicScore float64 `json:"heuristic_score"`
Reason string `json:"reason"`
WinnerMarginPct float64 `json:"winner_margin_pct"`
Weights RecommendationWeights `json:"weights"`
ModelScores []ModelScore `json:"all_models"`
}
Recommendation represents a heuristic recommendation for the best model across a multi-model evaluation run.
type RecommendationWeights ¶
type RecommendationWeights struct {
AggregateScore float64 `json:"aggregate_score"`
PassRate float64 `json:"pass_rate"`
Consistency float64 `json:"consistency"`
Speed float64 `json:"speed"`
}
RecommendationWeights defines the weighting scheme for heuristic scoring.
type ResourceRef ¶
type ResourceRef struct {
Location string `yaml:"path,omitempty" json:"location,omitempty"`
Body string `yaml:"content,omitempty" json:"body,omitempty"`
}
ResourceRef points to a file or inline content
type RunResult ¶
type RunResult struct {
RunNumber int `json:"run_number"`
Attempts int `json:"attempts"`
// Status contains the overall status of the run.
// NOTE: if Status == [StatusError], then [ErrorMsg] will be set to the
// message from the error.
Status Status `json:"status"`
DurationMs int64 `json:"duration_ms"`
Validations map[string]GraderResults `json:"validations"`
SessionDigest SessionDigest `json:"session_digest"`
Transcript []TranscriptEvent `json:"transcript,omitempty"`
FinalOutput string `json:"final_output"`
ErrorMsg string `json:"error_msg,omitempty"`
SkillInvocations []SkillInvocation `json:"skill_invocations,omitempty"`
}
RunResult is the result of a single run/trial
func (*RunResult) AllValidationsPassed ¶
AllValidationsPassed reports whether all validations passed.
func (*RunResult) ComputeRunScore ¶
ComputeRunScore calculates the unweighted average score across all validations (kept for backward compatibility).
func (*RunResult) ComputeWeightedRunScore ¶
ComputeWeightedRunScore calculates the weighted composite score (0.0–1.0) using each grader's Weight field. If all weights are zero, falls back to simple average.
type SessionDigest ¶
type SkillImpactMetric ¶
type SkillImpactMetric struct {
PassRateWithSkills float64 `json:"pass_rate_with_skills"`
PassRateBaseline float64 `json:"pass_rate_baseline"`
Delta float64 `json:"delta"`
PercentChange float64 `json:"percent_change"`
Pairwise *PairwiseResult `json:"pairwise,omitempty"`
}
SkillImpactMetric represents A/B comparison for a single task
type SkillInvocation ¶ added in v0.22.0
SkillInvocation records a skill invoked during an agent session.
type SkillInvocationGraderParameters ¶
type SkillInvocationGraderParameters struct {
RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
Mode SkillInvocationMatchingMode `yaml:"mode,omitempty" json:"mode,omitempty"`
AllowExtra *bool `yaml:"allow_extra,omitempty" json:"allow_extra,omitempty"`
}
type SkillInvocationMatchingMode ¶
type SkillInvocationMatchingMode string
SkillInvocationMatchingMode controls how actual skill invocations are compared to expected skills.
const ( SkillMatchingModeExact SkillInvocationMatchingMode = "exact_match" SkillMatchingModeInOrder SkillInvocationMatchingMode = "in_order" SkillMatchingModeAnyOrder SkillInvocationMatchingMode = "any_order" )
type SkillSummary ¶
type SkillSummary struct {
SkillName string `json:"skill_name"`
Models []string `json:"models"`
PassRate float64 `json:"pass_rate"`
AggregateScore float64 `json:"aggregate_score"`
OutputFiles []string `json:"output_files"`
}
SkillSummary contains aggregated metrics for a single skill evaluation.
type SpecIdentity ¶
type StatisticalSummary ¶
type StatisticalSummary struct {
BootstrapCI statistics.ConfidenceInterval `json:"bootstrap_ci"`
IsSignificant bool `json:"is_significant"`
NormalizedGain *float64 `json:"normalized_gain,omitempty"`
}
StatisticalSummary holds aggregate statistical data for the digest when trials > 1.
type TaskTranscript ¶
type TaskTranscript struct {
TaskID string `json:"task_id"`
TaskName string `json:"task_name"`
Status Status `json:"status"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
DurationMs int64 `json:"duration_ms"`
Prompt string `json:"prompt"`
FinalOutput string `json:"final_output"`
Transcript []TranscriptEvent `json:"transcript"`
Validations map[string]GraderResults `json:"validations,omitempty"`
Session SessionDigest `json:"session"`
ErrorMsg string `json:"error_msg,omitempty"`
}
TaskTranscript is the per-task JSON file written to the transcript directory.
type TestCase ¶
type TestCase struct {
Active *bool `yaml:"enabled,omitempty" json:"active,omitempty"`
ContextRoot string `yaml:"context_dir,omitempty" json:"context_root,omitempty"`
DisplayName string `yaml:"name" json:"display_name"`
Expectation TestExpectation `yaml:"expected,omitempty" json:"expectation,omitempty"`
Stimulus TestStimulus `yaml:"inputs" json:"stimulus"`
Summary string `yaml:"description,omitempty" json:"summary,omitempty"`
Tags []string `yaml:"tags,omitempty" json:"labels,omitempty"`
TestID string `yaml:"id" json:"test_id"`
TimeoutSec *int `yaml:"timeout_seconds,omitempty" json:"timeout_sec,omitempty"`
Validators []ValidatorInline `yaml:"graders,omitempty" json:"validators,omitempty"`
}
TestCase represents a single evaluation test
func LoadTestCase ¶
LoadTestCase loads a test case from YAML
type TestExpectation ¶
type TestExpectation struct {
OutcomeSpecs []OutcomeSpec `yaml:"outcomes,omitempty" json:"outcome_specs,omitempty"`
ToolPatterns map[string]any `yaml:"tool_calls,omitempty" json:"tool_patterns,omitempty"`
BehaviorRules BehaviorRules `yaml:"behavior,omitempty" json:"behavior_rules,omitempty"`
MustInclude []string `yaml:"output_contains,omitempty" json:"must_include,omitempty"`
MustExclude []string `yaml:"output_not_contains,omitempty" json:"must_exclude,omitempty"`
MayInclude []string `yaml:"output_contains_any,omitempty" json:"may_include,omitempty"`
ExpectedTrigger *bool `yaml:"should_trigger,omitempty" json:"expected_trigger,omitempty"`
}
TestExpectation defines expected outcomes
type TestOutcome ¶
type TestOutcome struct {
TestID string `json:"test_id"`
DisplayName string `json:"display_name"`
Group string `json:"group,omitempty"`
Status Status `json:"status"`
Runs []RunResult `json:"runs"`
Stats *TestStats `json:"stats,omitempty"`
SkillImpact *SkillImpactMetric `json:"skill_impact,omitempty"`
}
TestOutcome represents the result of one test case
type TestStats ¶
type TestStats struct {
PassRate float64 `json:"pass_rate"`
FlakinessPercent float64 `json:"flakiness_percent"`
PassedRuns int `json:"passed_runs"`
FailedRuns int `json:"failed_runs"`
ErrorRuns int `json:"error_runs"`
TotalRuns int `json:"total_runs"`
AvgScore float64 `json:"avg_score"`
AvgWeightedScore float64 `json:"avg_weighted_score"`
MinScore float64 `json:"min_score"`
MaxScore float64 `json:"max_score"`
StdDevScore float64 `json:"std_dev_score"`
ScoreVariance float64 `json:"score_variance"`
CI95Lo float64 `json:"ci95_lo"`
CI95Hi float64 `json:"ci95_hi"`
Flaky bool `json:"flaky"`
AvgDurationMs int64 `json:"avg_duration_ms"`
// Bootstrap confidence interval over weighted scores (populated when trials > 1)
BootstrapCI *statistics.ConfidenceInterval `json:"bootstrap_ci,omitempty"`
IsSignificant *bool `json:"is_significant,omitempty"`
}
type TestStimulus ¶
type TestStimulus struct {
Message string `yaml:"prompt" json:"message"`
MessageFile string `yaml:"prompt_file,omitempty" json:"message_file,omitempty"`
Metadata map[string]any `yaml:"context,omitempty" json:"metadata,omitempty"`
Resources []ResourceRef `yaml:"files,omitempty" json:"resources,omitempty"`
Environment map[string]string `yaml:"environment,omitempty" json:"environment,omitempty"`
FollowUps []string `yaml:"follow_up_prompts,omitempty" json:"follow_ups,omitempty"`
}
TestStimulus defines the input for a test
type TextGraderParameters ¶
type TextGraderParameters struct {
// Contains lists substrings that must appear in the output (case-insensitive).
Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
// NotContains lists substrings that must NOT appear in the output (case-insensitive).
NotContains []string `yaml:"not_contains,omitempty" json:"not_contains,omitempty"`
// ContainsCS lists substrings that must appear in the output (case-sensitive).
ContainsCS []string `yaml:"contains_cs,omitempty" json:"contains_cs,omitempty"`
// NotContainsCS lists substrings that must NOT appear in the output (case-sensitive).
NotContainsCS []string `yaml:"not_contains_cs,omitempty" json:"not_contains_cs,omitempty"`
// RegexMatch lists regex patterns that must match somewhere in the output.
RegexMatch []string `yaml:"regex_match,omitempty" json:"regex_match,omitempty"`
// RegexNotMatch lists regex patterns that must NOT match anywhere in the output.
RegexNotMatch []string `yaml:"regex_not_match,omitempty" json:"regex_not_match,omitempty"`
}
TextGraderParameters holds the arguments for creating a text grader.
type ToolCall ¶
type ToolCall struct {
Name string `json:"name"`
Arguments ToolCallArgs `json:"arguments,omitempty"`
Result *copilot.Result `json:"result,omitempty"`
Success bool `json:"success"`
}
ToolCall represents a tool invocation
func FilterToolCalls ¶
func FilterToolCalls(sessionEvents []copilot.SessionEvent) []ToolCall
FilterToolCalls goes through the list of session events and correlates each tool call's start event with its result in order to populate the Success field.
type ToolCallArgs ¶
type ToolCallArgs struct {
// these are filled out for file-based tools (view/edit)
Path string `json:"path" mapstructure:"path"`
FileText string `json:"file_text" mapstructure:"file_text"`
// filled out for tools like bash or powershell
Command string `json:"command" mapstructure:"command"`
Description string `json:"description" mapstructure:"description"`
// filled out for skill invocations
Skill string `json:"skill" mapstructure:"skill"`
}
type ToolCallsGraderParameters ¶ added in v0.27.0
type ToolCallsGraderParameters struct {
RequiredTools []string `yaml:"required_tools,omitempty" json:"required_tools,omitempty"`
ForbiddenTools []string `yaml:"forbidden_tools,omitempty" json:"forbidden_tools,omitempty"`
MinCalls *int `yaml:"min_calls,omitempty" json:"min_calls,omitempty"`
MaxCalls *int `yaml:"max_calls,omitempty" json:"max_calls,omitempty"`
}
ToolCallsGraderParameters validates which tools were called during a session.
type ToolConstraintGraderParameters ¶
type ToolConstraintGraderParameters struct {
ExpectTools []ToolSpecParameters `yaml:"expect_tools,omitempty" json:"expect_tools,omitempty"`
RejectTools []ToolSpecParameters `yaml:"reject_tools,omitempty" json:"reject_tools,omitempty"`
}
type ToolSpecParameters ¶
type ToolSpecParameters struct {
Tool string `yaml:"tool" json:"tool"`
CommandPattern string `yaml:"command_pattern,omitempty" json:"command_pattern,omitempty"`
SkillPattern string `yaml:"skill_pattern,omitempty" json:"skill_pattern,omitempty"`
PathPattern string `yaml:"path_pattern,omitempty" json:"path_pattern,omitempty"`
}
type TranscriptEvent ¶
type TranscriptEvent struct {
copilot.SessionEvent `json:"-"`
}
func (TranscriptEvent) MarshalJSON ¶
func (te TranscriptEvent) MarshalJSON() ([]byte, error)
func (*TranscriptEvent) UnmarshalJSON ¶
func (te *TranscriptEvent) UnmarshalJSON(data []byte) error
type TriggerHeuristicGraderParameters ¶
type TriggerHeuristicGraderParameters struct {
SkillPath string `yaml:"skill_path" json:"skill_path"`
Mode string `yaml:"mode" json:"mode"`
Threshold *float64 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
}
TriggerHeuristicGraderParameters holds the arguments for creating a trigger heuristic grader.
type TriggerMetrics ¶
type TriggerMetrics struct {
TP int `json:"true_positives"`
FP int `json:"false_positives"`
TN int `json:"true_negatives"`
FN int `json:"false_negatives"`
Errors int `json:"errors,omitempty"`
Precision float64 `json:"precision"`
Recall float64 `json:"recall"`
F1 float64 `json:"f1"`
Accuracy float64 `json:"accuracy"`
}
TriggerMetrics holds classification metrics for trigger accuracy.
func ComputeTriggerMetrics ¶
func ComputeTriggerMetrics(results []TriggerResult) *TriggerMetrics
ComputeTriggerMetrics calculates precision, recall, F1, and accuracy from a set of trigger classification results. Results are weighted by confidence: "high" (or empty) counts as 1.0, "medium" as 0.5. Returns nil when results is empty.
type TriggerResult ¶
type TriggerResult struct {
Prompt string `json:"prompt"`
Confidence string `json:"confidence,omitempty"`
ShouldTrigger bool `json:"should_trigger"`
DidTrigger bool `json:"did_trigger"`
ErrorMsg string `json:"error_msg,omitempty"`
FinalOutput string `json:"final_output,omitempty"`
Transcript []TranscriptEvent `json:"transcript,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
SessionID string `json:"session_id,omitempty"`
}
TriggerResult pairs an expected trigger label with the actual outcome.
type UsageStats ¶
type UsageStats struct {
Turns int `json:"turns"`
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
CacheReadTokens int `json:"cache_read_tokens"`
CacheWriteTokens int `json:"cache_write_tokens"`
PremiumRequests float64 `json:"premium_requests"`
ModelMetrics map[string]ModelUsage `json:"model_metrics,omitempty"`
}
UsageStats holds token and premium request usage data from a Copilot SDK session.
func AggregateUsageStats ¶
func AggregateUsageStats(stats []*UsageStats) *UsageStats
AggregateUsageStats sums usage across multiple UsageStats (e.g. across runs).
func (*UsageStats) IsZero ¶
func (u *UsageStats) IsZero() bool
IsZero returns true if no usage data has been recorded.
type ValidatorInline ¶
type ValidatorInline struct {
Identifier string `yaml:"name" json:"identifier"`
Kind GraderKind `yaml:"type,omitempty" json:"kind,omitempty"`
Checks []string `yaml:"assertions,omitempty" json:"checks,omitempty"`
Rubric string `yaml:"rubric,omitempty" json:"rubric,omitempty"`
Weight float64 `yaml:"weight,omitempty" json:"weight,omitempty"`
Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}
ValidatorInline is a validator embedded in a test case
func (*ValidatorInline) EffectiveWeight ¶ added in v0.22.0
func (v *ValidatorInline) EffectiveWeight() float64
func (*ValidatorInline) UnmarshalYAML ¶
func (v *ValidatorInline) UnmarshalYAML(node *yaml.Node) error
func (*ValidatorInline) Validate ¶ added in v0.26.0
func (v *ValidatorInline) Validate() error
Validate checks that the validator config has required fields for its type.