Documentation
¶
Index ¶
- func AllGraderKinds() []string
- func ComputeStdDev(values []float64) float64
- type ActionSequenceGraderParameters
- type ActionSequenceMatchingMode
- type BehaviorGraderParameters
- type BehaviorRules
- type BenchmarkSpec
- type Config
- type DiffExpectedFileParameters
- type DiffGraderParameters
- type EvaluationOutcome
- type FileContentPatternParameters
- type FileGraderParameters
- type GenericGraderParameters
- type GraderConfig
- type GraderKind
- type GraderParameters
- type GraderResults
- type GroupStats
- type InlineScriptGraderParameters
- type JSONSchemaGraderParameters
- type Language
- type MeasureResult
- type MeasurementDef
- type ModelScore
- type ModelUsage
- type MultiSkillSummary
- type OutcomeDigest
- type OutcomeSetup
- type OutcomeSpec
- type OverallSummary
- type PairwiseResult
- type ProgramGraderParameters
- type PromptGraderMode
- type PromptGraderParameters
- type Recommendation
- type RecommendationWeights
- type ResourceRef
- type RunResult
- type SessionDigest
- type SkillImpactMetric
- type SkillInvocationGraderParameters
- type SkillInvocationMatchingMode
- type SkillSummary
- type SpecIdentity
- type StatisticalSummary
- type Status
- type TaskTranscript
- type TestCase
- type TestExpectation
- type TestOutcome
- type TestStats
- type TestStimulus
- type TextGraderParameters
- type ToolCall
- type ToolCallArgs
- type ToolConstraintGraderParameters
- type ToolSpecParameters
- type TranscriptEvent
- type TriggerHeuristicGraderParameters
- type TriggerMetrics
- type TriggerResult
- type UsageStats
- type ValidatorInline
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AllGraderKinds ¶
func AllGraderKinds() []string
func ComputeStdDev ¶
ComputeStdDev returns the population standard deviation for a slice of float64 values.
Types ¶
type ActionSequenceGraderParameters ¶
type ActionSequenceGraderParameters struct {
MatchingMode ActionSequenceMatchingMode `yaml:"matching_mode,omitempty" json:"matching_mode,omitempty"`
ExpectedActions []string `yaml:"expected_actions,omitempty" json:"expected_actions,omitempty"`
}
type ActionSequenceMatchingMode ¶
type ActionSequenceMatchingMode string
ActionSequenceMatchingMode controls how actual tool calls are compared to expected actions.
const ( ActionSequenceMatchingModeExact ActionSequenceMatchingMode = "exact_match" ActionSequenceMatchingModeInOrder ActionSequenceMatchingMode = "in_order_match" ActionSequenceMatchingModeAnyOrder ActionSequenceMatchingMode = "any_order_match" )
type BehaviorGraderParameters ¶
type BehaviorGraderParameters struct {
MaxToolCalls int `yaml:"max_tool_calls,omitempty" json:"max_tool_calls,omitempty"`
MaxTokens int `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
RequiredTools []string `yaml:"required_tools,omitempty" json:"required_tools,omitempty"`
ForbiddenTools []string `yaml:"forbidden_tools,omitempty" json:"forbidden_tools,omitempty"`
MaxDurationMS int64 `yaml:"max_duration_ms,omitempty" json:"max_duration_ms,omitempty"`
}
type BehaviorRules ¶
type BehaviorRules struct {
MaxToolInvocations int `yaml:"max_tool_calls,omitempty" json:"max_tool_invocations,omitempty"`
MaxRounds int `yaml:"max_iterations,omitempty" json:"max_rounds,omitempty"`
MaxTokens int `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty"`
MustUseTool []string `yaml:"required_tools,omitempty" json:"must_use_tool,omitempty"`
ForbidTool []string `yaml:"forbidden_tools,omitempty" json:"forbid_tool,omitempty"`
}
type BenchmarkSpec ¶
type BenchmarkSpec struct {
SpecIdentity `yaml:",inline"`
SkillName string `yaml:"skill"`
Version string `yaml:"version"`
Config Config `yaml:"config"`
Hooks hooks.HooksConfig `yaml:"hooks,omitempty"`
Inputs map[string]string `yaml:"inputs,omitempty" json:"inputs,omitempty"`
TasksFrom string `yaml:"tasks_from,omitempty" json:"tasks_from,omitempty"`
Range [2]int `yaml:"range,omitempty" json:"range,omitempty"`
Graders []GraderConfig `yaml:"graders"`
Metrics []MeasurementDef `yaml:"metrics"`
Tasks []string `yaml:"tasks"`
Baseline bool `yaml:"baseline,omitempty" json:"baseline,omitempty"`
}
BenchmarkSpec represents a complete evaluation specification
func LoadBenchmarkSpec ¶
func LoadBenchmarkSpec(path string) (*BenchmarkSpec, error)
LoadBenchmarkSpec loads a spec from a YAML file
func (*BenchmarkSpec) ResolveTestFiles ¶
func (s *BenchmarkSpec) ResolveTestFiles(basePath string) ([]string, error)
ResolveTestFiles expands glob patterns to actual test files
func (*BenchmarkSpec) Validate ¶
func (s *BenchmarkSpec) Validate() error
Validate checks that the spec is valid
type Config ¶
type Config struct {
RunsPerTest int `yaml:"trials_per_task" json:"runs_per_test"`
TimeoutSec int `yaml:"timeout_seconds" json:"timeout_sec"`
Concurrent bool `yaml:"parallel" json:"concurrent"`
Workers int `yaml:"max_workers,omitempty" json:"workers,omitempty"`
StopOnError bool `yaml:"fail_fast,omitempty" json:"stop_on_error,omitempty"`
EngineType string `yaml:"executor" json:"engine_type"`
ModelID string `yaml:"model" json:"model_id"`
SkillPaths []string `yaml:"skill_directories,omitempty" json:"skill_paths,omitempty"`
RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
ServerConfigs map[string]any `yaml:"mcp_servers,omitempty" json:"server_configs,omitempty"`
MaxAttempts int `yaml:"max_attempts,omitempty" json:"max_attempts,omitempty"`
GroupBy string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
JudgeModel string `yaml:"judge_model,omitempty" json:"judge_model,omitempty"`
}
Config controls execution behavior
type DiffExpectedFileParameters ¶
type DiffExpectedFileParameters struct {
// Path is the workspace-relative path to the file being checked.
Path string `yaml:"path" json:"path"`
// Snapshot is the path (relative to context/fixtures dir) of the expected file content.
// When set, the workspace file must match this snapshot exactly.
Snapshot string `yaml:"snapshot,omitempty" json:"snapshot,omitempty"`
// Contains lists line fragments that are checked against the workspace file.
// A fragment prefixed with "+" must be present; one prefixed with "-" must be absent.
Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
}
DiffExpectedFileParameters defines a single file expectation for the diff grader. At least one of Snapshot and Contains must be specified.
type DiffGraderParameters ¶
type DiffGraderParameters struct {
ExpectedFiles []DiffExpectedFileParameters `yaml:"expected_files,omitempty" json:"expected_files,omitempty"`
ContextDir string `yaml:"context_dir,omitempty" json:"context_dir,omitempty"`
UpdateSnapshots bool `yaml:"update_snapshots,omitempty" json:"update_snapshots,omitempty"`
}
type EvaluationOutcome ¶
type EvaluationOutcome struct {
RunID string `json:"eval_id"`
SkillTested string `json:"skill"`
BenchName string `json:"eval_name"`
Timestamp time.Time `json:"timestamp"`
Setup OutcomeSetup `json:"config"`
Digest OutcomeDigest `json:"summary"`
Measures map[string]MeasureResult `json:"metrics"`
TestOutcomes []TestOutcome `json:"tasks"`
TriggerMetrics *TriggerMetrics `json:"trigger_metrics,omitempty"`
TriggerResults []TriggerResult `json:"trigger_results,omitempty"`
Metadata map[string]any `json:"metadata,omitempty"`
IsBaseline bool `json:"is_baseline,omitempty"`
BaselineOutcome *EvaluationOutcome `json:"baseline_outcome,omitempty"`
}
EvaluationOutcome represents the complete result of an evaluation run
type FileGraderParameters ¶
type FileGraderParameters struct {
MustExist []string `yaml:"must_exist,omitempty" json:"must_exist,omitempty"`
MustNotExist []string `yaml:"must_not_exist,omitempty" json:"must_not_exist,omitempty"`
ContentPatterns []FileContentPatternParameters `yaml:"content_patterns,omitempty" json:"content_patterns,omitempty"`
}
type GenericGraderParameters ¶
GenericGraderParameters is used for unknown kinds to preserve raw config values.
type GraderConfig ¶
type GraderConfig struct {
Kind GraderKind `yaml:"type" json:"kind"`
Identifier string `yaml:"name" json:"identifier"`
ScriptPath string `yaml:"script,omitempty" json:"script_path,omitempty"`
Rubric string `yaml:"rubric,omitempty" json:"rubric,omitempty"`
ModelID string `yaml:"model,omitempty" json:"model_id,omitempty"`
Weight float64 `yaml:"weight,omitempty" json:"weight,omitempty"`
Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}
GraderConfig defines a validator/grader
func (*GraderConfig) EffectiveWeight ¶
func (g *GraderConfig) EffectiveWeight() float64
EffectiveWeight returns the grader weight, defaulting to 1.0 if unset.
func (*GraderConfig) UnmarshalYAML ¶
func (g *GraderConfig) UnmarshalYAML(node *yaml.Node) error
type GraderKind ¶
type GraderKind string
GraderKind identifies the type of grader (e.g. text, file, code).
const ( GraderKindInlineScript GraderKind = "code" GraderKindPrompt GraderKind = "prompt" GraderKindText GraderKind = "text" GraderKindFile GraderKind = "file" GraderKindJSONSchema GraderKind = "json_schema" GraderKindProgram GraderKind = "program" GraderKindBehavior GraderKind = "behavior" GraderKindActionSequence GraderKind = "action_sequence" GraderKindSkillInvocation GraderKind = "skill_invocation" GraderKindTrigger GraderKind = "trigger" GraderKindDiff GraderKind = "diff" GraderKindToolConstraint GraderKind = "tool_constraint" )
type GraderParameters ¶
type GraderParameters interface {
// contains filtered or unexported methods
}
GraderParameters is a polymorphic grader config payload decoded from YAML based on GraderKind.
type GraderResults ¶
type GraderResults struct {
Name string `json:"identifier"`
Type GraderKind `json:"type"`
Score float64 `json:"score"`
Weight float64 `json:"weight"`
Passed bool `json:"passed"`
Feedback string `json:"feedback"`
Details map[string]any `json:"details,omitempty"`
DurationMs int64 `json:"duration_ms"`
}
type GroupStats ¶
type GroupStats struct {
Name string `json:"name"`
Passed int `json:"passed"`
Total int `json:"total"`
AvgScore float64 `json:"avg_score"`
}
GroupStats holds aggregate statistics for a group of test outcomes.
type JSONSchemaGraderParameters ¶
type JSONSchemaGraderParameters struct {
// Schema is an inline JSON schema object used for validation.
Schema map[string]any `yaml:"schema,omitempty" json:"schema,omitempty"`
// SchemaFile is a path to a JSON schema file. Used when Schema is not provided.
SchemaFile string `yaml:"schema_file,omitempty" json:"schema_file,omitempty"`
}
JSONSchemaGraderParameters holds the arguments for creating a JSON schema grader.
type MeasureResult ¶
type MeasurementDef ¶
type MeasurementDef struct {
Identifier string `yaml:"name" json:"identifier"`
Weight float64 `yaml:"weight" json:"weight"`
Threshold float64 `yaml:"threshold" json:"threshold"`
Enabled bool `yaml:"enabled,omitempty" json:"enabled,omitempty"`
Desc string `yaml:"description,omitempty" json:"desc,omitempty"`
}
MeasurementDef defines a metric
type ModelScore ¶
type ModelScore struct {
ModelID string `json:"model_id"`
HeuristicScore float64 `json:"heuristic_score"`
Rank int `json:"rank"`
Scores map[string]float64 `json:"component_scores,omitempty"`
}
ModelScore holds the heuristic score and rank for a single model.
type ModelUsage ¶
type ModelUsage struct {
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
CacheReadTokens int `json:"cache_read_tokens"`
CacheWriteTokens int `json:"cache_write_tokens"`
RequestCount float64 `json:"request_count"`
RequestCost float64 `json:"request_cost"`
}
ModelUsage holds per-model token and request usage.
type MultiSkillSummary ¶
type MultiSkillSummary struct {
Timestamp time.Time `json:"timestamp"`
Skills []SkillSummary `json:"skills"`
Overall OverallSummary `json:"overall"`
}
MultiSkillSummary aggregates results across multiple skill evaluations.
type OutcomeDigest ¶
type OutcomeDigest struct {
TotalTests int `json:"total_tests"`
Succeeded int `json:"succeeded"`
Failed int `json:"failed"`
Errors int `json:"errors"`
Skipped int `json:"skipped"`
SuccessRate float64 `json:"success_rate"`
AggregateScore float64 `json:"aggregate_score"`
WeightedScore float64 `json:"weighted_score"`
MinScore float64 `json:"min_score"`
MaxScore float64 `json:"max_score"`
StdDev float64 `json:"std_dev"`
DurationMs int64 `json:"duration_ms"`
Groups []GroupStats `json:"groups,omitempty"`
Usage *UsageStats `json:"usage,omitempty"`
// Statistical summary populated when trials_per_task > 1
Statistics *StatisticalSummary `json:"statistics,omitempty"`
}
type OutcomeSetup ¶
type OutcomeSpec ¶
type OverallSummary ¶
type OverallSummary struct {
TotalSkills int `json:"total_skills"`
TotalModels int `json:"total_models"`
AvgPassRate float64 `json:"avg_pass_rate"`
AvgAggregateScore float64 `json:"avg_aggregate_score"`
}
OverallSummary contains cross-skill aggregated metrics.
type PairwiseResult ¶
type PairwiseResult struct {
Winner string `json:"winner"` // "baseline", "skill", or "tie"
Magnitude string `json:"magnitude"` // "much-better", "slightly-better", "equal", etc.
Reasoning string `json:"reasoning"`
PositionConsistent bool `json:"position_consistent"` // true if result held after position swap
}
PairwiseResult captures the outcome of a pairwise LLM judge comparison.
type ProgramGraderParameters ¶
type PromptGraderMode ¶
type PromptGraderMode string
const ( PromptGraderModeIndependent PromptGraderMode = "independent" PromptGraderModePairwise PromptGraderMode = "pairwise" )
type PromptGraderParameters ¶
type PromptGraderParameters struct {
Prompt string `yaml:"prompt,omitempty" json:"prompt,omitempty"`
Model string `yaml:"model,omitempty" json:"model,omitempty"`
ContinueSession bool `yaml:"continue_session,omitempty" json:"continue_session,omitempty"`
Mode PromptGraderMode `yaml:"mode,omitempty" json:"mode,omitempty"`
}
type Recommendation ¶
type Recommendation struct {
RecommendedModel string `json:"recommended_model"`
HeuristicScore float64 `json:"heuristic_score"`
Reason string `json:"reason"`
WinnerMarginPct float64 `json:"winner_margin_pct"`
Weights RecommendationWeights `json:"weights"`
ModelScores []ModelScore `json:"all_models"`
}
Recommendation represents a heuristic recommendation for the best model across a multi-model evaluation run.
type RecommendationWeights ¶
type RecommendationWeights struct {
AggregateScore float64 `json:"aggregate_score"`
PassRate float64 `json:"pass_rate"`
Consistency float64 `json:"consistency"`
Speed float64 `json:"speed"`
}
RecommendationWeights defines the weighting scheme for heuristic scoring.
type ResourceRef ¶
type ResourceRef struct {
Location string `yaml:"path,omitempty" json:"location,omitempty"`
Body string `yaml:"content,omitempty" json:"body,omitempty"`
}
ResourceRef points to a file or inline content
type RunResult ¶
type RunResult struct {
RunNumber int `json:"run_number"`
Attempts int `json:"attempts"`
// Status contains the overall status of the run.
// NOTE: if Status == [StatusError], then [RunResult.ErrorMsg] will be set to the
// message from the error.
Status Status `json:"status"`
DurationMs int64 `json:"duration_ms"`
Validations map[string]GraderResults `json:"validations"`
SessionDigest SessionDigest `json:"session_digest"`
Transcript []TranscriptEvent `json:"transcript,omitempty"`
FinalOutput string `json:"final_output"`
ErrorMsg string `json:"error_msg,omitempty"`
}
RunResult is the result of a single run/trial
func (*RunResult) AllValidationsPassed ¶
AllValidationsPassed checks if all validations passed
func (*RunResult) ComputeRunScore ¶
ComputeRunScore calculates the unweighted average score across all validations. It is kept for backward compatibility.
func (*RunResult) ComputeWeightedRunScore ¶
ComputeWeightedRunScore calculates the weighted composite score (0.0–1.0) using each grader's Weight field. If all weights are zero, falls back to simple average.
type SessionDigest ¶
type SkillImpactMetric ¶
type SkillImpactMetric struct {
PassRateWithSkills float64 `json:"pass_rate_with_skills"`
PassRateBaseline float64 `json:"pass_rate_baseline"`
Delta float64 `json:"delta"`
PercentChange float64 `json:"percent_change"`
Pairwise *PairwiseResult `json:"pairwise,omitempty"`
}
SkillImpactMetric represents the A/B comparison (with skills vs. baseline) for a single task.
type SkillInvocationGraderParameters ¶
type SkillInvocationGraderParameters struct {
RequiredSkills []string `yaml:"required_skills,omitempty" json:"required_skills,omitempty"`
Mode SkillInvocationMatchingMode `yaml:"mode,omitempty" json:"mode,omitempty"`
AllowExtra *bool `yaml:"allow_extra,omitempty" json:"allow_extra,omitempty"`
}
type SkillInvocationMatchingMode ¶
type SkillInvocationMatchingMode string
SkillInvocationMatchingMode controls how actual skill invocations are compared to expected skills.
const ( SkillMatchingModeExact SkillInvocationMatchingMode = "exact_match" SkillMatchingModeInOrder SkillInvocationMatchingMode = "in_order" SkillMatchingModeAnyOrder SkillInvocationMatchingMode = "any_order" )
type SkillSummary ¶
type SkillSummary struct {
SkillName string `json:"skill_name"`
Models []string `json:"models"`
PassRate float64 `json:"pass_rate"`
AggregateScore float64 `json:"aggregate_score"`
OutputFiles []string `json:"output_files"`
}
SkillSummary contains aggregated metrics for a single skill evaluation.
type SpecIdentity ¶
type StatisticalSummary ¶
type StatisticalSummary struct {
BootstrapCI statistics.ConfidenceInterval `json:"bootstrap_ci"`
IsSignificant bool `json:"is_significant"`
NormalizedGain *float64 `json:"normalized_gain,omitempty"`
}
StatisticalSummary holds aggregate statistical data for the digest when trials_per_task > 1.
type TaskTranscript ¶
type TaskTranscript struct {
TaskID string `json:"task_id"`
TaskName string `json:"task_name"`
Status Status `json:"status"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
DurationMs int64 `json:"duration_ms"`
Prompt string `json:"prompt"`
FinalOutput string `json:"final_output"`
Transcript []TranscriptEvent `json:"transcript"`
Validations map[string]GraderResults `json:"validations,omitempty"`
Session SessionDigest `json:"session"`
ErrorMsg string `json:"error_msg,omitempty"`
}
TaskTranscript is the per-task JSON file written to the transcript directory.
type TestCase ¶
type TestCase struct {
Active *bool `yaml:"enabled,omitempty" json:"active,omitempty"`
ContextRoot string `yaml:"context_dir,omitempty" json:"context_root,omitempty"`
DisplayName string `yaml:"name" json:"display_name"`
Expectation TestExpectation `yaml:"expected,omitempty" json:"expectation,omitempty"`
Stimulus TestStimulus `yaml:"inputs" json:"stimulus"`
Summary string `yaml:"description,omitempty" json:"summary,omitempty"`
Tags []string `yaml:"tags,omitempty" json:"labels,omitempty"`
TestID string `yaml:"id" json:"test_id"`
TimeoutSec *int `yaml:"timeout_seconds,omitempty" json:"timeout_sec,omitempty"`
Validators []ValidatorInline `yaml:"graders,omitempty" json:"validators,omitempty"`
}
TestCase represents a single evaluation test
func LoadTestCase ¶
LoadTestCase loads a test case from YAML
type TestExpectation ¶
type TestExpectation struct {
OutcomeSpecs []OutcomeSpec `yaml:"outcomes,omitempty" json:"outcome_specs,omitempty"`
ToolPatterns map[string]any `yaml:"tool_calls,omitempty" json:"tool_patterns,omitempty"`
BehaviorRules BehaviorRules `yaml:"behavior,omitempty" json:"behavior_rules,omitempty"`
MustInclude []string `yaml:"output_contains,omitempty" json:"must_include,omitempty"`
MustExclude []string `yaml:"output_not_contains,omitempty" json:"must_exclude,omitempty"`
ExpectedTrigger *bool `yaml:"should_trigger,omitempty" json:"expected_trigger,omitempty"`
}
TestExpectation defines expected outcomes
type TestOutcome ¶
type TestOutcome struct {
TestID string `json:"test_id"`
DisplayName string `json:"display_name"`
Group string `json:"group,omitempty"`
Status Status `json:"status"`
Runs []RunResult `json:"runs"`
Stats *TestStats `json:"stats,omitempty"`
SkillImpact *SkillImpactMetric `json:"skill_impact,omitempty"`
}
TestOutcome represents the result of one test case
type TestStats ¶
type TestStats struct {
PassRate float64 `json:"pass_rate"`
FlakinessPercent float64 `json:"flakiness_percent"`
PassedRuns int `json:"passed_runs"`
FailedRuns int `json:"failed_runs"`
ErrorRuns int `json:"error_runs"`
TotalRuns int `json:"total_runs"`
AvgScore float64 `json:"avg_score"`
AvgWeightedScore float64 `json:"avg_weighted_score"`
MinScore float64 `json:"min_score"`
MaxScore float64 `json:"max_score"`
StdDevScore float64 `json:"std_dev_score"`
ScoreVariance float64 `json:"score_variance"`
CI95Lo float64 `json:"ci95_lo"`
CI95Hi float64 `json:"ci95_hi"`
Flaky bool `json:"flaky"`
AvgDurationMs int64 `json:"avg_duration_ms"`
// Bootstrap confidence interval over weighted scores (populated when trials_per_task > 1)
BootstrapCI *statistics.ConfidenceInterval `json:"bootstrap_ci,omitempty"`
IsSignificant *bool `json:"is_significant,omitempty"`
}
type TestStimulus ¶
type TestStimulus struct {
Message string `yaml:"prompt" json:"message"`
Metadata map[string]any `yaml:"context,omitempty" json:"metadata,omitempty"`
Resources []ResourceRef `yaml:"files,omitempty" json:"resources,omitempty"`
Environment map[string]string `yaml:"environment,omitempty" json:"environment,omitempty"`
}
TestStimulus defines the input for a test
type TextGraderParameters ¶
type TextGraderParameters struct {
// Contains lists substrings that must appear in the output (case-insensitive).
Contains []string `yaml:"contains,omitempty" json:"contains,omitempty"`
// NotContains lists substrings that must NOT appear in the output (case-insensitive).
NotContains []string `yaml:"not_contains,omitempty" json:"not_contains,omitempty"`
// ContainsCS lists substrings that must appear in the output (case-sensitive).
ContainsCS []string `yaml:"contains_cs,omitempty" json:"contains_cs,omitempty"`
// NotContainsCS lists substrings that must NOT appear in the output (case-sensitive).
NotContainsCS []string `yaml:"not_contains_cs,omitempty" json:"not_contains_cs,omitempty"`
// RegexMatch lists regex patterns that must match somewhere in the output.
RegexMatch []string `yaml:"regex_match,omitempty" json:"regex_match,omitempty"`
// RegexNotMatch lists regex patterns that must NOT match anywhere in the output.
RegexNotMatch []string `yaml:"regex_not_match,omitempty" json:"regex_not_match,omitempty"`
}
TextGraderParameters holds the arguments for creating a text grader.
type ToolCall ¶
type ToolCall struct {
Name string `json:"name"`
Arguments ToolCallArgs `json:"arguments,omitempty"`
Result *copilot.Result `json:"result,omitempty"`
Success bool `json:"success"`
}
ToolCall represents a tool invocation
func FilterToolCalls ¶
func FilterToolCalls(sessionEvents []copilot.SessionEvent) []ToolCall
FilterToolCalls walks the list of session events and correlates each tool start with its outcome, which is recorded in the ToolCall's Success field.
type ToolCallArgs ¶
type ToolCallArgs struct {
// these are filled out for file-based tools (view/edit)
Path string `json:"path" mapstructure:"path"`
FileText string `json:"file_text" mapstructure:"file_text"`
// filled out for tools like bash or powershell
Command string `json:"command" mapstructure:"command"`
Description string `json:"description" mapstructure:"description"`
// filled out for skill invocations
Skill string `json:"skill" mapstructure:"skill"`
}
type ToolConstraintGraderParameters ¶
type ToolConstraintGraderParameters struct {
ExpectTools []ToolSpecParameters `yaml:"expect_tools,omitempty" json:"expect_tools,omitempty"`
RejectTools []ToolSpecParameters `yaml:"reject_tools,omitempty" json:"reject_tools,omitempty"`
}
type ToolSpecParameters ¶
type ToolSpecParameters struct {
Tool string `yaml:"tool" json:"tool"`
CommandPattern string `yaml:"command_pattern,omitempty" json:"command_pattern,omitempty"`
SkillPattern string `yaml:"skill_pattern,omitempty" json:"skill_pattern,omitempty"`
PathPattern string `yaml:"path_pattern,omitempty" json:"path_pattern,omitempty"`
}
type TranscriptEvent ¶
type TranscriptEvent struct {
copilot.SessionEvent `json:"-"`
}
func (TranscriptEvent) MarshalJSON ¶
func (te TranscriptEvent) MarshalJSON() ([]byte, error)
func (*TranscriptEvent) UnmarshalJSON ¶
func (te *TranscriptEvent) UnmarshalJSON(data []byte) error
type TriggerHeuristicGraderParameters ¶
type TriggerHeuristicGraderParameters struct {
SkillPath string `yaml:"skill_path" json:"skill_path"`
Mode string `yaml:"mode" json:"mode"`
Threshold *float64 `yaml:"threshold,omitempty" json:"threshold,omitempty"`
}
TriggerHeuristicGraderParameters holds the arguments for creating a trigger heuristic grader.
type TriggerMetrics ¶
type TriggerMetrics struct {
TP int `json:"true_positives"`
FP int `json:"false_positives"`
TN int `json:"true_negatives"`
FN int `json:"false_negatives"`
Errors int `json:"errors,omitempty"`
Precision float64 `json:"precision"`
Recall float64 `json:"recall"`
F1 float64 `json:"f1"`
Accuracy float64 `json:"accuracy"`
}
TriggerMetrics holds classification metrics for trigger accuracy.
func ComputeTriggerMetrics ¶
func ComputeTriggerMetrics(results []TriggerResult) *TriggerMetrics
ComputeTriggerMetrics calculates precision, recall, F1, and accuracy from a set of trigger classification results. Results are weighted by confidence: "high" (or empty) counts as 1.0, "medium" as 0.5. Returns nil when results is empty.
type TriggerResult ¶
type TriggerResult struct {
Prompt string `json:"prompt"`
Confidence string `json:"confidence,omitempty"`
ShouldTrigger bool `json:"should_trigger"`
DidTrigger bool `json:"did_trigger"`
ErrorMsg string `json:"error_msg,omitempty"`
FinalOutput string `json:"final_output,omitempty"`
Transcript []TranscriptEvent `json:"transcript,omitempty"`
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
SessionID string `json:"session_id,omitempty"`
}
TriggerResult pairs an expected trigger label with the actual outcome.
type UsageStats ¶
type UsageStats struct {
Turns int `json:"turns"`
InputTokens int `json:"input_tokens"`
OutputTokens int `json:"output_tokens"`
CacheReadTokens int `json:"cache_read_tokens"`
CacheWriteTokens int `json:"cache_write_tokens"`
PremiumRequests float64 `json:"premium_requests"`
ModelMetrics map[string]ModelUsage `json:"model_metrics,omitempty"`
}
UsageStats holds token and premium request usage data from a Copilot SDK session.
func AggregateUsageStats ¶
func AggregateUsageStats(stats []*UsageStats) *UsageStats
AggregateUsageStats sums usage across multiple UsageStats (e.g. across runs).
func (*UsageStats) IsZero ¶
func (u *UsageStats) IsZero() bool
IsZero returns true if no usage data has been recorded.
type ValidatorInline ¶
type ValidatorInline struct {
Identifier string `yaml:"name" json:"identifier"`
Kind GraderKind `yaml:"type,omitempty" json:"kind,omitempty"`
Checks []string `yaml:"assertions,omitempty" json:"checks,omitempty"`
Rubric string `yaml:"rubric,omitempty" json:"rubric,omitempty"`
Weight float64 `yaml:"weight,omitempty" json:"weight,omitempty"`
Parameters GraderParameters `yaml:"config,omitempty" json:"parameters,omitempty"`
}
ValidatorInline is a validator embedded in a test case
func (*ValidatorInline) UnmarshalYAML ¶
func (v *ValidatorInline) UnmarshalYAML(node *yaml.Node) error