Documentation

Index
- Constants
- func SchemaForEvalConfig() (string, error)
- type AgenticStep
- type DimensionCriteria
- type Eval
- type EvalClient
- type EvalClientConfig
- type EvalConfig
- type EvalResult
- type EvalRunResult
- type EvalTrace
- type GradeResult
- type GradingRubric
- type GradingTrace
- type MCPServerConfig
- type MaxSteps
- type MaxTokens
- type ToolCall
- type ValidationError
- type ValidationResult
Constants
const (
	AgentSystemPrompt = "" /* 164-byte string literal not displayed */
	EvalSystemPrompt  = `` /* 1251-byte string literal not displayed */
)
Variables
This section is empty.
Functions
func SchemaForEvalConfig
func SchemaForEvalConfig() (string, error)
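For example, the schema can be dumped for editor tooling or CI. A minimal sketch, assuming the package is imported as evals (the real import path isn't shown on this page):

package main

import (
	"fmt"
	"log"

	evals "example.com/mcp-evals" // hypothetical import path
)

func main() {
	schema, err := evals.SchemaForEvalConfig()
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(schema) // the JSON schema for EvalConfig
}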
Types
type AgenticStep
type AgenticStep struct {
	StepNumber               int           `json:"step_number"`                 // 1-indexed step number
	StartTime                time.Time     `json:"start_time"`                  // When this step started
	EndTime                  time.Time     `json:"end_time"`                    // When this step completed
	Duration                 time.Duration `json:"duration"`                    // Step execution duration
	ModelResponse            string        `json:"model_response"`              // Text content from assistant
	StopReason               string        `json:"stop_reason"`                 // end_turn, tool_use, max_tokens, etc.
	ToolCalls                []ToolCall    `json:"tool_calls"`                  // Tools executed in this step
	InputTokens              int           `json:"input_tokens"`                // Input tokens for this step
	OutputTokens             int           `json:"output_tokens"`               // Output tokens for this step
	CacheCreationInputTokens int           `json:"cache_creation_input_tokens"` // Tokens used to create cache
	CacheReadInputTokens     int           `json:"cache_read_input_tokens"`     // Tokens read from cache
	Error                    string        `json:"error,omitempty"`             // Error message if step failed
}
AgenticStep records a single iteration of the agentic loop
type DimensionCriteria
type DimensionCriteria struct {
	Description string   `yaml:"description,omitempty" json:"description,omitempty" jsonschema:"What this dimension means for this specific eval"`
	MustHave    []string `yaml:"must_have,omitempty" json:"must_have,omitempty" jsonschema:"Required elements for high scores (4-5)"`
	NiceToHave  []string `yaml:"nice_to_have,omitempty" json:"nice_to_have,omitempty" jsonschema:"Optional elements that improve scores"`
	Penalties   []string `` /* 128-byte string literal not displayed */
}
DimensionCriteria provides specific guidance for grading a dimension
type Eval
type Eval struct {
	Name              string         `yaml:"name" json:"name" jsonschema:"Unique identifier for this evaluation"`
	Description       string         `yaml:"description,omitempty" json:"description,omitempty" jsonschema:"Human-readable description of what this eval tests"`
	Prompt            string         `yaml:"prompt" json:"prompt" jsonschema:"The input prompt to send to the LLM"`
	ExpectedResult    string         `` /* 151-byte string literal not displayed */
	AgentSystemPrompt string         `` /* 157-byte string literal not displayed */
	GradingRubric     *GradingRubric `` /* 129-byte string literal not displayed */
}
Eval represents a single evaluation test case
type EvalClient
type EvalClient struct {
	// contains filtered or unexported fields
}
func NewEvalClient
func NewEvalClient(config EvalClientConfig) *EvalClient
func (*EvalClient) RunEval
func (ec *EvalClient) RunEval(ctx context.Context, eval Eval) (*EvalRunResult, error)
func (*EvalClient) RunEvals
func (ec *EvalClient) RunEvals(ctx context.Context, evals []Eval) ([]EvalRunResult, error)
RunEvals executes multiple evaluations and returns all results. Individual eval failures are captured in EvalRunResult.Error and don't stop the batch.
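A sketch of a typical batch run; the server command, model ID, and eval content are placeholders, and the remaining examples on this page assume the same evals import:

ctx := context.Background()

client := evals.NewEvalClient(evals.EvalClientConfig{
	APIKey:  os.Getenv("ANTHROPIC_API_KEY"),
	Command: "./my-mcp-server",   // placeholder MCP server binary
	Model:   "claude-sonnet-4-0", // illustrative model ID
})

results, err := client.RunEvals(ctx, []evals.Eval{{
	Name:           "smoke-test",
	Prompt:         "List your available tools and call one of them.",
	ExpectedResult: "Names the server's tools and shows a real tool result",
}})
if err != nil {
	log.Fatal(err)
}
for _, r := range results {
	if r.Error != nil {
		fmt.Printf("%s: error: %v\n", r.Eval.Name, r.Error)
		continue // per-eval failures don't stop the batch
	}
	fmt.Printf("%s: ok\n", r.Eval.Name)
}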
type EvalClientConfig
type EvalClientConfig struct {
	APIKey               string
	BaseURL              string // Optional: if set, overrides the default Anthropic API endpoint
	Command              string
	Args                 []string
	Env                  []string
	Model                string
	GradingModel         string // Optional: if set, use this model for grading instead of Model
	AgentSystemPrompt    string // Optional: custom system prompt for the agent being evaluated
	MaxSteps             int
	MaxTokens            int
	EnablePromptCaching  *bool  // Optional: enable Anthropic prompt caching for tool definitions and system prompts. Default: true
	CacheTTL             string // Optional: cache time-to-live, either "5m" (default) or "1h". Requires EnablePromptCaching=true
	EnforceMinimumScores *bool  // Optional: enforce minimum scores from grading rubrics. Default: true
	StderrCallback       func(line string) // Optional: called for each line written to stderr by the MCP server subprocess
}
func (*EvalClientConfig) ApplyDefaults
func (c *EvalClientConfig) ApplyDefaults() *EvalClientConfig
ApplyDefaults sets default values for optional configuration fields. This method modifies the config in-place and returns a pointer to it for method chaining.
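Since the method returns the same pointer, it chains when building a config by hand (sketch):

cfg := (&evals.EvalClientConfig{
	APIKey:  os.Getenv("ANTHROPIC_API_KEY"),
	Command: "./my-mcp-server", // placeholder
}).ApplyDefaults()

// Fields left unset now hold package defaults where defined;
// the exact default values aren't documented on this page.
client := evals.NewEvalClient(*cfg)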
type EvalConfig
type EvalConfig struct {
	Model                string          `yaml:"model" json:"model" jsonschema:"Anthropic model ID to use for evaluations"`
	GradingModel         string          `` /* 140-byte string literal not displayed */
	AgentSystemPrompt    string          `` /* 167-byte string literal not displayed */
	Timeout              string          `yaml:"timeout,omitempty" json:"timeout,omitempty" jsonschema:"Timeout duration for each evaluation (e.g., '2m', '30s')"`
	MaxSteps             MaxSteps        `yaml:"max_steps,omitempty" json:"max_steps,omitempty" jsonschema:"Maximum number of agentic loop iterations"`
	MaxTokens            MaxTokens       `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty" jsonschema:"Maximum tokens per LLM request"`
	EnablePromptCaching  *bool           `` /* 198-byte string literal not displayed */
	CacheTTL             string          `` /* 162-byte string literal not displayed */
	EnforceMinimumScores *bool           `` /* 180-byte string literal not displayed */
	MCPServer            MCPServerConfig `yaml:"mcp_server" json:"mcp_server" jsonschema:"Configuration for the MCP server to evaluate"`
	Evals                []Eval          `yaml:"evals" json:"evals" jsonschema:"List of evaluation test cases to run"`
}
EvalConfig represents the top-level configuration for running evaluations
func LoadConfig
func LoadConfig(filePath string) (*EvalConfig, error)
LoadConfig loads an evaluation configuration from a YAML or JSON file. The file format is detected by the file extension (.yaml, .yml, or .json). Environment variables in the config file are expanded using ${VAR} or $VAR syntax. Supports shell-style default values: ${VAR:-default}
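A sketch of a config file and loading it; field names follow the yaml tags above, and the model ID, command, and eval content are illustrative. Writing the YAML to a temp file keeps the example self-contained:

const yamlCfg = `
model: ${EVAL_MODEL:-claude-sonnet-4-0}
timeout: 2m
mcp_server:
  command: ./my-mcp-server
  env:
    - API_TOKEN=${API_TOKEN}
evals:
  - name: smoke-test
    prompt: "List your available tools and call one of them."
    expected_result: "Names the server's tools and shows a real tool result"
`

path := filepath.Join(os.TempDir(), "eval.yaml")
if err := os.WriteFile(path, []byte(yamlCfg), 0o600); err != nil {
	log.Fatal(err)
}
cfg, err := evals.LoadConfig(path)
if err != nil {
	log.Fatal(err)
}
fmt.Printf("model=%s evals=%d\n", cfg.Model, len(cfg.Evals))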
type EvalResult
type EvalRunResult
type EvalRunResult struct {
	Eval   Eval
	Result *EvalResult
	Grade  *GradeResult
	Error  error
	Trace  *EvalTrace // Complete execution trace for debugging and analysis
}
EvalRunResult combines the eval configuration with its execution results
type EvalTrace
type EvalTrace struct {
	Steps                    []AgenticStep `json:"steps"`                       // Each step in the agentic loop
	Grading                  *GradingTrace `json:"grading,omitempty"`           // Grading interaction details
	TotalDuration            time.Duration `json:"total_duration"`              // Total execution time
	TotalInputTokens         int           `json:"total_input_tokens"`          // Sum of input tokens across all steps
	TotalOutputTokens        int           `json:"total_output_tokens"`         // Sum of output tokens across all steps
	StepCount                int           `json:"step_count"`                  // Number of agentic steps executed
	ToolCallCount            int           `json:"tool_call_count"`             // Total number of tool calls made
	TotalCacheCreationTokens int           `json:"total_cache_creation_tokens"` // Sum of cache creation tokens across all steps
	TotalCacheReadTokens     int           `json:"total_cache_read_tokens"`     // Sum of cache read tokens across all steps
}
EvalTrace captures complete execution history of an evaluation run
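Assuming r is an EvalRunResult from a run above, the trace supports post-hoc analysis such as listing every tool call:

if r.Trace != nil {
	t := r.Trace
	fmt.Printf("steps=%d tools=%d duration=%s tokens=%d/%d\n",
		t.StepCount, t.ToolCallCount, t.TotalDuration,
		t.TotalInputTokens, t.TotalOutputTokens)
	for _, step := range t.Steps {
		for _, tc := range step.ToolCalls {
			fmt.Printf("  step %d: %s in %s (success=%t)\n",
				step.StepNumber, tc.ToolName, tc.Duration, tc.Success)
		}
	}
}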
type GradeResult
type GradingRubric
type GradingRubric struct {
	// Optional: Override which dimensions to grade (defaults to all 5 standard dimensions)
	Dimensions []string `` /* 149-byte string literal not displayed */

	// Criteria for each dimension - what to look for when grading
	Accuracy     *DimensionCriteria `yaml:"accuracy,omitempty" json:"accuracy,omitempty" jsonschema:"Specific criteria for accuracy scoring"`
	Completeness *DimensionCriteria `yaml:"completeness,omitempty" json:"completeness,omitempty" jsonschema:"Specific criteria for completeness scoring"`
	Relevance    *DimensionCriteria `yaml:"relevance,omitempty" json:"relevance,omitempty" jsonschema:"Specific criteria for relevance scoring"`
	Clarity      *DimensionCriteria `yaml:"clarity,omitempty" json:"clarity,omitempty" jsonschema:"Specific criteria for clarity scoring"`
	Reasoning    *DimensionCriteria `yaml:"reasoning,omitempty" json:"reasoning,omitempty" jsonschema:"Specific criteria for reasoning scoring"`

	// Optional: Minimum acceptable scores for pass/fail
	MinimumScores map[string]int `` /* 126-byte string literal not displayed */
}
GradingRubric defines specific evaluation criteria for grading
func (*GradingRubric) CheckMinimumScores
func (r *GradingRubric) CheckMinimumScores(grade *GradeResult) error
CheckMinimumScores verifies that graded scores meet minimum thresholds
func (*GradingRubric) Validate
func (r *GradingRubric) Validate() error
Validate checks that the rubric is well-formed
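A sketch of a rubric built in Go; the criteria are illustrative, and the lowercase keys in MinimumScores are an assumption based on the yaml tags (the exact key format isn't documented here):

rubric := &evals.GradingRubric{
	Accuracy: &evals.DimensionCriteria{
		Description: "Factual correctness of the tool results reported",
		MustHave:    []string{"Reports only tools the server actually exposes"},
		Penalties:   []string{"Fabricates tool names or outputs"},
	},
	MinimumScores: map[string]int{"accuracy": 4}, // assumed key format
}
if err := rubric.Validate(); err != nil {
	log.Fatal(err)
}
// After a run, check the grade attached to an EvalRunResult r:
if err := rubric.CheckMinimumScores(r.Grade); err != nil {
	fmt.Println("below minimum:", err)
}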
type GradingTrace
type GradingTrace struct {
	UserPrompt               string        `json:"user_prompt"`                 // Original eval prompt
	ModelResponse            string        `json:"model_response"`              // Model's answer being graded
	ExpectedResult           string        `json:"expected_result"`             // Expected result description
	GradingPrompt            string        `json:"grading_prompt"`              // Full prompt sent to grader
	RawGradingOutput         string        `json:"raw_grading_output"`          // Complete LLM response before parsing
	StartTime                time.Time     `json:"start_time"`                  // When grading started
	EndTime                  time.Time     `json:"end_time"`                    // When grading completed
	Duration                 time.Duration `json:"duration"`                    // Grading duration
	InputTokens              int           `json:"input_tokens"`                // Input tokens for grading
	OutputTokens             int           `json:"output_tokens"`               // Output tokens for grading
	CacheCreationInputTokens int           `json:"cache_creation_input_tokens"` // Tokens used to create cache
	CacheReadInputTokens     int           `json:"cache_read_input_tokens"`     // Tokens read from cache
	Error                    string        `json:"error,omitempty"`             // Error message if grading failed
}
GradingTrace records the grading interaction with the LLM
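When a grade looks wrong, the trace preserves the exact grader prompt and raw output; a sketch, again assuming r is an EvalRunResult:

if r.Trace != nil && r.Trace.Grading != nil {
	g := r.Trace.Grading
	fmt.Printf("grading: %s, %d in / %d out tokens\n",
		g.Duration, g.InputTokens, g.OutputTokens)
	if g.Error != "" {
		fmt.Println("grading failed:", g.Error)
		fmt.Println("raw grader output:", g.RawGradingOutput)
	}
}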
type MCPServerConfig
type MCPServerConfig struct {
	Command string   `yaml:"command" json:"command" jsonschema:"Command to start the MCP server"`
	Args    []string `yaml:"args,omitempty" json:"args,omitempty" jsonschema:"Arguments to pass to the command"`
	Env     []string `yaml:"env,omitempty" json:"env,omitempty" jsonschema:"Environment variables to set for the MCP server"`
}
MCPServerConfig defines how to start the MCP server
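The Go equivalent of the mcp_server block in the YAML sketch above; the filesystem-server invocation and the environment pair are illustrative:

server := evals.MCPServerConfig{
	Command: "npx",
	Args:    []string{"-y", "@modelcontextprotocol/server-filesystem", "."},
	Env:     []string{"EXAMPLE_TOKEN=secret"}, // KEY=VALUE pairs
}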
type ToolCall
type ToolCall struct {
	ToolID    string          `json:"tool_id"`         // Unique ID from content block
	ToolName  string          `json:"tool_name"`       // MCP tool name
	StartTime time.Time       `json:"start_time"`      // When tool execution started
	EndTime   time.Time       `json:"end_time"`        // When tool execution completed
	Duration  time.Duration   `json:"duration"`        // Tool execution duration
	Input     json.RawMessage `json:"input"`           // Tool arguments as JSON
	Output    json.RawMessage `json:"output"`          // Tool result as JSON
	Success   bool            `json:"success"`         // Whether tool executed successfully
	Error     string          `json:"error,omitempty"` // Error message if tool failed
}
ToolCall captures details of a single tool invocation
type ValidationError
type ValidationError struct {
	Path    string // JSON path to the error (e.g., "mcp_server.command")
	Message string // Human-readable error message
}
ValidationError represents a single validation error with location information
type ValidationResult
type ValidationResult struct {
	Valid  bool
	Errors []ValidationError
}
ValidationResult contains the results of validating a config file
func ValidateConfigFile
func ValidateConfigFile(filePath string) (*ValidationResult, error)
ValidateConfigFile validates a configuration file against the JSON schema. It reads the file, converts YAML to JSON if needed, and validates against the schema.
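A sketch of a pre-flight check suitable for CI:

result, err := evals.ValidateConfigFile("eval.yaml")
if err != nil {
	log.Fatal(err) // I/O or parse failure, as opposed to a schema violation
}
if !result.Valid {
	for _, e := range result.Errors {
		fmt.Printf("%s: %s\n", e.Path, e.Message)
	}
	os.Exit(1)
}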