Documentation ¶
Index ¶
- Constants
- func SchemaForEvalConfig() (string, error)
- type AgenticStep
- type DimensionCriteria
- type Eval
- type EvalClient
- type EvalClientConfig
- type EvalConfig
- type EvalResult
- type EvalRunResult
- type EvalTrace
- type GradeResult
- type GradingRubric
- type GradingTrace
- type MCPServerConfig
- type MaxSteps
- type MaxTokens
- type ToolCall
- type ValidationError
- type ValidationResult
Constants ¶
const (
	AgentSystemPrompt = "" /* 164-byte string literal not displayed */
	EvalSystemPrompt  = `` /* 1251-byte string literal not displayed */
)
Variables ¶
This section is empty.
Functions ¶
func SchemaForEvalConfig ¶
func SchemaForEvalConfig() (string, error)
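A minimal sketch of writing the generated schema to a file so editors or CI can validate config files against it. The mcpeval import alias and path, and the output file name, are placeholders, not part of this package:

package main

import (
	"log"
	"os"

	mcpeval "example.com/mcpeval" // placeholder: this package's actual import path
)

func main() {
	schema, err := mcpeval.SchemaForEvalConfig()
	if err != nil {
		log.Fatal(err)
	}
	// Persist the JSON schema for use by ValidateConfigFile-style tooling or editors.
	if err := os.WriteFile("evalconfig.schema.json", []byte(schema), 0o644); err != nil {
		log.Fatal(err)
	}
}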
Types ¶
type AgenticStep ¶
type AgenticStep struct {
StepNumber int `json:"step_number"` // 1-indexed step number
StartTime time.Time `json:"start_time"` // When this step started
EndTime time.Time `json:"end_time"` // When this step completed
Duration time.Duration `json:"duration"` // Step execution duration
ModelResponse string `json:"model_response"` // Text content from assistant
StopReason string `json:"stop_reason"` // end_turn, tool_use, max_tokens, etc.
ToolCalls []ToolCall `json:"tool_calls"` // Tools executed in this step
InputTokens int `json:"input_tokens"` // Input tokens for this step
OutputTokens int `json:"output_tokens"` // Output tokens for this step
CacheCreationInputTokens int `json:"cache_creation_input_tokens"` // Tokens used to create cache
CacheReadInputTokens int `json:"cache_read_input_tokens"` // Tokens read from cache
Error string `json:"error,omitempty"` // Error message if step failed
}
AgenticStep records a single iteration of the agentic loop
type DimensionCriteria ¶
type DimensionCriteria struct {
Description string `yaml:"description,omitempty" json:"description,omitempty" jsonschema:"What this dimension means for this specific eval"`
MustHave []string `yaml:"must_have,omitempty" json:"must_have,omitempty" jsonschema:"Required elements for high scores (4-5)"`
NiceToHave []string `yaml:"nice_to_have,omitempty" json:"nice_to_have,omitempty" jsonschema:"Optional elements that improve scores"`
Penalties []string `` /* 128-byte string literal not displayed */
}
DimensionCriteria provides specific guidance for grading a dimension
type Eval ¶
type Eval struct {
Name string `yaml:"name" json:"name" jsonschema:"Unique identifier for this evaluation"`
Description string `yaml:"description,omitempty" json:"description,omitempty" jsonschema:"Human-readable description of what this eval tests"`
Prompt string `yaml:"prompt" json:"prompt" jsonschema:"The input prompt to send to the LLM"`
ExpectedResult string `` /* 151-byte string literal not displayed */
AgentSystemPrompt string `` /* 157-byte string literal not displayed */
GradingRubric *GradingRubric `` /* 129-byte string literal not displayed */
}
Eval represents a single evaluation test case
type EvalClient ¶
type EvalClient struct {
// contains filtered or unexported fields
}
func NewEvalClient ¶
func NewEvalClient(config EvalClientConfig) *EvalClient
func (*EvalClient) RunEval ¶
func (ec *EvalClient) RunEval(ctx context.Context, eval Eval) (*EvalRunResult, error)
func (*EvalClient) RunEvals ¶
func (ec *EvalClient) RunEvals(ctx context.Context, evals []Eval) ([]EvalRunResult, error)
RunEvals executes multiple evaluations and returns all results. Each eval reuses the same MCP session for efficiency. Individual eval failures are captured in EvalRunResult.Error and don't stop the batch.
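A sketch of running a small batch against a stdio MCP server. The server command, model ID, and eval content are placeholders, and the sketch assumes unset optional fields (MaxSteps, MaxTokens, caching) fall back to package defaults; call ApplyDefaults explicitly if you want that to be visible:

package main

import (
	"context"
	"log"
	"os"

	mcpeval "example.com/mcpeval" // placeholder: this package's actual import path
)

func main() {
	client := mcpeval.NewEvalClient(mcpeval.EvalClientConfig{
		APIKey:  os.Getenv("ANTHROPIC_API_KEY"),
		Command: "./my-mcp-server",          // placeholder: command that starts the MCP server under test
		Model:   "claude-sonnet-4-20250514", // placeholder model ID
	})

	evals := []mcpeval.Eval{{
		Name:           "lookup-user",
		Prompt:         "Look up the user with ID 42 and summarize their profile.",
		ExpectedResult: "Identifies user 42 and summarizes the profile using tool output.",
	}}

	results, err := client.RunEvals(context.Background(), evals)
	if err != nil {
		log.Fatal(err)
	}
	for _, r := range results {
		// Per-eval failures are reported in r.Error; a non-nil batch error is fatal.
		if r.Error != nil {
			log.Printf("%s: failed: %v", r.Eval.Name, r.Error)
			continue
		}
		log.Printf("%s: ok", r.Eval.Name)
	}
}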
type EvalClientConfig ¶
type EvalClientConfig struct {
APIKey string
BaseURL string // Optional: if set, override the default Anthropic API endpoint
Command string
Args []string
Env []string
Model string
GradingModel string // Optional: if set, use this model for grading instead of Model
AgentSystemPrompt string // Optional: custom system prompt for the agent being evaluated
MaxSteps int
MaxTokens int
EnablePromptCaching *bool // Optional: enable Anthropic prompt caching for tool definitions and system prompts. Default: true
CacheTTL string // Optional: cache time-to-live, either "5m" (default) or "1h". Requires EnablePromptCaching=true
EnforceMinimumScores *bool // Optional: enforce minimum scores from grading rubrics. Default: true
StderrCallback func(line string) // Optional: called for each line written to stderr by the MCP server subprocess
}
func (*EvalClientConfig) ApplyDefaults ¶
func (c *EvalClientConfig) ApplyDefaults() *EvalClientConfig
ApplyDefaults sets default values for optional configuration fields. This method modifies the config in-place and returns a pointer to it for method chaining.
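A short sketch of the chaining pattern. The specific default values are not shown here; the point is only that unset optional fields are filled in before the config is handed to NewEvalClient (mcpeval is a placeholder alias as above):

cfg := &mcpeval.EvalClientConfig{
	APIKey:  os.Getenv("ANTHROPIC_API_KEY"),
	Command: "./my-mcp-server", // placeholder server command
}
cfg.ApplyDefaults() // fills unset optional fields in place and returns the same pointer for chaining
client := mcpeval.NewEvalClient(*cfg)
_ = client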
type EvalConfig ¶
type EvalConfig struct {
Model string `yaml:"model" json:"model" jsonschema:"Anthropic model ID to use for evaluations"`
GradingModel string `` /* 140-byte string literal not displayed */
AgentSystemPrompt string `` /* 167-byte string literal not displayed */
Timeout string `yaml:"timeout,omitempty" json:"timeout,omitempty" jsonschema:"Timeout duration for each evaluation (e.g., '2m', '30s')"`
MaxSteps MaxSteps `yaml:"max_steps,omitempty" json:"max_steps,omitempty" jsonschema:"Maximum number of agentic loop iterations"`
MaxTokens MaxTokens `yaml:"max_tokens,omitempty" json:"max_tokens,omitempty" jsonschema:"Maximum tokens per LLM request"`
EnablePromptCaching *bool `` /* 198-byte string literal not displayed */
CacheTTL string `` /* 162-byte string literal not displayed */
EnforceMinimumScores *bool `` /* 180-byte string literal not displayed */
MCPServer MCPServerConfig `yaml:"mcp_server" json:"mcp_server" jsonschema:"Configuration for the MCP server to evaluate"`
Evals []Eval `yaml:"evals" json:"evals" jsonschema:"List of evaluation test cases to run"`
}
EvalConfig represents the top-level configuration for running evaluations
func LoadConfig ¶
func LoadConfig(filePath string) (*EvalConfig, error)
LoadConfig loads an evaluation configuration from a YAML or JSON file. The file format is detected by the file extension (.yaml, .yml, or .json). Environment variables in the config file are expanded using ${VAR} or $VAR syntax. Supports shell-style default values: ${VAR:-default}
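A sketch of a YAML config that uses the ${VAR:-default} expansion, written to disk and then loaded. The file name, model ID, and server command are placeholders, and only YAML keys whose tags are shown in this documentation are used:

cfgYAML := []byte(`model: ${EVAL_MODEL:-claude-sonnet-4-20250514}
mcp_server:
  command: ./my-mcp-server
evals:
  - name: lookup-user
    prompt: Look up the user with ID 42 and summarize their profile.
`)
if err := os.WriteFile("evals.yaml", cfgYAML, 0o600); err != nil {
	log.Fatal(err)
}
cfg, err := mcpeval.LoadConfig("evals.yaml")
if err != nil {
	log.Fatal(err)
}
fmt.Println(cfg.Model) // falls back to the :-default when EVAL_MODEL is unset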
type EvalResult ¶
type EvalRunResult ¶
type EvalRunResult struct {
Eval Eval
Result *EvalResult
Grade *GradeResult
Error error
Trace *EvalTrace // Complete execution trace for debugging and analysis
}
EvalRunResult combines the eval configuration with its execution results
type EvalTrace ¶
type EvalTrace struct {
Steps []AgenticStep `json:"steps"` // Each step in the agentic loop
Grading *GradingTrace `json:"grading,omitempty"` // Grading interaction details
TotalDuration time.Duration `json:"total_duration"` // Total execution time
TotalInputTokens int `json:"total_input_tokens"` // Sum of input tokens across all steps
TotalOutputTokens int `json:"total_output_tokens"` // Sum of output tokens across all steps
StepCount int `json:"step_count"` // Number of agentic steps executed
ToolCallCount int `json:"tool_call_count"` // Total number of tool calls made
TotalCacheCreationTokens int `json:"total_cache_creation_tokens"` // Sum of cache creation tokens across all steps
TotalCacheReadTokens int `json:"total_cache_read_tokens"` // Sum of cache read tokens across all steps
}
EvalTrace captures complete execution history of an evaluation run
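A sketch of summarizing traces after a run, assuming a results slice returned by RunEvals as in the earlier example:

for _, r := range results {
	if r.Trace == nil {
		continue
	}
	t := r.Trace
	fmt.Printf("%s: %d steps, %d tool calls, %d in / %d out tokens, %s\n",
		r.Eval.Name, t.StepCount, t.ToolCallCount,
		t.TotalInputTokens, t.TotalOutputTokens, t.TotalDuration)
	// Drill into each step's tool invocations for debugging.
	for _, step := range t.Steps {
		for _, call := range step.ToolCalls {
			fmt.Printf("  step %d: %s (%s) success=%v\n",
				step.StepNumber, call.ToolName, call.Duration, call.Success)
		}
	}
}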
type GradeResult ¶
type GradingRubric ¶
type GradingRubric struct {
// Optional: Override which dimensions to grade (defaults to all 5 standard dimensions)
Dimensions []string `` /* 149-byte string literal not displayed */
// Criteria for each dimension - what to look for when grading
Accuracy *DimensionCriteria `yaml:"accuracy,omitempty" json:"accuracy,omitempty" jsonschema:"Specific criteria for accuracy scoring"`
Completeness *DimensionCriteria `yaml:"completeness,omitempty" json:"completeness,omitempty" jsonschema:"Specific criteria for completeness scoring"`
Relevance *DimensionCriteria `yaml:"relevance,omitempty" json:"relevance,omitempty" jsonschema:"Specific criteria for relevance scoring"`
Clarity *DimensionCriteria `yaml:"clarity,omitempty" json:"clarity,omitempty" jsonschema:"Specific criteria for clarity scoring"`
Reasoning *DimensionCriteria `yaml:"reasoning,omitempty" json:"reasoning,omitempty" jsonschema:"Specific criteria for reasoning scoring"`
// Optional: Minimum acceptable scores for pass/fail
MinimumScores map[string]int `` /* 126-byte string literal not displayed */
}
GradingRubric defines specific evaluation criteria for grading
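A sketch of attaching a rubric to an eval. The "accuracy" key used in MinimumScores is assumed to name one of the standard dimensions; check the MinimumScores field documentation for the exact key format:

rubric := &mcpeval.GradingRubric{
	Accuracy: &mcpeval.DimensionCriteria{
		Description: "Facts about user 42 must match the tool output.",
		MustHave:    []string{"cites the user's ID", "uses data returned by the tool"},
		Penalties:   []string{"invents fields not present in the tool output"},
	},
	MinimumScores: map[string]int{"accuracy": 4}, // key name assumed to match the dimension
}
if err := rubric.Validate(); err != nil {
	log.Fatal(err)
}
eval := mcpeval.Eval{
	Name:          "lookup-user",
	Prompt:        "Look up the user with ID 42 and summarize their profile.",
	GradingRubric: rubric,
}
_ = eval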
func (*GradingRubric) CheckMinimumScores ¶
func (r *GradingRubric) CheckMinimumScores(grade *GradeResult) error
CheckMinimumScores verifies that graded scores meet minimum thresholds
func (*GradingRubric) Validate ¶
func (r *GradingRubric) Validate() error
Validate checks that the rubric is well-formed
type GradingTrace ¶
type GradingTrace struct {
UserPrompt string `json:"user_prompt"` // Original eval prompt
ModelResponse string `json:"model_response"` // Model's answer being graded
ExpectedResult string `json:"expected_result"` // Expected result description
GradingPrompt string `json:"grading_prompt"` // Full prompt sent to grader
RawGradingOutput string `json:"raw_grading_output"` // Complete LLM response before parsing
StartTime time.Time `json:"start_time"` // When grading started
EndTime time.Time `json:"end_time"` // When grading completed
Duration time.Duration `json:"duration"` // Grading duration
InputTokens int `json:"input_tokens"` // Input tokens for grading
OutputTokens int `json:"output_tokens"` // Output tokens for grading
CacheCreationInputTokens int `json:"cache_creation_input_tokens"` // Tokens used to create cache
CacheReadInputTokens int `json:"cache_read_input_tokens"` // Tokens read from cache
Error string `json:"error,omitempty"` // Error message if grading failed
}
GradingTrace records the grading interaction with the LLM
type MCPServerConfig ¶
type MCPServerConfig struct {
Command string `yaml:"command" json:"command" jsonschema:"Command to start the MCP server"`
Args []string `yaml:"args,omitempty" json:"args,omitempty" jsonschema:"Arguments to pass to the command"`
Env []string `yaml:"env,omitempty" json:"env,omitempty" jsonschema:"Environment variables to set for the MCP server"`
}
MCPServerConfig defines how to start the MCP server
type ToolCall ¶
type ToolCall struct {
ToolID string `json:"tool_id"` // Unique ID from content block
ToolName string `json:"tool_name"` // MCP tool name
StartTime time.Time `json:"start_time"` // When tool execution started
EndTime time.Time `json:"end_time"` // When tool execution completed
Duration time.Duration `json:"duration"` // Tool execution duration
Input json.RawMessage `json:"input"` // Tool arguments as JSON
Output json.RawMessage `json:"output"` // Tool result as JSON
Success bool `json:"success"` // Whether tool executed successfully
Error string `json:"error,omitempty"` // Error message if tool failed
}
ToolCall captures details of a single tool invocation
type ValidationError ¶
type ValidationError struct {
Path string // JSON path to the error (e.g., "mcp_server.command")
Message string // Human-readable error message
}
ValidationError represents a single validation error with location information
type ValidationResult ¶
type ValidationResult struct {
Valid bool
Errors []ValidationError
}
ValidationResult contains the results of validating a config file
func ValidateConfigFile ¶
func ValidateConfigFile(filePath string) (*ValidationResult, error)
ValidateConfigFile validates a configuration file against the JSON schema. It reads the file, converts YAML to JSON if needed, and validates against the schema.
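A sketch of validating a config file before running and printing each finding with its JSON path; the file name is a placeholder:

res, err := mcpeval.ValidateConfigFile("evals.yaml")
if err != nil {
	log.Fatal(err) // I/O or parse failure, not a validation finding
}
if !res.Valid {
	for _, e := range res.Errors {
		fmt.Printf("%s: %s\n", e.Path, e.Message)
	}
	os.Exit(1)
}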