Documentation ¶
Overview ¶
Package eval_harness provides AI code generation benchmarking, including the adapter layer between the unified ai.Provider interface and the eval harness.
Index ¶
- Constants
- Variables
- func CalculateCostWithBreakdown(model string, inputTokens, outputTokens int) float64
- func CategorizeError(compileOk, runtimeOk, stdoutOk bool) string
- func CategorizeErrorCode(stderr string) (ErrCode, *RepairHint)
- func CategorizeErrorWithCode(code, stderr string) (ErrCode, *RepairHint)
- func CompareOutput(expected, actual string) bool
- func ComputePromptHash(filePath string) (string, error)
- func DetectedPythonVersion() string
- func EnhancedGenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string) (string, string, error)
- func FindAILANG() (string, error)
- func FindModelsConfig(startDir string) (string, error)
- func FormatRepairPrompt(code ErrCode, hint *RepairHint, benchmarkID, lang, failedCode, stderr string) string
- func GenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, syntaxRef string, ...) string
- func GenerateAgentPromptsWithSystemPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string, ...) (string, string, string, error)
- func InitModelsConfig() error
- func KillProcess(pid int) error
- func KillProcessGroup(pid int) error
- func LoadActiveSyntaxReference(language string) (string, error)
- func LoadSystemPromptForLanguage(language string, promptVersion string) (string, string, error)
- func LoadTaskPromptTemplate(language string) (string, error)
- func PinnedPythonVersion() string
- func PopulateVerifyMetrics(metrics *RunMetrics, result *AICheckResult, rawJSON string)
- func PrepareWorkspaceWithSyntax(workspace string, spec *BenchmarkSpec, syntaxRef string) error
- func ResolveModelName(name string) (apiName, provider string, err error)
- func SetProcessGroup(cmd *exec.Cmd)
- func ValidatePythonCode(code string) (ErrCode, *RepairHint)
- type AIAgent
- type AICheckCheckResult
- type AICheckResult
- type AICheckVerifyResult
- type AILANGRunner
- type AgentBenchmarkConfig
- type AgentBenchmarkResult
- type BenchmarkSpec
- type ClaudeHeadlessResult
- type ErrCode
- type ErrUvMissing
- type EvalCondition
- type GenerateResult
- type GoRunner
- type JSRunner
- type LanguageRunner
- type LimitedWriter
- type MetricsLogger
- type MicroragMode
- type MockAIAgent
- type ModelConfig
- type ModelStats
- type ModelsConfig
- func (c *ModelsConfig) CalculateCostForModel(name string, inputTokens, outputTokens int) (float64, error)
- func (c *ModelsConfig) FilterAgentSupportedModels(models []string) []string
- func (c *ModelsConfig) GetAPIName(name string) (string, error)
- func (c *ModelsConfig) GetAgentCLI(name string) (string, error)
- func (c *ModelsConfig) GetAgentModelName(name string) (string, error)
- func (c *ModelsConfig) GetAgentSuite() []string
- func (c *ModelsConfig) GetBenchmarkSuite() []string
- func (c *ModelsConfig) GetDefaultModel() string
- func (c *ModelsConfig) GetEnvVar(name string) (string, error)
- func (c *ModelsConfig) GetExecutorForModel(name string) (executorName string, modelName string, err error)
- func (c *ModelsConfig) GetModel(name string) (*ModelConfig, error)
- func (c *ModelsConfig) GetProvider(name string) (string, error)
- func (c *ModelsConfig) ListModels() []string
- func (c *ModelsConfig) SupportsAgentEval(name string) bool
- type MultiExecutorConfig
- type Pricing
- type PromptLoader
- func (l *PromptLoader) GetActivePrompt() (string, error)
- func (l *PromptLoader) GetActiveVersionID() string
- func (l *PromptLoader) GetVersion(versionID string) (*PromptVersion, error)
- func (l *PromptLoader) ListVersions() map[string]PromptVersion
- func (l *PromptLoader) LoadPrompt(versionID string) (string, error)
- type PromptRegistry
- type PromptVersion
- type PythonRunner
- type RepairHint
- type RepairLog
- type RepairRunner
- type RetryConfig
- type RunMetrics
- type RunResult
- type TelemetryReporter
- func (t *TelemetryReporter) AddUsage(tokensIn, tokensOut int, cost float64)
- func (t *TelemetryReporter) Complete(tokensIn, tokensOut int, cost float64)
- func (t *TelemetryReporter) Error()
- func (t *TelemetryReporter) IncrementTurn()
- func (t *TelemetryReporter) IsEnabled() bool
- func (t *TelemetryReporter) SetStatus(status string)
- func (t *TelemetryReporter) SetUsage(tokensIn, tokensOut int, cost float64)
- type TokenUsage
- type ValidationResult
- type Watchdog
Constants ¶
const (
	EvalModeStandard = "standard" // Standard 0-shot + self-repair evaluation
	EvalModeAgent    = "agent"    // Agent-based evaluation with multi-turn interaction
)
EvalMode constants
const (
	ErrorCategoryNone    = "none"
	ErrorCategoryCompile = "compile_error"
	ErrorCategoryRuntime = "runtime_error"
	ErrorCategoryLogic   = "logic_error"
	ErrorCategoryAPI     = "api_error"    // API call failed (timeout, rate limit, connection error)
	ErrorCategoryVerify  = "verify_error" // Contract verification failed (M-CONTRACT-EVAL)
)
ErrorCategory constants
const (
	PinnedPythonMajor = 3
	PinnedPythonMinor = 12
)
PinnedPythonVersion is the Python version the eval suite targets. Both the runtime (the interpreter `uv` resolves) and the prompt (what we tell the model) are driven from this single constant so they can never drift.
3.12 is a modern production Python that supports structural pattern matching (`match/case`, PEP 634) — so models are not penalised for reaching for idiomatic constructs available in current stable Python.
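For illustration, a minimal sketch of how the pin might surface as a version string; pinnedVersionString is a hypothetical helper, but both the runtime and the prompt derive from these same two constants:

func pinnedVersionString() string {
	// Hypothetical: format the pinned constants as "major.minor" => "3.12".
	return fmt.Sprintf("%d.%d", PinnedPythonMajor, PinnedPythonMinor)
}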
const MaxOutputSize = 1 * 1024 * 1024 // 1 MB
MaxOutputSize is the maximum size (in bytes) for stdout/stderr capture. This prevents infinite-loop bugs from generating gigabyte-sized JSON files.
Variables ¶
var Rules = []errorRule{
	{
		WRONG_LANG,
		regexp.MustCompile(`(?i)(def |class |import json|import sys|function |var |const |#include|using namespace|public static|interface |enum class)`),
		RepairHint{
			Title: "Wrong programming language",
			Why:   "Generated code appears to be Python/JavaScript/C++/Java, not AILANG. AILANG is a pure functional language with ML-style syntax.",
			How:   "Start over with AILANG syntax: 1) Use `let x = expr` for bindings, 2) Use `func name(params) -> Type { body }` for functions, 3) Use recursion instead of loops, 4) No classes, no mutation, no statements. Refer to AILANG examples in the prompt.",
		},
	},
	{
		IMPERATIVE,
		regexp.MustCompile(`(?i)(loop\s*\{|while\s*\(|for\s*\(|break;|continue;|^\s*\w+\s*=\s*[^=]|;\s*\w+\s*=\s*[^=]|let mut )`),
		RepairHint{
			Title: "Imperative syntax not allowed",
			Why:   "Used imperative constructs (loop/while/for/break/assignment statements). AILANG is purely functional - no loops, no mutation, no statements.",
			How:   "Replace imperative code with functional patterns: 1) Use recursion instead of loops, 2) Use `let x = expr in body` instead of `x = expr;`, 3) Use pattern matching instead of break/continue, 4) All variables are immutable.",
		},
	},
	{
		PAR_001,
		regexp.MustCompile(`PAR_NO_PREFIX_PARSE|PAR_UNEXPECTED_TOKEN|parse errors? in|unexpected token`),
		RepairHint{
			Title: "Parse error",
			Why:   "AILANG syntax error - common issues: missing semicolons in blocks, wrong syntax for let/lambda/records.",
			How:   "Check: 1) Use `{ e1; e2; e3 }` for blocks (semicolons between exprs), 2) Use `let x = expr in body` or `let x = expr; rest`, 3) Lambda: `\\x -> body` or `func(x) { body }`, 4) No `=` in function params.",
		},
	},
	{
		TC_REC_001,
		regexp.MustCompile(`field '([^']+)' not found in record|closed row missing labels`),
		RepairHint{
			Title: "Record field missing",
			Why:   "Type checker requires the field to exist in the record.",
			How:   "Add the missing field to the record literal, or use row polymorphism: `{ field: T | ρ }` in type annotation.",
		},
	},
	{
		TC_INT_001,
		regexp.MustCompile(`Float .* is not an instance of Integral|mod not defined for Float`),
		RepairHint{
			Title: "Modulo on Float",
			Why:   "`%` requires `Integral` (Int) type.",
			How:   "Use integers for `%`, or use `/` and `floor` for floats.",
		},
	},
	{
		EQ_001,
		regexp.MustCompile(`Eq dictionary resolution failed|using eq_Int for Float`),
		RepairHint{
			Title: "Float equality dictionary",
			Why:   "The Eq dictionary must match Float type.",
			How:   "Annotate as `: float` or ensure both sides are Float.",
		},
	},
	{
		CAP_001,
		regexp.MustCompile(`no effect context available|effect '(\w+)' requires capability|closed row missing labels: \[(IO|FS|Clock|Net)`),
		RepairHint{
			Title: "Missing capability",
			Why:   "Effect calls require explicit capabilities at runtime.",
			How:   "Declare effects in function signature with explicit type annotation: `let main : Unit -> Unit <IO> = \\() -> { println(...) }`. The type annotation is REQUIRED for effects.",
		},
	},
	{
		MOD_001,
		regexp.MustCompile(`entrypoint '(\w+)' not found|module .* not found`),
		RepairHint{
			Title: "Entrypoint/module resolution",
			Why:   "Runner couldn't find the entry point function.",
			How:   "Export a zero-argument `main` function, the eval harness uses `--entry main`.",
		},
	},
}
Rules maps error patterns to categorized error codes and repair hints
var ValidConditionNames = []string{"baseline", "contract", "z3_guided", "full", "tool_aware", "agent_prompt"}
ValidConditionNames lists all recognized condition names
var ValidTagTaxonomy = []string{
"adt_pattern_match",
"recursion",
"effects_io",
"contracts",
"data_transform",
"records",
"functional",
"type_safety",
"string_algo",
"state_machine",
"algorithmic",
"error_handling",
}
ValidTagTaxonomy lists the 12-tag taxonomy for BenchmarkSpec.Tags. Tag definitions are in design_docs/planned/v0_13_0/m-eval-category-analysis.md §Component 1.
var ValidTiers = []string{"smoke", "core", "stretch", "vision"}
ValidTiers lists the allowed values for BenchmarkSpec.Tier. Tier structure is defined in design_docs/planned/v0_13_0/m-benchmark-suite-tiers.md.
Functions ¶
func CalculateCostWithBreakdown ¶
func CalculateCostWithBreakdown(model string, inputTokens, outputTokens int) float64

CalculateCostWithBreakdown calculates cost using separate input/output token counts. This provides accurate pricing based on the models.yml configuration. Returns 0.0 if the model is not found - FAIL LOUDLY, NO SILENT FALLBACKS.
func CategorizeError ¶
func CategorizeError(compileOk, runtimeOk, stdoutOk bool) string

CategorizeError determines the error category based on execution results.
func CategorizeErrorCode ¶
func CategorizeErrorCode(stderr string) (ErrCode, *RepairHint)
CategorizeErrorCode matches stderr against error patterns and returns the error code and repair hint if a match is found. Returns ("", nil) if no pattern matches.
func CategorizeErrorWithCode ¶
func CategorizeErrorWithCode(code, stderr string) (ErrCode, *RepairHint)
CategorizeErrorWithCode analyzes both generated code and stderr to detect AI usability issues like wrong language or imperative syntax. Checks code patterns first (WRONG_LANG, IMPERATIVE), then stderr patterns.
func CompareOutput ¶
func CompareOutput(expected, actual string) bool

CompareOutput checks if actual output matches expected output.
func ComputePromptHash ¶
func ComputePromptHash(filePath string) (string, error)

ComputePromptHash is a helper that computes the hash of a prompt file (for updating the registry).
func DetectedPythonVersion ¶
func DetectedPythonVersion() string
DetectedPythonVersion is retained for prompt substitution so nothing in the prompt layer cares whether we're using uv, pyenv, or a raw interpreter. With uv managing the runtime we guarantee the pinned version, so this just returns the pin.
func EnhancedGenerateAgentPrompt ¶
func EnhancedGenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string) (string, string, error)
EnhancedGenerateAgentPrompt is a wrapper that loads the syntax reference and generates the prompt. The language parameter determines which teaching prompt to load (ailang, python, etc.).
func FindModelsConfig ¶
func FindModelsConfig(startDir string) (string, error)

FindModelsConfig searches for models.yml starting from a directory.
func FormatRepairPrompt ¶
func FormatRepairPrompt(code ErrCode, hint *RepairHint, benchmarkID, lang, failedCode, stderr string) string
FormatRepairPrompt creates the repair guidance injection for retry attempts. This prompt is appended to the original benchmark prompt to guide the AI toward fixing the specific error that occurred.
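A sketch of how categorization and repair-prompt formatting might compose in a retry loop; generatedCode, result, originalPrompt, and spec are assumed to come from the failed first attempt:

errCode, hint := CategorizeErrorWithCode(generatedCode, result.Stderr)
if hint != nil {
	// Append the repair guidance to the original prompt for a retry attempt.
	repair := FormatRepairPrompt(errCode, hint, spec.ID, "ailang", generatedCode, result.Stderr)
	retryPrompt := originalPrompt + "\n\n" + repair
	_ = retryPrompt // sent back to the model as the repair prompt
}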
func GenerateAgentPrompt ¶
func GenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, syntaxRef string, language string) string
GenerateAgentPrompt creates a comprehensive prompt for the agent. This version loads from a language-specific template file for easy editing.
func GenerateAgentPromptsWithSystemPrompt ¶
func GenerateAgentPromptsWithSystemPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string, promptVersion string, solutionPath string) (string, string, string, error)
GenerateAgentPromptsWithSystemPrompt generates split prompts for the --system-prompt flag. Returns: (systemPrompt, taskPrompt, promptVersionUsed, error).
func InitModelsConfig ¶
func InitModelsConfig() error
InitModelsConfig loads the global models configuration
func KillProcessGroup ¶
func KillProcessGroup(pid int) error

KillProcessGroup kills the entire process group (Unix only). Uses a negative PID to kill all processes in the group.
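For context, a minimal Unix sketch of the negative-PID idiom the doc comment describes; killGroup is a hypothetical helper, and the package's own implementation may differ in details:

import "syscall"

func killGroup(pid int) error {
	// Signalling -pid reaches every process in pid's group, provided the
	// child was started in its own group (see SetProcessGroup).
	return syscall.Kill(-pid, syscall.SIGKILL)
}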
func LoadActiveSyntaxReference ¶
func LoadActiveSyntaxReference(language string) (string, error)

LoadActiveSyntaxReference loads the active teaching prompt for a language.
func LoadSystemPromptForLanguage ¶
func LoadSystemPromptForLanguage(language string, promptVersion string) (string, string, error)

LoadSystemPromptForLanguage loads the versioned teaching prompt for a language. This is used with Claude CLI's --system-prompt flag.
func LoadTaskPromptTemplate ¶
func LoadTaskPromptTemplate(language string) (string, error)

LoadTaskPromptTemplate loads the generic .txt template for the initial agent prompt. This explains the benchmark task (what to solve), not the language syntax.
func PopulateVerifyMetrics ¶
func PopulateVerifyMetrics(metrics *RunMetrics, result *AICheckResult, rawJSON string)
PopulateVerifyMetrics fills verify fields in RunMetrics from an AICheckResult
func PrepareWorkspaceWithSyntax ¶
func PrepareWorkspaceWithSyntax(workspace string, spec *BenchmarkSpec, syntaxRef string) error
PrepareWorkspaceWithSyntax creates workspace files with full AILANG syntax reference
func ResolveModelName ¶
func ResolveModelName(name string) (apiName, provider string, err error)

ResolveModelName resolves a user-provided model name to its API name. Supports both friendly names (e.g., "claude-sonnet-4-5") and direct API names. Returns an error if the model is not found in configuration -- NO SILENT FALLBACKS.
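Typical usage, assuming models.yml is discoverable and InitModelsConfig succeeds:

if err := InitModelsConfig(); err != nil {
	log.Fatal(err)
}
apiName, provider, err := ResolveModelName("claude-sonnet-4-5")
if err != nil {
	log.Fatal(err) // unknown model: no silent fallback
}
fmt.Println(apiName, provider)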
func SetProcessGroup ¶
func SetProcessGroup(cmd *exec.Cmd)

SetProcessGroup configures the command to run in its own process group (Unix only).
func ValidatePythonCode ¶
func ValidatePythonCode(code string) (ErrCode, *RepairHint)
ValidatePythonCode checks if Python code accidentally contains AILANG syntax. Returns an error code and hint if AILANG syntax is detected; otherwise returns ("", nil).
Types ¶
type AIAgent ¶
type AIAgent struct {
// contains filtered or unexported fields
}
AIAgent generates code using LLM APIs. Uses the unified internal/ai/ providers via providerAdapter.
func NewAIAgent ¶
NewAIAgent creates a new AI agent using unified providers.
func (*AIAgent) GenerateCode ¶
GenerateCode generates code using the unified provider.
func (*AIAgent) GenerateWithRetry ¶
func (a *AIAgent) GenerateWithRetry(ctx context.Context, prompt string, cfg RetryConfig) (*GenerateResult, error)
GenerateWithRetry generates code with retry logic
type AICheckCheckResult ¶
AICheckCheckResult is the type-check portion
type AICheckResult ¶
type AICheckResult struct {
File string `json:"file"`
Check AICheckCheckResult `json:"check"`
Verify AICheckVerifyResult `json:"verify"`
}
AICheckResult is the parsed JSON output from `ailang ai-check`
func RunAICheck ¶
RunAICheck executes `ailang ai-check <file>` and parses the JSON output. Returns nil result and error if the command can't be executed. Returns parsed result even if verification found counterexamples.
type AICheckVerifyResult ¶
type AICheckVerifyResult struct {
Available bool `json:"available"`
Verified int `json:"verified"`
Counterexample int `json:"counterexample"`
Skipped int `json:"skipped"`
Errors int `json:"errors"`
}
AICheckVerifyResult is the contract verification portion
type AILANGRunner ¶
type AILANGRunner struct {
// contains filtered or unexported fields
}
AILANGRunner executes AILANG code
func NewAILANGRunner ¶
func NewAILANGRunner(ailangPath string, caps []string) *AILANGRunner
NewAILANGRunner creates a new AILANG runner
func NewAILANGRunnerWithTask ¶
func NewAILANGRunnerWithTask(ctx context.Context, ailangPath string, caps []string, taskID string, spec *BenchmarkSpec) *AILANGRunner
NewAILANGRunnerWithTask creates a new AILANG runner with task ID and context for telemetry hierarchy. The taskID is propagated via AILANG_PARENT_TASK_ID, and trace context via TRACEPARENT.
type AgentBenchmarkConfig ¶
type AgentBenchmarkConfig struct {
MaxConcurrent int // Max parallel Claude sessions
RequestsPerSecond int // API rate limit
TimeoutSeconds int // Timeout per benchmark
WorkspaceDir string // Base workspace directory
AllowedTools []string // Tools agent can use
ClaudePath string // Path to claude CLI
ClaudeModel string // Claude model to use (haiku, sonnet, opus, or full name)
Verify bool // Enable contract verification (M-CONTRACT-EVAL)
DevtoolsPrompt string // Devtools prompt content to append to system prompt (M-CONTRACT-EVAL)
AgentPromptContent string // Agent coding prompt content (replaces teaching prompt when UseAgentPrompt condition is active)
Condition EvalCondition // Experimental condition (overrides Verify/DevtoolsPrompt when set)
MicroragMode MicroragMode // μRAG subprocess env mode (M-BRAIN-MICRORAG): on/off/auto
}
AgentBenchmarkConfig configures agent-based evaluation
func DefaultAgentConfig ¶
func DefaultAgentConfig() AgentBenchmarkConfig
DefaultAgentConfig returns sensible defaults
type AgentBenchmarkResult ¶
type AgentBenchmarkResult struct {
BenchmarkID string
Executor string // Executor used: "claude", "gemini", etc.
Success bool
Iterations int // Number of agent turns
Cost float64 // Total cost in USD
DurationMS int // Total time in milliseconds
NumTurns int // Conversation turns
ToolCallCount int // Number of tool invocations (validates agentic behavior)
Error string // Error message if failed
SessionID string // Session ID from executor
Result string // Final result text from agent
// Token usage details
Usage TokenUsage `json:"usage"`
ModelUsage map[string]ModelStats `json:"modelUsage"`
// Solution and session log for inspection
SolutionCode string `json:"solution_code,omitempty"` // Generated solution code
SessionLog string `json:"session_log,omitempty"` // Full Claude session log
PromptVersion string `json:"prompt_version,omitempty"` // Version of teaching prompt used
// Validation flags (match standard eval format for downstream compatibility)
CompileOk bool `json:"compile_ok"` // Did solution parse/compile?
RuntimeOk bool `json:"runtime_ok"` // Did solution run without error?
StdoutOk bool `json:"stdout_ok"` // Did output match expected?
Stdout string `json:"stdout,omitempty"`
Stderr string `json:"stderr,omitempty"`
// Timing breakdown
TTFTSeconds float64 `json:"ttft_seconds,omitempty"` // Time to first token in seconds
// Cross-harness grouping
ModelFamily string `json:"model_family,omitempty"` // Logical model family (e.g. "claude-sonnet-4-6"); empty = no grouping
// Contract verification results (M-CONTRACT-EVAL)
VerifyOk bool `json:"verify_ok"` // All contracts verified
VerifyVerified int `json:"verify_verified"` // Count of verified functions
VerifyCounterex int `json:"verify_counterexample"` // Count of counterexamples
VerifySkipped int `json:"verify_skipped"` // Count of skipped functions
VerifyErrors int `json:"verify_errors"` // Count of Z3 errors
VerifyJSON string `json:"verify_json,omitempty"` // Full ai-check JSON output
}
AgentBenchmarkResult captures agent evaluation outcome
func RunAgentBenchmark ¶
func RunAgentBenchmark(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string) (*AgentBenchmarkResult, error)
RunAgentBenchmark runs a single benchmark using Claude Code headless mode. The language parameter specifies which language to run (ailang, python, etc.).
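A sketch of a single agent-mode run; the spec path is hypothetical:

spec, err := LoadSpec("benchmarks/adt_option.yml")
if err != nil {
	log.Fatal(err)
}
config := DefaultAgentConfig()
config.ClaudeModel = "sonnet"
result, err := RunAgentBenchmark(spec, config, "ailang")
if err != nil {
	log.Fatal(err)
}
fmt.Printf("success=%v turns=%d cost=$%.4f\n", result.Success, result.NumTurns, result.Cost)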
func RunAgentBenchmarkWithExecutor ¶
func RunAgentBenchmarkWithExecutor(spec *BenchmarkSpec, config MultiExecutorConfig, language string) (*AgentBenchmarkResult, error)
RunAgentBenchmarkWithExecutor runs a benchmark using the specified executor. This enables comparing performance across different AI coding agents.
type BenchmarkSpec ¶
type BenchmarkSpec struct {
ID string `yaml:"id"`
Description string `yaml:"description"`
Languages []string `yaml:"languages"`
Entrypoint string `yaml:"entrypoint"`
Caps []string `yaml:"caps"`
Prompt string `yaml:"prompt"` // Inline prompt text (language-agnostic)
PromptFiles map[string]string `yaml:"prompt_files"` // Language-specific prompt files: {ailang: "prompts/v0.3.0.md"}
TaskPrompt string `yaml:"task_prompt"` // Task-specific prompt appended after base prompt
ContractSpec string `yaml:"contract_spec"` // Optional: AILANG contract specification for Z3 verification
Z3Hints string `yaml:"z3_hints"` // Optional: Pre-computed Z3 counterexample descriptions for known traps
ExpectedOut string `yaml:"expected_stdout"`
Difficulty string `yaml:"difficulty"`
ExpectedGain string `yaml:"expected_gain"`
Timeout int `yaml:"timeout"` // Agent timeout in seconds (default: 60)
// Test infrastructure: stdin, CLI args, and input files
Stdin string `yaml:"stdin,omitempty"` // Stdin data to pipe to the program
CliArgs []string `yaml:"cli_args,omitempty"` // CLI arguments to pass after the script
InputFiles map[string]string `yaml:"input_files,omitempty"` // Files to create in workspace: {filename: content}
// Eval suite classification (M-EVAL-SUITE-PREP, v0.14.0)
Tier string `yaml:"tier,omitempty"` // One of: smoke|core|stretch|vision. Missing defaults to "core".
Tags []string `yaml:"tags,omitempty"` // 1-3 tags from ValidTagTaxonomy. May be empty during migration.
}
BenchmarkSpec defines a single benchmark task
func LoadSpec ¶
func LoadSpec(path string) (*BenchmarkSpec, error)
LoadSpec loads a benchmark spec from a YAML file
func (*BenchmarkSpec) FormatContractSpec ¶
func (s *BenchmarkSpec) FormatContractSpec(verify bool) string
FormatContractSpec returns a formatted contract specification block for prompt injection. When verify is true and the spec has a ContractSpec, returns a formatted block. Otherwise returns empty string (backward compatible).
func (*BenchmarkSpec) FormatZ3Hints ¶
func (s *BenchmarkSpec) FormatZ3Hints() string
FormatZ3Hints returns a formatted Z3 hints block for prompt injection. Only returns content when the spec has Z3Hints defined.
func (*BenchmarkSpec) PromptForLanguage ¶
func (s *BenchmarkSpec) PromptForLanguage(lang string) string
PromptForLanguage returns the language-specific base prompt combined with the task prompt.
func (*BenchmarkSpec) SupportsLanguage ¶
func (s *BenchmarkSpec) SupportsLanguage(lang string) bool
SupportsLanguage checks if the benchmark supports a given language
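A small usage sketch of spec loading and prompt assembly; the spec path is hypothetical:

spec, err := LoadSpec("benchmarks/records_person.yml")
if err != nil {
	log.Fatal(err)
}
if !spec.SupportsLanguage("python") {
	log.Fatalf("%s does not target python", spec.ID)
}
prompt := spec.PromptForLanguage("python")
_ = prompt // fed to the model or agent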
type ClaudeHeadlessResult ¶
type ClaudeHeadlessResult struct {
Type string `json:"type"`
Subtype string `json:"subtype"`
IsError bool `json:"is_error"`
DurationMS int `json:"duration_ms"`
DurationAPIMS int `json:"duration_api_ms"`
NumTurns int `json:"num_turns"`
Result string `json:"result"`
SessionID string `json:"session_id"`
TotalCostUSD float64 `json:"total_cost_usd"`
Usage TokenUsage `json:"usage"`
ModelUsage map[string]ModelStats `json:"modelUsage"`
PermissionDenials []interface{} `json:"permission_denials"`
UUID string `json:"uuid"`
Transcript string `json:"-"` // Full conversation transcript (not in JSON, set by streaming)
}
ClaudeHeadlessResult is the JSON structure returned by `claude -p --output-format json`
func RunHeadlessSessionStreaming ¶
func RunHeadlessSessionStreaming(spec *BenchmarkSpec, systemPrompt, taskPrompt, workspace string, config AgentBenchmarkConfig) (*ClaudeHeadlessResult, error)
RunHeadlessSessionStreaming executes Claude in headless mode with real-time message streaming. This is used when DEBUG_AGENT=1 to provide visibility into what Claude is doing. systemPrompt contains language knowledge (loaded from prompts/versions.json); taskPrompt contains the benchmark task description.
Exported for use by internal/coordinator package for task execution
type ErrCode ¶
type ErrCode string
ErrCode represents a categorized error type from AILANG execution
const (
	// Parser errors
	PAR_001 ErrCode = "PAR_001" // Parse error (block/semicolon issues)

	// AI usability errors - Wrong language
	WRONG_LANG ErrCode = "WRONG_LANG" // Generated code in wrong programming language

	// AI usability errors - Imperative syntax
	IMPERATIVE ErrCode = "IMPERATIVE" // Used imperative constructs (loop, break, assignment statements)

	// Type checker errors - Records
	TC_REC_001 ErrCode = "TC_REC_001" // Record field not found

	// Type checker errors - Type classes
	TC_INT_001 ErrCode = "TC_INT_001" // Not an instance of Integral
	EQ_001     ErrCode = "EQ_001"     // Wrong Eq dictionary

	// Runtime errors - Capabilities
	CAP_001 ErrCode = "CAP_001" // Capability missing

	// Runtime errors - Module system
	MOD_001 ErrCode = "MOD_001" // Undefined module/entry

	// Contract verification errors (M-CONTRACT-EVAL)
	VERIFY_COUNTEREXAMPLE ErrCode = "VERIFY_COUNTEREXAMPLE" // Z3 found counterexample
)
type ErrUvMissing ¶
type ErrUvMissing struct {
// contains filtered or unexported fields
}
ErrUvMissing is returned by Python benchmark runners when the `uv` binary is not on PATH. The eval suite depends on uv to pin the Python runtime so that every benchmark sees the exact same interpreter on every machine. Per CLAUDE.md §2 we fail loudly rather than silently falling back to a system `python3`, because a wrong-version fallback is the exact bug we are trying to eliminate.
func (*ErrUvMissing) Error ¶
func (e *ErrUvMissing) Error() string
func (*ErrUvMissing) Unwrap ¶
func (e *ErrUvMissing) Unwrap() error
type EvalCondition ¶
type EvalCondition struct {
Name string // "baseline", "contract", "z3_guided", "full", "tool_aware", "agent_prompt", or "" for legacy
IncludeContract bool // Include contract_spec in prompt
IncludeZ3Hints bool // Include z3_hints in prompt
IncludeDevtools bool // Append devtools prompt to system prompt
IncludeToolGuidance bool // Include general contract-writing + ai-check guidance (no spec given)
EnableVerify bool // Enable Z3 verification (standard mode repair + post-hoc check)
UseAgentPrompt bool // Use compact agent coding prompt instead of full teaching prompt
}
EvalCondition represents a named experimental condition that controls what information is included in the LLM prompt. Conditions are treated like languages — each creates a separate evaluation job.
func ResolveCondition ¶
func ResolveCondition(name string, legacyVerify, legacyDevtools bool) EvalCondition
ResolveCondition returns the settings for a named condition. If name is empty, returns legacy behavior using the explicit --verify/--devtools-prompt flags.
type GenerateResult ¶
type GenerateResult struct {
Code string
InputTokens int // Prompt tokens (input to LLM)
OutputTokens int // Completion tokens (generated code)
TotalTokens int // Total tokens (for billing)
Model string
}
GenerateResult contains the result of code generation
type GoRunner ¶ added in v0.14.2
type GoRunner struct {
// contains filtered or unexported fields
}
GoRunner executes Go code via go run
func NewGoRunner ¶ added in v0.14.2
func NewGoRunner() *GoRunner
NewGoRunner creates a new Go runner
func NewGoRunnerWithSpec ¶ added in v0.14.2
func NewGoRunnerWithSpec(spec *BenchmarkSpec) *GoRunner
NewGoRunnerWithSpec creates a new Go runner with benchmark spec
type JSRunner ¶ added in v0.14.2
type JSRunner struct {
// contains filtered or unexported fields
}
JSRunner executes JavaScript (Node.js) code
func NewJSRunner ¶ added in v0.14.2
func NewJSRunner() *JSRunner
NewJSRunner creates a new JavaScript runner
func NewJSRunnerWithSpec ¶ added in v0.14.2
func NewJSRunnerWithSpec(spec *BenchmarkSpec) *JSRunner
NewJSRunnerWithSpec creates a new JavaScript runner with benchmark spec
type LanguageRunner ¶
type LanguageRunner interface {
Run(code string, timeout time.Duration) (*RunResult, error)
Language() string
}
LanguageRunner executes code in a specific language
func GetRunner ¶
func GetRunner(lang string, spec *BenchmarkSpec) (LanguageRunner, error)
GetRunner returns a LanguageRunner for the specified language
func GetRunnerWithContext ¶
func GetRunnerWithContext(ctx context.Context, langName string, spec *BenchmarkSpec, taskID string) (LanguageRunner, error)
GetRunnerWithContext returns a LanguageRunner with full telemetry context. The ctx is used to propagate TRACEPARENT for span hierarchy. The taskID is propagated via AILANG_PARENT_TASK_ID for task-level correlation.
func GetRunnerWithTask ¶
func GetRunnerWithTask(lang string, spec *BenchmarkSpec, taskID string) (LanguageRunner, error)
GetRunnerWithTask returns a LanguageRunner with task ID for telemetry hierarchy. Deprecated: Use GetRunnerWithContext instead for full trace propagation.
type LimitedWriter ¶
type LimitedWriter struct {
// contains filtered or unexported fields
}
LimitedWriter wraps an io.Writer and limits the total bytes written. Once the limit is reached, subsequent writes are discarded and a truncation message is appended.
func NewLimitedWriter ¶
func NewLimitedWriter(limit int64) *LimitedWriter
NewLimitedWriter creates a new LimitedWriter with the specified byte limit
func (*LimitedWriter) String ¶
func (lw *LimitedWriter) String() string
String returns the buffered content (possibly truncated)
func (*LimitedWriter) Truncated ¶
func (lw *LimitedWriter) Truncated() bool
Truncated returns true if output was truncated
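A capture sketch, assuming LimitedWriter satisfies io.Writer (implied by its doc comment) and so can be assigned directly to cmd.Stdout:

lw := NewLimitedWriter(MaxOutputSize)
cmd := exec.Command("ailang", "run", "--entry", "main", "solution.ail")
cmd.Stdout = lw
_ = cmd.Run()
stdout := lw.String()
if lw.Truncated() {
	log.Printf("stdout capped at %d bytes", MaxOutputSize)
}
_ = stdout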
type MetricsLogger ¶
type MetricsLogger struct {
// contains filtered or unexported fields
}
MetricsLogger handles writing metrics to JSON files
func NewMetricsLogger ¶
func NewMetricsLogger(outputDir string) *MetricsLogger
NewMetricsLogger creates a new metrics logger
func (*MetricsLogger) Log ¶
func (l *MetricsLogger) Log(m *RunMetrics) error
Log writes a RunMetrics to a JSON file
type MicroragMode ¶ added in v0.14.2
type MicroragMode string
MicroragMode is the eval-suite --microrag flag value.
const (
	// MicroragModeAuto respects the inherited environment. Default.
	// Use when running outside an A/B comparison.
	MicroragModeAuto MicroragMode = "auto"

	// MicroragModeOn forces AILANG_MICRORAG_ENABLED=1 in subprocesses.
	MicroragModeOn MicroragMode = "on"

	// MicroragModeOff forces AILANG_MICRORAG_ENABLED=0 in subprocesses.
	// Use this for the baseline arm of an A/B run.
	MicroragModeOff MicroragMode = "off"
)
func ParseMicroragMode ¶ added in v0.14.2
func ParseMicroragMode(s string) MicroragMode
ParseMicroragMode normalises CLI input. Empty / unknown → auto.
func (MicroragMode) ApplyToEnv ¶ added in v0.14.2
func (m MicroragMode) ApplyToEnv(env []string) []string
ApplyToEnv strips any existing AILANG_MICRORAG_ENABLED entry and appends a fresh one matching the mode. Auto leaves the inherited value untouched.
Returns the modified slice. Caller passes the result to cmd.Env. We don't mutate os.Environ() — every subprocess gets its own independent copy.
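For example, the baseline arm of an A/B run might force μRAG off for one subprocess only:

mode := ParseMicroragMode("off") // from the eval-suite --microrag flag
cmd := exec.Command("ailang", "run", "solution.ail")
// Strips any inherited AILANG_MICRORAG_ENABLED and appends =0; os.Environ() itself is untouched.
cmd.Env = mode.ApplyToEnv(os.Environ())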
func (MicroragMode) ResolvedState ¶ added in v0.14.2
func (m MicroragMode) ResolvedState() string
ResolvedState returns what should be recorded in RunMetrics.MicroragState. For auto, peeks at the actual env so the result file shows the effective value rather than "auto" (which would obscure the comparison).
type MockAIAgent ¶
type MockAIAgent struct {
// contains filtered or unexported fields
}
MockAIAgent is a mock implementation for testing
func NewMockAIAgent ¶
func NewMockAIAgent(model, code string) *MockAIAgent
NewMockAIAgent creates a mock AI agent
func (*MockAIAgent) GenerateCode ¶
func (m *MockAIAgent) GenerateCode(ctx context.Context, prompt string) (*GenerateResult, error)
GenerateCode returns the pre-configured mock code
type ModelConfig ¶
type ModelConfig struct {
APIName string `yaml:"api_name"`
Provider string `yaml:"provider"`
Description string `yaml:"description"`
EnvVar string `yaml:"env_var"`
AgentCLI *string `yaml:"agent_cli"` // CLI command for agent eval (e.g., "claude", "openai", "gemini"), nil if not supported
AgentModelName *string `yaml:"agent_model_name"` // Model name to pass to agent CLI (e.g., "haiku", "sonnet")
MaxOutputTokens int `yaml:"max_output_tokens"` // Max output tokens (0 = handler default 4096)
TTFTTimeoutSeconds int `yaml:"ttft_timeout"` // Prefill budget in seconds (0 = executor default 30s)
GenerationTimeoutSeconds int `yaml:"generation_timeout"` // Per-token idle budget after first event (0 = executor default 3m)
ModelFamily string `yaml:"model_family"` // Logical model family for cross-harness grouping (e.g., "claude-sonnet-4-6"); empty = no grouping
GCPProject string `yaml:"gcp_project"` // Override GOOGLE_CLOUD_PROJECT for this model's evals (e.g. "ailang-dev")
GCPLocation string `yaml:"gcp_location"` // Override GOOGLE_CLOUD_LOCATION (e.g. "us-central1")
Pricing Pricing `yaml:"pricing"`
Notes string `yaml:"notes"`
}
ModelConfig represents a single model configuration
type ModelStats ¶
type ModelStats struct {
InputTokens int `json:"inputTokens"`
OutputTokens int `json:"outputTokens"`
CacheReadInputTokens int `json:"cacheReadInputTokens"`
CacheCreationInputTokens int `json:"cacheCreationInputTokens"`
WebSearchRequests int `json:"webSearchRequests"`
CostUSD float64 `json:"costUSD"`
ContextWindow int `json:"contextWindow"`
}
ModelStats captures per-model statistics
type ModelsConfig ¶
type ModelsConfig struct {
Models map[string]ModelConfig `yaml:"models"`
Default string `yaml:"default"`
BenchmarkSuite []string `yaml:"benchmark_suite"`
ExtendedSuite []string `yaml:"extended_suite"`
DevModels []string `yaml:"dev_models"`
AgentSuite []string `yaml:"agent_suite"`
OllamaSuite []string `yaml:"ollama_suite"`
HarnessSuite []string `yaml:"harness_suite"`
LangHarnessSuite []string `yaml:"lang_harness_suite"`
}
ModelsConfig represents the entire models.yml configuration
var (
	// GlobalModelsConfig is the loaded models configuration
	GlobalModelsConfig *ModelsConfig
)
func LoadModelsConfig ¶
func LoadModelsConfig(path string) (*ModelsConfig, error)
LoadModelsConfig loads the models.yml configuration
func (*ModelsConfig) CalculateCostForModel ¶
func (c *ModelsConfig) CalculateCostForModel(name string, inputTokens, outputTokens int) (float64, error)
CalculateCostForModel calculates the cost for a model using its pricing config
func (*ModelsConfig) FilterAgentSupportedModels ¶
func (c *ModelsConfig) FilterAgentSupportedModels(models []string) []string
FilterAgentSupportedModels filters a list of models to only those that support agent eval
func (*ModelsConfig) GetAPIName ¶
func (c *ModelsConfig) GetAPIName(name string) (string, error)
GetAPIName returns the API name for a model by friendly name
func (*ModelsConfig) GetAgentCLI ¶
func (c *ModelsConfig) GetAgentCLI(name string) (string, error)
GetAgentCLI returns the agent CLI command for a model (e.g., "claude")
func (*ModelsConfig) GetAgentModelName ¶
func (c *ModelsConfig) GetAgentModelName(name string) (string, error)
GetAgentModelName returns the model name to pass to the agent CLI
func (*ModelsConfig) GetAgentSuite ¶ added in v0.14.2
func (c *ModelsConfig) GetAgentSuite() []string
GetAgentSuite returns the cross-harness agent eval suite (claude+gemini+codex+opencode). Only models with non-null agent_cli participate in agent-mode runs; text-only models in the suite are skipped cleanly.
func (*ModelsConfig) GetBenchmarkSuite ¶
func (c *ModelsConfig) GetBenchmarkSuite() []string
GetBenchmarkSuite returns the recommended models for comprehensive evaluation
func (*ModelsConfig) GetDefaultModel ¶
func (c *ModelsConfig) GetDefaultModel() string
GetDefaultModel returns the default model name
func (*ModelsConfig) GetEnvVar ¶
func (c *ModelsConfig) GetEnvVar(name string) (string, error)
GetEnvVar returns the environment variable name for a model's API key
func (*ModelsConfig) GetExecutorForModel ¶
func (c *ModelsConfig) GetExecutorForModel(name string) (executorName string, modelName string, err error)
GetExecutorForModel returns the appropriate executor for a model. Returns the executor name (e.g., "claude", "gemini") and the model name to use.
func (*ModelsConfig) GetModel ¶
func (c *ModelsConfig) GetModel(name string) (*ModelConfig, error)
GetModel returns the configuration for a model by friendly name
func (*ModelsConfig) GetProvider ¶
func (c *ModelsConfig) GetProvider(name string) (string, error)
GetProvider returns the provider for a model
func (*ModelsConfig) ListModels ¶
func (c *ModelsConfig) ListModels() []string
ListModels returns all configured model names
func (*ModelsConfig) SupportsAgentEval ¶
func (c *ModelsConfig) SupportsAgentEval(name string) bool
SupportsAgentEval returns true if the model supports agent-based evaluation
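Putting the accessors together, a hedged sketch of walking the benchmark suite with pricing; the token counts are illustrative:

cfg, err := LoadModelsConfig("models.yml")
if err != nil {
	log.Fatal(err)
}
for _, name := range cfg.GetBenchmarkSuite() {
	if !cfg.SupportsAgentEval(name) {
		continue // text-only model: skipped in agent mode
	}
	cost, err := cfg.CalculateCostForModel(name, 1200, 800)
	if err != nil {
		log.Fatal(err) // unknown model or missing pricing: fail loudly
	}
	fmt.Printf("%s: $%.4f for 1200 in / 800 out\n", name, cost)
}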
type MultiExecutorConfig ¶
type MultiExecutorConfig struct {
AgentBenchmarkConfig
// ExecutorName specifies which executor to use (e.g., "claude", "gemini")
// If empty, uses the model's agent_cli from models.yml
ExecutorName string
// ModelName is the model to use (e.g., "claude-sonnet-4-5", "gemini-3-flash")
ModelName string
// ConfigKey is the models.yml lookup key for per-model config (e.g., "opencode-gemma4-e4b").
// When set, overrides ModelName for timeout/config lookups. Needed when ModelName is the
// resolved API model name (e.g., "ollama/gemma4:e4b") rather than the models.yml key.
ConfigKey string
// ExtraHandler is an additional event handler composed with the debug handler.
// Used for ObservatoryWriter to capture structured tool calls during streaming.
// When nil, only the debug handler is used (no behavior change).
ExtraHandler executor.EventHandler
}
MultiExecutorConfig extends AgentBenchmarkConfig with executor selection
type Pricing ¶
type Pricing struct {
InputPer1K float64 `yaml:"input_per_1k"`
OutputPer1K float64 `yaml:"output_per_1k"`
}
Pricing represents model pricing information
type PromptLoader ¶
type PromptLoader struct {
// contains filtered or unexported fields
}
PromptLoader loads and verifies prompt versions
func NewPromptLoader ¶
func NewPromptLoader(registryPath string) (*PromptLoader, error)
NewPromptLoader creates a loader from versions.json
func (*PromptLoader) GetActivePrompt ¶
func (l *PromptLoader) GetActivePrompt() (string, error)
GetActivePrompt loads the active prompt version. Supports the special value "latest" to automatically use the most recent version.
func (*PromptLoader) GetActiveVersionID ¶
func (l *PromptLoader) GetActiveVersionID() string
GetActiveVersionID returns the active version ID (resolving "latest" if needed)
func (*PromptLoader) GetVersion ¶
func (l *PromptLoader) GetVersion(versionID string) (*PromptVersion, error)
GetVersion returns metadata for a specific version
func (*PromptLoader) ListVersions ¶
func (l *PromptLoader) ListVersions() map[string]PromptVersion
ListVersions returns all available prompt versions
func (*PromptLoader) LoadPrompt ¶
func (l *PromptLoader) LoadPrompt(versionID string) (string, error)
LoadPrompt loads a prompt by version ID with hash verification
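Typical loader usage; the registry path is illustrative, and hash verification applies when a prompt file is loaded:

loader, err := NewPromptLoader("prompts/versions.json")
if err != nil {
	log.Fatal(err)
}
prompt, err := loader.GetActivePrompt() // resolves "latest" to the newest version
if err != nil {
	log.Fatal(err) // includes hash-mismatch failures against the registry
}
fmt.Println(loader.GetActiveVersionID(), len(prompt))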
type PromptRegistry ¶
type PromptRegistry struct {
SchemaVersion string `json:"schema_version"`
Versions map[string]PromptVersion `json:"versions"`
Active string `json:"active"`
Notes []string `json:"notes"`
}
PromptRegistry contains all registered prompt versions
type PromptVersion ¶
type PromptVersion struct {
File string `json:"file"`
Hash string `json:"hash"`
Description string `json:"description"`
Created string `json:"created"`
Tags []string `json:"tags"`
Notes string `json:"notes"`
}
PromptVersion represents metadata about a prompt version
type PythonRunner ¶
type PythonRunner struct {
// contains filtered or unexported fields
}
PythonRunner executes Python code
func NewPythonRunner ¶
func NewPythonRunner() *PythonRunner
NewPythonRunner creates a new Python runner
func NewPythonRunnerWithSpec ¶
func NewPythonRunnerWithSpec(spec *BenchmarkSpec) *PythonRunner
NewPythonRunnerWithSpec creates a new Python runner with benchmark spec for test infrastructure
type RepairHint ¶
type RepairHint struct {
Title string // Short description of the error
Why string // Explanation of why the error occurred
How string // Concrete steps to fix the error
}
RepairHint provides actionable guidance for fixing an error
func FormatZ3RepairHint ¶
func FormatZ3RepairHint(verifyStderr string) *RepairHint
FormatZ3RepairHint creates a RepairHint from Z3 verification output (M-CONTRACT-EVAL)
type RepairLog ¶
type RepairLog struct {
Wrapped bool // True if bare expression was wrapped in module scaffold
AddedModule bool // True if module declaration was added
AddedImports []string // List of imports that were injected
CallFixes int // Number of bare function calls that were fixed with parens
AddedMainFunc bool // True if main function was synthesized
}
RepairLog tracks transformations applied by normalizeProgram
type RepairRunner ¶
type RepairRunner struct {
// contains filtered or unexported fields
}
RepairRunner orchestrates self-repair logic for eval benchmarks
func NewRepairRunner ¶
func NewRepairRunner(agent *AIAgent, runner LanguageRunner, spec *BenchmarkSpec, timeout time.Duration, selfRepair bool) *RepairRunner
NewRepairRunner creates a new repair runner
func (*RepairRunner) Run ¶
func (r *RepairRunner) Run(ctx context.Context, prompt string) (*RunMetrics, error)
Run executes the benchmark with optional self-repair
func (*RepairRunner) SetPromptVersion ¶
func (r *RepairRunner) SetPromptVersion(version string)
SetPromptVersion sets the prompt version ID for metrics tracking
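A sketch of wiring the pieces together, assuming agent (an *AIAgent from NewAIAgent), spec (from LoadSpec), and ctx are already in scope:

runner, err := GetRunner("ailang", spec)
if err != nil {
	log.Fatal(err)
}
rr := NewRepairRunner(agent, runner, spec, 60*time.Second, true) // selfRepair enabled
metrics, err := rr.Run(ctx, spec.PromptForLanguage("ailang"))
if err != nil {
	log.Fatal(err)
}
fmt.Printf("first_attempt_ok=%v repair_ok=%v\n", metrics.FirstAttemptOk, metrics.RepairOk)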
type RetryConfig ¶
RetryConfig configures retry behavior
type RunMetrics ¶
type RunMetrics struct {
ID string `json:"id"`
Lang string `json:"lang"`
Model string `json:"model"`
Executor string `json:"executor,omitempty"` // Executor used: "claude", "gemini", etc. (agent mode only)
Seed int64 `json:"seed"`
InputTokens int `json:"input_tokens"` // Prompt tokens (recorded but not primary metric)
OutputTokens int `json:"output_tokens"` // Generated code tokens (PRIMARY METRIC)
TotalTokens int `json:"total_tokens"` // Total for billing
CostUSD float64 `json:"cost_usd"`
CompileOk bool `json:"compile_ok"`
RuntimeOk bool `json:"runtime_ok"`
StdoutOk bool `json:"stdout_ok"`
DurationMs int64 `json:"duration_ms"` // Total time (startup + compile + execution)
CompileMs int64 `json:"compile_ms"` // Time spent in compilation (if separate)
ExecuteMs int64 `json:"execute_ms"` // Time spent in execution (if measurable)
ErrorCategory string `json:"error_category"` // compile_error | runtime_error | logic_error | none
Stdout string `json:"stdout,omitempty"`
Stderr string `json:"stderr,omitempty"`
ExpectedStdout string `json:"expected_stdout,omitempty"`
Timestamp time.Time `json:"timestamp"`
Code string `json:"code,omitempty"` // Generated code (optional, for debugging)
// Self-repair metrics (M-EVAL-LOOP)
FirstAttemptOk bool `json:"first_attempt_ok"` // Did first attempt succeed?
RepairUsed bool `json:"repair_used"` // Did we attempt a repair?
RepairOk bool `json:"repair_ok"` // Did repair succeed?
ErrCode string `json:"err_code,omitempty"` // Error code from taxonomy (PAR_001, etc.)
RepairTokensIn int `json:"repair_tokens_in,omitempty"` // Input tokens for repair attempt
RepairTokensOut int `json:"repair_tokens_out,omitempty"` // Output tokens for repair attempt
// Prompt versioning (M-EVAL-LOOP)
PromptVersion string `json:"prompt_version,omitempty"` // Prompt version used (v0.3.0-hints, etc.)
// Reproducibility (M-EVAL-LOOP)
BinaryHash string `json:"binary_hash,omitempty"` // SHA256 of ailang binary
StdlibHash string `json:"stdlib_hash,omitempty"` // SHA256 of stdlib
Caps []string `json:"caps,omitempty"` // Capabilities granted
// Contract verification results (M-CONTRACT-EVAL)
VerifyOk bool `json:"verify_ok"` // All contracts verified
VerifyVerified int `json:"verify_verified"` // Count of verified functions
VerifyCounterex int `json:"verify_counterexample"` // Count of counterexamples
VerifySkipped int `json:"verify_skipped"` // Count of skipped functions
VerifyErrors int `json:"verify_errors"` // Count of Z3 errors
VerifyJSON string `json:"verify_json,omitempty"` // Full ai-check JSON output
// Agent mode KPIs (M-EVAL-AGENT)
AgentTurns int `json:"agent_turns,omitempty"` // Number of conversation turns (agent mode only)
AgentTranscript string `json:"agent_transcript,omitempty"` // Full Claude conversation transcript (agent mode only)
EvalMode string `json:"eval_mode,omitempty"` // Evaluation mode: "standard" or "agent"
// Experimental condition (M-CONTRACT-EVAL conditions dimension)
Condition string `json:"condition,omitempty"` // Experimental condition: "baseline", "contract", "z3_guided", "full"
// μRAG state for this run (M-BRAIN-MICRORAG)
// Values: "on" | "off" | "auto" | "disabled" | "" (legacy / not set).
// Lets eval-report break results down with vs. without JIT knowledge injection.
MicroragState string `json:"microrag_state,omitempty"`
// Cross-harness grouping (M-EVAL-CROSS-HARNESS)
// Populated from models.yml model_family field. Enables --group-by=model-family
// in eval-matrix to compare same model across different harnesses (e.g. claude vs opencode).
ModelFamily string `json:"model_family,omitempty"`
}
RunMetrics captures the results of a single benchmark run
func NewRunMetrics ¶
func NewRunMetrics(id, lang, model string, seed int64) *RunMetrics
NewRunMetrics creates a new RunMetrics with timestamp and error category. MicroragState is auto-populated from the inherited env so every metrics emission honours the eval-suite --microrag flag (M-BRAIN-MICRORAG).
type RunResult ¶
type RunResult struct {
Stdout string
Stderr string
ExitCode int
Duration time.Duration // Total time (startup + compile + execution)
CompileTime time.Duration // Time spent in compilation/type-checking (if separate)
ExecuteTime time.Duration // Time spent in actual code execution (if measurable)
CompileOk bool
RuntimeOk bool
StdoutOk bool
TimedOut bool
CodeHash string // SHA256 hash of executed code (for validation)
WorkspaceDir string // Path to isolated workspace (for debugging)
}
RunResult captures the outcome of running generated code
type TelemetryReporter ¶
type TelemetryReporter struct {
// contains filtered or unexported fields
}
TelemetryReporter sends telemetry updates to the collaboration hub server. This allows the eval suite and other external processes to report their Claude usage.
func NewTelemetryReporter ¶
func NewTelemetryReporter(serverURL, instanceID string) *TelemetryReporter
NewTelemetryReporter creates a new telemetry reporter. serverURL should be like "http://localhost:8090" (the collaboration hub). If serverURL is empty, the AILANG_HUB_URL environment variable is checked; if neither is set, telemetry reporting is disabled.
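A usage sketch; the instance ID and token counts are illustrative, and the empty serverURL falls back to AILANG_HUB_URL:

tr := NewTelemetryReporter("", "eval-run-42")
if tr.IsEnabled() {
	tr.SetStatus("running")
	tr.IncrementTurn()
	tr.AddUsage(1200, 800, 0.0135)
	tr.Complete(1200, 800, 0.0135)
}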
func (*TelemetryReporter) AddUsage ¶
func (t *TelemetryReporter) AddUsage(tokensIn, tokensOut int, cost float64)
AddUsage adds to the token usage and cost (for cumulative updates)
func (*TelemetryReporter) Complete ¶
func (t *TelemetryReporter) Complete(tokensIn, tokensOut int, cost float64)
Complete marks the session as complete with final metrics
func (*TelemetryReporter) Error ¶
func (t *TelemetryReporter) Error()
Error marks the session as failed
func (*TelemetryReporter) IncrementTurn ¶
func (t *TelemetryReporter) IncrementTurn()
IncrementTurn increments the turn counter and sends an update
func (*TelemetryReporter) IsEnabled ¶
func (t *TelemetryReporter) IsEnabled() bool
IsEnabled returns whether telemetry reporting is enabled
func (*TelemetryReporter) SetStatus ¶
func (t *TelemetryReporter) SetStatus(status string)
SetStatus sets the process status (running, completed, error)
func (*TelemetryReporter) SetUsage ¶
func (t *TelemetryReporter) SetUsage(tokensIn, tokensOut int, cost float64)
SetUsage sets the token usage and cost
type TokenUsage ¶
type TokenUsage struct {
InputTokens int `json:"input_tokens"`
CacheCreationInputTokens int `json:"cache_creation_input_tokens"`
CacheReadInputTokens int `json:"cache_read_input_tokens"`
OutputTokens int `json:"output_tokens"`
}
TokenUsage captures detailed token metrics
type ValidationResult ¶
type ValidationResult struct {
CompileOk bool
RuntimeOk bool
StdoutOk bool
Stdout string
Stderr string
}
ValidationResult holds detailed validation results for agent benchmarks
type Watchdog ¶
type Watchdog struct {
MaxAge time.Duration // Kill processes older than this
CheckPeriod time.Duration // How often to check
Pattern string // Process pattern to match
KilledCount int // Number of orphans killed
Enabled bool // Whether watchdog is active
}
Watchdog monitors for orphaned eval processes and kills them. This is a safety net for cases where process group cleanup fails.
func NewWatchdog ¶
NewWatchdog creates a new Watchdog with the specified settings
func (*Watchdog) KillOrphans ¶
KillOrphans performs an immediate check-and-kill of orphaned processes. This can be called during shutdown for extra cleanup.
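A construction sketch using the exported fields; the durations and pattern are illustrative, and KillOrphans is assumed to take no arguments:

wd := &Watchdog{
	MaxAge:      10 * time.Minute,
	CheckPeriod: 30 * time.Second,
	Pattern:     "ailang",
	Enabled:     true,
}
// Final sweep for stragglers during shutdown.
wd.KillOrphans()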