Documentation ¶
Overview ¶
Package eval_harness provides AI code generation benchmarking, including the adapter layer between the unified ai.Provider interface and the eval harness.
Index ¶
- Constants
- Variables
- func CalculateCostWithBreakdown(model string, inputTokens, outputTokens int) float64
- func CategorizeError(compileOk, runtimeOk, stdoutOk bool) string
- func CategorizeErrorCode(stderr string) (ErrCode, *RepairHint)
- func CategorizeErrorWithCode(code, stderr string) (ErrCode, *RepairHint)
- func CompareOutput(expected, actual string) bool
- func ComputePromptHash(filePath string) (string, error)
- func DetectedPythonVersion() string
- func EnhancedGenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string) (string, string, error)
- func FindAILANG() (string, error)
- func FindModelsConfig(startDir string) (string, error)
- func FormatRepairPrompt(code ErrCode, hint *RepairHint, benchmarkID, lang, failedCode, stderr string) string
- func GenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, syntaxRef string, ...) string
- func GenerateAgentPromptsWithSystemPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string, ...) (string, string, string, error)
- func InitModelsConfig() error
- func KillProcess(pid int) error
- func KillProcessGroup(pid int) error
- func LoadActiveSyntaxReference(language string) (string, error)
- func LoadSystemPromptForLanguage(language string, promptVersion string) (string, string, error)
- func LoadTaskPromptTemplate(language string) (string, error)
- func PinnedPythonVersion() string
- func PopulateVerifyMetrics(metrics *RunMetrics, result *AICheckResult, rawJSON string)
- func PrepareWorkspaceWithSyntax(workspace string, spec *BenchmarkSpec, syntaxRef string) error
- func ResolveModelName(name string) (apiName, provider string, err error)
- func SetProcessGroup(cmd *exec.Cmd)
- func ValidatePythonCode(code string) (ErrCode, *RepairHint)
- type AIAgent
- type AICheckCheckResult
- type AICheckResult
- type AICheckVerifyResult
- type AILANGRunner
- type AgentBenchmarkConfig
- type AgentBenchmarkResult
- type BenchmarkSpec
- type ClaudeHeadlessResult
- type ErrCode
- type ErrUvMissing
- type EvalCondition
- type GenerateResult
- type GoRunner
- type JSRunner
- type LanguageRunner
- type LimitedWriter
- type MetricsLogger
- type MicroragMode
- type MockAIAgent
- type ModelConfig
- type ModelStats
- type ModelsConfig
- func (c *ModelsConfig) CalculateCostForModel(name string, inputTokens, outputTokens int) (float64, error)
- func (c *ModelsConfig) FilterAgentSupportedModels(models []string) []string
- func (c *ModelsConfig) GetAPIName(name string) (string, error)
- func (c *ModelsConfig) GetAgentCLI(name string) (string, error)
- func (c *ModelsConfig) GetAgentModelName(name string) (string, error)
- func (c *ModelsConfig) GetAgentSuite() []string
- func (c *ModelsConfig) GetBenchmarkSuite() []string
- func (c *ModelsConfig) GetDefaultModel() string
- func (c *ModelsConfig) GetEnvVar(name string) (string, error)
- func (c *ModelsConfig) GetExecutorForModel(name string) (executorName string, modelName string, err error)
- func (c *ModelsConfig) GetModel(name string) (*ModelConfig, error)
- func (c *ModelsConfig) GetProvider(name string) (string, error)
- func (c *ModelsConfig) ListModels() []string
- func (c *ModelsConfig) SupportsAgentEval(name string) bool
- type MultiExecutorConfig
- type Pricing
- type PromptLoader
- func (l *PromptLoader) GetActivePrompt() (string, error)
- func (l *PromptLoader) GetActiveVersionID() string
- func (l *PromptLoader) GetVersion(versionID string) (*PromptVersion, error)
- func (l *PromptLoader) ListVersions() map[string]PromptVersion
- func (l *PromptLoader) LoadPrompt(versionID string) (string, error)
- type PromptRegistry
- type PromptVersion
- type PythonRunner
- type RepairHint
- type RepairLog
- type RepairRunner
- type RetryConfig
- type RunMetrics
- type RunResult
- type TelemetryReporter
- func (t *TelemetryReporter) AddUsage(tokensIn, tokensOut int, cost float64)
- func (t *TelemetryReporter) Complete(tokensIn, tokensOut int, cost float64)
- func (t *TelemetryReporter) Error()
- func (t *TelemetryReporter) IncrementTurn()
- func (t *TelemetryReporter) IsEnabled() bool
- func (t *TelemetryReporter) SetStatus(status string)
- func (t *TelemetryReporter) SetUsage(tokensIn, tokensOut int, cost float64)
- type TokenUsage
- type ValidationResult
- type Watchdog
Constants ¶
const (
	EvalModeStandard = "standard" // Standard 0-shot + self-repair evaluation
	EvalModeAgent    = "agent"    // Agent-based evaluation with multi-turn interaction
)
EvalMode constants
const (
	ErrorCategoryNone    = "none"
	ErrorCategoryCompile = "compile_error"
	ErrorCategoryRuntime = "runtime_error"
	ErrorCategoryLogic   = "logic_error"
	ErrorCategoryAPI     = "api_error"    // API call failed (timeout, rate limit, connection error)
	ErrorCategoryVerify  = "verify_error" // Contract verification failed (M-CONTRACT-EVAL)
)
ErrorCategory constants
const (
	PinnedPythonMajor = 3
	PinnedPythonMinor = 12
)
PinnedPythonVersion is the Python version the eval suite targets. Both the runtime (the interpreter `uv` resolves) and the prompt (what we tell the model) are driven from this single constant so they can never drift.
3.12 is a modern production Python that supports structural pattern matching (`match/case`, PEP 634) — so models are not penalised for reaching for idiomatic constructs available in current stable Python.
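For illustration, a minimal sketch of how the pin might surface as a version string; pinnedVersionString is a hypothetical helper, but both the runtime and the prompt derive from these same two constants:

func pinnedVersionString() string {
	// Hypothetical: format the pinned constants as "major.minor" => "3.12".
	return fmt.Sprintf("%d.%d", PinnedPythonMajor, PinnedPythonMinor)
}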
const MaxOutputSize = 1 * 1024 * 1024 // 1 MB
MaxOutputSize is the maximum size (in bytes) for stdout/stderr capture. This prevents infinite-loop bugs from generating gigabyte-sized JSON files.
Variables ¶
var Rules = []errorRule{
	{
		WRONG_LANG,
		regexp.MustCompile(`(?i)(def |class |import json|import sys|function |var |const |#include|using namespace|public static|interface |enum class)`),
		RepairHint{
			Title: "Wrong programming language",
			Why:   "Generated code appears to be Python/JavaScript/C++/Java, not AILANG. AILANG is a pure functional language with ML-style syntax.",
			How:   "Start over with AILANG syntax: 1) Use `let x = expr` for bindings, 2) Use `func name(params) -> Type { body }` for functions, 3) Use recursion instead of loops, 4) No classes, no mutation, no statements. Refer to AILANG examples in the prompt.",
		},
	},
	{
		IMPERATIVE,
		regexp.MustCompile(`(?i)(loop\s*\{|while\s*\(|for\s*\(|break;|continue;|^\s*\w+\s*=\s*[^=]|;\s*\w+\s*=\s*[^=]|let mut )`),
		RepairHint{
			Title: "Imperative syntax not allowed",
			Why:   "Used imperative constructs (loop/while/for/break/assignment statements). AILANG is purely functional - no loops, no mutation, no statements.",
			How:   "Replace imperative code with functional patterns: 1) Use recursion instead of loops, 2) Use `let x = expr in body` instead of `x = expr;`, 3) Use pattern matching instead of break/continue, 4) All variables are immutable.",
		},
	},
	{
		PAR_001,
		regexp.MustCompile(`PAR_NO_PREFIX_PARSE|PAR_UNEXPECTED_TOKEN|parse errors? in|unexpected token`),
		RepairHint{
			Title: "Parse error",
			Why:   "AILANG syntax error - common issues: missing semicolons in blocks, wrong syntax for let/lambda/records.",
			How:   "Check: 1) Use `{ e1; e2; e3 }` for blocks (semicolons between exprs), 2) Use `let x = expr in body` or `let x = expr; rest`, 3) Lambda: `\\x -> body` or `func(x) { body }`, 4) No `=` in function params.",
		},
	},
	{
		TC_REC_001,
		regexp.MustCompile(`field '([^']+)' not found in record|closed row missing labels`),
		RepairHint{
			Title: "Record field missing",
			Why:   "Type checker requires the field to exist in the record.",
			How:   "Add the missing field to the record literal, or use row polymorphism: `{ field: T | ρ }` in type annotation.",
		},
	},
	{
		TC_INT_001,
		regexp.MustCompile(`Float .* is not an instance of Integral|mod not defined for Float`),
		RepairHint{
			Title: "Modulo on Float",
			Why:   "`%` requires `Integral` (Int) type.",
			How:   "Use integers for `%`, or use `/` and `floor` for floats.",
		},
	},
	{
		EQ_001,
		regexp.MustCompile(`Eq dictionary resolution failed|using eq_Int for Float`),
		RepairHint{
			Title: "Float equality dictionary",
			Why:   "The Eq dictionary must match Float type.",
			How:   "Annotate as `: float` or ensure both sides are Float.",
		},
	},
	{
		CAP_001,
		regexp.MustCompile(`no effect context available|effect '(\w+)' requires capability|closed row missing labels: \[(IO|FS|Clock|Net)`),
		RepairHint{
			Title: "Missing capability",
			Why:   "Effect calls require explicit capabilities at runtime.",
			How:   "Declare effects in function signature with explicit type annotation: `let main : Unit -> Unit <IO> = \\() -> { println(...) }`. The type annotation is REQUIRED for effects.",
		},
	},
	{
		MOD_001,
		regexp.MustCompile(`entrypoint '(\w+)' not found|module .* not found`),
		RepairHint{
			Title: "Entrypoint/module resolution",
			Why:   "Runner couldn't find the entry point function.",
			How:   "Export a zero-argument `main` function, the eval harness uses `--entry main`.",
		},
	},
}
Rules maps error patterns to categorized error codes and repair hints
var ValidConditionNames = []string{"baseline", "contract", "z3_guided", "full", "tool_aware", "agent_prompt"}
ValidConditionNames lists all recognized condition names
var ValidTagTaxonomy = []string{
"adt_pattern_match",
"recursion",
"effects_io",
"contracts",
"data_transform",
"records",
"functional",
"type_safety",
"string_algo",
"state_machine",
"algorithmic",
"error_handling",
}
ValidTagTaxonomy lists the 12-tag taxonomy for BenchmarkSpec.Tags. Tag definitions are in design_docs/planned/v0_13_0/m-eval-category-analysis.md §Component 1.
var ValidTiers = []string{"smoke", "core", "stretch", "vision"}
ValidTiers lists the allowed values for BenchmarkSpec.Tier. Tier structure is defined in design_docs/planned/v0_13_0/m-benchmark-suite-tiers.md.
Functions ¶
func CalculateCostWithBreakdown ¶
func CalculateCostWithBreakdown(model string, inputTokens, outputTokens int) float64

CalculateCostWithBreakdown calculates cost using separate input/output token counts. This provides accurate pricing based on the models.yml configuration. Returns 0.0 if the model is not found - FAIL LOUDLY, NO SILENT FALLBACKS.
func CategorizeError ¶
func CategorizeError(compileOk, runtimeOk, stdoutOk bool) string

CategorizeError determines the error category based on execution results.
func CategorizeErrorCode ¶
func CategorizeErrorCode(stderr string) (ErrCode, *RepairHint)
CategorizeErrorCode matches stderr against error patterns and returns the error code and repair hint if a match is found. Returns ("", nil) if no pattern matches.
func CategorizeErrorWithCode ¶
func CategorizeErrorWithCode(code, stderr string) (ErrCode, *RepairHint)
CategorizeErrorWithCode analyzes both generated code and stderr to detect AI usability issues like wrong language or imperative syntax. Checks code patterns first (WRONG_LANG, IMPERATIVE), then stderr patterns.
func CompareOutput ¶
func CompareOutput(expected, actual string) bool

CompareOutput checks if actual output matches expected output.
func ComputePromptHash ¶
func ComputePromptHash(filePath string) (string, error)

ComputePromptHash is a helper that computes the hash of a prompt file (for updating the registry).
func DetectedPythonVersion ¶
func DetectedPythonVersion() string
DetectedPythonVersion is retained for prompt substitution so nothing in the prompt layer cares whether we're using uv, pyenv, or a raw interpreter. With uv managing the runtime we guarantee the pinned version, so this just returns the pin.
func EnhancedGenerateAgentPrompt ¶
func EnhancedGenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string) (string, string, error)
EnhancedGenerateAgentPrompt is a wrapper that loads the syntax reference and generates the prompt. The language parameter determines which teaching prompt to load (ailang, python, etc.).
func FindModelsConfig ¶
func FindModelsConfig(startDir string) (string, error)

FindModelsConfig searches for models.yml starting from a directory.
func FormatRepairPrompt ¶
func FormatRepairPrompt(code ErrCode, hint *RepairHint, benchmarkID, lang, failedCode, stderr string) string
FormatRepairPrompt creates the repair guidance injection for retry attempts. This prompt is appended to the original benchmark prompt to guide the AI toward fixing the specific error that occurred.
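A sketch of how categorization and repair-prompt formatting might compose in a retry loop; generatedCode, result, originalPrompt, and spec are assumed to come from the failed first attempt:

errCode, hint := CategorizeErrorWithCode(generatedCode, result.Stderr)
if hint != nil {
	// Append the repair guidance to the original prompt for a retry attempt.
	repair := FormatRepairPrompt(errCode, hint, spec.ID, "ailang", generatedCode, result.Stderr)
	retryPrompt := originalPrompt + "\n\n" + repair
	_ = retryPrompt // sent back to the model as the repair prompt
}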
func GenerateAgentPrompt ¶
func GenerateAgentPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, syntaxRef string, language string) string
GenerateAgentPrompt creates a comprehensive prompt for the agent. This version loads from a language-specific template file for easy editing.
func GenerateAgentPromptsWithSystemPrompt ¶
func GenerateAgentPromptsWithSystemPrompt(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string, promptVersion string, solutionPath string) (string, string, string, error)
GenerateAgentPromptsWithSystemPrompt generates split prompts for the --system-prompt flag. Returns: (systemPrompt, taskPrompt, promptVersionUsed, error).
func InitModelsConfig ¶
func InitModelsConfig() error
InitModelsConfig loads the global models configuration
func KillProcessGroup ¶
func KillProcessGroup(pid int) error

KillProcessGroup kills the entire process group (Unix only). Uses a negative PID to kill all processes in the group.
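For context, a minimal Unix sketch of the negative-PID idiom the doc comment describes; killGroup is a hypothetical helper, and the package's own implementation may differ in details:

import "syscall"

func killGroup(pid int) error {
	// Signalling -pid reaches every process in pid's group, provided the
	// child was started in its own group (see SetProcessGroup).
	return syscall.Kill(-pid, syscall.SIGKILL)
}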
func LoadActiveSyntaxReference ¶
func LoadActiveSyntaxReference(language string) (string, error)

LoadActiveSyntaxReference loads the active teaching prompt for a language.
func LoadSystemPromptForLanguage ¶
func LoadSystemPromptForLanguage(language string, promptVersion string) (string, string, error)

LoadSystemPromptForLanguage loads the versioned teaching prompt for a language. This is used with Claude CLI's --system-prompt flag.
func LoadTaskPromptTemplate ¶
func LoadTaskPromptTemplate(language string) (string, error)

LoadTaskPromptTemplate loads the generic .txt template for the initial agent prompt. This explains the benchmark task (what to solve), not the language syntax.
func PopulateVerifyMetrics ¶
func PopulateVerifyMetrics(metrics *RunMetrics, result *AICheckResult, rawJSON string)
PopulateVerifyMetrics fills verify fields in RunMetrics from an AICheckResult
func PrepareWorkspaceWithSyntax ¶
func PrepareWorkspaceWithSyntax(workspace string, spec *BenchmarkSpec, syntaxRef string) error
PrepareWorkspaceWithSyntax creates workspace files with full AILANG syntax reference
func ResolveModelName ¶
func ResolveModelName(name string) (apiName, provider string, err error)

ResolveModelName resolves a user-provided model name to its API name. Supports both friendly names (e.g., "claude-sonnet-4-5") and direct API names. Returns an error if the model is not found in configuration -- NO SILENT FALLBACKS.
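Typical usage, assuming models.yml is discoverable and InitModelsConfig succeeds:

if err := InitModelsConfig(); err != nil {
	log.Fatal(err)
}
apiName, provider, err := ResolveModelName("claude-sonnet-4-5")
if err != nil {
	log.Fatal(err) // unknown model: no silent fallback
}
fmt.Println(apiName, provider)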
func SetProcessGroup ¶
func SetProcessGroup(cmd *exec.Cmd)

SetProcessGroup configures the command to run in its own process group (Unix only).
func ValidatePythonCode ¶
func ValidatePythonCode(code string) (ErrCode, *RepairHint)
ValidatePythonCode checks if Python code accidentally contains AILANG syntax. Returns an error code and hint if AILANG syntax is detected; otherwise returns ("", nil).
Types ¶
type AIAgent ¶
type AIAgent struct {
// contains filtered or unexported fields
}
AIAgent generates code using LLM APIs. Uses the unified internal/ai/ providers via providerAdapter.
func NewAIAgent ¶
NewAIAgent creates a new AI agent using unified providers.
func (*AIAgent) GenerateCode ¶
GenerateCode generates code using the unified provider.
func (*AIAgent) GenerateWithRetry ¶
func (a *AIAgent) GenerateWithRetry(ctx context.Context, prompt string, cfg RetryConfig) (*GenerateResult, error)
GenerateWithRetry generates code with retry logic
type AICheckCheckResult ¶
AICheckCheckResult is the type-check portion
type AICheckResult ¶
type AICheckResult struct {
File string `json:"file"`
Check AICheckCheckResult `json:"check"`
Verify AICheckVerifyResult `json:"verify"`
}
AICheckResult is the parsed JSON output from `ailang ai-check`
func RunAICheck ¶
RunAICheck executes `ailang ai-check <file>` and parses the JSON output. Returns nil result and error if the command can't be executed. Returns parsed result even if verification found counterexamples.
type AICheckVerifyResult ¶
type AICheckVerifyResult struct {
Available bool `json:"available"`
Verified int `json:"verified"`
Counterexample int `json:"counterexample"`
Skipped int `json:"skipped"`
Errors int `json:"errors"`
}
AICheckVerifyResult is the contract verification portion
type AILANGRunner ¶
type AILANGRunner struct {
// contains filtered or unexported fields
}
AILANGRunner executes AILANG code
func NewAILANGRunner ¶
func NewAILANGRunner(ailangPath string, caps []string) *AILANGRunner
NewAILANGRunner creates a new AILANG runner
func NewAILANGRunnerWithTask ¶
func NewAILANGRunnerWithTask(ctx context.Context, ailangPath string, caps []string, taskID string, spec *BenchmarkSpec) *AILANGRunner
NewAILANGRunnerWithTask creates a new AILANG runner with task ID and context for telemetry hierarchy. The taskID is propagated via AILANG_PARENT_TASK_ID, and trace context via TRACEPARENT.
type AgentBenchmarkConfig ¶
type AgentBenchmarkConfig struct {
MaxConcurrent int // Max parallel Claude sessions
RequestsPerSecond int // API rate limit
TimeoutSeconds int // Timeout per benchmark
WorkspaceDir string // Base workspace directory
AllowedTools []string // Tools agent can use
ClaudePath string // Path to claude CLI
ClaudeModel string // Claude model to use (haiku, sonnet, opus, or full name)
Verify bool // Enable contract verification (M-CONTRACT-EVAL)
DevtoolsPrompt string // Devtools prompt content to append to system prompt (M-CONTRACT-EVAL)
AgentPromptContent string // Agent coding prompt content (replaces teaching prompt when UseAgentPrompt condition is active)
Condition EvalCondition // Experimental condition (overrides Verify/DevtoolsPrompt when set)
MicroragMode MicroragMode // μRAG subprocess env mode (M-BRAIN-MICRORAG): on/off/auto
}
AgentBenchmarkConfig configures agent-based evaluation
func DefaultAgentConfig ¶
func DefaultAgentConfig() AgentBenchmarkConfig
DefaultAgentConfig returns sensible defaults
type AgentBenchmarkResult ¶
type AgentBenchmarkResult struct {
BenchmarkID string
Executor string // Executor used: "claude", "gemini", etc.
Success bool
Iterations int // Number of agent turns
Cost float64 // Total cost in USD
DurationMS int // Total time in milliseconds
NumTurns int // Conversation turns
ToolCallCount int // Number of tool invocations (validates agentic behavior)
Error string // Error message if failed
SessionID string // Session ID from executor
Result string // Final result text from agent
// Token usage details
Usage TokenUsage `json:"usage"`
ModelUsage map[string]ModelStats `json:"modelUsage"`
// Solution and session log for inspection
SolutionCode string `json:"solution_code,omitempty"` // Generated solution code
SessionLog string `json:"session_log,omitempty"` // Full Claude session log
PromptVersion string `json:"prompt_version,omitempty"` // Version of teaching prompt used
// Validation flags (match standard eval format for downstream compatibility)
CompileOk bool `json:"compile_ok"` // Did solution parse/compile?
RuntimeOk bool `json:"runtime_ok"` // Did solution run without error?
StdoutOk bool `json:"stdout_ok"` // Did output match expected?
Stdout string `json:"stdout,omitempty"`
Stderr string `json:"stderr,omitempty"`
// Timing breakdown
TTFTSeconds float64 `json:"ttft_seconds,omitempty"` // Time to first token in seconds
// Cross-harness grouping
ModelFamily string `json:"model_family,omitempty"` // Logical model family (e.g. "claude-sonnet-4-6"); empty = no grouping
// Contract verification results (M-CONTRACT-EVAL)
VerifyOk bool `json:"verify_ok"` // All contracts verified
VerifyVerified int `json:"verify_verified"` // Count of verified functions
VerifyCounterex int `json:"verify_counterexample"` // Count of counterexamples
VerifySkipped int `json:"verify_skipped"` // Count of skipped functions
VerifyErrors int `json:"verify_errors"` // Count of Z3 errors
VerifyJSON string `json:"verify_json,omitempty"` // Full ai-check JSON output
}
AgentBenchmarkResult captures agent evaluation outcome
func RunAgentBenchmark ¶
func RunAgentBenchmark(spec *BenchmarkSpec, config AgentBenchmarkConfig, language string) (*AgentBenchmarkResult, error)
RunAgentBenchmark runs a single benchmark using Claude Code headless mode. The language parameter specifies which language to run (ailang, python, etc.).
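A sketch of a single agent-mode run; the spec path is hypothetical:

spec, err := LoadSpec("benchmarks/adt_option.yml")
if err != nil {
	log.Fatal(err)
}
config := DefaultAgentConfig()
config.ClaudeModel = "sonnet"
result, err := RunAgentBenchmark(spec, config, "ailang")
if err != nil {
	log.Fatal(err)
}
fmt.Printf("success=%v turns=%d cost=$%.4f\n", result.Success, result.NumTurns, result.Cost)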
func RunAgentBenchmarkWithExecutor ¶
func RunAgentBenchmarkWithExecutor(spec *BenchmarkSpec, config MultiExecutorConfig, language string) (*AgentBenchmarkResult, error)
RunAgentBenchmarkWithExecutor runs a benchmark using the specified executor. This enables comparing performance across different AI coding agents.
type BenchmarkSpec ¶
type BenchmarkSpec struct {
ID string `yaml:"id"`
Description string `yaml:"description"`
Languages []string `yaml:"languages"`
Entrypoint string `yaml:"entrypoint"`
Caps []string `yaml:"caps"`
Prompt string `yaml:"prompt"` // Inline prompt text (language-agnostic)
PromptFiles map[string]string `yaml:"prompt_files"` // Language-specific prompt files: {ailang: "prompts/v0.3.0.md"}
TaskPrompt string `yaml:"task_prompt"` // Task-specific prompt appended after base prompt
ContractSpec string `yaml:"contract_spec"` // Optional: AILANG contract specification for Z3 verification
Z3Hints string `yaml:"z3_hints"` // Optional: Pre-computed Z3 counterexample descriptions for known traps
ExpectedOut string `yaml:"expected_stdout"`
Difficulty string `yaml:"difficulty"`
ExpectedGain string `yaml:"expected_gain"`
Timeout int `yaml:"timeout"` // Agent timeout in seconds (default: 60)
// Test infrastructure: stdin, CLI args, and input files
Stdin string `yaml:"stdin,omitempty"` // Stdin data to pipe to the program
CliArgs []string `yaml:"cli_args,omitempty"` // CLI arguments to pass after the script
InputFiles map[string]string `yaml:"input_files,omitempty"` // Files to create in workspace: {filename: content}
// Eval suite classification (M-EVAL-SUITE-PREP, v0.14.0)
Tier string `yaml:"tier,omitempty"` // One of: smoke|core|stretch|vision. Missing defaults to "core".
Tags []string `yaml:"tags,omitempty"` // 1-3 tags from ValidTagTaxonomy. May be empty during migration.
}
BenchmarkSpec defines a single benchmark task
func LoadSpec ¶
func LoadSpec(path string) (*BenchmarkSpec, error)
LoadSpec loads a benchmark spec from a YAML file
func (*BenchmarkSpec) FormatContractSpec ¶
func (s *BenchmarkSpec) FormatContractSpec(verify bool) string
FormatContractSpec returns a formatted contract specification block for prompt injection. When verify is true and the spec has a ContractSpec, returns a formatted block. Otherwise returns empty string (backward compatible).
func (*BenchmarkSpec) FormatZ3Hints ¶
func (s *BenchmarkSpec) FormatZ3Hints() string
FormatZ3Hints returns a formatted Z3 hints block for prompt injection. Only returns content when the spec has Z3Hints defined.
func (*BenchmarkSpec) PromptForLanguage ¶
func (s *BenchmarkSpec) PromptForLanguage(lang string) string
PromptForLanguage returns the language-specific base prompt combined with the task prompt.
func (*BenchmarkSpec) SupportsLanguage ¶
func (s *BenchmarkSpec) SupportsLanguage(lang string) bool
SupportsLanguage checks if the benchmark supports a given language
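A small usage sketch of spec loading and prompt assembly; the spec path is hypothetical:

spec, err := LoadSpec("benchmarks/records_person.yml")
if err != nil {
	log.Fatal(err)
}
if !spec.SupportsLanguage("python") {
	log.Fatalf("%s does not target python", spec.ID)
}
prompt := spec.PromptForLanguage("python")
_ = prompt // fed to the model or agent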
type ClaudeHeadlessResult ¶
type ClaudeHeadlessResult struct {
Type string `json:"type"`
Subtype string `json:"subtype"`
IsError bool `json:"is_error"`
DurationMS int `json:"duration_ms"`
DurationAPIMS int `json:"duration_api_ms"`
NumTurns int `json:"num_turns"`
Result string `json:"result"`
SessionID string `json:"session_id"`
TotalCostUSD float64 `json:"total_cost_usd"`
Usage TokenUsage `json:"usage"`
ModelUsage map[string]ModelStats `json:"modelUsage"`
PermissionDenials []interface{} `json:"permission_denials"`
UUID string `json:"uuid"`
Transcript string `json:"-"` // Full conversation transcript (not in JSON, set by streaming)
}
ClaudeHeadlessResult is the JSON structure returned by `claude -p --output-format json`
func RunHeadlessSessionStreaming ¶
func RunHeadlessSessionStreaming(spec *BenchmarkSpec, systemPrompt, taskPrompt, workspace string, config AgentBenchmarkConfig) (*ClaudeHeadlessResult, error)
RunHeadlessSessionStreaming executes Claude in headless mode with real-time message streaming. This is used when DEBUG_AGENT=1 to provide visibility into what Claude is doing. systemPrompt contains language knowledge (loaded from prompts/versions.json); taskPrompt contains the benchmark task description.
Exported for use by internal/coordinator package for task execution
type ErrCode ¶
type ErrCode string
ErrCode represents a categorized error type from AILANG execution
const (
	// Parser errors
	PAR_001 ErrCode = "PAR_001" // Parse error (block/semicolon issues)

	// AI usability errors - Wrong language
	WRONG_LANG ErrCode = "WRONG_LANG" // Generated code in wrong programming language

	// AI usability errors - Imperative syntax
	IMPERATIVE ErrCode = "IMPERATIVE" // Used imperative constructs (loop, break, assignment statements)

	// Type checker errors - Records
	TC_REC_001 ErrCode = "TC_REC_001" // Record field not found

	// Type checker errors - Type classes
	TC_INT_001 ErrCode = "TC_INT_001" // Not an instance of Integral
	EQ_001     ErrCode = "EQ_001"     // Wrong Eq dictionary

	// Runtime errors - Capabilities
	CAP_001 ErrCode = "CAP_001" // Capability missing

	// Runtime errors - Module system
	MOD_001 ErrCode = "MOD_001" // Undefined module/entry

	// Contract verification errors (M-CONTRACT-EVAL)
	VERIFY_COUNTEREXAMPLE ErrCode = "VERIFY_COUNTEREXAMPLE" // Z3 found counterexample
)
type ErrUvMissing ¶
type ErrUvMissing struct {
// contains filtered or unexported fields
}
ErrUvMissing is returned by Python benchmark runners when the `uv` binary is not on PATH. The eval suite depends on uv to pin the Python runtime so that every benchmark sees the exact same interpreter on every machine. Per CLAUDE.md §2 we fail loudly rather than silently falling back to a system `python3`, because a wrong-version fallback is the exact bug we are trying to eliminate.
func (*ErrUvMissing) Error ¶
func (e *ErrUvMissing) Error() string
func (*ErrUvMissing) Unwrap ¶
func (e *ErrUvMissing) Unwrap() error
type EvalCondition ¶
type EvalCondition struct {
Name string // "baseline", "contract", "z3_guided", "full", "tool_aware", "agent_prompt", or "" for legacy
IncludeContract bool // Include contract_spec in prompt
IncludeZ3Hints bool // Include z3_hints in prompt
IncludeDevtools bool // Append devtools prompt to system prompt
IncludeToolGuidance bool // Include general contract-writing + ai-check guidance (no spec given)
EnableVerify bool // Enable Z3 verification (standard mode repair + post-hoc check)
UseAgentPrompt bool // Use compact agent coding prompt instead of full teaching prompt
}
EvalCondition represents a named experimental condition that controls what information is included in the LLM prompt. Conditions are treated like languages — each creates a separate evaluation job.
func ResolveCondition ¶
func ResolveCondition(name string, legacyVerify, legacyDevtools bool) EvalCondition
ResolveCondition returns the settings for a named condition. If name is empty, returns legacy behavior using the explicit --verify/--devtools-prompt flags.
type GenerateResult ¶
type GenerateResult struct {
Code string
InputTokens int // Prompt tokens (input to LLM)
OutputTokens int // Completion tokens (generated code)
TotalTokens int // Total tokens (for billing)
Model string
}
GenerateResult contains the result of code generation
type GoRunner ¶ added in v0.14.2
type GoRunner struct {
// contains filtered or unexported fields
}
GoRunner executes Go code via go run
func NewGoRunner ¶ added in v0.14.2
func NewGoRunner() *GoRunner
NewGoRunner creates a new Go runner
func NewGoRunnerWithSpec ¶ added in v0.14.2
func NewGoRunnerWithSpec(spec *BenchmarkSpec) *GoRunner
NewGoRunnerWithSpec creates a new Go runner with benchmark spec
type JSRunner ¶ added in v0.14.2
type JSRunner struct {
// contains filtered or unexported fields
}
JSRunner executes JavaScript (Node.js) code
func NewJSRunner ¶ added in v0.14.2
func NewJSRunner() *JSRunner
NewJSRunner creates a new JavaScript runner
func NewJSRunnerWithSpec ¶ added in v0.14.2
func NewJSRunnerWithSpec(spec *BenchmarkSpec) *JSRunner
NewJSRunnerWithSpec creates a new JavaScript runner with benchmark spec
type LanguageRunner ¶
type LanguageRunner interface {
Run(code string, timeout time.Duration) (*RunResult, error)
Language() string
}
LanguageRunner executes code in a specific language
func GetRunner ¶
func GetRunner(lang string, spec *BenchmarkSpec) (LanguageRunner, error)
GetRunner returns a LanguageRunner for the specified language
func GetRunnerWithContext ¶
func GetRunnerWithContext(ctx context.Context, langName string, spec *BenchmarkSpec, taskID string) (LanguageRunner, error)
GetRunnerWithContext returns a LanguageRunner with full telemetry context. The ctx is used to propagate TRACEPARENT for span hierarchy. The taskID is propagated via AILANG_PARENT_TASK_ID for task-level correlation.
func GetRunnerWithTask ¶
func GetRunnerWithTask(lang string, spec *BenchmarkSpec, taskID string) (LanguageRunner, error)
GetRunnerWithTask returns a LanguageRunner with task ID for telemetry hierarchy. Deprecated: Use GetRunnerWithContext instead for full trace propagation.
type LimitedWriter ¶
type LimitedWriter struct {
// contains filtered or unexported fields
}
LimitedWriter wraps an io.Writer and limits the total bytes written. Once the limit is reached, subsequent writes are discarded and a truncation message is appended.
func NewLimitedWriter ¶
func NewLimitedWriter(limit int64) *LimitedWriter
NewLimitedWriter creates a new LimitedWriter with the specified byte limit
func (*LimitedWriter) String ¶
func (lw *LimitedWriter) String() string
String returns the buffered content (possibly truncated)
func (*LimitedWriter) Truncated ¶
func (lw *LimitedWriter) Truncated() bool
Truncated returns true if output was truncated
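A capture sketch, assuming LimitedWriter satisfies io.Writer (implied by its doc comment) and so can be assigned directly to cmd.Stdout:

lw := NewLimitedWriter(MaxOutputSize)
cmd := exec.Command("ailang", "run", "--entry", "main", "solution.ail")
cmd.Stdout = lw
_ = cmd.Run()
stdout := lw.String()
if lw.Truncated() {
	log.Printf("stdout capped at %d bytes", MaxOutputSize)
}
_ = stdout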
type MetricsLogger ¶
type MetricsLogger struct {
// contains filtered or unexported fields
}
MetricsLogger handles writing metrics to JSON files
func NewMetricsLogger ¶
func NewMetricsLogger(outputDir string) *MetricsLogger
NewMetricsLogger creates a new metrics logger
func (*MetricsLogger) Log ¶
func (l *MetricsLogger) Log(m *RunMetrics) error
Log writes a RunMetrics to a JSON file
type MicroragMode ¶ added in v0.14.2
type MicroragMode string
MicroragMode is the eval-suite --microrag flag value.
const (
	// MicroragModeAuto respects the inherited environment. Default.
	// Use when running outside an A/B comparison.
	MicroragModeAuto MicroragMode = "auto"

	// MicroragModeOn forces AILANG_MICRORAG_ENABLED=1 in subprocesses.
	MicroragModeOn MicroragMode = "on"

	// MicroragModeOff forces AILANG_MICRORAG_ENABLED=0 in subprocesses.
	// Use this for the baseline arm of an A/B run.
	MicroragModeOff MicroragMode = "off"
)
func ParseMicroragMode ¶ added in v0.14.2
func ParseMicroragMode(s string) MicroragMode
ParseMicroragMode normalises CLI input. Empty / unknown → auto.
func (MicroragMode) ApplyToEnv ¶ added in v0.14.2
func (m MicroragMode) ApplyToEnv(env []string) []string
ApplyToEnv strips any existing AILANG_MICRORAG_ENABLED entry and appends a fresh one matching the mode. Auto leaves the inherited value untouched.
Returns the modified slice. Caller passes the result to cmd.Env. We don't mutate os.Environ() — every subprocess gets its own independent copy.
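For example, the baseline arm of an A/B run might force μRAG off for one subprocess only:

mode := ParseMicroragMode("off") // from the eval-suite --microrag flag
cmd := exec.Command("ailang", "run", "solution.ail")
// Strips any inherited AILANG_MICRORAG_ENABLED and appends =0; os.Environ() itself is untouched.
cmd.Env = mode.ApplyToEnv(os.Environ())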
func (MicroragMode) ResolvedState ¶ added in v0.14.2
func (m MicroragMode) ResolvedState() string
ResolvedState returns what should be recorded in RunMetrics.MicroragState. For auto, peeks at the actual env so the result file shows the effective value rather than "auto" (which would obscure the comparison).
type MockAIAgent ¶
type MockAIAgent struct {
// contains filtered or unexported fields
}
MockAIAgent is a mock implementation for testing
func NewMockAIAgent ¶
func NewMockAIAgent(model, code string) *MockAIAgent
NewMockAIAgent creates a mock AI agent
func (*MockAIAgent) GenerateCode ¶
func (m *MockAIAgent) GenerateCode(ctx context.Context, prompt string) (*GenerateResult, error)
GenerateCode returns the pre-configured mock code
type ModelConfig ¶
type ModelConfig struct {
APIName string `yaml:"api_name"`
Provider string `yaml:"provider"`
Description string `yaml:"description"`
EnvVar string `yaml:"env_var"`
AgentCLI *string `yaml:"agent_cli"` // CLI command for agent eval (e.g., "claude", "openai", "gemini"), nil if not supported
AgentModelName *string `yaml:"agent_model_name"` // Model name to pass to agent CLI (e.g., "haiku", "sonnet")
MaxOutputTokens int `yaml:"max_output_tokens"` // Max output tokens (0 = handler default 4096)
TTFTTimeoutSeconds int `yaml:"ttft_timeout"` // Prefill budget in seconds (0 = executor default 30s)
GenerationTimeoutSeconds int `yaml:"generation_timeout"` // Per-token idle budget after first event (0 = executor default 3m)
ModelFamily string `yaml:"model_family"` // Logical model family for cross-harness grouping (e.g., "claude-sonnet-4-6"); empty = no grouping
GCPProject string `yaml:"gcp_project"` // Override GOOGLE_CLOUD_PROJECT for this model's evals (e.g. "ailang-dev")
GCPLocation string `yaml:"gcp_location"` // Override GOOGLE_CLOUD_LOCATION (e.g. "us-central1")
Pricing Pricing `yaml:"pricing"`
Notes string `yaml:"notes"`
}
ModelConfig represents a single model configuration
type ModelStats ¶
type ModelStats struct {
InputTokens int `json:"inputTokens"`
OutputTokens int `json:"outputTokens"`
CacheReadInputTokens int `json:"cacheReadInputTokens"`
CacheCreationInputTokens int `json:"cacheCreationInputTokens"`
WebSearchRequests int `json:"webSearchRequests"`
CostUSD float64 `json:"costUSD"`
ContextWindow int `json:"contextWindow"`
}
ModelStats captures per-model statistics
type ModelsConfig ¶
type ModelsConfig struct {
Models map[string]ModelConfig `yaml:"models"`
Default string `yaml:"default"`
BenchmarkSuite []string `yaml:"benchmark_suite"`
ExtendedSuite []string `yaml:"extended_suite"`
DevModels []string `yaml:"dev_models"`
AgentSuite []string `yaml:"agent_suite"`
OllamaSuite []string `yaml:"ollama_suite"`
HarnessSuite []string `yaml:"harness_suite"`
LangHarnessSuite []string `yaml:"lang_harness_suite"`
}
ModelsConfig represents the entire models.yml configuration
var (
	// GlobalModelsConfig is the loaded models configuration
	GlobalModelsConfig *ModelsConfig
)
func LoadModelsConfig ¶
func LoadModelsConfig(path string) (*ModelsConfig, error)
LoadModelsConfig loads the models.yml configuration
func (*ModelsConfig) CalculateCostForModel ¶
func (c *ModelsConfig) CalculateCostForModel(name string, inputTokens, outputTokens int) (float64, error)
CalculateCostForModel calculates the cost for a model using its pricing config
func (*ModelsConfig) FilterAgentSupportedModels ¶
func (c *ModelsConfig) FilterAgentSupportedModels(models []string) []string
FilterAgentSupportedModels filters a list of models to only those that support agent eval
func (*ModelsConfig) GetAPIName ¶
func (c *ModelsConfig) GetAPIName(name string) (string, error)
GetAPIName returns the API name for a model by friendly name
func (*ModelsConfig) GetAgentCLI ¶
func (c *ModelsConfig) GetAgentCLI(name string) (string, error)
GetAgentCLI returns the agent CLI command for a model (e.g., "claude")
func (*ModelsConfig) GetAgentModelName ¶
func (c *ModelsConfig) GetAgentModelName(name string) (string, error)
GetAgentModelName returns the model name to pass to the agent CLI
func (*ModelsConfig) GetAgentSuite ¶ added in v0.14.2
func (c *ModelsConfig) GetAgentSuite() []string
GetAgentSuite returns the cross-harness agent eval suite (claude+gemini+codex+opencode). Only models with non-null agent_cli participate in agent-mode runs; text-only models in the suite are skipped cleanly.
func (*ModelsConfig) GetBenchmarkSuite ¶
func (c *ModelsConfig) GetBenchmarkSuite() []string
GetBenchmarkSuite returns the recommended models for comprehensive evaluation
func (*ModelsConfig) GetDefaultModel ¶
func (c *ModelsConfig) GetDefaultModel() string
GetDefaultModel returns the default model name
func (*ModelsConfig) GetEnvVar ¶
func (c *ModelsConfig) GetEnvVar(name string) (string, error)
GetEnvVar returns the environment variable name for a model's API key
func (*ModelsConfig) GetExecutorForModel ¶
func (c *ModelsConfig) GetExecutorForModel(name string) (executorName string, modelName string, err error)
GetExecutorForModel returns the appropriate executor for a model. Returns the executor name (e.g., "claude", "gemini") and the model name to use.
func (*ModelsConfig) GetModel ¶
func (c *ModelsConfig) GetModel(name string) (*ModelConfig, error)
GetModel returns the configuration for a model by friendly name
func (*ModelsConfig) GetProvider ¶
func (c *ModelsConfig) GetProvider(name string) (string, error)
GetProvider returns the provider for a model
func (*ModelsConfig) ListModels ¶
func (c *ModelsConfig) ListModels() []string
ListModels returns all configured model names
func (*ModelsConfig) SupportsAgentEval ¶
func (c *ModelsConfig) SupportsAgentEval(name string) bool
SupportsAgentEval returns true if the model supports agent-based evaluation
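Putting the accessors together, a hedged sketch of walking the benchmark suite with pricing; the token counts are illustrative:

cfg, err := LoadModelsConfig("models.yml")
if err != nil {
	log.Fatal(err)
}
for _, name := range cfg.GetBenchmarkSuite() {
	if !cfg.SupportsAgentEval(name) {
		continue // text-only model: skipped in agent mode
	}
	cost, err := cfg.CalculateCostForModel(name, 1200, 800)
	if err != nil {
		log.Fatal(err) // unknown model or missing pricing: fail loudly
	}
	fmt.Printf("%s: $%.4f for 1200 in / 800 out\n", name, cost)
}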
type MultiExecutorConfig ¶
type MultiExecutorConfig struct {
AgentBenchmarkConfig
// ExecutorName specifies which executor to use (e.g., "claude", "gemini")
// If empty, uses the model's agent_cli from models.yml
ExecutorName string
// ModelName is the model to use (e.g., "claude-sonnet-4-5", "gemini-3-flash")
ModelName string
// ConfigKey is the models.yml lookup key for per-model config (e.g., "opencode-gemma4-e4b").
// When set, overrides ModelName for timeout/config lookups. Needed when ModelName is the
// resolved API model name (e.g., "ollama/gemma4:e4b") rather than the models.yml key.
ConfigKey string
// ExtraHandler is an additional event handler composed with the debug handler.
// Used for ObservatoryWriter to capture structured tool calls during streaming.
// When nil, only the debug handler is used (no behavior change).
ExtraHandler executor.EventHandler
}
MultiExecutorConfig extends AgentBenchmarkConfig with executor selection
type Pricing ¶
type Pricing struct {
InputPer1K float64 `yaml:"input_per_1k"`
OutputPer1K float64 `yaml:"output_per_1k"`
}
Pricing represents model pricing information
type PromptLoader ¶
type PromptLoader struct {
// contains filtered or unexported fields
}
PromptLoader loads and verifies prompt versions
func NewPromptLoader ¶
func NewPromptLoader(registryPath string) (*PromptLoader, error)
NewPromptLoader creates a loader from versions.json
func (*PromptLoader) GetActivePrompt ¶
func (l *PromptLoader) GetActivePrompt() (string, error)
GetActivePrompt loads the active prompt version. Supports the special value "latest" to automatically use the most recent version.
func (*PromptLoader) GetActiveVersionID ¶
func (l *PromptLoader) GetActiveVersionID() string
GetActiveVersionID returns the active version ID (resolving "latest" if needed)
func (*PromptLoader) GetVersion ¶
func (l *PromptLoader) GetVersion(versionID string) (*PromptVersion, error)
GetVersion returns metadata for a specific version
func (*PromptLoader) ListVersions ¶
func (l *PromptLoader) ListVersions() map[string]PromptVersion
ListVersions returns all available prompt versions
func (*PromptLoader) LoadPrompt ¶
func (l *PromptLoader) LoadPrompt(versionID string) (string, error)
LoadPrompt loads a prompt by version ID with hash verification
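Typical loader usage; the registry path is illustrative, and hash verification applies when a prompt file is loaded:

loader, err := NewPromptLoader("prompts/versions.json")
if err != nil {
	log.Fatal(err)
}
prompt, err := loader.GetActivePrompt() // resolves "latest" to the newest version
if err != nil {
	log.Fatal(err) // includes hash-mismatch failures against the registry
}
fmt.Println(loader.GetActiveVersionID(), len(prompt))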
type PromptRegistry ¶
type PromptRegistry struct {
SchemaVersion string `json:"schema_version"`
Versions map[string]PromptVersion `json:"versions"`
Active string `json:"active"`
Notes []string `json:"notes"`
}
PromptRegistry contains all registered prompt versions
type PromptVersion ¶
type PromptVersion struct {
File string `json:"file"`
Hash string `json:"hash"`
Description string `json:"description"`
Created string `json:"created"`
Tags []string `json:"tags"`
Notes string `json:"notes"`
}
PromptVersion represents metadata about a prompt version
type PythonRunner ¶
type PythonRunner struct {
// contains filtered or unexported fields
}
PythonRunner executes Python code
func NewPythonRunner ¶
func NewPythonRunner() *PythonRunner
NewPythonRunner creates a new Python runner
func NewPythonRunnerWithSpec ¶
func NewPythonRunnerWithSpec(spec *BenchmarkSpec) *PythonRunner
NewPythonRunnerWithSpec creates a new Python runner with benchmark spec for test infrastructure
type RepairHint ¶
type RepairHint struct {
Title string // Short description of the error
Why string // Explanation of why the error occurred
How string // Concrete steps to fix the error
}
RepairHint provides actionable guidance for fixing an error
func FormatZ3RepairHint ¶
func FormatZ3RepairHint(verifyStderr string) *RepairHint
FormatZ3RepairHint creates a RepairHint from Z3 verification output (M-CONTRACT-EVAL)
type RepairLog ¶
type RepairLog struct {
Wrapped bool // True if bare expression was wrapped in module scaffold
AddedModule bool // True if module declaration was added
AddedImports []string // List of imports that were injected
CallFixes int // Number of bare function calls that were fixed with parens
AddedMainFunc bool // True if main function was synthesized
}
RepairLog tracks transformations applied by normalizeProgram
type RepairRunner ¶
type RepairRunner struct {
// contains filtered or unexported fields
}
RepairRunner orchestrates self-repair logic for eval benchmarks
func NewRepairRunner ¶
func NewRepairRunner(agent *AIAgent, runner LanguageRunner, spec *BenchmarkSpec, timeout time.Duration, selfRepair bool) *RepairRunner
NewRepairRunner creates a new repair runner
func (*RepairRunner) Run ¶
func (r *RepairRunner) Run(ctx context.Context, prompt string) (*RunMetrics, error)
Run executes the benchmark with optional self-repair
func (*RepairRunner) SetPromptVersion ¶
func (r *RepairRunner) SetPromptVersion(version string)
SetPromptVersion sets the prompt version ID for metrics tracking
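A sketch of wiring the pieces together, assuming agent (an *AIAgent from NewAIAgent), spec (from LoadSpec), and ctx are already in scope:

runner, err := GetRunner("ailang", spec)
if err != nil {
	log.Fatal(err)
}
rr := NewRepairRunner(agent, runner, spec, 60*time.Second, true) // selfRepair enabled
metrics, err := rr.Run(ctx, spec.PromptForLanguage("ailang"))
if err != nil {
	log.Fatal(err)
}
fmt.Printf("first_attempt_ok=%v repair_ok=%v\n", metrics.FirstAttemptOk, metrics.RepairOk)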
type RetryConfig ¶
RetryConfig configures retry behavior
type RunMetrics ¶
type RunMetrics struct {
ID string `json:"id"`
Lang string `json:"lang"`
Model string `json:"model"`
Executor string `json:"executor,omitempty"` // Executor used: "claude", "gemini", etc. (agent mode only)
Seed int64 `json:"seed"`
InputTokens int `json:"input_tokens"` // Prompt tokens (recorded but not primary metric)
OutputTokens int `json:"output_tokens"` // Generated code tokens (PRIMARY METRIC)
TotalTokens int `json:"total_tokens"` // Total for billing
CostUSD float64 `json:"cost_usd"`
CompileOk bool `json:"compile_ok"`
RuntimeOk bool `json:"runtime_ok"`
StdoutOk bool `json:"stdout_ok"`
DurationMs int64 `json:"duration_ms"` // Total time (startup + compile + execution)
CompileMs int64 `json:"compile_ms"` // Time spent in compilation (if separate)
ExecuteMs int64 `json:"execute_ms"` // Time spent in execution (if measurable)
ErrorCategory string `json:"error_category"` // compile_error | runtime_error | logic_error | none
Stdout string `json:"stdout,omitempty"`
Stderr string `json:"stderr,omitempty"`
ExpectedStdout string `json:"expected_stdout,omitempty"`
Timestamp time.Time `json:"timestamp"`
Code string `json:"code,omitempty"` // Generated code (optional, for debugging)
// Self-repair metrics (M-EVAL-LOOP)
FirstAttemptOk bool `json:"first_attempt_ok"` // Did first attempt succeed?
RepairUsed bool `json:"repair_used"` // Did we attempt a repair?
RepairOk bool `json:"repair_ok"` // Did repair succeed?
ErrCode string `json:"err_code,omitempty"` // Error code from taxonomy (PAR_001, etc.)
RepairTokensIn int `json:"repair_tokens_in,omitempty"` // Input tokens for repair attempt
RepairTokensOut int `json:"repair_tokens_out,omitempty"` // Output tokens for repair attempt
// Prompt versioning (M-EVAL-LOOP)
PromptVersion string `json:"prompt_version,omitempty"` // Prompt version used (v0.3.0-hints, etc.)
// Reproducibility (M-EVAL-LOOP)
BinaryHash string `json:"binary_hash,omitempty"` // SHA256 of ailang binary
StdlibHash string `json:"stdlib_hash,omitempty"` // SHA256 of stdlib
Caps []string `json:"caps,omitempty"` // Capabilities granted
// Contract verification results (M-CONTRACT-EVAL)
VerifyOk bool `json:"verify_ok"` // All contracts verified
VerifyVerified int `json:"verify_verified"` // Count of verified functions
VerifyCounterex int `json:"verify_counterexample"` // Count of counterexamples
VerifySkipped int `json:"verify_skipped"` // Count of skipped functions
VerifyErrors int `json:"verify_errors"` // Count of Z3 errors
VerifyJSON string `json:"verify_json,omitempty"` // Full ai-check JSON output
// Agent mode KPIs (M-EVAL-AGENT)
AgentTurns int `json:"agent_turns,omitempty"` // Number of conversation turns (agent mode only)
AgentTranscript string `json:"agent_transcript,omitempty"` // Full Claude conversation transcript (agent mode only)
EvalMode string `json:"eval_mode,omitempty"` // Evaluation mode: "standard" or "agent"
// Experimental condition (M-CONTRACT-EVAL conditions dimension)
Condition string `json:"condition,omitempty"` // Experimental condition: "baseline", "contract", "z3_guided", "full"
// μRAG state for this run (M-BRAIN-MICRORAG)
// Values: "on" | "off" | "auto" | "disabled" | "" (legacy / not set).
// Lets eval-report break results down with vs. without JIT knowledge injection.
MicroragState string `json:"microrag_state,omitempty"`
// Cross-harness grouping (M-EVAL-CROSS-HARNESS)
// Populated from models.yml model_family field. Enables --group-by=model-family
// in eval-matrix to compare same model across different harnesses (e.g. claude vs opencode).
ModelFamily string `json:"model_family,omitempty"`
}
RunMetrics captures the results of a single benchmark run
func NewRunMetrics ¶
func NewRunMetrics(id, lang, model string, seed int64) *RunMetrics
NewRunMetrics creates a new RunMetrics with timestamp and error category. MicroragState is auto-populated from the inherited env so every metrics emission honours the eval-suite --microrag flag (M-BRAIN-MICRORAG).
type RunResult ¶
type RunResult struct {
Stdout string
Stderr string
ExitCode int
Duration time.Duration // Total time (startup + compile + execution)
CompileTime time.Duration // Time spent in compilation/type-checking (if separate)
ExecuteTime time.Duration // Time spent in actual code execution (if measurable)
CompileOk bool
RuntimeOk bool
StdoutOk bool
TimedOut bool
CodeHash string // SHA256 hash of executed code (for validation)
WorkspaceDir string // Path to isolated workspace (for debugging)
}
RunResult captures the outcome of running generated code
type TelemetryReporter ¶
type TelemetryReporter struct {
// contains filtered or unexported fields
}
TelemetryReporter sends telemetry updates to the collaboration hub server. This allows the eval suite and other external processes to report their Claude usage.
func NewTelemetryReporter ¶
func NewTelemetryReporter(serverURL, instanceID string) *TelemetryReporter
NewTelemetryReporter creates a new telemetry reporter. serverURL should be like "http://localhost:8090" (the collaboration hub). If serverURL is empty, the AILANG_HUB_URL environment variable is checked; if neither is set, telemetry reporting is disabled.
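A usage sketch; the instance ID and token counts are illustrative, and the empty serverURL falls back to AILANG_HUB_URL:

tr := NewTelemetryReporter("", "eval-run-42")
if tr.IsEnabled() {
	tr.SetStatus("running")
	tr.IncrementTurn()
	tr.AddUsage(1200, 800, 0.0135)
	tr.Complete(1200, 800, 0.0135)
}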
func (*TelemetryReporter) AddUsage ¶
func (t *TelemetryReporter) AddUsage(tokensIn, tokensOut int, cost float64)
AddUsage adds to the token usage and cost (for cumulative updates)
func (*TelemetryReporter) Complete ¶
func (t *TelemetryReporter) Complete(tokensIn, tokensOut int, cost float64)
Complete marks the session as complete with final metrics
func (*TelemetryReporter) Error ¶
func (t *TelemetryReporter) Error()
Error marks the session as failed
func (*TelemetryReporter) IncrementTurn ¶
func (t *TelemetryReporter) IncrementTurn()
IncrementTurn increments the turn counter and sends an update
func (*TelemetryReporter) IsEnabled ¶
func (t *TelemetryReporter) IsEnabled() bool
IsEnabled returns whether telemetry reporting is enabled
func (*TelemetryReporter) SetStatus ¶
func (t *TelemetryReporter) SetStatus(status string)
SetStatus sets the process status (running, completed, error)
func (*TelemetryReporter) SetUsage ¶
func (t *TelemetryReporter) SetUsage(tokensIn, tokensOut int, cost float64)
SetUsage sets the token usage and cost
type TokenUsage ¶
type TokenUsage struct {
InputTokens int `json:"input_tokens"`
CacheCreationInputTokens int `json:"cache_creation_input_tokens"`
CacheReadInputTokens int `json:"cache_read_input_tokens"`
OutputTokens int `json:"output_tokens"`
}
TokenUsage captures detailed token metrics
type ValidationResult ¶
type ValidationResult struct {
CompileOk bool
RuntimeOk bool
StdoutOk bool
Stdout string
Stderr string
}
ValidationResult holds detailed validation results for agent benchmarks
type Watchdog ¶
type Watchdog struct {
MaxAge time.Duration // Kill processes older than this
CheckPeriod time.Duration // How often to check
Pattern string // Process pattern to match
KilledCount int // Number of orphans killed
Enabled bool // Whether watchdog is active
}
Watchdog monitors for orphaned eval processes and kills them. This is a safety net for cases where process group cleanup fails.
func NewWatchdog ¶
NewWatchdog creates a new Watchdog with the specified settings
func (*Watchdog) KillOrphans ¶
KillOrphans performs an immediate check-and-kill of orphaned processes. This can be called during shutdown for extra cleanup.
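A construction sketch using the exported fields; the durations and pattern are illustrative, and KillOrphans is assumed to take no arguments:

wd := &Watchdog{
	MaxAge:      10 * time.Minute,
	CheckPeriod: 30 * time.Second,
	Pattern:     "ailang",
	Enabled:     true,
}
// Final sweep for stragglers during shutdown.
wd.KillOrphans()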