eval

package
v0.2.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 15, 2026 License: MIT Imports: 19 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func ApplyFilters

func ApplyFilters(input string, filters ...Filter) string

ApplyFilters runs a chain of filters on the input.

func CompareModels

func CompareModels(results map[string]*SuiteResult) string

CompareModels produces a side-by-side comparison table for multiple models.

func DeltaCoverage

func DeltaCoverage(before, after *CoverageReport) string

DeltaCoverage computes and formats the difference between two coverage reports.

func DiscoverTestPackages

func DiscoverTestPackages(projectDir string) ([]string, error)

DiscoverTestPackages finds all directories containing _test.go files under projectDir and returns their relative import paths.

func FindUncoveredFunctions

func FindUncoveredFunctions(profile *CoverageReport, projectDir string) []string

FindUncoveredFunctions cross-references coverage data with the AST to identify functions that have zero coverage.

func FormatReport

func FormatReport(report *CoverageReport) string

FormatReport produces a human-readable coverage report with visual bars.

func FormatResult

func FormatResult(result *RunnerResult) string

FormatResult returns a human-readable summary of test results.

func GenerateLeaderboard

func GenerateLeaderboard(results []SuiteResult) string

GenerateLeaderboard produces a markdown leaderboard comparing multiple suite results.

func GenerateReport

func GenerateReport(result *SuiteResult) string

GenerateReport produces a markdown report from a suite result.

func GenerateTestTemplate

func GenerateTestTemplate(funcName, pkg, signature string) string

GenerateTestTemplate creates a table-driven test template for a function.

func Percentile

func Percentile(durations []time.Duration, pct float64) time.Duration

Percentile computes the given percentile from a slice of durations. pct should be between 0 and 100 (e.g., 50 for P50, 95 for P95).

func StripMarkdown

func StripMarkdown(s string) string

StripMarkdown removes all markdown formatting, keeping only code content.

func TrimExplanation

func TrimExplanation(s string) string

TrimExplanation removes common LLM explanation prefixes/suffixes.

Types

type BenchmarkSuite

type BenchmarkSuite struct {
	Name    string
	Tasks   []BenchmarkTask
	Results []TaskResult
}

BenchmarkSuite represents a collection of benchmark tasks for evaluation.

func GoTasks

func GoTasks() *BenchmarkSuite

GoTasks returns a BenchmarkSuite with 15 Go coding tasks.

type BenchmarkTask

type BenchmarkTask struct {
	ID          string
	Description string
	SetupFn     func(workDir string) error
	ValidateFn  func(workDir string) (bool, string)
	Prompt      string
	TimeLimit   time.Duration
	Tags        []string
	MaxAttempts int
	Filters     []Filter
}

BenchmarkTask defines a single coding task to evaluate.

func LoadTasksFromYAML

func LoadTasksFromYAML(dir string) ([]BenchmarkTask, error)

LoadTasksFromYAML loads task definitions from a directory of YAML files.

type Cache

type Cache struct {
	Dir string
}

Cache stores LLM responses keyed by model and prompt (see Key) to avoid re-calling APIs.

func DefaultCache

func DefaultCache() *Cache

DefaultCache returns a cache at ~/.hawk/eval/cache/.

func (*Cache) Clear

func (c *Cache) Clear() error

Clear removes all cached entries.

func (*Cache) Get

func (c *Cache) Get(model, prompt string) *CacheEntry

Get retrieves a cached response. Returns nil if not found.

func (*Cache) Key

func (c *Cache) Key(model, prompt string) string

Key computes a cache key from model and prompt.

func (*Cache) Put

func (c *Cache) Put(model, prompt, response string, tokens int, cost float64) error

Put stores a response in the cache.

type CacheEntry

type CacheEntry struct {
	Model    string  `json:"model"`
	Prompt   string  `json:"prompt"`
	Response string  `json:"response"`
	Tokens   int     `json:"tokens"`
	CostUSD  float64 `json:"cost_usd"`
}

CacheEntry is a single cached LLM response.

type CoverageAnalyzer

type CoverageAnalyzer struct {
	ProjectDir string
	// contains filtered or unexported fields
}

CoverageAnalyzer runs coverage analysis on a Go project.

func NewCoverageAnalyzer

func NewCoverageAnalyzer(projectDir string) *CoverageAnalyzer

NewCoverageAnalyzer creates a new CoverageAnalyzer for the given project directory.

func (*CoverageAnalyzer) RunCoverage

func (ca *CoverageAnalyzer) RunCoverage() (*CoverageReport, error)

RunCoverage executes go test with coverage and builds a structured report.

type CoverageReport

type CoverageReport struct {
	TotalLines         int
	CoveredLines       int
	Percentage         float64
	Files              []FileCoverage
	UncoveredFunctions []string
	Suggestions        []TestSuggestion
}

CoverageReport holds overall coverage metrics and per-file details.

func ParseCoverageProfile

func ParseCoverageProfile(data string) (*CoverageReport, error)

ParseCoverageProfile parses the Go coverage profile format. Each line after the mode header has the form:

file.go:startLine.startCol,endLine.endCol numStmts count

type FileCoverage

type FileCoverage struct {
	Path            string
	TotalLines      int
	CoveredLines    int
	Percentage      float64
	UncoveredRanges []LineRange
}

FileCoverage holds coverage information for a single source file.

type Filter

type Filter func(string) string

Filter transforms LLM output before validation.

func ExtractCodeBlock

func ExtractCodeBlock(lang string) Filter

ExtractCodeBlock returns a Filter that extracts the first fenced code block matching the given language. If no block matches, the filter returns the original string.

type GroupResult

type GroupResult struct {
	Name     string  `json:"name"`
	Total    int     `json:"total"`
	Passed   int     `json:"passed"`
	PassRate float64 `json:"pass_rate"`
}

GroupResult holds aggregated metrics for a task group.

func AggregateGroupResults

func AggregateGroupResults(groups []TaskGroup, results []TaskResult) []GroupResult

AggregateGroupResults computes pass rates per group from task results.

type LLMClient

type LLMClient interface {
	Complete(ctx context.Context, model, prompt string) (response string, tokens int, cost float64, err error)
}

LLMClient is the interface for invoking an LLM during evaluation.

type LineRange

type LineRange struct {
	Start        int
	End          int
	FunctionName string
}

LineRange represents a contiguous range of uncovered lines.

type ModelBenchmark

type ModelBenchmark struct {
	Name    string
	Models  []ModelConfig
	Tasks   []BenchmarkTask
	Runs    int
	Results map[string]*ModelResult
}

ModelBenchmark orchestrates benchmarking multiple LLM models on standardized tasks.

func NewModelBenchmark

func NewModelBenchmark(name string, models []ModelConfig) *ModelBenchmark

NewModelBenchmark creates a new benchmark configured with the given models.

func (*ModelBenchmark) AnalyzeStrengths

func (mb *ModelBenchmark) AnalyzeStrengths(model string) ([]string, []string)

AnalyzeStrengths returns (strengths, weaknesses) for the given model by looking at which task categories the model excels at and which it fails at.

func (*ModelBenchmark) Compare

func (mb *ModelBenchmark) Compare() string

Compare produces a side-by-side model comparison string.

func (*ModelBenchmark) ExportCSV

func (mb *ModelBenchmark) ExportCSV() string

ExportCSV exports benchmark results as CSV for external analysis.

func (*ModelBenchmark) RankModels

func (mb *ModelBenchmark) RankModels(by string) []ModelResult

RankModels returns model results sorted by the given criterion.

func (*ModelBenchmark) RecommendModel

func (mb *ModelBenchmark) RecommendModel(taskType string) string

RecommendModel recommends the best model for the given task type based on results.

func (*ModelBenchmark) RunAll

func (mb *ModelBenchmark) RunAll(ctx context.Context, chatFn func(context.Context, ModelConfig, string) (string, int, float64, error)) error

RunAll executes all tasks for all models, repeating Runs times each. chatFn is called with (ctx, model config, prompt) and returns (response, tokens, cost, error).

type ModelConfig

type ModelConfig struct {
	Name        string
	Provider    string
	Model       string
	Temperature float64
	MaxTokens   int
}

ModelConfig describes an LLM model configuration for benchmarking.

type ModelResult

type ModelResult struct {
	Model       string
	PassRate    float64
	AvgTokens   int
	AvgCostUSD  float64
	AvgDuration time.Duration
	P50Duration time.Duration
	P95Duration time.Duration
	TaskResults []TaskResult
	Strengths   []string
	Weaknesses  []string
}

ModelResult holds aggregated benchmark results for a single model.

type PackageResult

type PackageResult struct {
	Package  string
	Passed   int
	Failed   int
	Skipped  int
	Duration time.Duration
	Output   string
	Tests    []TestCaseResult
	Error    string
}

PackageResult holds the test results for a single Go package.

type ParallelRunner

type ParallelRunner struct {
	MaxWorkers int
	Timeout    time.Duration
	Results    map[string]*PackageResult
	// contains filtered or unexported fields
}

ParallelRunner executes test packages concurrently using a worker pool.

func NewParallelRunner

func NewParallelRunner(workers int) *ParallelRunner

NewParallelRunner creates a ParallelRunner with the specified worker count. If workers <= 0, defaults to runtime.NumCPU().

func (*ParallelRunner) GetFailed

func (r *ParallelRunner) GetFailed() []TestCaseResult

GetFailed returns all failed test cases across all packages.

func (*ParallelRunner) GetSlowest

func (r *ParallelRunner) GetSlowest(n int) []TestCaseResult

GetSlowest returns the n slowest test cases across all packages.

func (*ParallelRunner) RunAll

func (r *ParallelRunner) RunAll(ctx context.Context, projectDir string) (*RunnerResult, error)

RunAll discovers all test packages under projectDir and runs them in parallel.

func (*ParallelRunner) RunPackages

func (r *ParallelRunner) RunPackages(ctx context.Context, packages []string) (*RunnerResult, error)

RunPackages runs the specified packages concurrently using a worker pool.

func (*ParallelRunner) RunSinglePackage

func (r *ParallelRunner) RunSinglePackage(ctx context.Context, pkg string) *PackageResult

RunSinglePackage runs `go test -v -json` for a single package and parses the output.

type ResultHash

type ResultHash struct {
	TasksHash  string `json:"tasks_hash"`
	PromptHash string `json:"prompt_hash"`
	GitCommit  string `json:"git_commit"`
	GoVersion  string `json:"go_version"`
	OS         string `json:"os"`
	Arch       string `json:"arch"`
}

ResultHash captures reproducibility information for a benchmark run.

func ComputeHash

func ComputeHash(tasks []BenchmarkTask) *ResultHash

ComputeHash generates reproducibility hashes for a set of tasks.

type ResultStore

type ResultStore struct {
	Dir string
}

ResultStore handles reading/writing eval results to disk.

func DefaultResultStore

func DefaultResultStore() *ResultStore

DefaultResultStore returns a store at ~/.hawk/eval/results/.

func (*ResultStore) List

func (s *ResultStore) List() ([]string, error)

List returns all result files in the store directory.

func (*ResultStore) Load

func (s *ResultStore) Load(path string) (*StoredResult, error)

Load reads a stored result from a JSON file.

func (*ResultStore) Save

func (s *ResultStore) Save(result *SuiteResult, model, provider string, hash *ResultHash) (string, error)

Save writes a SuiteResult to disk as JSON.

type ResultSummary

type ResultSummary struct {
	TotalTasks    int     `json:"total_tasks"`
	Passed        int     `json:"passed"`
	Failed        int     `json:"failed"`
	PassRate      float64 `json:"pass_rate"`
	TotalDuration string  `json:"total_duration"`
	TotalTokens   int     `json:"total_tokens"`
	TotalCostUSD  float64 `json:"total_cost_usd"`
}

ResultSummary holds the top-level metrics of a StoredResult.

type Runner

type Runner struct {
	Model       string
	Provider    string
	MaxAttempts int
	Timeout     time.Duration
	LLM         LLMClient
	Cache       *Cache
	NoCache     bool
	Filters     []Filter
}

Runner executes benchmark tasks against a specific model/provider.

func NewRunner

func NewRunner(model, provider string) *Runner

NewRunner creates a Runner configured for the given model and provider.

func (*Runner) Run

func (r *Runner) Run(ctx context.Context, suite *BenchmarkSuite) (*SuiteResult, error)

Run executes all tasks in a benchmark suite and returns aggregated results.

func (*Runner) RunSingle

func (r *Runner) RunSingle(ctx context.Context, task *BenchmarkTask) (*TaskResult, error)

RunSingle executes a single benchmark task in an isolated temporary directory.

type RunnerResult

type RunnerResult struct {
	Packages     []PackageResult
	TotalPassed  int
	TotalFailed  int
	TotalSkipped int
	Duration     time.Duration
	Parallel     int
}

RunnerResult aggregates test results from all packages.

type StoredResult

type StoredResult struct {
	Version   string             `json:"version"`
	Timestamp time.Time          `json:"timestamp"`
	Suite     string             `json:"suite"`
	Model     string             `json:"model"`
	Provider  string             `json:"provider"`
	Hash      *ResultHash        `json:"hash,omitempty"`
	Summary   ResultSummary      `json:"summary"`
	Tasks     []StoredTaskResult `json:"tasks"`
}

StoredResult is the persistent JSON format for eval results.

type StoredTaskResult

type StoredTaskResult struct {
	TaskID   string   `json:"task_id"`
	Passed   bool     `json:"passed"`
	Duration string   `json:"duration"`
	Tokens   int      `json:"tokens"`
	CostUSD  float64  `json:"cost_usd"`
	Attempts int      `json:"attempts"`
	Error    string   `json:"error,omitempty"`
	Tags     []string `json:"tags,omitempty"`
}

StoredTaskResult is the per-task persistent format.

type SuiteResult

type SuiteResult struct {
	Suite         string
	TotalTasks    int
	Passed        int
	Failed        int
	TotalDuration time.Duration
	TotalTokens   int
	TotalCostUSD  float64
	PassRate      float64
	Results       []TaskResult
}

SuiteResult aggregates results from running an entire benchmark suite.

type TaskGroup

type TaskGroup struct {
	Name  string
	Tags  []string // tasks matching any of these tags belong to this group
	Tasks []BenchmarkTask
}

TaskGroup defines a named collection of tasks selected by tag. Aggregated metrics for a group are reported separately in GroupResult.

func DefaultGroups

func DefaultGroups() []TaskGroup

DefaultGroups returns the standard task groupings.

func GroupTasks

func GroupTasks(tasks []BenchmarkTask, groups []TaskGroup) []TaskGroup

GroupTasks assigns tasks to groups based on tag matching.

type TaskResult

type TaskResult struct {
	TaskID     string
	Passed     bool
	Duration   time.Duration
	TokensUsed int
	CostUSD    float64
	Attempts   int
	Error      string
}

TaskResult captures the outcome of running a single benchmark task.

type TestCaseResult

type TestCaseResult struct {
	Name     string
	Passed   bool
	Duration time.Duration
	Output   string
}

TestCaseResult represents the outcome of a single test case.

func ParseTestJSON

func ParseTestJSON(output string) []TestCaseResult

ParseTestJSON parses `go test -json` output into structured test results.

type TestSuggestion

type TestSuggestion struct {
	Function string
	File     string
	Priority string
	Reason   string
	Template string
}

TestSuggestion recommends a test to write for uncovered code.

func SuggestTests

func SuggestTests(uncovered []string) []TestSuggestion

SuggestTests generates test suggestions for uncovered functions.

type YAMLTask

type YAMLTask struct {
	Task        string            `yaml:"task"`
	Description string            `yaml:"description"`
	Language    string            `yaml:"language"`
	Tags        []string          `yaml:"tags"`
	Timeout     string            `yaml:"timeout"`
	MaxAttempts int               `yaml:"max_attempts"`
	Setup       string            `yaml:"setup"`
	Prompt      string            `yaml:"prompt"`
	Validate    []string          `yaml:"validate"`
	Files       map[string]string `yaml:"files"`
	Filters     []string          `yaml:"filters"`
}

YAMLTask is the declarative task definition format.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL