eval

package

Version: v0.2.0 (latest) | Published: May 15, 2026 | License: MIT | Imports: 19 | Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/GrayCodeAI/hawk

Links

Open Source Insights

Documentation ¶

Index ¶

func ApplyFilters(input string, filters ...Filter) string
func CompareModels(results map[string]*SuiteResult) string
func DeltaCoverage(before, after *CoverageReport) string
func DiscoverTestPackages(projectDir string) ([]string, error)
func FindUncoveredFunctions(profile *CoverageReport, projectDir string) []string
func FormatReport(report *CoverageReport) string
func FormatResult(result *RunnerResult) string
func GenerateLeaderboard(results []SuiteResult) string
func GenerateReport(result *SuiteResult) string
func GenerateTestTemplate(funcName, pkg, signature string) string
func Percentile(durations []time.Duration, pct float64) time.Duration
func StripMarkdown(s string) string
func TrimExplanation(s string) string
type BenchmarkSuite
- func GoTasks() *BenchmarkSuite
type BenchmarkTask
- func LoadTasksFromYAML(dir string) ([]BenchmarkTask, error)
type Cache
- func DefaultCache() *Cache
- func (c *Cache) Clear() error
- func (c *Cache) Get(model, prompt string) *CacheEntry
- func (c *Cache) Key(model, prompt string) string
- func (c *Cache) Put(model, prompt, response string, tokens int, cost float64) error
type CacheEntry
type CoverageAnalyzer
- func NewCoverageAnalyzer(projectDir string) *CoverageAnalyzer
- func (ca *CoverageAnalyzer) RunCoverage() (*CoverageReport, error)
type CoverageReport
- func ParseCoverageProfile(data string) (*CoverageReport, error)
type FileCoverage
type Filter
- func ExtractCodeBlock(lang string) Filter
type GroupResult
- func AggregateGroupResults(groups []TaskGroup, results []TaskResult) []GroupResult
type LLMClient
type LineRange
type ModelBenchmark
- func NewModelBenchmark(name string, models []ModelConfig) *ModelBenchmark
- func (mb *ModelBenchmark) AnalyzeStrengths(model string) ([]string, []string)
- func (mb *ModelBenchmark) Compare() string
- func (mb *ModelBenchmark) ExportCSV() string
- func (mb *ModelBenchmark) RankModels(by string) []ModelResult
- func (mb *ModelBenchmark) RecommendModel(taskType string) string
- func (mb *ModelBenchmark) RunAll(ctx context.Context, chatFn func(context.Context, ModelConfig, string) (string, int, float64, error)) error
type ModelConfig
type ModelResult
type PackageResult
type ParallelRunner
- func NewParallelRunner(workers int) *ParallelRunner
- func (r *ParallelRunner) GetFailed() []TestCaseResult
- func (r *ParallelRunner) GetSlowest(n int) []TestCaseResult
- func (r *ParallelRunner) RunAll(ctx context.Context, projectDir string) (*RunnerResult, error)
- func (r *ParallelRunner) RunPackages(ctx context.Context, packages []string) (*RunnerResult, error)
- func (r *ParallelRunner) RunSinglePackage(ctx context.Context, pkg string) *PackageResult
type ResultHash
- func ComputeHash(tasks []BenchmarkTask) *ResultHash
type ResultStore
- func DefaultResultStore() *ResultStore
- func (s *ResultStore) List() ([]string, error)
- func (s *ResultStore) Load(path string) (*StoredResult, error)
- func (s *ResultStore) Save(result *SuiteResult, model, provider string, hash *ResultHash) (string, error)
type ResultSummary
type Runner
- func NewRunner(model, provider string) *Runner
- func (r *Runner) Run(ctx context.Context, suite *BenchmarkSuite) (*SuiteResult, error)
- func (r *Runner) RunSingle(ctx context.Context, task *BenchmarkTask) (*TaskResult, error)
type RunnerResult
type StoredResult
type StoredTaskResult
type SuiteResult
type TaskGroup
- func DefaultGroups() []TaskGroup
- func GroupTasks(tasks []BenchmarkTask, groups []TaskGroup) []TaskGroup
type TaskResult
type TestCaseResult
- func ParseTestJSON(output string) []TestCaseResult
type TestSuggestion
- func SuggestTests(uncovered []string) []TestSuggestion
type YAMLTask

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func ApplyFilters ¶

func ApplyFilters(input string, filters ...Filter) string

ApplyFilters runs a chain of filters on the input.

func CompareModels ¶

func CompareModels(results map[string]*SuiteResult) string

CompareModels produces a side-by-side comparison table for multiple models.

func DeltaCoverage ¶

func DeltaCoverage(before, after *CoverageReport) string

DeltaCoverage computes and formats the difference between two coverage reports.

func DiscoverTestPackages ¶

func DiscoverTestPackages(projectDir string) ([]string, error)

DiscoverTestPackages finds all directories containing _test.go files under projectDir and returns their relative import paths.

func FindUncoveredFunctions ¶

func FindUncoveredFunctions(profile *CoverageReport, projectDir string) []string

FindUncoveredFunctions cross-references coverage data with the AST to identify functions that have zero coverage.

func FormatReport ¶

func FormatReport(report *CoverageReport) string

FormatReport produces a human-readable coverage report with visual bars.

func FormatResult ¶

func FormatResult(result *RunnerResult) string

FormatResult returns a human-readable summary of test results.

func GenerateLeaderboard ¶

func GenerateLeaderboard(results []SuiteResult) string

GenerateLeaderboard produces a markdown leaderboard comparing multiple suite results.

func GenerateReport ¶

func GenerateReport(result *SuiteResult) string

GenerateReport produces a markdown report from a suite result.

func GenerateTestTemplate ¶

func GenerateTestTemplate(funcName, pkg, signature string) string

GenerateTestTemplate creates a table-driven test template for a function.

func Percentile ¶

func Percentile(durations []time.Duration, pct float64) time.Duration

Percentile computes the given percentile from a slice of durations. pct should be between 0 and 100 (e.g., 50 for P50, 95 for P95).

func StripMarkdown ¶

func StripMarkdown(s string) string

StripMarkdown removes all markdown formatting, keeping only code content.

func TrimExplanation ¶

func TrimExplanation(s string) string

TrimExplanation removes common LLM explanation prefixes/suffixes.

Types ¶

type BenchmarkSuite ¶

type BenchmarkSuite struct {
	Name    string
	Tasks   []BenchmarkTask
	Results []TaskResult
}

BenchmarkSuite represents a collection of benchmark tasks for evaluation.

func GoTasks ¶

func GoTasks() *BenchmarkSuite

GoTasks returns a BenchmarkSuite with 15 Go coding tasks.

type BenchmarkTask ¶

type BenchmarkTask struct {
	ID          string
	Description string
	SetupFn     func(workDir string) error
	ValidateFn  func(workDir string) (bool, string)
	Prompt      string
	TimeLimit   time.Duration
	Tags        []string
	MaxAttempts int
	Filters     []Filter
}

BenchmarkTask defines a single coding task to evaluate.

func LoadTasksFromYAML ¶

func LoadTasksFromYAML(dir string) ([]BenchmarkTask, error)

LoadTasksFromYAML loads task definitions from a directory of YAML files.

type Cache ¶

type Cache struct {
	Dir string
}

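Cache stores LLM responses keyed by (model, prompt_hash) to avoid re-calling APIs. Key, Get, and Put take only the model and the prompt, so no other request parameters are supplied as key inputs.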

func DefaultCache ¶

func DefaultCache() *Cache

DefaultCache returns a cache at ~/.hawk/eval/cache/.

func (*Cache) Clear ¶

func (c *Cache) Clear() error

Clear removes all cached entries.

func (*Cache) Get ¶

func (c *Cache) Get(model, prompt string) *CacheEntry

Get retrieves a cached response. Returns nil if not found.

func (*Cache) Key ¶

func (c *Cache) Key(model, prompt string) string

Key computes a cache key from model and prompt.

func (*Cache) Put ¶

func (c *Cache) Put(model, prompt, response string, tokens int, cost float64) error

Put stores a response in the cache.

type CacheEntry ¶

type CacheEntry struct {
	Model    string  `json:"model"`
	Prompt   string  `json:"prompt"`
	Response string  `json:"response"`
	Tokens   int     `json:"tokens"`
	CostUSD  float64 `json:"cost_usd"`
}

CacheEntry is a single cached LLM response.

type CoverageAnalyzer ¶

type CoverageAnalyzer struct {
	ProjectDir string
	// contains filtered or unexported fields
}

CoverageAnalyzer runs coverage analysis on a Go project.

func NewCoverageAnalyzer ¶

func NewCoverageAnalyzer(projectDir string) *CoverageAnalyzer

NewCoverageAnalyzer creates a new CoverageAnalyzer for the given project directory.

func (*CoverageAnalyzer) RunCoverage ¶

func (ca *CoverageAnalyzer) RunCoverage() (*CoverageReport, error)

RunCoverage executes go test with coverage and builds a structured report.

type CoverageReport ¶

type CoverageReport struct {
	TotalLines         int
	CoveredLines       int
	Percentage         float64
	Files              []FileCoverage
	UncoveredFunctions []string
	Suggestions        []TestSuggestion
}

CoverageReport holds overall coverage metrics and per-file details.

func ParseCoverageProfile ¶

func ParseCoverageProfile(data string) (*CoverageReport, error)

ParseCoverageProfile parses the Go coverage profile format. Each line after the mode header has the form:

file.go:startLine.startCol,endLine.endCol numStmts count

type FileCoverage ¶

type FileCoverage struct {
	Path            string
	TotalLines      int
	CoveredLines    int
	Percentage      float64
	UncoveredRanges []LineRange
}

FileCoverage holds coverage information for a single source file.

type Filter ¶

type Filter func(string) string

Filter transforms LLM output before validation.

func ExtractCodeBlock ¶

func ExtractCodeBlock(lang string) Filter

ExtractCodeBlock returns a Filter that extracts the first fenced code block matching the given language. If no block matches, the original string is returned unchanged.

type GroupResult ¶

type GroupResult struct {
	Name     string  `json:"name"`
	Total    int     `json:"total"`
	Passed   int     `json:"passed"`
	PassRate float64 `json:"pass_rate"`
}

GroupResult holds aggregated metrics for a task group.

func AggregateGroupResults ¶

func AggregateGroupResults(groups []TaskGroup, results []TaskResult) []GroupResult

AggregateGroupResults computes pass rates per group from task results.

type LLMClient ¶

type LLMClient interface {
	Complete(ctx context.Context, model, prompt string) (response string, tokens int, cost float64, err error)
}

LLMClient is the interface for invoking an LLM during evaluation.

type LineRange ¶

type LineRange struct {
	Start        int
	End          int
	FunctionName string
}

LineRange represents a contiguous range of uncovered lines.

type ModelBenchmark ¶

type ModelBenchmark struct {
	Name    string
	Models  []ModelConfig
	Tasks   []BenchmarkTask
	Runs    int
	Results map[string]*ModelResult
}

ModelBenchmark orchestrates benchmarking multiple LLM models on standardized tasks.

func NewModelBenchmark ¶

func NewModelBenchmark(name string, models []ModelConfig) *ModelBenchmark

NewModelBenchmark creates a new benchmark configured with the given models.

func (*ModelBenchmark) AnalyzeStrengths ¶

func (mb *ModelBenchmark) AnalyzeStrengths(model string) ([]string, []string)

AnalyzeStrengths returns (strengths, weaknesses) for the given model, based on the task categories in which the model excels or fails.

func (*ModelBenchmark) Compare ¶

func (mb *ModelBenchmark) Compare() string

Compare produces a side-by-side model comparison string.

func (*ModelBenchmark) ExportCSV ¶

func (mb *ModelBenchmark) ExportCSV() string

ExportCSV exports benchmark results as CSV for external analysis.

func (*ModelBenchmark) RankModels ¶

func (mb *ModelBenchmark) RankModels(by string) []ModelResult

RankModels returns model results sorted by the given criterion.

func (*ModelBenchmark) RecommendModel ¶

func (mb *ModelBenchmark) RecommendModel(taskType string) string

RecommendModel recommends the best model for the given task type based on results.

func (*ModelBenchmark) RunAll ¶

func (mb *ModelBenchmark) RunAll(ctx context.Context, chatFn func(context.Context, ModelConfig, string) (string, int, float64, error)) error

RunAll executes all tasks for all models, repeating Runs times each. chatFn is called with (ctx, model config, prompt) and returns (response, tokens, cost, error).

type ModelConfig ¶

type ModelConfig struct {
	Name        string
	Provider    string
	Model       string
	Temperature float64
	MaxTokens   int
}

ModelConfig describes an LLM model configuration for benchmarking.

type ModelResult ¶

type ModelResult struct {
	Model       string
	PassRate    float64
	AvgTokens   int
	AvgCostUSD  float64
	AvgDuration time.Duration
	P50Duration time.Duration
	P95Duration time.Duration
	TaskResults []TaskResult
	Strengths   []string
	Weaknesses  []string
}

ModelResult holds aggregated benchmark results for a single model.

type PackageResult ¶

type PackageResult struct {
	Package  string
	Passed   int
	Failed   int
	Skipped  int
	Duration time.Duration
	Output   string
	Tests    []TestCaseResult
	Error    string
}

PackageResult holds the test results for a single Go package.

type ParallelRunner ¶

type ParallelRunner struct {
	MaxWorkers int
	Timeout    time.Duration
	Results    map[string]*PackageResult
	// contains filtered or unexported fields
}

ParallelRunner executes test packages concurrently using a worker pool.

func NewParallelRunner ¶

func NewParallelRunner(workers int) *ParallelRunner

NewParallelRunner creates a ParallelRunner with the specified worker count. If workers <= 0, defaults to runtime.NumCPU().

func (*ParallelRunner) GetFailed ¶

func (r *ParallelRunner) GetFailed() []TestCaseResult

GetFailed returns all failed test cases across all packages.

func (*ParallelRunner) GetSlowest ¶

func (r *ParallelRunner) GetSlowest(n int) []TestCaseResult

GetSlowest returns the n slowest test cases across all packages.

func (*ParallelRunner) RunAll ¶

func (r *ParallelRunner) RunAll(ctx context.Context, projectDir string) (*RunnerResult, error)

RunAll discovers all test packages under projectDir and runs them in parallel.

func (*ParallelRunner) RunPackages ¶

func (r *ParallelRunner) RunPackages(ctx context.Context, packages []string) (*RunnerResult, error)

RunPackages runs the specified packages concurrently using a worker pool.

func (*ParallelRunner) RunSinglePackage ¶

func (r *ParallelRunner) RunSinglePackage(ctx context.Context, pkg string) *PackageResult

RunSinglePackage runs `go test -v -json` for a single package and parses output.

type ResultHash ¶

type ResultHash struct {
	TasksHash  string `json:"tasks_hash"`
	PromptHash string `json:"prompt_hash"`
	GitCommit  string `json:"git_commit"`
	GoVersion  string `json:"go_version"`
	OS         string `json:"os"`
	Arch       string `json:"arch"`
}

ResultHash captures reproducibility information for a benchmark run.

func ComputeHash ¶

func ComputeHash(tasks []BenchmarkTask) *ResultHash

ComputeHash generates reproducibility hashes for a set of tasks.

type ResultStore ¶

type ResultStore struct {
	Dir string
}

ResultStore handles reading/writing eval results to disk.

func DefaultResultStore ¶

func DefaultResultStore() *ResultStore

DefaultResultStore returns a store at ~/.hawk/eval/results/.

func (*ResultStore) List ¶

func (s *ResultStore) List() ([]string, error)

List returns all result files in the store directory.

func (*ResultStore) Load ¶

func (s *ResultStore) Load(path string) (*StoredResult, error)

Load reads a stored result from a JSON file.

func (*ResultStore) Save ¶

func (s *ResultStore) Save(result *SuiteResult, model, provider string, hash *ResultHash) (string, error)

Save writes a SuiteResult to disk as JSON.

type ResultSummary ¶

type ResultSummary struct {
	TotalTasks    int     `json:"total_tasks"`
	Passed        int     `json:"passed"`
	Failed        int     `json:"failed"`
	PassRate      float64 `json:"pass_rate"`
	TotalDuration string  `json:"total_duration"`
	TotalTokens   int     `json:"total_tokens"`
	TotalCostUSD  float64 `json:"total_cost_usd"`
}

ResultSummary holds the top-level metrics of a StoredResult.

type Runner ¶

type Runner struct {
	Model       string
	Provider    string
	MaxAttempts int
	Timeout     time.Duration
	LLM         LLMClient
	Cache       *Cache
	NoCache     bool
	Filters     []Filter
}

Runner executes benchmark tasks against a specific model/provider.

func NewRunner ¶

func NewRunner(model, provider string) *Runner

NewRunner creates a Runner configured for the given model and provider.

func (*Runner) Run ¶

func (r *Runner) Run(ctx context.Context, suite *BenchmarkSuite) (*SuiteResult, error)

Run executes all tasks in a benchmark suite and returns aggregated results.

func (*Runner) RunSingle ¶

func (r *Runner) RunSingle(ctx context.Context, task *BenchmarkTask) (*TaskResult, error)

RunSingle executes a single benchmark task in an isolated temporary directory.

type RunnerResult ¶

type RunnerResult struct {
	Packages     []PackageResult
	TotalPassed  int
	TotalFailed  int
	TotalSkipped int
	Duration     time.Duration
	Parallel     int
}

RunnerResult aggregates test results from all packages.

type StoredResult ¶

type StoredResult struct {
	Version   string             `json:"version"`
	Timestamp time.Time          `json:"timestamp"`
	Suite     string             `json:"suite"`
	Model     string             `json:"model"`
	Provider  string             `json:"provider"`
	Hash      *ResultHash        `json:"hash,omitempty"`
	Summary   ResultSummary      `json:"summary"`
	Tasks     []StoredTaskResult `json:"tasks"`
}

StoredResult is the persistent JSON format for eval results.

type StoredTaskResult ¶

type StoredTaskResult struct {
	TaskID   string   `json:"task_id"`
	Passed   bool     `json:"passed"`
	Duration string   `json:"duration"`
	Tokens   int      `json:"tokens"`
	CostUSD  float64  `json:"cost_usd"`
	Attempts int      `json:"attempts"`
	Error    string   `json:"error,omitempty"`
	Tags     []string `json:"tags,omitempty"`
}

StoredTaskResult is the per-task persistent format.

type SuiteResult ¶

type SuiteResult struct {
	Suite         string
	TotalTasks    int
	Passed        int
	Failed        int
	TotalDuration time.Duration
	TotalTokens   int
	TotalCostUSD  float64
	PassRate      float64
	Results       []TaskResult
}

SuiteResult aggregates results from running an entire benchmark suite.

type TaskGroup ¶

type TaskGroup struct {
	Name  string
	Tags  []string // tasks matching any of these tags belong to this group
	Tasks []BenchmarkTask
}

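TaskGroup defines a named collection of tasks; a task belongs to the group when it matches any of the group's Tags. Aggregated metrics for a group are held in GroupResult.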

func DefaultGroups ¶

func DefaultGroups() []TaskGroup

DefaultGroups returns the standard task groupings.

func GroupTasks ¶

func GroupTasks(tasks []BenchmarkTask, groups []TaskGroup) []TaskGroup

GroupTasks assigns tasks to groups based on tag matching.

type TaskResult ¶

type TaskResult struct {
	TaskID     string
	Passed     bool
	Duration   time.Duration
	TokensUsed int
	CostUSD    float64
	Attempts   int
	Error      string
}

TaskResult captures the outcome of running a single benchmark task.

type TestCaseResult ¶

type TestCaseResult struct {
	Name     string
	Passed   bool
	Duration time.Duration
	Output   string
}

TestCaseResult represents the outcome of a single test case.

func ParseTestJSON ¶

func ParseTestJSON(output string) []TestCaseResult

ParseTestJSON parses `go test -json` output into structured test results.

type TestSuggestion ¶

type TestSuggestion struct {
	Function string
	File     string
	Priority string
	Reason   string
	Template string
}

TestSuggestion recommends a test to write for uncovered code.

func SuggestTests ¶

func SuggestTests(uncovered []string) []TestSuggestion

SuggestTests generates test suggestions for uncovered functions.

type YAMLTask ¶

type YAMLTask struct {
	Task        string            `yaml:"task"`
	Description string            `yaml:"description"`
	Language    string            `yaml:"language"`
	Tags        []string          `yaml:"tags"`
	Timeout     string            `yaml:"timeout"`
	MaxAttempts int               `yaml:"max_attempts"`
	Setup       string            `yaml:"setup"`
	Prompt      string            `yaml:"prompt"`
	Validate    []string          `yaml:"validate"`
	Files       map[string]string `yaml:"files"`
	Filters     []string          `yaml:"filters"`
}

YAMLTask is the declarative task definition format.

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL