Documentation ¶
Index ¶
- func ApplyFilters(input string, filters ...Filter) string
- func CompareModels(results map[string]*SuiteResult) string
- func DeltaCoverage(before, after *CoverageReport) string
- func DiscoverTestPackages(projectDir string) ([]string, error)
- func FindUncoveredFunctions(profile *CoverageReport, projectDir string) []string
- func FormatReport(report *CoverageReport) string
- func FormatResult(result *RunnerResult) string
- func GenerateLeaderboard(results []SuiteResult) string
- func GenerateReport(result *SuiteResult) string
- func GenerateTestTemplate(funcName, pkg, signature string) string
- func Percentile(durations []time.Duration, pct float64) time.Duration
- func StripMarkdown(s string) string
- func TrimExplanation(s string) string
- type BenchmarkSuite
- type BenchmarkTask
- type Cache
- type CacheEntry
- type CoverageAnalyzer
- type CoverageReport
- type FileCoverage
- type Filter
- type GroupResult
- type LLMClient
- type LineRange
- type ModelBenchmark
- func (mb *ModelBenchmark) AnalyzeStrengths(model string) ([]string, []string)
- func (mb *ModelBenchmark) Compare() string
- func (mb *ModelBenchmark) ExportCSV() string
- func (mb *ModelBenchmark) RankModels(by string) []ModelResult
- func (mb *ModelBenchmark) RecommendModel(taskType string) string
- func (mb *ModelBenchmark) RunAll(ctx context.Context, ...) error
- type ModelConfig
- type ModelResult
- type PackageResult
- type ParallelRunner
- func (r *ParallelRunner) GetFailed() []TestCaseResult
- func (r *ParallelRunner) GetSlowest(n int) []TestCaseResult
- func (r *ParallelRunner) RunAll(ctx context.Context, projectDir string) (*RunnerResult, error)
- func (r *ParallelRunner) RunPackages(ctx context.Context, packages []string) (*RunnerResult, error)
- func (r *ParallelRunner) RunSinglePackage(ctx context.Context, pkg string) *PackageResult
- type ResultHash
- type ResultStore
- type ResultSummary
- type Runner
- type RunnerResult
- type StoredResult
- type StoredTaskResult
- type SuiteResult
- type TaskGroup
- type TaskResult
- type TestCaseResult
- type TestSuggestion
- type YAMLTask
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func ApplyFilters ¶
func ApplyFilters(input string, filters ...Filter) string
ApplyFilters runs a chain of filters on the input.
func CompareModels ¶
func CompareModels(results map[string]*SuiteResult) string
CompareModels produces a side-by-side comparison table for multiple models.
func DeltaCoverage ¶
func DeltaCoverage(before, after *CoverageReport) string
DeltaCoverage computes and formats the difference between two coverage reports.
func DiscoverTestPackages ¶
func DiscoverTestPackages(projectDir string) ([]string, error)
DiscoverTestPackages finds all directories containing _test.go files under projectDir and returns their relative import paths.
func FindUncoveredFunctions ¶
func FindUncoveredFunctions(profile *CoverageReport, projectDir string) []string
FindUncoveredFunctions cross-references coverage data with the AST to identify functions that have zero coverage.
func FormatReport ¶
func FormatReport(report *CoverageReport) string
FormatReport produces a human-readable coverage report with visual bars.
func FormatResult ¶
func FormatResult(result *RunnerResult) string
FormatResult returns a human-readable summary of test results.
func GenerateLeaderboard ¶
func GenerateLeaderboard(results []SuiteResult) string
GenerateLeaderboard produces a markdown leaderboard comparing multiple suite results.
func GenerateReport ¶
func GenerateReport(result *SuiteResult) string
GenerateReport produces a markdown report from a suite result.
func GenerateTestTemplate ¶
func GenerateTestTemplate(funcName, pkg, signature string) string
GenerateTestTemplate creates a table-driven test template for a function.
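A minimal sketch of calling the template generator; the function name, package, and signature strings below are illustrative, not taken from this package.

func ExampleGenerateTestTemplate() {
    // Hypothetical target function for which we want a table-driven test skeleton.
    tmpl := GenerateTestTemplate("ParseConfig", "config", "func ParseConfig(path string) (*Config, error)")
    fmt.Println(tmpl)
}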
func Percentile ¶
func Percentile(durations []time.Duration, pct float64) time.Duration
Percentile computes the given percentile from a slice of durations. pct should be between 0 and 100 (e.g., 50 for P50, 95 for P95).
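A short sketch of computing latency percentiles from raw measurements; the sample durations are made up.

func ExamplePercentile() {
    durations := []time.Duration{
        90 * time.Millisecond,
        120 * time.Millisecond,
        250 * time.Millisecond,
        480 * time.Millisecond,
    }
    fmt.Println(Percentile(durations, 50)) // median
    fmt.Println(Percentile(durations, 95)) // tail latency
}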
func StripMarkdown ¶
func StripMarkdown(s string) string
StripMarkdown removes all markdown formatting, keeping only code content.
func TrimExplanation ¶
func TrimExplanation(s string) string
TrimExplanation removes common LLM explanation prefixes/suffixes.
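A sketch of cleaning a raw model reply before validation; the reply text is invented, and applying StripMarkdown before TrimExplanation is just one plausible ordering.

func ExampleTrimExplanation() {
    raw := "Sure! Here's the code:\n```go\nfunc Add(a, b int) int { return a + b }\n```\nLet me know if you need anything else."
    cleaned := TrimExplanation(StripMarkdown(raw))
    fmt.Println(cleaned)
}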
Types ¶
type BenchmarkSuite ¶
type BenchmarkSuite struct {
    Name    string
    Tasks   []BenchmarkTask
    Results []TaskResult
}
BenchmarkSuite represents a collection of benchmark tasks for evaluation.
func GoTasks ¶
func GoTasks() *BenchmarkSuite
GoTasks returns a BenchmarkSuite with 15 Go coding tasks.
type BenchmarkTask ¶
type BenchmarkTask struct {
    ID          string
    Description string
    SetupFn     func(workDir string) error
    ValidateFn  func(workDir string) (bool, string)
    Prompt      string
    TimeLimit   time.Duration
    Tags        []string
    MaxAttempts int
    Filters     []Filter
}
BenchmarkTask defines a single coding task to evaluate.
func LoadTasksFromYAML ¶
func LoadTasksFromYAML(dir string) ([]BenchmarkTask, error)
LoadTasksFromYAML loads task definitions from a directory of YAML files.
type Cache ¶
type Cache struct {
    Dir string
}
Cache stores LLM responses keyed by (model, prompt_hash, params) so repeated runs avoid redundant API calls.
func (*Cache) Get ¶
func (c *Cache) Get(model, prompt string) *CacheEntry
Get retrieves a cached response. Returns nil if not found.
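A small sketch of consulting the cache before calling a provider; the cache directory, model name, and the miss-handling path are assumptions (only Get is documented here).

func ExampleCache_Get() {
    c := &Cache{Dir: "/tmp/hawk-eval-cache"} // hypothetical cache location
    if entry := c.Get("some-model", "Write a Go function that reverses a string."); entry != nil {
        fmt.Printf("cache hit: %d tokens, $%.4f\n", entry.Tokens, entry.CostUSD)
        return
    }
    fmt.Println("cache miss: call the provider and store the response")
}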
type CacheEntry ¶
type CacheEntry struct {
    Model    string  `json:"model"`
    Prompt   string  `json:"prompt"`
    Response string  `json:"response"`
    Tokens   int     `json:"tokens"`
    CostUSD  float64 `json:"cost_usd"`
}
CacheEntry is a single cached LLM response.
type CoverageAnalyzer ¶
type CoverageAnalyzer struct {
    ProjectDir string
    // contains filtered or unexported fields
}
CoverageAnalyzer runs coverage analysis on a Go project.
func NewCoverageAnalyzer ¶
func NewCoverageAnalyzer(projectDir string) *CoverageAnalyzer
NewCoverageAnalyzer creates a new CoverageAnalyzer for the given project directory.
func (*CoverageAnalyzer) RunCoverage ¶
func (ca *CoverageAnalyzer) RunCoverage() (*CoverageReport, error)
RunCoverage executes go test with coverage and builds a structured report.
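A sketch of a typical coverage pass that chains RunCoverage, FormatReport, FindUncoveredFunctions, and SuggestTests; the project path is illustrative.

func ExampleCoverageAnalyzer_RunCoverage() {
    ca := NewCoverageAnalyzer("./myproject") // hypothetical project directory
    report, err := ca.RunCoverage()
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(FormatReport(report))

    uncovered := FindUncoveredFunctions(report, "./myproject")
    for _, s := range SuggestTests(uncovered) {
        fmt.Printf("[%s] %s: %s\n", s.Priority, s.Function, s.Reason)
    }
}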
type CoverageReport ¶
type CoverageReport struct {
    TotalLines         int
    CoveredLines       int
    Percentage         float64
    Files              []FileCoverage
    UncoveredFunctions []string
    Suggestions        []TestSuggestion
}
CoverageReport holds overall coverage metrics and per-file details.
func ParseCoverageProfile ¶
func ParseCoverageProfile(data string) (*CoverageReport, error)
ParseCoverageProfile parses the Go coverage profile format. Each line after the mode header has the form:
file.go:startLine.startCol,endLine.endCol numStmts count
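A small sketch parsing an in-memory profile in this format; the two blocks below are sample data.

func ExampleParseCoverageProfile() {
    profile := "mode: set\n" +
        "example.com/m/util.go:10.2,14.3 3 1\n" +
        "example.com/m/util.go:16.2,20.3 4 0\n"
    report, err := ParseCoverageProfile(profile)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("covered %d of %d lines (%.1f)\n", report.CoveredLines, report.TotalLines, report.Percentage)
}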
type FileCoverage ¶
type FileCoverage struct {
    Path            string
    TotalLines      int
    CoveredLines    int
    Percentage      float64
    UncoveredRanges []LineRange
}
FileCoverage holds coverage information for a single source file.
type Filter ¶
Filter transforms LLM output before validation.
func ExtractCodeBlock ¶
ExtractCodeBlock extracts the first fenced code block matching the given language. If no matching block is found, it returns the original string.
type GroupResult ¶
type GroupResult struct {
    Name     string  `json:"name"`
    Total    int     `json:"total"`
    Passed   int     `json:"passed"`
    PassRate float64 `json:"pass_rate"`
}
GroupResult holds aggregated metrics for a task group.
func AggregateGroupResults ¶
func AggregateGroupResults(groups []TaskGroup, results []TaskResult) []GroupResult
AggregateGroupResults computes pass rates per group from task results.
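A sketch of computing per-group pass rates after a run; here run is assumed to be the SuiteResult returned by Runner.Run, and the output layout is arbitrary.

// printGroupSummary is a hypothetical helper combining the grouping helpers.
func printGroupSummary(suite *BenchmarkSuite, run *SuiteResult) {
    groups := GroupTasks(suite.Tasks, DefaultGroups())
    for _, g := range AggregateGroupResults(groups, run.Results) {
        fmt.Printf("%-16s %d/%d passed (rate %.2f)\n", g.Name, g.Passed, g.Total, g.PassRate)
    }
}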
type LLMClient ¶
type LLMClient interface {
    Complete(ctx context.Context, model, prompt string) (response string, tokens int, cost float64, err error)
}
LLMClient is the interface for invoking an LLM during evaluation.
type ModelBenchmark ¶
type ModelBenchmark struct {
    Name    string
    Models  []ModelConfig
    Tasks   []BenchmarkTask
    Runs    int
    Results map[string]*ModelResult
}
ModelBenchmark orchestrates benchmarking multiple LLM models on standardized tasks.
func NewModelBenchmark ¶
func NewModelBenchmark(name string, models []ModelConfig) *ModelBenchmark
NewModelBenchmark creates a new benchmark configured with the given models.
func (*ModelBenchmark) AnalyzeStrengths ¶
func (mb *ModelBenchmark) AnalyzeStrengths(model string) ([]string, []string)
AnalyzeStrengths returns (strengths, weaknesses) for the given model, based on which task categories the model excels at and which it fails.
func (*ModelBenchmark) Compare ¶
func (mb *ModelBenchmark) Compare() string
Compare produces a side-by-side model comparison string.
func (*ModelBenchmark) ExportCSV ¶
func (mb *ModelBenchmark) ExportCSV() string
ExportCSV exports benchmark results as CSV for external analysis.
func (*ModelBenchmark) RankModels ¶
func (mb *ModelBenchmark) RankModels(by string) []ModelResult
RankModels returns model results sorted by the given criterion.
func (*ModelBenchmark) RecommendModel ¶
func (mb *ModelBenchmark) RecommendModel(taskType string) string
RecommendModel recommends the best model for the given task type based on results.
func (*ModelBenchmark) RunAll ¶
func (mb *ModelBenchmark) RunAll(ctx context.Context, chatFn func(context.Context, ModelConfig, string) (string, int, float64, error)) error
RunAll executes every task for every model, repeating each task Runs times. chatFn is called with (ctx, model config, prompt) and returns (response, tokens, cost, error).
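A sketch of wiring RunAll to a provider call; the model entries are placeholders and the chat function is a stub where a real API request would go.

func ExampleModelBenchmark_RunAll() {
    mb := NewModelBenchmark("go-eval", []ModelConfig{
        {Name: "model-a", Provider: "provider-a", Model: "model-a-latest", Temperature: 0.2, MaxTokens: 2048},
        {Name: "model-b", Provider: "provider-b", Model: "model-b-latest", Temperature: 0.2, MaxTokens: 2048},
    })
    mb.Tasks = GoTasks().Tasks
    mb.Runs = 3

    chatFn := func(ctx context.Context, m ModelConfig, prompt string) (string, int, float64, error) {
        // Stub: issue the real provider request for m here.
        return "", 0, 0, nil
    }
    if err := mb.RunAll(context.Background(), chatFn); err != nil {
        log.Fatal(err)
    }
    fmt.Println(mb.Compare())
}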
type ModelConfig ¶
type ModelConfig struct {
    Name        string
    Provider    string
    Model       string
    Temperature float64
    MaxTokens   int
}
ModelConfig describes an LLM model configuration for benchmarking.
type ModelResult ¶
type ModelResult struct {
    Model       string
    PassRate    float64
    AvgTokens   int
    AvgCostUSD  float64
    AvgDuration time.Duration
    P50Duration time.Duration
    P95Duration time.Duration
    TaskResults []TaskResult
    Strengths   []string
    Weaknesses  []string
}
ModelResult holds aggregated benchmark results for a single model.
type PackageResult ¶
type PackageResult struct {
    Package  string
    Passed   int
    Failed   int
    Skipped  int
    Duration time.Duration
    Output   string
    Tests    []TestCaseResult
    Error    string
}
PackageResult holds the test results for a single Go package.
type ParallelRunner ¶
type ParallelRunner struct {
    MaxWorkers int
    Timeout    time.Duration
    Results    map[string]*PackageResult
    // contains filtered or unexported fields
}
ParallelRunner executes test packages concurrently using a worker pool.
func NewParallelRunner ¶
func NewParallelRunner(workers int) *ParallelRunner
NewParallelRunner creates a ParallelRunner with the specified worker count. If workers <= 0, the worker count defaults to runtime.NumCPU().
func (*ParallelRunner) GetFailed ¶
func (r *ParallelRunner) GetFailed() []TestCaseResult
GetFailed returns all failed test cases across all packages.
func (*ParallelRunner) GetSlowest ¶
func (r *ParallelRunner) GetSlowest(n int) []TestCaseResult
GetSlowest returns the n slowest test cases across all packages.
func (*ParallelRunner) RunAll ¶
func (r *ParallelRunner) RunAll(ctx context.Context, projectDir string) (*RunnerResult, error)
RunAll discovers all test packages under projectDir and runs them in parallel.
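A sketch of running a project's tests concurrently and then inspecting failures and slow tests; the worker count, timeout, and project path are illustrative.

func ExampleParallelRunner_RunAll() {
    r := NewParallelRunner(8) // <= 0 would fall back to runtime.NumCPU()
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
    defer cancel()

    result, err := r.RunAll(ctx, "./myproject")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(FormatResult(result))

    for _, tc := range r.GetFailed() {
        fmt.Printf("failed: %+v\n", tc)
    }
    for _, tc := range r.GetSlowest(5) {
        fmt.Printf("slow test: %+v\n", tc)
    }
}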
func (*ParallelRunner) RunPackages ¶
func (r *ParallelRunner) RunPackages(ctx context.Context, packages []string) (*RunnerResult, error)
RunPackages runs the specified packages concurrently using a worker pool.
func (*ParallelRunner) RunSinglePackage ¶
func (r *ParallelRunner) RunSinglePackage(ctx context.Context, pkg string) *PackageResult
RunSinglePackage runs `go test -v -json` for a single package and parses output.
type ResultHash ¶
type ResultHash struct {
    TasksHash  string `json:"tasks_hash"`
    PromptHash string `json:"prompt_hash"`
    GitCommit  string `json:"git_commit"`
    GoVersion  string `json:"go_version"`
    OS         string `json:"os"`
    Arch       string `json:"arch"`
}
ResultHash captures reproducibility information for a benchmark run.
func ComputeHash ¶
func ComputeHash(tasks []BenchmarkTask) *ResultHash
ComputeHash generates reproducibility hashes for a set of tasks.
type ResultStore ¶
type ResultStore struct {
    Dir string
}
ResultStore handles reading/writing eval results to disk.
func DefaultResultStore ¶
func DefaultResultStore() *ResultStore
DefaultResultStore returns a store at ~/.hawk/eval/results/.
func (*ResultStore) List ¶
func (s *ResultStore) List() ([]string, error)
List returns all result files in the store directory.
func (*ResultStore) Load ¶
func (s *ResultStore) Load(path string) (*StoredResult, error)
Load reads a stored result from a JSON file.
func (*ResultStore) Save ¶
func (s *ResultStore) Save(result *SuiteResult, model, provider string, hash *ResultHash) (string, error)
Save writes a SuiteResult to disk as JSON.
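A sketch of persisting a finished run together with its reproducibility hash; saveRun is a hypothetical helper, and the model/provider strings are placeholders.

// saveRun is a hypothetical helper: tasks are the executed task definitions
// and run is the SuiteResult returned by Runner.Run.
func saveRun(tasks []BenchmarkTask, run *SuiteResult) error {
    store := DefaultResultStore() // ~/.hawk/eval/results/
    path, err := store.Save(run, "model-x", "provider-y", ComputeHash(tasks))
    if err != nil {
        return err
    }
    fmt.Println("saved:", path)

    files, err := store.List()
    if err != nil {
        return err
    }
    fmt.Println(len(files), "stored results")
    return nil
}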
type ResultSummary ¶
type ResultSummary struct {
    TotalTasks    int     `json:"total_tasks"`
    Passed        int     `json:"passed"`
    Failed        int     `json:"failed"`
    PassRate      float64 `json:"pass_rate"`
    TotalDuration string  `json:"total_duration"`
    TotalTokens   int     `json:"total_tokens"`
    TotalCostUSD  float64 `json:"total_cost_usd"`
}
ResultSummary holds the top-level metrics for a stored result.
type Runner ¶
type Runner struct {
    Model       string
    Provider    string
    MaxAttempts int
    Timeout     time.Duration
    LLM         LLMClient
    Cache       *Cache
    NoCache     bool
    Filters     []Filter
}
Runner executes benchmark tasks against a specific model/provider.
func (*Runner) Run ¶
func (r *Runner) Run(ctx context.Context, suite *BenchmarkSuite) (*SuiteResult, error)
Run executes all tasks in a benchmark suite and returns aggregated results.
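A sketch of an end-to-end suite run; stubClient is a hypothetical LLMClient implementation and the model/provider names are placeholders.

// stubClient is a hypothetical LLMClient; a real implementation would call a
// provider API from Complete.
type stubClient struct{}

func (stubClient) Complete(ctx context.Context, model, prompt string) (string, int, float64, error) {
    return "package main", 0, 0, nil
}

func ExampleRunner_Run() {
    r := &Runner{
        Model:       "model-x",    // placeholder
        Provider:    "provider-y", // placeholder
        MaxAttempts: 2,
        Timeout:     2 * time.Minute,
        LLM:         stubClient{},
    }
    result, err := r.Run(context.Background(), GoTasks())
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(GenerateReport(result))
}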
func (*Runner) RunSingle ¶
func (r *Runner) RunSingle(ctx context.Context, task *BenchmarkTask) (*TaskResult, error)
RunSingle executes a single benchmark task in an isolated temporary directory.
type RunnerResult ¶
type RunnerResult struct {
    Packages     []PackageResult
    TotalPassed  int
    TotalFailed  int
    TotalSkipped int
    Duration     time.Duration
    Parallel     int
}
RunnerResult aggregates test results from all packages.
type StoredResult ¶
type StoredResult struct {
    Version   string             `json:"version"`
    Timestamp time.Time          `json:"timestamp"`
    Suite     string             `json:"suite"`
    Model     string             `json:"model"`
    Provider  string             `json:"provider"`
    Hash      *ResultHash        `json:"hash,omitempty"`
    Summary   ResultSummary      `json:"summary"`
    Tasks     []StoredTaskResult `json:"tasks"`
}
StoredResult is the persistent JSON format for eval results.
type StoredTaskResult ¶
type StoredTaskResult struct {
    TaskID   string   `json:"task_id"`
    Passed   bool     `json:"passed"`
    Duration string   `json:"duration"`
    Tokens   int      `json:"tokens"`
    CostUSD  float64  `json:"cost_usd"`
    Attempts int      `json:"attempts"`
    Error    string   `json:"error,omitempty"`
    Tags     []string `json:"tags,omitempty"`
}
StoredTaskResult is the per-task persistent format.
type SuiteResult ¶
type SuiteResult struct {
    Suite         string
    TotalTasks    int
    Passed        int
    Failed        int
    TotalDuration time.Duration
    TotalTokens   int
    TotalCostUSD  float64
    PassRate      float64
    Results       []TaskResult
}
SuiteResult aggregates results from running an entire benchmark suite.
type TaskGroup ¶
type TaskGroup struct {
    Name  string
    Tags  []string // tasks matching any of these tags belong to this group
    Tasks []BenchmarkTask
}
TaskGroup defines a named collection of tasks with aggregated metrics.
func DefaultGroups ¶
func DefaultGroups() []TaskGroup
DefaultGroups returns the standard task groupings.
func GroupTasks ¶
func GroupTasks(tasks []BenchmarkTask, groups []TaskGroup) []TaskGroup
GroupTasks assigns tasks to groups based on tag matching.
type TaskResult ¶
type TaskResult struct {
    TaskID     string
    Passed     bool
    Duration   time.Duration
    TokensUsed int
    CostUSD    float64
    Attempts   int
    Error      string
}
TaskResult captures the outcome of running a single benchmark task.
type TestCaseResult ¶
TestCaseResult represents the outcome of a single test case.
func ParseTestJSON ¶
func ParseTestJSON(output string) []TestCaseResult
ParseTestJSON parses `go test -json` output into structured test results.
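A small sketch feeding captured go test -json output to the parser; the two events below are hand-written samples of the standard event format.

func ExampleParseTestJSON() {
    output := `{"Action":"pass","Package":"example.com/m/util","Test":"TestAdd","Elapsed":0.01}
{"Action":"fail","Package":"example.com/m/util","Test":"TestDiv","Elapsed":0.02}`
    for _, tc := range ParseTestJSON(output) {
        fmt.Printf("%+v\n", tc)
    }
}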
type TestSuggestion ¶
type TestSuggestion struct {
    Function string
    File     string
    Priority string
    Reason   string
    Template string
}
TestSuggestion recommends a test to write for uncovered code.
func SuggestTests ¶
func SuggestTests(uncovered []string) []TestSuggestion
SuggestTests generates test suggestions for uncovered functions.
type YAMLTask ¶
type YAMLTask struct {
    Task        string            `yaml:"task"`
    Description string            `yaml:"description"`
    Language    string            `yaml:"language"`
    Tags        []string          `yaml:"tags"`
    Timeout     string            `yaml:"timeout"`
    MaxAttempts int               `yaml:"max_attempts"`
    Setup       string            `yaml:"setup"`
    Prompt      string            `yaml:"prompt"`
    Validate    []string          `yaml:"validate"`
    Files       map[string]string `yaml:"files"`
    Filters     []string          `yaml:"filters"`
}
YAMLTask is the declarative task definition format.
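A sketch of a declarative task file and loading it with LoadTasksFromYAML; the task content, directory name, and the assumption that validate holds shell commands are all illustrative. Field names follow the yaml tags above.

func ExampleLoadTasksFromYAML() {
    // Hypothetical task definition written to tasks/reverse.yaml.
    const doc = `task: reverse-string
description: Implement a string reversal helper
language: go
tags: [strings, basics]
timeout: 60s
max_attempts: 2
prompt: |
  Write a Go function Reverse(s string) string that reverses its input.
validate:
  - go vet ./...
  - go test ./...
`
    if err := os.MkdirAll("tasks", 0o755); err != nil {
        log.Fatal(err)
    }
    if err := os.WriteFile("tasks/reverse.yaml", []byte(doc), 0o644); err != nil {
        log.Fatal(err)
    }

    tasks, err := LoadTasksFromYAML("tasks")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(len(tasks), "task(s) loaded")
}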