reporter

package
v0.8.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 29, 2026 License: MIT Imports: 6 Imported by: 0

Documentation

Overview

Package reporter writes benchmark results to disk as JSON and Markdown. The JSON file is machine-readable for post-processing and leaderboard submission. The Markdown file is the human-readable report.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func Timestamp

func Timestamp() string

Timestamp returns the current UTC time as a compact string for filenames.

Types

type ContextBenchLangResult

// ContextBenchLangResult holds the ContextBench metrics for one language.
//
// Every field is exported and tagged, so a value serializes to a JSON object
// with the keys language, tasks, avg_precision, avg_recall and avg_f1.
//
// NOTE(review): Tasks is presumably the number of tasks scored for Language,
// and the Avg* fields the means over those tasks — confirm against the code
// that fills this struct. The scale of the scores (fraction vs. percent) is
// not visible from the declaration.
type ContextBenchLangResult struct {
	Language     string  `json:"language"`
	Tasks        int     `json:"tasks"`
	AvgPrecision float64 `json:"avg_precision"`
	AvgRecall    float64 `json:"avg_recall"`
	AvgF1        float64 `json:"avg_f1"`
}

ContextBenchLangResult holds per-language metrics.

type ContextBenchResult

// ContextBenchResult holds the full results of a ContextBench run.
//
// The top-level AvgPrecision, AvgRecall and AvgF1 sit next to PerLanguage,
// which carries the same three metrics broken down per language.
//
// TaskResults is emitted under the JSON key "tasks", not "task_results". It is
// typed []interface{} rather than with a concrete element type; the elements
// are expected to be ContextBenchTaskResult values from the benchmarks package.
// NOTE(review): the untyped slice is presumably there to keep this package
// from importing benchmarks — confirm.
//
// If marshaled with encoding/json, a nil PerLanguage or TaskResults is written
// as null rather than [].
type ContextBenchResult struct {
	Timestamp    string                   `json:"timestamp"`
	TotalTasks   int                      `json:"total_tasks"`
	AvgPrecision float64                  `json:"avg_precision"`
	AvgRecall    float64                  `json:"avg_recall"`
	AvgF1        float64                  `json:"avg_f1"`
	PerLanguage  []ContextBenchLangResult `json:"per_language"`
	TaskResults  []interface{}            `json:"tasks"` // []ContextBenchTaskResult from benchmarks pkg
}

ContextBenchResult holds the full results of a ContextBench run.

type FeatureBenchReport

// FeatureBenchReport holds aggregated FeatureBench results.
//
// It is declared in this package rather than in the benchmarks package to
// avoid a circular import, which is also why Tasks is an untyped
// []interface{} instead of a slice of a benchmarks type.
//
// NOTE(review): the following readings are inferred from field names only and
// should be confirmed against the producer of this struct:
//   - PatchRate is presumably PatchCount / TotalTasks;
//   - ToolUsage presumably maps a tool name to the number of times it was used.
//
// If marshaled with encoding/json, a nil ToolUsage or Tasks is written as null.
type FeatureBenchReport struct {
	Timestamp  string         `json:"timestamp"`
	Mode       string         `json:"mode"`
	Model      string         `json:"model"`
	TotalTasks int            `json:"total_tasks"`
	PatchCount int            `json:"patch_count"`
	PatchRate  float64        `json:"patch_rate"`
	AvgTurns   float64        `json:"avg_turns"`
	ToolUsage  map[string]int `json:"tool_usage"`
	Tasks      []interface{}  `json:"tasks"`
}

FeatureBenchReport holds aggregated FeatureBench results. It is defined here to avoid a circular import with the benchmarks package.

type GraphBenchMetrics

// GraphBenchMetrics holds precision, recall and F1 scores.
//
// Despite the GraphBench prefix, the type is shared: GraphBenchResult,
// GraphBenchSlice and NLBenchResult all embed it as a field.
// It serializes to a JSON object with the keys precision, recall and f1.
//
// NOTE(review): the scale of the scores (fraction vs. percent) is not visible
// from the declaration.
type GraphBenchMetrics struct {
	Precision float64 `json:"precision"`
	Recall    float64 `json:"recall"`
	F1        float64 `json:"f1"`
}

GraphBenchMetrics holds P/R/F1 scores.

type GraphBenchResult

// GraphBenchResult holds the full results of a GraphBench run.
//
// Summary carries the overall metrics; ByQueryType and ByLanguage carry the
// same metrics broken down along one dimension each, one GraphBenchSlice per
// value of that dimension.
//
// TestResults is emitted under the JSON key "tests" and is an untyped
// []interface{}. NOTE(review): the concrete element type is not visible here;
// presumably a per-test result type from the benchmarks package — confirm.
//
// The field set and JSON keys are identical to those of NLBenchResult.
type GraphBenchResult struct {
	Timestamp   string            `json:"timestamp"`
	TotalTests  int               `json:"total_tests"`
	ErrorCount  int               `json:"error_count"`
	Summary     GraphBenchMetrics `json:"summary"`
	ByQueryType []GraphBenchSlice `json:"by_query_type"`
	ByLanguage  []GraphBenchSlice `json:"by_language"`
	TestResults []interface{}     `json:"tests"`
}

GraphBenchResult holds the full results of a GraphBench run.

type GraphBenchSlice

// GraphBenchSlice is the breakdown of a run along one dimension (query type
// or language).
//
// Label names the value of that dimension, Tests is the test count for the
// slice, and Metrics holds its precision/recall/F1. The struct itself does not
// record which dimension Label belongs to; that is implied by the field that
// holds the slice (ByQueryType or ByLanguage on the result types).
type GraphBenchSlice struct {
	Label   string            `json:"label"`
	Tests   int               `json:"tests"`
	Metrics GraphBenchMetrics `json:"metrics"`
}

GraphBenchSlice is a breakdown by one dimension (query_type or language).

type NLBenchResult

// NLBenchResult holds the full results of an NLBench run.
//
// It reuses GraphBenchMetrics and GraphBenchSlice so that its layout and JSON
// keys match GraphBenchResult field for field.
//
// TestResults is emitted under the JSON key "tests" and is an untyped
// []interface{}. NOTE(review): the concrete element type is not visible here —
// confirm against the code that fills it.
type NLBenchResult struct {
	Timestamp   string            `json:"timestamp"`
	TotalTests  int               `json:"total_tests"`
	ErrorCount  int               `json:"error_count"`
	Summary     GraphBenchMetrics `json:"summary"`
	ByQueryType []GraphBenchSlice `json:"by_query_type"`
	ByLanguage  []GraphBenchSlice `json:"by_language"`
	TestResults []interface{}     `json:"tests"`
}

NLBenchResult holds the full results of an NLBench run. Reuses GraphBenchMetrics and GraphBenchSlice for consistency.

type RepoBenchConfig

// RepoBenchConfig holds the results for one config×difficulty combination of
// a RepoBench-R run.
//
// AccAtK is keyed by the integer k. If marshaled with encoding/json, integer
// map keys are written as JSON object keys, i.e. as strings such as "1" or "5".
//
// AvgRank is emitted under the JSON key "avg_gold_rank", which differs from the
// Go field name. NOTE(review): presumably the mean rank at which the gold
// item was retrieved — confirm against the producer.
type RepoBenchConfig struct {
	Config     string          `json:"config"`     // e.g. "python_cff"
	Difficulty string          `json:"difficulty"` // "easy" | "hard"
	Samples    int             `json:"samples"`
	AccAtK     map[int]float64 `json:"acc_at_k"` // k → accuracy
	AvgRank    float64         `json:"avg_gold_rank"`
}

RepoBenchConfig holds results for one config×difficulty combination.

type RepoBenchResult

// RepoBenchResult holds the full results of a RepoBench-R run.
//
// Configs carries one RepoBenchConfig per config×difficulty combination, and
// Summary aggregates across all of them.
//
// NOTE(review): the set of valid RetrievalMode values is not visible from the
// declaration.
type RepoBenchResult struct {
	Timestamp     string            `json:"timestamp"`
	RetrievalMode string            `json:"retrieval_mode"`
	Configs       []RepoBenchConfig `json:"configs"`
	Summary       RepoBenchSummary  `json:"summary"`
}

RepoBenchResult holds the full results of a RepoBench-R run.

type RepoBenchSummary

// RepoBenchSummary aggregates RepoBench-R results across all configs.
//
// AccAtK is keyed by the integer k and holds the macro-average across configs.
// If marshaled with encoding/json, the integer keys are written as JSON object
// keys, i.e. as strings.
//
// NOTE(review): TotalSamples is presumably the sum of RepoBenchConfig.Samples
// over all configs — confirm against the producer.
type RepoBenchSummary struct {
	TotalSamples int             `json:"total_samples"`
	AccAtK       map[int]float64 `json:"acc_at_k"` // macro-average across configs
}

RepoBenchSummary aggregates across all configs.

type Reporter

// Reporter writes benchmark results to an output directory.
//
// It has no exported fields; obtain one with New(dir), then use the Write*
// methods to write results and the Print*Summary methods to print a summary.
type Reporter struct {
	// contains filtered or unexported fields
}

Reporter writes benchmark results to an output directory.

func New

func New(dir string) *Reporter

New creates a reporter that writes files into dir.

func (*Reporter) PrintContextBenchSummary

func (r *Reporter) PrintContextBenchSummary(result *ContextBenchResult)

PrintContextBenchSummary prints a compact summary to stdout.

func (*Reporter) PrintFeatureBenchSummary

func (r *Reporter) PrintFeatureBenchSummary(result *FeatureBenchReport)

PrintFeatureBenchSummary prints a console summary.

func (*Reporter) PrintGraphBenchSummary

func (r *Reporter) PrintGraphBenchSummary(result *GraphBenchResult)

PrintGraphBenchSummary prints a compact summary to stdout.

func (*Reporter) PrintNLBenchSummary

func (r *Reporter) PrintNLBenchSummary(result *NLBenchResult)

PrintNLBenchSummary prints a compact summary to stdout.

func (*Reporter) PrintRepoBenchSummary

func (r *Reporter) PrintRepoBenchSummary(result *RepoBenchResult)

PrintRepoBenchSummary prints a compact summary to stdout.

func (*Reporter) PrintSWEBenchSummary

func (r *Reporter) PrintSWEBenchSummary(result *SWEBenchResult)

PrintSWEBenchSummary prints a console summary table.

func (*Reporter) WriteContextBench

func (r *Reporter) WriteContextBench(result *ContextBenchResult) error

WriteContextBench writes JSON + Markdown results for a ContextBench run.

func (*Reporter) WriteFeatureBench

func (r *Reporter) WriteFeatureBench(result *FeatureBenchReport) error

WriteFeatureBench writes JSON + Markdown results for a FeatureBench run.

func (*Reporter) WriteGraphBench

func (r *Reporter) WriteGraphBench(result *GraphBenchResult) error

WriteGraphBench writes JSON + Markdown results for a GraphBench run.

func (*Reporter) WriteNLBench

func (r *Reporter) WriteNLBench(result *NLBenchResult) error

WriteNLBench writes JSON + Markdown results for an NLBench run.

func (*Reporter) WriteRepoBench

func (r *Reporter) WriteRepoBench(result *RepoBenchResult) error

WriteRepoBench writes JSON + Markdown results for a RepoBench-R run.

func (*Reporter) WriteSWEBench

func (r *Reporter) WriteSWEBench(result *SWEBenchResult) error

WriteSWEBench writes JSON + Markdown results for a SWE-bench run.

type SWEBenchResult

// SWEBenchResult holds the full results of a SWE-bench run.
//
// Both counts and rates are stored: PassCount/PassRate and
// PatchCount/PatchRate. AvgTokens is an int, unlike AvgTurns, so a fractional
// average token count cannot be represented.
//
// Tasks is an untyped []interface{}. NOTE(review): the concrete element type
// is not visible here; presumably a per-task result type from the benchmarks
// package — confirm.
//
// NOTE(review): the following readings are inferred from field names only and
// should be confirmed against the producer of this struct:
//   - PassRate and PatchRate are presumably the counts divided by TotalTasks;
//   - ToolUsage presumably maps a tool name to the number of times it was used;
//   - ToolContribRate has no definition visible in this declaration.
type SWEBenchResult struct {
	Timestamp       string         `json:"timestamp"`
	Mode            string         `json:"mode"`
	Model           string         `json:"model"`
	TotalTasks      int            `json:"total_tasks"`
	PassCount       int            `json:"pass_count"`
	PatchCount      int            `json:"patch_count"`
	PassRate        float64        `json:"pass_rate"`
	PatchRate       float64        `json:"patch_rate"`
	AvgTurns        float64        `json:"avg_turns"`
	AvgTokens       int            `json:"avg_tokens"`
	ToolContribRate float64        `json:"tool_contrib_rate"`
	ToolUsage       map[string]int `json:"tool_usage"`
	Tasks           []interface{}  `json:"tasks"`
}

SWEBenchResult holds the full results of a SWE-bench run.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL