Documentation
¶
Overview ¶
Package reporter writes benchmark results to disk as JSON and Markdown. The JSON file is machine-readable for post-processing and leaderboard submission. The Markdown file is the human-readable report.
Index ¶
- func Timestamp() string
- type ContextBenchLangResult
- type ContextBenchResult
- type FeatureBenchReport
- type GraphBenchMetrics
- type GraphBenchResult
- type GraphBenchSlice
- type NLBenchResult
- type RepoBenchConfig
- type RepoBenchResult
- type RepoBenchSummary
- type Reporter
- func (r *Reporter) PrintContextBenchSummary(result *ContextBenchResult)
- func (r *Reporter) PrintFeatureBenchSummary(result *FeatureBenchReport)
- func (r *Reporter) PrintGraphBenchSummary(result *GraphBenchResult)
- func (r *Reporter) PrintNLBenchSummary(result *NLBenchResult)
- func (r *Reporter) PrintRepoBenchSummary(result *RepoBenchResult)
- func (r *Reporter) PrintSWEBenchSummary(result *SWEBenchResult)
- func (r *Reporter) WriteContextBench(result *ContextBenchResult) error
- func (r *Reporter) WriteFeatureBench(result *FeatureBenchReport) error
- func (r *Reporter) WriteGraphBench(result *GraphBenchResult) error
- func (r *Reporter) WriteNLBench(result *NLBenchResult) error
- func (r *Reporter) WriteRepoBench(result *RepoBenchResult) error
- func (r *Reporter) WriteSWEBench(result *SWEBenchResult) error
- type SWEBenchResult
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
Types ¶
type ContextBenchLangResult ¶
type ContextBenchLangResult struct {
Language string `json:"language"`
Tasks int `json:"tasks"`
AvgPrecision float64 `json:"avg_precision"`
AvgRecall float64 `json:"avg_recall"`
AvgF1 float64 `json:"avg_f1"`
}
ContextBenchLangResult holds aggregate precision, recall, and F1 metrics for a single language, along with the number of tasks evaluated.
type ContextBenchResult ¶
type ContextBenchResult struct {
Timestamp string `json:"timestamp"`
TotalTasks int `json:"total_tasks"`
AvgPrecision float64 `json:"avg_precision"`
AvgRecall float64 `json:"avg_recall"`
AvgF1 float64 `json:"avg_f1"`
PerLanguage []ContextBenchLangResult `json:"per_language"`
TaskResults []interface{} `json:"tasks"` // []ContextBenchTaskResult from benchmarks pkg
}
ContextBenchResult holds the full results of a ContextBench run.
type FeatureBenchReport ¶
type FeatureBenchReport struct {
Timestamp string `json:"timestamp"`
Mode string `json:"mode"`
Model string `json:"model"`
TotalTasks int `json:"total_tasks"`
PatchCount int `json:"patch_count"`
PatchRate float64 `json:"patch_rate"`
AvgTurns float64 `json:"avg_turns"`
ToolUsage map[string]int `json:"tool_usage"`
Tasks []interface{} `json:"tasks"`
}
FeatureBenchReport holds aggregated FeatureBench results. Defined here to avoid a circular import with the benchmarks package.
type GraphBenchMetrics ¶
type GraphBenchMetrics struct {
Precision float64 `json:"precision"`
Recall float64 `json:"recall"`
F1 float64 `json:"f1"`
}
GraphBenchMetrics holds P/R/F1 scores.
type GraphBenchResult ¶
type GraphBenchResult struct {
Timestamp string `json:"timestamp"`
TotalTests int `json:"total_tests"`
ErrorCount int `json:"error_count"`
Summary GraphBenchMetrics `json:"summary"`
ByQueryType []GraphBenchSlice `json:"by_query_type"`
ByLanguage []GraphBenchSlice `json:"by_language"`
TestResults []interface{} `json:"tests"`
}
GraphBenchResult holds the full results of a GraphBench run.
type GraphBenchSlice ¶
type GraphBenchSlice struct {
Label string `json:"label"`
Tests int `json:"tests"`
Metrics GraphBenchMetrics `json:"metrics"`
}
GraphBenchSlice is a breakdown by one dimension (query_type or language).
type NLBenchResult ¶
type NLBenchResult struct {
Timestamp string `json:"timestamp"`
TotalTests int `json:"total_tests"`
ErrorCount int `json:"error_count"`
Summary GraphBenchMetrics `json:"summary"`
ByQueryType []GraphBenchSlice `json:"by_query_type"`
ByLanguage []GraphBenchSlice `json:"by_language"`
TestResults []interface{} `json:"tests"`
}
NLBenchResult holds the full results of an NLBench run. Reuses GraphBenchMetrics and GraphBenchSlice for consistency.
type RepoBenchConfig ¶
type RepoBenchConfig struct {
Config string `json:"config"` // e.g. "python_cff"
Difficulty string `json:"difficulty"` // "easy" | "hard"
Samples int `json:"samples"`
AccAtK map[int]float64 `json:"acc_at_k"` // k → accuracy
AvgRank float64 `json:"avg_gold_rank"`
}
RepoBenchConfig holds results for one config×difficulty combination.
type RepoBenchResult ¶
type RepoBenchResult struct {
Timestamp string `json:"timestamp"`
RetrievalMode string `json:"retrieval_mode"`
Configs []RepoBenchConfig `json:"configs"`
Summary RepoBenchSummary `json:"summary"`
}
RepoBenchResult holds the full results of a RepoBench-R run.
type RepoBenchSummary ¶
type RepoBenchSummary struct {
TotalSamples int `json:"total_samples"`
AccAtK map[int]float64 `json:"acc_at_k"` // macro-average across configs
}
RepoBenchSummary aggregates across all configs.
type Reporter ¶
type Reporter struct {
// contains filtered or unexported fields
}
Reporter writes benchmark results to an output directory.
func (*Reporter) PrintContextBenchSummary ¶
func (r *Reporter) PrintContextBenchSummary(result *ContextBenchResult)
PrintContextBenchSummary prints a compact summary to stdout.
func (*Reporter) PrintFeatureBenchSummary ¶
func (r *Reporter) PrintFeatureBenchSummary(result *FeatureBenchReport)
PrintFeatureBenchSummary prints a console summary.
func (*Reporter) PrintGraphBenchSummary ¶
func (r *Reporter) PrintGraphBenchSummary(result *GraphBenchResult)
PrintGraphBenchSummary prints a compact summary to stdout.
func (*Reporter) PrintNLBenchSummary ¶
func (r *Reporter) PrintNLBenchSummary(result *NLBenchResult)
PrintNLBenchSummary prints a compact summary to stdout.
func (*Reporter) PrintRepoBenchSummary ¶
func (r *Reporter) PrintRepoBenchSummary(result *RepoBenchResult)
PrintRepoBenchSummary prints a compact summary to stdout.
func (*Reporter) PrintSWEBenchSummary ¶
func (r *Reporter) PrintSWEBenchSummary(result *SWEBenchResult)
PrintSWEBenchSummary prints a console summary table.
func (*Reporter) WriteContextBench ¶
func (r *Reporter) WriteContextBench(result *ContextBenchResult) error
WriteContextBench writes JSON + Markdown results for a ContextBench run.
func (*Reporter) WriteFeatureBench ¶
func (r *Reporter) WriteFeatureBench(result *FeatureBenchReport) error
WriteFeatureBench writes JSON + Markdown results.
func (*Reporter) WriteGraphBench ¶
func (r *Reporter) WriteGraphBench(result *GraphBenchResult) error
WriteGraphBench writes JSON + Markdown results for a GraphBench run.
func (*Reporter) WriteNLBench ¶
func (r *Reporter) WriteNLBench(result *NLBenchResult) error
WriteNLBench writes JSON + Markdown results for an NLBench run.
func (*Reporter) WriteRepoBench ¶
func (r *Reporter) WriteRepoBench(result *RepoBenchResult) error
WriteRepoBench writes JSON + Markdown results for a RepoBench-R run.
func (*Reporter) WriteSWEBench ¶
func (r *Reporter) WriteSWEBench(result *SWEBenchResult) error
WriteSWEBench writes JSON + Markdown results for a SWE-bench run.
type SWEBenchResult ¶
type SWEBenchResult struct {
Timestamp string `json:"timestamp"`
Mode string `json:"mode"`
Model string `json:"model"`
TotalTasks int `json:"total_tasks"`
PassCount int `json:"pass_count"`
PatchCount int `json:"patch_count"`
PassRate float64 `json:"pass_rate"`
PatchRate float64 `json:"patch_rate"`
AvgTurns float64 `json:"avg_turns"`
AvgTokens int `json:"avg_tokens"`
ToolContribRate float64 `json:"tool_contrib_rate"`
ToolUsage map[string]int `json:"tool_usage"`
Tasks []interface{} `json:"tasks"`
}
SWEBenchResult holds the full results of a SWE-bench run.