bench

package
v0.1.0 Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: May 14, 2026 License: Apache-2.0 Imports: 5 Imported by: 0

Documentation

Overview

Package bench classifies agent tool calls into decision phases (see Phase, Timeline and Parse).

It also provides structured result storage and queries for infrastructure agent benchmark runs (PostgreSQL / pgx).

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

This section is empty.

Types

type BehaviorComparison

// BehaviorComparison pairs each metric as a Pass*/Fail* field so the two
// outcomes can be read side by side.
// NOTE(review): units for the duration and cost fields are not stated in this
// declaration — confirm against the code that fills it in.
type BehaviorComparison struct {
	PassAvgTurns    float64 `json:"pass_avg_turns"`
	FailAvgTurns    float64 `json:"fail_avg_turns"`
	PassAvgDuration float64 `json:"pass_avg_duration"`
	FailAvgDuration float64 `json:"fail_avg_duration"`
	PassAvgTokens   float64 `json:"pass_avg_tokens"`
	FailAvgTokens   float64 `json:"fail_avg_tokens"`
	PassAvgCost     float64 `json:"pass_avg_cost"`
	FailAvgCost     float64 `json:"fail_avg_cost"`
}

BehaviorComparison shows metric differences between pass and fail runs.

type CheckFailureStat

// CheckFailureStat is one row of per-check failure statistics; it is the
// element type of FailureInsights.CheckFailures.
// Message carries omitempty, so it is left out of the JSON when empty.
type CheckFailureStat struct {
	CheckName string  `json:"check_name"`
	CheckType string  `json:"check_type"`
	FailCount int     `json:"fail_count"`
	FailRate  float64 `json:"fail_rate"` // percentage of failed runs where this check failed
	Message   string  `json:"message,omitempty"`
}

CheckFailureStat shows how often a specific check fails.

type FailureInsights

// FailureInsights bundles run counts for one scenario (ScenarioID) with three
// breakdowns: per check (CheckFailures), per model (ModelBreakdown), and
// pass-versus-fail metrics (BehaviorMetrics).
// NOTE(review): presumably TotalRuns == FailedRuns + PassedRuns — confirm.
type FailureInsights struct {
	ScenarioID      string             `json:"scenario_id"`
	TotalRuns       int                `json:"total_runs"`
	FailedRuns      int                `json:"failed_runs"`
	PassedRuns      int                `json:"passed_runs"`
	CheckFailures   []CheckFailureStat `json:"check_failures"`
	ModelBreakdown  []ModelFailureStat `json:"model_breakdown"`
	BehaviorMetrics BehaviorComparison `json:"behavior_metrics"`
}

FailureInsights holds analyzed failure patterns for a scenario.

type LeaderboardEntry

// LeaderboardEntry is the per-model aggregate row.
//
// The last three fields belong together: PassK is the pass^k reliability
// score on a 0-100 scale, PassKTrials is the k it was computed with, and
// SufficientScenarios is how many scenarios had at least k trials.
// NOTE(review): the scale of PassRate and the units of the duration/cost
// fields are not stated here — confirm.
type LeaderboardEntry struct {
	Model               string  `json:"model"`
	Scenarios           int     `json:"scenarios"`
	Runs                int     `json:"runs"`
	PassRate            float64 `json:"pass_rate"`
	AvgDuration         float64 `json:"avg_duration"`
	AvgCost             float64 `json:"avg_cost"`
	TotalCost           float64 `json:"total_cost"`
	PassK               float64 `json:"pass_k"`               // pass^k reliability (0-100)
	PassKTrials         int     `json:"pass_k_trials"`        // k value used
	SufficientScenarios int     `json:"sufficient_scenarios"` // scenarios with >= k trials
}

LeaderboardEntry represents one model's aggregate benchmark performance.

type ModelFailureStat

// ModelFailureStat carries run counts for one model; it is the element type
// of FailureInsights.ModelBreakdown.
// NOTE(review): this declaration does not say whether Rate is a pass rate or
// a fail rate, nor its scale (0-1 vs 0-100) — confirm before relying on it.
type ModelFailureStat struct {
	Model  string  `json:"model"`
	Runs   int     `json:"runs"`
	Passed int     `json:"passed"`
	Failed int     `json:"failed"`
	Rate   float64 `json:"rate"`
}

ModelFailureStat shows pass/fail per model for a scenario.

type ModelMatrix

// ModelMatrix is a two-level lookup: Cells is indexed by scenario first and
// model second.
// NOTE(review): presumably the map keys are exactly the entries of Scenarios
// and Models, and missing pairs are simply absent — confirm.
type ModelMatrix struct {
	Models    []string                              `json:"models"`
	Scenarios []string                              `json:"scenarios"`
	Cells     map[string]map[string]ModelMatrixCell `json:"cells"` // [scenario][model]
}

ModelMatrix holds a comparison grid across models and scenarios.

type ModelMatrixCell

// ModelMatrixCell is the value type stored in ModelMatrix.Cells.
// NOTE(review): AvgTokens is an int while the other averages are float64;
// presumably the producer rounds or truncates — confirm.
type ModelMatrixCell struct {
	Runs        int     `json:"runs"`
	Passed      int     `json:"passed"`
	PassRate    float64 `json:"pass_rate"`
	AvgCost     float64 `json:"avg_cost"`
	AvgTokens   int     `json:"avg_tokens"`
	AvgDuration float64 `json:"avg_duration"`
}

ModelMatrixCell holds aggregate metrics for one scenario/model pair.

type Phase

// Phase is a string-typed label; its declared values are the Phase* constants.
type Phase string

Phase represents a step in the agent's decision-making process.

// Declared Phase values. Because Phase is a string type, these literals are
// also the serialized form, e.g. as the keys of Timeline.PhaseCount in JSON.
const (
	PhaseDiscover Phase = "discover"
	PhaseDiagnose Phase = "diagnose"
	PhaseDecide   Phase = "decide"
	PhaseAct      Phase = "act"
	PhaseVerify   Phase = "verify"
	PhaseExplain  Phase = "explain"
)

type Regression

// Regression identifies one scenario/model pair by ScenarioID and Model.
// The Latest* fields describe the most recent run and the Prev* fields
// summarize the runs before it.
// NOTE(review): the scale of PrevRate and the rule that picks Severity
// ("critical" vs "warning") are not visible here — confirm.
type Regression struct {
	ScenarioID   string  `json:"scenario_id"`
	Model        string  `json:"model"`
	LatestRunID  string  `json:"latest_run_id"`
	LatestPassed bool    `json:"latest_passed"`
	PrevPassed   int     `json:"prev_passed"`
	PrevTotal    int     `json:"prev_total"`
	PrevRate     float64 `json:"prev_rate"`
	Severity     string  `json:"severity"` // critical, warning
}

Regression describes a scenario/model pair where the latest run failed but previous runs had a positive pass rate.

type RunCatalog

// RunCatalog lists the values available for each filterable run attribute.
//
// The first four slices are always emitted in JSON. The remaining fields
// carry omitempty and are left out when empty.
// NOTE(review): presumably ToolServerVersionsByServer is keyed by tool server
// name and SkillVersionsByID by skill ID — confirm against the producer.
type RunCatalog struct {
	Models                     []string            `json:"models"`
	Providers                  []string            `json:"providers"`
	ToolServers                []string            `json:"tool_servers"`
	ToolServerVersions         []string            `json:"tool_server_versions"`
	ToolServerVersionsByServer map[string][]string `json:"tool_server_versions_by_server,omitempty"`
	SkillIDs                   []string            `json:"skill_ids,omitempty"`
	SkillVersions              []string            `json:"skill_versions,omitempty"`
	SkillVersionsByID          map[string][]string `json:"skill_versions_by_id,omitempty"`
}

RunCatalog holds distinct metadata values used for UI filters.

type RunFilters

// RunFilters has no JSON tags. Per the comment on Since, the handler parses
// request input into this struct and the store only consumes it.
//
// NOTE(review): several interactions are not visible in this declaration and
// should be confirmed against the store:
//   - how ScenarioID and ScenarioIDs combine when both are set;
//   - whether PassedOnly and FailedOnly may be set together;
//   - whether SkillUnset/ToolServerUnset override SkillID/ToolServer;
//   - which SortBy values are accepted and what Limit == 0 means.
type RunFilters struct {
	ScenarioID        string
	ScenarioIDs       []string
	Model             string
	Provider          string
	ToolServer        string
	ToolServerVersion string
	SkillID           string
	SkillVersion      string
	SkillUnset        bool // exact baseline/native-prompt runs where skill_id is empty
	ReportID          string
	ToolServerUnset   bool // exact baseline/native-tool runs where tool_server is empty
	PassedOnly        bool
	FailedOnly        bool
	Since             *time.Time // cutoff time — handler parses, store just uses
	Limit             int
	Offset            int
	SortBy            string // column to sort by
	SortOrder         string // asc or desc (default: desc)
	ExcludeErrors     bool   // exclude infra errors (exit_code < 0)
}

RunFilters specifies filters for listing runs.

type RunRecord

// RunRecord is one row of bench_runs.
//
// Units appear only in the JSON names: Duration is emitted as
// "duration_seconds" and EstimatedCost as "estimated_cost_usd".
// A negative ExitCode is what RunFilters.ExcludeErrors treats as an infra
// error. ChecksJSON, MetadataJSON and ArtifactDir carry omitempty and are
// left out of the JSON when empty.
// NOTE(review): ChecksJSON and MetadataJSON are plain strings; presumably
// they hold pre-encoded JSON documents — confirm their schema with the writer.
type RunRecord struct {
	ID                string    `json:"id"`
	TenantID          string    `json:"tenant_id"`
	ScenarioID        string    `json:"scenario_id"`
	Model             string    `json:"model"`
	Provider          string    `json:"provider"`
	Adapter           string    `json:"adapter"`
	ToolServer        string    `json:"tool_server"`         // MCP server used (empty = baseline/direct exec)
	ToolServerVersion string    `json:"tool_server_version"` // version of MCP server binary
	SkillID           string    `json:"skill_id"`            // skill prompt identity (empty = no skill)
	SkillVersion      string    `json:"skill_version"`       // skill prompt version
	SkillSource       string    `json:"skill_source"`        // local-file, local-temp, registry, etc.
	SkillSHA256       string    `json:"skill_sha256"`        // skill prompt content digest
	ScenarioVersion   string    `json:"scenario_version"`    // version/hash of scenario definition
	Passed            bool      `json:"passed"`
	Duration          float64   `json:"duration_seconds"`
	ExitCode          int       `json:"exit_code"`
	Turns             int       `json:"turns"`
	MemoryWindow      int       `json:"memory_window"`
	PromptTokens      int       `json:"prompt_tokens"`
	CompletionTokens  int       `json:"completion_tokens"`
	EstimatedCost     float64   `json:"estimated_cost_usd"`
	ChecksPassed      int       `json:"checks_passed"`
	ChecksTotal       int       `json:"checks_total"`
	ChecksJSON        string    `json:"checks_json,omitempty"`
	MetadataJSON      string    `json:"metadata_json,omitempty"`
	ArtifactDir       string    `json:"artifact_dir,omitempty"` // local filesystem path (bench runner only)
	CreatedAt         time.Time `json:"created_at"`
}

RunRecord represents a single benchmark run stored in bench_runs.

type ScenarioStat

// ScenarioStat is the element type of StatsResult.ByScenario: a run count and
// a passed count for one scenario ID.
type ScenarioStat struct {
	ScenarioID string `json:"scenario_id"`
	Runs       int    `json:"runs"`
	Passed     int    `json:"passed"`
}

ScenarioStat holds per-scenario stats.

type ScenarioSummary

// ScenarioSummary is the listing-level view of a scenario.
//
// Fields tagged omitempty (Description, Track, Level, Timeout, Skip) are left
// out of the JSON when zero-valued; the others are always emitted. Tags has
// no omitempty, so a nil slice is encoded as null rather than [].
// NOTE(review): Timeout is a string; its format (e.g. a Go duration string)
// is not visible here — confirm.
type ScenarioSummary struct {
	ID          string   `json:"id"`
	Title       string   `json:"title"`
	Description string   `json:"description,omitempty"`
	Category    string   `json:"category"`
	Track       string   `json:"track,omitempty"`
	Level       string   `json:"level,omitempty"`
	Timeout     string   `json:"timeout,omitempty"`
	Tags        []string `json:"tags"`
	Chaos       bool     `json:"chaos"`
	Skip        bool     `json:"skip,omitempty"`
}

ScenarioSummary holds metadata about a scenario for listing.

type SignalAggregation

// SignalAggregation reports how many runs were considered (TotalRuns), how
// many of them had a scorecard (RunsWithScorecard), and one SignalCount per
// entry in Signals.
// NOTE(review): presumably Signals is keyed by signal type name and AvgScore
// is averaged over the runs with a scorecard — confirm.
type SignalAggregation struct {
	TotalRuns         int                    `json:"total_runs"`
	RunsWithScorecard int                    `json:"runs_with_scorecard"`
	Signals           map[string]SignalCount `json:"signals"`
	AvgScore          float64                `json:"avg_score"`
}

SignalAggregation holds aggregated signal counts across runs.

type SignalCount

// SignalCount is the value type of SignalAggregation.Signals. Total sums
// every detection, while RunCount counts only the runs that had at least one.
type SignalCount struct {
	Total    int `json:"total"`     // total detections
	RunCount int `json:"run_count"` // runs where detected > 0
}

SignalCount holds detection stats for a single signal type.

type StatsResult

// StatsResult gives overall run counts plus a per-scenario breakdown in
// ByScenario.
// NOTE(review): presumably TotalRuns == PassCount + FailCount — confirm.
type StatsResult struct {
	TotalRuns  int            `json:"total_runs"`
	PassCount  int            `json:"pass_count"`
	FailCount  int            `json:"fail_count"`
	ByScenario []ScenarioStat `json:"by_scenario"`
}

StatsResult holds aggregate run statistics.

type Timeline

// Timeline is the result type returned by Parse: the ordered Steps plus
// summary counters derived from them.
// PhaseCount is keyed by Phase, so its JSON keys are the Phase string values.
// NOTE(review): presumably TotalSteps equals len(Steps) — confirm.
type Timeline struct {
	Steps         []TimelineStep `json:"steps"`
	PhaseCount    map[Phase]int  `json:"phase_count"`
	MutationCount int            `json:"mutation_count"`
	TotalSteps    int            `json:"total_steps"`
	// DiagnosisDepth counts diagnose-phase steps, which by construction
	// only occur before the first mutation.
	DiagnosisDepth int `json:"diagnosis_depth"`
}

Timeline is the full classified sequence of agent actions.

func Parse

// Parse takes the recorded tool calls and returns a pointer to the resulting
// Timeline.
// NOTE(review): only the signature is shown here; behavior for a nil or empty
// calls slice (nil result vs. empty Timeline) must be confirmed in the source.
func Parse(calls []ToolCall) *Timeline

Parse classifies a sequence of tool calls into a decision timeline.

type TimelineStep

// TimelineStep is the element type of Timeline.Steps.
// Resource and Namespace carry omitempty and are left out of the JSON when
// empty; every other field is always emitted.
// NOTE(review): whether Index is 0- or 1-based, and how Operation, Command
// and Summary are derived from a ToolCall, is not visible here — confirm.
type TimelineStep struct {
	Index     int    `json:"index"`
	Phase     Phase  `json:"phase"`
	Tool      string `json:"tool"`
	Operation string `json:"operation"`
	Command   string `json:"command"`
	Resource  string `json:"resource,omitempty"`
	Namespace string `json:"namespace,omitempty"`
	ExitCode  int    `json:"exit_code"`
	Summary   string `json:"summary"`
}

TimelineStep is a single classified step in the decision timeline.

type ToolCall

// ToolCall is the input element type of Parse.
// Args is a json.RawMessage, so the arguments stay as undecoded JSON bytes
// when a ToolCall is unmarshaled. Timestamp is a plain string, not a
// time.Time.
// NOTE(review): the timestamp format is not visible here — confirm.
type ToolCall struct {
	Tool      string          `json:"tool"`
	Args      json.RawMessage `json:"args"`
	Result    string          `json:"result"`
	Timestamp string          `json:"timestamp"`
}

ToolCall matches the structure in tool-calls.json artifacts.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL