Documentation
¶
Overview ¶
Package timeline classifies agent tool calls into decision phases.
Package bench provides structured result storage and queries for infrastructure agent benchmark runs (PostgreSQL / pgx).
Index ¶
- type BehaviorComparison
- type CheckFailureStat
- type FailureInsights
- type LeaderboardEntry
- type ModelFailureStat
- type ModelMatrix
- type ModelMatrixCell
- type Phase
- type Regression
- type RunCatalog
- type RunFilters
- type RunRecord
- type ScenarioStat
- type ScenarioSummary
- type SignalAggregation
- type SignalCount
- type StatsResult
- type Timeline
- type TimelineStep
- type ToolCall
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
This section is empty.
Types ¶
type BehaviorComparison ¶
// BehaviorComparison pairs each averaged metric for passing runs with the
// same metric for failing runs, so the two outcomes can be compared side by
// side. Every field is a float64 average; units are not declared on this type.
// NOTE(review): durations are presumably seconds and costs presumably USD,
// mirroring RunRecord's duration_seconds / estimated_cost_usd tags - confirm
// against the query that populates this struct.
type BehaviorComparison struct {
// Average number of turns, for passing and failing runs respectively.
PassAvgTurns float64 `json:"pass_avg_turns"`
FailAvgTurns float64 `json:"fail_avg_turns"`
// Average run duration, for passing and failing runs respectively.
PassAvgDuration float64 `json:"pass_avg_duration"`
FailAvgDuration float64 `json:"fail_avg_duration"`
// Average token usage, for passing and failing runs respectively.
// NOTE(review): unclear whether this is prompt, completion, or both - confirm.
PassAvgTokens float64 `json:"pass_avg_tokens"`
FailAvgTokens float64 `json:"fail_avg_tokens"`
// Average cost, for passing and failing runs respectively.
PassAvgCost float64 `json:"pass_avg_cost"`
FailAvgCost float64 `json:"fail_avg_cost"`
}
BehaviorComparison shows metric differences between pass and fail runs.
type CheckFailureStat ¶
// CheckFailureStat reports how often one specific check fails.
type CheckFailureStat struct {
// CheckName and CheckType together identify the check being counted.
CheckName string `json:"check_name"`
CheckType string `json:"check_type"`
// FailCount is the number of failures counted for this check.
FailCount int `json:"fail_count"`
FailRate float64 `json:"fail_rate"` // percentage of failed runs where this check failed
// Message is optional; it is dropped from the JSON output when empty.
// NOTE(review): presumably a representative failure message - confirm.
Message string `json:"message,omitempty"`
}
CheckFailureStat shows how often a specific check fails.
type FailureInsights ¶
// FailureInsights bundles the analyzed failure patterns for one scenario:
// overall run counts, a per-check failure list, a per-model breakdown, and a
// pass-versus-fail metric comparison.
type FailureInsights struct {
// ScenarioID names the scenario these insights describe.
ScenarioID string `json:"scenario_id"`
// Run counts for the scenario.
// NOTE(review): presumably TotalRuns == FailedRuns + PassedRuns - confirm.
TotalRuns int `json:"total_runs"`
FailedRuns int `json:"failed_runs"`
PassedRuns int `json:"passed_runs"`
// CheckFailures holds one entry per check with failure statistics.
CheckFailures []CheckFailureStat `json:"check_failures"`
// ModelBreakdown holds one entry per model with pass/fail statistics.
ModelBreakdown []ModelFailureStat `json:"model_breakdown"`
// BehaviorMetrics compares averaged metrics of passing and failing runs.
BehaviorMetrics BehaviorComparison `json:"behavior_metrics"`
}
FailureInsights holds analyzed failure patterns for a scenario.
type LeaderboardEntry ¶
// LeaderboardEntry is one model's aggregate benchmark performance, i.e. one
// row of a leaderboard.
type LeaderboardEntry struct {
// Model identifies the model this row summarizes.
Model string `json:"model"`
// Scenarios and Runs are counts for this model.
// NOTE(review): Scenarios is presumably the number of distinct scenarios
// the model was run against - confirm.
Scenarios int `json:"scenarios"`
Runs int `json:"runs"`
// NOTE(review): the scale of PassRate (0-1 vs 0-100) is not stated here;
// PassK below is documented as 0-100 - confirm they agree.
PassRate float64 `json:"pass_rate"`
// Per-run averages and the summed cost for this model. Units are not
// declared on this type.
AvgDuration float64 `json:"avg_duration"`
AvgCost float64 `json:"avg_cost"`
TotalCost float64 `json:"total_cost"`
PassK float64 `json:"pass_k"` // pass^k reliability (0-100)
PassKTrials int `json:"pass_k_trials"` // k value used
SufficientScenarios int `json:"sufficient_scenarios"` // scenarios with >= k trials
}
LeaderboardEntry represents one model's aggregate benchmark performance.
type ModelFailureStat ¶
// ModelFailureStat gives the pass/fail breakdown of one model for a scenario.
type ModelFailureStat struct {
// Model identifies the model these counts belong to.
Model string `json:"model"`
// Run counts for the model.
// NOTE(review): presumably Runs == Passed + Failed - confirm.
Runs int `json:"runs"`
Passed int `json:"passed"`
Failed int `json:"failed"`
// NOTE(review): the declaration does not say whether Rate is a pass rate or
// a failure rate, nor its scale (0-1 vs 0-100) - confirm before relying on it.
Rate float64 `json:"rate"`
}
ModelFailureStat shows the run, pass, and fail counts, plus a rate, for one model within a scenario.
type ModelMatrix ¶
// ModelMatrix is a comparison grid across models and scenarios. Models and
// Scenarios list the two axes; Cells holds the per-pair metrics.
type ModelMatrix struct {
// Models and Scenarios name the axes of the grid.
Models []string `json:"models"`
Scenarios []string `json:"scenarios"`
// Cells is a two-level map: the outer key is the scenario, the inner key is
// the model.
// NOTE(review): whether every scenario/model pair is guaranteed to have a
// cell is not shown here - check for missing keys when reading.
Cells map[string]map[string]ModelMatrixCell `json:"cells"` // [scenario][model]
}
ModelMatrix holds a comparison grid across models and scenarios.
type ModelMatrixCell ¶
// ModelMatrixCell holds the aggregate metrics for one scenario/model pair,
// i.e. a single cell of ModelMatrix.
type ModelMatrixCell struct {
// Runs is the run count for the pair and Passed the number that passed.
Runs int `json:"runs"`
Passed int `json:"passed"`
// NOTE(review): scale of PassRate (0-1 vs 0-100) is not stated here - confirm.
PassRate float64 `json:"pass_rate"`
// Per-run averages. Note that AvgTokens is an int while the other averages
// are float64, so any fractional part of the token average is not
// representable in this field.
AvgCost float64 `json:"avg_cost"`
AvgTokens int `json:"avg_tokens"`
AvgDuration float64 `json:"avg_duration"`
}
ModelMatrixCell holds aggregate metrics for one scenario/model pair.
type Regression ¶
// Regression describes a scenario/model pair whose most recent run is being
// compared against the history of its previous runs.
type Regression struct {
// ScenarioID and Model identify the pair being reported.
ScenarioID string `json:"scenario_id"`
Model string `json:"model"`
// LatestRunID is the ID of the most recent run for the pair and
// LatestPassed is that run's outcome.
LatestRunID string `json:"latest_run_id"`
LatestPassed bool `json:"latest_passed"`
// PrevPassed, PrevTotal and PrevRate summarize the runs before the latest.
// NOTE(review): PrevRate is presumably derived from PrevPassed/PrevTotal;
// its scale (0-1 vs 0-100) is not stated here - confirm.
PrevPassed int `json:"prev_passed"`
PrevTotal int `json:"prev_total"`
PrevRate float64 `json:"prev_rate"`
Severity string `json:"severity"` // critical, warning
}
Regression describes a scenario/model pair where the latest run failed but previous runs had a positive pass rate.
type RunCatalog ¶
// RunCatalog lists the distinct metadata values seen across runs, intended to
// populate UI filter controls.
type RunCatalog struct {
// Flat lists of distinct values; these are always present in the JSON output.
Models []string `json:"models"`
Providers []string `json:"providers"`
ToolServers []string `json:"tool_servers"`
ToolServerVersions []string `json:"tool_server_versions"`
// ToolServerVersionsByServer groups versions under a string key; omitted
// from JSON when empty.
// NOTE(review): the key is presumably the tool server name - confirm.
ToolServerVersionsByServer map[string][]string `json:"tool_server_versions_by_server,omitempty"`
// Skill-related lists; each is omitted from JSON when empty.
SkillIDs []string `json:"skill_ids,omitempty"`
SkillVersions []string `json:"skill_versions,omitempty"`
// SkillVersionsByID groups versions under a string key; omitted from JSON
// when empty.
// NOTE(review): the key is presumably the skill ID - confirm.
SkillVersionsByID map[string][]string `json:"skill_versions_by_id,omitempty"`
}
RunCatalog holds distinct metadata values used for UI filters.
type RunFilters ¶
// RunFilters specifies the filters, paging, and ordering used when listing
// runs. The struct carries no JSON tags.
// NOTE(review): zero values presumably mean "no filter" for each field -
// confirm against the store's query builder.
type RunFilters struct {
// ScenarioID and ScenarioIDs both select runs by scenario.
// NOTE(review): how the two combine when both are set is not shown here.
ScenarioID string
ScenarioIDs []string
// Match fields for run metadata.
Model string
Provider string
ToolServer string
ToolServerVersion string
SkillID string
SkillVersion string
SkillUnset bool // exact baseline/native-prompt runs where skill_id is empty
ReportID string
ToolServerUnset bool // exact baseline/native-tool runs where tool_server is empty
// Outcome filters.
// NOTE(review): behavior when both PassedOnly and FailedOnly are true is
// not defined in this declaration - confirm.
PassedOnly bool
FailedOnly bool
Since *time.Time // cutoff time; parsed by the handler, used as-is by the store
// Paging controls.
Limit int
Offset int
SortBy string // column to sort by
SortOrder string // asc or desc (default: desc)
ExcludeErrors bool // exclude infra errors (exit_code < 0)
}
RunFilters specifies filters for listing runs.
type RunRecord ¶
// RunRecord is a single benchmark run as stored in bench_runs. It carries the
// run's identity, the model/tool/skill configuration it ran with, its outcome
// and resource usage, and optional raw JSON payloads.
type RunRecord struct {
// Identity of the run and what it was run against.
ID string `json:"id"`
TenantID string `json:"tenant_id"`
ScenarioID string `json:"scenario_id"`
Model string `json:"model"`
Provider string `json:"provider"`
Adapter string `json:"adapter"`
ToolServer string `json:"tool_server"` // MCP server used (empty = baseline/direct exec)
ToolServerVersion string `json:"tool_server_version"` // version of MCP server binary
SkillID string `json:"skill_id"` // skill prompt identity (empty = no skill)
SkillVersion string `json:"skill_version"` // skill prompt version
SkillSource string `json:"skill_source"` // local-file, local-temp, registry, etc.
SkillSHA256 string `json:"skill_sha256"` // skill prompt content digest
ScenarioVersion string `json:"scenario_version"` // version/hash of scenario definition
// Outcome of the run. Duration is serialized as duration_seconds.
Passed bool `json:"passed"`
Duration float64 `json:"duration_seconds"`
// ExitCode: RunFilters.ExcludeErrors treats exit_code < 0 as an infra error.
ExitCode int `json:"exit_code"`
Turns int `json:"turns"`
MemoryWindow int `json:"memory_window"`
// Token usage and estimated cost (serialized as estimated_cost_usd).
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
EstimatedCost float64 `json:"estimated_cost_usd"`
// Check tallies for the run.
ChecksPassed int `json:"checks_passed"`
ChecksTotal int `json:"checks_total"`
// ChecksJSON and MetadataJSON are plain strings, omitted from JSON when empty.
// NOTE(review): presumably each holds an already-encoded JSON document, so
// marshaling this struct nests them as escaped strings - confirm.
ChecksJSON string `json:"checks_json,omitempty"`
MetadataJSON string `json:"metadata_json,omitempty"`
ArtifactDir string `json:"artifact_dir,omitempty"` // local filesystem path (bench runner only)
CreatedAt time.Time `json:"created_at"`
}
RunRecord represents a single benchmark run stored in bench_runs.
type ScenarioStat ¶
// ScenarioStat holds the run tally for one scenario; StatsResult.ByScenario is
// a slice of these.
type ScenarioStat struct {
// ScenarioID names the scenario being tallied.
ScenarioID string `json:"scenario_id"`
// Runs is the run count and Passed the number of those that passed.
Runs int `json:"runs"`
Passed int `json:"passed"`
}
ScenarioStat holds the total and passed run counts for a single scenario.
type ScenarioSummary ¶
// ScenarioSummary holds the descriptive metadata of a scenario for listing
// purposes. Fields tagged omitempty are dropped from JSON when they hold their
// zero value.
type ScenarioSummary struct {
// ID, Title, and Category are always serialized.
ID string `json:"id"`
Title string `json:"title"`
Description string `json:"description,omitempty"`
Category string `json:"category"`
// Optional classification fields.
Track string `json:"track,omitempty"`
Level string `json:"level,omitempty"`
// Timeout is carried as a string.
// NOTE(review): presumably a duration literal such as "5m" - confirm format.
Timeout string `json:"timeout,omitempty"`
// Tags is always serialized; a nil slice encodes as JSON null.
Tags []string `json:"tags"`
// NOTE(review): Chaos presumably marks chaos-style scenarios and Skip marks
// scenarios excluded from runs - confirm against the scenario loader.
Chaos bool `json:"chaos"`
Skip bool `json:"skip,omitempty"`
}
ScenarioSummary holds metadata about a scenario for listing.
type SignalAggregation ¶
// SignalAggregation holds signal detection counts aggregated across runs.
type SignalAggregation struct {
// TotalRuns is the number of runs considered; RunsWithScorecard is the
// number of those that had a scorecard.
TotalRuns int `json:"total_runs"`
RunsWithScorecard int `json:"runs_with_scorecard"`
// Signals maps a string key to that signal's detection counts.
// NOTE(review): the key is presumably the signal type name - confirm.
Signals map[string]SignalCount `json:"signals"`
// NOTE(review): it is not stated whether AvgScore is averaged over TotalRuns
// or only over RunsWithScorecard - confirm.
AvgScore float64 `json:"avg_score"`
}
SignalAggregation holds aggregated signal counts across runs.
type SignalCount ¶
// SignalCount holds the detection statistics for a single signal type: how
// many times it was detected in total, and in how many runs it appeared.
type SignalCount struct {
Total int `json:"total"` // total detections
RunCount int `json:"run_count"` // runs where detected > 0
}
SignalCount holds detection stats for a single signal type.
type StatsResult ¶
// StatsResult holds aggregate run statistics: overall totals plus a
// per-scenario breakdown.
type StatsResult struct {
// Overall run tallies.
// NOTE(review): presumably TotalRuns == PassCount + FailCount - confirm.
TotalRuns int `json:"total_runs"`
PassCount int `json:"pass_count"`
FailCount int `json:"fail_count"`
// ByScenario holds one ScenarioStat per scenario.
ByScenario []ScenarioStat `json:"by_scenario"`
}
StatsResult holds aggregate run statistics.
type Timeline ¶
// Timeline is the full classified sequence of agent actions, together with
// summary counters computed over that sequence.
type Timeline struct {
// Steps is the ordered list of classified steps.
Steps []TimelineStep `json:"steps"`
// PhaseCount tallies steps per Phase.
PhaseCount map[Phase]int `json:"phase_count"`
// MutationCount is the number of mutations in the timeline.
// NOTE(review): what qualifies as a mutation is decided by the classifier,
// which is not visible here.
MutationCount int `json:"mutation_count"`
// NOTE(review): TotalSteps is presumably len(Steps) - confirm.
TotalSteps int `json:"total_steps"`
// DiagnosisDepth counts diagnose-phase steps, which by construction
// only occur before the first mutation.
DiagnosisDepth int `json:"diagnosis_depth"`
}
Timeline is the full classified sequence of agent actions.
type TimelineStep ¶
// TimelineStep is a single classified step in the decision timeline.
type TimelineStep struct {
// Index is the step's position in the timeline.
// NOTE(review): zero- vs one-based numbering is not shown here - confirm.
Index int `json:"index"`
// Phase is the decision phase this step was classified into.
Phase Phase `json:"phase"`
// Tool, Operation, and Command describe the call that was made.
Tool string `json:"tool"`
Operation string `json:"operation"`
Command string `json:"command"`
// Resource and Namespace are optional; each is omitted from JSON when empty.
Resource string `json:"resource,omitempty"`
Namespace string `json:"namespace,omitempty"`
// ExitCode is the exit code recorded for the step.
ExitCode int `json:"exit_code"`
// Summary is a short description of the step.
Summary string `json:"summary"`
}
TimelineStep is a single classified step in the decision timeline.