Documentation
¶
Index ¶
- func CompareEvalResults(baseline *loomv1.EvalResult, candidate *loomv1.EvalResult, metrics []string) map[string]float64
- func CompareWithGoldenFile(goldenFilePath string, actualOutput string, threshold float64) (*loomv1.GoldenFileResult, error)
- func CreateMockResult(suiteName, agentID string, passed bool) *loomv1.EvalResult
- func CreateMockResultWithTime(suiteName, agentID string, passed bool, runAt time.Time) *loomv1.EvalResult
- func ExportBatch(ctx context.Context, results []*loomv1.EvalResult, config *HawkExportConfig) error
- func ExportToHawk(ctx context.Context, result *loomv1.EvalResult, config *HawkExportConfig) error
- func FormatEvalResult(result *loomv1.EvalResult) string
- func LoadEvalSuite(path string) (*loomv1.EvalSuite, error)
- func SimilarityScore(a, b string) float64
- func UpdateGoldenFile(goldenFilePath string, content string) error
- func ValidateEvalSuite(suite *loomv1.EvalSuite) error
- func ValidateTestResult(tc *loomv1.TestCase, actualOutput string, toolsUsed []string, costUsd float64, latencyMs int64) *loomv1.TestCaseResult
- type Comparison
- type ComparisonConfigYAML
- type EvalMetadataYAML
- type EvalSpecYAML
- type EvalSuiteYAML
- type EvalSummary
- type GoldenFileConfigYAML
- type HawkExportConfig
- type JudgeConfigYAML
- type MetricsCalculator
- type MultiJudgeConfigYAML
- type Store
- func (s *Store) Close() error
- func (s *Store) Compare(ctx context.Context, baselineID, candidateID int64) (*Comparison, error)
- func (s *Store) DeleteOlderThan(ctx context.Context, cutoff time.Time) (int64, error)
- func (s *Store) Get(ctx context.Context, id int64) (*loomv1.EvalResult, error)
- func (s *Store) GetLatest(ctx context.Context, suiteName string) (*loomv1.EvalResult, error)
- func (s *Store) GetSummary(ctx context.Context) (*EvalSummary, error)
- func (s *Store) GetTrends(ctx context.Context, suiteName string, days int) ([]*TrendPoint, error)
- func (s *Store) ListByAgent(ctx context.Context, agentID string, limit int) ([]*loomv1.EvalResult, error)
- func (s *Store) ListBySuite(ctx context.Context, suiteName string, limit int) ([]*loomv1.EvalResult, error)
- func (s *Store) Save(ctx context.Context, result *loomv1.EvalResult) (int64, error)
- type TestCaseYAML
- type TrendPoint
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func CompareEvalResults ¶
func CompareEvalResults(baseline *loomv1.EvalResult, candidate *loomv1.EvalResult, metrics []string) map[string]float64
CompareEvalResults compares two eval results for A/B testing
func CompareWithGoldenFile ¶
func CompareWithGoldenFile(goldenFilePath string, actualOutput string, threshold float64) (*loomv1.GoldenFileResult, error)
CompareWithGoldenFile compares actual output with a golden file
func CreateMockResult ¶
func CreateMockResult(suiteName, agentID string, passed bool) *loomv1.EvalResult
CreateMockResult creates a mock eval result for testing
func CreateMockResultWithTime ¶
func CreateMockResultWithTime(suiteName, agentID string, passed bool, runAt time.Time) *loomv1.EvalResult
CreateMockResultWithTime creates a mock eval result with a specific timestamp
func ExportBatch ¶ added in v1.0.2
func ExportBatch(ctx context.Context, results []*loomv1.EvalResult, config *HawkExportConfig) error
ExportBatch exports multiple eval results to Hawk in a single request
func ExportToHawk ¶
func ExportToHawk(ctx context.Context, result *loomv1.EvalResult, config *HawkExportConfig) error
ExportToHawk exports an eval result to Hawk for tracking and analysis
func FormatEvalResult ¶
func FormatEvalResult(result *loomv1.EvalResult) string
FormatEvalResult formats an eval result for human-readable output
func LoadEvalSuite ¶
LoadEvalSuite loads an eval suite from a YAML file
func SimilarityScore ¶
SimilarityScore calculates a simple similarity score between two strings. This is a simpler alternative to calculateSimilarity for basic use cases.
func UpdateGoldenFile ¶
UpdateGoldenFile updates a golden file with new content
func ValidateEvalSuite ¶
ValidateEvalSuite validates an eval suite configuration
func ValidateTestResult ¶
func ValidateTestResult(tc *loomv1.TestCase, actualOutput string, toolsUsed []string, costUsd float64, latencyMs int64) *loomv1.TestCaseResult
ValidateTestResult validates a single test case result against expectations
Types ¶
type Comparison ¶
type Comparison struct {
Baseline *loomv1.EvalResult
Candidate *loomv1.EvalResult
AccuracyDelta float64
CostDelta float64
LatencyDelta int64
PassedTestDelta int32
}
Comparison represents a comparison between two eval results
type ComparisonConfigYAML ¶
type EvalMetadataYAML ¶
type EvalSpecYAML ¶
type EvalSpecYAML struct {
AgentID string `yaml:"agent_id"`
TestCases []TestCaseYAML `yaml:"test_cases"`
Metrics []string `yaml:"metrics"`
HawkExport bool `yaml:"hawk_export"`
GoldenFiles GoldenFileConfigYAML `yaml:"golden_files"`
TimeoutSeconds int `yaml:"timeout_seconds"`
Comparison ComparisonConfigYAML `yaml:"comparison"`
MultiJudge MultiJudgeConfigYAML `yaml:"multi_judge"`
}
type EvalSuiteYAML ¶
type EvalSuiteYAML struct {
APIVersion string `yaml:"apiVersion"`
Kind string `yaml:"kind"`
Metadata EvalMetadataYAML `yaml:"metadata"`
Spec EvalSpecYAML `yaml:"spec"`
}
EvalSuiteYAML represents the YAML structure for eval suite configuration
type EvalSummary ¶
type EvalSummary struct {
TotalRuns int
PassedRuns int
AvgAccuracy float64
TotalCost float64
TotalSuites int
TotalAgents int
}
EvalSummary represents a summary of eval results
type GoldenFileConfigYAML ¶
type HawkExportConfig ¶
type HawkExportConfig struct {
// Endpoint is the Hawk API endpoint for eval results
// Default: $HAWK_ENDPOINT or http://localhost:8080
Endpoint string
// APIKey for authentication
// Default: $HAWK_API_KEY
APIKey string
// Timeout for HTTP requests
// Default: 10s
Timeout time.Duration
// HTTPClient for custom transport
// If nil, uses http.DefaultClient with configured timeout
HTTPClient *http.Client
}
HawkExportConfig configures Hawk export for eval results
type JudgeConfigYAML ¶
type MetricsCalculator ¶
type MetricsCalculator struct {
// contains filtered or unexported fields
}
MetricsCalculator calculates evaluation metrics from test results
func NewMetricsCalculator ¶
func NewMetricsCalculator(suite *loomv1.EvalSuite) *MetricsCalculator
NewMetricsCalculator creates a new metrics calculator
func (*MetricsCalculator) Calculate ¶
func (m *MetricsCalculator) Calculate(results []*loomv1.TestCaseResult) *loomv1.EvalMetrics
Calculate calculates all metrics for eval results
func (*MetricsCalculator) CreateEvalResult ¶
func (m *MetricsCalculator) CreateEvalResult( suiteName string, agentID string, testResults []*loomv1.TestCaseResult, ) *loomv1.EvalResult
CreateEvalResult creates a complete eval result from test case results
type MultiJudgeConfigYAML ¶
type MultiJudgeConfigYAML struct {
Parallel bool `yaml:"parallel"`
TimeoutSeconds int `yaml:"timeout_seconds"`
FailFast bool `yaml:"fail_fast"`
Aggregation string `yaml:"aggregation"`
ExecutionMode string `yaml:"execution_mode"`
ExportToHawk bool `yaml:"export_to_hawk"`
Judges []JudgeConfigYAML `yaml:"judges"`
}
type Store ¶
type Store struct {
// contains filtered or unexported fields
}
Store manages persistent storage of eval results
func NewStore ¶
NewStore creates a new eval store. Use ":memory:" for an in-memory database (useful for testing).
func (*Store) DeleteOlderThan ¶
DeleteOlderThan deletes eval results older than the specified time
func (*Store) GetSummary ¶
func (s *Store) GetSummary(ctx context.Context) (*EvalSummary, error)
GetSummary gets a summary of all eval results
func (*Store) ListByAgent ¶
func (s *Store) ListByAgent(ctx context.Context, agentID string, limit int) ([]*loomv1.EvalResult, error)
ListByAgent lists all eval results for an agent
func (*Store) ListBySuite ¶
func (s *Store) ListBySuite(ctx context.Context, suiteName string, limit int) ([]*loomv1.EvalResult, error)
ListBySuite lists all eval results for a suite
type TestCaseYAML ¶
type TestCaseYAML struct {
Name string `yaml:"name"`
Input string `yaml:"input"`
ExpectedOutputContains []string `yaml:"expected_output_contains"`
ExpectedOutputNotContains []string `yaml:"expected_output_not_contains"`
ExpectedOutputRegex string `yaml:"expected_output_regex"`
ExpectedTools []string `yaml:"expected_tools"`
MaxCostUSD float64 `yaml:"max_cost_usd"`
MaxLatencyMS int `yaml:"max_latency_ms"`
Context map[string]string `yaml:"context"`
GoldenFile string `yaml:"golden_file"`
}