evals

package
v1.0.2 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jan 15, 2026 License: Apache-2.0 Imports: 16 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func CompareEvalResults

func CompareEvalResults(baseline *loomv1.EvalResult, candidate *loomv1.EvalResult, metrics []string) map[string]float64

CompareEvalResults compares two eval results for A/B testing

func CompareWithGoldenFile

func CompareWithGoldenFile(goldenFilePath string, actualOutput string, threshold float64) (*loomv1.GoldenFileResult, error)

CompareWithGoldenFile compares actual output with a golden file

func CreateMockResult

func CreateMockResult(suiteName, agentID string, passed bool) *loomv1.EvalResult

CreateMockResult creates a mock eval result for testing

func CreateMockResultWithTime

func CreateMockResultWithTime(suiteName, agentID string, passed bool, runAt time.Time) *loomv1.EvalResult

CreateMockResultWithTime creates a mock eval result with a specific timestamp

func ExportBatch added in v1.0.2

func ExportBatch(ctx context.Context, results []*loomv1.EvalResult, config *HawkExportConfig) error

ExportBatch exports multiple eval results to Hawk in a single request

func ExportToHawk

func ExportToHawk(ctx context.Context, result *loomv1.EvalResult, config *HawkExportConfig) error

ExportToHawk exports an eval result to Hawk for tracking and analysis

func FormatEvalResult

func FormatEvalResult(result *loomv1.EvalResult) string

FormatEvalResult formats an eval result for human-readable output

func LoadEvalSuite

func LoadEvalSuite(path string) (*loomv1.EvalSuite, error)

LoadEvalSuite loads an eval suite from a YAML file

func SimilarityScore

func SimilarityScore(a, b string) float64

SimilarityScore calculates a simple similarity score between two strings. This is a simpler alternative to calculateSimilarity for basic use cases.

func UpdateGoldenFile

func UpdateGoldenFile(goldenFilePath string, content string) error

UpdateGoldenFile updates a golden file with new content

func ValidateEvalSuite

func ValidateEvalSuite(suite *loomv1.EvalSuite) error

ValidateEvalSuite validates an eval suite configuration

func ValidateTestResult

func ValidateTestResult(tc *loomv1.TestCase, actualOutput string, toolsUsed []string, costUsd float64, latencyMs int64) *loomv1.TestCaseResult

ValidateTestResult validates a single test case result against expectations

Types

type Comparison

// Comparison represents a comparison between two eval results.
//
// NOTE(review): the sign convention of the *Delta fields (presumably
// candidate minus baseline) is not visible here; confirm against Store.Compare.
type Comparison struct {
	// Baseline and Candidate are the two eval results being compared.
	Baseline        *loomv1.EvalResult
	Candidate       *loomv1.EvalResult
	// The *Delta fields hold the differences between the two results.
	// NOTE(review): units are not stated here; confirm before displaying them.
	AccuracyDelta   float64
	CostDelta       float64
	LatencyDelta    int64
	PassedTestDelta int32
}

Comparison represents a comparison between two eval results

type ComparisonConfigYAML

// ComparisonConfigYAML is the YAML form of the comparison settings that
// EvalSpecYAML carries under its `comparison` key.
type ComparisonConfigYAML struct {
	// BaselineAgentID is read from the `baseline_agent_id` key.
	BaselineAgentID   string   `yaml:"baseline_agent_id"`
	// ComparisonMetrics is read from the `comparison_metrics` key.
	// NOTE(review): the set of accepted metric names is not visible here.
	ComparisonMetrics []string `yaml:"comparison_metrics"`
}

type EvalMetadataYAML

// EvalMetadataYAML is the YAML form of the `metadata` section of an
// EvalSuiteYAML document: a name, version, description, and string labels.
type EvalMetadataYAML struct {
	Name        string            `yaml:"name"`
	Version     string            `yaml:"version"`
	Description string            `yaml:"description"`
	// Labels is a string-to-string map read from the `labels` key.
	Labels      map[string]string `yaml:"labels"`
}

type EvalSpecYAML

// EvalSpecYAML is the YAML form of the `spec` section of an EvalSuiteYAML
// document. It names the agent, lists the test cases and metrics, and embeds
// the golden-file, comparison, and multi-judge sub-configurations.
type EvalSpecYAML struct {
	AgentID        string               `yaml:"agent_id"`
	// TestCases is the list read from the `test_cases` key.
	TestCases      []TestCaseYAML       `yaml:"test_cases"`
	Metrics        []string             `yaml:"metrics"`
	// NOTE(review): presumably enables ExportToHawk for this suite; confirm.
	HawkExport     bool                 `yaml:"hawk_export"`
	GoldenFiles    GoldenFileConfigYAML `yaml:"golden_files"`
	// NOTE(review): how a zero or missing timeout is treated is not visible here.
	TimeoutSeconds int                  `yaml:"timeout_seconds"`
	Comparison     ComparisonConfigYAML `yaml:"comparison"`
	MultiJudge     MultiJudgeConfigYAML `yaml:"multi_judge"`
}

type EvalSuiteYAML

// EvalSuiteYAML represents the YAML structure for eval suite configuration.
// The top-level keys are `apiVersion`, `kind`, `metadata`, and `spec`.
type EvalSuiteYAML struct {
	APIVersion string           `yaml:"apiVersion"`
	Kind       string           `yaml:"kind"`
	Metadata   EvalMetadataYAML `yaml:"metadata"`
	Spec       EvalSpecYAML     `yaml:"spec"`
}

EvalSuiteYAML represents the YAML structure for eval suite configuration

type EvalSummary

// EvalSummary represents a summary of eval results. It is the value
// returned by Store.GetSummary.
//
// NOTE(review): whether TotalSuites and TotalAgents count distinct values,
// and the currency of TotalCost, are not visible here; confirm against the
// store's query.
type EvalSummary struct {
	TotalRuns   int
	PassedRuns  int
	AvgAccuracy float64
	TotalCost   float64
	TotalSuites int
	TotalAgents int
}

EvalSummary represents a summary of eval results

type GoldenFileConfigYAML

// GoldenFileConfigYAML is the YAML form of the golden-file settings that
// EvalSpecYAML carries under its `golden_files` key.
type GoldenFileConfigYAML struct {
	Directory           string  `yaml:"directory"`
	// NOTE(review): presumably triggers UpdateGoldenFile on a failed
	// comparison; confirm against the runner.
	UpdateOnMismatch    bool    `yaml:"update_on_mismatch"`
	// NOTE(review): presumably the threshold handed to CompareWithGoldenFile;
	// the valid range is not visible here.
	SimilarityThreshold float64 `yaml:"similarity_threshold"`
}

type HawkExportConfig

// HawkExportConfig configures Hawk export for eval results. It is accepted
// by ExportToHawk and ExportBatch.
type HawkExportConfig struct {
	// Endpoint is the Hawk API endpoint for eval results
	// Default: $HAWK_ENDPOINT or http://localhost:8080
	Endpoint string

	// APIKey for authentication
	// Default: $HAWK_API_KEY
	APIKey string

	// Timeout for HTTP requests
	// Default: 10s
	Timeout time.Duration

	// HTTPClient for custom transport
	// If nil, uses http.DefaultClient with configured timeout
	// NOTE(review): whether Timeout is also applied when HTTPClient is
	// non-nil is not visible here; confirm.
	HTTPClient *http.Client
}

HawkExportConfig configures Hawk export for eval results

type JudgeConfigYAML

// JudgeConfigYAML is the YAML form of a single judge entry. It is the
// element type of MultiJudgeConfigYAML.Judges (the `judges` list).
//
// NOTE(review): the accepted values for Criticality and Dimensions, and the
// scale of Weight and MinPassingScore, are not visible here.
type JudgeConfigYAML struct {
	Name            string   `yaml:"name"`
	Criteria        string   `yaml:"criteria"`
	Weight          float64  `yaml:"weight"`
	MinPassingScore float64  `yaml:"min_passing_score"`
	Criticality     string   `yaml:"criticality"`
	Dimensions      []string `yaml:"dimensions"`
}

type MetricsCalculator

// MetricsCalculator calculates evaluation metrics from test results.
// Obtain one with NewMetricsCalculator; its fields are unexported.
type MetricsCalculator struct {
	// contains filtered or unexported fields
}

MetricsCalculator calculates evaluation metrics from test results

func NewMetricsCalculator

func NewMetricsCalculator(suite *loomv1.EvalSuite) *MetricsCalculator

NewMetricsCalculator creates a new metrics calculator

func (*MetricsCalculator) Calculate

func (m *MetricsCalculator) Calculate(results []*loomv1.TestCaseResult) *loomv1.EvalMetrics

Calculate calculates all metrics for eval results

func (*MetricsCalculator) CreateEvalResult

func (m *MetricsCalculator) CreateEvalResult(
	suiteName string,
	agentID string,
	testResults []*loomv1.TestCaseResult,
) *loomv1.EvalResult

CreateEvalResult creates a complete eval result from test case results

type MultiJudgeConfigYAML

// MultiJudgeConfigYAML is the YAML form of the multi-judge settings that
// EvalSpecYAML carries under its `multi_judge` key.
//
// NOTE(review): the accepted values for Aggregation and ExecutionMode, and
// how Parallel interacts with ExecutionMode, are not visible here.
type MultiJudgeConfigYAML struct {
	Parallel       bool              `yaml:"parallel"`
	TimeoutSeconds int               `yaml:"timeout_seconds"`
	FailFast       bool              `yaml:"fail_fast"`
	Aggregation    string            `yaml:"aggregation"`
	ExecutionMode  string            `yaml:"execution_mode"`
	ExportToHawk   bool              `yaml:"export_to_hawk"`
	// Judges is the list of judge configurations read from the `judges` key.
	Judges         []JudgeConfigYAML `yaml:"judges"`
}

type Store

// Store manages persistent storage of eval results. Create one with
// NewStore and release it with Close; its fields are unexported.
type Store struct {
	// contains filtered or unexported fields
}

Store manages persistent storage of eval results

func NewStore

func NewStore(dbPath string) (*Store, error)

NewStore creates a new eval store. Use ":memory:" for an in-memory database (useful for testing).

func (*Store) Close

func (s *Store) Close() error

Close closes the database connection

func (*Store) Compare

func (s *Store) Compare(ctx context.Context, baselineID, candidateID int64) (*Comparison, error)

Compare compares two eval results and returns a comparison

func (*Store) DeleteOlderThan

func (s *Store) DeleteOlderThan(ctx context.Context, cutoff time.Time) (int64, error)

DeleteOlderThan deletes eval results older than the specified time

func (*Store) Get

func (s *Store) Get(ctx context.Context, id int64) (*loomv1.EvalResult, error)

Get retrieves an eval result by ID

func (*Store) GetLatest

func (s *Store) GetLatest(ctx context.Context, suiteName string) (*loomv1.EvalResult, error)

GetLatest gets the most recent eval result for a suite

func (*Store) GetSummary

func (s *Store) GetSummary(ctx context.Context) (*EvalSummary, error)

GetSummary gets a summary of all eval results

func (*Store) GetTrends

func (s *Store) GetTrends(ctx context.Context, suiteName string, days int) ([]*TrendPoint, error)

GetTrends gets accuracy trends over time for a suite

func (*Store) ListByAgent

func (s *Store) ListByAgent(ctx context.Context, agentID string, limit int) ([]*loomv1.EvalResult, error)

ListByAgent lists all eval results for an agent

func (*Store) ListBySuite

func (s *Store) ListBySuite(ctx context.Context, suiteName string, limit int) ([]*loomv1.EvalResult, error)

ListBySuite lists all eval results for a suite

func (*Store) Save

func (s *Store) Save(ctx context.Context, result *loomv1.EvalResult) (int64, error)

Save saves an eval result to the database

type TestCaseYAML

// TestCaseYAML is the YAML form of a single test case. It is the element
// type of EvalSpecYAML.TestCases (the `test_cases` list).
//
// NOTE(review): how the Expected* and Max* fields are enforced (presumably
// by ValidateTestResult), and whether zero values mean "no limit", is not
// visible here; confirm.
type TestCaseYAML struct {
	Name                      string            `yaml:"name"`
	Input                     string            `yaml:"input"`
	ExpectedOutputContains    []string          `yaml:"expected_output_contains"`
	ExpectedOutputNotContains []string          `yaml:"expected_output_not_contains"`
	ExpectedOutputRegex       string            `yaml:"expected_output_regex"`
	ExpectedTools             []string          `yaml:"expected_tools"`
	MaxCostUSD                float64           `yaml:"max_cost_usd"`
	MaxLatencyMS              int               `yaml:"max_latency_ms"`
	Context                   map[string]string `yaml:"context"`
	// NOTE(review): presumably a path resolved against
	// GoldenFileConfigYAML.Directory; confirm.
	GoldenFile                string            `yaml:"golden_file"`
}

type TrendPoint

// TrendPoint represents a point in an accuracy trend. Store.GetTrends
// returns a slice of these for a suite.
//
// NOTE(review): the bucketing granularity of Date (presumably one point per
// day) is not visible here; confirm against the store's query.
type TrendPoint struct {
	Date        time.Time
	AvgAccuracy float64
	AvgCost     float64
	Runs        int
}

TrendPoint represents a point in an accuracy trend

Directories

Path Synopsis
llm

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL