evals

package

v1.0.2 (latest) · Published: Jan 15, 2026 · License: Apache-2.0 · Imports: 16 · Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/teradata-labs/loom

Links

Open Source Insights

Documentation ¶

Index ¶

func CompareEvalResults(baseline *loomv1.EvalResult, candidate *loomv1.EvalResult, metrics []string) map[string]float64
func CompareWithGoldenFile(goldenFilePath string, actualOutput string, threshold float64) (*loomv1.GoldenFileResult, error)
func CreateMockResult(suiteName, agentID string, passed bool) *loomv1.EvalResult
func CreateMockResultWithTime(suiteName, agentID string, passed bool, runAt time.Time) *loomv1.EvalResult
func ExportBatch(ctx context.Context, results []*loomv1.EvalResult, config *HawkExportConfig) error
func ExportToHawk(ctx context.Context, result *loomv1.EvalResult, config *HawkExportConfig) error
func FormatEvalResult(result *loomv1.EvalResult) string
func LoadEvalSuite(path string) (*loomv1.EvalSuite, error)
func SimilarityScore(a, b string) float64
func UpdateGoldenFile(goldenFilePath string, content string) error
func ValidateEvalSuite(suite *loomv1.EvalSuite) error
func ValidateTestResult(tc *loomv1.TestCase, actualOutput string, toolsUsed []string, costUsd float64, latencyMs int64) *loomv1.TestCaseResult
type Comparison
type ComparisonConfigYAML
type EvalMetadataYAML
type EvalSpecYAML
type EvalSuiteYAML
type EvalSummary
type GoldenFileConfigYAML
type HawkExportConfig
type JudgeConfigYAML
type MetricsCalculator
- func NewMetricsCalculator(suite *loomv1.EvalSuite) *MetricsCalculator
- func (m *MetricsCalculator) Calculate(results []*loomv1.TestCaseResult) *loomv1.EvalMetrics
- func (m *MetricsCalculator) CreateEvalResult(suiteName string, agentID string, testResults []*loomv1.TestCaseResult) *loomv1.EvalResult
type MultiJudgeConfigYAML
type Store
- func NewStore(dbPath string) (*Store, error)
- func (s *Store) Close() error
- func (s *Store) Compare(ctx context.Context, baselineID, candidateID int64) (*Comparison, error)
- func (s *Store) DeleteOlderThan(ctx context.Context, cutoff time.Time) (int64, error)
- func (s *Store) Get(ctx context.Context, id int64) (*loomv1.EvalResult, error)
- func (s *Store) GetLatest(ctx context.Context, suiteName string) (*loomv1.EvalResult, error)
- func (s *Store) GetSummary(ctx context.Context) (*EvalSummary, error)
- func (s *Store) GetTrends(ctx context.Context, suiteName string, days int) ([]*TrendPoint, error)
- func (s *Store) ListByAgent(ctx context.Context, agentID string, limit int) ([]*loomv1.EvalResult, error)
- func (s *Store) ListBySuite(ctx context.Context, suiteName string, limit int) ([]*loomv1.EvalResult, error)
- func (s *Store) Save(ctx context.Context, result *loomv1.EvalResult) (int64, error)
type TestCaseYAML
type TrendPoint

Constants ¶

This section is empty.

Variables ¶

This section is empty.

Functions ¶

func CompareEvalResults ¶

func CompareEvalResults(baseline *loomv1.EvalResult, candidate *loomv1.EvalResult, metrics []string) map[string]float64

CompareEvalResults compares two eval results for A/B testing

func CompareWithGoldenFile ¶

func CompareWithGoldenFile(goldenFilePath string, actualOutput string, threshold float64) (*loomv1.GoldenFileResult, error)

CompareWithGoldenFile compares actual output with a golden file

func CreateMockResult ¶

func CreateMockResult(suiteName, agentID string, passed bool) *loomv1.EvalResult

CreateMockResult creates a mock eval result for testing

func CreateMockResultWithTime ¶

func CreateMockResultWithTime(suiteName, agentID string, passed bool, runAt time.Time) *loomv1.EvalResult

CreateMockResultWithTime creates a mock eval result with a specific timestamp

func ExportBatch ¶ added in v1.0.2

func ExportBatch(ctx context.Context, results []*loomv1.EvalResult, config *HawkExportConfig) error

ExportBatch exports multiple eval results to Hawk in a single request

func ExportToHawk ¶

func ExportToHawk(ctx context.Context, result *loomv1.EvalResult, config *HawkExportConfig) error

ExportToHawk exports an eval result to Hawk for tracking and analysis

func FormatEvalResult ¶

func FormatEvalResult(result *loomv1.EvalResult) string

FormatEvalResult formats an eval result for human-readable output

func LoadEvalSuite ¶

func LoadEvalSuite(path string) (*loomv1.EvalSuite, error)

LoadEvalSuite loads an eval suite from a YAML file

func SimilarityScore ¶

func SimilarityScore(a, b string) float64

SimilarityScore calculates a simple similarity score between two strings. This is a simpler alternative to calculateSimilarity for basic use cases.

func UpdateGoldenFile ¶

func UpdateGoldenFile(goldenFilePath string, content string) error

UpdateGoldenFile updates a golden file with new content

func ValidateEvalSuite ¶

func ValidateEvalSuite(suite *loomv1.EvalSuite) error

ValidateEvalSuite validates an eval suite configuration

func ValidateTestResult ¶

func ValidateTestResult(tc *loomv1.TestCase, actualOutput string, toolsUsed []string, costUsd float64, latencyMs int64) *loomv1.TestCaseResult

ValidateTestResult validates a single test case result against expectations

Types ¶

type Comparison ¶

type Comparison struct {
	Baseline        *loomv1.EvalResult
	Candidate       *loomv1.EvalResult
	AccuracyDelta   float64
	CostDelta       float64
	LatencyDelta    int64
	PassedTestDelta int32
}

Comparison represents a comparison between two eval results

type ComparisonConfigYAML ¶

type ComparisonConfigYAML struct {
	BaselineAgentID   string   `yaml:"baseline_agent_id"`
	ComparisonMetrics []string `yaml:"comparison_metrics"`
}

type EvalMetadataYAML ¶

type EvalMetadataYAML struct {
	Name        string            `yaml:"name"`
	Version     string            `yaml:"version"`
	Description string            `yaml:"description"`
	Labels      map[string]string `yaml:"labels"`
}

type EvalSpecYAML ¶

type EvalSpecYAML struct {
	AgentID        string               `yaml:"agent_id"`
	TestCases      []TestCaseYAML       `yaml:"test_cases"`
	Metrics        []string             `yaml:"metrics"`
	HawkExport     bool                 `yaml:"hawk_export"`
	GoldenFiles    GoldenFileConfigYAML `yaml:"golden_files"`
	TimeoutSeconds int                  `yaml:"timeout_seconds"`
	Comparison     ComparisonConfigYAML `yaml:"comparison"`
	MultiJudge     MultiJudgeConfigYAML `yaml:"multi_judge"`
}

type EvalSuiteYAML ¶

type EvalSuiteYAML struct {
	APIVersion string           `yaml:"apiVersion"`
	Kind       string           `yaml:"kind"`
	Metadata   EvalMetadataYAML `yaml:"metadata"`
	Spec       EvalSpecYAML     `yaml:"spec"`
}

EvalSuiteYAML represents the YAML structure for eval suite configuration

type EvalSummary ¶

type EvalSummary struct {
	TotalRuns   int
	PassedRuns  int
	AvgAccuracy float64
	TotalCost   float64
	TotalSuites int
	TotalAgents int
}

EvalSummary represents a summary of eval results

type GoldenFileConfigYAML ¶

type GoldenFileConfigYAML struct {
	Directory           string  `yaml:"directory"`
	UpdateOnMismatch    bool    `yaml:"update_on_mismatch"`
	SimilarityThreshold float64 `yaml:"similarity_threshold"`
}

type HawkExportConfig ¶

type HawkExportConfig struct {
	// Endpoint is the Hawk API endpoint for eval results
	// Default: $HAWK_ENDPOINT or http://localhost:8080
	Endpoint string

	// APIKey for authentication
	// Default: $HAWK_API_KEY
	APIKey string

	// Timeout for HTTP requests
	// Default: 10s
	Timeout time.Duration

	// HTTPClient for custom transport
	// If nil, uses http.DefaultClient with configured timeout
	HTTPClient *http.Client
}

HawkExportConfig configures Hawk export for eval results

type JudgeConfigYAML ¶

type JudgeConfigYAML struct {
	Name            string   `yaml:"name"`
	Criteria        string   `yaml:"criteria"`
	Weight          float64  `yaml:"weight"`
	MinPassingScore float64  `yaml:"min_passing_score"`
	Criticality     string   `yaml:"criticality"`
	Dimensions      []string `yaml:"dimensions"`
}

type MetricsCalculator ¶

type MetricsCalculator struct {
	// contains filtered or unexported fields
}

MetricsCalculator calculates evaluation metrics from test results

func NewMetricsCalculator ¶

func NewMetricsCalculator(suite *loomv1.EvalSuite) *MetricsCalculator

NewMetricsCalculator creates a new metrics calculator

func (*MetricsCalculator) Calculate ¶

func (m *MetricsCalculator) Calculate(results []*loomv1.TestCaseResult) *loomv1.EvalMetrics

Calculate calculates all metrics for eval results

func (*MetricsCalculator) CreateEvalResult ¶

func (m *MetricsCalculator) CreateEvalResult(
	suiteName string,
	agentID string,
	testResults []*loomv1.TestCaseResult,
) *loomv1.EvalResult

CreateEvalResult creates a complete eval result from test case results

type MultiJudgeConfigYAML ¶

type MultiJudgeConfigYAML struct {
	Parallel       bool              `yaml:"parallel"`
	TimeoutSeconds int               `yaml:"timeout_seconds"`
	FailFast       bool              `yaml:"fail_fast"`
	Aggregation    string            `yaml:"aggregation"`
	ExecutionMode  string            `yaml:"execution_mode"`
	ExportToHawk   bool              `yaml:"export_to_hawk"`
	Judges         []JudgeConfigYAML `yaml:"judges"`
}

type Store ¶

type Store struct {
	// contains filtered or unexported fields
}

Store manages persistent storage of eval results

func NewStore ¶

func NewStore(dbPath string) (*Store, error)

NewStore creates a new eval store. Use ":memory:" for an in-memory database (useful for testing).

func (*Store) Close ¶

func (s *Store) Close() error

Close closes the database connection

func (*Store) Compare ¶

func (s *Store) Compare(ctx context.Context, baselineID, candidateID int64) (*Comparison, error)

Compare compares two eval results and returns a comparison

func (*Store) DeleteOlderThan ¶

func (s *Store) DeleteOlderThan(ctx context.Context, cutoff time.Time) (int64, error)

DeleteOlderThan deletes eval results older than the specified time

func (*Store) Get ¶

func (s *Store) Get(ctx context.Context, id int64) (*loomv1.EvalResult, error)

Get retrieves an eval result by ID

func (*Store) GetLatest ¶

func (s *Store) GetLatest(ctx context.Context, suiteName string) (*loomv1.EvalResult, error)

GetLatest gets the most recent eval result for a suite

func (*Store) GetSummary ¶

func (s *Store) GetSummary(ctx context.Context) (*EvalSummary, error)

GetSummary gets a summary of all eval results

func (*Store) GetTrends ¶

func (s *Store) GetTrends(ctx context.Context, suiteName string, days int) ([]*TrendPoint, error)

GetTrends gets accuracy trends over time for a suite

func (*Store) ListByAgent ¶

func (s *Store) ListByAgent(ctx context.Context, agentID string, limit int) ([]*loomv1.EvalResult, error)

ListByAgent lists all eval results for an agent

func (*Store) ListBySuite ¶

func (s *Store) ListBySuite(ctx context.Context, suiteName string, limit int) ([]*loomv1.EvalResult, error)

ListBySuite lists all eval results for a suite

func (*Store) Save ¶

func (s *Store) Save(ctx context.Context, result *loomv1.EvalResult) (int64, error)

Save saves an eval result to the database

type TestCaseYAML ¶

type TestCaseYAML struct {
	Name                      string            `yaml:"name"`
	Input                     string            `yaml:"input"`
	ExpectedOutputContains    []string          `yaml:"expected_output_contains"`
	ExpectedOutputNotContains []string          `yaml:"expected_output_not_contains"`
	ExpectedOutputRegex       string            `yaml:"expected_output_regex"`
	ExpectedTools             []string          `yaml:"expected_tools"`
	MaxCostUSD                float64           `yaml:"max_cost_usd"`
	MaxLatencyMS              int               `yaml:"max_latency_ms"`
	Context                   map[string]string `yaml:"context"`
	GoldenFile                string            `yaml:"golden_file"`
}

type TrendPoint ¶

type TrendPoint struct {
	Date        time.Time
	AvgAccuracy float64
	AvgCost     float64
	Runs        int
}

TrendPoint represents a point in an accuracy trend

Source Files ¶

View all Source files

Directories ¶

Path	Synopsis
judges
llm

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL