package evaluation

v0.9.11
Published: Jan 27, 2026 License: MIT Imports: 17 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func GenerateRunID

func GenerateRunID() string

GenerateRunID generates a unique run ID

func GetDefaultCustomEvaluatorDir added in v0.9.10

func GetDefaultCustomEvaluatorDir() string

GetDefaultCustomEvaluatorDir returns the default directory for custom evaluators

func GetDefaultDatasetDir

func GetDefaultDatasetDir() string

GetDefaultDatasetDir returns the default directory for evaluation datasets

func GetDefaultResultsDir

func GetDefaultResultsDir() string

GetDefaultResultsDir returns the default directory for evaluation results

func SaveDataset

func SaveDataset(dataset *Dataset, filepath string) error

SaveDataset saves a dataset to a YAML file

func SaveResults

func SaveResults(run *EvaluationRun, format string) (string, error)

SaveResults saves evaluation results to a file

func SaveResultsToFile

func SaveResultsToFile(run *EvaluationRun, filePath, format string) error

SaveResultsToFile saves evaluation results to a specific file path
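A minimal sketch of persisting a finished run, assuming run is an *EvaluationRun produced by a Runner and that "json" is an accepted format value (the set of format strings is not enumerated in these docs):

path, err := SaveResults(run, "json") // writes under the default results dir
if err != nil {
	log.Fatal(err)
}
fmt.Println("results written to", path)

// Or pick the destination explicitly.
if err := SaveResultsToFile(run, "/tmp/eval-results.json", "json"); err != nil {
	log.Fatal(err)
}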

func UnmarshalYAML added in v0.9.10

func UnmarshalYAML(data []byte, v interface{}) error

UnmarshalYAML is a helper function to unmarshal YAML data

Types

type AccuracyEvaluator

type AccuracyEvaluator struct {
	// contains filtered or unexported fields
}

AccuracyEvaluator evaluates semantic similarity between expected and actual answers

func NewAccuracyEvaluator

func NewAccuracyEvaluator(llmClient llm.Client) *AccuracyEvaluator

NewAccuracyEvaluator creates a new accuracy evaluator

func (*AccuracyEvaluator) Evaluate

func (e *AccuracyEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate computes semantic similarity score

func (*AccuracyEvaluator) Name

func (e *AccuracyEvaluator) Name() string

Name returns the evaluator name
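A sketch of scoring a single answer, assuming ctx is a context.Context, llmClient is a configured llm.Client, and tc is a *TestCase with Query and ExpectedAnswer set (thresholds such as MinAccuracyScore suggest scores on a 0–1 scale, though the range is not stated here):

eval := NewAccuracyEvaluator(llmClient)
score, err := eval.Evaluate(ctx, tc, actualAnswer, actualCitations)
if err != nil {
	log.Fatal(err)
}
fmt.Printf("%s: %.2f\n", eval.Name(), score)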

type CitationEvaluator

type CitationEvaluator struct{}

CitationEvaluator evaluates citation quality

func NewCitationEvaluator

func NewCitationEvaluator() *CitationEvaluator

NewCitationEvaluator creates a new citation evaluator

func (*CitationEvaluator) Evaluate

func (e *CitationEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate checks citation presence and format

func (*CitationEvaluator) Name

func (e *CitationEvaluator) Name() string

Name returns the evaluator name

type ContextRelevanceEvaluator added in v0.9.10

type ContextRelevanceEvaluator struct {
	// contains filtered or unexported fields
}

ContextRelevanceEvaluator evaluates how relevant retrieved context is to the query

func NewContextRelevanceEvaluator added in v0.9.10

func NewContextRelevanceEvaluator(llmClient llm.Client) *ContextRelevanceEvaluator

NewContextRelevanceEvaluator creates a new context relevance evaluator

func (*ContextRelevanceEvaluator) Evaluate added in v0.9.10

func (e *ContextRelevanceEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate scores how relevant the retrieved context is to the query

func (*ContextRelevanceEvaluator) Name added in v0.9.10

func (e *ContextRelevanceEvaluator) Name() string

Name returns the evaluator name

type CustomEvaluator added in v0.9.10

type CustomEvaluator struct {
	// contains filtered or unexported fields
}

CustomEvaluator executes custom evaluator definitions

func NewCustomEvaluator added in v0.9.10

func NewCustomEvaluator(def *CustomEvaluatorDef, llmClient llm.Client) *CustomEvaluator

NewCustomEvaluator creates a custom evaluator from a definition

func (*CustomEvaluator) Evaluate added in v0.9.10

func (e *CustomEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate executes the custom evaluator

func (*CustomEvaluator) Name added in v0.9.10

func (e *CustomEvaluator) Name() string

Name returns the evaluator name

type CustomEvaluatorDef added in v0.9.10

type CustomEvaluatorDef struct {
	Name        string        `yaml:"name"`
	Description string        `yaml:"description"`
	Version     string        `yaml:"version"`
	Prompt      string        `yaml:"prompt"`
	Scoring     ScoringConfig `yaml:"scoring"`
	Model       *ModelConfig  `yaml:"model,omitempty"`
	Tags        []string      `yaml:"tags,omitempty"`
	Author      string        `yaml:"author,omitempty"`
	Required    bool          `yaml:"required"`
}

CustomEvaluatorDef defines a custom evaluator from YAML configuration

func (*CustomEvaluatorDef) Validate added in v0.9.10

func (d *CustomEvaluatorDef) Validate() error

Validate checks if the evaluator definition is valid
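A hypothetical YAML definition matching the struct tags above; the field values, model name, and prompt contract (including any placeholder syntax) are illustrative, not prescribed by this package:

name: politeness
description: Scores whether the answer stays polite and professional
version: "1.0"
prompt: |
  Rate from 0.0 to 1.0 how polite and professional the answer is.
scoring:
  type: llm_judge
  threshold: 0.7
  weight: 1.0
model:
  provider: openai
  name: gpt-4o        # hypothetical model name
  temperature: 0.0
tags: [style]
required: false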

type CustomEvaluatorLoader added in v0.9.10

type CustomEvaluatorLoader struct {
	// contains filtered or unexported fields
}

CustomEvaluatorLoader loads custom evaluator definitions from YAML files

func NewCustomEvaluatorLoader added in v0.9.10

func NewCustomEvaluatorLoader(baseDir string) *CustomEvaluatorLoader

NewCustomEvaluatorLoader creates a loader for custom evaluators

func (*CustomEvaluatorLoader) Load added in v0.9.10

func (l *CustomEvaluatorLoader) Load(name string) (*CustomEvaluatorDef, error)

Load loads a single custom evaluator by name

func (*CustomEvaluatorLoader) LoadAll added in v0.9.10

func (l *CustomEvaluatorLoader) LoadAll() ([]*CustomEvaluatorDef, error)

LoadAll loads all custom evaluators from the directory
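A sketch of loading every definition from the default directory and wrapping the valid ones, assuming llmClient is a configured llm.Client:

loader := NewCustomEvaluatorLoader(GetDefaultCustomEvaluatorDir())
defs, err := loader.LoadAll()
if err != nil {
	log.Fatal(err)
}
evaluators := make([]Evaluator, 0, len(defs))
for _, def := range defs {
	if err := def.Validate(); err != nil {
		continue // skip invalid definitions
	}
	evaluators = append(evaluators, NewCustomEvaluator(def, llmClient))
}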

type Dataset

type Dataset struct {
	// Metadata
	Name        string   `yaml:"name"`
	Description string   `yaml:"description"`
	Version     string   `yaml:"version"`
	Author      string   `yaml:"author,omitempty"`
	Tags        []string `yaml:"tags,omitempty"`

	// Configuration
	Config DatasetConfig `yaml:"config"`

	// Custom evaluators to run (optional)
	CustomEvaluators []string `yaml:"custom_evaluators,omitempty"`

	// Test cases
	TestCases []TestCase `yaml:"test_cases"`
}

Dataset represents an evaluation dataset with test cases

func ListDatasets

func ListDatasets() ([]*Dataset, error)

ListDatasets returns all datasets in the default directory

func LoadDataset

func LoadDataset(filepath string) (*Dataset, error)

LoadDataset loads a dataset from a YAML file

func (*Dataset) Validate

func (d *Dataset) Validate() error

Validate validates the dataset structure
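A hypothetical dataset file matching the struct tags above; agent, collection, and file names are illustrative:

name: support-faq
description: Smoke tests for the support agent
version: "1.0"
config:
  default_agent: support-agent
  default_collection: faq-docs
  min_accuracy_score: 0.7
  min_citation_score: 0.5
custom_evaluators:
  - politeness
test_cases:
  - id: refund-policy
    query: What is the refund window?
    expected_answer: Purchases can be refunded within 30 days of delivery.
    expected_citations:
      - refund-policy.md
    must_cite: true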

type DatasetConfig

type DatasetConfig struct {
	// Agent to use (optional - can be overridden at runtime)
	DefaultAgent string `yaml:"default_agent,omitempty"`

	// Collection to query (optional - can be overridden at runtime)
	DefaultCollection string `yaml:"default_collection,omitempty"`

	// Evaluation thresholds
	MinAccuracyScore   float64 `yaml:"min_accuracy_score,omitempty"`
	MinCitationScore   float64 `yaml:"min_citation_score,omitempty"`
	AllowHallucination bool    `yaml:"allow_hallucination,omitempty"`
}

DatasetConfig contains dataset-level configuration

type EvaluationConfig

type EvaluationConfig struct {
	AgentPath  string            `json:"agent_path" yaml:"agent_path"`
	Parameters map[string]string `json:"parameters,omitempty" yaml:"parameters,omitempty"`
}

EvaluationConfig stores the configuration used for the evaluation

type EvaluationRun

type EvaluationRun struct {
	// Metadata
	ID          string    `json:"id" yaml:"id"`
	Timestamp   time.Time `json:"timestamp" yaml:"timestamp"`
	DatasetName string    `json:"dataset_name" yaml:"dataset_name"`
	AgentName   string    `json:"agent_name" yaml:"agent_name"`
	Collection  string    `json:"collection,omitempty" yaml:"collection,omitempty"`

	// Configuration
	Config EvaluationConfig `json:"config" yaml:"config"`

	// Results
	Results []TestCaseResult `json:"results" yaml:"results"`

	// Summary
	Summary EvaluationSummary `json:"summary" yaml:"summary"`
}

EvaluationRun represents a complete evaluation run

func ListResults

func ListResults() ([]*EvaluationRun, error)

ListResults returns all available evaluation runs

func LoadResults

func LoadResults(runID string) (*EvaluationRun, error)

LoadResults loads evaluation results from a file

type EvaluationSummary

type EvaluationSummary struct {
	TotalTests  int     `json:"total_tests" yaml:"total_tests"`
	PassedTests int     `json:"passed_tests" yaml:"passed_tests"`
	FailedTests int     `json:"failed_tests" yaml:"failed_tests"`
	PassRate    float64 `json:"pass_rate" yaml:"pass_rate"`

	// Average scores
	AvgAccuracy         float64 `json:"avg_accuracy" yaml:"avg_accuracy"`
	AvgCitation         float64 `json:"avg_citation" yaml:"avg_citation"`
	AvgHallucination    float64 `json:"avg_hallucination" yaml:"avg_hallucination"`
	AvgContextRelevance float64 `json:"avg_context_relevance" yaml:"avg_context_relevance"`
	AvgFaithfulness     float64 `json:"avg_faithfulness" yaml:"avg_faithfulness"`

	// Performance
	TotalTime float64 `json:"total_time_ms" yaml:"total_time_ms"`
	AvgTime   float64 `json:"avg_time_ms" yaml:"avg_time_ms"`

	// Duration
	StartTime time.Time     `json:"start_time" yaml:"start_time"`
	EndTime   time.Time     `json:"end_time" yaml:"end_time"`
	Duration  time.Duration `json:"duration" yaml:"duration"`
}

EvaluationSummary contains aggregate statistics

func CalculateSummary

func CalculateSummary(results []TestCaseResult, startTime, endTime time.Time) EvaluationSummary

CalculateSummary calculates summary statistics from results

type Evaluator

type Evaluator interface {
	Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
	Name() string
}

Evaluator interface for all evaluators
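The interface is small enough that rule-based checks need no LLM. A hypothetical exact-match implementation, shown as a sketch (this type is not part of the package; it needs only the context and strings imports):

// ExactMatchEvaluator scores 1.0 when the actual answer equals the
// expected answer after trimming surrounding whitespace, else 0.0.
type ExactMatchEvaluator struct{}

func (e *ExactMatchEvaluator) Name() string { return "exact_match" }

func (e *ExactMatchEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error) {
	if strings.TrimSpace(actual) == strings.TrimSpace(testCase.ExpectedAnswer) {
		return 1.0, nil
	}
	return 0.0, nil
}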

type EvaluatorProvider added in v0.9.10

type EvaluatorProvider interface {
	// Name returns the provider name (e.g., "local", "opik")
	Name() string

	// GetAccuracyEvaluator returns an evaluator for semantic accuracy
	GetAccuracyEvaluator() Evaluator

	// GetFaithfulnessEvaluator returns an evaluator for faithfulness/groundedness
	GetFaithfulnessEvaluator() Evaluator

	// GetHallucinationEvaluator returns an evaluator for hallucination detection
	GetHallucinationEvaluator() Evaluator

	// GetContextRelevanceEvaluator returns an evaluator for context relevance
	GetContextRelevanceEvaluator() Evaluator

	// IsAvailable checks if the provider is properly configured
	IsAvailable(ctx context.Context) bool
}

EvaluatorProvider creates evaluators for different evaluation types. This interface allows pluggable evaluator backends (local, Opik, LangSmith, etc.)

func CreateProvider added in v0.9.10

func CreateProvider(ctx context.Context, providerType ProviderType, llmClient llm.Client) (EvaluatorProvider, error)

CreateProvider creates an evaluator provider of the specified type
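A sketch of provider selection with a fallback to the local provider, assuming ctx and llmClient are in scope:

provider, err := CreateProvider(ctx, ProviderTypeOpik, llmClient)
if err != nil || !provider.IsAvailable(ctx) {
	// Fall back to the default local LLM-as-judge provider.
	provider, err = CreateProvider(ctx, ProviderTypeLocal, llmClient)
	if err != nil {
		log.Fatal(err)
	}
}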

type FaithfulnessEvaluator added in v0.9.10

type FaithfulnessEvaluator struct {
	// contains filtered or unexported fields
}

FaithfulnessEvaluator evaluates whether the answer is supported by the retrieved context

func NewFaithfulnessEvaluator added in v0.9.10

func NewFaithfulnessEvaluator(llmClient llm.Client) *FaithfulnessEvaluator

NewFaithfulnessEvaluator creates a new faithfulness evaluator

func (*FaithfulnessEvaluator) Evaluate added in v0.9.10

func (e *FaithfulnessEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate verifies that the answer is supported by the retrieved context

func (*FaithfulnessEvaluator) Name added in v0.9.10

func (e *FaithfulnessEvaluator) Name() string

Name returns the evaluator name

type HallucinationDetector

type HallucinationDetector struct {
	// contains filtered or unexported fields
}

HallucinationDetector detects potential hallucinations

func NewHallucinationDetector

func NewHallucinationDetector(llmClient llm.Client) *HallucinationDetector

NewHallucinationDetector creates a new hallucination detector

func (*HallucinationDetector) Evaluate

func (e *HallucinationDetector) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate detects hallucinations (returns score where 1.0 = no hallucination)

func (*HallucinationDetector) Name

func (e *HallucinationDetector) Name() string

Name returns the evaluator name

type LocalProvider added in v0.9.10

type LocalProvider struct {
	// contains filtered or unexported fields
}

LocalProvider uses our own LLM-as-judge evaluators. This is the default provider that uses OpenAI/Claude/etc. directly.

func NewLocalProvider added in v0.9.10

func NewLocalProvider(llmClient llm.Client) *LocalProvider

NewLocalProvider creates a local evaluator provider

func (*LocalProvider) GetAccuracyEvaluator added in v0.9.10

func (p *LocalProvider) GetAccuracyEvaluator() Evaluator

GetAccuracyEvaluator returns our AccuracyEvaluator

func (*LocalProvider) GetContextRelevanceEvaluator added in v0.9.10

func (p *LocalProvider) GetContextRelevanceEvaluator() Evaluator

GetContextRelevanceEvaluator returns our ContextRelevanceEvaluator

func (*LocalProvider) GetFaithfulnessEvaluator added in v0.9.10

func (p *LocalProvider) GetFaithfulnessEvaluator() Evaluator

GetFaithfulnessEvaluator returns our FaithfulnessEvaluator

func (*LocalProvider) GetHallucinationEvaluator added in v0.9.10

func (p *LocalProvider) GetHallucinationEvaluator() Evaluator

GetHallucinationEvaluator returns our HallucinationDetector

func (*LocalProvider) IsAvailable added in v0.9.10

func (p *LocalProvider) IsAvailable(ctx context.Context) bool

IsAvailable checks if local provider is available

func (*LocalProvider) Name added in v0.9.10

func (p *LocalProvider) Name() string

Name returns "local"

type ModelConfig added in v0.9.10

type ModelConfig struct {
	Provider    string  `yaml:"provider"`    // openai, anthropic
	Name        string  `yaml:"name"`        // model name
	Temperature float64 `yaml:"temperature"` // temperature for generation
}

ModelConfig specifies which LLM model to use

type OpikAccuracyEvaluator added in v0.9.10

type OpikAccuracyEvaluator struct {
	// contains filtered or unexported fields
}

OpikAccuracyEvaluator evaluates accuracy using Opik

func (*OpikAccuracyEvaluator) Evaluate added in v0.9.10

func (e *OpikAccuracyEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate runs local accuracy evaluation and sends trace to Opik

func (*OpikAccuracyEvaluator) Name added in v0.9.10

func (e *OpikAccuracyEvaluator) Name() string

Name returns "accuracy"

type OpikContextRelevanceEvaluator added in v0.9.10

type OpikContextRelevanceEvaluator struct {
	// contains filtered or unexported fields
}

OpikContextRelevanceEvaluator evaluates context relevance using Opik

func (*OpikContextRelevanceEvaluator) Evaluate added in v0.9.10

func (e *OpikContextRelevanceEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate runs local context relevance evaluation and sends trace to Opik

func (*OpikContextRelevanceEvaluator) Name added in v0.9.10

func (e *OpikContextRelevanceEvaluator) Name() string

Name returns "context_relevance"

type OpikFaithfulnessEvaluator added in v0.9.10

type OpikFaithfulnessEvaluator struct {
	// contains filtered or unexported fields
}

OpikFaithfulnessEvaluator evaluates faithfulness using Opik

func (*OpikFaithfulnessEvaluator) Evaluate added in v0.9.10

func (e *OpikFaithfulnessEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate runs local faithfulness evaluation and sends trace to Opik

func (*OpikFaithfulnessEvaluator) Name added in v0.9.10

Name returns "faithfulness"

type OpikHallucinationEvaluator added in v0.9.10

type OpikHallucinationEvaluator struct {
	// contains filtered or unexported fields
}

OpikHallucinationEvaluator evaluates hallucination using Opik

func (*OpikHallucinationEvaluator) Evaluate added in v0.9.10

func (e *OpikHallucinationEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate runs local hallucination evaluation and sends trace to Opik

func (*OpikHallucinationEvaluator) Name added in v0.9.10

Name returns "hallucination"

type OpikProvider added in v0.9.10

type OpikProvider struct {
	// contains filtered or unexported fields
}

OpikProvider uses local LLM-as-judge evaluators and sends traces to Opik dashboard. This provides real evaluation scores in CLI output while enabling rich visualization in the Opik dashboard for production monitoring and analysis.

func NewOpikProvider added in v0.9.10

func NewOpikProvider(config *llm.OpikConfig, llmClient llm.Client) (*OpikProvider, error)

NewOpikProvider creates an Opik evaluator provider

func (*OpikProvider) GetAccuracyEvaluator added in v0.9.10

func (p *OpikProvider) GetAccuracyEvaluator() Evaluator

GetAccuracyEvaluator returns an Opik accuracy evaluator

func (*OpikProvider) GetContextRelevanceEvaluator added in v0.9.10

func (p *OpikProvider) GetContextRelevanceEvaluator() Evaluator

GetContextRelevanceEvaluator returns an Opik context relevance evaluator

func (*OpikProvider) GetFaithfulnessEvaluator added in v0.9.10

func (p *OpikProvider) GetFaithfulnessEvaluator() Evaluator

GetFaithfulnessEvaluator returns an Opik faithfulness evaluator

func (*OpikProvider) GetHallucinationEvaluator added in v0.9.10

func (p *OpikProvider) GetHallucinationEvaluator() Evaluator

GetHallucinationEvaluator returns an Opik hallucination evaluator

func (*OpikProvider) IsAvailable added in v0.9.10

func (p *OpikProvider) IsAvailable(ctx context.Context) bool

IsAvailable checks if Opik provider is available

func (*OpikProvider) Name added in v0.9.10

func (p *OpikProvider) Name() string

Name returns "opik"

func (*OpikProvider) Shutdown added in v0.9.10

func (p *OpikProvider) Shutdown(ctx context.Context) error

Shutdown gracefully shuts down the Opik provider
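A sketch of constructing an Opik-backed provider with a graceful shutdown, assuming cfg is a populated *llm.OpikConfig and llmClient a configured llm.Client:

provider, err := NewOpikProvider(cfg, llmClient)
if err != nil {
	log.Fatal(err)
}
// Flush pending traces to the Opik dashboard before exiting.
defer provider.Shutdown(context.Background())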

type ProviderType added in v0.9.10

type ProviderType string

ProviderType represents the type of evaluator provider

const (
	// ProviderTypeLocal uses our own LLM-as-judge evaluators
	ProviderTypeLocal ProviderType = "local"

	// ProviderTypeOpik uses Opik's evaluators via OpenTelemetry
	ProviderTypeOpik ProviderType = "opik"
)

func GetAvailableProviders added in v0.9.10

func GetAvailableProviders(ctx context.Context, llmClient llm.Client) []ProviderType

GetAvailableProviders returns a list of available provider types

func GetDefaultProvider added in v0.9.10

func GetDefaultProvider() ProviderType

GetDefaultProvider returns the default provider type

type Runner

type Runner struct {
	// contains filtered or unexported fields
}

Runner executes evaluation runs

func NewRunner

func NewRunner(llmClient llm.Client) *Runner

NewRunner creates a new evaluation runner

func (*Runner) RunEvaluation

func (r *Runner) RunEvaluation(ctx context.Context, dataset *Dataset, agentName string, collection string) (*EvaluationRun, error)

RunEvaluation executes a complete evaluation using the default local provider

func (*Runner) RunEvaluationWithProvider added in v0.9.10

func (r *Runner) RunEvaluationWithProvider(ctx context.Context, dataset *Dataset, agentName string, collection string, provider EvaluatorProvider) (*EvaluationRun, error)

RunEvaluationWithProvider executes a complete evaluation using the specified provider
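An end-to-end sketch: load a dataset, run it against an agent, and persist the results. The dataset path, agent, and collection names are hypothetical; llmClient is assumed to be a configured llm.Client:

dataset, err := LoadDataset("datasets/support-faq.yaml")
if err != nil {
	log.Fatal(err)
}
runner := NewRunner(llmClient)
run, err := runner.RunEvaluation(ctx, dataset, "support-agent", "faq-docs")
if err != nil {
	log.Fatal(err)
}
fmt.Printf("passed %d/%d (pass rate %.2f)\n",
	run.Summary.PassedTests, run.Summary.TotalTests, run.Summary.PassRate)
if _, err := SaveResults(run, "json"); err != nil {
	log.Fatal(err)
}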

type ScoringConfig added in v0.9.10

type ScoringConfig struct {
	Type      string  `yaml:"type"`              // llm_judge, regex, exact_match, contains
	Threshold float64 `yaml:"threshold"`         // Minimum passing score
	Weight    float64 `yaml:"weight"`            // Weight in overall evaluation (default 1.0)
	Pattern   string  `yaml:"pattern,omitempty"` // For regex type
}

ScoringConfig defines how to score the evaluation

type TestCase

type TestCase struct {
	// Identification
	ID          string   `yaml:"id"`
	Description string   `yaml:"description,omitempty"`
	Tags        []string `yaml:"tags,omitempty"`

	// Input
	Query      string `yaml:"query"`
	Collection string `yaml:"collection,omitempty"` // Override default collection

	// Expected output
	ExpectedAnswer    string   `yaml:"expected_answer"`
	ExpectedCitations []string `yaml:"expected_citations,omitempty"`
	RequiredConcepts  []string `yaml:"required_concepts,omitempty"`

	// Retrieved context (for advanced evaluation)
	RetrievedContext []string `yaml:"retrieved_context,omitempty"`

	// Evaluation criteria
	MinRelevanceScore float64 `yaml:"min_relevance_score,omitempty"`
	MustCite          bool    `yaml:"must_cite,omitempty"`
}

TestCase represents a single evaluation test case

func (*TestCase) Validate

func (tc *TestCase) Validate() error

Validate validates a test case

type TestCaseResult

type TestCaseResult struct {
	// Test case info
	TestCaseID string `json:"test_case_id" yaml:"test_case_id"`
	Query      string `json:"query" yaml:"query"`

	// Actual output
	ActualAnswer    string   `json:"actual_answer" yaml:"actual_answer"`
	ActualCitations []string `json:"actual_citations,omitempty" yaml:"actual_citations,omitempty"`
	ResponseTime    float64  `json:"response_time_ms" yaml:"response_time_ms"`

	// Evaluation scores
	AccuracyScore         float64 `json:"accuracy_score" yaml:"accuracy_score"`
	CitationScore         float64 `json:"citation_score" yaml:"citation_score"`
	HallucinationScore    float64 `json:"hallucination_score" yaml:"hallucination_score"`
	ContextRelevanceScore float64 `json:"context_relevance_score" yaml:"context_relevance_score"`
	FaithfulnessScore     float64 `json:"faithfulness_score" yaml:"faithfulness_score"`

	// Custom evaluator scores
	CustomScores map[string]float64 `json:"custom_scores,omitempty" yaml:"custom_scores,omitempty"`

	// Pass/fail
	Passed bool     `json:"passed" yaml:"passed"`
	Errors []string `json:"errors,omitempty" yaml:"errors,omitempty"`

	// Details
	Details map[string]interface{} `json:"details,omitempty" yaml:"details,omitempty"`
}

TestCaseResult represents the result of evaluating a single test case

func EvaluateTestCase

func EvaluateTestCase(ctx context.Context, testCase *TestCase, actualAnswer string, actualCitations []string, provider EvaluatorProvider) (*TestCaseResult, error)

EvaluateTestCase runs all evaluators on a test case using the specified provider. Rule-based evaluators (citation) always run locally. LLM-based evaluators use the provider (local or Opik).

func EvaluateTestCaseWithDataset added in v0.9.11

func EvaluateTestCaseWithDataset(ctx context.Context, testCase *TestCase, actualAnswer string, actualCitations []string, provider EvaluatorProvider, dataset *Dataset) (*TestCaseResult, error)

EvaluateTestCaseWithDataset evaluates a test case with optional custom evaluators from dataset

func EvaluateTestCaseWithLLMClient added in v0.9.10

func EvaluateTestCaseWithLLMClient(ctx context.Context, testCase *TestCase, actualAnswer string, actualCitations []string, llmClient llm.Client) (*TestCaseResult, error)

EvaluateTestCaseWithLLMClient is a convenience wrapper that creates a LocalProvider. This maintains backward compatibility with existing code.
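A sketch of scoring one answer without a Runner via the backward-compatible wrapper; tc, actualAnswer, and citations are assumed in scope:

result, err := EvaluateTestCaseWithLLMClient(ctx, tc, actualAnswer, citations, llmClient)
if err != nil {
	log.Fatal(err)
}
fmt.Printf("passed=%v accuracy=%.2f faithfulness=%.2f\n",
	result.Passed, result.AccuracyScore, result.FaithfulnessScore)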
