Documentation ¶
Index ¶
- func GenerateRunID() string
- func GetDefaultCustomEvaluatorDir() string
- func GetDefaultDatasetDir() string
- func GetDefaultResultsDir() string
- func SaveDataset(dataset *Dataset, filepath string) error
- func SaveResults(run *EvaluationRun, format string) (string, error)
- func SaveResultsToFile(run *EvaluationRun, filePath, format string) error
- func UnmarshalYAML(data []byte, v interface{}) error
- type AccuracyEvaluator
- type CitationEvaluator
- type ContextRelevanceEvaluator
- type CustomEvaluator
- type CustomEvaluatorDef
- type CustomEvaluatorLoader
- type Dataset
- type DatasetConfig
- type EvaluationConfig
- type EvaluationRun
- type EvaluationSummary
- type Evaluator
- type EvaluatorProvider
- type FaithfulnessEvaluator
- type HallucinationDetector
- type LocalProvider
- func (p *LocalProvider) GetAccuracyEvaluator() Evaluator
- func (p *LocalProvider) GetContextRelevanceEvaluator() Evaluator
- func (p *LocalProvider) GetFaithfulnessEvaluator() Evaluator
- func (p *LocalProvider) GetHallucinationEvaluator() Evaluator
- func (p *LocalProvider) IsAvailable(ctx context.Context) bool
- func (p *LocalProvider) Name() string
- type ModelConfig
- type OpikAccuracyEvaluator
- type OpikContextRelevanceEvaluator
- type OpikFaithfulnessEvaluator
- type OpikHallucinationEvaluator
- type OpikProvider
- func (p *OpikProvider) GetAccuracyEvaluator() Evaluator
- func (p *OpikProvider) GetContextRelevanceEvaluator() Evaluator
- func (p *OpikProvider) GetFaithfulnessEvaluator() Evaluator
- func (p *OpikProvider) GetHallucinationEvaluator() Evaluator
- func (p *OpikProvider) IsAvailable(ctx context.Context) bool
- func (p *OpikProvider) Name() string
- func (p *OpikProvider) Shutdown(ctx context.Context) error
- type ProviderType
- type Runner
- type ScoringConfig
- type TestCase
- type TestCaseResult
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GetDefaultCustomEvaluatorDir ¶ added in v0.9.10
func GetDefaultCustomEvaluatorDir() string
GetDefaultCustomEvaluatorDir returns the default directory for custom evaluators
func GetDefaultDatasetDir ¶
func GetDefaultDatasetDir() string
GetDefaultDatasetDir returns the default directory for evaluation datasets
func GetDefaultResultsDir ¶
func GetDefaultResultsDir() string
GetDefaultResultsDir returns the default directory for evaluation results
func SaveDataset ¶
func SaveDataset(dataset *Dataset, filepath string) error
SaveDataset saves a dataset to a YAML file
func SaveResults ¶
func SaveResults(run *EvaluationRun, format string) (string, error)
SaveResults saves evaluation results to a file
func SaveResultsToFile ¶
func SaveResultsToFile(run *EvaluationRun, filePath, format string) error
SaveResultsToFile saves evaluation results to a specific file path
func UnmarshalYAML ¶ added in v0.9.10
func UnmarshalYAML(data []byte, v interface{}) error
UnmarshalYAML is a helper function to unmarshal YAML data
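For illustration, a minimal sketch of parsing and validating a custom evaluator definition with UnmarshalYAML. The YAML keys mirror the CustomEvaluatorDef struct tags documented below; the document contents are invented for the example, and standard imports (log) are assumed to be in scope.

data := []byte(`
name: conciseness
description: Rewards short, direct answers
version: "1.0"
prompt: "Rate the conciseness of the answer from 0.0 to 1.0."
scoring:
  type: llm_judge
  threshold: 0.7
  weight: 1.0
`)

var def CustomEvaluatorDef
if err := UnmarshalYAML(data, &def); err != nil {
    log.Fatalf("parse evaluator definition: %v", err)
}
if err := def.Validate(); err != nil {
    log.Fatalf("invalid evaluator definition: %v", err)
}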
Types ¶
type AccuracyEvaluator ¶
type AccuracyEvaluator struct {
// contains filtered or unexported fields
}
AccuracyEvaluator evaluates semantic similarity between expected and actual answers
func NewAccuracyEvaluator ¶
func NewAccuracyEvaluator(llmClient llm.Client) *AccuracyEvaluator
NewAccuracyEvaluator creates a new accuracy evaluator
func (*AccuracyEvaluator) Evaluate ¶
func (e *AccuracyEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate computes semantic similarity score
func (*AccuracyEvaluator) Name ¶
func (e *AccuracyEvaluator) Name() string
Name returns the evaluator name
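A usage sketch for running the accuracy evaluator directly against a single test case. It assumes llmClient is an already-configured llm.Client and that the context, fmt, and log packages are imported; the test-case contents are invented.

evaluator := NewAccuracyEvaluator(llmClient)
tc := &TestCase{
    ID:             "capital-1",
    Query:          "What is the capital of France?",
    ExpectedAnswer: "Paris is the capital of France.",
}
score, err := evaluator.Evaluate(context.Background(), tc, "The capital of France is Paris.", nil)
if err != nil {
    log.Fatalf("evaluate: %v", err)
}
fmt.Printf("%s: %.2f\n", evaluator.Name(), score)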
type CitationEvaluator ¶
type CitationEvaluator struct{}
CitationEvaluator evaluates citation quality
func NewCitationEvaluator ¶
func NewCitationEvaluator() *CitationEvaluator
NewCitationEvaluator creates a new citation evaluator
func (*CitationEvaluator) Evaluate ¶
func (e *CitationEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate checks citation presence and format
func (*CitationEvaluator) Name ¶
func (e *CitationEvaluator) Name() string
Name returns the evaluator name
type ContextRelevanceEvaluator ¶ added in v0.9.10
type ContextRelevanceEvaluator struct {
// contains filtered or unexported fields
}
ContextRelevanceEvaluator evaluates how relevant retrieved context is to the query
func NewContextRelevanceEvaluator ¶ added in v0.9.10
func NewContextRelevanceEvaluator(llmClient llm.Client) *ContextRelevanceEvaluator
NewContextRelevanceEvaluator creates a new context relevance evaluator
func (*ContextRelevanceEvaluator) Evaluate ¶ added in v0.9.10
func (e *ContextRelevanceEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate scores how relevant the retrieved context is to the query
func (*ContextRelevanceEvaluator) Name ¶ added in v0.9.10
func (e *ContextRelevanceEvaluator) Name() string
Name returns the evaluator name
type CustomEvaluator ¶ added in v0.9.10
type CustomEvaluator struct {
// contains filtered or unexported fields
}
CustomEvaluator executes custom evaluator definitions
func NewCustomEvaluator ¶ added in v0.9.10
func NewCustomEvaluator(def *CustomEvaluatorDef, llmClient llm.Client) *CustomEvaluator
NewCustomEvaluator creates a custom evaluator from a definition
func (*CustomEvaluator) Evaluate ¶ added in v0.9.10
func (e *CustomEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate executes the custom evaluator
func (*CustomEvaluator) Name ¶ added in v0.9.10
func (e *CustomEvaluator) Name() string
Name returns the evaluator name
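Continuing the definition parsed in the UnmarshalYAML sketch above, a custom evaluator is built from the definition and run like any other Evaluator. Here def, llmClient, ctx, and tc are assumed from the earlier sketches.

custom := NewCustomEvaluator(&def, llmClient)
score, err := custom.Evaluate(ctx, tc, "The capital of France is Paris.", nil)
if err != nil {
    log.Fatalf("custom evaluate: %v", err)
}
fmt.Printf("%s: %.2f\n", custom.Name(), score)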
type CustomEvaluatorDef ¶ added in v0.9.10
type CustomEvaluatorDef struct {
    Name        string        `yaml:"name"`
    Description string        `yaml:"description"`
    Version     string        `yaml:"version"`
    Prompt      string        `yaml:"prompt"`
    Scoring     ScoringConfig `yaml:"scoring"`
    Model       *ModelConfig  `yaml:"model,omitempty"`
    Tags        []string      `yaml:"tags,omitempty"`
    Author      string        `yaml:"author,omitempty"`
    Required    bool          `yaml:"required"`
}
CustomEvaluatorDef defines a custom evaluator from YAML configuration
func (*CustomEvaluatorDef) Validate ¶ added in v0.9.10
func (d *CustomEvaluatorDef) Validate() error
Validate checks if the evaluator definition is valid
type CustomEvaluatorLoader ¶ added in v0.9.10
type CustomEvaluatorLoader struct {
// contains filtered or unexported fields
}
CustomEvaluatorLoader loads custom evaluator definitions from YAML files
func NewCustomEvaluatorLoader ¶ added in v0.9.10
func NewCustomEvaluatorLoader(baseDir string) *CustomEvaluatorLoader
NewCustomEvaluatorLoader creates a loader for custom evaluators
func (*CustomEvaluatorLoader) Load ¶ added in v0.9.10
func (l *CustomEvaluatorLoader) Load(name string) (*CustomEvaluatorDef, error)
Load loads a single custom evaluator by name
func (*CustomEvaluatorLoader) LoadAll ¶ added in v0.9.10
func (l *CustomEvaluatorLoader) LoadAll() ([]*CustomEvaluatorDef, error)
LoadAll loads all custom evaluators from the directory
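A sketch of discovering every definition under the default directory, using only the functions documented here (log and fmt imports assumed):

loader := NewCustomEvaluatorLoader(GetDefaultCustomEvaluatorDir())
defs, err := loader.LoadAll()
if err != nil {
    log.Fatalf("load custom evaluators: %v", err)
}
for _, d := range defs {
    fmt.Printf("%s (v%s): %s\n", d.Name, d.Version, d.Description)
}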
type Dataset ¶
type Dataset struct {
    // Metadata
    Name        string   `yaml:"name"`
    Description string   `yaml:"description"`
    Version     string   `yaml:"version"`
    Author      string   `yaml:"author,omitempty"`
    Tags        []string `yaml:"tags,omitempty"`

    // Configuration
    Config DatasetConfig `yaml:"config"`

    // Custom evaluators to run (optional)
    CustomEvaluators []string `yaml:"custom_evaluators,omitempty"`

    // Test cases
    TestCases []TestCase `yaml:"test_cases"`
}
Dataset represents an evaluation dataset with test cases
func ListDatasets ¶
ListDatasets returns all datasets in the default directory
func LoadDataset ¶
LoadDataset loads a dataset from a YAML file
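Since LoadDataset's signature is not shown in this listing, the sketch below instead builds a Dataset in code and persists it with the documented SaveDataset. The field values and file path are illustrative.

ds := &Dataset{
    Name:        "smoke-test",
    Description: "Basic retrieval QA checks",
    Version:     "1.0",
    Config: DatasetConfig{
        MinAccuracyScore: 0.7,
        MinCitationScore: 0.5,
    },
    TestCases: []TestCase{{
        ID:             "capital-1",
        Query:          "What is the capital of France?",
        ExpectedAnswer: "Paris is the capital of France.",
        MustCite:       true,
    }},
}
if err := SaveDataset(ds, "datasets/smoke-test.yaml"); err != nil {
    log.Fatalf("save dataset: %v", err)
}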
type DatasetConfig ¶
type DatasetConfig struct {
    // Agent to use (optional - can be overridden at runtime)
    DefaultAgent string `yaml:"default_agent,omitempty"`

    // Collection to query (optional - can be overridden at runtime)
    DefaultCollection string `yaml:"default_collection,omitempty"`

    // Evaluation thresholds
    MinAccuracyScore   float64 `yaml:"min_accuracy_score,omitempty"`
    MinCitationScore   float64 `yaml:"min_citation_score,omitempty"`
    AllowHallucination bool    `yaml:"allow_hallucination,omitempty"`
}
DatasetConfig contains dataset-level configuration
type EvaluationConfig ¶
type EvaluationConfig struct {
    AgentPath  string            `json:"agent_path" yaml:"agent_path"`
    Parameters map[string]string `json:"parameters,omitempty" yaml:"parameters,omitempty"`
}
EvaluationConfig stores the configuration used for the evaluation
type EvaluationRun ¶
type EvaluationRun struct {
    // Metadata
    ID          string    `json:"id" yaml:"id"`
    Timestamp   time.Time `json:"timestamp" yaml:"timestamp"`
    DatasetName string    `json:"dataset_name" yaml:"dataset_name"`
    AgentName   string    `json:"agent_name" yaml:"agent_name"`
    Collection  string    `json:"collection,omitempty" yaml:"collection,omitempty"`

    // Configuration
    Config EvaluationConfig `json:"config" yaml:"config"`

    // Results
    Results []TestCaseResult `json:"results" yaml:"results"`

    // Summary
    Summary EvaluationSummary `json:"summary" yaml:"summary"`
}
EvaluationRun represents a complete evaluation run
func ListResults ¶
func ListResults() ([]*EvaluationRun, error)
ListResults returns all available evaluation runs
func LoadResults ¶
func LoadResults(runID string) (*EvaluationRun, error)
LoadResults loads evaluation results from a file
type EvaluationSummary ¶
type EvaluationSummary struct {
    TotalTests  int     `json:"total_tests" yaml:"total_tests"`
    PassedTests int     `json:"passed_tests" yaml:"passed_tests"`
    FailedTests int     `json:"failed_tests" yaml:"failed_tests"`
    PassRate    float64 `json:"pass_rate" yaml:"pass_rate"`

    // Average scores
    AvgAccuracy         float64 `json:"avg_accuracy" yaml:"avg_accuracy"`
    AvgCitation         float64 `json:"avg_citation" yaml:"avg_citation"`
    AvgHallucination    float64 `json:"avg_hallucination" yaml:"avg_hallucination"`
    AvgContextRelevance float64 `json:"avg_context_relevance" yaml:"avg_context_relevance"`
    AvgFaithfulness     float64 `json:"avg_faithfulness" yaml:"avg_faithfulness"`

    // Performance
    TotalTime float64 `json:"total_time_ms" yaml:"total_time_ms"`
    AvgTime   float64 `json:"avg_time_ms" yaml:"avg_time_ms"`

    // Duration
    StartTime time.Time     `json:"start_time" yaml:"start_time"`
    EndTime   time.Time     `json:"end_time" yaml:"end_time"`
    Duration  time.Duration `json:"duration" yaml:"duration"`
}
EvaluationSummary contains aggregate statistics
func CalculateSummary ¶
func CalculateSummary(results []TestCaseResult, startTime, endTime time.Time) EvaluationSummary
CalculateSummary calculates summary statistics from results
type Evaluator ¶
type Evaluator interface {
    Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
    Name() string
}
Evaluator interface for all evaluators
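Because the interface is small, rule-based checks are easy to plug in. A hypothetical exact-match evaluator (not part of this package; the strings import is assumed) might look like:

type ExactMatchEvaluator struct{}

func (e *ExactMatchEvaluator) Name() string { return "exact_match" }

func (e *ExactMatchEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error) {
    // Score 1.0 on a case-insensitive exact match, 0.0 otherwise.
    if strings.EqualFold(strings.TrimSpace(actual), strings.TrimSpace(testCase.ExpectedAnswer)) {
        return 1.0, nil
    }
    return 0.0, nil
}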
type EvaluatorProvider ¶ added in v0.9.10
type EvaluatorProvider interface {
    // Name returns the provider name (e.g., "local", "opik")
    Name() string
    // GetAccuracyEvaluator returns an evaluator for semantic accuracy
    GetAccuracyEvaluator() Evaluator
    // GetFaithfulnessEvaluator returns an evaluator for faithfulness/groundedness
    GetFaithfulnessEvaluator() Evaluator
    // GetHallucinationEvaluator returns an evaluator for hallucination detection
    GetHallucinationEvaluator() Evaluator
    // GetContextRelevanceEvaluator returns an evaluator for context relevance
    GetContextRelevanceEvaluator() Evaluator
    // IsAvailable checks if the provider is properly configured
    IsAvailable(ctx context.Context) bool
}
EvaluatorProvider creates evaluators for different evaluation types. This interface allows pluggable evaluator backends (local, Opik, LangSmith, etc.)
func CreateProvider ¶ added in v0.9.10
func CreateProvider(ctx context.Context, providerType ProviderType, llmClient llm.Client) (EvaluatorProvider, error)
CreateProvider creates an evaluator provider of the specified type
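A sketch of selecting a provider with a fallback to the local one; ctx and llmClient are assumed to exist, and the fallback policy is illustrative rather than prescribed by the package.

provider, err := CreateProvider(ctx, ProviderTypeOpik, llmClient)
if err != nil || !provider.IsAvailable(ctx) {
    // Fall back to the local LLM-as-judge provider.
    if provider, err = CreateProvider(ctx, ProviderTypeLocal, llmClient); err != nil {
        log.Fatalf("create provider: %v", err)
    }
}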
type FaithfulnessEvaluator ¶ added in v0.9.10
type FaithfulnessEvaluator struct {
// contains filtered or unexported fields
}
FaithfulnessEvaluator evaluates whether the answer is supported by the retrieved context
func NewFaithfulnessEvaluator ¶ added in v0.9.10
func NewFaithfulnessEvaluator(llmClient llm.Client) *FaithfulnessEvaluator
NewFaithfulnessEvaluator creates a new faithfulness evaluator
func (*FaithfulnessEvaluator) Evaluate ¶ added in v0.9.10
func (e *FaithfulnessEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate verifies that the answer is supported by the retrieved context
func (*FaithfulnessEvaluator) Name ¶ added in v0.9.10
func (e *FaithfulnessEvaluator) Name() string
Name returns the evaluator name
type HallucinationDetector ¶
type HallucinationDetector struct {
// contains filtered or unexported fields
}
HallucinationDetector detects potential hallucinations
func NewHallucinationDetector ¶
func NewHallucinationDetector(llmClient llm.Client) *HallucinationDetector
NewHallucinationDetector creates a new hallucination detector
func (*HallucinationDetector) Evaluate ¶
func (e *HallucinationDetector) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate detects hallucinations (returns score where 1.0 = no hallucination)
func (*HallucinationDetector) Name ¶
func (e *HallucinationDetector) Name() string
Name returns the evaluator name
type LocalProvider ¶ added in v0.9.10
type LocalProvider struct {
// contains filtered or unexported fields
}
LocalProvider uses our own LLM-as-judge evaluators. This is the default provider that uses OpenAI/Claude/etc. directly.
func NewLocalProvider ¶ added in v0.9.10
func NewLocalProvider(llmClient llm.Client) *LocalProvider
NewLocalProvider creates a local evaluator provider
func (*LocalProvider) GetAccuracyEvaluator ¶ added in v0.9.10
func (p *LocalProvider) GetAccuracyEvaluator() Evaluator
GetAccuracyEvaluator returns our AccuracyEvaluator
func (*LocalProvider) GetContextRelevanceEvaluator ¶ added in v0.9.10
func (p *LocalProvider) GetContextRelevanceEvaluator() Evaluator
GetContextRelevanceEvaluator returns our ContextRelevanceEvaluator
func (*LocalProvider) GetFaithfulnessEvaluator ¶ added in v0.9.10
func (p *LocalProvider) GetFaithfulnessEvaluator() Evaluator
GetFaithfulnessEvaluator returns our FaithfulnessEvaluator
func (*LocalProvider) GetHallucinationEvaluator ¶ added in v0.9.10
func (p *LocalProvider) GetHallucinationEvaluator() Evaluator
GetHallucinationEvaluator returns our HallucinationDetector
func (*LocalProvider) IsAvailable ¶ added in v0.9.10
func (p *LocalProvider) IsAvailable(ctx context.Context) bool
IsAvailable checks if the local provider is available
func (*LocalProvider) Name ¶ added in v0.9.10
func (p *LocalProvider) Name() string
Name returns "local"
type ModelConfig ¶ added in v0.9.10
type ModelConfig struct {
    Provider    string  `yaml:"provider"`    // openai, anthropic
    Name        string  `yaml:"name"`        // model name
    Temperature float64 `yaml:"temperature"` // temperature for generation
}
ModelConfig specifies which LLM model to use
type OpikAccuracyEvaluator ¶ added in v0.9.10
type OpikAccuracyEvaluator struct {
// contains filtered or unexported fields
}
OpikAccuracyEvaluator evaluates accuracy using Opik
func (*OpikAccuracyEvaluator) Evaluate ¶ added in v0.9.10
func (e *OpikAccuracyEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate runs local accuracy evaluation and sends trace to Opik
func (*OpikAccuracyEvaluator) Name ¶ added in v0.9.10
func (e *OpikAccuracyEvaluator) Name() string
Name returns "accuracy"
type OpikContextRelevanceEvaluator ¶ added in v0.9.10
type OpikContextRelevanceEvaluator struct {
// contains filtered or unexported fields
}
OpikContextRelevanceEvaluator evaluates context relevance using Opik
func (*OpikContextRelevanceEvaluator) Evaluate ¶ added in v0.9.10
func (e *OpikContextRelevanceEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate runs local context relevance evaluation and sends trace to Opik
func (*OpikContextRelevanceEvaluator) Name ¶ added in v0.9.10
func (e *OpikContextRelevanceEvaluator) Name() string
Name returns "context_relevance"
type OpikFaithfulnessEvaluator ¶ added in v0.9.10
type OpikFaithfulnessEvaluator struct {
// contains filtered or unexported fields
}
OpikFaithfulnessEvaluator evaluates faithfulness using Opik
func (*OpikFaithfulnessEvaluator) Evaluate ¶ added in v0.9.10
func (e *OpikFaithfulnessEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate runs local faithfulness evaluation and sends trace to Opik
func (*OpikFaithfulnessEvaluator) Name ¶ added in v0.9.10
func (e *OpikFaithfulnessEvaluator) Name() string
Name returns "faithfulness"
type OpikHallucinationEvaluator ¶ added in v0.9.10
type OpikHallucinationEvaluator struct {
// contains filtered or unexported fields
}
OpikHallucinationEvaluator evaluates hallucination using Opik
func (*OpikHallucinationEvaluator) Evaluate ¶ added in v0.9.10
func (e *OpikHallucinationEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate runs local hallucination evaluation and sends trace to Opik
func (*OpikHallucinationEvaluator) Name ¶ added in v0.9.10
func (e *OpikHallucinationEvaluator) Name() string
Name returns "hallucination"
type OpikProvider ¶ added in v0.9.10
type OpikProvider struct {
// contains filtered or unexported fields
}
OpikProvider uses local LLM-as-judge evaluators and sends traces to Opik dashboard. This provides real evaluation scores in CLI output while enabling rich visualization in the Opik dashboard for production monitoring and analysis.
func NewOpikProvider ¶ added in v0.9.10
func NewOpikProvider(config *llm.OpikConfig, llmClient llm.Client) (*OpikProvider, error)
NewOpikProvider creates an Opik evaluator provider
func (*OpikProvider) GetAccuracyEvaluator ¶ added in v0.9.10
func (p *OpikProvider) GetAccuracyEvaluator() Evaluator
GetAccuracyEvaluator returns an Opik accuracy evaluator
func (*OpikProvider) GetContextRelevanceEvaluator ¶ added in v0.9.10
func (p *OpikProvider) GetContextRelevanceEvaluator() Evaluator
GetContextRelevanceEvaluator returns an Opik context relevance evaluator
func (*OpikProvider) GetFaithfulnessEvaluator ¶ added in v0.9.10
func (p *OpikProvider) GetFaithfulnessEvaluator() Evaluator
GetFaithfulnessEvaluator returns an Opik faithfulness evaluator
func (*OpikProvider) GetHallucinationEvaluator ¶ added in v0.9.10
func (p *OpikProvider) GetHallucinationEvaluator() Evaluator
GetHallucinationEvaluator returns an Opik hallucination evaluator
func (*OpikProvider) IsAvailable ¶ added in v0.9.10
func (p *OpikProvider) IsAvailable(ctx context.Context) bool
IsAvailable checks if the Opik provider is available
func (*OpikProvider) Name ¶ added in v0.9.10
func (p *OpikProvider) Name() string
Name returns "opik"
type ProviderType ¶ added in v0.9.10
type ProviderType string
ProviderType represents the type of evaluator provider
const (
    // ProviderTypeLocal uses our own LLM-as-judge evaluators
    ProviderTypeLocal ProviderType = "local"

    // ProviderTypeOpik uses Opik's evaluators via OpenTelemetry
    ProviderTypeOpik ProviderType = "opik"
)
func GetAvailableProviders ¶ added in v0.9.10
func GetAvailableProviders(ctx context.Context, llmClient llm.Client) []ProviderType
GetAvailableProviders returns a list of available provider types
func GetDefaultProvider ¶ added in v0.9.10
func GetDefaultProvider() ProviderType
GetDefaultProvider returns the default provider type
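A small sketch of inspecting what is available at runtime; ctx and llmClient are assumed from the earlier sketches.

for _, p := range GetAvailableProviders(ctx, llmClient) {
    fmt.Println("available provider:", p)
}
fmt.Println("default provider:", GetDefaultProvider())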
type Runner ¶
type Runner struct {
// contains filtered or unexported fields
}
Runner executes evaluation runs
func (*Runner) RunEvaluation ¶
func (r *Runner) RunEvaluation(ctx context.Context, dataset *Dataset, agentName string, collection string) (*EvaluationRun, error)
RunEvaluation executes a complete evaluation using the default local provider
func (*Runner) RunEvaluationWithProvider ¶ added in v0.9.10
func (r *Runner) RunEvaluationWithProvider(ctx context.Context, dataset *Dataset, agentName string, collection string, provider EvaluatorProvider) (*EvaluationRun, error)
RunEvaluationWithProvider executes a complete evaluation using the specified provider
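A sketch of a full run followed by persisting the results. It assumes r is a *Runner obtained elsewhere (no constructor appears in this listing), that ds and provider come from the sketches above, that the agent and collection names are illustrative, and that "json" is an accepted format string for SaveResults.

run, err := r.RunEvaluationWithProvider(ctx, ds, "support-agent", "docs", provider)
if err != nil {
    log.Fatalf("evaluation failed: %v", err)
}
path, err := SaveResults(run, "json")
if err != nil {
    log.Fatalf("save results: %v", err)
}
fmt.Printf("%d/%d passed; results written to %s\n", run.Summary.PassedTests, run.Summary.TotalTests, path)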
type ScoringConfig ¶ added in v0.9.10
type ScoringConfig struct {
    Type      string  `yaml:"type"`              // llm_judge, regex, exact_match, contains
    Threshold float64 `yaml:"threshold"`         // Minimum passing score
    Weight    float64 `yaml:"weight"`            // Weight in overall evaluation (default 1.0)
    Pattern   string  `yaml:"pattern,omitempty"` // For regex type
}
ScoringConfig defines how to score the evaluation
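Two illustrative configurations, one LLM-judged and one regex-based; the type values come from the field comment above, and the threshold and pattern values are invented for the example.

judged := ScoringConfig{Type: "llm_judge", Threshold: 0.7, Weight: 1.0}

matched := ScoringConfig{
    Type:      "regex",
    Pattern:   `(?i)\bparis\b`, // case-insensitive match on the expected term
    Threshold: 1.0,
}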
type TestCase ¶
type TestCase struct {
    // Identification
    ID          string   `yaml:"id"`
    Description string   `yaml:"description,omitempty"`
    Tags        []string `yaml:"tags,omitempty"`

    // Input
    Query      string `yaml:"query"`
    Collection string `yaml:"collection,omitempty"` // Override default collection

    // Expected output
    ExpectedAnswer    string   `yaml:"expected_answer"`
    ExpectedCitations []string `yaml:"expected_citations,omitempty"`
    RequiredConcepts  []string `yaml:"required_concepts,omitempty"`

    // Retrieved context (for advanced evaluation)
    RetrievedContext []string `yaml:"retrieved_context,omitempty"`

    // Evaluation criteria
    MinRelevanceScore float64 `yaml:"min_relevance_score,omitempty"`
    MustCite          bool    `yaml:"must_cite,omitempty"`
}
TestCase represents a single evaluation test case
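An illustrative test case exercising the citation and retrieved-context fields (all values invented); it is reused by the EvaluateTestCase sketch below.

tc := TestCase{
    ID:                "refund-policy-1",
    Query:             "How long is the refund window?",
    ExpectedAnswer:    "Customers may request a refund within 30 days of purchase.",
    ExpectedCitations: []string{"refund-policy.md"},
    RetrievedContext:  []string{"Refunds are accepted within 30 days of purchase."},
    MustCite:          true,
}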
type TestCaseResult ¶
type TestCaseResult struct {
    // Test case info
    TestCaseID string `json:"test_case_id" yaml:"test_case_id"`
    Query      string `json:"query" yaml:"query"`

    // Actual output
    ActualAnswer    string   `json:"actual_answer" yaml:"actual_answer"`
    ActualCitations []string `json:"actual_citations,omitempty" yaml:"actual_citations,omitempty"`
    ResponseTime    float64  `json:"response_time_ms" yaml:"response_time_ms"`

    // Evaluation scores
    AccuracyScore         float64 `json:"accuracy_score" yaml:"accuracy_score"`
    CitationScore         float64 `json:"citation_score" yaml:"citation_score"`
    HallucinationScore    float64 `json:"hallucination_score" yaml:"hallucination_score"`
    ContextRelevanceScore float64 `json:"context_relevance_score" yaml:"context_relevance_score"`
    FaithfulnessScore     float64 `json:"faithfulness_score" yaml:"faithfulness_score"`

    // Pass/fail
    Passed bool     `json:"passed" yaml:"passed"`
    Errors []string `json:"errors,omitempty" yaml:"errors,omitempty"`

    // Details
    Details map[string]interface{} `json:"details,omitempty" yaml:"details,omitempty"`
}
TestCaseResult represents the result of evaluating a single test case
func EvaluateTestCase ¶
func EvaluateTestCase(ctx context.Context, testCase *TestCase, actualAnswer string, actualCitations []string, provider EvaluatorProvider) (*TestCaseResult, error)
EvaluateTestCase runs all evaluators on a test case using the specified provider. Rule-based evaluators (citation) always run locally. LLM-based evaluators use the provider (local or Opik).
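A sketch of evaluating a single answer; tc comes from the TestCase example above, while ctx, provider, actualAnswer, and actualCitations are assumed from earlier sketches.

result, err := EvaluateTestCase(ctx, &tc, actualAnswer, actualCitations, provider)
if err != nil {
    log.Fatalf("evaluate test case: %v", err)
}
if !result.Passed {
    fmt.Println("failures:", result.Errors)
}
fmt.Printf("accuracy %.2f, faithfulness %.2f\n", result.AccuracyScore, result.FaithfulnessScore)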
func EvaluateTestCaseWithLLMClient ¶ added in v0.9.10
func EvaluateTestCaseWithLLMClient(ctx context.Context, testCase *TestCase, actualAnswer string, actualCitations []string, llmClient llm.Client) (*TestCaseResult, error)
EvaluateTestCaseWithLLMClient is a convenience wrapper that creates a LocalProvider. This maintains backward compatibility with existing code.