Documentation ¶
Index ¶
- func GenerateRunID() string
- func GetDefaultDatasetDir() string
- func GetDefaultResultsDir() string
- func SaveDataset(dataset *Dataset, filepath string) error
- func SaveResults(run *EvaluationRun, format string) (string, error)
- func SaveResultsToFile(run *EvaluationRun, filePath, format string) error
- type AccuracyEvaluator
- type CitationEvaluator
- type Dataset
- type DatasetConfig
- type EvaluationConfig
- type EvaluationRun
- type EvaluationSummary
- type Evaluator
- type HallucinationDetector
- type Runner
- type TestCase
- type TestCaseResult
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func GenerateRunID ¶
func GenerateRunID() string
func GetDefaultDatasetDir ¶
func GetDefaultDatasetDir() string
GetDefaultDatasetDir returns the default directory for evaluation datasets
func GetDefaultResultsDir ¶
func GetDefaultResultsDir() string
GetDefaultResultsDir returns the default directory for evaluation results
func SaveDataset ¶
func SaveDataset(dataset *Dataset, filepath string) error
SaveDataset saves a dataset to a YAML file
func SaveResults ¶
func SaveResults(run *EvaluationRun, format string) (string, error)
SaveResults saves evaluation results to a file
func SaveResultsToFile ¶
func SaveResultsToFile(run *EvaluationRun, filePath, format string) error
SaveResultsToFile saves evaluation results to a specific file path
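A minimal sketch of persisting a finished run, assuming this package is imported as eval under a placeholder path and that "json" and "yaml" are accepted format strings; neither the import path nor the exact format identifiers are confirmed by this listing.

package main

import (
    "fmt"
    "log"
    "time"

    eval "example.com/your/module/eval" // placeholder import path
)

func main() {
    run := &eval.EvaluationRun{
        ID:          "run-example", // or use eval.GenerateRunID()
        Timestamp:   time.Now(),
        DatasetName: "sample-dataset",
        AgentName:   "default",
    }

    // Let the package choose the output location; the "json" format string is an assumption.
    path, err := eval.SaveResults(run, "json")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println("results written to", path)

    // Or write to an explicit path, here as YAML (format string also assumed).
    if err := eval.SaveResultsToFile(run, "/tmp/eval-run.yaml", "yaml"); err != nil {
        log.Fatal(err)
    }
}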
Types ¶
type AccuracyEvaluator ¶
type AccuracyEvaluator struct {
    // contains filtered or unexported fields
}
AccuracyEvaluator evaluates semantic similarity between expected and actual answers
func NewAccuracyEvaluator ¶
func NewAccuracyEvaluator(llmClient llm.Client) *AccuracyEvaluator
NewAccuracyEvaluator creates a new accuracy evaluator
func (*AccuracyEvaluator) Evaluate ¶
func (e *AccuracyEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate computes the semantic similarity score between the expected and actual answers
func (*AccuracyEvaluator) Name ¶
func (e *AccuracyEvaluator) Name() string
Name returns the evaluator name
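A short sketch of scoring a single test case with AccuracyEvaluator. The import paths are placeholders, and constructing an llm.Client is outside this package, so the client value is left nil purely for illustration.

package main

import (
    "context"
    "fmt"
    "log"

    eval "example.com/your/module/eval" // placeholder import path
    "example.com/your/module/llm"       // placeholder path for the llm package
)

func main() {
    // Obtain a real client from the llm package in practice; nil is only for illustration.
    var client llm.Client

    evaluator := eval.NewAccuracyEvaluator(client)

    tc := &eval.TestCase{
        ID:             "capital-of-france",
        Query:          "What is the capital of France?",
        ExpectedAnswer: "Paris is the capital of France.",
    }

    score, err := evaluator.Evaluate(context.Background(), tc, "The capital of France is Paris.", nil)
    if err != nil {
        log.Fatal(err)
    }
    fmt.Printf("%s score: %.2f\n", evaluator.Name(), score)
}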
type CitationEvaluator ¶
type CitationEvaluator struct{}
CitationEvaluator evaluates citation quality
func NewCitationEvaluator ¶
func NewCitationEvaluator() *CitationEvaluator
NewCitationEvaluator creates a new citation evaluator
func (*CitationEvaluator) Evaluate ¶
func (e *CitationEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate checks citation presence and format
func (*CitationEvaluator) Name ¶
func (e *CitationEvaluator) Name() string
Name returns the evaluator name
type Dataset ¶
type Dataset struct {
    // Metadata
    Name        string   `yaml:"name"`
    Description string   `yaml:"description"`
    Version     string   `yaml:"version"`
    Author      string   `yaml:"author,omitempty"`
    Tags        []string `yaml:"tags,omitempty"`

    // Configuration
    Config DatasetConfig `yaml:"config"`

    // Test cases
    TestCases []TestCase `yaml:"test_cases"`
}
Dataset represents an evaluation dataset with test cases
func ListDatasets ¶
ListDatasets returns all datasets in the default directory
func LoadDataset ¶
LoadDataset loads a dataset from a YAML file
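A sketch of loading a dataset from the default directory and walking its test cases. The import path is a placeholder, and since LoadDataset's signature is not shown in this listing, a (*Dataset, error) return is assumed.

package main

import (
    "fmt"
    "log"
    "path/filepath"

    eval "example.com/your/module/eval" // placeholder import path
)

func main() {
    // LoadDataset's signature is not shown above; (*Dataset, error) is assumed here.
    path := filepath.Join(eval.GetDefaultDatasetDir(), "sample.yaml")
    dataset, err := eval.LoadDataset(path)
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("%s (v%s): %d test cases\n", dataset.Name, dataset.Version, len(dataset.TestCases))
    for _, tc := range dataset.TestCases {
        fmt.Printf("  %s: %s\n", tc.ID, tc.Query)
    }
}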
type DatasetConfig ¶
type DatasetConfig struct {
    // Agent to use (optional - can be overridden at runtime)
    DefaultAgent string `yaml:"default_agent,omitempty"`

    // Collection to query (optional - can be overridden at runtime)
    DefaultCollection string `yaml:"default_collection,omitempty"`

    // Evaluation thresholds
    MinAccuracyScore   float64 `yaml:"min_accuracy_score,omitempty"`
    MinCitationScore   float64 `yaml:"min_citation_score,omitempty"`
    AllowHallucination bool    `yaml:"allow_hallucination,omitempty"`
}
DatasetConfig contains dataset-level configuration
type EvaluationConfig ¶
type EvaluationConfig struct {
    AgentPath  string            `json:"agent_path" yaml:"agent_path"`
    Parameters map[string]string `json:"parameters,omitempty" yaml:"parameters,omitempty"`
}
EvaluationConfig stores the configuration used for the evaluation
type EvaluationRun ¶
type EvaluationRun struct {
    // Metadata
    ID          string    `json:"id" yaml:"id"`
    Timestamp   time.Time `json:"timestamp" yaml:"timestamp"`
    DatasetName string    `json:"dataset_name" yaml:"dataset_name"`
    AgentName   string    `json:"agent_name" yaml:"agent_name"`
    Collection  string    `json:"collection,omitempty" yaml:"collection,omitempty"`

    // Configuration
    Config EvaluationConfig `json:"config" yaml:"config"`

    // Results
    Results []TestCaseResult `json:"results" yaml:"results"`

    // Summary
    Summary EvaluationSummary `json:"summary" yaml:"summary"`
}
EvaluationRun represents a complete evaluation run
func ListResults ¶
func ListResults() ([]*EvaluationRun, error)
ListResults returns all available evaluation runs
func LoadResults ¶
func LoadResults(runID string) (*EvaluationRun, error)
LoadResults loads evaluation results from a file
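A sketch that lists previous runs and reloads one by ID, using only the signatures shown above; the import path is a placeholder.

package main

import (
    "fmt"
    "log"

    eval "example.com/your/module/eval" // placeholder import path
)

func main() {
    runs, err := eval.ListResults()
    if err != nil {
        log.Fatal(err)
    }

    for _, run := range runs {
        fmt.Printf("%s  %s  pass rate %.2f\n", run.ID, run.DatasetName, run.Summary.PassRate)
    }

    // Reload a single run by its ID.
    if len(runs) > 0 {
        run, err := eval.LoadResults(runs[0].ID)
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println("loaded", len(run.Results), "test case results")
    }
}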
type EvaluationSummary ¶
type EvaluationSummary struct {
    TotalTests  int     `json:"total_tests" yaml:"total_tests"`
    PassedTests int     `json:"passed_tests" yaml:"passed_tests"`
    FailedTests int     `json:"failed_tests" yaml:"failed_tests"`
    PassRate    float64 `json:"pass_rate" yaml:"pass_rate"`

    // Average scores
    AvgAccuracy      float64 `json:"avg_accuracy" yaml:"avg_accuracy"`
    AvgCitation      float64 `json:"avg_citation" yaml:"avg_citation"`
    AvgHallucination float64 `json:"avg_hallucination" yaml:"avg_hallucination"`

    // Performance
    TotalTime float64 `json:"total_time_ms" yaml:"total_time_ms"`
    AvgTime   float64 `json:"avg_time_ms" yaml:"avg_time_ms"`

    // Duration
    StartTime time.Time     `json:"start_time" yaml:"start_time"`
    EndTime   time.Time     `json:"end_time" yaml:"end_time"`
    Duration  time.Duration `json:"duration" yaml:"duration"`
}
EvaluationSummary contains aggregate statistics
func CalculateSummary ¶
func CalculateSummary(results []TestCaseResult, startTime, endTime time.Time) EvaluationSummary
CalculateSummary calculates summary statistics from results
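A sketch of building a summary directly from a slice of results, with the import path as a placeholder; the hard-coded scores assume the 0 to 1 scale implied by the evaluators above, and response times are in milliseconds per the field tags.

package main

import (
    "fmt"
    "time"

    eval "example.com/your/module/eval" // placeholder import path
)

func main() {
    start := time.Now()

    // Normally these come from a Runner; they are hard-coded here for illustration.
    results := []eval.TestCaseResult{
        {TestCaseID: "tc-1", Passed: true, AccuracyScore: 0.9, CitationScore: 0.8, HallucinationScore: 1.0, ResponseTime: 420},
        {TestCaseID: "tc-2", Passed: false, AccuracyScore: 0.4, CitationScore: 0.0, HallucinationScore: 0.7, ResponseTime: 610},
    }

    summary := eval.CalculateSummary(results, start, time.Now())
    fmt.Printf("passed %d/%d, avg accuracy %.2f\n",
        summary.PassedTests, summary.TotalTests, summary.AvgAccuracy)
}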
type Evaluator ¶
type Evaluator interface {
    Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
    Name() string
}
Evaluator is the interface implemented by all evaluators
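Because Evaluator is an ordinary interface, callers can plug in their own metrics. The sketch below defines a hypothetical concept-coverage evaluator (not part of this package) that satisfies the interface; the import path is a placeholder.

package main

import (
    "context"
    "fmt"
    "strings"

    eval "example.com/your/module/eval" // placeholder import path
)

// ConceptEvaluator is a hypothetical evaluator that scores the fraction of
// RequiredConcepts mentioned in the actual answer. It only illustrates how
// to satisfy the Evaluator interface.
type ConceptEvaluator struct{}

func (ConceptEvaluator) Name() string { return "concept_coverage" }

func (ConceptEvaluator) Evaluate(ctx context.Context, testCase *eval.TestCase, actual string, actualCitations []string) (float64, error) {
    if len(testCase.RequiredConcepts) == 0 {
        return 1.0, nil
    }
    hits := 0
    for _, concept := range testCase.RequiredConcepts {
        if strings.Contains(strings.ToLower(actual), strings.ToLower(concept)) {
            hits++
        }
    }
    return float64(hits) / float64(len(testCase.RequiredConcepts)), nil
}

// Compile-time check that ConceptEvaluator satisfies eval.Evaluator.
var _ eval.Evaluator = ConceptEvaluator{}

func main() {
    tc := &eval.TestCase{ID: "demo", Query: "Explain HTTP caching", RequiredConcepts: []string{"ETag", "Cache-Control"}}
    score, _ := ConceptEvaluator{}.Evaluate(context.Background(), tc, "Use Cache-Control headers and ETags.", nil)
    fmt.Printf("coverage: %.2f\n", score)
}

The compile-time assertion is a cheap guard that keeps a custom evaluator's method set in sync with the interface if its signature ever changes.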
type HallucinationDetector ¶
type HallucinationDetector struct {
    // contains filtered or unexported fields
}
HallucinationDetector detects potential hallucinations
func NewHallucinationDetector ¶
func NewHallucinationDetector(llmClient llm.Client) *HallucinationDetector
NewHallucinationDetector creates a new hallucination detector
func (*HallucinationDetector) Evaluate ¶
func (e *HallucinationDetector) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
Evaluate detects hallucinations (returns score where 1.0 = no hallucination)
func (*HallucinationDetector) Name ¶
func (e *HallucinationDetector) Name() string
Name returns the evaluator name
type Runner ¶
type Runner struct {
    // contains filtered or unexported fields
}
Runner executes evaluation runs
func (*Runner) RunEvaluation ¶
func (r *Runner) RunEvaluation(ctx context.Context, dataset *Dataset, agentName string, collection string) (*EvaluationRun, error)
RunEvaluation executes a complete evaluation
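A sketch of a full run. This listing shows no exported constructor for Runner, so the *Runner is taken as a parameter rather than constructed; the import path is a placeholder, the LoadDataset return type is assumed, and the "json" format string is an assumption.

package example

import (
    "context"
    "fmt"
    "log"
    "path/filepath"

    eval "example.com/your/module/eval" // placeholder import path
)

// runEvaluation drives a complete run. How to construct the *eval.Runner is
// not shown in this listing, so it is passed in by the caller.
func runEvaluation(ctx context.Context, runner *eval.Runner) {
    path := filepath.Join(eval.GetDefaultDatasetDir(), "sample.yaml")
    dataset, err := eval.LoadDataset(path) // (*Dataset, error) return is assumed
    if err != nil {
        log.Fatal(err)
    }

    run, err := runner.RunEvaluation(ctx, dataset, "research-agent", "docs")
    if err != nil {
        log.Fatal(err)
    }

    fmt.Printf("run %s: %d/%d passed\n", run.ID, run.Summary.PassedTests, run.Summary.TotalTests)

    if _, err := eval.SaveResults(run, "json"); err != nil { // "json" format assumed
        log.Fatal(err)
    }
}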
type TestCase ¶
type TestCase struct {
    // Identification
    ID          string   `yaml:"id"`
    Description string   `yaml:"description,omitempty"`
    Tags        []string `yaml:"tags,omitempty"`

    // Input
    Query      string `yaml:"query"`
    Collection string `yaml:"collection,omitempty"` // Override default collection

    // Expected output
    ExpectedAnswer    string   `yaml:"expected_answer"`
    ExpectedCitations []string `yaml:"expected_citations,omitempty"`
    RequiredConcepts  []string `yaml:"required_concepts,omitempty"`

    // Evaluation criteria
    MinRelevanceScore float64 `yaml:"min_relevance_score,omitempty"`
    MustCite          bool    `yaml:"must_cite,omitempty"`
}
TestCase represents a single evaluation test case
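A sketch of assembling a small dataset in code and writing it with SaveDataset. The agent and collection names are hypothetical, the 0 to 1 threshold scale is an assumption, and the import path is a placeholder.

package main

import (
    "log"
    "path/filepath"

    eval "example.com/your/module/eval" // placeholder import path
)

func main() {
    dataset := &eval.Dataset{
        Name:        "rag-smoke-test",
        Description: "Small smoke-test dataset",
        Version:     "0.1.0",
        Config: eval.DatasetConfig{
            DefaultAgent:      "research-agent", // hypothetical agent name
            DefaultCollection: "docs",           // hypothetical collection name
            MinAccuracyScore:  0.7,              // 0 to 1 scale is an assumption
            MinCitationScore:  0.5,
        },
        TestCases: []eval.TestCase{
            {
                ID:                "go-release-cadence",
                Query:             "How often are Go releases published?",
                ExpectedAnswer:    "A new major Go release ships roughly every six months.",
                ExpectedCitations: []string{"go-release-policy.md"},
                RequiredConcepts:  []string{"six months"},
                MustCite:          true,
            },
        },
    }

    path := filepath.Join(eval.GetDefaultDatasetDir(), "rag-smoke-test.yaml")
    if err := eval.SaveDataset(dataset, path); err != nil {
        log.Fatal(err)
    }
}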
type TestCaseResult ¶
type TestCaseResult struct {
    // Test case info
    TestCaseID string `json:"test_case_id" yaml:"test_case_id"`
    Query      string `json:"query" yaml:"query"`

    // Actual output
    ActualAnswer    string   `json:"actual_answer" yaml:"actual_answer"`
    ActualCitations []string `json:"actual_citations,omitempty" yaml:"actual_citations,omitempty"`
    ResponseTime    float64  `json:"response_time_ms" yaml:"response_time_ms"`

    // Evaluation scores
    AccuracyScore      float64 `json:"accuracy_score" yaml:"accuracy_score"`
    CitationScore      float64 `json:"citation_score" yaml:"citation_score"`
    HallucinationScore float64 `json:"hallucination_score" yaml:"hallucination_score"`

    // Pass/fail
    Passed bool     `json:"passed" yaml:"passed"`
    Errors []string `json:"errors,omitempty" yaml:"errors,omitempty"`

    // Details
    Details map[string]interface{} `json:"details,omitempty" yaml:"details,omitempty"`
}
TestCaseResult represents the result of evaluating a single test case
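A sketch of post-processing a stored run: reload it and print the failing test cases with their scores and errors. Only documented fields and functions are used; the run ID and import path are placeholders.

package main

import (
    "fmt"
    "log"

    eval "example.com/your/module/eval" // placeholder import path
)

func main() {
    run, err := eval.LoadResults("run-example") // use a real run ID from ListResults
    if err != nil {
        log.Fatal(err)
    }

    for _, r := range run.Results {
        if r.Passed {
            continue
        }
        fmt.Printf("FAIL %s (accuracy %.2f, citation %.2f, hallucination %.2f)\n",
            r.TestCaseID, r.AccuracyScore, r.CitationScore, r.HallucinationScore)
        for _, e := range r.Errors {
            fmt.Println("  -", e)
        }
    }
}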