evaluation

package
v0.9.9
Published: Jan 23, 2026 License: MIT Imports: 11 Imported by: 0

Documentation

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func GenerateRunID

func GenerateRunID() string

GenerateRunID generates a unique run ID

func GetDefaultDatasetDir

func GetDefaultDatasetDir() string

GetDefaultDatasetDir returns the default directory for evaluation datasets

func GetDefaultResultsDir

func GetDefaultResultsDir() string

GetDefaultResultsDir returns the default directory for evaluation results
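
A minimal, hypothetical sketch of how these three helpers might fit together when preparing a run. It is written as in-package code because the module's import path is not shown on this page.

package evaluation

import "fmt"

// printDefaultPaths is a hypothetical sketch of the ID and path helpers.
func printDefaultPaths() {
	fmt.Println("run ID:", GenerateRunID())
	fmt.Println("datasets dir:", GetDefaultDatasetDir())
	fmt.Println("results dir:", GetDefaultResultsDir())
}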

func SaveDataset

func SaveDataset(dataset *Dataset, filepath string) error

SaveDataset saves a dataset to a YAML file

func SaveResults

func SaveResults(run *EvaluationRun, format string) (string, error)

SaveResults saves evaluation results to a file

func SaveResultsToFile

func SaveResultsToFile(run *EvaluationRun, filePath, format string) error

SaveResultsToFile saves evaluation results to a specific file path
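
A hedged sketch of persisting a finished run. The "json" and "yaml" format names are assumptions, since the accepted format values are not documented on this page.

package evaluation

import "log"

// saveRunResults is a hypothetical sketch of the two save variants.
func saveRunResults(run *EvaluationRun) {
	// SaveResults chooses the output location itself and presumably
	// returns the path it wrote to; "json" is an assumed format name.
	path, err := SaveResults(run, "json")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("results written to %s", path)

	// SaveResultsToFile writes to an explicit path instead.
	if err := SaveResultsToFile(run, "results/latest.yaml", "yaml"); err != nil {
		log.Fatal(err)
	}
}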

Types

type AccuracyEvaluator

type AccuracyEvaluator struct {
	// contains filtered or unexported fields
}

AccuracyEvaluator evaluates semantic similarity between expected and actual answers

func NewAccuracyEvaluator

func NewAccuracyEvaluator(llmClient llm.Client) *AccuracyEvaluator

NewAccuracyEvaluator creates a new accuracy evaluator

func (*AccuracyEvaluator) Evaluate

func (e *AccuracyEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate computes semantic similarity score

func (*AccuracyEvaluator) Name

func (e *AccuracyEvaluator) Name() string

Name returns the evaluator name
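
A sketch of scoring a single answer with the accuracy evaluator. The llm.Client is assumed to be constructed elsewhere; its package import is elided because that package's path is not shown on this page.

package evaluation

import (
	"context"
	"fmt"
	"log"
)

// scoreAccuracy is a hypothetical sketch; the llm package import is elided.
func scoreAccuracy(ctx context.Context, client llm.Client, tc *TestCase, answer string, citations []string) {
	eval := NewAccuracyEvaluator(client)
	score, err := eval.Evaluate(ctx, tc, answer, citations)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("%s score: %.2f\n", eval.Name(), score)
}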

type CitationEvaluator

type CitationEvaluator struct{}

CitationEvaluator evaluates citation quality

func NewCitationEvaluator

func NewCitationEvaluator() *CitationEvaluator

NewCitationEvaluator creates a new citation evaluator

func (*CitationEvaluator) Evaluate

func (e *CitationEvaluator) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate checks citation presence and format

func (*CitationEvaluator) Name

func (e *CitationEvaluator) Name() string

Name returns the evaluator name

type Dataset

type Dataset struct {
	// Metadata
	Name        string   `yaml:"name"`
	Description string   `yaml:"description"`
	Version     string   `yaml:"version"`
	Author      string   `yaml:"author,omitempty"`
	Tags        []string `yaml:"tags,omitempty"`

	// Configuration
	Config DatasetConfig `yaml:"config"`

	// Test cases
	TestCases []TestCase `yaml:"test_cases"`
}

Dataset represents an evaluation dataset with test cases

func ListDatasets

func ListDatasets() ([]*Dataset, error)

ListDatasets returns all datasets in the default directory

func LoadDataset

func LoadDataset(filepath string) (*Dataset, error)

LoadDataset loads a dataset from a YAML file

func (*Dataset) Validate

func (d *Dataset) Validate() error

Validate validates the dataset structure
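
A sketch of loading a dataset from the default directory and validating it before use; the "eval.yaml" file name is a placeholder.

package evaluation

import (
	"log"
	"path/filepath"
)

// loadAndCheck is a hypothetical sketch; "eval.yaml" is a placeholder name.
func loadAndCheck() *Dataset {
	path := filepath.Join(GetDefaultDatasetDir(), "eval.yaml")
	ds, err := LoadDataset(path)
	if err != nil {
		log.Fatal(err)
	}
	if err := ds.Validate(); err != nil {
		log.Fatalf("dataset %q is invalid: %v", ds.Name, err)
	}
	return ds
}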

type DatasetConfig

type DatasetConfig struct {
	// Agent to use (optional - can be overridden at runtime)
	DefaultAgent string `yaml:"default_agent,omitempty"`

	// Collection to query (optional - can be overridden at runtime)
	DefaultCollection string `yaml:"default_collection,omitempty"`

	// Evaluation thresholds
	MinAccuracyScore   float64 `yaml:"min_accuracy_score,omitempty"`
	MinCitationScore   float64 `yaml:"min_citation_score,omitempty"`
	AllowHallucination bool    `yaml:"allow_hallucination,omitempty"`
}

DatasetConfig contains dataset-level configuration

type EvaluationConfig

type EvaluationConfig struct {
	AgentPath  string            `json:"agent_path" yaml:"agent_path"`
	Parameters map[string]string `json:"parameters,omitempty" yaml:"parameters,omitempty"`
}

EvaluationConfig stores the configuration used for the evaluation

type EvaluationRun

type EvaluationRun struct {
	// Metadata
	ID          string    `json:"id" yaml:"id"`
	Timestamp   time.Time `json:"timestamp" yaml:"timestamp"`
	DatasetName string    `json:"dataset_name" yaml:"dataset_name"`
	AgentName   string    `json:"agent_name" yaml:"agent_name"`
	Collection  string    `json:"collection,omitempty" yaml:"collection,omitempty"`

	// Configuration
	Config EvaluationConfig `json:"config" yaml:"config"`

	// Results
	Results []TestCaseResult `json:"results" yaml:"results"`

	// Summary
	Summary EvaluationSummary `json:"summary" yaml:"summary"`
}

EvaluationRun represents a complete evaluation run

func ListResults

func ListResults() ([]*EvaluationRun, error)

ListResults returns all available evaluation runs

func LoadResults

func LoadResults(runID string) (*EvaluationRun, error)

LoadResults loads evaluation results from a file
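
A sketch of browsing stored runs and reloading one by its ID.

package evaluation

import (
	"fmt"
	"log"
)

// showRuns is a hypothetical sketch of listing and reloading saved runs.
func showRuns() {
	runs, err := ListResults()
	if err != nil {
		log.Fatal(err)
	}
	for _, r := range runs {
		fmt.Printf("%s  %s  pass rate %.2f\n", r.ID, r.DatasetName, r.Summary.PassRate)
	}
	if len(runs) > 0 {
		full, err := LoadResults(runs[0].ID)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Println("reloaded", len(full.Results), "test case results")
	}
}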

type EvaluationSummary

type EvaluationSummary struct {
	TotalTests  int     `json:"total_tests" yaml:"total_tests"`
	PassedTests int     `json:"passed_tests" yaml:"passed_tests"`
	FailedTests int     `json:"failed_tests" yaml:"failed_tests"`
	PassRate    float64 `json:"pass_rate" yaml:"pass_rate"`

	// Average scores
	AvgAccuracy      float64 `json:"avg_accuracy" yaml:"avg_accuracy"`
	AvgCitation      float64 `json:"avg_citation" yaml:"avg_citation"`
	AvgHallucination float64 `json:"avg_hallucination" yaml:"avg_hallucination"`

	// Performance
	TotalTime float64 `json:"total_time_ms" yaml:"total_time_ms"`
	AvgTime   float64 `json:"avg_time_ms" yaml:"avg_time_ms"`

	// Duration
	StartTime time.Time     `json:"start_time" yaml:"start_time"`
	EndTime   time.Time     `json:"end_time" yaml:"end_time"`
	Duration  time.Duration `json:"duration" yaml:"duration"`
}

EvaluationSummary contains aggregate statistics

func CalculateSummary

func CalculateSummary(results []TestCaseResult, startTime, endTime time.Time) EvaluationSummary

CalculateSummary calculates summary statistics from results
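
Presumably the Runner computes the summary internally, but it can also be calculated directly from a slice of results, as in this sketch.

package evaluation

import (
	"fmt"
	"time"
)

// summarize is a hypothetical sketch; the timestamps would normally come
// from the actual run.
func summarize(results []TestCaseResult, start, end time.Time) {
	s := CalculateSummary(results, start, end)
	fmt.Printf("passed %d/%d, avg accuracy %.2f, avg time %.0f ms\n",
		s.PassedTests, s.TotalTests, s.AvgAccuracy, s.AvgTime)
}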

type Evaluator

type Evaluator interface {
	Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)
	Name() string
}

Evaluator is the interface implemented by all evaluators
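
Because Evaluator is a small interface, callers can write their own checks. Below is a hypothetical evaluator that scores the fraction of RequiredConcepts mentioned in the answer; whether the Runner can be configured with custom evaluators is not documented on this page.

package evaluation

import (
	"context"
	"strings"
)

// conceptEvaluator is a hypothetical evaluator that scores the fraction of
// RequiredConcepts that appear in the actual answer.
type conceptEvaluator struct{}

// Compile-time check that conceptEvaluator satisfies Evaluator.
var _ Evaluator = conceptEvaluator{}

func (conceptEvaluator) Name() string { return "concepts" }

func (conceptEvaluator) Evaluate(ctx context.Context, tc *TestCase, actual string, actualCitations []string) (float64, error) {
	if len(tc.RequiredConcepts) == 0 {
		return 1.0, nil
	}
	lower := strings.ToLower(actual)
	found := 0
	for _, c := range tc.RequiredConcepts {
		if strings.Contains(lower, strings.ToLower(c)) {
			found++
		}
	}
	return float64(found) / float64(len(tc.RequiredConcepts)), nil
}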

type HallucinationDetector

type HallucinationDetector struct {
	// contains filtered or unexported fields
}

HallucinationDetector detects potential hallucinations

func NewHallucinationDetector

func NewHallucinationDetector(llmClient llm.Client) *HallucinationDetector

NewHallucinationDetector creates a new hallucination detector

func (*HallucinationDetector) Evaluate

func (e *HallucinationDetector) Evaluate(ctx context.Context, testCase *TestCase, actual string, actualCitations []string) (float64, error)

Evaluate detects hallucinations, returning a score where 1.0 means no hallucination was detected

func (*HallucinationDetector) Name

func (e *HallucinationDetector) Name() string

Name returns the evaluator name

type Runner

type Runner struct {
	// contains filtered or unexported fields
}

Runner executes evaluation runs

func NewRunner

func NewRunner(llmClient llm.Client) *Runner

NewRunner creates a new evaluation runner

func (*Runner) RunEvaluation

func (r *Runner) RunEvaluation(ctx context.Context, dataset *Dataset, agentName string, collection string) (*EvaluationRun, error)

RunEvaluation executes a complete evaluation
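
A hypothetical end-to-end sketch: load a dataset, run it against an agent, and persist the results. The agent and collection names are placeholders, the "yaml" format name is an assumption, and the llm.Client (and its package import) is assumed to come from elsewhere, since that package's path is not shown on this page.

package evaluation

import (
	"context"
	"log"
)

// runEvaluation is a hypothetical sketch; names and format are placeholders.
func runEvaluation(ctx context.Context, client llm.Client, datasetPath string) {
	ds, err := LoadDataset(datasetPath)
	if err != nil {
		log.Fatal(err)
	}

	runner := NewRunner(client)
	run, err := runner.RunEvaluation(ctx, ds, "my-agent", "my-collection")
	if err != nil {
		log.Fatal(err)
	}

	log.Printf("%d/%d test cases passed", run.Summary.PassedTests, run.Summary.TotalTests)

	if _, err := SaveResults(run, "yaml"); err != nil {
		log.Fatal(err)
	}
}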

type TestCase

type TestCase struct {
	// Identification
	ID          string   `yaml:"id"`
	Description string   `yaml:"description,omitempty"`
	Tags        []string `yaml:"tags,omitempty"`

	// Input
	Query      string `yaml:"query"`
	Collection string `yaml:"collection,omitempty"` // Override default collection

	// Expected output
	ExpectedAnswer    string   `yaml:"expected_answer"`
	ExpectedCitations []string `yaml:"expected_citations,omitempty"`
	RequiredConcepts  []string `yaml:"required_concepts,omitempty"`

	// Evaluation criteria
	MinRelevanceScore float64 `yaml:"min_relevance_score,omitempty"`
	MustCite          bool    `yaml:"must_cite,omitempty"`
}

TestCase represents a single evaluation test case

func (*TestCase) Validate

func (tc *TestCase) Validate() error

Validate validates a test case
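
Test cases will usually come from a YAML dataset, but they can also be built in code. A sketch with purely illustrative field values:

package evaluation

import "log"

// exampleTestCase is a hypothetical sketch; all field values are illustrative.
func exampleTestCase() *TestCase {
	tc := &TestCase{
		ID:                "refund-policy-1",
		Query:             "What is the refund window?",
		ExpectedAnswer:    "Customers can request a refund within 30 days of purchase.",
		ExpectedCitations: []string{"policies/refunds.md"},
		RequiredConcepts:  []string{"refund", "30 days"},
		MustCite:          true,
	}
	if err := tc.Validate(); err != nil {
		log.Fatal(err)
	}
	return tc
}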

type TestCaseResult

type TestCaseResult struct {
	// Test case info
	TestCaseID string `json:"test_case_id" yaml:"test_case_id"`
	Query      string `json:"query" yaml:"query"`

	// Actual output
	ActualAnswer    string   `json:"actual_answer" yaml:"actual_answer"`
	ActualCitations []string `json:"actual_citations,omitempty" yaml:"actual_citations,omitempty"`
	ResponseTime    float64  `json:"response_time_ms" yaml:"response_time_ms"`

	// Evaluation scores
	AccuracyScore      float64 `json:"accuracy_score" yaml:"accuracy_score"`
	CitationScore      float64 `json:"citation_score" yaml:"citation_score"`
	HallucinationScore float64 `json:"hallucination_score" yaml:"hallucination_score"`

	// Pass/fail
	Passed bool     `json:"passed" yaml:"passed"`
	Errors []string `json:"errors,omitempty" yaml:"errors,omitempty"`

	// Details
	Details map[string]interface{} `json:"details,omitempty" yaml:"details,omitempty"`
}

TestCaseResult represents the result of evaluating a single test case

func EvaluateTestCase

func EvaluateTestCase(ctx context.Context, testCase *TestCase, actualAnswer string, actualCitations []string, llmClient llm.Client) (*TestCaseResult, error)

EvaluateTestCase runs all evaluators on a test case
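
A sketch of scoring a single, already-produced answer against a test case without going through the Runner. As in the earlier sketches, the llm.Client and its package import are assumed to come from elsewhere.

package evaluation

import (
	"context"
	"fmt"
	"log"
)

// scoreOne is a hypothetical sketch of evaluating one answer directly.
func scoreOne(ctx context.Context, client llm.Client, tc *TestCase, answer string, citations []string) {
	res, err := EvaluateTestCase(ctx, tc, answer, citations, client)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("accuracy=%.2f citation=%.2f hallucination=%.2f passed=%v\n",
		res.AccuracyScore, res.CitationScore, res.HallucinationScore, res.Passed)
}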
