evaluation

package
v0.0.0-...-6aca404
Published: Jan 29, 2026 License: MIT Imports: 13 Imported by: 0

Documentation

Overview

Package evaluation provides an automated evaluation framework for AI agents. Validates: Requirements 9.2, 9.4, 9.5, 9.6

Index

Constants

View Source
const DefaultPromptTemplate = `` /* 1118-byte string literal not displayed */

DefaultPromptTemplate is the default evaluation prompt template. Validates: Requirements 10.2

Variables

View Source
var (
	ErrExperimentNotFound  = errors.New("experiment not found")
	ErrExperimentNotActive = errors.New("experiment not active")
	ErrNoVariants          = errors.New("no variants defined")
	ErrInvalidWeights      = errors.New("invalid variant weights")
	ErrVariantNotFound     = errors.New("variant not found")
)

Errors related to A/B testing.

Functions

func RegisterBuiltinMetrics

func RegisterBuiltinMetrics(registry *MetricRegistry)

RegisterBuiltinMetrics registers all built-in metrics with the registry.

Types

type ABTester

type ABTester struct {
	// contains filtered or unexported fields
}

ABTester is an A/B tester. Validates: Requirements 11.1, 11.2, 11.3, 11.5

func NewABTester

func NewABTester(store ExperimentStore, logger *zap.Logger) *ABTester

NewABTester creates an A/B tester.

func (*ABTester) Analyze

func (t *ABTester) Analyze(ctx context.Context, experimentID string) (*ExperimentResult, error)

Analyze analyzes experiment results. Validates: Requirements 11.3, 11.4

func (*ABTester) Assign

func (t *ABTester) Assign(experimentID, userID string) (*Variant, error)

Assign assigns a variant, using consistent hashing to ensure the same user is always assigned the same variant. Validates: Requirements 11.2

func (*ABTester) AutoSelectWinner

func (t *ABTester) AutoSelectWinner(ctx context.Context, experimentID string, minConfidence float64) (*Variant, error)

AutoSelectWinner automatically selects the winning variant configuration when statistical significance is detected. Validates: Requirements 11.6

func (*ABTester) CompleteExperiment

func (t *ABTester) CompleteExperiment(experimentID string) error

CompleteExperiment completes an experiment.

func (*ABTester) CreateExperiment

func (t *ABTester) CreateExperiment(exp *Experiment) error

CreateExperiment creates an experiment. Validates: Requirements 11.1

func (*ABTester) DeleteExperiment

func (t *ABTester) DeleteExperiment(experimentID string) error

DeleteExperiment deletes an experiment.

func (*ABTester) GenerateReport

func (t *ABTester) GenerateReport(ctx context.Context, experimentID string) (*StatisticalReport, error)

GenerateReport generates a comprehensive statistical significance analysis report. Validates: Requirements 11.4

func (*ABTester) GetExperiment

func (t *ABTester) GetExperiment(experimentID string) (*Experiment, error)

GetExperiment retrieves an experiment.

func (*ABTester) ListExperiments

func (t *ABTester) ListExperiments() []*Experiment

ListExperiments lists all experiments.

func (*ABTester) PauseExperiment

func (t *ABTester) PauseExperiment(experimentID string) error

PauseExperiment pauses an experiment.

func (*ABTester) RecordResult

func (t *ABTester) RecordResult(experimentID, variantID string, result *EvalResult) error

RecordResult records a result. Validates: Requirements 11.3

func (*ABTester) StartExperiment

func (t *ABTester) StartExperiment(experimentID string) error

StartExperiment starts an experiment.

type AccuracyMetric

type AccuracyMetric struct {
	// CaseSensitive controls whether matching is case-sensitive
	CaseSensitive bool
	// TrimWhitespace controls whether leading and trailing whitespace is trimmed
	TrimWhitespace bool
	// UseContains enables substring matching instead of exact matching
	UseContains bool
}

AccuracyMetric is an accuracy metric that computes accuracy by comparing the actual output against the expected output. Validates: Requirements 9.3

func NewAccuracyMetric

func NewAccuracyMetric() *AccuracyMetric

NewAccuracyMetric creates an accuracy metric.

func (*AccuracyMetric) Compute

func (m *AccuracyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute computes accuracy. Return range: 0.0 - 1.0. A value of 1.0 indicates an exact match; values between 0.0 and 1.0 indicate a partial match based on character similarity.

func (*AccuracyMetric) Name

func (m *AccuracyMetric) Name() string

Name returns the metric name.

type AgentExecutor

type AgentExecutor interface {
	Execute(ctx context.Context, input string) (output string, tokens int, err error)
}

AgentExecutor defines the interface for executing agent tasks.

type AggregatedJudgeResult

type AggregatedJudgeResult struct {
	Results       []*JudgeResult     `json:"results"`
	AverageScore  float64            `json:"average_score"`
	ScoreStdDev   float64            `json:"score_std_dev"`
	NeedsReview   bool               `json:"needs_review"`
	ReviewReason  string             `json:"review_reason,omitempty"`
	DimensionAvgs map[string]float64 `json:"dimension_averages"`
}

AggregatedJudgeResult is an aggregated judge result. Validates: Requirements 10.5

type Alert

type Alert struct {
	Level      AlertLevel `json:"level"`
	MetricName string     `json:"metric_name"`
	Threshold  float64    `json:"threshold"`
	Actual     float64    `json:"actual"`
	Message    string     `json:"message"`
	TaskID     string     `json:"task_id,omitempty"`
	Timestamp  time.Time  `json:"timestamp"`
}

Alert represents an evaluation alert triggered when metrics exceed thresholds. Validates: Requirements 9.6

type AlertHandler

type AlertHandler func(alert *Alert)

AlertHandler is called when an alert is triggered.

type AlertLevel

type AlertLevel string

AlertLevel defines the severity of an alert.

const (
	AlertLevelInfo     AlertLevel = "info"
	AlertLevelWarning  AlertLevel = "warning"
	AlertLevelCritical AlertLevel = "critical"
)

type AlertThreshold

type AlertThreshold struct {
	MetricName string     `json:"metric_name"`
	Operator   string     `json:"operator"` // "gt", "lt", "gte", "lte", "eq"
	Value      float64    `json:"value"`
	Level      AlertLevel `json:"level"`
	Message    string     `json:"message,omitempty"`
}

AlertThreshold defines a threshold for triggering alerts.

type BatchEvalReport

type BatchEvalReport struct {
	Reports           []*EvalReport `json:"reports"`
	AggregatedSummary EvalSummary   `json:"aggregated_summary"`
	Alerts            []Alert       `json:"alerts,omitempty"`
	Timestamp         time.Time     `json:"timestamp"`
}

BatchEvalReport represents a batch evaluation report. Validates: Requirements 9.5

type ContainsScorer

type ContainsScorer struct{}

ContainsScorer scores based on whether output contains expected.

func (*ContainsScorer) Score

func (s *ContainsScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)

type CostMetric

type CostMetric struct {
	// MaxCost is the maximum cost, used for normalization.
	// If set, the returned value is max(0, 1 - cost/maxCost).
	// If unset (0), the raw cost value is returned.
	MaxCost float64
}

CostMetric is a cost metric that reports API call cost. Validates: Requirements 9.3

func NewCostMetric

func NewCostMetric() *CostMetric

NewCostMetric creates a cost metric.

func NewCostMetricWithMax

func NewCostMetricWithMax(maxCost float64) *CostMetric

NewCostMetricWithMax creates a cost metric with a maximum value.

func (*CostMetric) Compute

func (m *CostMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute computes cost. If a maximum is set, it returns a normalized score (0.0 - 1.0); otherwise it returns the raw cost value.

func (*CostMetric) Name

func (m *CostMetric) Name() string

Name returns the metric name.

type DimensionScore

type DimensionScore struct {
	Score     float64 `json:"score"`
	Reasoning string  `json:"reasoning"`
}

DimensionScore is a per-dimension score.

type EvalInput

type EvalInput struct {
	Prompt    string         `json:"prompt"`
	Context   map[string]any `json:"context,omitempty"`
	Expected  string         `json:"expected,omitempty"`
	Reference string         `json:"reference,omitempty"`
}

EvalInput is an evaluation input.

func NewEvalInput

func NewEvalInput(prompt string) *EvalInput

NewEvalInput creates an evaluation input.

func (*EvalInput) WithContext

func (e *EvalInput) WithContext(ctx map[string]any) *EvalInput

WithContext sets the context.

func (*EvalInput) WithExpected

func (e *EvalInput) WithExpected(expected string) *EvalInput

WithExpected sets the expected output.

func (*EvalInput) WithReference

func (e *EvalInput) WithReference(reference string) *EvalInput

WithReference sets the reference content.

type EvalOutput

type EvalOutput struct {
	Response   string         `json:"response"`
	TokensUsed int            `json:"tokens_used"`
	Latency    time.Duration  `json:"latency"`
	Cost       float64        `json:"cost"`
	Metadata   map[string]any `json:"metadata,omitempty"`
}

EvalOutput is an evaluation output.

func NewEvalOutput

func NewEvalOutput(response string) *EvalOutput

NewEvalOutput creates an evaluation output.

func (*EvalOutput) WithCost

func (e *EvalOutput) WithCost(cost float64) *EvalOutput

WithCost sets the cost.

func (*EvalOutput) WithLatency

func (e *EvalOutput) WithLatency(latency time.Duration) *EvalOutput

WithLatency sets the latency.

func (*EvalOutput) WithMetadata

func (e *EvalOutput) WithMetadata(metadata map[string]any) *EvalOutput

WithMetadata sets the metadata.

func (*EvalOutput) WithTokensUsed

func (e *EvalOutput) WithTokensUsed(tokens int) *EvalOutput

WithTokensUsed sets the token usage.

type EvalReport

type EvalReport struct {
	SuiteID   string            `json:"suite_id"`
	SuiteName string            `json:"suite_name"`
	AgentID   string            `json:"agent_id"`
	Results   []EvalResult      `json:"results"`
	Summary   EvalSummary       `json:"summary"`
	StartTime time.Time         `json:"start_time"`
	EndTime   time.Time         `json:"end_time"`
	Duration  time.Duration     `json:"duration"`
	Metadata  map[string]string `json:"metadata,omitempty"`
}

EvalReport represents the complete evaluation report.

type EvalResult

type EvalResult struct {
	TaskID     string             `json:"task_id"`
	Success    bool               `json:"success"`
	Output     string             `json:"output"`
	Expected   string             `json:"expected,omitempty"`
	Score      float64            `json:"score"` // 0.0 - 1.0
	Metrics    map[string]float64 `json:"metrics,omitempty"`
	Error      string             `json:"error,omitempty"`
	Duration   time.Duration      `json:"duration"`
	TokensUsed int                `json:"tokens_used,omitempty"`
	Cost       float64            `json:"cost,omitempty"`
}

EvalResult represents the result of evaluating a single task.

type EvalSuite

type EvalSuite struct {
	ID          string     `json:"id"`
	Name        string     `json:"name"`
	Description string     `json:"description"`
	Tasks       []EvalTask `json:"tasks"`
	Version     string     `json:"version"`
}

EvalSuite represents a collection of evaluation tasks.

type EvalSummary

type EvalSummary struct {
	TotalTasks     int                `json:"total_tasks"`
	PassedTasks    int                `json:"passed_tasks"`
	FailedTasks    int                `json:"failed_tasks"`
	PassRate       float64            `json:"pass_rate"`
	AverageScore   float64            `json:"average_score"`
	TotalTokens    int                `json:"total_tokens"`
	TotalCost      float64            `json:"total_cost"`
	TotalDuration  time.Duration      `json:"total_duration"`
	MetricAverages map[string]float64 `json:"metric_averages,omitempty"`
	// Statistical metrics
	ScoreStdDev float64            `json:"score_std_dev"`
	ScoreMin    float64            `json:"score_min"`
	ScoreMax    float64            `json:"score_max"`
	ScoreMedian float64            `json:"score_median"`
	Percentiles map[string]float64 `json:"percentiles,omitempty"` // p50, p90, p95, p99
}

EvalSummary contains aggregated evaluation metrics. Validates: Requirements 9.5

type EvalTask

type EvalTask struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Description string            `json:"description"`
	Input       string            `json:"input"`
	Expected    string            `json:"expected,omitempty"`
	Metadata    map[string]string `json:"metadata,omitempty"`
	Tags        []string          `json:"tags,omitempty"`
	Timeout     time.Duration     `json:"timeout,omitempty"`
}

EvalTask represents an evaluation task.

type Evaluator

type Evaluator struct {
	// contains filtered or unexported fields
}

Evaluator runs evaluation suites against agents. Validates: Requirements 9.2, 9.4, 9.5, 9.6

func NewEvaluator

func NewEvaluator(config EvaluatorConfig, logger *zap.Logger) *Evaluator

NewEvaluator creates a new evaluator.

func (*Evaluator) AddAlertHandler

func (e *Evaluator) AddAlertHandler(handler AlertHandler)

AddAlertHandler adds a handler for alerts. Validates: Requirements 9.6

func (*Evaluator) ClearAlerts

func (e *Evaluator) ClearAlerts()

ClearAlerts clears all triggered alerts.

func (*Evaluator) Evaluate

func (e *Evaluator) Evaluate(ctx context.Context, suite *EvalSuite, agent AgentExecutor) (*EvalReport, error)

Evaluate runs an evaluation suite against an agent. Validates: Requirements 9.2, 9.5

func (*Evaluator) EvaluateBatch

func (e *Evaluator) EvaluateBatch(ctx context.Context, suites []*EvalSuite, agent AgentExecutor) ([]*EvalReport, error)

EvaluateBatch runs batch evaluation on multiple suites. Validates: Requirements 9.4

func (*Evaluator) GenerateReport

func (e *Evaluator) GenerateReport(reports []*EvalReport) *BatchEvalReport

GenerateReport generates a comprehensive evaluation report. Validates: Requirements 9.5

func (*Evaluator) GetAlerts

func (e *Evaluator) GetAlerts() []Alert

GetAlerts returns all triggered alerts.

func (*Evaluator) RegisterScorer

func (e *Evaluator) RegisterScorer(taskType string, scorer Scorer)

RegisterScorer registers a scorer for a specific task type.

func (*Evaluator) SetMetricRegistry

func (e *Evaluator) SetMetricRegistry(registry *MetricRegistry)

SetMetricRegistry sets a custom metric registry.

type EvaluatorConfig

type EvaluatorConfig struct {
	Concurrency     int              `json:"concurrency"`
	DefaultTimeout  time.Duration    `json:"default_timeout"`
	StopOnFailure   bool             `json:"stop_on_failure"`
	RetryOnError    bool             `json:"retry_on_error"`
	MaxRetries      int              `json:"max_retries"`
	PassThreshold   float64          `json:"pass_threshold"` // Score threshold to pass
	AlertThresholds []AlertThreshold `json:"alert_thresholds,omitempty"`
	// Batch evaluation settings
	BatchSize      int  `json:"batch_size"`      // Number of tasks per batch
	CollectMetrics bool `json:"collect_metrics"` // Auto-collect metrics after execution
	EnableAlerts   bool `json:"enable_alerts"`   // Enable alert triggering
}

EvaluatorConfig configures the evaluator. Validates: Requirements 9.4, 9.6

func DefaultEvaluatorConfig

func DefaultEvaluatorConfig() EvaluatorConfig

DefaultEvaluatorConfig returns sensible defaults.

type ExactMatchScorer

type ExactMatchScorer struct{}

ExactMatchScorer scores based on exact string match.

func (*ExactMatchScorer) Score

func (s *ExactMatchScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)

type Experiment

type Experiment struct {
	ID          string           `json:"id"`
	Name        string           `json:"name"`
	Description string           `json:"description"`
	Variants    []Variant        `json:"variants"`
	Metrics     []string         `json:"metrics"`
	StartTime   time.Time        `json:"start_time"`
	EndTime     *time.Time       `json:"end_time,omitempty"`
	Status      ExperimentStatus `json:"status"`
}

Experiment defines an experiment. Validates: Requirements 11.1

type ExperimentResult

type ExperimentResult struct {
	ExperimentID   string                    `json:"experiment_id"`
	VariantResults map[string]*VariantResult `json:"variant_results"`
	Winner         string                    `json:"winner,omitempty"`
	Confidence     float64                   `json:"confidence"`
	SampleSize     int                       `json:"sample_size"`
	Duration       time.Duration             `json:"duration"`
}

ExperimentResult is an experiment result. Validates: Requirements 11.3, 11.4

type ExperimentStatus

type ExperimentStatus string

ExperimentStatus is the experiment status.

const (
	ExperimentStatusDraft    ExperimentStatus = "draft"
	ExperimentStatusRunning  ExperimentStatus = "running"
	ExperimentStatusPaused   ExperimentStatus = "paused"
	ExperimentStatusComplete ExperimentStatus = "completed"
)

type ExperimentStore

type ExperimentStore interface {
	// SaveExperiment saves an experiment
	SaveExperiment(ctx context.Context, exp *Experiment) error
	// LoadExperiment loads an experiment
	LoadExperiment(ctx context.Context, id string) (*Experiment, error)
	// ListExperiments lists all experiments
	ListExperiments(ctx context.Context) ([]*Experiment, error)
	// DeleteExperiment deletes an experiment
	DeleteExperiment(ctx context.Context, id string) error
	// RecordAssignment records a variant assignment
	RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
	// GetAssignment returns a user's assignment
	GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
	// RecordResult records a result
	RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
	// GetResults returns experiment results
	GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
}

ExperimentStore is the experiment storage interface.

type InputOutputPair

type InputOutputPair struct {
	Input  *EvalInput
	Output *EvalOutput
}

InputOutputPair is an input/output pair used for batch judging.

type JSONScorer

type JSONScorer struct{}

JSONScorer scores based on JSON structure matching.

func (*JSONScorer) Score

func (s *JSONScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)

type JudgeDimension

type JudgeDimension struct {
	Name        string  `json:"name"`
	Description string  `json:"description"`
	Weight      float64 `json:"weight"`
}

JudgeDimension is a judging dimension. Validates: Requirements 10.3

type JudgeResult

type JudgeResult struct {
	OverallScore float64                   `json:"overall_score"`
	Dimensions   map[string]DimensionScore `json:"dimensions"`
	Reasoning    string                    `json:"reasoning"`
	Confidence   float64                   `json:"confidence"`
	// Additional metadata
	Model     string    `json:"model,omitempty"`
	Timestamp time.Time `json:"timestamp"`
}

JudgeResult is a judge result. Validates: Requirements 10.4

type LLMJudge

type LLMJudge struct {
	// contains filtered or unexported fields
}

LLMJudge is an LLM judge that uses an LLM as the evaluator to assess agent output quality. Validates: Requirements 10.1, 10.2, 10.3, 10.4, 10.5

func NewLLMJudge

func NewLLMJudge(provider llm.Provider, config LLMJudgeConfig, logger *zap.Logger) *LLMJudge

NewLLMJudge creates an LLM judge. Validates: Requirements 10.1

func (*LLMJudge) AggregateResults

func (j *LLMJudge) AggregateResults(results []*JudgeResult) *AggregatedJudgeResult

AggregateResults aggregates multiple judge results. Validates: Requirements 10.5

func (*LLMJudge) GetConfig

func (j *LLMJudge) GetConfig() LLMJudgeConfig

GetConfig returns the current configuration.

func (*LLMJudge) Judge

func (j *LLMJudge) Judge(ctx context.Context, input *EvalInput, output *EvalOutput) (*JudgeResult, error)

Judge performs a judgment. Validates: Requirements 10.2, 10.4

func (*LLMJudge) JudgeBatch

func (j *LLMJudge) JudgeBatch(ctx context.Context, pairs []InputOutputPair) ([]*JudgeResult, error)

JudgeBatch judges a batch of input/output pairs. Validates: Requirements 10.4, 10.5

type LLMJudgeConfig

type LLMJudgeConfig struct {
	Model            string           `json:"model"`
	Dimensions       []JudgeDimension `json:"dimensions"`
	PromptTemplate   string           `json:"prompt_template"`
	ScoreRange       [2]float64       `json:"score_range"` // [min, max]
	RequireReasoning bool             `json:"require_reasoning"`
	// Timeout for each judge call
	Timeout time.Duration `json:"timeout,omitempty"`
	// MaxConcurrency for batch judging
	MaxConcurrency int `json:"max_concurrency,omitempty"`
}

LLMJudgeConfig is the LLM judge configuration. Validates: Requirements 10.1, 10.3

func DefaultLLMJudgeConfig

func DefaultLLMJudgeConfig() LLMJudgeConfig

DefaultLLMJudgeConfig returns the default configuration.

type LatencyMetric

type LatencyMetric struct {
	// ThresholdMs is the latency threshold in milliseconds, used for normalization.
	// If set, the returned value is max(0, 1 - latency/threshold).
	// If unset (0), the raw latency in milliseconds is returned.
	ThresholdMs float64
}

LatencyMetric is a latency metric that reports response latency in milliseconds. Validates: Requirements 9.3

func NewLatencyMetric

func NewLatencyMetric() *LatencyMetric

NewLatencyMetric creates a latency metric.

func NewLatencyMetricWithThreshold

func NewLatencyMetricWithThreshold(thresholdMs float64) *LatencyMetric

NewLatencyMetricWithThreshold creates a latency metric with a threshold.

func (*LatencyMetric) Compute

func (m *LatencyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute computes latency. If a threshold is set, it returns a normalized score (0.0 - 1.0); otherwise it returns the raw latency in milliseconds.

func (*LatencyMetric) Name

func (m *LatencyMetric) Name() string

Name returns the metric name.

type MemoryExperimentStore

type MemoryExperimentStore struct {
	// contains filtered or unexported fields
}

MemoryExperimentStore is an in-memory experiment store for tests and simple scenarios.

func NewMemoryExperimentStore

func NewMemoryExperimentStore() *MemoryExperimentStore

NewMemoryExperimentStore creates an in-memory experiment store.

func (*MemoryExperimentStore) DeleteExperiment

func (s *MemoryExperimentStore) DeleteExperiment(ctx context.Context, id string) error

DeleteExperiment deletes an experiment.

func (*MemoryExperimentStore) GetAssignment

func (s *MemoryExperimentStore) GetAssignment(ctx context.Context, experimentID, userID string) (string, error)

GetAssignment returns a user's assignment.

func (*MemoryExperimentStore) GetAssignmentCount

func (s *MemoryExperimentStore) GetAssignmentCount(experimentID string) map[string]int

GetAssignmentCount returns assignment counts (for testing).

func (*MemoryExperimentStore) GetResultCount

func (s *MemoryExperimentStore) GetResultCount(experimentID string) map[string]int

GetResultCount returns result counts (for testing).

func (*MemoryExperimentStore) GetResults

func (s *MemoryExperimentStore) GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)

GetResults retrieves experiment results.

func (*MemoryExperimentStore) ListExperiments

func (s *MemoryExperimentStore) ListExperiments(ctx context.Context) ([]*Experiment, error)

ListExperiments lists all experiments.

func (*MemoryExperimentStore) LoadExperiment

func (s *MemoryExperimentStore) LoadExperiment(ctx context.Context, id string) (*Experiment, error)

LoadExperiment loads an experiment.

func (*MemoryExperimentStore) RecordAssignment

func (s *MemoryExperimentStore) RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error

RecordAssignment records a variant assignment.

func (*MemoryExperimentStore) RecordResult

func (s *MemoryExperimentStore) RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error

RecordResult records a result.

func (*MemoryExperimentStore) SaveExperiment

func (s *MemoryExperimentStore) SaveExperiment(ctx context.Context, exp *Experiment) error

SaveExperiment saves an experiment.

type Metric

type Metric interface {
	// Name returns the metric name
	Name() string
	// Compute computes the metric value
	Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
}

Metric is the evaluation metric interface. Validates: Requirements 9.1

type MetricEvalResult

type MetricEvalResult struct {
	InputID   string             `json:"input_id"`
	Metrics   map[string]float64 `json:"metrics"`
	Passed    bool               `json:"passed"`
	Errors    []string           `json:"errors,omitempty"`
	Timestamp time.Time          `json:"timestamp"`
}

MetricEvalResult is an evaluation result (conforming to the design document). Note: distinct from the existing EvalResult; this type is used exclusively with the Metric interface.

func NewMetricEvalResult

func NewMetricEvalResult(inputID string) *MetricEvalResult

NewMetricEvalResult creates an evaluation result.

func (*MetricEvalResult) AddError

func (r *MetricEvalResult) AddError(err string) *MetricEvalResult

AddError appends an error.

func (*MetricEvalResult) AddMetric

func (r *MetricEvalResult) AddMetric(name string, value float64) *MetricEvalResult

AddMetric adds a metric value.

func (*MetricEvalResult) SetPassed

func (r *MetricEvalResult) SetPassed(passed bool) *MetricEvalResult

SetPassed sets whether the result passed.

type MetricRegistry

type MetricRegistry struct {
	// contains filtered or unexported fields
}

MetricRegistry is a metric registry.

func NewMetricRegistry

func NewMetricRegistry() *MetricRegistry

NewMetricRegistry creates a metric registry.

func NewRegistryWithBuiltinMetrics

func NewRegistryWithBuiltinMetrics() *MetricRegistry

NewRegistryWithBuiltinMetrics creates a registry containing all built-in metrics.

func (*MetricRegistry) ComputeAll

func (r *MetricRegistry) ComputeAll(ctx context.Context, input *EvalInput, output *EvalOutput) (*MetricEvalResult, error)

ComputeAll computes all registered metrics.

func (*MetricRegistry) Get

func (r *MetricRegistry) Get(name string) (Metric, bool)

Get retrieves a metric.

func (*MetricRegistry) List

func (r *MetricRegistry) List() []string

List lists all metric names.

func (*MetricRegistry) Register

func (r *MetricRegistry) Register(metric Metric)

Register registers a metric.

type Scorer

type Scorer interface {
	Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)
}

Scorer defines the interface for scoring evaluation results.

type StatisticalReport

type StatisticalReport struct {
	ExperimentID     string                    `json:"experiment_id"`
	ExperimentName   string                    `json:"experiment_name"`
	Status           ExperimentStatus          `json:"status"`
	Duration         time.Duration             `json:"duration"`
	TotalSamples     int                       `json:"total_samples"`
	VariantReports   map[string]*VariantReport `json:"variant_reports"`
	Comparisons      []*VariantComparison      `json:"comparisons"`
	Winner           string                    `json:"winner,omitempty"`
	WinnerConfidence float64                   `json:"winner_confidence,omitempty"`
	Recommendation   string                    `json:"recommendation"`
	GeneratedAt      time.Time                 `json:"generated_at"`
}

StatisticalReport represents a detailed statistical analysis report. Validates: Requirements 11.4

type TokenUsageMetric

type TokenUsageMetric struct {
	// MaxTokens is the maximum token count, used for normalization.
	// If set, the returned value is max(0, 1 - tokens/maxTokens).
	// If unset (0), the raw token count is returned.
	MaxTokens int
}

TokenUsageMetric is a token usage metric that reports token consumption. Validates: Requirements 9.3

func NewTokenUsageMetric

func NewTokenUsageMetric() *TokenUsageMetric

NewTokenUsageMetric creates a token usage metric.

func NewTokenUsageMetricWithMax

func NewTokenUsageMetricWithMax(maxTokens int) *TokenUsageMetric

NewTokenUsageMetricWithMax creates a token usage metric with a maximum value.

func (*TokenUsageMetric) Compute

func (m *TokenUsageMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute computes token usage. If a maximum is set, it returns a normalized score (0.0 - 1.0); otherwise it returns the raw token count.

func (*TokenUsageMetric) Name

func (m *TokenUsageMetric) Name() string

Name returns the metric name.

type Variant

type Variant struct {
	ID        string         `json:"id"`
	Name      string         `json:"name"`
	Config    map[string]any `json:"config"`
	Weight    float64        `json:"weight"` // traffic weight
	IsControl bool           `json:"is_control"`
}

Variant is an experiment variant. Validates: Requirements 11.1, 11.5

type VariantComparison

type VariantComparison struct {
	ControlID      string             `json:"control_id"`
	TreatmentID    string             `json:"treatment_id"`
	MetricDeltas   map[string]float64 `json:"metric_deltas"`   // treatment - control
	RelativeChange map[string]float64 `json:"relative_change"` // percentage change
	PValues        map[string]float64 `json:"p_values"`
	Confidence     map[string]float64 `json:"confidence"`
	Significant    map[string]bool    `json:"significant"` // at 95% level
}

VariantComparison contains comparison results between two variants.

type VariantReport

type VariantReport struct {
	VariantID    string                `json:"variant_id"`
	VariantName  string                `json:"variant_name"`
	IsControl    bool                  `json:"is_control"`
	SampleCount  int                   `json:"sample_count"`
	Metrics      map[string]float64    `json:"metrics"`
	StdDev       map[string]float64    `json:"std_dev"`
	ConfInterval map[string][2]float64 `json:"confidence_interval"` // 95% CI
}

VariantReport contains detailed statistics for a single variant.

type VariantResult

type VariantResult struct {
	VariantID   string             `json:"variant_id"`
	SampleCount int                `json:"sample_count"`
	Metrics     map[string]float64 `json:"metrics"`
	StdDev      map[string]float64 `json:"std_dev"`
	// contains filtered or unexported fields
}

VariantResult is a variant result. Validates: Requirements 11.3
