evaluation

package

v1.3.0 Latest Latest Go to latest Published: Feb 26, 2026 License: MIT Imports: 13 Imported by: 0

Details

Valid go.mod file
Redistributable license
Tagged version
Stable version
Learn more about best practices

Repository

github.com/BaSui01/agentflow

Links

Open Source Insights

Documentation ¶

Index ¶

Constants
Variables
func RegisterBuiltinMetrics(registry *MetricRegistry)
func RegisterResearchMetrics(evaluator *ResearchEvaluator, logger *zap.Logger)
type ABTester
- func NewABTester(store ExperimentStore, logger *zap.Logger) *ABTester
- func (t *ABTester) Analyze(ctx context.Context, experimentID string) (*ExperimentResult, error)
- func (t *ABTester) Assign(experimentID, userID string) (*Variant, error)
- func (t *ABTester) AutoSelectWinner(ctx context.Context, experimentID string, minConfidence float64) (*Variant, error)
- func (t *ABTester) CompleteExperiment(experimentID string) error
- func (t *ABTester) CreateExperiment(exp *Experiment) error
- func (t *ABTester) DeleteExperiment(experimentID string) error
- func (t *ABTester) GenerateReport(ctx context.Context, experimentID string) (*StatisticalReport, error)
- func (t *ABTester) GetExperiment(experimentID string) (*Experiment, error)
- func (t *ABTester) ListExperiments() []*Experiment
- func (t *ABTester) PauseExperiment(experimentID string) error
- func (t *ABTester) RecordResult(experimentID, variantID string, result *EvalResult) error
- func (t *ABTester) StartExperiment(experimentID string) error
type AccuracyMetric
- func NewAccuracyMetric() *AccuracyMetric
- func (m *AccuracyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *AccuracyMetric) Name() string
type AggregatedJudgeResult
type Alert
type AlertHandler
type AlertLevel
type AlertThreshold
type BatchEvalReport
type ClarityMetric
- func NewClarityMetric(logger *zap.Logger) *ClarityMetric
- func (m *ClarityMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *ClarityMetric) Name() string
type CompletenessMetric
- func NewCompletenessMetric(logger *zap.Logger) *CompletenessMetric
- func (m *CompletenessMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *CompletenessMetric) Name() string
type ContainsScorer
- func (s *ContainsScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)
type CostMetric
- func NewCostMetric() *CostMetric
- func NewCostMetricWithMax(maxCost float64) *CostMetric
- func (m *CostMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *CostMetric) Name() string
type DimensionScore
type EvalExecutor
type EvalInput
- func NewEvalInput(prompt string) *EvalInput
- func (e *EvalInput) WithContext(ctx map[string]any) *EvalInput
- func (e *EvalInput) WithExpected(expected string) *EvalInput
- func (e *EvalInput) WithReference(reference string) *EvalInput
type EvalOutput
- func NewEvalOutput(response string) *EvalOutput
- func (e *EvalOutput) WithCost(cost float64) *EvalOutput
- func (e *EvalOutput) WithLatency(latency time.Duration) *EvalOutput
- func (e *EvalOutput) WithMetadata(metadata map[string]any) *EvalOutput
- func (e *EvalOutput) WithTokensUsed(tokens int) *EvalOutput
type EvalReport
type EvalResult
type EvalSuite
type EvalSummary
type EvalTask
type Evaluator
- func NewEvaluator(config EvaluatorConfig, logger *zap.Logger) *Evaluator
- func (e *Evaluator) AddAlertHandler(handler AlertHandler)
- func (e *Evaluator) ClearAlerts()
- func (e *Evaluator) Evaluate(ctx context.Context, suite *EvalSuite, agent EvalExecutor) (*EvalReport, error)
- func (e *Evaluator) EvaluateBatch(ctx context.Context, suites []*EvalSuite, agent EvalExecutor) ([]*EvalReport, error)
- func (e *Evaluator) GenerateReport(reports []*EvalReport) *BatchEvalReport
- func (e *Evaluator) GetAlerts() []Alert
- func (e *Evaluator) RegisterScorer(taskType string, scorer Scorer)
- func (e *Evaluator) SetMetricRegistry(registry *MetricRegistry)
type EvaluatorConfig
- func DefaultEvaluatorConfig() EvaluatorConfig
type ExactMatchScorer
- func (s *ExactMatchScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)
type Experiment
type ExperimentResult
type ExperimentStatus
type ExperimentStore
type InputOutputPair
type JSONScorer
- func (s *JSONScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)
type JudgeDimension
type JudgeResult
type LLMJudge
- func NewLLMJudge(provider llm.Provider, config LLMJudgeConfig, logger *zap.Logger) *LLMJudge
- func (j *LLMJudge) AggregateResults(results []*JudgeResult) *AggregatedJudgeResult
- func (j *LLMJudge) GetConfig() LLMJudgeConfig
- func (j *LLMJudge) Judge(ctx context.Context, input *EvalInput, output *EvalOutput) (*JudgeResult, error)
- func (j *LLMJudge) JudgeBatch(ctx context.Context, pairs []InputOutputPair) ([]*JudgeResult, error)
type LLMJudgeConfig
- func DefaultLLMJudgeConfig() LLMJudgeConfig
type LatencyMetric
- func NewLatencyMetric() *LatencyMetric
- func NewLatencyMetricWithThreshold(thresholdMs float64) *LatencyMetric
- func (m *LatencyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *LatencyMetric) Name() string
type MemoryExperimentStore
- func NewMemoryExperimentStore() *MemoryExperimentStore
- func (s *MemoryExperimentStore) DeleteExperiment(ctx context.Context, id string) error
- func (s *MemoryExperimentStore) GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
- func (s *MemoryExperimentStore) GetAssignmentCount(experimentID string) map[string]int
- func (s *MemoryExperimentStore) GetResultCount(experimentID string) map[string]int
- func (s *MemoryExperimentStore) GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
- func (s *MemoryExperimentStore) ListExperiments(ctx context.Context) ([]*Experiment, error)
- func (s *MemoryExperimentStore) LoadExperiment(ctx context.Context, id string) (*Experiment, error)
- func (s *MemoryExperimentStore) RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
- func (s *MemoryExperimentStore) RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
- func (s *MemoryExperimentStore) SaveExperiment(ctx context.Context, exp *Experiment) error
type Metric
type MetricEvalResult
- func NewMetricEvalResult(inputID string) *MetricEvalResult
- func (r *MetricEvalResult) AddError(err string) *MetricEvalResult
- func (r *MetricEvalResult) AddMetric(name string, value float64) *MetricEvalResult
- func (r *MetricEvalResult) SetPassed(passed bool) *MetricEvalResult
type MetricRegistry
- func NewMetricRegistry() *MetricRegistry
- func NewRegistryWithBuiltinMetrics() *MetricRegistry
- func (r *MetricRegistry) ComputeAll(ctx context.Context, input *EvalInput, output *EvalOutput) (*MetricEvalResult, error)
- func (r *MetricRegistry) Get(name string) (Metric, bool)
- func (r *MetricRegistry) List() []string
- func (r *MetricRegistry) Register(metric Metric)
type NoveltyMetric
- func NewNoveltyMetric(logger *zap.Logger) *NoveltyMetric
- func (m *NoveltyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *NoveltyMetric) Name() string
type ResearchDimension
type ResearchEvalConfig
- func DefaultResearchEvalConfig() ResearchEvalConfig
type ResearchEvalResult
type ResearchEvaluator
- func NewResearchEvaluator(config ResearchEvalConfig, logger *zap.Logger) *ResearchEvaluator
- func (e *ResearchEvaluator) BatchEvaluate(ctx context.Context, pairs []struct{ ... }) ([]*ResearchEvalResult, error)
- func (e *ResearchEvaluator) Evaluate(ctx context.Context, input *EvalInput, output *EvalOutput) (*ResearchEvalResult, error)
- func (e *ResearchEvaluator) RegisterMetric(dimension ResearchDimension, metric Metric)
type RigorMetric
- func NewRigorMetric(logger *zap.Logger) *RigorMetric
- func (m *RigorMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *RigorMetric) Name() string
type Scorer
type StatisticalReport
type TokenUsageMetric
- func NewTokenUsageMetric() *TokenUsageMetric
- func NewTokenUsageMetricWithMax(maxTokens int) *TokenUsageMetric
- func (m *TokenUsageMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
- func (m *TokenUsageMetric) Name() string
type Variant
type VariantComparison
type VariantReport
type VariantResult

Constants ¶

View Source

const DefaultPromptTemplate = `` /* 1118-byte string literal not displayed */

DefaultPromptTemplate 默认评估提示模板。验证：需求 10.2

Variables ¶

View Source

// Errors related to A/B testing.
var (
	ErrExperimentNotFound  = errors.New("experiment not found")
	ErrExperimentNotActive = errors.New("experiment not active")
	ErrNoVariants          = errors.New("no variants defined")
	ErrInvalidWeights      = errors.New("invalid variant weights")
	ErrVariantNotFound     = errors.New("variant not found")
)

A/B 测试相关错误

Functions ¶

func RegisterResearchMetrics ¶

func RegisterResearchMetrics(evaluator *ResearchEvaluator, logger *zap.Logger)

RegisterResearchMetrics 注册所有研究评估指标到评估器

Types ¶

func (*ABTester) Analyze ¶

func (t *ABTester) Analyze(ctx context.Context, experimentID string) (*ExperimentResult, error)

Analyze 分析实验结果。验证：需求 11.3、11.4

func (*ABTester) Assign ¶

func (t *ABTester) Assign(experimentID, userID string) (*Variant, error)

Assign 分配变体，使用一致性哈希确保同一用户始终分配到同一变体。验证：需求 11.2

func (*ABTester) AutoSelectWinner ¶

func (t *ABTester) AutoSelectWinner(ctx context.Context, experimentID string, minConfidence float64) (*Variant, error)

AutoSelectWinner 在检测到统计显著性时自动选择获胜的变体配置。验证：需求 11.6

func (*ABTester) CompleteExperiment ¶

func (t *ABTester) CompleteExperiment(experimentID string) error

CompleteExperiment 完成实验

func (*ABTester) CreateExperiment ¶

func (t *ABTester) CreateExperiment(exp *Experiment) error

CreateExperiment 创建实验。验证：需求 11.1

func (*ABTester) DeleteExperiment ¶

func (t *ABTester) DeleteExperiment(experimentID string) error

DeleteExperiment 删除实验

func (*ABTester) GenerateReport ¶

func (t *ABTester) GenerateReport(ctx context.Context, experimentID string) (*StatisticalReport, error)

GenerateReport 生成一份全面的统计显著性分析报告。验证：需求 11.4

func (*ABTester) GetExperiment ¶

func (t *ABTester) GetExperiment(experimentID string) (*Experiment, error)

GetExperiment 获取实验

func (*ABTester) ListExperiments ¶

func (t *ABTester) ListExperiments() []*Experiment

ListExperiments 列出所有实验

func (*ABTester) PauseExperiment ¶

func (t *ABTester) PauseExperiment(experimentID string) error

PauseExperiment 暂停实验

func (*ABTester) RecordResult ¶

func (t *ABTester) RecordResult(experimentID, variantID string, result *EvalResult) error

RecordResult 记录结果。验证：需求 11.3

func (*ABTester) StartExperiment ¶

func (t *ABTester) StartExperiment(experimentID string) error

StartExperiment 启动实验

type AccuracyMetric ¶

// AccuracyMetric is an accuracy metric computed by comparing the actual
// output against the expected output.
type AccuracyMetric struct {
	// CaseSensitive reports whether the comparison is case-sensitive.
	CaseSensitive bool
	// TrimWhitespace reports whether leading and trailing whitespace is trimmed.
	TrimWhitespace bool
	// UseContains selects containment matching instead of exact matching.
	UseContains bool
}

AccuracyMetric 准确率指标，通过比较实际输出与期望输出计算准确率。验证：需求 9.3

func (*AccuracyMetric) Compute ¶

func (m *AccuracyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute 计算准确率。返回值范围：0.0 - 1.0。其中 1.0 表示完全匹配；0.0 - 1.0 之间的值表示部分匹配（基于字符相似度）。

func (*AccuracyMetric) Name ¶

func (m *AccuracyMetric) Name() string

Name 返回指标名称

type AlertLevel ¶

type AlertLevel string

AlertLevel 定义了告警的严重程度。

// Alert severity levels.
const (
	AlertLevelInfo     AlertLevel = "info"
	AlertLevelWarning  AlertLevel = "warning"
	AlertLevelCritical AlertLevel = "critical"
)

func (*ClarityMetric) Compute ¶

func (m *ClarityMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

func (*ClarityMetric) Name ¶

func (m *ClarityMetric) Name() string

func (*CompletenessMetric) Compute ¶

func (m *CompletenessMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

func (*CompletenessMetric) Name ¶

func (m *CompletenessMetric) Name() string

func (*ContainsScorer) Score ¶

func (s *ContainsScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)

func (*CostMetric) Compute ¶

func (m *CostMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute 计算成本。如果设置了最大值，返回归一化分数 (0.0 - 1.0)，否则返回原始成本值。

func (*CostMetric) Name ¶

func (m *CostMetric) Name() string

Name 返回指标名称

type EvalExecutor ¶ added in v1.0.0

// EvalExecutor defines the interface for executing an agent during evaluation.
// It uses string I/O and returns a token count alongside the output.
type EvalExecutor interface {
	// Execute runs the agent on input and returns its output, the token
	// count, and any error.
	Execute(ctx context.Context, input string) (output string, tokens int, err error)
}

EvalExecutor defines the interface for executing an agent during evaluation. Renamed from AgentExecutor to avoid naming conflict with workflow.AgentExecutor. Unlike types.Executor (which uses any -> any), this interface uses string I/O and returns token count, which is specific to evaluation scoring needs.

type EvalInput ¶

type EvalInput struct {
	Prompt    string         `json:"prompt"`
	Context   map[string]any `json:"context,omitempty"`
	Expected  string         `json:"expected,omitempty"`
	Reference string         `json:"reference,omitempty"`
}

EvalInput 评估输入

func NewEvalInput ¶

func NewEvalInput(prompt string) *EvalInput

NewEvalInput 创建评估输入

func (*EvalInput) WithContext ¶

func (e *EvalInput) WithContext(ctx map[string]any) *EvalInput

WithContext 设置上下文

func (*EvalInput) WithExpected ¶

func (e *EvalInput) WithExpected(expected string) *EvalInput

WithExpected 设置期望输出

func (*EvalInput) WithReference ¶

func (e *EvalInput) WithReference(reference string) *EvalInput

WithReference 设置参考内容

type EvalOutput ¶

type EvalOutput struct {
	Response   string         `json:"response"`
	TokensUsed int            `json:"tokens_used"`
	Latency    time.Duration  `json:"latency"`
	Cost       float64        `json:"cost"`
	Metadata   map[string]any `json:"metadata,omitempty"`
}

EvalOutput 评估输出

func NewEvalOutput ¶

func NewEvalOutput(response string) *EvalOutput

NewEvalOutput 创建评估输出

func (*EvalOutput) WithCost ¶

func (e *EvalOutput) WithCost(cost float64) *EvalOutput

WithCost 设置成本

func (*EvalOutput) WithLatency ¶

func (e *EvalOutput) WithLatency(latency time.Duration) *EvalOutput

WithLatency 设置延迟

func (*EvalOutput) WithMetadata ¶

func (e *EvalOutput) WithMetadata(metadata map[string]any) *EvalOutput

WithMetadata 设置元数据

func (*EvalOutput) WithTokensUsed ¶

func (e *EvalOutput) WithTokensUsed(tokens int) *EvalOutput

WithTokensUsed 设置 Token 使用量

type EvalReport ¶

type EvalReport struct {
	SuiteID   string            `json:"suite_id"`
	SuiteName string            `json:"suite_name"`
	AgentID   string            `json:"agent_id"`
	Results   []EvalResult      `json:"results"`
	Summary   EvalSummary       `json:"summary"`
	StartTime time.Time         `json:"start_time"`
	EndTime   time.Time         `json:"end_time"`
	Duration  time.Duration     `json:"duration"`
	Metadata  map[string]string `json:"metadata,omitempty"`
}

EvalReport 是完整的评估报告。

type EvalResult ¶

// EvalResult is the result of evaluating a single task.
type EvalResult struct {
	TaskID     string             `json:"task_id"`
	Success    bool               `json:"success"`
	Output     string             `json:"output"`
	Expected   string             `json:"expected,omitempty"`
	Score      float64            `json:"score"` // Score in the range 0.0 - 1.0
	Metrics    map[string]float64 `json:"metrics,omitempty"`
	Error      string             `json:"error,omitempty"`
	Duration   time.Duration      `json:"duration"`
	TokensUsed int                `json:"tokens_used,omitempty"`
	Cost       float64            `json:"cost,omitempty"`
}

EvalResult 是单个任务的评估结果。

type EvalSuite ¶

type EvalSuite struct {
	ID          string     `json:"id"`
	Name        string     `json:"name"`
	Description string     `json:"description"`
	Tasks       []EvalTask `json:"tasks"`
	Version     string     `json:"version"`
}

EvalSuite 表示一组评估任务。

type EvalSummary ¶

// EvalSummary holds aggregated evaluation metrics.
type EvalSummary struct {
	TotalTasks     int                `json:"total_tasks"`
	PassedTasks    int                `json:"passed_tasks"`
	FailedTasks    int                `json:"failed_tasks"`
	PassRate       float64            `json:"pass_rate"`
	AverageScore   float64            `json:"average_score"`
	TotalTokens    int                `json:"total_tokens"`
	TotalCost      float64            `json:"total_cost"`
	TotalDuration  time.Duration      `json:"total_duration"`
	MetricAverages map[string]float64 `json:"metric_averages,omitempty"`
	// Statistical measures
	ScoreStdDev float64            `json:"score_std_dev"`
	ScoreMin    float64            `json:"score_min"`
	ScoreMax    float64            `json:"score_max"`
	ScoreMedian float64            `json:"score_median"`
	Percentiles map[string]float64 `json:"percentiles,omitempty"` // p50, p90, p95, p99
}

EvalSummary 包含汇总的评估指标。验证：需求 9.5

type EvalTask ¶

type EvalTask struct {
	ID          string            `json:"id"`
	Name        string            `json:"name"`
	Description string            `json:"description"`
	Input       string            `json:"input"`
	Expected    string            `json:"expected,omitempty"`
	Metadata    map[string]string `json:"metadata,omitempty"`
	Tags        []string          `json:"tags,omitempty"`
	Timeout     time.Duration     `json:"timeout,omitempty"`
}

EvalTask 表示一项评估任务。

type Evaluator ¶

type Evaluator struct {
	// contains filtered or unexported fields
}

Evaluator 对代理运行评估套件。验证：需求 9.2、9.4、9.5、9.6

func NewEvaluator ¶

func NewEvaluator(config EvaluatorConfig, logger *zap.Logger) *Evaluator

NewEvaluator 创建一个新的评估器。

func (*Evaluator) AddAlertHandler ¶

func (e *Evaluator) AddAlertHandler(handler AlertHandler)

AddAlertHandler 添加一个告警处理器。验证：需求 9.6

func (*Evaluator) ClearAlerts ¶

func (e *Evaluator) ClearAlerts()

ClearAlerts 清除所有已触发的告警。

func (*Evaluator) Evaluate ¶

func (e *Evaluator) Evaluate(ctx context.Context, suite *EvalSuite, agent EvalExecutor) (*EvalReport, error)

Evaluate 针对代理运行一个评估套件。验证：需求 9.2、9.5

func (*Evaluator) EvaluateBatch ¶

func (e *Evaluator) EvaluateBatch(ctx context.Context, suites []*EvalSuite, agent EvalExecutor) ([]*EvalReport, error)

EvaluateBatch 对多个评估套件进行批量评估。验证：需求 9.4

func (*Evaluator) GenerateReport ¶

func (e *Evaluator) GenerateReport(reports []*EvalReport) *BatchEvalReport

GenerateReport 生成一份全面的评估报告。验证：需求 9.5

func (*Evaluator) GetAlerts ¶

func (e *Evaluator) GetAlerts() []Alert

GetAlerts 返回所有已触发的告警。

func (*Evaluator) RegisterScorer ¶

func (e *Evaluator) RegisterScorer(taskType string, scorer Scorer)

RegisterScorer 为特定任务类型注册评分器。

func (*Evaluator) SetMetricRegistry ¶

func (e *Evaluator) SetMetricRegistry(registry *MetricRegistry)

SetMetricRegistry 设置自定义的指标注册表。

type EvaluatorConfig ¶

// EvaluatorConfig configures the Evaluator.
type EvaluatorConfig struct {
	Concurrency     int              `json:"concurrency"`
	DefaultTimeout  time.Duration    `json:"default_timeout"`
	StopOnFailure   bool             `json:"stop_on_failure"`
	RetryOnError    bool             `json:"retry_on_error"`
	MaxRetries      int              `json:"max_retries"`
	PassThreshold   float64          `json:"pass_threshold"` // Score threshold to pass
	AlertThresholds []AlertThreshold `json:"alert_thresholds,omitempty"`
	// Batch evaluation settings
	BatchSize      int  `json:"batch_size"`      // Number of tasks per batch
	CollectMetrics bool `json:"collect_metrics"` // Auto-collect metrics after execution
	EnableAlerts   bool `json:"enable_alerts"`   // Enable alert triggering
}

EvaluatorConfig 配置评估器。验证：需求 9.4、9.6

func DefaultEvaluatorConfig ¶

func DefaultEvaluatorConfig() EvaluatorConfig

DefaultEvaluatorConfig 返回合理的默认值。

type ExactMatchScorer ¶

type ExactMatchScorer struct{}

ExactMatchScorer 基于精确字符串匹配进行评分。

func (*ExactMatchScorer) Score ¶

func (s *ExactMatchScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)

type Experiment ¶

type Experiment struct {
	ID          string           `json:"id"`
	Name        string           `json:"name"`
	Description string           `json:"description"`
	Variants    []Variant        `json:"variants"`
	Metrics     []string         `json:"metrics"`
	StartTime   time.Time        `json:"start_time"`
	EndTime     *time.Time       `json:"end_time,omitempty"`
	Status      ExperimentStatus `json:"status"`
}

Experiment 实验定义。验证：需求 11.1

type ExperimentResult ¶

type ExperimentResult struct {
	ExperimentID   string                    `json:"experiment_id"`
	VariantResults map[string]*VariantResult `json:"variant_results"`
	Winner         string                    `json:"winner,omitempty"`
	Confidence     float64                   `json:"confidence"`
	SampleSize     int                       `json:"sample_size"`
	Duration       time.Duration             `json:"duration"`
}

ExperimentResult 实验结果。验证：需求 11.3、11.4

type ExperimentStatus ¶

type ExperimentStatus string

ExperimentStatus 实验状态

// Experiment statuses.
const (
	ExperimentStatusDraft    ExperimentStatus = "draft"
	ExperimentStatusRunning  ExperimentStatus = "running"
	ExperimentStatusPaused   ExperimentStatus = "paused"
	// Note: the constant is named Complete but its string value is "completed".
	ExperimentStatusComplete ExperimentStatus = "completed"
)

type ExperimentStore ¶

// ExperimentStore is the storage interface for experiments.
type ExperimentStore interface {
	// SaveExperiment saves an experiment.
	SaveExperiment(ctx context.Context, exp *Experiment) error
	// LoadExperiment loads an experiment.
	LoadExperiment(ctx context.Context, id string) (*Experiment, error)
	// ListExperiments lists all experiments.
	ListExperiments(ctx context.Context) ([]*Experiment, error)
	// DeleteExperiment deletes an experiment.
	DeleteExperiment(ctx context.Context, id string) error
	// RecordAssignment records an assignment.
	RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
	// GetAssignment gets a user's assignment.
	GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
	// RecordResult records a result.
	RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
	// GetResults gets the results of an experiment.
	GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
}

ExperimentStore 实验存储接口

type InputOutputPair ¶

type InputOutputPair struct {
	Input  *EvalInput
	Output *EvalOutput
}

InputOutputPair 输入输出对，用于批量评判

type JSONScorer ¶

type JSONScorer struct{}

JSONScorer 基于 JSON 结构匹配进行评分。

func (*JSONScorer) Score ¶

func (s *JSONScorer) Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)

type JudgeDimension ¶

type JudgeDimension struct {
	Name        string  `json:"name"`
	Description string  `json:"description"`
	Weight      float64 `json:"weight"`
}

JudgeDimension 评判维度。验证：需求（编号在原文中缺失）

type JudgeResult ¶

// JudgeResult is the result of a judgement.
type JudgeResult struct {
	OverallScore float64                   `json:"overall_score"`
	Dimensions   map[string]DimensionScore `json:"dimensions"`
	Reasoning    string                    `json:"reasoning"`
	Confidence   float64                   `json:"confidence"`
	// Additional metadata
	Model     string    `json:"model,omitempty"`
	Timestamp time.Time `json:"timestamp"`
}

JudgeResult 评判结果。验证：需求（编号在原文中缺失）

type LLMJudge ¶

type LLMJudge struct {
	// contains filtered or unexported fields
}

LLMJudge LLM 评判器，使用 LLM 作为评估者来评估 Agent 输出质量。验证：需求 10.1、10.2、10.3、10.4、10.5

func NewLLMJudge ¶

func NewLLMJudge(provider llm.Provider, config LLMJudgeConfig, logger *zap.Logger) *LLMJudge

NewLLMJudge 创建 LLM 评判器。验证：需求 10.1

func (*LLMJudge) AggregateResults ¶

func (j *LLMJudge) AggregateResults(results []*JudgeResult) *AggregatedJudgeResult

AggregateResults 聚合多个评判结果。验证：需求（编号在原文中缺失）

func (*LLMJudge) GetConfig ¶

func (j *LLMJudge) GetConfig() LLMJudgeConfig

GetConfig 返回当前配置

func (*LLMJudge) Judge ¶

func (j *LLMJudge) Judge(ctx context.Context, input *EvalInput, output *EvalOutput) (*JudgeResult, error)

Judge 执行评判。验证：需求 10.2、10.4

func (*LLMJudge) JudgeBatch ¶

func (j *LLMJudge) JudgeBatch(ctx context.Context, pairs []InputOutputPair) ([]*JudgeResult, error)

JudgeBatch 批量评判。验证：需求 10.4、10.5

type LLMJudgeConfig ¶

// LLMJudgeConfig is the configuration for the LLM judge.
type LLMJudgeConfig struct {
	Model            string           `json:"model"`
	Dimensions       []JudgeDimension `json:"dimensions"`
	PromptTemplate   string           `json:"prompt_template"`
	ScoreRange       [2]float64       `json:"score_range"` // [min, max]
	RequireReasoning bool             `json:"require_reasoning"`
	// Timeout for each judge call
	Timeout time.Duration `json:"timeout,omitempty"`
	// Maximum concurrency for batch judging
	MaxConcurrency int `json:"max_concurrency,omitempty"`
}

LLMJudgeConfig LLM 评判配置。验证：需求 10.1、10.3

func DefaultLLMJudgeConfig ¶

func DefaultLLMJudgeConfig() LLMJudgeConfig

DefaultLLMJudgeConfig 返回默认配置

type LatencyMetric ¶

// LatencyMetric is a latency metric that reports response latency in milliseconds.
type LatencyMetric struct {
	// ThresholdMs is the latency threshold in milliseconds, used for normalization.
	// If set, the returned value is max(0, 1 - latency/threshold).
	// If unset (0), the latency in milliseconds is returned directly.
	ThresholdMs float64
}

LatencyMetric 延迟指标，返回响应延迟（毫秒）。验证：需求 9.3

func NewLatencyMetric ¶

func NewLatencyMetric() *LatencyMetric

NewLatencyMetric 创建延迟指标

func NewLatencyMetricWithThreshold ¶

func NewLatencyMetricWithThreshold(thresholdMs float64) *LatencyMetric

NewLatencyMetricWithThreshold 创建带阈值的延迟指标

func (*LatencyMetric) Compute ¶

func (m *LatencyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute 计算延迟。如果设置了阈值，返回归一化分数 (0.0 - 1.0)，否则返回原始延迟（毫秒）。

func (*LatencyMetric) Name ¶

func (m *LatencyMetric) Name() string

Name 返回指标名称

type MemoryExperimentStore ¶

type MemoryExperimentStore struct {
	// contains filtered or unexported fields
}

MemoryExperimentStore 内存实验存储（用于测试和简单场景）

func NewMemoryExperimentStore ¶

func NewMemoryExperimentStore() *MemoryExperimentStore

NewMemoryExperimentStore 创建内存实验存储

func (*MemoryExperimentStore) DeleteExperiment ¶

func (s *MemoryExperimentStore) DeleteExperiment(ctx context.Context, id string) error

DeleteExperiment 删除实验

func (*MemoryExperimentStore) GetAssignment ¶

func (s *MemoryExperimentStore) GetAssignment(ctx context.Context, experimentID, userID string) (string, error)

GetAssignment 获取用户分配

func (*MemoryExperimentStore) GetAssignmentCount ¶

func (s *MemoryExperimentStore) GetAssignmentCount(experimentID string) map[string]int

GetAssignmentCount 获取分配计数（用于测试）

func (*MemoryExperimentStore) GetResultCount ¶

func (s *MemoryExperimentStore) GetResultCount(experimentID string) map[string]int

GetResultCount 获取结果计数（用于测试）

func (*MemoryExperimentStore) GetResults ¶

func (s *MemoryExperimentStore) GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)

GetResults 获取实验结果

func (*MemoryExperimentStore) ListExperiments ¶

func (s *MemoryExperimentStore) ListExperiments(ctx context.Context) ([]*Experiment, error)

ListExperiments 列出所有实验

func (*MemoryExperimentStore) LoadExperiment ¶

func (s *MemoryExperimentStore) LoadExperiment(ctx context.Context, id string) (*Experiment, error)

LoadExperiment 加载实验

func (*MemoryExperimentStore) RecordAssignment ¶

func (s *MemoryExperimentStore) RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error

RecordAssignment 记录分配

func (*MemoryExperimentStore) RecordResult ¶

func (s *MemoryExperimentStore) RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error

RecordResult 记录结果

func (*MemoryExperimentStore) SaveExperiment ¶

func (s *MemoryExperimentStore) SaveExperiment(ctx context.Context, exp *Experiment) error

SaveExperiment 保存实验

type Metric ¶

// Metric is the interface for evaluation metrics.
type Metric interface {
	// Name returns the metric name.
	Name() string
	// Compute computes the metric value.
	Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
}

Metric 评估指标接口。验证：需求 9.1

type MetricEvalResult ¶

type MetricEvalResult struct {
	InputID   string             `json:"input_id"`
	Metrics   map[string]float64 `json:"metrics"`
	Passed    bool               `json:"passed"`
	Errors    []string           `json:"errors,omitempty"`
	Timestamp time.Time          `json:"timestamp"`
}

MetricEvalResult 评估结果（符合设计文档规范）。注意：与现有 EvalResult 区分，此类型专用于 Metric 接口。

func NewMetricEvalResult ¶

func NewMetricEvalResult(inputID string) *MetricEvalResult

NewMetricEvalResult 创建评估结果

func (*MetricEvalResult) AddError ¶

func (r *MetricEvalResult) AddError(err string) *MetricEvalResult

AddError 添加错误

func (*MetricEvalResult) AddMetric ¶

func (r *MetricEvalResult) AddMetric(name string, value float64) *MetricEvalResult

AddMetric 添加指标值

func (*MetricEvalResult) SetPassed ¶

func (r *MetricEvalResult) SetPassed(passed bool) *MetricEvalResult

SetPassed 设置是否通过

type MetricRegistry ¶

type MetricRegistry struct {
	// contains filtered or unexported fields
}

MetricRegistry 指标注册表

func NewMetricRegistry ¶

func NewMetricRegistry() *MetricRegistry

NewMetricRegistry 创建指标注册表

func NewRegistryWithBuiltinMetrics ¶

func NewRegistryWithBuiltinMetrics() *MetricRegistry

NewRegistryWithBuiltinMetrics 创建包含所有内置指标的注册表

func (*MetricRegistry) ComputeAll ¶

func (r *MetricRegistry) ComputeAll(ctx context.Context, input *EvalInput, output *EvalOutput) (*MetricEvalResult, error)

ComputeAll 计算所有注册的指标

func (*MetricRegistry) Get ¶

func (r *MetricRegistry) Get(name string) (Metric, bool)

Get 获取指标

func (*MetricRegistry) List ¶

func (r *MetricRegistry) List() []string

List 列出所有指标名称

func (*MetricRegistry) Register ¶

func (r *MetricRegistry) Register(metric Metric)

Register 注册指标

type NoveltyMetric ¶

type NoveltyMetric struct {
	// contains filtered or unexported fields
}

NoveltyMetric 新颖性指标，评估研究输出的原创性和创新程度。

func NewNoveltyMetric ¶

func NewNoveltyMetric(logger *zap.Logger) *NoveltyMetric

NewNoveltyMetric 创建新颖性指标

func (*NoveltyMetric) Compute ¶

func (m *NoveltyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

func (*NoveltyMetric) Name ¶

func (m *NoveltyMetric) Name() string

type ResearchDimension ¶

type ResearchDimension string

ResearchDimension 研究评估维度

// Research evaluation dimensions.
const (
	DimensionNovelty         ResearchDimension = "novelty"
	DimensionRigor           ResearchDimension = "rigor"
	DimensionClarity         ResearchDimension = "clarity"
	DimensionRelevance       ResearchDimension = "relevance"
	DimensionCompleteness    ResearchDimension = "completeness"
	DimensionReproducibility ResearchDimension = "reproducibility"
)

type ResearchEvalConfig ¶

// ResearchEvalConfig is the configuration for research evaluation.
type ResearchEvalConfig struct {
	// Dimension weights (must sum to 1.0)
	Weights map[ResearchDimension]float64 `json:"weights"`

	// Evaluation settings
	UseLLMJudge   bool          `json:"use_llm_judge"`  // Use LLM as judge
	NumJudges     int           `json:"num_judges"`     // Number of LLM judges (for voting)
	JudgeModel    string        `json:"judge_model"`    // Model for LLM judge
	Timeout       time.Duration `json:"timeout"`        // Per-evaluation timeout
	PassThreshold float64       `json:"pass_threshold"` // Minimum score to pass (0-1)

	// Reference-based evaluation
	UseReferences bool `json:"use_references"` // Compare against reference papers
	MaxReferences int  `json:"max_references"` // Maximum reference papers to compare
}

ResearchEvalConfig 研究评估配置

func DefaultResearchEvalConfig ¶

func DefaultResearchEvalConfig() ResearchEvalConfig

DefaultResearchEvalConfig 返回默认研究评估配置

type ResearchEvalResult ¶

// ResearchEvalResult is the result of a research evaluation.
type ResearchEvalResult struct {
	OverallScore    float64                       `json:"overall_score"`    // Overall score (0-1)
	DimensionScores map[ResearchDimension]float64 `json:"dimension_scores"` // Per-dimension scores
	Passed          bool                          `json:"passed"`           // Whether the evaluation passed
	Feedback        map[ResearchDimension]string  `json:"feedback"`         // Per-dimension feedback
	Strengths       []string                      `json:"strengths"`        // Strengths
	Weaknesses      []string                      `json:"weaknesses"`       // Weaknesses
	Suggestions     []string                      `json:"suggestions"`      // Suggestions for improvement
	EvaluatedAt     time.Time                     `json:"evaluated_at"`
	Duration        time.Duration                 `json:"duration"`
}

ResearchEvalResult 研究评估结果

type ResearchEvaluator ¶

type ResearchEvaluator struct {
	// contains filtered or unexported fields
}

ResearchEvaluator 研究评估器 - 编排多维度评估

func NewResearchEvaluator ¶

func NewResearchEvaluator(config ResearchEvalConfig, logger *zap.Logger) *ResearchEvaluator

NewResearchEvaluator 创建研究评估器

func (*ResearchEvaluator) BatchEvaluate ¶

func (e *ResearchEvaluator) BatchEvaluate(ctx context.Context, pairs []struct {
	Input  *EvalInput
	Output *EvalOutput
}) ([]*ResearchEvalResult, error)

BatchEvaluate 批量评估多个研究输出

func (*ResearchEvaluator) Evaluate ¶

func (e *ResearchEvaluator) Evaluate(ctx context.Context, input *EvalInput, output *EvalOutput) (*ResearchEvalResult, error)

Evaluate 执行完整的研究质量评估

func (*ResearchEvaluator) RegisterMetric ¶

func (e *ResearchEvaluator) RegisterMetric(dimension ResearchDimension, metric Metric)

RegisterMetric 注册评估维度指标

type RigorMetric ¶

type RigorMetric struct {
	// contains filtered or unexported fields
}

RigorMetric is the rigor metric; it evaluates how rigorous the research methodology is.

func NewRigorMetric ¶

func NewRigorMetric(logger *zap.Logger) *RigorMetric

NewRigorMetric 创建严谨性指标

func (*RigorMetric) Compute ¶

func (m *RigorMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

func (*RigorMetric) Name ¶

func (m *RigorMetric) Name() string

type Scorer ¶

// Scorer defines the interface for scoring evaluation results.
type Scorer interface {
	// Score receives a context, the evaluation task, and the output string
	// to be scored, and returns a float64, a map[string]float64, and an error.
	// NOTE(review): the float64 is presumably the overall score and the map
	// presumably holds named sub-scores — confirm against the implementations
	// (e.g. ContainsScorer, ExactMatchScorer).
	Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)
}

Scorer defines the interface for scoring evaluation results.

type StatisticalReport ¶

// StatisticalReport is a detailed statistical analysis report for one
// experiment. It identifies the experiment (ID, name, status), records the
// duration and total sample count, and carries a string-keyed map of
// VariantReport values plus a list of VariantComparison values.
// Winner and WinnerConfidence are tagged omitempty, so they are left out of
// the JSON encoding when they hold their zero values.
type StatisticalReport struct {
	ExperimentID     string                    `json:"experiment_id"`
	ExperimentName   string                    `json:"experiment_name"`
	Status           ExperimentStatus          `json:"status"`
	Duration         time.Duration             `json:"duration"`
	TotalSamples     int                       `json:"total_samples"`
	VariantReports   map[string]*VariantReport `json:"variant_reports"`
	Comparisons      []*VariantComparison      `json:"comparisons"`
	Winner           string                    `json:"winner,omitempty"`
	WinnerConfidence float64                   `json:"winner_confidence,omitempty"`
	Recommendation   string                    `json:"recommendation"`
	GeneratedAt      time.Time                 `json:"generated_at"`
}

StatisticalReport is a detailed statistical analysis report. Validates: Requirements 11.4

type TokenUsageMetric ¶

// TokenUsageMetric is a metric that reports token usage.
type TokenUsageMetric struct {
	// MaxTokens is the maximum token count, used for normalization.
	// If set, the returned value is max(0, 1 - tokens/maxTokens).
	// If unset (0), the raw token count is returned directly.
	MaxTokens int
}

TokenUsageMetric is the token usage metric; it returns the token usage. Validates: Requirements 9.3

func NewTokenUsageMetric ¶

func NewTokenUsageMetric() *TokenUsageMetric

NewTokenUsageMetric 创建 Token 使用量指标

func NewTokenUsageMetricWithMax ¶

func NewTokenUsageMetricWithMax(maxTokens int) *TokenUsageMetric

NewTokenUsageMetricWithMax 创建带最大值的 Token 使用量指标

func (*TokenUsageMetric) Compute ¶

func (m *TokenUsageMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)

Compute calculates the token usage. If a maximum is set, it returns a normalized score (0.0 - 1.0); otherwise it returns the raw token count.

func (*TokenUsageMetric) Name ¶

func (m *TokenUsageMetric) Name() string

Name 返回指标名称

type Variant ¶

// Variant is an experiment variant.
type Variant struct {
	ID        string         `json:"id"`
	Name      string         `json:"name"`
	Config    map[string]any `json:"config"`
	Weight    float64        `json:"weight"` // traffic weight
	IsControl bool           `json:"is_control"`
}

Variant is an experiment variant. Validates: Requirements 11.1, 11.5

type VariantComparison ¶

// VariantComparison contains the comparison results between two variants,
// identified by ControlID and TreatmentID. Each result is a string-keyed map.
// NOTE(review): the map keys are presumably metric names — confirm against
// the code that populates them.
type VariantComparison struct {
	ControlID      string             `json:"control_id"`
	TreatmentID    string             `json:"treatment_id"`
	MetricDeltas   map[string]float64 `json:"metric_deltas"`   // treatment - control
	RelativeChange map[string]float64 `json:"relative_change"` // percentage change
	PValues        map[string]float64 `json:"p_values"`
	Confidence     map[string]float64 `json:"confidence"`
	Significant    map[string]bool    `json:"significant"` // at 95% level
}

VariantComparison contains the comparison results between two variants.

type VariantReport ¶

// VariantReport contains detailed statistics for a single variant: its ID and
// name, whether it is the control, its sample count, and string-keyed maps of
// metric values, standard deviations, and two-element confidence intervals.
// NOTE(review): the [2]float64 interval is presumably {lower, upper} — confirm.
type VariantReport struct {
	VariantID    string                `json:"variant_id"`
	VariantName  string                `json:"variant_name"`
	IsControl    bool                  `json:"is_control"`
	SampleCount  int                   `json:"sample_count"`
	Metrics      map[string]float64    `json:"metrics"`
	StdDev       map[string]float64    `json:"std_dev"`
	ConfInterval map[string][2]float64 `json:"confidence_interval"` // 95% CI
}

VariantReport contains detailed statistics for a single variant.

type VariantResult ¶

// VariantResult holds the results for one variant: the variant's ID, its
// sample count, and string-keyed maps of metric values and standard
// deviations. The type also has unexported fields that are not shown here.
type VariantResult struct {
	VariantID   string             `json:"variant_id"`
	SampleCount int                `json:"sample_count"`
	Metrics     map[string]float64 `json:"metrics"`
	StdDev      map[string]float64 `json:"std_dev"`
	// contains filtered or unexported fields
}

VariantResult holds the results for a variant. Validates: Requirements 11.3

Source Files ¶

View all Source files

?	: This menu
/	: Search site
f or F	: Jump to
y or Y	: Canonical URL

Documentation ¶

Index ¶

Constants ¶

Variables ¶

Functions ¶

func RegisterBuiltinMetrics ¶

func RegisterResearchMetrics ¶

Types ¶

type ABTester ¶

func NewABTester ¶

func (*ABTester) Analyze ¶

func (*ABTester) Assign ¶

func (*ABTester) AutoSelectWinner ¶

func (*ABTester) CompleteExperiment ¶

func (*ABTester) CreateExperiment ¶

func (*ABTester) DeleteExperiment ¶

func (*ABTester) GenerateReport ¶

func (*ABTester) GetExperiment ¶

func (*ABTester) ListExperiments ¶

func (*ABTester) PauseExperiment ¶

func (*ABTester) RecordResult ¶

func (*ABTester) StartExperiment ¶

type AccuracyMetric ¶

func NewAccuracyMetric ¶

func (*AccuracyMetric) Compute ¶

func (*AccuracyMetric) Name ¶

type AggregatedJudgeResult ¶

type Alert ¶

type AlertHandler ¶

type AlertLevel ¶

type AlertThreshold ¶

type BatchEvalReport ¶

type ClarityMetric ¶

func NewClarityMetric ¶

func (*ClarityMetric) Compute ¶

func (*ClarityMetric) Name ¶

type CompletenessMetric ¶

func NewCompletenessMetric ¶

func (*CompletenessMetric) Compute ¶

func (*CompletenessMetric) Name ¶

type ContainsScorer ¶

func (*ContainsScorer) Score ¶

type CostMetric ¶

func NewCostMetric ¶

func NewCostMetricWithMax ¶

func (*CostMetric) Compute ¶

func (*CostMetric) Name ¶

type DimensionScore ¶

type EvalExecutor ¶ added in v1.0.0

type EvalInput ¶

func NewEvalInput ¶

func (*EvalInput) WithContext ¶

func (*EvalInput) WithExpected ¶

func (*EvalInput) WithReference ¶

type EvalOutput ¶

func NewEvalOutput ¶

func (*EvalOutput) WithCost ¶

func (*EvalOutput) WithLatency ¶

func (*EvalOutput) WithMetadata ¶

func (*EvalOutput) WithTokensUsed ¶

type EvalReport ¶

type EvalResult ¶

type EvalSuite ¶

type EvalSummary ¶

type EvalTask ¶

type Evaluator ¶

func NewEvaluator ¶

func (*Evaluator) AddAlertHandler ¶

func (*Evaluator) ClearAlerts ¶

func (*Evaluator) Evaluate ¶

func (*Evaluator) EvaluateBatch ¶

func (*Evaluator) GenerateReport ¶

func (*Evaluator) GetAlerts ¶

func (*Evaluator) RegisterScorer ¶

func (*Evaluator) SetMetricRegistry ¶

type EvaluatorConfig ¶

func DefaultEvaluatorConfig ¶

type ExactMatchScorer ¶

func (*ExactMatchScorer) Score ¶

type Experiment ¶