Documentation
¶
Index ¶
- Constants
- Variables
- func RegisterBuiltinMetrics(registry *MetricRegistry)
- func RegisterResearchMetrics(evaluator *ResearchEvaluator, logger *zap.Logger)
- type ABTester
- func (t *ABTester) Analyze(ctx context.Context, experimentID string) (*ExperimentResult, error)
- func (t *ABTester) Assign(experimentID, userID string) (*Variant, error)
- func (t *ABTester) AutoSelectWinner(ctx context.Context, experimentID string, minConfidence float64) (*Variant, error)
- func (t *ABTester) CompleteExperiment(experimentID string) error
- func (t *ABTester) CreateExperiment(exp *Experiment) error
- func (t *ABTester) DeleteExperiment(experimentID string) error
- func (t *ABTester) GenerateReport(ctx context.Context, experimentID string) (*StatisticalReport, error)
- func (t *ABTester) GetExperiment(experimentID string) (*Experiment, error)
- func (t *ABTester) ListExperiments() []*Experiment
- func (t *ABTester) PauseExperiment(experimentID string) error
- func (t *ABTester) RecordResult(experimentID, variantID string, result *EvalResult) error
- func (t *ABTester) StartExperiment(experimentID string) error
- type AccuracyMetric
- type AggregatedJudgeResult
- type Alert
- type AlertHandler
- type AlertLevel
- type AlertThreshold
- type BatchEvalReport
- type ClarityMetric
- type CompletenessMetric
- type ContainsScorer
- type CostMetric
- type DimensionScore
- type EvalExecutor
- type EvalInput
- type EvalOutput
- type EvalReport
- type EvalResult
- type EvalSuite
- type EvalSummary
- type EvalTask
- type Evaluator
- func (e *Evaluator) AddAlertHandler(handler AlertHandler)
- func (e *Evaluator) ClearAlerts()
- func (e *Evaluator) Evaluate(ctx context.Context, suite *EvalSuite, agent EvalExecutor) (*EvalReport, error)
- func (e *Evaluator) EvaluateBatch(ctx context.Context, suites []*EvalSuite, agent EvalExecutor) ([]*EvalReport, error)
- func (e *Evaluator) GenerateReport(reports []*EvalReport) *BatchEvalReport
- func (e *Evaluator) GetAlerts() []Alert
- func (e *Evaluator) RegisterScorer(taskType string, scorer Scorer)
- func (e *Evaluator) SetMetricRegistry(registry *MetricRegistry)
- type EvaluatorConfig
- type ExactMatchScorer
- type Experiment
- type ExperimentResult
- type ExperimentStatus
- type ExperimentStore
- type InputOutputPair
- type JSONScorer
- type JudgeDimension
- type JudgeResult
- type LLMJudge
- func (j *LLMJudge) AggregateResults(results []*JudgeResult) *AggregatedJudgeResult
- func (j *LLMJudge) GetConfig() LLMJudgeConfig
- func (j *LLMJudge) Judge(ctx context.Context, input *EvalInput, output *EvalOutput) (*JudgeResult, error)
- func (j *LLMJudge) JudgeBatch(ctx context.Context, pairs []InputOutputPair) ([]*JudgeResult, error)
- type LLMJudgeConfig
- type LatencyMetric
- type MemoryExperimentStore
- func (s *MemoryExperimentStore) DeleteExperiment(ctx context.Context, id string) error
- func (s *MemoryExperimentStore) GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
- func (s *MemoryExperimentStore) GetAssignmentCount(experimentID string) map[string]int
- func (s *MemoryExperimentStore) GetResultCount(experimentID string) map[string]int
- func (s *MemoryExperimentStore) GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
- func (s *MemoryExperimentStore) ListExperiments(ctx context.Context) ([]*Experiment, error)
- func (s *MemoryExperimentStore) LoadExperiment(ctx context.Context, id string) (*Experiment, error)
- func (s *MemoryExperimentStore) RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
- func (s *MemoryExperimentStore) RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
- func (s *MemoryExperimentStore) SaveExperiment(ctx context.Context, exp *Experiment) error
- type Metric
- type MetricEvalResult
- type MetricRegistry
- type NoveltyMetric
- type ResearchDimension
- type ResearchEvalConfig
- type ResearchEvalResult
- type ResearchEvaluator
- func (e *ResearchEvaluator) BatchEvaluate(ctx context.Context, pairs []struct{ ... }) ([]*ResearchEvalResult, error)
- func (e *ResearchEvaluator) Evaluate(ctx context.Context, input *EvalInput, output *EvalOutput) (*ResearchEvalResult, error)
- func (e *ResearchEvaluator) RegisterMetric(dimension ResearchDimension, metric Metric)
- type RigorMetric
- type Scorer
- type StatisticalReport
- type TokenUsageMetric
- type Variant
- type VariantComparison
- type VariantReport
- type VariantResult
Constants ¶
const DefaultPromptTemplate = `` /* 1118-byte string literal not displayed */
DefaultPromptTemplate 默认评估提示模板 验证:要求 10.2
Variables ¶
var ( ErrExperimentNotFound = errors.New("experiment not found") ErrExperimentNotActive = errors.New("experiment not active") ErrNoVariants = errors.New("no variants defined") ErrInvalidWeights = errors.New("invalid variant weights") ErrVariantNotFound = errors.New("variant not found") )
A/B 测试相关错误
Functions ¶
func RegisterBuiltinMetrics ¶
func RegisterBuiltinMetrics(registry *MetricRegistry)
RegisterBuiltinMetrics 注册所有内置指标到注册表
func RegisterResearchMetrics ¶
func RegisterResearchMetrics(evaluator *ResearchEvaluator, logger *zap.Logger)
RegisterResearchMetrics 注册所有研究评估指标到评估器
Types ¶
type ABTester ¶
type ABTester struct {
// contains filtered or unexported fields
}
ABTester A/B 测试器 验证:要求 11.1、11.2、11.3、11.5
func NewABTester ¶
func NewABTester(store ExperimentStore, logger *zap.Logger) *ABTester
NewABTester 创建 A/B 测试器
func (*ABTester) AutoSelectWinner ¶
func (t *ABTester) AutoSelectWinner(ctx context.Context, experimentID string, minConfidence float64) (*Variant, error)
AutoSelectWinner 在检测到统计显著性时自动选择获胜的变体。验证:要求 11.6
func (*ABTester) CompleteExperiment ¶
CompleteExperiment 完成实验
func (*ABTester) CreateExperiment ¶
func (t *ABTester) CreateExperiment(exp *Experiment) error
CreateExperiment 创建实验 验证:要求 11.1
func (*ABTester) DeleteExperiment ¶
DeleteExperiment 删除实验
func (*ABTester) GenerateReport ¶
func (t *ABTester) GenerateReport(ctx context.Context, experimentID string) (*StatisticalReport, error)
GenerateReport 生成一份全面的统计显著性分析报告。验证:要求 11.4
func (*ABTester) GetExperiment ¶
func (t *ABTester) GetExperiment(experimentID string) (*Experiment, error)
GetExperiment 获取实验
func (*ABTester) ListExperiments ¶
func (t *ABTester) ListExperiments() []*Experiment
ListExperiments 列出所有实验
func (*ABTester) PauseExperiment ¶
PauseExperiment 暂停实验
func (*ABTester) RecordResult ¶
func (t *ABTester) RecordResult(experimentID, variantID string, result *EvalResult) error
RecordResult 记录结果 验证:要求 11.3
func (*ABTester) StartExperiment ¶
StartExperiment 启动实验
type AccuracyMetric ¶
type AccuracyMetric struct {
// CaseSensitive 是否区分大小写
CaseSensitive bool
// TrimWhitespace 是否去除首尾空白
TrimWhitespace bool
// UseContains 是否使用包含匹配(而非精确匹配)
UseContains bool
}
AccuracyMetric 准确率指标 通过比较实际输出与期望输出计算准确率 核证:要求9.3
func (*AccuracyMetric) Compute ¶
func (m *AccuracyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute 计算准确率。返回值范围:0.0 - 1.0。1.0 表示完全匹配;0.0 到 1.0 之间表示部分匹配(基于字符相似度)。
type AggregatedJudgeResult ¶
type AggregatedJudgeResult struct {
Results []*JudgeResult `json:"results"`
AverageScore float64 `json:"average_score"`
ScoreStdDev float64 `json:"score_std_dev"`
NeedsReview bool `json:"needs_review"`
ReviewReason string `json:"review_reason,omitempty"`
DimensionAvgs map[string]float64 `json:"dimension_averages"`
}
AggregatedJudgeResult 聚合的评判结果 验证:要求 10.5
type Alert ¶
type Alert struct {
Level AlertLevel `json:"level"`
MetricName string `json:"metric_name"`
Threshold float64 `json:"threshold"`
Actual float64 `json:"actual"`
Message string `json:"message"`
TaskID string `json:"task_id,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
Alert 表示当指标超过阈值时触发的评估警报。验证:要求 9.6
type AlertLevel ¶
type AlertLevel string
警报等级定义了警报的严重程度.
const ( AlertLevelInfo AlertLevel = "info" AlertLevelWarning AlertLevel = "warning" AlertLevelCritical AlertLevel = "critical" )
type AlertThreshold ¶
type AlertThreshold struct {
MetricName string `json:"metric_name"`
Operator string `json:"operator"` // "gt", "lt", "gte", "lte", "eq"
Value float64 `json:"value"`
Level AlertLevel `json:"level"`
Message string `json:"message,omitempty"`
}
AlertThreshold 定义了触发警报的阈值。
type BatchEvalReport ¶
type BatchEvalReport struct {
Reports []*EvalReport `json:"reports"`
AggregatedSummary EvalSummary `json:"aggregated_summary"`
Alerts []Alert `json:"alerts,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
BatchEvalReport 表示批量评估报告。验证:要求 9.5
type ClarityMetric ¶
type ClarityMetric struct {
// contains filtered or unexported fields
}
ClarityMetric 清晰度指标 评估研究输出的可读性和表达清晰度
func NewClarityMetric ¶
func NewClarityMetric(logger *zap.Logger) *ClarityMetric
NewClarityMetric 创建清晰度指标
func (*ClarityMetric) Compute ¶
func (m *ClarityMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
func (*ClarityMetric) Name ¶
func (m *ClarityMetric) Name() string
type CompletenessMetric ¶
type CompletenessMetric struct {
// contains filtered or unexported fields
}
CompletenessMetric 完整性指标 评估研究输出对主题的覆盖程度
func NewCompletenessMetric ¶
func NewCompletenessMetric(logger *zap.Logger) *CompletenessMetric
NewCompletenessMetric 创建完整性指标
func (*CompletenessMetric) Compute ¶
func (m *CompletenessMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
func (*CompletenessMetric) Name ¶
func (m *CompletenessMetric) Name() string
type CostMetric ¶
type CostMetric struct {
// MaxCost 最大成本,用于归一化
// 如果设置,返回值为 max(0, 1 - cost/maxCost)
// 如果不设置(0),直接返回成本值
MaxCost float64
}
CostMetric 成本指标 返回 API 调用成本 核证:要求9.3
func NewCostMetricWithMax ¶
func NewCostMetricWithMax(maxCost float64) *CostMetric
NewCostMetricWithMax 创建带最大值的成本指标
func (*CostMetric) Compute ¶
func (m *CostMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute 计算成本 如果设置了最大值,返回归一化分数 (0.0 - 1.0) 否则返回原始成本值
type DimensionScore ¶
DimensionScore 维度评分
type EvalExecutor ¶ added in v1.0.0
type EvalExecutor interface {
Execute(ctx context.Context, input string) (output string, tokens int, err error)
}
EvalExecutor defines the interface for executing an agent during evaluation. Renamed from AgentExecutor to avoid naming conflict with workflow.AgentExecutor. Unlike types.Executor (which uses any -> any), this interface uses string I/O and returns token count, which is specific to evaluation scoring needs.
type EvalInput ¶
type EvalInput struct {
Prompt string `json:"prompt"`
Context map[string]any `json:"context,omitempty"`
Expected string `json:"expected,omitempty"`
Reference string `json:"reference,omitempty"`
}
EvalInput 评估输入
func (*EvalInput) WithContext ¶
WithContext 设置上下文
func (*EvalInput) WithExpected ¶
WithExpected 设置期望输出
func (*EvalInput) WithReference ¶
WithReference 设置参考内容
type EvalOutput ¶
type EvalOutput struct {
Response string `json:"response"`
TokensUsed int `json:"tokens_used"`
Latency time.Duration `json:"latency"`
Cost float64 `json:"cost"`
Metadata map[string]any `json:"metadata,omitempty"`
}
EvalOutput 评估输出
func (*EvalOutput) WithLatency ¶
func (e *EvalOutput) WithLatency(latency time.Duration) *EvalOutput
WithLatency 设置延迟
func (*EvalOutput) WithMetadata ¶
func (e *EvalOutput) WithMetadata(metadata map[string]any) *EvalOutput
WithMetadata 设置元数据
func (*EvalOutput) WithTokensUsed ¶
func (e *EvalOutput) WithTokensUsed(tokens int) *EvalOutput
WithTokensUsed 设置 Token 使用量
type EvalReport ¶
type EvalReport struct {
SuiteID string `json:"suite_id"`
SuiteName string `json:"suite_name"`
AgentID string `json:"agent_id"`
Results []EvalResult `json:"results"`
Summary EvalSummary `json:"summary"`
StartTime time.Time `json:"start_time"`
EndTime time.Time `json:"end_time"`
Duration time.Duration `json:"duration"`
Metadata map[string]string `json:"metadata,omitempty"`
}
EvalReport 是完整的评估报告。
type EvalResult ¶
type EvalResult struct {
TaskID string `json:"task_id"`
Success bool `json:"success"`
Output string `json:"output"`
Expected string `json:"expected,omitempty"`
Score float64 `json:"score"` // 0.0 - 1.0
Metrics map[string]float64 `json:"metrics,omitempty"`
Error string `json:"error,omitempty"`
Duration time.Duration `json:"duration"`
TokensUsed int `json:"tokens_used,omitempty"`
Cost float64 `json:"cost,omitempty"`
}
EvalResult 是单个任务的评估结果。
type EvalSuite ¶
type EvalSuite struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Tasks []EvalTask `json:"tasks"`
Version string `json:"version"`
}
EvalSuite代表了一系列评价任务。
type EvalSummary ¶
type EvalSummary struct {
TotalTasks int `json:"total_tasks"`
PassedTasks int `json:"passed_tasks"`
FailedTasks int `json:"failed_tasks"`
PassRate float64 `json:"pass_rate"`
AverageScore float64 `json:"average_score"`
TotalTokens int `json:"total_tokens"`
TotalCost float64 `json:"total_cost"`
TotalDuration time.Duration `json:"total_duration"`
MetricAverages map[string]float64 `json:"metric_averages,omitempty"`
// 统计衡量标准
ScoreStdDev float64 `json:"score_std_dev"`
ScoreMin float64 `json:"score_min"`
ScoreMax float64 `json:"score_max"`
ScoreMedian float64 `json:"score_median"`
Percentiles map[string]float64 `json:"percentiles,omitempty"` // p50, p90, p95, p99
}
评估摘要载有综合评价指标。 审定:要求9.5
type EvalTask ¶
type EvalTask struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Input string `json:"input"`
Expected string `json:"expected,omitempty"`
Metadata map[string]string `json:"metadata,omitempty"`
Tags []string `json:"tags,omitempty"`
Timeout time.Duration `json:"timeout,omitempty"`
}
EvalTask代表着一项评价任务.
type Evaluator ¶
type Evaluator struct {
// contains filtered or unexported fields
}
Evaluator 对代理运行评估套件。验证:要求 9.2、9.4、9.5、9.6
func NewEvaluator ¶
func NewEvaluator(config EvaluatorConfig, logger *zap.Logger) *Evaluator
NewEvaluator 创建新的评估器。
func (*Evaluator) AddAlertHandler ¶
func (e *Evaluator) AddAlertHandler(handler AlertHandler)
AddAlertHandler 添加警报处理器。验证:要求 9.6
func (*Evaluator) Evaluate ¶
func (e *Evaluator) Evaluate(ctx context.Context, suite *EvalSuite, agent EvalExecutor) (*EvalReport, error)
Evaluate 对代理运行一个评估套件。验证:要求 9.2、9.5
func (*Evaluator) EvaluateBatch ¶
func (e *Evaluator) EvaluateBatch(ctx context.Context, suites []*EvalSuite, agent EvalExecutor) ([]*EvalReport, error)
EvaluateBatch 对多个套件进行批量评估。验证:要求 9.4
func (*Evaluator) GenerateReport ¶
func (e *Evaluator) GenerateReport(reports []*EvalReport) *BatchEvalReport
GenerateReport 生成一份全面的评估报告。验证:要求 9.5
func (*Evaluator) RegisterScorer ¶
RegisterScorer 为特定任务类型注册计分器。
func (*Evaluator) SetMetricRegistry ¶
func (e *Evaluator) SetMetricRegistry(registry *MetricRegistry)
SetMetricRegistry 设置自定义的指标注册表。
type EvaluatorConfig ¶
type EvaluatorConfig struct {
Concurrency int `json:"concurrency"`
DefaultTimeout time.Duration `json:"default_timeout"`
StopOnFailure bool `json:"stop_on_failure"`
RetryOnError bool `json:"retry_on_error"`
MaxRetries int `json:"max_retries"`
PassThreshold float64 `json:"pass_threshold"` // Score threshold to pass
AlertThresholds []AlertThreshold `json:"alert_thresholds,omitempty"`
// 批量评价设置
BatchSize int `json:"batch_size"` // Number of tasks per batch
CollectMetrics bool `json:"collect_metrics"` // Auto-collect metrics after execution
EnableAlerts bool `json:"enable_alerts"` // Enable alert triggering
}
EvaluatorConfig 配置评估器。验证:要求 9.4、9.6
type Experiment ¶
type Experiment struct {
ID string `json:"id"`
Name string `json:"name"`
Description string `json:"description"`
Variants []Variant `json:"variants"`
Metrics []string `json:"metrics"`
StartTime time.Time `json:"start_time"`
EndTime *time.Time `json:"end_time,omitempty"`
Status ExperimentStatus `json:"status"`
}
Experiment 实验定义 审定:所需经费11.1
type ExperimentResult ¶
type ExperimentResult struct {
ExperimentID string `json:"experiment_id"`
VariantResults map[string]*VariantResult `json:"variant_results"`
Winner string `json:"winner,omitempty"`
Confidence float64 `json:"confidence"`
SampleSize int `json:"sample_size"`
Duration time.Duration `json:"duration"`
}
ExperimentResult 实验结果 验证:要求 11.3、11.4
type ExperimentStatus ¶
type ExperimentStatus string
ExperimentStatus 实验状态
const ( ExperimentStatusDraft ExperimentStatus = "draft" ExperimentStatusRunning ExperimentStatus = "running" ExperimentStatusPaused ExperimentStatus = "paused" ExperimentStatusComplete ExperimentStatus = "completed" )
type ExperimentStore ¶
type ExperimentStore interface {
// SaveExperiment 保存实验
SaveExperiment(ctx context.Context, exp *Experiment) error
// LoadExperiment 加载实验
LoadExperiment(ctx context.Context, id string) (*Experiment, error)
// ListExperiments 列出所有实验
ListExperiments(ctx context.Context) ([]*Experiment, error)
// DeleteExperiment 删除实验
DeleteExperiment(ctx context.Context, id string) error
// RecordAssignment 记录分配
RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
// GetAssignment 获取用户分配
GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
// RecordResult 记录结果
RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
// GetResults 获取实验结果
GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
}
ExperimentStore 实验存储接口
type InputOutputPair ¶
type InputOutputPair struct {
Input *EvalInput
Output *EvalOutput
}
InputOutputPair 输入输出对,用于批量评判
type JudgeDimension ¶
type JudgeDimension struct {
Name string `json:"name"`
Description string `json:"description"`
Weight float64 `json:"weight"`
}
JudgeDimension 评判维度
type JudgeResult ¶
type JudgeResult struct {
OverallScore float64 `json:"overall_score"`
Dimensions map[string]DimensionScore `json:"dimensions"`
Reasoning string `json:"reasoning"`
Confidence float64 `json:"confidence"`
// 其他元数据
Model string `json:"model,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
JudgeResult 评判结果
type LLMJudge ¶
type LLMJudge struct {
// contains filtered or unexported fields
}
LLMJudge LLM 评判器 使用 LLM 作为评估者来评估 Agent 输出质量 核证:要求10.1、10.2、10.3、10.4、10.5
func NewLLMJudge ¶
NewLLMJudge 创建 LLM 评判器 验证:要求 10.1
func (*LLMJudge) AggregateResults ¶
func (j *LLMJudge) AggregateResults(results []*JudgeResult) *AggregatedJudgeResult
AggregateResults 聚合多个评判结果 验证:要求 10.5
func (*LLMJudge) Judge ¶
func (j *LLMJudge) Judge(ctx context.Context, input *EvalInput, output *EvalOutput) (*JudgeResult, error)
Judge 执行评判 审定:要求10.2、10.4
func (*LLMJudge) JudgeBatch ¶
func (j *LLMJudge) JudgeBatch(ctx context.Context, pairs []InputOutputPair) ([]*JudgeResult, error)
JudgeBatch 批量评判 审定:要求10.4、10.5
type LLMJudgeConfig ¶
type LLMJudgeConfig struct {
Model string `json:"model"`
Dimensions []JudgeDimension `json:"dimensions"`
PromptTemplate string `json:"prompt_template"`
ScoreRange [2]float64 `json:"score_range"` // [min, max]
RequireReasoning bool `json:"require_reasoning"`
// Timeout 每次评判调用的超时时间
Timeout time.Duration `json:"timeout,omitempty"`
// MaxConcurrency 批量评判的最大并发数
MaxConcurrency int `json:"max_concurrency,omitempty"`
}
LLMJudgeConfig LLM 评判配置 审定:要求10.1、10.3
func DefaultLLMJudgeConfig ¶
func DefaultLLMJudgeConfig() LLMJudgeConfig
DefaultLLMJudgeConfig 返回默认配置
type LatencyMetric ¶
type LatencyMetric struct {
// ThresholdMs 延迟阈值(毫秒),用于归一化
// 如果设置,返回值为 max(0, 1 - latency/threshold)
// 如果不设置(0),直接返回毫秒数
ThresholdMs float64
}
LatencyMetric 延迟指标 返回响应延迟(毫秒) 核证:要求9.3
func NewLatencyMetricWithThreshold ¶
func NewLatencyMetricWithThreshold(thresholdMs float64) *LatencyMetric
NewLatencyMetricWithThreshold 创建带阈值的延迟指标
func (*LatencyMetric) Compute ¶
func (m *LatencyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute 计算延迟 如果设置了阈值,返回归一化分数 (0.0 - 1.0) 否则返回原始延迟(毫秒)
type MemoryExperimentStore ¶
type MemoryExperimentStore struct {
// contains filtered or unexported fields
}
MemoryExperimentStore 内存实验存储(用于测试和简单场景)
func NewMemoryExperimentStore ¶
func NewMemoryExperimentStore() *MemoryExperimentStore
NewMemoryExperimentStore 创建内存实验存储
func (*MemoryExperimentStore) DeleteExperiment ¶
func (s *MemoryExperimentStore) DeleteExperiment(ctx context.Context, id string) error
DeleteExperiment 删除实验
func (*MemoryExperimentStore) GetAssignment ¶
func (s *MemoryExperimentStore) GetAssignment(ctx context.Context, experimentID, userID string) (string, error)
GetAssignment 获取用户分配
func (*MemoryExperimentStore) GetAssignmentCount ¶
func (s *MemoryExperimentStore) GetAssignmentCount(experimentID string) map[string]int
GetAssignmentCount 获取分配计数(用于测试)
func (*MemoryExperimentStore) GetResultCount ¶
func (s *MemoryExperimentStore) GetResultCount(experimentID string) map[string]int
GetResultCount 获取结果计数(用于测试)
func (*MemoryExperimentStore) GetResults ¶
func (s *MemoryExperimentStore) GetResults(ctx context.Context, experimentID string) (map[string][]*EvalResult, error)
GetResults 获取实验结果
func (*MemoryExperimentStore) ListExperiments ¶
func (s *MemoryExperimentStore) ListExperiments(ctx context.Context) ([]*Experiment, error)
ListExperiments 列出所有实验
func (*MemoryExperimentStore) LoadExperiment ¶
func (s *MemoryExperimentStore) LoadExperiment(ctx context.Context, id string) (*Experiment, error)
LoadExperiment 加载实验
func (*MemoryExperimentStore) RecordAssignment ¶
func (s *MemoryExperimentStore) RecordAssignment(ctx context.Context, experimentID, userID, variantID string) error
RecordAssignment 记录分配
func (*MemoryExperimentStore) RecordResult ¶
func (s *MemoryExperimentStore) RecordResult(ctx context.Context, experimentID, variantID string, result *EvalResult) error
RecordResult 记录结果
func (*MemoryExperimentStore) SaveExperiment ¶
func (s *MemoryExperimentStore) SaveExperiment(ctx context.Context, exp *Experiment) error
SaveExperiment 保存实验
type Metric ¶
type Metric interface {
// Name 指标名称
Name() string
// Compute 计算指标值
Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
}
Metric 评估指标接口 审定:要求9.1
type MetricEvalResult ¶
type MetricEvalResult struct {
InputID string `json:"input_id"`
Metrics map[string]float64 `json:"metrics"`
Passed bool `json:"passed"`
Errors []string `json:"errors,omitempty"`
Timestamp time.Time `json:"timestamp"`
}
MetricEvalResult 评估结果(符合设计文档规范) 注意:与现有 EvalResult 区分,此类型专用于 Metric 接口
func NewMetricEvalResult ¶
func NewMetricEvalResult(inputID string) *MetricEvalResult
NewMetricEvalResult 创建评估结果
func (*MetricEvalResult) AddError ¶
func (r *MetricEvalResult) AddError(err string) *MetricEvalResult
AddError 添加错误
func (*MetricEvalResult) AddMetric ¶
func (r *MetricEvalResult) AddMetric(name string, value float64) *MetricEvalResult
AddMetric 添加指标值
func (*MetricEvalResult) SetPassed ¶
func (r *MetricEvalResult) SetPassed(passed bool) *MetricEvalResult
SetPassed 设置是否通过
type MetricRegistry ¶
type MetricRegistry struct {
// contains filtered or unexported fields
}
MetricRegistry 指标注册表
func NewRegistryWithBuiltinMetrics ¶
func NewRegistryWithBuiltinMetrics() *MetricRegistry
NewRegistryWithBuiltinMetrics 创建包含所有内置指标的注册表
func (*MetricRegistry) ComputeAll ¶
func (r *MetricRegistry) ComputeAll(ctx context.Context, input *EvalInput, output *EvalOutput) (*MetricEvalResult, error)
ComputeAll 计算所有注册的指标
type NoveltyMetric ¶
type NoveltyMetric struct {
// contains filtered or unexported fields
}
NoveltyMetric 新颖性指标 评估研究输出的原创性和创新程度
func NewNoveltyMetric ¶
func NewNoveltyMetric(logger *zap.Logger) *NoveltyMetric
NewNoveltyMetric 创建新颖性指标
func (*NoveltyMetric) Compute ¶
func (m *NoveltyMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
func (*NoveltyMetric) Name ¶
func (m *NoveltyMetric) Name() string
type ResearchDimension ¶
type ResearchDimension string
ResearchDimension 研究评估维度
const ( DimensionNovelty ResearchDimension = "novelty" DimensionRigor ResearchDimension = "rigor" DimensionClarity ResearchDimension = "clarity" DimensionRelevance ResearchDimension = "relevance" DimensionCompleteness ResearchDimension = "completeness" DimensionReproducibility ResearchDimension = "reproducibility" )
type ResearchEvalConfig ¶
type ResearchEvalConfig struct {
// Weights 各维度权重(总和必须为 1.0)
Weights map[ResearchDimension]float64 `json:"weights"`
// 评价设置
UseLLMJudge bool `json:"use_llm_judge"` // Use LLM as judge
NumJudges int `json:"num_judges"` // Number of LLM judges (for voting)
JudgeModel string `json:"judge_model"` // Model for LLM judge
Timeout time.Duration `json:"timeout"` // Per-evaluation timeout
PassThreshold float64 `json:"pass_threshold"` // Minimum score to pass (0-1)
// 基于参考的评价
UseReferences bool `json:"use_references"` // Compare against reference papers
MaxReferences int `json:"max_references"` // Maximum reference papers to compare
}
ResearchEvalConfig 研究评估配置
func DefaultResearchEvalConfig ¶
func DefaultResearchEvalConfig() ResearchEvalConfig
DefaultResearchEvalConfig 返回默认研究评估配置
type ResearchEvalResult ¶
type ResearchEvalResult struct {
OverallScore float64 `json:"overall_score"` // 综合得分 (0-1)
DimensionScores map[ResearchDimension]float64 `json:"dimension_scores"` // 各维度得分
Passed bool `json:"passed"` // 是否通过
Feedback map[ResearchDimension]string `json:"feedback"` // 各维度反馈
Strengths []string `json:"strengths"` // 优势
Weaknesses []string `json:"weaknesses"` // 不足
Suggestions []string `json:"suggestions"` // 改进建议
EvaluatedAt time.Time `json:"evaluated_at"`
Duration time.Duration `json:"duration"`
}
ResearchEvalResult 研究评估结果
type ResearchEvaluator ¶
type ResearchEvaluator struct {
// contains filtered or unexported fields
}
ResearchEvaluator 研究评估器 - 编排多维度评估
func NewResearchEvaluator ¶
func NewResearchEvaluator(config ResearchEvalConfig, logger *zap.Logger) *ResearchEvaluator
NewResearchEvaluator 创建研究评估器
func (*ResearchEvaluator) BatchEvaluate ¶
func (e *ResearchEvaluator) BatchEvaluate(ctx context.Context, pairs []struct { Input *EvalInput Output *EvalOutput }) ([]*ResearchEvalResult, error)
BatchEvaluate 批量评估多个研究输出
func (*ResearchEvaluator) Evaluate ¶
func (e *ResearchEvaluator) Evaluate(ctx context.Context, input *EvalInput, output *EvalOutput) (*ResearchEvalResult, error)
Evaluate 执行完整的研究质量评估
func (*ResearchEvaluator) RegisterMetric ¶
func (e *ResearchEvaluator) RegisterMetric(dimension ResearchDimension, metric Metric)
RegisterMetric 注册评估维度指标
type RigorMetric ¶
type RigorMetric struct {
// contains filtered or unexported fields
}
RigorMetric 严谨性指标 评估研究方法论的严谨程度
func (*RigorMetric) Compute ¶
func (m *RigorMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
func (*RigorMetric) Name ¶
func (m *RigorMetric) Name() string
type Scorer ¶
type Scorer interface {
Score(ctx context.Context, task *EvalTask, output string) (float64, map[string]float64, error)
}
Scorer 定义了对评估结果进行评分的接口。
type StatisticalReport ¶
type StatisticalReport struct {
ExperimentID string `json:"experiment_id"`
ExperimentName string `json:"experiment_name"`
Status ExperimentStatus `json:"status"`
Duration time.Duration `json:"duration"`
TotalSamples int `json:"total_samples"`
VariantReports map[string]*VariantReport `json:"variant_reports"`
Comparisons []*VariantComparison `json:"comparisons"`
Winner string `json:"winner,omitempty"`
WinnerConfidence float64 `json:"winner_confidence,omitempty"`
Recommendation string `json:"recommendation"`
GeneratedAt time.Time `json:"generated_at"`
}
StatisticalReport 是详细的统计分析报告。验证:要求 11.4
type TokenUsageMetric ¶
type TokenUsageMetric struct {
// MaxTokens 最大 Token 数,用于归一化
// 如果设置,返回值为 max(0, 1 - tokens/maxTokens)
// 如果不设置(0),直接返回 Token 数
MaxTokens int
}
TokenUsageMetric Token 使用量指标 返回 Token 使用量 核证:要求9.3
func NewTokenUsageMetric ¶
func NewTokenUsageMetric() *TokenUsageMetric
NewTokenUsageMetric 创建 Token 使用量指标
func NewTokenUsageMetricWithMax ¶
func NewTokenUsageMetricWithMax(maxTokens int) *TokenUsageMetric
NewTokenUsageMetricWithMax 创建带最大值的 Token 使用量指标
func (*TokenUsageMetric) Compute ¶
func (m *TokenUsageMetric) Compute(ctx context.Context, input *EvalInput, output *EvalOutput) (float64, error)
Compute 计算 Token 使用量 如果设置了最大值,返回归一化分数 (0.0 - 1.0) 否则返回原始 Token 数
type Variant ¶
type Variant struct {
ID string `json:"id"`
Name string `json:"name"`
Config map[string]any `json:"config"`
Weight float64 `json:"weight"` // 流量权重
IsControl bool `json:"is_control"`
}
Variant 实验变体 验证:要求 11.1、11.5
type VariantComparison ¶
type VariantComparison struct {
ControlID string `json:"control_id"`
TreatmentID string `json:"treatment_id"`
MetricDeltas map[string]float64 `json:"metric_deltas"` // treatment - control
RelativeChange map[string]float64 `json:"relative_change"` // percentage change
PValues map[string]float64 `json:"p_values"`
Confidence map[string]float64 `json:"confidence"`
Significant map[string]bool `json:"significant"` // at 95% level
}
VariantComparison 包含两个变体之间的比较结果
type VariantReport ¶
type VariantReport struct {
VariantID string `json:"variant_id"`
VariantName string `json:"variant_name"`
IsControl bool `json:"is_control"`
SampleCount int `json:"sample_count"`
Metrics map[string]float64 `json:"metrics"`
StdDev map[string]float64 `json:"std_dev"`
ConfInterval map[string][2]float64 `json:"confidence_interval"` // 95% CI
}
VariantReport 载有单个变体的详细统计数据