Documentation
¶
Overview ¶
Package rubric provides types for rubric-based evaluation reports with categorical scoring and severity-based findings. This is suited for LLM-as-Judge style reviews like PRD and ARB evaluations.
Index ¶
- func AllRequiredPassing(results []CategoryResult, rubric *RubricSet) bool
- type ActionItem
- type AggregationMethod
- type Annotation
- type CategoricalAgreement
- type Category
- func (c *Category) AddOption(value, label string, criteria ...string) *Category
- func (c *Category) GetOptionForValue(value string) *ScaleOption
- func (c *Category) SetEvaluationPrompt(prompt string) *Category
- func (c *Category) SetExamples(examples *CategoryExamples) *Category
- func (c *Category) SetRequired(required bool) *Category
- func (c *Category) SetWeight(weight float64) *Category
- func (c *Category) WithBinary(passCriteria, failCriteria []string) *Category
- func (c *Category) WithChecklist(required, optional []string, threshold *ChecklistThreshold) *Category
- func (c *Category) WithLikert(config *LikertConfig) *Category
- func (c *Category) WithLikert5(anchors []LikertAnchor) *Category
- func (c *Category) WithPassPartialFail(passCriteria, partialCriteria, failCriteria []string) *Category
- type CategoryExamples
- type CategoryResult
- func NewCategoryResult(category string, score ScoreValue, reasoning string) *CategoryResult
- func NewCategoryResultFromLikert(category string, likertScore int, config *LikertConfig, reasoning string) *CategoryResult
- func NewCategoryResultWithNumeric(category string, score ScoreValue, numericScore float64, reasoning string) *CategoryResult
- func (cr *CategoryResult) AddEvidence(evidence ...string) *CategoryResult
- func (cr *CategoryResult) AddFinding(f Finding) *CategoryResult
- func (cr *CategoryResult) GetNumericScore() float64
- func (cr *CategoryResult) HasNumericScore() bool
- func (cr *CategoryResult) IsPassing() bool
- func (cr *CategoryResult) SetChecklistResults(results *ChecklistResults) *CategoryResult
- func (cr *CategoryResult) SetNumericScore(score float64) *CategoryResult
- type CategoryResultCounts
- type ChecklistResults
- type ChecklistThreshold
- type Decision
- type DecisionStatus
- type EvaluationType
- type Example
- type Finding
- type FindingCounts
- type FindingLimits
- type IRRMetrics
- type JudgeCategoricalScore
- type JudgeDisagreement
- type JudgeMetadata
- func (j *JudgeMetadata) SetLatency(d time.Duration)
- func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata
- func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata
- func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata
- func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata
- func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata
- type LikertAnchor
- type LikertConfig
- type MultiJudgeResult
- type NextSteps
- type PairwiseCategoryScore
- type PairwiseComparison
- type PairwiseResult
- type PairwiseWinner
- type PassCriteria
- type RatingPair
- type ReferenceData
- type ReferenceDataset
- type ReportMetadata
- type Rubric
- func (r *Rubric) AddCategoryResult(cr CategoryResult)
- func (r *Rubric) AddFinding(f Finding)
- func (r *Rubric) Evaluate(rubricSet *RubricSet) Decision
- func (r *Rubric) Finalize(rubricSet *RubricSet, rerunCommand string)
- func (r *Rubric) GenerateNextSteps(rerunCommand string)
- func (r *Rubric) GenerateSummary() string
- func (r *Rubric) GetCategoryResult(categoryID string) *CategoryResult
- func (r *Rubric) SetJudge(judge *JudgeMetadata)
- func (r *Rubric) SetPassCriteria(criteria PassCriteria)
- func (r *Rubric) SetReference(ref *ReferenceData)
- func (r *Rubric) SetRubricInfo(rubricID, rubricVersion string)
- type RubricMetadata
- type RubricPassCriteria
- type RubricSet
- func (rs *RubricSet) AddCategory(cat Category) *RubricSet
- func (rs *RubricSet) GetCategory(id string) *Category
- func (rs *RubricSet) GetRequiredCategories() []Category
- func (rs *RubricSet) SetJudgePrompt(template string) *RubricSet
- func (rs *RubricSet) SetMetadata(meta *RubricMetadata) *RubricSet
- func (rs *RubricSet) SetPassCriteria(criteria RubricPassCriteria) *RubricSet
- func (rs *RubricSet) ToJSON() ([]byte, error)
- func (rs *RubricSet) Validate() []string
- type Scale
- type ScaleOption
- type ScaleType
- type ScoreValue
- type Severity
- type TokenUsage
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func AllRequiredPassing ¶
func AllRequiredPassing(results []CategoryResult, rubric *RubricSet) bool
AllRequiredPassing checks if all required categories passed.
Types ¶
type ActionItem ¶
type ActionItem struct {
// Action describes what needs to be done.
Action string `json:"action"`
// Category is the related evaluation category.
Category string `json:"category,omitempty"`
// Severity is the related finding severity.
Severity Severity `json:"severity,omitempty"`
// Owner suggests who should do this.
Owner string `json:"owner,omitempty"`
// Effort estimates work required.
Effort string `json:"effort,omitempty"`
}
ActionItem is a specific action to take.
type AggregationMethod ¶
type AggregationMethod string
AggregationMethod specifies how to combine multiple judge scores.
const (
// AggregationMajority uses majority vote for pass/partial/fail.
AggregationMajority AggregationMethod = "majority"
// AggregationConservative uses the lowest/most critical score.
AggregationConservative AggregationMethod = "conservative"
// AggregationOptimistic uses the highest/most lenient score.
AggregationOptimistic AggregationMethod = "optimistic"
// AggregationUnanimous requires all judges to agree.
AggregationUnanimous AggregationMethod = "unanimous"
)
type Annotation ¶
type Annotation struct {
// Name is the annotation type (e.g., "quality", "relevance").
Name string `json:"name"`
// Score is a numeric score (if applicable).
Score float64 `json:"score,omitempty"`
// Label is a categorical label (if applicable).
Label string `json:"label,omitempty"`
// Explanation provides reasoning for the annotation.
Explanation string `json:"explanation,omitempty"`
// AnnotatorID identifies who provided this annotation.
AnnotatorID string `json:"annotator_id,omitempty"`
// AnnotatorType indicates human vs automated (e.g., "human", "llm", "rule").
AnnotatorType string `json:"annotator_type,omitempty"`
}
Annotation represents a label or score supplied by an annotator, which may be a human or an automated source (see AnnotatorType).
type CategoricalAgreement ¶
type CategoricalAgreement struct {
// ExactAgreement is percentage of exact categorical matches.
ExactAgreement float64 `json:"exactAgreement"`
// ConfusionMatrix shows disagreement patterns.
// Keys are "rater1_score:rater2_score" (e.g., "pass:partial").
ConfusionMatrix map[string]int `json:"confusionMatrix"`
// SampleSize is the number of paired ratings.
SampleSize int `json:"sampleSize"`
}
CategoricalAgreement holds agreement statistics between two sets of categorical scores.
func ComputeCategoricalAgreement ¶
func ComputeCategoricalAgreement(results1, results2 []CategoryResult) *CategoricalAgreement
ComputeCategoricalAgreement computes agreement between categorical scores.
type Category ¶
type Category struct {
// ID uniquely identifies this category within the rubric.
ID string `json:"id"`
// Name is the human-readable category name.
Name string `json:"name"`
// Description explains what this category measures.
Description string `json:"description"`
// Weight is the relative importance (default 1.0).
Weight float64 `json:"weight,omitempty"`
// Required indicates if this category must pass for overall pass.
Required bool `json:"required,omitempty"`
// Scale defines how this category is scored.
Scale Scale `json:"scale"`
// EvaluationPrompt is a specific prompt for evaluating this category.
EvaluationPrompt string `json:"evaluationPrompt,omitempty"`
// Examples provides few-shot examples for LLM evaluation.
// Research shows 1 example per level improves LLM alignment.
Examples *CategoryExamples `json:"examples,omitempty"`
}
Category is a single evaluation dimension.
func NewCategory ¶
NewCategory creates a new category with a categorical scale.
func (*Category) GetOptionForValue ¶
func (c *Category) GetOptionForValue(value string) *ScaleOption
GetOptionForValue returns the scale option for a given value.
func (*Category) SetEvaluationPrompt ¶
func (c *Category) SetEvaluationPrompt(prompt string) *Category
SetEvaluationPrompt sets the evaluation prompt for this category.
func (*Category) SetExamples ¶
func (c *Category) SetExamples(examples *CategoryExamples) *Category
SetExamples sets few-shot examples for the category.
func (*Category) SetRequired ¶
func (c *Category) SetRequired(required bool) *Category
SetRequired marks this category as required for pass.
func (*Category) WithBinary ¶
func (c *Category) WithBinary(passCriteria, failCriteria []string) *Category
WithBinary sets up a binary pass/fail scale.
func (*Category) WithChecklist ¶
func (c *Category) WithChecklist(required, optional []string, threshold *ChecklistThreshold) *Category
WithChecklist sets up a checklist scale.
func (*Category) WithLikert ¶
func (c *Category) WithLikert(config *LikertConfig) *Category
WithLikert sets up a Likert scale with custom configuration.
func (*Category) WithLikert5 ¶
func (c *Category) WithLikert5(anchors []LikertAnchor) *Category
WithLikert5 sets up a standard 1-5 Likert scale. Default thresholds: 4-5 = pass, 3 = partial, 1-2 = fail.
func (*Category) WithPassPartialFail ¶
func (c *Category) WithPassPartialFail(passCriteria, partialCriteria, failCriteria []string) *Category
WithPassPartialFail sets up a standard pass/partial/fail scale.
type CategoryExamples ¶
type CategoryExamples struct {
Pass *Example `json:"pass,omitempty"`
Partial *Example `json:"partial,omitempty"`
Fail *Example `json:"fail,omitempty"`
}
CategoryExamples provides few-shot examples for a category. Research shows 1 example per level improves LLM alignment.
type CategoryResult ¶
type CategoryResult struct {
// Category is the category ID.
Category string `json:"category"`
// Score is the assigned score (pass, partial, fail).
// This is the authoritative score for decision-making.
Score ScoreValue `json:"score"`
// NumericScore is an optional numeric score (e.g., 1-5 Likert).
// Used for human comparison, inter-rater reliability, and calibration.
// The categorical Score takes precedence for pass/fail decisions.
NumericScore *float64 `json:"numericScore,omitempty"`
// Reasoning explains the score (chain-of-thought).
Reasoning string `json:"reasoning"`
// Evidence are specific quotes or observations.
Evidence []string `json:"evidence,omitempty"`
// Findings are issues discovered in this category.
Findings []Finding `json:"findings,omitempty"`
// ChecklistResults tracks checklist items (for checklist scales).
ChecklistResults *ChecklistResults `json:"checklistResults,omitempty"`
}
CategoryResult is the evaluation result for a single category.
func NewCategoryResult ¶
func NewCategoryResult(category string, score ScoreValue, reasoning string) *CategoryResult
NewCategoryResult creates a category result with the given score.
func NewCategoryResultFromLikert ¶
func NewCategoryResultFromLikert(category string, likertScore int, config *LikertConfig, reasoning string) *CategoryResult
NewCategoryResultFromLikert creates a category result from a Likert score. The categorical score is derived from the numeric score using the config thresholds.
func NewCategoryResultWithNumeric ¶
func NewCategoryResultWithNumeric(category string, score ScoreValue, numericScore float64, reasoning string) *CategoryResult
NewCategoryResultWithNumeric creates a category result with both categorical and numeric scores. The numeric score is used for human comparison; categorical score is authoritative for decisions.
func (*CategoryResult) AddEvidence ¶
func (cr *CategoryResult) AddEvidence(evidence ...string) *CategoryResult
AddEvidence adds evidence to the result.
func (*CategoryResult) AddFinding ¶
func (cr *CategoryResult) AddFinding(f Finding) *CategoryResult
AddFinding adds a finding to the result.
func (*CategoryResult) GetNumericScore ¶
func (cr *CategoryResult) GetNumericScore() float64
GetNumericScore returns the numeric score, or 0 if not set.
func (*CategoryResult) HasNumericScore ¶
func (cr *CategoryResult) HasNumericScore() bool
HasNumericScore returns true if a numeric score is set.
func (*CategoryResult) IsPassing ¶
func (cr *CategoryResult) IsPassing() bool
IsPassing returns true if this category passed.
func (*CategoryResult) SetChecklistResults ¶
func (cr *CategoryResult) SetChecklistResults(results *ChecklistResults) *CategoryResult
SetChecklistResults sets the checklist results.
func (*CategoryResult) SetNumericScore ¶
func (cr *CategoryResult) SetNumericScore(score float64) *CategoryResult
SetNumericScore sets the numeric score.
type CategoryResultCounts ¶
type CategoryResultCounts struct {
Pass int `json:"pass"`
Partial int `json:"partial"`
Fail int `json:"fail"`
Total int `json:"total"`
}
CategoryResultCounts holds the number of category results for each score value.
func CountResults ¶
func CountResults(results []CategoryResult) CategoryResultCounts
CountResults counts category results by score.
func (CategoryResultCounts) AllPassing ¶
func (c CategoryResultCounts) AllPassing() bool
AllPassing returns true if all results are passing.
type ChecklistResults ¶
type ChecklistResults struct {
// RequiredPresent are required items that were found.
RequiredPresent []string `json:"requiredPresent,omitempty"`
// RequiredMissing are required items that were not found.
RequiredMissing []string `json:"requiredMissing,omitempty"`
// OptionalPresent are optional items that were found.
OptionalPresent []string `json:"optionalPresent,omitempty"`
// OptionalMissing are optional items that were not found.
OptionalMissing []string `json:"optionalMissing,omitempty"`
}
ChecklistResults tracks which items were found for checklist scales.
type ChecklistThreshold ¶
type ChecklistThreshold struct {
// Required is "all" or a number of required items that must be present.
Required string `json:"required,omitempty"`
// Optional is the minimum number of optional items needed.
Optional int `json:"optional,omitempty"`
}
ChecklistThreshold defines pass criteria for checklist scales.
type Decision ¶
type Decision struct {
// Status is the decision outcome.
Status DecisionStatus `json:"status"`
// Passed indicates if the evaluation passed.
Passed bool `json:"passed"`
// Rationale explains the decision.
Rationale string `json:"rationale"`
// FindingCounts summarizes findings by severity.
FindingCounts FindingCounts `json:"findingCounts"`
// CategoryCounts summarizes category results.
CategoryCounts CategoryResultCounts `json:"categoryCounts"`
}
Decision represents the evaluation decision.
func EvaluateResults ¶
func EvaluateResults(results []CategoryResult, findings []Finding, criteria PassCriteria, rubricSet *RubricSet) Decision
EvaluateResults checks category results and findings against criteria.
type DecisionStatus ¶
type DecisionStatus string
DecisionStatus represents the decision outcome.
const (
DecisionPass DecisionStatus = "pass" // Meets all criteria
DecisionConditional DecisionStatus = "conditional" // Partial scores or non-blocking findings
DecisionFail DecisionStatus = "fail" // Has blocking findings or required categories failed
DecisionHumanReview DecisionStatus = "human_review" // Requires human judgment
)
type EvaluationType ¶
type EvaluationType string
EvaluationType defines how evaluation is performed.
const (
// EvaluationTypeAnalytic scores each category independently (recommended for LLM-as-Judge).
EvaluationTypeAnalytic EvaluationType = "analytic"
// EvaluationTypeHolistic provides a single overall score.
EvaluationTypeHolistic EvaluationType = "holistic"
)
type Example ¶
type Example struct {
// Excerpt is example content from a document.
Excerpt string `json:"excerpt"`
// Reasoning explains why this gets this score.
// Including reasoning improves LLM alignment (chain-of-thought).
Reasoning string `json:"reasoning"`
}
Example is a few-shot example for LLM evaluation.
type Finding ¶
type Finding struct {
// ID is the unique identifier for this finding.
ID string `json:"id"`
// Category is the evaluation category this relates to.
Category string `json:"category"`
// Severity indicates the impact level.
Severity Severity `json:"severity"`
// Title is a brief summary of the finding.
Title string `json:"title"`
// Description provides detailed explanation.
Description string `json:"description"`
// Recommendation explains how to fix the issue.
Recommendation string `json:"recommendation"`
// Evidence provides specific examples or references.
Evidence string `json:"evidence,omitempty"`
// Owner suggests who should address this finding.
Owner string `json:"owner,omitempty"`
// Effort estimates the work required (low, medium, high).
Effort string `json:"effort,omitempty"`
}
Finding represents an issue discovered during evaluation.
func (*Finding) IsBlocking ¶
IsBlocking returns true if this finding blocks approval.
type FindingCounts ¶
type FindingCounts struct {
Critical int `json:"critical"`
High int `json:"high"`
Medium int `json:"medium"`
Low int `json:"low"`
Info int `json:"info"`
Total int `json:"total"`
}
FindingCounts tracks the number of findings by severity.
func CountFindings ¶
func CountFindings(findings []Finding) FindingCounts
CountFindings counts findings by severity.
func (FindingCounts) BlockingCount ¶
func (c FindingCounts) BlockingCount() int
BlockingCount returns the number of blocking findings.
func (FindingCounts) HasBlocking ¶
func (c FindingCounts) HasBlocking() bool
HasBlocking returns true if there are any blocking findings.
type FindingLimits ¶
type FindingLimits struct {
Critical int `json:"critical"`
High int `json:"high"`
Medium int `json:"medium"`
Low int `json:"low,omitempty"`
}
FindingLimits sets maximum allowed findings per severity. Use -1 for unlimited.
type IRRMetrics ¶
type IRRMetrics struct {
// ExactAgreement is the percentage of exact score matches.
ExactAgreement float64 `json:"exactAgreement"`
// AdjacentAgreement is the percentage within ±1 of each other.
AdjacentAgreement float64 `json:"adjacentAgreement"`
// MeanAbsoluteDifference is the average absolute difference.
MeanAbsoluteDifference float64 `json:"meanAbsoluteDifference"`
// PearsonCorrelation measures linear correlation (-1 to 1).
PearsonCorrelation float64 `json:"pearsonCorrelation"`
// SampleSize is the number of paired ratings.
SampleSize int `json:"sampleSize"`
}
IRRMetrics contains inter-rater reliability metrics. These metrics are useful when comparing LLM and human ratings.
func ComputeIRR ¶
func ComputeIRR(pairs []RatingPair) *IRRMetrics
ComputeIRR calculates inter-rater reliability metrics from paired ratings.
func ComputeIRRFromResults ¶
func ComputeIRRFromResults(results1, results2 []CategoryResult) *IRRMetrics
ComputeIRRFromResults computes IRR metrics from two sets of category results. Useful for comparing LLM evaluation with human ground truth.
type JudgeCategoricalScore ¶
type JudgeCategoricalScore struct {
// JudgeID identifies the judge.
JudgeID string `json:"judgeId"`
// Score is the judge's categorical score.
Score ScoreValue `json:"score"`
}
JudgeCategoricalScore is a categorical score from a specific judge.
type JudgeDisagreement ¶
type JudgeDisagreement struct {
// Category is the evaluation dimension.
Category string `json:"category"`
// Scores are the individual judge scores.
Scores []JudgeCategoricalScore `json:"scores"`
// UniqueScores is the number of distinct scores given.
UniqueScores int `json:"uniqueScores"`
}
JudgeDisagreement captures where judges had significantly different scores.
type JudgeMetadata ¶
type JudgeMetadata struct {
// JudgeID is a unique identifier for this judge configuration.
JudgeID string `json:"judge_id,omitempty"`
// Model is the LLM model used (e.g., "claude-3-opus-20240229", "gpt-4-turbo").
Model string `json:"model"`
// ModelProvider is the API provider (e.g., "anthropic", "openai", "bedrock").
ModelProvider string `json:"model_provider,omitempty"`
// ModelVersion is the specific model version if applicable.
ModelVersion string `json:"model_version,omitempty"`
// PromptTemplate is the name/ID of the prompt template used.
PromptTemplate string `json:"prompt_template,omitempty"`
// PromptVersion is the version of the prompt template.
PromptVersion string `json:"prompt_version,omitempty"`
// SystemPrompt is the system prompt used (or hash/reference if too long).
SystemPrompt string `json:"system_prompt,omitempty"`
// Temperature is the sampling temperature used.
Temperature float64 `json:"temperature,omitempty"`
// MaxTokens is the max tokens setting.
MaxTokens int `json:"max_tokens,omitempty"`
// RubricID references the rubric set used for scoring.
RubricID string `json:"rubric_id,omitempty"`
// RubricVersion is the version of the rubric used.
RubricVersion string `json:"rubric_version,omitempty"`
// EvaluatedAt is when this evaluation was performed.
EvaluatedAt time.Time `json:"evaluated_at,omitempty"`
// Latency is the evaluation duration.
Latency time.Duration `json:"latency,omitempty"`
// TokensUsed tracks token consumption.
TokensUsed *TokenUsage `json:"tokens_used,omitempty"`
// TraceID links to observability trace (e.g., for Opik/Phoenix/Langfuse).
TraceID string `json:"trace_id,omitempty"`
// SpanID links to observability span.
SpanID string `json:"span_id,omitempty"`
}
JudgeMetadata tracks information about the LLM judge that produced an evaluation. This enables reproducibility, debugging, and comparison of different judge configurations.
func NewJudgeMetadata ¶
func NewJudgeMetadata(model string) *JudgeMetadata
NewJudgeMetadata creates judge metadata with required fields.
func (*JudgeMetadata) SetLatency ¶
func (j *JudgeMetadata) SetLatency(d time.Duration)
SetLatency records the evaluation duration.
func (*JudgeMetadata) WithPrompt ¶
func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata
WithPrompt sets the prompt template info.
func (*JudgeMetadata) WithProvider ¶
func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata
WithProvider sets the model provider.
func (*JudgeMetadata) WithRubric ¶
func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata
WithRubric sets the rubric reference.
func (*JudgeMetadata) WithTemperature ¶
func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata
WithTemperature sets the sampling temperature.
func (*JudgeMetadata) WithTokenUsage ¶
func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata
WithTokenUsage sets the token usage.
func (*JudgeMetadata) WithTrace ¶
func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata
WithTrace links to observability.
type LikertAnchor ¶
type LikertAnchor struct {
// Value is the numeric score.
Value int `json:"value"`
// Label is the short label (e.g., "Excellent", "Good").
Label string `json:"label"`
// Description explains what this score means.
Description string `json:"description,omitempty"`
}
LikertAnchor describes what a specific score level means.
func StandardLikert5Anchors ¶
func StandardLikert5Anchors() []LikertAnchor
StandardLikert5Anchors returns standard 1-5 Likert anchors.
type LikertConfig ¶
type LikertConfig struct {
// Min is the minimum score value (usually 1 or 0).
Min int `json:"min"`
// Max is the maximum score value (usually 5).
Max int `json:"max"`
// Anchors describe what each score level means.
Anchors []LikertAnchor `json:"anchors,omitempty"`
// PassThreshold is the minimum score for "pass" (default: top 40%).
// For 1-5 scale, default is 4.
PassThreshold *int `json:"passThreshold,omitempty"`
// PartialThreshold is the minimum score for "partial" (default: middle).
// For 1-5 scale, default is 3.
PartialThreshold *int `json:"partialThreshold,omitempty"`
}
LikertConfig defines a Likert scale configuration.
type MultiJudgeResult ¶
type MultiJudgeResult struct {
// Evaluations are the individual judge evaluations.
Evaluations []*Rubric `json:"evaluations"`
// Judges contains metadata for each judge.
Judges []*JudgeMetadata `json:"judges"`
// AggregatedCategories are the combined category results.
AggregatedCategories []CategoryResult `json:"aggregatedCategories"`
// AggregationMethod describes how scores were combined.
AggregationMethod AggregationMethod `json:"aggregationMethod"`
// Agreement measures inter-judge agreement (0-1, higher = more agreement).
Agreement float64 `json:"agreement"`
// Disagreements lists categories where judges significantly disagreed.
Disagreements []JudgeDisagreement `json:"disagreements,omitempty"`
// ConsolidatedDecision is the final decision after aggregation.
ConsolidatedDecision Decision `json:"consolidatedDecision"`
// ConsolidatedFindings merges findings from all judges.
ConsolidatedFindings []Finding `json:"consolidatedFindings"`
}
MultiJudgeResult aggregates evaluations from multiple judges. This improves reliability by combining perspectives and detecting disagreement.
func AggregateEvaluations ¶
func AggregateEvaluations(evaluations []*Rubric, method AggregationMethod) *MultiJudgeResult
AggregateEvaluations combines multiple evaluation reports.
type NextSteps ¶
type NextSteps struct {
// RerunCommand is the command to re-run evaluation.
RerunCommand string `json:"rerunCommand,omitempty"`
// Immediate are blocking actions that must be completed.
Immediate []ActionItem `json:"immediate,omitempty"`
// Recommended are suggested improvements.
Recommended []ActionItem `json:"recommended,omitempty"`
}
NextSteps provides actionable workflow guidance.
type PairwiseCategoryScore ¶
type PairwiseCategoryScore struct {
// Category is the evaluation dimension.
Category string `json:"category"`
// Winner indicates which output won for this category.
Winner PairwiseWinner `json:"winner"`
// Margin indicates how much better the winner is (0-1, higher = larger gap).
Margin float64 `json:"margin,omitempty"`
// Reasoning explains the category-level comparison.
Reasoning string `json:"reasoning,omitempty"`
}
PairwiseCategoryScore compares outputs on a specific dimension.
type PairwiseComparison ¶
type PairwiseComparison struct {
// ID is the unique identifier for this comparison.
ID string `json:"id,omitempty"`
// Input is the shared input/prompt for both outputs.
Input string `json:"input"`
// OutputA is the first output being compared.
OutputA string `json:"output_a"`
// OutputB is the second output being compared.
OutputB string `json:"output_b"`
// Winner indicates which output won ("A", "B", "tie", or "uncertain").
Winner PairwiseWinner `json:"winner"`
// Confidence is the judge's confidence in the decision (0-1).
Confidence float64 `json:"confidence,omitempty"`
// Reasoning explains why this winner was chosen.
Reasoning string `json:"reasoning"`
// CategoryScores provides per-category comparisons if applicable.
CategoryScores []PairwiseCategoryScore `json:"category_scores,omitempty"`
// Judge contains metadata about the LLM judge.
Judge *JudgeMetadata `json:"judge,omitempty"`
// Metadata contains additional comparison context.
Metadata map[string]any `json:"metadata,omitempty"`
// CreatedAt is when this comparison was made.
CreatedAt time.Time `json:"created_at,omitempty"`
}
PairwiseComparison represents a comparison between two outputs. This is an alternative to absolute scoring that can reduce position bias and improve reliability of LLM-as-Judge evaluations.
func NewPairwiseComparison ¶
func NewPairwiseComparison(input, outputA, outputB string) *PairwiseComparison
NewPairwiseComparison creates a new pairwise comparison.
func (*PairwiseComparison) AddCategoryScore ¶
func (p *PairwiseComparison) AddCategoryScore(category string, winner PairwiseWinner, reasoning string, margin float64)
AddCategoryScore adds a per-category comparison.
func (*PairwiseComparison) SetWinner ¶
func (p *PairwiseComparison) SetWinner(winner PairwiseWinner, reasoning string, confidence float64)
SetWinner sets the comparison result.
func (*PairwiseComparison) SwappedComparison ¶
func (p *PairwiseComparison) SwappedComparison() *PairwiseComparison
SwappedComparison creates a comparison with A and B swapped. Running both orders helps detect position bias in the judge.
type PairwiseResult ¶
type PairwiseResult struct {
// Comparisons are all the individual comparisons.
Comparisons []PairwiseComparison `json:"comparisons"`
// WinRateA is the percentage of comparisons won by A.
WinRateA float64 `json:"win_rate_a"`
// WinRateB is the percentage of comparisons won by B.
WinRateB float64 `json:"win_rate_b"`
// TieRate is the percentage of ties.
TieRate float64 `json:"tie_rate"`
// OverallWinner is the aggregated winner.
OverallWinner PairwiseWinner `json:"overall_winner"`
// Confidence is the overall confidence in the result.
Confidence float64 `json:"confidence"`
}
PairwiseResult aggregates multiple pairwise comparisons.
func ComputePairwiseResult ¶
func ComputePairwiseResult(comparisons []PairwiseComparison) *PairwiseResult
ComputePairwiseResult aggregates multiple comparisons into a result.
type PairwiseWinner ¶
type PairwiseWinner string
PairwiseWinner indicates the winner of a pairwise comparison.
const (
// WinnerA indicates output A is better.
WinnerA PairwiseWinner = "A"
// WinnerB indicates output B is better.
WinnerB PairwiseWinner = "B"
// WinnerTie indicates both outputs are roughly equal.
WinnerTie PairwiseWinner = "tie"
// WinnerUncertain indicates the judge couldn't determine a winner.
WinnerUncertain PairwiseWinner = "uncertain"
)
type PassCriteria ¶
type PassCriteria struct {
// MinCategoriesPassing specifies how many categories must pass.
// Values: "all", "all_required", or a number like "3".
MinCategoriesPassing string `json:"minCategoriesPassing"`
// MaxFindings limits findings by severity.
// Use -1 for unlimited.
MaxFindings *FindingLimits `json:"maxFindingsSeverity,omitempty"`
}
PassCriteria defines the requirements for approval. Aligned with LLM-as-Judge best practices.
func DefaultPassCriteria ¶
func DefaultPassCriteria() PassCriteria
DefaultPassCriteria returns standard pass criteria. All required categories must pass, 0 critical/high findings allowed.
func StrictPassCriteria ¶
func StrictPassCriteria() PassCriteria
StrictPassCriteria returns strict pass criteria. All categories must pass, max 3 medium findings.
type RatingPair ¶
type RatingPair struct {
// Rater1 is the first rater's score (e.g., human).
Rater1 float64
// Rater2 is the second rater's score (e.g., LLM).
Rater2 float64
// Category is the category being rated.
Category string
// ItemID identifies the item being rated.
ItemID string
}
RatingPair represents a pair of ratings for the same item.
type ReferenceData ¶
type ReferenceData struct {
// ID is the unique identifier for this reference.
ID string `json:"id,omitempty"`
// Input is the input/prompt that produced the reference output.
Input string `json:"input,omitempty"`
// ExpectedOutput is the gold/reference output.
ExpectedOutput string `json:"expected_output,omitempty"`
// ExpectedOutputs allows multiple acceptable outputs.
ExpectedOutputs []string `json:"expected_outputs,omitempty"`
// Context provides additional context (e.g., retrieved documents for RAG).
Context []string `json:"context,omitempty"`
// Annotations are human-provided labels or scores.
Annotations []Annotation `json:"annotations,omitempty"`
// Source indicates where this reference came from.
Source string `json:"source,omitempty"`
// Tags categorize or filter references.
Tags []string `json:"tags,omitempty"`
// Metadata contains additional reference data.
Metadata map[string]any `json:"metadata,omitempty"`
}
ReferenceData contains ground truth or expected data for evaluation. This enables reference-based evaluation where outputs are compared against known-good examples.
func NewReferenceData ¶
func NewReferenceData(input, expectedOutput string) *ReferenceData
NewReferenceData creates a new reference data item.
func (*ReferenceData) WithAnnotation ¶
func (r *ReferenceData) WithAnnotation(name string, score float64, annotatorID string) *ReferenceData
WithAnnotation adds a human annotation.
func (*ReferenceData) WithContext ¶
func (r *ReferenceData) WithContext(ctx ...string) *ReferenceData
WithContext adds context documents.
type ReferenceDataset ¶
// ReferenceDataset groups ReferenceData items under a shared identity.
// ID, Name and Items are always written to JSON; the remaining fields are
// omitted when empty.
type ReferenceDataset struct {
// ID is the unique identifier for this dataset.
ID string `json:"id"`
// Name is the display name.
Name string `json:"name"`
// Description explains what this dataset contains.
Description string `json:"description,omitempty"`
// Version tracks dataset iterations.
Version string `json:"version,omitempty"`
// Items are the reference data items.
// There is no omitempty here, so a nil slice marshals as JSON null
// rather than an empty array.
Items []ReferenceData `json:"items"`
// Tags categorize the dataset.
Tags []string `json:"tags,omitempty"`
// Metadata contains additional dataset info as free-form key/value pairs.
Metadata map[string]any `json:"metadata,omitempty"`
}
ReferenceDataset is a collection of reference data items.
func NewReferenceDataset ¶
func NewReferenceDataset(id, name string) *ReferenceDataset
NewReferenceDataset creates a new reference dataset.
func (*ReferenceDataset) AddItem ¶
func (d *ReferenceDataset) AddItem(item ReferenceData)
AddItem adds a reference data item to the dataset.
func (*ReferenceDataset) GetByID ¶
func (d *ReferenceDataset) GetByID(id string) *ReferenceData
GetByID retrieves a reference item by ID.
type ReportMetadata ¶
// ReportMetadata identifies the evaluated document and the origin of the
// report. Document and GeneratedAt are always written to JSON; the other
// fields are omitted when empty. Keys use camelCase.
type ReportMetadata struct {
// Document is the filename or path being evaluated.
Document string `json:"document"`
// DocumentID is the document identifier (e.g., PRD ID).
DocumentID string `json:"documentId,omitempty"`
// DocumentTitle is the document title.
DocumentTitle string `json:"documentTitle,omitempty"`
// DocumentVersion is the document version.
DocumentVersion string `json:"documentVersion,omitempty"`
// GeneratedAt is when the report was created.
// time.Time marshals itself as an RFC 3339 string; a zero value is still
// emitted because the field has no omitempty.
GeneratedAt time.Time `json:"generatedAt"`
// GeneratedBy identifies what created this report.
GeneratedBy string `json:"generatedBy,omitempty"`
// ReviewerID identifies the reviewer (agent or human).
ReviewerID string `json:"reviewerId,omitempty"`
}
ReportMetadata contains report identification.
type Rubric ¶
// Rubric is a complete evaluation report: identification, per-category
// results, findings, the criteria applied and the resulting decision.
// Fields without omitempty (metadata, reviewType, categories, findings,
// passCriteria, decision, overallDecision, nextSteps, summary) are always
// present in the JSON output; nil Categories/Findings slices marshal as null.
type Rubric struct {
// Schema is the JSON Schema URL.
Schema string `json:"$schema,omitempty"`
// Metadata contains report identification and audit info.
Metadata ReportMetadata `json:"metadata"`
// ReviewType identifies the type of review (prd, arb, security, article, etc.).
ReviewType string `json:"reviewType"`
// Judge contains metadata about the LLM judge. Nil means no judge
// metadata is recorded and the key is omitted.
Judge *JudgeMetadata `json:"judge,omitempty"`
// RubricID references the rubric used for scoring.
RubricID string `json:"rubricId,omitempty"`
// RubricVersion is the version of the rubric used.
RubricVersion string `json:"rubricVersion,omitempty"`
// Reference contains gold/expected data for comparison. Nil means no
// reference is attached and the key is omitted.
Reference *ReferenceData `json:"reference,omitempty"`
// Categories contains results for each evaluation dimension.
Categories []CategoryResult `json:"categories"`
// Findings are all issues discovered during evaluation.
Findings []Finding `json:"findings"`
// PassCriteria defines the requirements for approval.
PassCriteria PassCriteria `json:"passCriteria"`
// Decision is the evaluation outcome.
Decision Decision `json:"decision"`
// OverallDecision is a simplified pass/conditional/fail status.
// NOTE(review): presumably derived from Decision; the type does not keep
// the two in sync — confirm where it is populated.
OverallDecision string `json:"overallDecision"`
// NextSteps provides actionable guidance.
NextSteps NextSteps `json:"nextSteps"`
// Summary is the overall assessment.
Summary string `json:"summary"`
}
Rubric is the detailed rubric-based evaluation report for LLM-as-Judge reviews.
func (*Rubric) AddCategoryResult ¶
func (r *Rubric) AddCategoryResult(cr CategoryResult)
AddCategoryResult adds a category result.
func (*Rubric) GenerateNextSteps ¶
GenerateNextSteps creates actionable next steps.
func (*Rubric) GenerateSummary ¶
GenerateSummary creates the summary text.
func (*Rubric) GetCategoryResult ¶
func (r *Rubric) GetCategoryResult(categoryID string) *CategoryResult
GetCategoryResult returns a category result by ID, or nil if not found.
func (*Rubric) SetJudge ¶
func (r *Rubric) SetJudge(judge *JudgeMetadata)
SetJudge sets the judge metadata.
func (*Rubric) SetPassCriteria ¶
func (r *Rubric) SetPassCriteria(criteria PassCriteria)
SetPassCriteria sets the pass criteria.
func (*Rubric) SetReference ¶
func (r *Rubric) SetReference(ref *ReferenceData)
SetReference sets the reference data for comparison.
func (*Rubric) SetRubricInfo ¶
SetRubricInfo sets the rubric ID and version.
type RubricMetadata ¶
// RubricMetadata carries optional descriptive information about a rubric.
// Every field is omitted from JSON when empty.
type RubricMetadata struct {
// CreatedAt is presumably the rubric's creation time. It is a plain
// string, so the type enforces no timestamp format — TODO confirm the
// expected format with callers.
CreatedAt string `json:"createdAt,omitempty"`
// Author presumably names who wrote the rubric — verify against callers.
Author string `json:"author,omitempty"`
// BasedOn presumably lists the sources or earlier rubrics this one
// derives from — TODO confirm what identifiers are expected here.
BasedOn []string `json:"basedOn,omitempty"`
}
RubricMetadata contains additional rubric information.
type RubricPassCriteria ¶
// RubricPassCriteria describes the rubric-level thresholds for an overall
// pass. Both fields are omitted from JSON when unset.
type RubricPassCriteria struct {
// MinCategoriesPassing is "all", "all_required", or a number.
// The field is a string, so a numeric threshold is carried in its
// decimal string form.
MinCategoriesPassing string `json:"minCategoriesPassing,omitempty"`
// MaxFindings limits findings by severity.
// Note that the JSON key is "maxFindingsSeverity", which differs from the
// Go field name. Nil means no limits are specified.
MaxFindings *FindingLimits `json:"maxFindingsSeverity,omitempty"`
}
RubricPassCriteria defines requirements for overall pass/fail determination.
type RubricSet ¶
// RubricSet defines a full rubric: its identity, the categories to score and
// the criteria for an overall pass. ID, Name, Version, PassCriteria and
// Categories are always written to JSON; the remaining fields are omitted
// when empty.
type RubricSet struct {
// ID uniquely identifies this rubric set.
ID string `json:"id"`
// Name is the human-readable name.
Name string `json:"name"`
// Version is the semantic version of this rubric.
Version string `json:"version"`
// Description explains what this rubric set evaluates.
Description string `json:"description,omitempty"`
// EvaluationType is "analytic" (per-category) or "holistic" (single score).
// Analytic is recommended for LLM-as-Judge.
EvaluationType EvaluationType `json:"evaluationType,omitempty"`
// PassCriteria defines requirements for overall pass/fail.
// It is a struct value, so the key is emitted even when no criteria are set.
PassCriteria RubricPassCriteria `json:"passCriteria"`
// Categories are the evaluation dimensions.
Categories []Category `json:"categories"`
// JudgePromptTemplate is the prompt template for LLM evaluation.
// Supports placeholders: {content}, {categories}, etc.
JudgePromptTemplate string `json:"judgePromptTemplate,omitempty"`
// Metadata contains additional information about the rubric.
// Nil means no metadata is attached and the key is omitted.
Metadata *RubricMetadata `json:"metadata,omitempty"`
}
RubricSet is a collection of rubrics for a complete evaluation. It follows Go-first principles: the Go types are the source of truth, and the JSON Schema is generated from them.
func NewRubricSet ¶
NewRubricSet creates a new rubric set with required fields.
func (*RubricSet) AddCategory ¶
AddCategory adds a category to the rubric set.
func (*RubricSet) GetCategory ¶
GetCategory returns a category by ID, or nil if not found.
func (*RubricSet) GetRequiredCategories ¶
GetRequiredCategories returns all required categories.
func (*RubricSet) SetJudgePrompt ¶
SetJudgePrompt sets the judge prompt template.
func (*RubricSet) SetMetadata ¶
func (rs *RubricSet) SetMetadata(meta *RubricMetadata) *RubricSet
SetMetadata sets the rubric metadata.
func (*RubricSet) SetPassCriteria ¶
func (rs *RubricSet) SetPassCriteria(criteria RubricPassCriteria) *RubricSet
SetPassCriteria sets the pass criteria.
type Scale ¶
// Scale describes how a category is scored. Only Type is always serialized;
// each remaining field applies to one scale type (named in its comment) and
// is omitted from JSON when empty.
type Scale struct {
// Type is "categorical", "checklist", "binary", or "likert".
// Categorical with 2-3 options is recommended for LLM-as-Judge.
// Likert is better for human comparison studies.
Type ScaleType `json:"type"`
// Options are the scoring options (for categorical scales).
Options []ScaleOption `json:"options,omitempty"`
// RequiredItems are items that must be present (for checklist scales).
RequiredItems []string `json:"requiredItems,omitempty"`
// OptionalItems are items that add value (for checklist scales).
OptionalItems []string `json:"optionalItems,omitempty"`
// PassingThreshold defines pass criteria (for checklist scales).
PassingThreshold *ChecklistThreshold `json:"passingThreshold,omitempty"`
// LikertConfig defines the likert scale (for likert scales).
LikertConfig *LikertConfig `json:"likertConfig,omitempty"`
}
Scale defines the scoring mechanism for a category.
type ScaleOption ¶
// ScaleOption is one selectable score level. All three fields are always
// written to JSON.
type ScaleOption struct {
// Value is the machine-readable value (e.g., "pass", "partial", "fail").
Value string `json:"value"`
// Label is the human-readable label.
Label string `json:"label"`
// Criteria are specific requirements for this score level.
// There is no omitempty here, so a nil slice marshals as JSON null
// rather than an empty array.
Criteria []string `json:"criteria"`
}
ScaleOption is a single option in a categorical scale.
type ScaleType ¶
type ScaleType string
ScaleType defines the type of scoring scale.
const (
	// ScaleTypeCategorical uses discrete categories (pass/partial/fail).
	// Recommended for LLM-as-Judge - better calibrated than numeric scales.
	ScaleTypeCategorical ScaleType = "categorical"
	// ScaleTypeChecklist uses a list of required/optional items.
	ScaleTypeChecklist ScaleType = "checklist"
	// ScaleTypeBinary is simple pass/fail.
	ScaleTypeBinary ScaleType = "binary"
	// ScaleTypeLikert uses a numeric scale (e.g., 1-5).
	// Better for human comparison and inter-rater reliability studies.
	// Scores are mapped to categorical (pass/partial/fail) for decisions.
	ScaleTypeLikert ScaleType = "likert"
)
type ScoreValue ¶
type ScoreValue string
ScoreValue represents a categorical score value.
const (
	ScorePass    ScoreValue = "pass"
	ScorePartial ScoreValue = "partial"
	ScoreFail    ScoreValue = "fail"
)
func LikertToCategorical ¶
func LikertToCategorical(score int, config *LikertConfig) ScoreValue
LikertToCategorical converts a Likert score to categorical (pass/partial/fail).
func (ScoreValue) Icon ¶
func (s ScoreValue) Icon() string
Icon returns the emoji icon for the score.
func (ScoreValue) IsFailing ¶
func (s ScoreValue) IsFailing() bool
IsFailing returns true if this score is failing.
func (ScoreValue) IsPartial ¶
func (s ScoreValue) IsPartial() bool
IsPartial returns true if this score is partial.
func (ScoreValue) IsPassing ¶
func (s ScoreValue) IsPassing() bool
IsPassing returns true if this score is considered passing.
type Severity ¶
type Severity string
Severity represents the severity level of a finding. Based on InfoSec severity classifications.
func AllSeverities ¶
func AllSeverities() []Severity
AllSeverities returns all severity levels in order of severity.
func (Severity) IsBlocking ¶
IsBlocking returns true if this severity blocks approval.
type TokenUsage ¶
// TokenUsage records token counts for one evaluation. The three main
// counters are always written to JSON; the cache counters are omitted when
// zero. Keys use snake_case.
type TokenUsage struct {
// InputTokens is the number of input/prompt tokens.
InputTokens int `json:"input_tokens"`
// OutputTokens is the number of output/completion tokens.
OutputTokens int `json:"output_tokens"`
// TotalTokens is the total tokens used.
// NOTE(review): presumably InputTokens + OutputTokens, but the type does
// not enforce it — confirm where this is computed.
TotalTokens int `json:"total_tokens"`
// CacheReadTokens is tokens read from cache (if applicable).
CacheReadTokens int `json:"cache_read_tokens,omitempty"`
// CacheWriteTokens is tokens written to cache (if applicable).
CacheWriteTokens int `json:"cache_write_tokens,omitempty"`
}
TokenUsage tracks token consumption for an evaluation.