rubric

package
v0.6.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Jun 1, 2026 License: MIT Imports: 5 Imported by: 0

Documentation

Overview

Package rubric provides types for rubric-based evaluation reports with categorical scoring and severity-based findings. It is suited to LLM-as-Judge style reviews such as PRD and ARB evaluations.

Index

Constants

This section is empty.

Variables

This section is empty.

Functions

func AllRequiredPassing

func AllRequiredPassing(results []CategoryResult, rubric *RubricSet) bool

AllRequiredPassing checks if all required categories passed.

Types

type ActionItem

type ActionItem struct {
	// Action describes what needs to be done.
	Action string `json:"action"`

	// Category is the related evaluation category.
	Category string `json:"category,omitempty"`

	// Severity is the related finding severity.
	Severity Severity `json:"severity,omitempty"`

	// Owner suggests who should do this.
	Owner string `json:"owner,omitempty"`

	// Effort estimates work required.
	Effort string `json:"effort,omitempty"`
}

ActionItem is a specific action to take.

type AggregationMethod

type AggregationMethod string

AggregationMethod specifies how to combine multiple judge scores.

const (
	// AggregationMajority uses majority vote for pass/partial/fail.
	AggregationMajority AggregationMethod = "majority"

	// AggregationConservative uses the lowest/most critical score.
	AggregationConservative AggregationMethod = "conservative"

	// AggregationOptimistic uses the highest/most lenient score.
	AggregationOptimistic AggregationMethod = "optimistic"

	// AggregationUnanimous requires all judges to agree.
	AggregationUnanimous AggregationMethod = "unanimous"
)

type Annotation

type Annotation struct {
	// Name is the annotation type (e.g., "quality", "relevance").
	Name string `json:"name"`

	// Score is a numeric score (if applicable).
	Score float64 `json:"score,omitempty"`

	// Label is a categorical label (if applicable).
	Label string `json:"label,omitempty"`

	// Explanation provides reasoning for the annotation.
	Explanation string `json:"explanation,omitempty"`

	// AnnotatorID identifies who provided this annotation.
	AnnotatorID string `json:"annotator_id,omitempty"`

	// AnnotatorType indicates human vs automated (e.g., "human", "llm", "rule").
	AnnotatorType string `json:"annotator_type,omitempty"`
}

Annotation represents a label or score provided by an annotator, which may be a human or an automated source (see AnnotatorType).

type CategoricalAgreement

type CategoricalAgreement struct {
	// ExactAgreement is percentage of exact categorical matches.
	ExactAgreement float64 `json:"exactAgreement"`

	// ConfusionMatrix shows disagreement patterns.
	// Keys are "rater1_score:rater2_score" (e.g., "pass:partial").
	ConfusionMatrix map[string]int `json:"confusionMatrix"`

	// SampleSize is the number of paired ratings.
	SampleSize int `json:"sampleSize"`
}

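CategoricalAgreement holds agreement metrics between two sets of categorical scores: the exact-agreement percentage, a confusion matrix of disagreements, and the sample size.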

func ComputeCategoricalAgreement

func ComputeCategoricalAgreement(results1, results2 []CategoryResult) *CategoricalAgreement

ComputeCategoricalAgreement computes agreement between categorical scores.

type Category

type Category struct {
	// ID uniquely identifies this category within the rubric.
	ID string `json:"id"`

	// Name is the human-readable category name.
	Name string `json:"name"`

	// Description explains what this category measures.
	Description string `json:"description"`

	// Weight is the relative importance (default 1.0).
	Weight float64 `json:"weight,omitempty"`

	// Required indicates if this category must pass for overall pass.
	Required bool `json:"required,omitempty"`

	// Scale defines how this category is scored.
	Scale Scale `json:"scale"`

	// EvaluationPrompt is a specific prompt for evaluating this category.
	EvaluationPrompt string `json:"evaluationPrompt,omitempty"`

	// Examples provides few-shot examples for LLM evaluation.
	// Research shows 1 example per level improves LLM alignment.
	Examples *CategoryExamples `json:"examples,omitempty"`
}

Category is a single evaluation dimension.

func NewCategory

func NewCategory(id, name, description string) *Category

NewCategory creates a new category with a categorical scale.

func (*Category) AddOption

func (c *Category) AddOption(value, label string, criteria ...string) *Category

AddOption adds a scale option to a categorical category.

func (*Category) GetOptionForValue

func (c *Category) GetOptionForValue(value string) *ScaleOption

GetOptionForValue returns the scale option for a given value.

func (*Category) SetEvaluationPrompt

func (c *Category) SetEvaluationPrompt(prompt string) *Category

SetEvaluationPrompt sets the evaluation prompt for this category.

func (*Category) SetExamples

func (c *Category) SetExamples(examples *CategoryExamples) *Category

SetExamples sets few-shot examples for the category.

func (*Category) SetRequired

func (c *Category) SetRequired(required bool) *Category

SetRequired sets whether this category must pass for an overall pass.

func (*Category) SetWeight

func (c *Category) SetWeight(weight float64) *Category

SetWeight sets the category weight.

func (*Category) WithBinary

func (c *Category) WithBinary(passCriteria, failCriteria []string) *Category

WithBinary sets up a binary pass/fail scale.

func (*Category) WithChecklist

func (c *Category) WithChecklist(required, optional []string, threshold *ChecklistThreshold) *Category

WithChecklist sets up a checklist scale.

func (*Category) WithLikert

func (c *Category) WithLikert(config *LikertConfig) *Category

WithLikert sets up a Likert scale with custom configuration.

func (*Category) WithLikert5

func (c *Category) WithLikert5(anchors []LikertAnchor) *Category

WithLikert5 sets up a standard 1-5 Likert scale. Default thresholds: 4-5 = pass, 3 = partial, 1-2 = fail.

func (*Category) WithPassPartialFail

func (c *Category) WithPassPartialFail(passCriteria, partialCriteria, failCriteria []string) *Category

WithPassPartialFail sets up a standard pass/partial/fail scale.

type CategoryExamples

type CategoryExamples struct {
	Pass    *Example `json:"pass,omitempty"`
	Partial *Example `json:"partial,omitempty"`
	Fail    *Example `json:"fail,omitempty"`
}

CategoryExamples provides few-shot examples for a category. Research shows 1 example per level improves LLM alignment.

type CategoryResult

type CategoryResult struct {
	// Category is the category ID.
	Category string `json:"category"`

	// Score is the assigned score (pass, partial, fail).
	// This is the authoritative score for decision-making.
	Score ScoreValue `json:"score"`

	// NumericScore is an optional numeric score (e.g., 1-5 Likert).
	// Used for human comparison, inter-rater reliability, and calibration.
	// The categorical Score takes precedence for pass/fail decisions.
	NumericScore *float64 `json:"numericScore,omitempty"`

	// Reasoning explains the score (chain-of-thought).
	Reasoning string `json:"reasoning"`

	// Evidence are specific quotes or observations.
	Evidence []string `json:"evidence,omitempty"`

	// Findings are issues discovered in this category.
	Findings []Finding `json:"findings,omitempty"`

	// ChecklistResults tracks checklist items (for checklist scales).
	ChecklistResults *ChecklistResults `json:"checklistResults,omitempty"`
}

CategoryResult is the evaluation result for a single category.

func NewCategoryResult

func NewCategoryResult(category string, score ScoreValue, reasoning string) *CategoryResult

NewCategoryResult creates a category result with the given score.

func NewCategoryResultFromLikert

func NewCategoryResultFromLikert(category string, likertScore int, config *LikertConfig, reasoning string) *CategoryResult

NewCategoryResultFromLikert creates a category result from a Likert score. The categorical score is derived from the numeric score using the config thresholds.

func NewCategoryResultWithNumeric

func NewCategoryResultWithNumeric(category string, score ScoreValue, numericScore float64, reasoning string) *CategoryResult

NewCategoryResultWithNumeric creates a category result with both categorical and numeric scores. The numeric score is used for human comparison; categorical score is authoritative for decisions.

func (*CategoryResult) AddEvidence

func (cr *CategoryResult) AddEvidence(evidence ...string) *CategoryResult

AddEvidence adds evidence to the result.

func (*CategoryResult) AddFinding

func (cr *CategoryResult) AddFinding(f Finding) *CategoryResult

AddFinding adds a finding to the result.

func (*CategoryResult) GetNumericScore

func (cr *CategoryResult) GetNumericScore() float64

GetNumericScore returns the numeric score, or 0 if not set.

func (*CategoryResult) HasNumericScore

func (cr *CategoryResult) HasNumericScore() bool

HasNumericScore returns true if a numeric score is set.

func (*CategoryResult) IsPassing

func (cr *CategoryResult) IsPassing() bool

IsPassing returns true if this category passed.

func (*CategoryResult) SetChecklistResults

func (cr *CategoryResult) SetChecklistResults(results *ChecklistResults) *CategoryResult

SetChecklistResults sets the checklist results.

func (*CategoryResult) SetNumericScore

func (cr *CategoryResult) SetNumericScore(score float64) *CategoryResult

SetNumericScore sets the numeric score.

type CategoryResultCounts

type CategoryResultCounts struct {
	Pass    int `json:"pass"`
	Partial int `json:"partial"`
	Fail    int `json:"fail"`
	Total   int `json:"total"`
}

CategoryResultCounts holds the number of category results by score value (pass, partial, fail) along with the total.

func CountResults

func CountResults(results []CategoryResult) CategoryResultCounts

CountResults counts category results by score.

func (CategoryResultCounts) AllPassing

func (c CategoryResultCounts) AllPassing() bool

AllPassing returns true if all results are passing.

type ChecklistResults

type ChecklistResults struct {
	// RequiredPresent are required items that were found.
	RequiredPresent []string `json:"requiredPresent,omitempty"`

	// RequiredMissing are required items that were not found.
	RequiredMissing []string `json:"requiredMissing,omitempty"`

	// OptionalPresent are optional items that were found.
	OptionalPresent []string `json:"optionalPresent,omitempty"`

	// OptionalMissing are optional items that were not found.
	OptionalMissing []string `json:"optionalMissing,omitempty"`
}

ChecklistResults tracks which items were found for checklist scales.

type ChecklistThreshold

type ChecklistThreshold struct {
	// Required is "all" or a number of required items that must be present.
	Required string `json:"required,omitempty"`

	// Optional is the minimum number of optional items needed.
	Optional int `json:"optional,omitempty"`
}

ChecklistThreshold defines pass criteria for checklist scales.

type Decision

type Decision struct {
	// Status is the decision outcome.
	Status DecisionStatus `json:"status"`

	// Passed indicates if the evaluation passed.
	Passed bool `json:"passed"`

	// Rationale explains the decision.
	Rationale string `json:"rationale"`

	// FindingCounts summarizes findings by severity.
	FindingCounts FindingCounts `json:"findingCounts"`

	// CategoryCounts summarizes category results.
	CategoryCounts CategoryResultCounts `json:"categoryCounts"`
}

Decision represents the evaluation decision.

func EvaluateResults

func EvaluateResults(results []CategoryResult, findings []Finding, criteria PassCriteria, rubricSet *RubricSet) Decision

EvaluateResults checks category results and findings against criteria.

type DecisionStatus

type DecisionStatus string

DecisionStatus represents the decision outcome.

const (
	DecisionPass        DecisionStatus = "pass"         // Meets all criteria
	DecisionConditional DecisionStatus = "conditional"  // Partial scores or non-blocking findings
	DecisionFail        DecisionStatus = "fail"         // Has blocking findings or required categories failed
	DecisionHumanReview DecisionStatus = "human_review" // Requires human judgment
)

type EvaluationType

type EvaluationType string

EvaluationType defines how evaluation is performed.

const (
	// EvaluationTypeAnalytic scores each category independently (recommended for LLM-as-Judge).
	EvaluationTypeAnalytic EvaluationType = "analytic"

	// EvaluationTypeHolistic provides a single overall score.
	EvaluationTypeHolistic EvaluationType = "holistic"
)

type Example

type Example struct {
	// Excerpt is example content from a document.
	Excerpt string `json:"excerpt"`

	// Reasoning explains why this gets this score.
	// Including reasoning improves LLM alignment (chain-of-thought).
	Reasoning string `json:"reasoning"`
}

Example is a few-shot example for LLM evaluation.

type Finding

type Finding struct {
	// ID is the unique identifier for this finding.
	ID string `json:"id"`

	// Category is the evaluation category this relates to.
	Category string `json:"category"`

	// Severity indicates the impact level.
	Severity Severity `json:"severity"`

	// Title is a brief summary of the finding.
	Title string `json:"title"`

	// Description provides detailed explanation.
	Description string `json:"description"`

	// Recommendation explains how to fix the issue.
	Recommendation string `json:"recommendation"`

	// Evidence provides specific examples or references.
	Evidence string `json:"evidence,omitempty"`

	// Owner suggests who should address this finding.
	Owner string `json:"owner,omitempty"`

	// Effort estimates the work required (low, medium, high).
	Effort string `json:"effort,omitempty"`
}

Finding represents an issue discovered during evaluation.

func (*Finding) IsBlocking

func (f *Finding) IsBlocking() bool

IsBlocking returns true if this finding blocks approval.

type FindingCounts

type FindingCounts struct {
	Critical int `json:"critical"`
	High     int `json:"high"`
	Medium   int `json:"medium"`
	Low      int `json:"low"`
	Info     int `json:"info"`
	Total    int `json:"total"`
}

FindingCounts tracks the number of findings by severity.

func CountFindings

func CountFindings(findings []Finding) FindingCounts

CountFindings counts findings by severity.

func (FindingCounts) BlockingCount

func (c FindingCounts) BlockingCount() int

BlockingCount returns the number of blocking findings.

func (FindingCounts) HasBlocking

func (c FindingCounts) HasBlocking() bool

HasBlocking returns true if there are any blocking findings.

type FindingLimits

type FindingLimits struct {
	Critical int `json:"critical"`
	High     int `json:"high"`
	Medium   int `json:"medium"`
	Low      int `json:"low,omitempty"`
}

FindingLimits sets maximum allowed findings per severity. Use -1 for unlimited.

type IRRMetrics

type IRRMetrics struct {
	// ExactAgreement is the percentage of exact score matches.
	ExactAgreement float64 `json:"exactAgreement"`

	// AdjacentAgreement is the percentage within ±1 of each other.
	AdjacentAgreement float64 `json:"adjacentAgreement"`

	// MeanAbsoluteDifference is the average absolute difference.
	MeanAbsoluteDifference float64 `json:"meanAbsoluteDifference"`

	// PearsonCorrelation measures linear correlation (-1 to 1).
	PearsonCorrelation float64 `json:"pearsonCorrelation"`

	// SampleSize is the number of paired ratings.
	SampleSize int `json:"sampleSize"`
}

IRRMetrics contains inter-rater reliability metrics. These metrics are useful when comparing LLM and human ratings.

func ComputeIRR

func ComputeIRR(pairs []RatingPair) *IRRMetrics

ComputeIRR calculates inter-rater reliability metrics from paired ratings.

func ComputeIRRFromResults

func ComputeIRRFromResults(results1, results2 []CategoryResult) *IRRMetrics

ComputeIRRFromResults computes IRR metrics from two sets of category results. Useful for comparing LLM evaluation with human ground truth.

type JudgeCategoricalScore

type JudgeCategoricalScore struct {
	// JudgeID identifies the judge.
	JudgeID string `json:"judgeId"`

	// Score is the judge's categorical score.
	Score ScoreValue `json:"score"`
}

JudgeCategoricalScore is a categorical score from a specific judge.

type JudgeDisagreement

type JudgeDisagreement struct {
	// Category is the evaluation dimension.
	Category string `json:"category"`

	// Scores are the individual judge scores.
	Scores []JudgeCategoricalScore `json:"scores"`

	// UniqueScores is the number of distinct scores given.
	UniqueScores int `json:"uniqueScores"`
}

JudgeDisagreement captures where judges had significantly different scores.

type JudgeMetadata

type JudgeMetadata struct {
	// JudgeID is a unique identifier for this judge configuration.
	JudgeID string `json:"judge_id,omitempty"`

	// Model is the LLM model used (e.g., "claude-3-opus-20240229", "gpt-4-turbo").
	Model string `json:"model"`

	// ModelProvider is the API provider (e.g., "anthropic", "openai", "bedrock").
	ModelProvider string `json:"model_provider,omitempty"`

	// ModelVersion is the specific model version if applicable.
	ModelVersion string `json:"model_version,omitempty"`

	// PromptTemplate is the name/ID of the prompt template used.
	PromptTemplate string `json:"prompt_template,omitempty"`

	// PromptVersion is the version of the prompt template.
	PromptVersion string `json:"prompt_version,omitempty"`

	// SystemPrompt is the system prompt used (or hash/reference if too long).
	SystemPrompt string `json:"system_prompt,omitempty"`

	// Temperature is the sampling temperature used.
	Temperature float64 `json:"temperature,omitempty"`

	// MaxTokens is the max tokens setting.
	MaxTokens int `json:"max_tokens,omitempty"`

	// RubricID references the rubric set used for scoring.
	RubricID string `json:"rubric_id,omitempty"`

	// RubricVersion is the version of the rubric used.
	RubricVersion string `json:"rubric_version,omitempty"`

	// EvaluatedAt is when this evaluation was performed.
	EvaluatedAt time.Time `json:"evaluated_at,omitempty"`

	// Latency is the evaluation duration.
	Latency time.Duration `json:"latency,omitempty"`

	// TokensUsed tracks token consumption.
	TokensUsed *TokenUsage `json:"tokens_used,omitempty"`

	// TraceID links to observability trace (e.g., for Opik/Phoenix/Langfuse).
	TraceID string `json:"trace_id,omitempty"`

	// SpanID links to observability span.
	SpanID string `json:"span_id,omitempty"`
}

JudgeMetadata tracks information about the LLM judge that produced an evaluation. This enables reproducibility, debugging, and comparison of different judge configurations.

func NewJudgeMetadata

func NewJudgeMetadata(model string) *JudgeMetadata

NewJudgeMetadata creates judge metadata with required fields.

func (*JudgeMetadata) SetLatency

func (j *JudgeMetadata) SetLatency(d time.Duration)

SetLatency records the evaluation duration.

func (*JudgeMetadata) WithPrompt

func (j *JudgeMetadata) WithPrompt(template, version string) *JudgeMetadata

WithPrompt sets the prompt template info.

func (*JudgeMetadata) WithProvider

func (j *JudgeMetadata) WithProvider(provider string) *JudgeMetadata

WithProvider sets the model provider.

func (*JudgeMetadata) WithRubric

func (j *JudgeMetadata) WithRubric(id, version string) *JudgeMetadata

WithRubric sets the rubric reference.

func (*JudgeMetadata) WithTemperature

func (j *JudgeMetadata) WithTemperature(temp float64) *JudgeMetadata

WithTemperature sets the sampling temperature.

func (*JudgeMetadata) WithTokenUsage

func (j *JudgeMetadata) WithTokenUsage(input, output int) *JudgeMetadata

WithTokenUsage sets the token usage.

func (*JudgeMetadata) WithTrace

func (j *JudgeMetadata) WithTrace(traceID, spanID string) *JudgeMetadata

WithTrace sets the trace ID and span ID that link this judge metadata to an observability trace.

type LikertAnchor

type LikertAnchor struct {
	// Value is the numeric score.
	Value int `json:"value"`

	// Label is the short label (e.g., "Excellent", "Good").
	Label string `json:"label"`

	// Description explains what this score means.
	Description string `json:"description,omitempty"`
}

LikertAnchor describes what a specific score level means.

func StandardLikert5Anchors

func StandardLikert5Anchors() []LikertAnchor

StandardLikert5Anchors returns standard 1-5 Likert anchors.

type LikertConfig

type LikertConfig struct {
	// Min is the minimum score value (usually 1 or 0).
	Min int `json:"min"`

	// Max is the maximum score value (usually 5).
	Max int `json:"max"`

	// Anchors describe what each score level means.
	Anchors []LikertAnchor `json:"anchors,omitempty"`

	// PassThreshold is the minimum score for "pass" (default: top 40%).
	// For 1-5 scale, default is 4.
	PassThreshold *int `json:"passThreshold,omitempty"`

	// PartialThreshold is the minimum score for "partial" (default: middle).
	// For 1-5 scale, default is 3.
	PartialThreshold *int `json:"partialThreshold,omitempty"`
}

LikertConfig defines a Likert scale configuration.

type MultiJudgeResult

type MultiJudgeResult struct {
	// Evaluations are the individual judge evaluations.
	Evaluations []*Rubric `json:"evaluations"`

	// Judges contains metadata for each judge.
	Judges []*JudgeMetadata `json:"judges"`

	// AggregatedCategories are the combined category results.
	AggregatedCategories []CategoryResult `json:"aggregatedCategories"`

	// AggregationMethod describes how scores were combined.
	AggregationMethod AggregationMethod `json:"aggregationMethod"`

	// Agreement measures inter-judge agreement (0-1, higher = more agreement).
	Agreement float64 `json:"agreement"`

	// Disagreements lists categories where judges significantly disagreed.
	Disagreements []JudgeDisagreement `json:"disagreements,omitempty"`

	// ConsolidatedDecision is the final decision after aggregation.
	ConsolidatedDecision Decision `json:"consolidatedDecision"`

	// ConsolidatedFindings merges findings from all judges.
	ConsolidatedFindings []Finding `json:"consolidatedFindings"`
}

MultiJudgeResult aggregates evaluations from multiple judges. This improves reliability by combining perspectives and detecting disagreement.

func AggregateEvaluations

func AggregateEvaluations(evaluations []*Rubric, method AggregationMethod) *MultiJudgeResult

AggregateEvaluations combines multiple evaluation reports.

type NextSteps

type NextSteps struct {
	// RerunCommand is the command to re-run evaluation.
	RerunCommand string `json:"rerunCommand,omitempty"`

	// Immediate are blocking actions that must be completed.
	Immediate []ActionItem `json:"immediate,omitempty"`

	// Recommended are suggested improvements.
	Recommended []ActionItem `json:"recommended,omitempty"`
}

NextSteps provides actionable workflow guidance.

type PairwiseCategoryScore

type PairwiseCategoryScore struct {
	// Category is the evaluation dimension.
	Category string `json:"category"`

	// Winner indicates which output won for this category.
	Winner PairwiseWinner `json:"winner"`

	// Margin indicates how much better the winner is (0-1, higher = larger gap).
	Margin float64 `json:"margin,omitempty"`

	// Reasoning explains the category-level comparison.
	Reasoning string `json:"reasoning,omitempty"`
}

PairwiseCategoryScore compares outputs on a specific dimension.

type PairwiseComparison

type PairwiseComparison struct {
	// ID is the unique identifier for this comparison.
	ID string `json:"id,omitempty"`

	// Input is the shared input/prompt for both outputs.
	Input string `json:"input"`

	// OutputA is the first output being compared.
	OutputA string `json:"output_a"`

	// OutputB is the second output being compared.
	OutputB string `json:"output_b"`

	// Winner indicates which output won ("A", "B", or "tie").
	Winner PairwiseWinner `json:"winner"`

	// Confidence is the judge's confidence in the decision (0-1).
	Confidence float64 `json:"confidence,omitempty"`

	// Reasoning explains why this winner was chosen.
	Reasoning string `json:"reasoning"`

	// CategoryScores provides per-category comparisons if applicable.
	CategoryScores []PairwiseCategoryScore `json:"category_scores,omitempty"`

	// Judge contains metadata about the LLM judge.
	Judge *JudgeMetadata `json:"judge,omitempty"`

	// Metadata contains additional comparison context.
	Metadata map[string]any `json:"metadata,omitempty"`

	// CreatedAt is when this comparison was made.
	CreatedAt time.Time `json:"created_at,omitempty"`
}

PairwiseComparison represents a comparison between two outputs. This is an alternative to absolute scoring that can reduce position bias and improve reliability of LLM-as-Judge evaluations.

func NewPairwiseComparison

func NewPairwiseComparison(input, outputA, outputB string) *PairwiseComparison

NewPairwiseComparison creates a new pairwise comparison.

func (*PairwiseComparison) AddCategoryScore

func (p *PairwiseComparison) AddCategoryScore(category string, winner PairwiseWinner, reasoning string, margin float64)

AddCategoryScore adds a per-category comparison.

func (*PairwiseComparison) SetWinner

func (p *PairwiseComparison) SetWinner(winner PairwiseWinner, reasoning string, confidence float64)

SetWinner sets the comparison result.

func (*PairwiseComparison) SwappedComparison

func (p *PairwiseComparison) SwappedComparison() *PairwiseComparison

SwappedComparison creates a comparison with A and B swapped. Running both orders helps detect position bias in the judge.

type PairwiseResult

type PairwiseResult struct {
	// Comparisons are all the individual comparisons.
	Comparisons []PairwiseComparison `json:"comparisons"`

	// WinRateA is the percentage of comparisons won by A.
	WinRateA float64 `json:"win_rate_a"`

	// WinRateB is the percentage of comparisons won by B.
	WinRateB float64 `json:"win_rate_b"`

	// TieRate is the percentage of ties.
	TieRate float64 `json:"tie_rate"`

	// OverallWinner is the aggregated winner.
	OverallWinner PairwiseWinner `json:"overall_winner"`

	// Confidence is the overall confidence in the result.
	Confidence float64 `json:"confidence"`
}

PairwiseResult aggregates multiple pairwise comparisons.

func ComputePairwiseResult

func ComputePairwiseResult(comparisons []PairwiseComparison) *PairwiseResult

ComputePairwiseResult aggregates multiple pairwise comparisons into a single PairwiseResult.

type PairwiseWinner

type PairwiseWinner string

PairwiseWinner indicates the winner of a pairwise comparison.

const (
	// WinnerA indicates output A is better.
	WinnerA PairwiseWinner = "A"

	// WinnerB indicates output B is better.
	WinnerB PairwiseWinner = "B"

	// WinnerTie indicates both outputs are roughly equal.
	WinnerTie PairwiseWinner = "tie"

	// WinnerUncertain indicates the judge couldn't determine a winner.
	WinnerUncertain PairwiseWinner = "uncertain"
)

type PassCriteria

type PassCriteria struct {
	// MinCategoriesPassing specifies how many categories must pass.
	// Values: "all", "all_required", or a number like "3".
	MinCategoriesPassing string `json:"minCategoriesPassing"`

	// MaxFindings limits findings by severity.
	// Use -1 for unlimited.
	MaxFindings *FindingLimits `json:"maxFindingsSeverity,omitempty"`
}

PassCriteria defines the requirements for approval. Aligned with LLM-as-Judge best practices.

func DefaultPassCriteria

func DefaultPassCriteria() PassCriteria

DefaultPassCriteria returns standard pass criteria. All required categories must pass, 0 critical/high findings allowed.

func StrictPassCriteria

func StrictPassCriteria() PassCriteria

StrictPassCriteria returns strict pass criteria. All categories must pass, max 3 medium findings.

type RatingPair

type RatingPair struct {
	// Rater1 is the first rater's score (e.g., human).
	Rater1 float64

	// Rater2 is the second rater's score (e.g., LLM).
	Rater2 float64

	// Category is the category being rated.
	Category string

	// ItemID identifies the item being rated.
	ItemID string
}

RatingPair represents a pair of ratings for the same item.

type ReferenceData

type ReferenceData struct {
	// ID is the unique identifier for this reference.
	ID string `json:"id,omitempty"`

	// Input is the input/prompt that produced the reference output.
	Input string `json:"input,omitempty"`

	// ExpectedOutput is the gold/reference output.
	ExpectedOutput string `json:"expected_output,omitempty"`

	// ExpectedOutputs allows multiple acceptable outputs.
	ExpectedOutputs []string `json:"expected_outputs,omitempty"`

	// Context provides additional context (e.g., retrieved documents for RAG).
	Context []string `json:"context,omitempty"`

	// Annotations are human-provided labels or scores.
	Annotations []Annotation `json:"annotations,omitempty"`

	// Source indicates where this reference came from.
	Source string `json:"source,omitempty"`

	// Tags categorize or filter references.
	Tags []string `json:"tags,omitempty"`

	// Metadata contains additional reference data.
	Metadata map[string]any `json:"metadata,omitempty"`
}

ReferenceData contains ground truth or expected data for evaluation. This enables reference-based evaluation where outputs are compared against known-good examples.

func NewReferenceData

func NewReferenceData(input, expectedOutput string) *ReferenceData

NewReferenceData creates a new reference data item.

func (*ReferenceData) WithAnnotation

func (r *ReferenceData) WithAnnotation(name string, score float64, annotatorID string) *ReferenceData

WithAnnotation adds a human annotation.

func (*ReferenceData) WithContext

func (r *ReferenceData) WithContext(ctx ...string) *ReferenceData

WithContext adds context documents.

type ReferenceDataset

type ReferenceDataset struct {
	// ID is the unique identifier for this dataset.
	ID string `json:"id"`

	// Name is the display name.
	Name string `json:"name"`

	// Description explains what this dataset contains.
	Description string `json:"description,omitempty"`

	// Version tracks dataset iterations.
	Version string `json:"version,omitempty"`

	// Items are the reference data items.
	Items []ReferenceData `json:"items"`

	// Tags categorize the dataset.
	Tags []string `json:"tags,omitempty"`

	// Metadata contains additional dataset info.
	Metadata map[string]any `json:"metadata,omitempty"`
}

ReferenceDataset is a collection of reference data items.

func NewReferenceDataset

func NewReferenceDataset(id, name string) *ReferenceDataset

NewReferenceDataset creates a new reference dataset.

func (*ReferenceDataset) AddItem

func (d *ReferenceDataset) AddItem(item ReferenceData)

AddItem adds a reference data item to the dataset.

func (*ReferenceDataset) GetByID

func (d *ReferenceDataset) GetByID(id string) *ReferenceData

GetByID retrieves a reference item by ID.

type ReportMetadata

type ReportMetadata struct {
	// Document is the filename or path being evaluated.
	Document string `json:"document"`

	// DocumentID is the document identifier (e.g., PRD ID).
	DocumentID string `json:"documentId,omitempty"`

	// DocumentTitle is the document title.
	DocumentTitle string `json:"documentTitle,omitempty"`

	// DocumentVersion is the document version.
	DocumentVersion string `json:"documentVersion,omitempty"`

	// GeneratedAt is when the report was created.
	GeneratedAt time.Time `json:"generatedAt"`

	// GeneratedBy identifies what created this report.
	GeneratedBy string `json:"generatedBy,omitempty"`

	// ReviewerID identifies the reviewer (agent or human).
	ReviewerID string `json:"reviewerId,omitempty"`
}

ReportMetadata contains report identification.

type Rubric

type Rubric struct {
	// Schema is the JSON Schema URL.
	Schema string `json:"$schema,omitempty"`

	// Metadata contains report identification and audit info.
	Metadata ReportMetadata `json:"metadata"`

	// ReviewType identifies the type of review (prd, arb, security, article, etc.).
	ReviewType string `json:"reviewType"`

	// Judge contains metadata about the LLM judge.
	Judge *JudgeMetadata `json:"judge,omitempty"`

	// RubricID references the rubric used for scoring.
	RubricID string `json:"rubricId,omitempty"`

	// RubricVersion is the version of the rubric used.
	RubricVersion string `json:"rubricVersion,omitempty"`

	// Reference contains gold/expected data for comparison.
	Reference *ReferenceData `json:"reference,omitempty"`

	// Categories contains results for each evaluation dimension.
	Categories []CategoryResult `json:"categories"`

	// Findings are all issues discovered during evaluation.
	Findings []Finding `json:"findings"`

	// PassCriteria defines the requirements for approval.
	PassCriteria PassCriteria `json:"passCriteria"`

	// Decision is the evaluation outcome.
	Decision Decision `json:"decision"`

	// OverallDecision is a simplified pass/conditional/fail status.
	OverallDecision string `json:"overallDecision"`

	// NextSteps provides actionable guidance.
	NextSteps NextSteps `json:"nextSteps"`

	// Summary is the overall assessment.
	Summary string `json:"summary"`
}

Rubric is the detailed rubric-based evaluation report for LLM-as-Judge reviews.

func NewRubric

func NewRubric(reviewType, document string) *Rubric

NewRubric creates a new rubric-based evaluation report.

func (*Rubric) AddCategoryResult

func (r *Rubric) AddCategoryResult(cr CategoryResult)

AddCategoryResult adds a category result.

func (*Rubric) AddFinding

func (r *Rubric) AddFinding(f Finding)

AddFinding adds a finding.

func (*Rubric) Evaluate

func (r *Rubric) Evaluate(rubricSet *RubricSet) Decision

Evaluate computes the decision based on findings and category results.

func (*Rubric) Finalize

func (r *Rubric) Finalize(rubricSet *RubricSet, rerunCommand string)

Finalize computes all derived fields.

func (*Rubric) GenerateNextSteps

func (r *Rubric) GenerateNextSteps(rerunCommand string)

GenerateNextSteps creates actionable next steps.

func (*Rubric) GenerateSummary

func (r *Rubric) GenerateSummary() string

GenerateSummary creates the summary text.

func (*Rubric) GetCategoryResult

func (r *Rubric) GetCategoryResult(categoryID string) *CategoryResult

GetCategoryResult returns a category result by ID, or nil if not found.

func (*Rubric) SetJudge

func (r *Rubric) SetJudge(judge *JudgeMetadata)

SetJudge sets the judge metadata.

func (*Rubric) SetPassCriteria

func (r *Rubric) SetPassCriteria(criteria PassCriteria)

SetPassCriteria sets the pass criteria.

func (*Rubric) SetReference

func (r *Rubric) SetReference(ref *ReferenceData)

SetReference sets the reference data for comparison.

func (*Rubric) SetRubricInfo

func (r *Rubric) SetRubricInfo(rubricID, rubricVersion string)

SetRubricInfo sets the rubric ID and version.

type RubricMetadata

type RubricMetadata struct {
	CreatedAt string   `json:"createdAt,omitempty"`
	Author    string   `json:"author,omitempty"`
	BasedOn   []string `json:"basedOn,omitempty"`
}

RubricMetadata contains additional rubric information.

type RubricPassCriteria

type RubricPassCriteria struct {
	// MinCategoriesPassing is "all", "all_required", or a number written as a string.
	MinCategoriesPassing string `json:"minCategoriesPassing,omitempty"`

	// MaxFindings limits findings by severity (JSON key: "maxFindingsSeverity").
	MaxFindings *FindingLimits `json:"maxFindingsSeverity,omitempty"`
}

RubricPassCriteria defines requirements for overall pass/fail determination.

type RubricSet

type RubricSet struct {
	// ID uniquely identifies this rubric set.
	ID string `json:"id"`

	// Name is the human-readable name.
	Name string `json:"name"`

	// Version is the semantic version of this rubric.
	Version string `json:"version"`

	// Description explains what this rubric set evaluates.
	Description string `json:"description,omitempty"`

	// EvaluationType is "analytic" (per-category) or "holistic" (single score).
	// Analytic is recommended for LLM-as-Judge.
	EvaluationType EvaluationType `json:"evaluationType,omitempty"`

	// PassCriteria defines requirements for overall pass/fail.
	PassCriteria RubricPassCriteria `json:"passCriteria"`

	// Categories are the evaluation dimensions.
	Categories []Category `json:"categories"`

	// JudgePromptTemplate is the prompt template for LLM evaluation.
	// Supports placeholders: {content}, {categories}, etc.
	JudgePromptTemplate string `json:"judgePromptTemplate,omitempty"`

	// Metadata contains additional information about the rubric.
	Metadata *RubricMetadata `json:"metadata,omitempty"`
}

RubricSet is a collection of rubrics for a complete evaluation. It follows Go-first principles: the Go types are the source of truth, and the JSON Schema is generated from them.

func NewRubricSet

func NewRubricSet(id, name, version string) *RubricSet

NewRubricSet creates a new rubric set with required fields.

func (*RubricSet) AddCategory

func (rs *RubricSet) AddCategory(cat Category) *RubricSet

AddCategory adds a category to the rubric set.

func (*RubricSet) GetCategory

func (rs *RubricSet) GetCategory(id string) *Category

GetCategory returns a category by ID, or nil if not found.

func (*RubricSet) GetRequiredCategories

func (rs *RubricSet) GetRequiredCategories() []Category

GetRequiredCategories returns all required categories.

func (*RubricSet) SetJudgePrompt

func (rs *RubricSet) SetJudgePrompt(template string) *RubricSet

SetJudgePrompt sets the judge prompt template.

func (*RubricSet) SetMetadata

func (rs *RubricSet) SetMetadata(meta *RubricMetadata) *RubricSet

SetMetadata sets the rubric metadata.

func (*RubricSet) SetPassCriteria

func (rs *RubricSet) SetPassCriteria(criteria RubricPassCriteria) *RubricSet

SetPassCriteria sets the pass criteria.

func (*RubricSet) ToJSON

func (rs *RubricSet) ToJSON() ([]byte, error)

ToJSON serializes a rubric set to JSON.

func (*RubricSet) Validate

func (rs *RubricSet) Validate() []string

Validate checks the rubric set for common issues and returns its results as a slice of strings.

type Scale

type Scale struct {
	// Type is "categorical", "checklist", "binary", or "likert".
	// Categorical with 2-3 options is recommended for LLM-as-Judge.
	// Likert is better for human comparison studies.
	Type ScaleType `json:"type"`

	// Options are the scoring options (for categorical scales).
	Options []ScaleOption `json:"options,omitempty"`

	// RequiredItems are items that must be present (for checklist scales).
	RequiredItems []string `json:"requiredItems,omitempty"`

	// OptionalItems are items that add value (for checklist scales).
	OptionalItems []string `json:"optionalItems,omitempty"`

	// PassingThreshold defines pass criteria (for checklist scales).
	PassingThreshold *ChecklistThreshold `json:"passingThreshold,omitempty"`

	// LikertConfig defines the likert scale (for likert scales).
	LikertConfig *LikertConfig `json:"likertConfig,omitempty"`
}

Scale defines the scoring mechanism for a category.

type ScaleOption

type ScaleOption struct {
	// Value is the machine-readable value (e.g., "pass", "partial", "fail").
	Value string `json:"value"`

	// Label is the human-readable label.
	Label string `json:"label"`

	// Criteria are specific requirements for this score level.
	Criteria []string `json:"criteria"`
}

ScaleOption is a single option in a categorical scale.

type ScaleType

type ScaleType string

ScaleType defines the type of scoring scale.

const (
	// ScaleTypeCategorical uses discrete categories (pass/partial/fail).
	// Recommended for LLM-as-Judge - better calibrated than numeric scales.
	ScaleTypeCategorical ScaleType = "categorical"

	// ScaleTypeChecklist uses a list of required/optional items.
	ScaleTypeChecklist ScaleType = "checklist"

	// ScaleTypeBinary is simple pass/fail.
	ScaleTypeBinary ScaleType = "binary"

	// ScaleTypeLikert uses a numeric scale (e.g., 1-5).
	// Better for human comparison and inter-rater reliability studies.
	// Scores are mapped to categorical (pass/partial/fail) for decisions.
	ScaleTypeLikert ScaleType = "likert"
)

type ScoreValue

type ScoreValue string

ScoreValue represents a categorical score value.

const (
	ScorePass    ScoreValue = "pass"
	ScorePartial ScoreValue = "partial"
	ScoreFail    ScoreValue = "fail"
)

func LikertToCategorical

func LikertToCategorical(score int, config *LikertConfig) ScoreValue

LikertToCategorical converts a Likert score to categorical (pass/partial/fail).

func (ScoreValue) Icon

func (s ScoreValue) Icon() string

Icon returns the emoji icon for the score.

func (ScoreValue) IsFailing

func (s ScoreValue) IsFailing() bool

IsFailing returns true if this score is failing.

func (ScoreValue) IsPartial

func (s ScoreValue) IsPartial() bool

IsPartial returns true if this score is partial.

func (ScoreValue) IsPassing

func (s ScoreValue) IsPassing() bool

IsPassing returns true if this score is considered passing.

type Severity

type Severity string

Severity represents the severity level of a finding. The levels are based on InfoSec severity classifications.

const (
	SeverityCritical Severity = "critical" // Blocks approval, must fix
	SeverityHigh     Severity = "high"     // Blocks approval, must fix
	SeverityMedium   Severity = "medium"   // Should fix before approval
	SeverityLow      Severity = "low"      // Nice to fix
	SeverityInfo     Severity = "info"     // Informational only
)

func AllSeverities

func AllSeverities() []Severity

AllSeverities returns all severity levels in order of severity.

func (Severity) Icon

func (s Severity) Icon() string

Icon returns the emoji icon for the severity.

func (Severity) IsBlocking

func (s Severity) IsBlocking() bool

IsBlocking returns true if this severity blocks approval.

func (Severity) Weight

func (s Severity) Weight() int

Weight returns a numeric weight for sorting (higher = more severe).

type TokenUsage

type TokenUsage struct {
	// InputTokens is the number of input/prompt tokens.
	InputTokens int `json:"input_tokens"`

	// OutputTokens is the number of output/completion tokens.
	OutputTokens int `json:"output_tokens"`

	// TotalTokens is the total tokens used.
	TotalTokens int `json:"total_tokens"`

	// CacheReadTokens is the number of tokens read from cache (if applicable).
	CacheReadTokens int `json:"cache_read_tokens,omitempty"`

	// CacheWriteTokens is the number of tokens written to cache (if applicable).
	CacheWriteTokens int `json:"cache_write_tokens,omitempty"`
}

TokenUsage tracks token consumption for an evaluation.

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL